X-Git-Url: https://git.lttng.org/?p=lttng-tools.git;a=blobdiff_plain;f=src%2Fbin%2Flttng-sessiond%2Fmain.c;h=91fba2643fc2ae94acf79850fd0e9aa589cfbf90;hp=cedd35611c6ba4ecdca0039b2de79364a7f27dab;hb=c617c0c651432f9d5ae7adf4c5c1a5fd92ad828e;hpb=6d805429e9cb049eb0c9205fcf742a53e3166caf

diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c
index cedd35611..91fba2643 100644
--- a/src/bin/lttng-sessiond/main.c
+++ b/src/bin/lttng-sessiond/main.c
@@ -59,7 +59,6 @@
 #include "ust-consumer.h"
 #include "utils.h"
 #include "fd-limit.h"
-#include "filter.h"
 #include "health.h"
 #include "testpoint.h"
 
@@ -228,6 +227,11 @@
 struct health_state health_thread_app_manage;
 struct health_state health_thread_app_reg;
 struct health_state health_thread_kernel;
 
+/*
+ * Socket timeout for receiving and sending in seconds.
+ */
+static int app_socket_timeout;
+
 static void setup_consumerd_path(void)
 {
@@ -392,7 +396,7 @@ static void stop_threads(void)
 static void cleanup(void)
 {
 	int ret;
-	char *cmd;
+	char *cmd = NULL;
 	struct ltt_session *sess, *stmp;
 
 	DBG("Cleaning up");
@@ -442,9 +446,6 @@ static void cleanup(void)
 		modprobe_remove_lttng_all();
 	}
 
-	utils_close_pipe(kernel_poll_pipe);
-	utils_close_pipe(apps_cmd_pipe);
-
 	/* */
 	DBG("%c[%d;%dm*** assert failed :-) *** ==> %c[%dm%c[%d;%dm"
 		"Matthew, BEET driven development works!%c[%dm",
@@ -460,7 +461,7 @@ static void cleanup(void)
 static int send_unix_sock(int sock, void *buf, size_t len)
 {
 	/* Check valid length */
-	if (len <= 0) {
+	if (len == 0) {
 		return -1;
 	}
 
@@ -632,7 +633,7 @@ static int update_kernel_stream(struct consumer_data *consumer_data, int fd)
 			assert(socket->fd >= 0);
 
 			pthread_mutex_lock(socket->lock);
-			ret = kernel_consumer_send_channel_stream(socket->fd,
+			ret = kernel_consumer_send_channel_stream(socket,
 					channel, ksess);
 			pthread_mutex_unlock(socket->lock);
 			if (ret < 0) {
@@ -688,14 +689,14 @@ static void *thread_manage_kernel(void *data)
 	char tmp;
 	struct lttng_poll_event events;
 
-	DBG("Thread manage kernel started");
+	DBG("[thread] Thread manage kernel started");
 
-	testpoint(thread_manage_kernel);
+	if (testpoint(thread_manage_kernel)) {
+		goto error_testpoint;
+	}
 
 	health_code_update(&health_thread_kernel);
 
-	testpoint(thread_manage_kernel_before_loop);
-
 	ret = create_thread_poll_set(&events, 2);
 	if (ret < 0) {
 		goto error_poll_create;
@@ -706,6 +707,10 @@ static void *thread_manage_kernel(void *data)
 		goto error;
 	}
 
+	if (testpoint(thread_manage_kernel_before_loop)) {
+		goto error;
+	}
+
 	while (1) {
 		health_code_update(&health_thread_kernel);
 
@@ -723,12 +728,7 @@ static void *thread_manage_kernel(void *data)
 			update_poll_flag = 0;
 		}
 
-		nb_fd = LTTNG_POLL_GETNB(&events);
-
-		DBG("Thread kernel polling on %d fds", nb_fd);
-
-		/* Zeroed the poll events */
-		lttng_poll_reset(&events);
+		DBG("Thread kernel polling on %d fds", events.nb_fd);
 
 		/* Poll infinite value of time */
 	restart:
@@ -750,6 +750,8 @@ static void *thread_manage_kernel(void *data)
 			continue;
 		}
 
+		nb_fd = ret;
+
 		for (i = 0; i < nb_fd; i++) {
 			/* Fetch once the poll data */
 			revents = LTTNG_POLL_GETEV(&events, i);
@@ -793,9 +795,14 @@ exit:
 error:
 	lttng_poll_clean(&events);
 error_poll_create:
+error_testpoint:
+	utils_close_pipe(kernel_poll_pipe);
+	kernel_poll_pipe[0] = kernel_poll_pipe[1] = -1;
 	if (err) {
 		health_error(&health_thread_kernel);
 		ERR("Health error occurred in %s", __func__);
+		WARN("Kernel thread died unexpectedly. "
+				"Kernel tracing can continue but CPU hotplug is disabled.");
 	}
 	health_exit(&health_thread_kernel);
 	DBG("Kernel thread dying");
@@ -857,13 +864,6 @@ static void *thread_manage_consumer(void *data)
 	 */
 	health_poll_update(&consumer_data->health);
 
-	health_code_update(&consumer_data->health);
-
-	ret = lttcomm_listen_unix_sock(consumer_data->err_sock);
-	if (ret < 0) {
-		goto error_listen;
-	}
-
 	/*
 	 * Pass 2 as size here for the thread quit pipe and kconsumerd_err_sock.
 	 * Nothing more will be added to this poll set.
@@ -873,20 +873,25 @@ static void *thread_manage_consumer(void *data)
 		goto error_poll;
 	}
 
+	/*
+	 * The error socket here is already in a listening state which was done
+	 * just before spawning this thread to avoid a race between the consumer
+	 * daemon exec trying to connect and the listen() call.
+	 */
 	ret = lttng_poll_add(&events, consumer_data->err_sock,
 			LPOLLIN | LPOLLRDHUP);
 	if (ret < 0) {
 		goto error;
 	}
 
-	nb_fd = LTTNG_POLL_GETNB(&events);
-
 	health_code_update(&consumer_data->health);
 
 	/* Inifinite blocking call, waiting for transmission */
 restart:
 	health_poll_update(&consumer_data->health);
-	testpoint(thread_manage_consumer);
+	if (testpoint(thread_manage_consumer)) {
+		goto error;
+	}
 
 	ret = lttng_poll_wait(&events, -1);
 	health_poll_update(&consumer_data->health);
@@ -900,6 +905,8 @@ restart:
 		goto error;
 	}
 
+	nb_fd = ret;
+
 	for (i = 0; i < nb_fd; i++) {
 		/* Fetch once the poll data */
 		revents = LTTNG_POLL_GETEV(&events, i);
@@ -928,6 +935,12 @@ restart:
 		goto error;
 	}
 
+	/*
+	 * Set the CLOEXEC flag. Return code is useless because either way, the
+	 * show must go on.
+	 */
+	(void) utils_set_fd_cloexec(sock);
+
 	health_code_update(&consumer_data->health);
 
 	DBG2("Receiving code from consumer err_sock");
@@ -971,9 +984,6 @@ restart:
 	health_code_update(&consumer_data->health);
 
-	/* Update number of fd */
-	nb_fd = LTTNG_POLL_GETNB(&events);
-
 	/* Inifinite blocking call, waiting for transmission */
 restart_poll:
 	health_poll_update(&consumer_data->health);
 
@@ -989,6 +999,8 @@ restart_poll:
 		goto error;
 	}
 
+	nb_fd = ret;
+
 	for (i = 0; i < nb_fd; i++) {
 		/* Fetch once the poll data */
 		revents = LTTNG_POLL_GETEV(&events, i);
@@ -1062,7 +1074,6 @@
 error:
 	lttng_poll_clean(&events);
 error_poll:
-error_listen:
 	if (err) {
 		health_error(&consumer_data->health);
 		ERR("Health error occurred in %s", __func__);
@@ -1085,11 +1096,13 @@ static void *thread_manage_apps(void *data)
 
 	DBG("[thread] Manage application started");
 
-	testpoint(thread_manage_apps);
-
 	rcu_register_thread();
 	rcu_thread_online();
 
+	if (testpoint(thread_manage_apps)) {
+		goto error_testpoint;
+	}
+
 	health_code_update(&health_thread_app_manage);
 
 	ret = create_thread_poll_set(&events, 2);
@@ -1102,17 +1115,14 @@ static void *thread_manage_apps(void *data)
 		goto error;
 	}
 
-	testpoint(thread_manage_apps_before_loop);
+	if (testpoint(thread_manage_apps_before_loop)) {
+		goto error;
+	}
 
 	health_code_update(&health_thread_app_manage);
 
 	while (1) {
-		/* Zeroed the events structure */
-		lttng_poll_reset(&events);
-
-		nb_fd = LTTNG_POLL_GETNB(&events);
-
-		DBG("Apps thread polling on %d fds", nb_fd);
+		DBG("Apps thread polling on %d fds", events.nb_fd);
 
 		/* Inifinite blocking call, waiting for transmission */
 	restart:
@@ -1129,6 +1139,8 @@ static void *thread_manage_apps(void *data)
 			goto error;
 		}
 
+		nb_fd = ret;
+
 		for (i = 0; i < nb_fd; i++) {
 			/* Fetch once the poll data */
 			revents = LTTNG_POLL_GETEV(&events, i);
@@ -1192,16 +1204,22 @@ static void *thread_manage_apps(void *data)
 					ust_app_unregister(ust_cmd.sock);
 				} else {
 					/*
-					 * We just need here to monitor the close of the UST
-					 * socket and poll set monitor those by default.
-					 * Listen on POLLIN (even if we never expect any
-					 * data) to ensure that hangup wakes us.
+					 * We only monitor the error events of the socket. This
+					 * thread does not handle any incoming data from UST
+					 * (POLLIN).
 					 */
-					ret = lttng_poll_add(&events, ust_cmd.sock, LPOLLIN);
+					ret = lttng_poll_add(&events, ust_cmd.sock,
+							LPOLLERR & LPOLLHUP & LPOLLRDHUP);
 					if (ret < 0) {
 						goto error;
 					}
 
+					/* Set socket timeout for both receiving and ending */
+					(void) lttcomm_setsockopt_rcv_timeout(ust_cmd.sock,
+							app_socket_timeout);
+					(void) lttcomm_setsockopt_snd_timeout(ust_cmd.sock,
+							app_socket_timeout);
+
 					DBG("Apps with sock %d added to poll set",
 							ust_cmd.sock);
 				}
@@ -1236,6 +1254,16 @@ exit:
 error:
 	lttng_poll_clean(&events);
 error_poll_create:
+error_testpoint:
+	utils_close_pipe(apps_cmd_pipe);
+	apps_cmd_pipe[0] = apps_cmd_pipe[1] = -1;
+
+	/*
+	 * We don't clean the UST app hash table here since already registered
+	 * applications can still be controlled so let them be until the session
+	 * daemon dies or the applications stop.
+	 */
+
 	if (err) {
 		health_error(&health_thread_app_manage);
 		ERR("Health error occurred in %s", __func__);
@@ -1285,18 +1313,26 @@ static void *thread_dispatch_ust_registration(void *data)
 			 * call is blocking so we can be assured that the data will be read
 			 * at some point in time or wait to the end of the world :)
 			 */
-			ret = write(apps_cmd_pipe[1], ust_cmd,
-					sizeof(struct ust_command));
-			if (ret < 0) {
-				PERROR("write apps cmd pipe");
-				if (errno == EBADF) {
-					/*
-					 * We can't inform the application thread to process
-					 * registration. We will exit or else application
-					 * registration will not occur and tracing will never
-					 * start.
-					 */
-					goto error;
+			if (apps_cmd_pipe[1] >= 0) {
+				ret = write(apps_cmd_pipe[1], ust_cmd,
+						sizeof(struct ust_command));
+				if (ret < 0) {
+					PERROR("write apps cmd pipe");
+					if (errno == EBADF) {
+						/*
+						 * We can't inform the application thread to process
+						 * registration. We will exit or else application
+						 * registration will not occur and tracing will never
+						 * start.
+						 */
+						goto error;
+					}
+				}
+			} else {
+				/* Application manager thread is not available. */
+				ret = close(ust_cmd->sock);
+				if (ret < 0) {
+					PERROR("close ust_cmd sock");
 				}
 			}
 			free(ust_cmd);
@@ -1327,7 +1363,9 @@ static void *thread_registration_apps(void *data)
 
 	DBG("[thread] Manage application registration started");
 
-	testpoint(thread_registration_apps);
+	if (testpoint(thread_registration_apps)) {
+		goto error_testpoint;
+	}
 
 	ret = lttcomm_listen_unix_sock(apps_sock);
 	if (ret < 0) {
@@ -1360,8 +1398,6 @@ static void *thread_registration_apps(void *data)
 	while (1) {
 		DBG("Accepting application registration");
 
-		nb_fd = LTTNG_POLL_GETNB(&events);
-
 		/* Inifinite blocking call, waiting for transmission */
 	restart:
 		health_poll_update(&health_thread_app_reg);
@@ -1377,6 +1413,8 @@ static void *thread_registration_apps(void *data)
 			goto error;
 		}
 
+		nb_fd = ret;
+
 		for (i = 0; i < nb_fd; i++) {
 			health_code_update(&health_thread_app_reg);
 
@@ -1402,6 +1440,12 @@ static void *thread_registration_apps(void *data)
 					goto error;
 				}
 
+				/*
+				 * Set the CLOEXEC flag. Return code is useless because
+				 * either way, the show must go on.
+				 */
+				(void) utils_set_fd_cloexec(sock);
+
 				/* Create UST registration command for enqueuing */
 				ust_cmd = zmalloc(sizeof(struct ust_command));
 				if (ust_cmd == NULL) {
@@ -1476,7 +1520,6 @@ error:
 		health_error(&health_thread_app_reg);
 		ERR("Health error occurred in %s", __func__);
 	}
-	health_exit(&health_thread_app_reg);
 
 	/* Notify that the registration thread is gone */
 	notify_ust_apps(0);
@@ -1500,7 +1543,9 @@ error_poll_add:
 	lttng_poll_clean(&events);
 error_listen:
 error_create_poll:
+error_testpoint:
 	DBG("UST Registration thread cleanup complete");
+	health_exit(&health_thread_app_reg);
 
 	return NULL;
 }
@@ -1632,10 +1677,10 @@ error:
 static int join_consumer_thread(struct consumer_data *consumer_data)
 {
 	void *status;
-	int ret;
 
 	/* Consumer pid must be a real one. */
 	if (consumer_data->pid > 0) {
+		int ret;
 		ret = kill(consumer_data->pid, SIGTERM);
 		if (ret) {
 			ERR("Error killing consumer daemon");
@@ -1816,6 +1861,16 @@ static int start_consumerd(struct consumer_data *consumer_data)
 {
 	int ret;
 
+	/*
+	 * Set the listen() state on the socket since there is a possible race
+	 * between the exec() of the consumer daemon and this call if place in the
+	 * consumer thread. See bug #366 for more details.
+	 */
+	ret = lttcomm_listen_unix_sock(consumer_data->err_sock);
+	if (ret < 0) {
+		goto error;
+	}
+
 	pthread_mutex_lock(&consumer_data->pid_mutex);
 	if (consumer_data->pid != 0) {
 		pthread_mutex_unlock(&consumer_data->pid_mutex);
@@ -1845,6 +1900,15 @@ end:
 	return 0;
 
 error:
+	/* Cleanup already created socket on error. */
+	if (consumer_data->err_sock >= 0) {
+		int err;
+
+		err = close(consumer_data->err_sock);
+		if (err < 0) {
+			PERROR("close consumer data error socket");
+		}
+	}
 	return ret;
 }
 
@@ -2407,8 +2471,7 @@ skip_domain:
 	{
 		ret = cmd_add_context(cmd_ctx->session, cmd_ctx->lsm->domain.type,
 				cmd_ctx->lsm->u.context.channel_name,
-				cmd_ctx->lsm->u.context.event_name,
-				&cmd_ctx->lsm->u.context.ctx);
+				&cmd_ctx->lsm->u.context.ctx, kernel_poll_pipe[1]);
 		break;
 	}
 	case LTTNG_DISABLE_CHANNEL:
@@ -2471,7 +2534,7 @@ skip_domain:
 	{
 		ret = cmd_enable_event(cmd_ctx->session, cmd_ctx->lsm->domain.type,
 				cmd_ctx->lsm->u.enable.channel_name,
-				&cmd_ctx->lsm->u.enable.event, kernel_poll_pipe[1]);
+				&cmd_ctx->lsm->u.enable.event, NULL, kernel_poll_pipe[1]);
 		break;
 	}
 	case LTTNG_ENABLE_ALL_EVENT:
@@ -2480,7 +2543,7 @@ skip_domain:
 
 		ret = cmd_enable_event_all(cmd_ctx->session, cmd_ctx->lsm->domain.type,
 				cmd_ctx->lsm->u.enable.channel_name,
-				cmd_ctx->lsm->u.enable.event.type, kernel_poll_pipe[1]);
+				cmd_ctx->lsm->u.enable.event.type, NULL, kernel_poll_pipe[1]);
 		break;
 	}
 	case LTTNG_LIST_TRACEPOINTS:
@@ -2792,15 +2855,19 @@ skip_domain:
 				cmd_ctx->lsm->u.reg.path, cdata);
 		break;
 	}
-	case LTTNG_SET_FILTER:
+	case LTTNG_ENABLE_EVENT_WITH_FILTER:
 	{
 		struct lttng_filter_bytecode *bytecode;
 
-		if (cmd_ctx->lsm->u.filter.bytecode_len > LTTNG_FILTER_MAX_LEN) {
+		if (cmd_ctx->lsm->u.enable.bytecode_len > LTTNG_FILTER_MAX_LEN) {
 			ret = LTTNG_ERR_FILTER_INVAL;
 			goto error;
 		}
-		bytecode = zmalloc(cmd_ctx->lsm->u.filter.bytecode_len);
+		if (cmd_ctx->lsm->u.enable.bytecode_len == 0) {
+			ret = LTTNG_ERR_FILTER_INVAL;
+			goto error;
+		}
+		bytecode = zmalloc(cmd_ctx->lsm->u.enable.bytecode_len);
 		if (!bytecode) {
 			ret = LTTNG_ERR_FILTER_NOMEM;
 			goto error;
@@ -2808,7 +2875,7 @@ skip_domain:
 		/* Receive var. len. data */
 		DBG("Receiving var len data from client ...");
 		ret = lttcomm_recv_unix_sock(sock, bytecode,
-				cmd_ctx->lsm->u.filter.bytecode_len);
+				cmd_ctx->lsm->u.enable.bytecode_len);
 		if (ret <= 0) {
 			DBG("Nothing recv() from client var len data... continuing");
 			*sock_error = 1;
@@ -2817,16 +2884,15 @@ skip_domain:
 		}
 
 		if (bytecode->len + sizeof(*bytecode)
-				!= cmd_ctx->lsm->u.filter.bytecode_len) {
+				!= cmd_ctx->lsm->u.enable.bytecode_len) {
 			free(bytecode);
 			ret = LTTNG_ERR_FILTER_INVAL;
 			goto error;
 		}
 
-		ret = cmd_set_filter(cmd_ctx->session, cmd_ctx->lsm->domain.type,
-				cmd_ctx->lsm->u.filter.channel_name,
-				cmd_ctx->lsm->u.filter.event_name,
-				bytecode);
+		ret = cmd_enable_event(cmd_ctx->session, cmd_ctx->lsm->domain.type,
+				cmd_ctx->lsm->u.enable.channel_name,
+				&cmd_ctx->lsm->u.enable.event, bytecode, kernel_poll_pipe[1]);
 		break;
 	}
 	case LTTNG_DATA_PENDING:
@@ -2882,6 +2948,12 @@ static void *thread_manage_health(void *data)
 		goto error;
 	}
 
+	/*
+	 * Set the CLOEXEC flag. Return code is useless because either way, the
+	 * show must go on.
+	 */
+	(void) utils_set_fd_cloexec(sock);
+
 	ret = lttcomm_listen_unix_sock(sock);
 	if (ret < 0) {
 		goto error;
 	}
@@ -2905,8 +2977,6 @@
 	while (1) {
 		DBG("Health check ready");
 
-		nb_fd = LTTNG_POLL_GETNB(&events);
-
 		/* Inifinite blocking call, waiting for transmission */
 restart:
 		ret = lttng_poll_wait(&events, -1);
@@ -2920,6 +2990,8 @@ restart:
 			goto error;
 		}
 
+		nb_fd = ret;
+
 		for (i = 0; i < nb_fd; i++) {
 			/* Fetch once the poll data */
 			revents = LTTNG_POLL_GETEV(&events, i);
@@ -2946,6 +3018,12 @@ restart:
 			goto error;
 		}
 
+		/*
+		 * Set the CLOEXEC flag. Return code is useless because either way, the
+		 * show must go on.
+		 */
+		(void) utils_set_fd_cloexec(new_sock);
+
 		DBG("Receiving data from client for health...");
 		ret = lttcomm_recv_unix_sock(new_sock, (void *)&msg, sizeof(msg));
 		if (ret <= 0) {
@@ -3054,15 +3132,17 @@ static void *thread_manage_clients(void *data)
 
 	DBG("[thread] Manage client started");
 
-	testpoint(thread_manage_clients);
-
 	rcu_register_thread();
 
+	if (testpoint(thread_manage_clients)) {
+		goto error_testpoint;
+	}
+
 	health_code_update(&health_thread_cmd);
 
 	ret = lttcomm_listen_unix_sock(client_sock);
 	if (ret < 0) {
-		goto error;
+		goto error_listen;
 	}
 
 	/*
@@ -3071,7 +3151,7 @@ static void *thread_manage_clients(void *data)
 	 */
 	ret = create_thread_poll_set(&events, 2);
 	if (ret < 0) {
-		goto error;
+		goto error_create_poll;
 	}
 
 	/* Add the application registration socket */
@@ -3087,15 +3167,15 @@ static void *thread_manage_clients(void *data)
 		kill(ppid, SIGUSR1);
 	}
 
-	testpoint(thread_manage_clients_before_loop);
+	if (testpoint(thread_manage_clients_before_loop)) {
+		goto error;
+	}
 
 	health_code_update(&health_thread_cmd);
 
 	while (1) {
 		DBG("Accepting client command ...");
 
-		nb_fd = LTTNG_POLL_GETNB(&events);
-
 		/* Inifinite blocking call, waiting for transmission */
 	restart:
 		health_poll_update(&health_thread_cmd);
@@ -3111,6 +3191,8 @@ static void *thread_manage_clients(void *data)
 			goto error;
 		}
 
+		nb_fd = ret;
+
 		for (i = 0; i < nb_fd; i++) {
 			/* Fetch once the poll data */
 			revents = LTTNG_POLL_GETEV(&events, i);
@@ -3143,6 +3225,12 @@ static void *thread_manage_clients(void *data)
 			goto error;
 		}
 
+		/*
+		 * Set the CLOEXEC flag. Return code is useless because either way, the
+		 * show must go on.
+		 */
+		(void) utils_set_fd_cloexec(sock);
+
 		/* Set socket option for credentials retrieval */
 		ret = lttcomm_setsockopt_creds_unix_sock(sock);
 		if (ret < 0) {
@@ -3244,13 +3332,19 @@ static void *thread_manage_clients(void *data)
 
 exit:
 error:
-	if (err) {
-		health_error(&health_thread_cmd);
-		ERR("Health error occurred in %s", __func__);
+	if (sock >= 0) {
+		ret = close(sock);
+		if (ret) {
+			PERROR("close");
+		}
 	}
-	health_exit(&health_thread_cmd);
 
-	DBG("Client thread dying");
+	lttng_poll_clean(&events);
+	clean_command_ctx(&cmd_ctx);
+
+error_listen:
+error_create_poll:
+error_testpoint:
 	unlink(client_unix_sock_path);
 	if (client_sock >= 0) {
 		ret = close(client_sock);
@@ -3258,15 +3352,15 @@ error:
 			PERROR("close");
 		}
 	}
-	if (sock >= 0) {
-		ret = close(sock);
-		if (ret) {
-			PERROR("close");
-		}
+
+	if (err) {
+		health_error(&health_thread_cmd);
+		ERR("Health error occurred in %s", __func__);
 	}
-	lttng_poll_clean(&events);
-	clean_command_ctx(&cmd_ctx);
+	health_exit(&health_thread_cmd);
+
+	DBG("Client thread dying");
 
 	rcu_unregister_thread();
 	return NULL;
@@ -3443,6 +3537,14 @@ static int init_daemon_socket(void)
 		goto end;
 	}
 
+	/* Set the cloexec flag */
+	ret = utils_set_fd_cloexec(client_sock);
+	if (ret < 0) {
+		ERR("Unable to set CLOEXEC flag to the client Unix socket (fd: %d). "
+				"Continuing but note that the consumer daemon will have a "
+				"reference to this socket on exec()", client_sock);
+	}
+
 	/* File permission MUST be 660 */
 	ret = chmod(client_unix_sock_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
 	if (ret < 0) {
@@ -3459,6 +3561,14 @@ static int init_daemon_socket(void)
 		goto end;
 	}
 
+	/* Set the cloexec flag */
+	ret = utils_set_fd_cloexec(apps_sock);
+	if (ret < 0) {
+		ERR("Unable to set CLOEXEC flag to the app Unix socket (fd: %d). "
+				"Continuing but note that the consumer daemon will have a "
+				"reference to this socket on exec()", apps_sock);
+	}
+
 	/* File permission MUST be 666 */
 	ret = chmod(apps_unix_sock_path,
 			S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
@@ -3468,6 +3578,9 @@ static int init_daemon_socket(void)
 		goto end;
 	}
 
+	DBG3("Session daemon client socket %d and application socket %d created",
+			client_sock, apps_sock);
+
 end:
 	umask(old_umask);
 	return ret;
@@ -3727,7 +3840,7 @@ int main(int argc, char **argv)
 {
 	int ret = 0;
 	void *status;
-	const char *home_path;
+	const char *home_path, *env_app_timeout;
 
 	init_kernel_workarounds();
 
@@ -3737,7 +3850,7 @@ int main(int argc, char **argv)
 
 	/* Parse arguments */
 	progname = argv[0];
-	if ((ret = parse_args(argc, argv) < 0)) {
+	if ((ret = parse_args(argc, argv)) < 0) {
 		goto error;
 	}
 
@@ -3964,8 +4077,10 @@ int main(int argc, char **argv)
 	}
 
 	/* Setup the kernel pipe for waking up the kernel thread */
-	if ((ret = utils_create_pipe_cloexec(kernel_poll_pipe)) < 0) {
-		goto exit;
+	if (is_root && !opt_no_kernel) {
+		if ((ret = utils_create_pipe_cloexec(kernel_poll_pipe)) < 0) {
+			goto exit;
+		}
 	}
 
 	/* Setup the thread apps communication pipe. */
@@ -4007,6 +4122,14 @@ int main(int argc, char **argv)
 	health_init(&ustconsumer64_data.health);
 	health_poll_update(&ustconsumer64_data.health);
 
+	/* Check for the application socket timeout env variable. */
+	env_app_timeout = getenv(DEFAULT_APP_SOCKET_TIMEOUT_ENV);
+	if (env_app_timeout) {
+		app_socket_timeout = atoi(env_app_timeout);
+	} else {
+		app_socket_timeout = DEFAULT_APP_SOCKET_RW_TIMEOUT;
+	}
+
 	/* Create thread to manage the client socket */
 	ret = pthread_create(&health_thread, NULL,
 			thread_manage_health, (void *) NULL);
@@ -4047,18 +4170,21 @@ int main(int argc, char **argv)
 		goto exit_apps;
 	}
 
-	/* Create kernel thread to manage kernel event */
-	ret = pthread_create(&kernel_thread, NULL,
-			thread_manage_kernel, (void *) NULL);
-	if (ret != 0) {
-		PERROR("pthread_create kernel");
-		goto exit_kernel;
-	}
+	/* Don't start this thread if kernel tracing is not requested nor root */
+	if (is_root && !opt_no_kernel) {
+		/* Create kernel thread to manage kernel event */
+		ret = pthread_create(&kernel_thread, NULL,
+				thread_manage_kernel, (void *) NULL);
+		if (ret != 0) {
+			PERROR("pthread_create kernel");
+			goto exit_kernel;
+		}
 
-	ret = pthread_join(kernel_thread, &status);
-	if (ret != 0) {
-		PERROR("pthread_join");
-		goto error;	/* join error, exit without cleanup */
+		ret = pthread_join(kernel_thread, &status);
+		if (ret != 0) {
+			PERROR("pthread_join");
+			goto error;	/* join error, exit without cleanup */
+		}
 	}
 
 exit_kernel: