X-Git-Url: https://git.lttng.org/?p=lttng-tools.git;a=blobdiff_plain;f=src%2Fbin%2Flttng-sessiond%2Fmain.c;h=0a156d8e7e1d024c218b03be6a6e81b29ed7979f;hp=b9c2177fa4f24b9fb31a7f75c7744ebbf6ba359d;hb=178191b3899f114001f000c2e7f46909969f9c6f;hpb=c30ce0b3d524a2c15bc688356d50d38fa9b43f85 diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c index b9c2177fa..0a156d8e7 100644 --- a/src/bin/lttng-sessiond/main.c +++ b/src/bin/lttng-sessiond/main.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -62,6 +61,7 @@ #include "fd-limit.h" #include "filter.h" #include "health.h" +#include "testpoint.h" #define CONSUMERD_FILE "lttng-consumerd" @@ -81,7 +81,10 @@ static int is_root; /* Set to 1 if the daemon is running as root */ static pid_t ppid; /* Parent PID for --sig-parent option */ static char *rundir; -/* Consumer daemon specific control data */ +/* + * Consumer daemon specific control data. Every value not initialized here is + * set to 0 by the static definition. + */ static struct consumer_data kconsumer_data = { .type = LTTNG_CONSUMER_KERNEL, .err_unix_sock_path = DEFAULT_KCONSUMERD_ERR_SOCK_PATH, @@ -90,6 +93,8 @@ static struct consumer_data kconsumer_data = { .cmd_sock = -1, .pid_mutex = PTHREAD_MUTEX_INITIALIZER, .lock = PTHREAD_MUTEX_INITIALIZER, + .cond = PTHREAD_COND_INITIALIZER, + .cond_mutex = PTHREAD_MUTEX_INITIALIZER, }; static struct consumer_data ustconsumer64_data = { .type = LTTNG_CONSUMER64_UST, @@ -99,6 +104,8 @@ static struct consumer_data ustconsumer64_data = { .cmd_sock = -1, .pid_mutex = PTHREAD_MUTEX_INITIALIZER, .lock = PTHREAD_MUTEX_INITIALIZER, + .cond = PTHREAD_COND_INITIALIZER, + .cond_mutex = PTHREAD_MUTEX_INITIALIZER, }; static struct consumer_data ustconsumer32_data = { .type = LTTNG_CONSUMER32_UST, @@ -108,6 +115,8 @@ static struct consumer_data ustconsumer32_data = { .cmd_sock = -1, .pid_mutex = PTHREAD_MUTEX_INITIALIZER, .lock = PTHREAD_MUTEX_INITIALIZER, + .cond = PTHREAD_COND_INITIALIZER, + .cond_mutex = PTHREAD_MUTEX_INITIALIZER, }; /* Shared between threads */ @@ -219,6 +228,11 @@ struct health_state health_thread_app_manage; struct health_state health_thread_app_reg; struct health_state health_thread_kernel; +/* + * Socket timeout for receiving and sending in seconds. 
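+ * The value is read in main() from the DEFAULT_APP_SOCKET_TIMEOUT_ENV
+ * environment variable, falling back to DEFAULT_APP_SOCKET_RW_TIMEOUT, and
+ * is applied to each application command socket when it is added to the
+ * poll set in thread_manage_apps().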
+ */ +static int app_socket_timeout; + static void setup_consumerd_path(void) { @@ -383,7 +397,7 @@ static void stop_threads(void) static void cleanup(void) { int ret; - char *cmd; + char *cmd = NULL; struct ltt_session *sess, *stmp; DBG("Cleaning up"); @@ -403,6 +417,7 @@ static void cleanup(void) ERR("Unable to clean %s", rundir); } free(cmd); + free(rundir); DBG("Cleaning up all sessions"); @@ -432,9 +447,6 @@ static void cleanup(void) modprobe_remove_lttng_all(); } - utils_close_pipe(kernel_poll_pipe); - utils_close_pipe(apps_cmd_pipe); - /* */ DBG("%c[%d;%dm*** assert failed :-) *** ==> %c[%dm%c[%d;%dm" "Matthew, BEET driven development works!%c[%dm", @@ -680,8 +692,12 @@ static void *thread_manage_kernel(void *data) DBG("Thread manage kernel started"); + testpoint(thread_manage_kernel); + health_code_update(&health_thread_kernel); + testpoint(thread_manage_kernel_before_loop); + ret = create_thread_poll_set(&events, 2); if (ret < 0) { goto error_poll_create; @@ -779,15 +795,42 @@ exit: error: lttng_poll_clean(&events); error_poll_create: + utils_close_pipe(kernel_poll_pipe); + kernel_poll_pipe[0] = kernel_poll_pipe[1] = -1; if (err) { health_error(&health_thread_kernel); ERR("Health error occurred in %s", __func__); + WARN("Kernel thread died unexpectedly. " + "Kernel tracing can continue but CPU hotplug is disabled."); } health_exit(&health_thread_kernel); DBG("Kernel thread dying"); return NULL; } +/* + * Signal pthread condition of the consumer data that the thread. + */ +static void signal_consumer_condition(struct consumer_data *data, int state) +{ + pthread_mutex_lock(&data->cond_mutex); + + /* + * The state is set before signaling. It can be any value, it's the waiter + * job to correctly interpret this condition variable associated to the + * consumer pthread_cond. + * + * A value of 0 means that the corresponding thread of the consumer data + * was not started. 1 indicates that the thread has started and is ready + * for action. A negative value means that there was an error during the + * thread bootstrap. + */ + data->consumer_thread_is_ready = state; + (void) pthread_cond_signal(&data->cond); + + pthread_mutex_unlock(&data->cond_mutex); +} + /* * This thread manage the consumer error sent back to the session daemon. */ @@ -801,12 +844,24 @@ static void *thread_manage_consumer(void *data) DBG("[thread] Manage consumer started"); - health_code_update(&consumer_data->health); - - ret = lttcomm_listen_unix_sock(consumer_data->err_sock); - if (ret < 0) { - goto error_listen; - } + /* + * Since the consumer thread can be spawned at any moment in time, we init + * the health to a poll status (1, which is a valid health over time). + * When the thread starts, we update here the health to a "code" path being + * an even value so this thread, when reaching a poll wait, does not + * trigger an error with an even value. + * + * Here is the use case we avoid. + * + * +1: the first poll update during initialization (main()) + * +2 * x: multiple code update once in this thread. + * +1: poll wait in this thread (being a good health state). + * == even number which after the wait period shows as a bad health. + * + * In a nutshell, the following poll update to the health state brings back + * the state to an even value meaning a code path. + */ + health_poll_update(&consumer_data->health); /* * Pass 2 as size here for the thread quit pipe and kconsumerd_err_sock. 
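
The parity scheme described in the comment above can be sketched with a small
model. The names and exact increments below are assumptions inferred from that
comment, not the real health.h definitions; the model only illustrates the
convention that an odd counter marks a thread blocked in a poll wait (healthy
for an arbitrary amount of time) while an even counter marks a code path that
must make progress between two health checks.

	#include <stdio.h>

	/* Hypothetical stand-in for struct health_state (see health.h). */
	struct model_health_state {
		unsigned long counter;
	};

	/* Code path update: +2 keeps the parity even, so the thread has to
	 * bump the counter again before the next health check. */
	static void model_code_update(struct model_health_state *s)
	{
		s->counter += 2;
	}

	/* Poll update: +1 flips the parity, marking the entry to (odd) or
	 * the exit from (even) a potentially unbounded poll wait. */
	static void model_poll_update(struct model_health_state *s)
	{
		s->counter += 1;
	}

	/* Healthy if the thread progressed since the last check, or if it is
	 * parked on an odd value, i.e. blocked in a poll wait. */
	static int model_is_healthy(unsigned long before, unsigned long after)
	{
		return after != before || (after & 1);
	}

	int main(void)
	{
		struct model_health_state s = { .counter = 0 };
		unsigned long snapshot;

		model_poll_update(&s);	/* main(): 1, healthy over time */
		model_poll_update(&s);	/* this thread starts: 2, code path */
		model_code_update(&s);	/* code updates: 4, parity preserved */
		model_poll_update(&s);	/* entering lttng_poll_wait(): 5 */
		snapshot = s.counter;
		/* The thread now blocks; health checks sample the counter. */
		printf("healthy while blocked: %d\n",
				model_is_healthy(snapshot, s.counter));
		return 0;
	}

In this model, the initial health_poll_update() done in main() parks a not yet
started consumer thread on an odd, always-healthy value, and the extra
health_poll_update() at the top of this thread brings the counter back to an
even code-path value, so that the +1 done just before lttng_poll_wait() leaves
it odd for the whole blocking period.
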
@@ -817,6 +872,11 @@ static void *thread_manage_consumer(void *data) goto error_poll; } + /* + * The error socket here is already in a listening state which was done + * just before spawning this thread to avoid a race between the consumer + * daemon exec trying to connect and the listen() call. + */ ret = lttng_poll_add(&events, consumer_data->err_sock, LPOLLIN | LPOLLRDHUP); if (ret < 0) { goto error; @@ -829,6 +889,9 @@ static void *thread_manage_consumer(void *data) /* Inifinite blocking call, waiting for transmission */ restart: health_poll_update(&consumer_data->health); + + testpoint(thread_manage_consumer); + ret = lttng_poll_wait(&events, -1); health_poll_update(&consumer_data->health); if (ret < 0) { @@ -869,6 +932,12 @@ restart: goto error; } + /* + * Set the CLOEXEC flag. Return code is useless because either way, the + * show must go on. + */ + (void) utils_set_fd_cloexec(sock); + health_code_update(&consumer_data->health); DBG2("Receiving code from consumer err_sock"); @@ -882,17 +951,17 @@ restart: health_code_update(&consumer_data->health); - if (code == CONSUMERD_COMMAND_SOCK_READY) { + if (code == LTTCOMM_CONSUMERD_COMMAND_SOCK_READY) { consumer_data->cmd_sock = lttcomm_connect_unix_sock(consumer_data->cmd_unix_sock_path); if (consumer_data->cmd_sock < 0) { - sem_post(&consumer_data->sem); + /* On error, signal condition and quit. */ + signal_consumer_condition(consumer_data, -1); PERROR("consumer connect"); goto error; } - /* Signal condition to tell that the kconsumerd is ready */ - sem_post(&consumer_data->sem); - DBG("consumer command socket ready"); + signal_consumer_condition(consumer_data, 1); + DBG("Consumer command socket ready"); } else { ERR("consumer error when waiting for SOCK_READY : %s", lttcomm_get_readable_code(-code)); @@ -1003,7 +1072,6 @@ error: lttng_poll_clean(&events); error_poll: -error_listen: if (err) { health_error(&consumer_data->health); ERR("Health error occurred in %s", __func__); @@ -1026,6 +1094,8 @@ static void *thread_manage_apps(void *data) DBG("[thread] Manage application started"); + testpoint(thread_manage_apps); + rcu_register_thread(); rcu_thread_online(); @@ -1041,6 +1111,8 @@ static void *thread_manage_apps(void *data) goto error; } + testpoint(thread_manage_apps_before_loop); + health_code_update(&health_thread_app_manage); while (1) { @@ -1129,16 +1201,22 @@ static void *thread_manage_apps(void *data) ust_app_unregister(ust_cmd.sock); } else { /* - * We just need here to monitor the close of the UST - * socket and poll set monitor those by default. - * Listen on POLLIN (even if we never expect any - * data) to ensure that hangup wakes us. + * We only monitor the error events of the socket. This + * thread does not handle any incoming data from UST + * (POLLIN). 
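+				 * Note that poll(2) always reports POLLERR and
+				 * POLLHUP in revents, whether or not they were
+				 * requested in the event mask, so errors and
+				 * hangups on the socket still wake this thread.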
*/ - ret = lttng_poll_add(&events, ust_cmd.sock, LPOLLIN); + ret = lttng_poll_add(&events, ust_cmd.sock, + LPOLLERR & LPOLLHUP & LPOLLRDHUP); if (ret < 0) { goto error; } + /* Set socket timeout for both receiving and ending */ + (void) lttcomm_setsockopt_rcv_timeout(ust_cmd.sock, + app_socket_timeout); + (void) lttcomm_setsockopt_snd_timeout(ust_cmd.sock, + app_socket_timeout); + DBG("Apps with sock %d added to poll set", ust_cmd.sock); } @@ -1173,6 +1251,15 @@ exit: error: lttng_poll_clean(&events); error_poll_create: + utils_close_pipe(apps_cmd_pipe); + apps_cmd_pipe[0] = apps_cmd_pipe[1] = -1; + + /* + * We don't clean the UST app hash table here since already registered + * applications can still be controlled so let them be until the session + * daemon dies or the applications stop. + */ + if (err) { health_error(&health_thread_app_manage); ERR("Health error occurred in %s", __func__); @@ -1222,18 +1309,26 @@ static void *thread_dispatch_ust_registration(void *data) * call is blocking so we can be assured that the data will be read * at some point in time or wait to the end of the world :) */ - ret = write(apps_cmd_pipe[1], ust_cmd, - sizeof(struct ust_command)); - if (ret < 0) { - PERROR("write apps cmd pipe"); - if (errno == EBADF) { - /* - * We can't inform the application thread to process - * registration. We will exit or else application - * registration will not occur and tracing will never - * start. - */ - goto error; + if (apps_cmd_pipe[1] >= 0) { + ret = write(apps_cmd_pipe[1], ust_cmd, + sizeof(struct ust_command)); + if (ret < 0) { + PERROR("write apps cmd pipe"); + if (errno == EBADF) { + /* + * We can't inform the application thread to process + * registration. We will exit or else application + * registration will not occur and tracing will never + * start. + */ + goto error; + } + } + } else { + /* Application manager thread is not available. */ + ret = close(ust_cmd->sock); + if (ret < 0) { + PERROR("close ust_cmd sock"); } } free(ust_cmd); @@ -1264,6 +1359,8 @@ static void *thread_registration_apps(void *data) DBG("[thread] Manage application registration started"); + testpoint(thread_registration_apps); + ret = lttcomm_listen_unix_sock(apps_sock); if (ret < 0) { goto error_listen; @@ -1337,6 +1434,12 @@ static void *thread_registration_apps(void *data) goto error; } + /* + * Set the CLOEXEC flag. Return code is useless because + * either way, the show must go on. + */ + (void) utils_set_fd_cloexec(sock); + /* Create UST registration command for enqueuing */ ust_cmd = zmalloc(sizeof(struct ust_command)); if (ust_cmd == NULL) { @@ -1446,59 +1549,110 @@ error_create_poll: */ static int spawn_consumer_thread(struct consumer_data *consumer_data) { - int ret; + int ret, clock_ret; struct timespec timeout; - timeout.tv_sec = DEFAULT_SEM_WAIT_TIMEOUT; - timeout.tv_nsec = 0; + /* Make sure we set the readiness flag to 0 because we are NOT ready */ + consumer_data->consumer_thread_is_ready = 0; - /* Setup semaphore */ - ret = sem_init(&consumer_data->sem, 0, 0); - if (ret < 0) { - PERROR("sem_init consumer semaphore"); + /* Setup pthread condition */ + ret = pthread_condattr_init(&consumer_data->condattr); + if (ret != 0) { + errno = ret; + PERROR("pthread_condattr_init consumer data"); + goto error; + } + + /* + * Set the monotonic clock in order to make sure we DO NOT jump in time + * between the clock_gettime() call and the timedwait call. See bug #324 + * for a more details and how we noticed it. 
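+	 * With the default CLOCK_REALTIME, a wall clock adjustment (e.g. by NTP
+	 * or date(1)) between the two calls moves the absolute timeout, so the
+	 * timedwait below could expire immediately or block far longer than
+	 * DEFAULT_SEM_WAIT_TIMEOUT.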
+ */ + ret = pthread_condattr_setclock(&consumer_data->condattr, CLOCK_MONOTONIC); + if (ret != 0) { + errno = ret; + PERROR("pthread_condattr_setclock consumer data"); + goto error; + } + + ret = pthread_cond_init(&consumer_data->cond, &consumer_data->condattr); + if (ret != 0) { + errno = ret; + PERROR("pthread_cond_init consumer data"); goto error; } - ret = pthread_create(&consumer_data->thread, NULL, - thread_manage_consumer, consumer_data); + ret = pthread_create(&consumer_data->thread, NULL, thread_manage_consumer, + consumer_data); if (ret != 0) { PERROR("pthread_create consumer"); ret = -1; goto error; } + /* We are about to wait on a pthread condition */ + pthread_mutex_lock(&consumer_data->cond_mutex); + /* Get time for sem_timedwait absolute timeout */ - ret = clock_gettime(CLOCK_REALTIME, &timeout); - if (ret < 0) { - PERROR("clock_gettime spawn consumer"); - /* Infinite wait for the kconsumerd thread to be ready */ - ret = sem_wait(&consumer_data->sem); - } else { - /* Normal timeout if the gettime was successful */ - timeout.tv_sec += DEFAULT_SEM_WAIT_TIMEOUT; - ret = sem_timedwait(&consumer_data->sem, &timeout); + clock_ret = clock_gettime(CLOCK_MONOTONIC, &timeout); + /* + * Set the timeout for the condition timed wait even if the clock gettime + * call fails since we might loop on that call and we want to avoid to + * increment the timeout too many times. + */ + timeout.tv_sec += DEFAULT_SEM_WAIT_TIMEOUT; + + /* + * The following loop COULD be skipped in some conditions so this is why we + * set ret to 0 in order to make sure at least one round of the loop is + * done. + */ + ret = 0; + + /* + * Loop until the condition is reached or when a timeout is reached. Note + * that the pthread_cond_timedwait(P) man page specifies that EINTR can NOT + * be returned but the pthread_cond(3), from the glibc-doc, says that it is + * possible. This loop does not take any chances and works with both of + * them. + */ + while (!consumer_data->consumer_thread_is_ready && ret != ETIMEDOUT) { + if (clock_ret < 0) { + PERROR("clock_gettime spawn consumer"); + /* Infinite wait for the consumerd thread to be ready */ + ret = pthread_cond_wait(&consumer_data->cond, + &consumer_data->cond_mutex); + } else { + ret = pthread_cond_timedwait(&consumer_data->cond, + &consumer_data->cond_mutex, &timeout); + } } - if (ret < 0) { - if (errno == ETIMEDOUT) { + /* Release the pthread condition */ + pthread_mutex_unlock(&consumer_data->cond_mutex); + + if (ret != 0) { + errno = ret; + if (ret == ETIMEDOUT) { /* * Call has timed out so we kill the kconsumerd_thread and return * an error. */ - ERR("The consumer thread was never ready. Killing it"); + ERR("Condition timed out. The consumer thread was never ready." + " Killing it"); ret = pthread_cancel(consumer_data->thread); if (ret < 0) { PERROR("pthread_cancel consumer thread"); } } else { - PERROR("semaphore wait failed consumer thread"); + PERROR("pthread_cond_wait failed consumer thread"); } goto error; } pthread_mutex_lock(&consumer_data->pid_mutex); if (consumer_data->pid == 0) { - ERR("Kconsumerd did not start"); + ERR("Consumerd did not start"); pthread_mutex_unlock(&consumer_data->pid_mutex); goto error; } @@ -1698,7 +1852,17 @@ error: */ static int start_consumerd(struct consumer_data *consumer_data) { - int ret; + int ret, err; + + /* + * Set the listen() state on the socket since there is a possible race + * between the exec() of the consumer daemon and this call if place in the + * consumer thread. See bug #366 for more details. 
+ */ + ret = lttcomm_listen_unix_sock(consumer_data->err_sock); + if (ret < 0) { + goto error; + } pthread_mutex_lock(&consumer_data->pid_mutex); if (consumer_data->pid != 0) { @@ -1729,6 +1893,13 @@ end: return 0; error: + /* Cleanup already created socket on error. */ + if (consumer_data->err_sock >= 0) { + err = close(consumer_data->err_sock); + if (err < 0) { + PERROR("close consumer data error socket"); + } + } return ret; } @@ -1791,7 +1962,7 @@ error_version: PERROR("close"); } kernel_tracer_fd = -1; - return LTTCOMM_KERN_VERSION; + return LTTNG_ERR_KERN_VERSION; error_modules: ret = close(kernel_tracer_fd); @@ -1806,9 +1977,9 @@ error: WARN("No kernel tracer available"); kernel_tracer_fd = -1; if (!is_root) { - return LTTCOMM_NEED_ROOT_SESSIOND; + return LTTNG_ERR_NEED_ROOT_SESSIOND; } else { - return LTTCOMM_KERN_NA; + return LTTNG_ERR_KERN_NA; } } @@ -1830,6 +2001,15 @@ static int copy_session_consumer(int domain, struct ltt_session *session) switch (domain) { case LTTNG_DOMAIN_KERNEL: DBG3("Copying tracing session consumer output in kernel session"); + /* + * XXX: We should audit the session creation and what this function + * does "extra" in order to avoid a destroy since this function is used + * in the domain session creation (kernel and ust) only. Same for UST + * domain. + */ + if (session->kernel_session->consumer) { + consumer_destroy_output(session->kernel_session->consumer); + } session->kernel_session->consumer = consumer_copy_output(session->consumer); /* Ease our life a bit for the next part */ @@ -1838,6 +2018,9 @@ static int copy_session_consumer(int domain, struct ltt_session *session) break; case LTTNG_DOMAIN_UST: DBG3("Copying tracing session consumer output in UST session"); + if (session->ust_session->consumer) { + consumer_destroy_output(session->ust_session->consumer); + } session->ust_session->consumer = consumer_copy_output(session->consumer); /* Ease our life a bit for the next part */ @@ -1845,13 +2028,7 @@ static int copy_session_consumer(int domain, struct ltt_session *session) dir_name = DEFAULT_UST_TRACE_DIR; break; default: - ret = LTTCOMM_UNKNOWN_DOMAIN; - goto error; - } - - ret = consumer_set_subdir(session->consumer, session->name); - if (ret < 0) { - ret = LTTCOMM_FATAL; + ret = LTTNG_ERR_UNKNOWN_DOMAIN; goto error; } @@ -1860,7 +2037,7 @@ static int copy_session_consumer(int domain, struct ltt_session *session) sizeof(consumer->subdir) - strlen(consumer->subdir) - 1); DBG3("Copy session consumer subdir %s", consumer->subdir); - ret = LTTCOMM_OK; + ret = LTTNG_OK; error: return ret; @@ -1884,7 +2061,7 @@ static int create_ust_session(struct ltt_session *session, break; default: ERR("Unknown UST domain on create session %d", domain->type); - ret = LTTCOMM_UNKNOWN_DOMAIN; + ret = LTTNG_ERR_UNKNOWN_DOMAIN; goto error; } @@ -1892,7 +2069,7 @@ static int create_ust_session(struct ltt_session *session, lus = trace_ust_create_session(session->path, session->id, domain); if (lus == NULL) { - ret = LTTCOMM_UST_SESS_FAIL; + ret = LTTNG_ERR_UST_SESS_FAIL; goto error; } @@ -1902,11 +2079,11 @@ static int create_ust_session(struct ltt_session *session, /* Copy session output to the newly created UST session */ ret = copy_session_consumer(domain->type, session); - if (ret != LTTCOMM_OK) { + if (ret != LTTNG_OK) { goto error; } - return LTTCOMM_OK; + return LTTNG_OK; error: free(lus); @@ -1925,7 +2102,7 @@ static int create_kernel_session(struct ltt_session *session) ret = kernel_create_session(session, kernel_tracer_fd); if (ret < 0) { - ret = 
LTTCOMM_KERN_SESS_FAIL; + ret = LTTNG_ERR_KERN_SESS_FAIL; goto error; } @@ -1934,7 +2111,7 @@ static int create_kernel_session(struct ltt_session *session) /* Copy session output to the newly created Kernel session */ ret = copy_session_consumer(LTTNG_DOMAIN_KERNEL, session); - if (ret != LTTCOMM_OK) { + if (ret != LTTNG_OK) { goto error; } @@ -1955,7 +2132,7 @@ static int create_kernel_session(struct ltt_session *session) session->kernel_session->uid = session->uid; session->kernel_session->gid = session->gid; - return LTTCOMM_OK; + return LTTNG_OK; error: trace_kernel_destroy_session(session->kernel_session); @@ -1997,7 +2174,7 @@ static unsigned int lttng_sessions_count(uid_t uid, gid_t gid) static int process_client_msg(struct command_ctx *cmd_ctx, int sock, int *sock_error) { - int ret = LTTCOMM_OK; + int ret = LTTNG_OK; int need_tracing_session = 1; int need_domain; @@ -2012,6 +2189,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, case LTTNG_LIST_DOMAINS: case LTTNG_START_TRACE: case LTTNG_STOP_TRACE: + case LTTNG_DATA_PENDING: need_domain = 0; break; default: @@ -2021,9 +2199,9 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, if (opt_no_kernel && need_domain && cmd_ctx->lsm->domain.type == LTTNG_DOMAIN_KERNEL) { if (!is_root) { - ret = LTTCOMM_NEED_ROOT_SESSIOND; + ret = LTTNG_ERR_NEED_ROOT_SESSIOND; } else { - ret = LTTCOMM_KERN_NA; + ret = LTTNG_ERR_KERN_NA; } goto error; } @@ -2032,7 +2210,8 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, if (cmd_ctx->lsm->cmd_type == LTTNG_REGISTER_CONSUMER) { pthread_mutex_lock(&kconsumer_data.pid_mutex); if (kconsumer_data.pid > 0) { - ret = LTTCOMM_KERN_CONSUMER_FAIL; + ret = LTTNG_ERR_KERN_CONSUMER_FAIL; + pthread_mutex_unlock(&kconsumer_data.pid_mutex); goto error; } pthread_mutex_unlock(&kconsumer_data.pid_mutex); @@ -2080,10 +2259,10 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, cmd_ctx->session = session_find_by_name(cmd_ctx->lsm->session.name); if (cmd_ctx->session == NULL) { if (cmd_ctx->lsm->session.name != NULL) { - ret = LTTCOMM_SESS_NOT_FOUND; + ret = LTTNG_ERR_SESS_NOT_FOUND; } else { /* If no session name specified */ - ret = LTTCOMM_SELECT_SESS; + ret = LTTNG_ERR_SELECT_SESS; } goto error; } else { @@ -2103,7 +2282,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, switch (cmd_ctx->lsm->domain.type) { case LTTNG_DOMAIN_KERNEL: if (!is_root) { - ret = LTTCOMM_NEED_ROOT_SESSIOND; + ret = LTTNG_ERR_NEED_ROOT_SESSIOND; goto error; } @@ -2118,7 +2297,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, /* Consumer is in an ERROR state. 
Report back to client */ if (uatomic_read(&kernel_consumerd_state) == CONSUMER_ERROR) { - ret = LTTCOMM_NO_KERNCONSUMERD; + ret = LTTNG_ERR_NO_KERNCONSUMERD; goto error; } @@ -2127,7 +2306,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, if (cmd_ctx->session->kernel_session == NULL) { ret = create_kernel_session(cmd_ctx->session); if (ret < 0) { - ret = LTTCOMM_KERN_SESS_FAIL; + ret = LTTNG_ERR_KERN_SESS_FAIL; goto error; } } @@ -2140,7 +2319,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, pthread_mutex_unlock(&kconsumer_data.pid_mutex); ret = start_consumerd(&kconsumer_data); if (ret < 0) { - ret = LTTCOMM_KERN_CONSUMER_FAIL; + ret = LTTNG_ERR_KERN_CONSUMER_FAIL; goto error; } uatomic_set(&kernel_consumerd_state, CONSUMER_STARTED); @@ -2164,7 +2343,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, { /* Consumer is in an ERROR state. Report back to client */ if (uatomic_read(&ust_consumerd_state) == CONSUMER_ERROR) { - ret = LTTCOMM_NO_USTCONSUMERD; + ret = LTTNG_ERR_NO_USTCONSUMERD; goto error; } @@ -2173,7 +2352,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, if (cmd_ctx->session->ust_session == NULL) { ret = create_ust_session(cmd_ctx->session, &cmd_ctx->lsm->domain); - if (ret != LTTCOMM_OK) { + if (ret != LTTNG_OK) { goto error; } } @@ -2188,7 +2367,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, pthread_mutex_unlock(&ustconsumer64_data.pid_mutex); ret = start_consumerd(&ustconsumer64_data); if (ret < 0) { - ret = LTTCOMM_UST_CONSUMER64_FAIL; + ret = LTTNG_ERR_UST_CONSUMER64_FAIL; uatomic_set(&ust_consumerd64_fd, -EINVAL); goto error; } @@ -2217,7 +2396,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock, pthread_mutex_unlock(&ustconsumer32_data.pid_mutex); ret = start_consumerd(&ustconsumer32_data); if (ret < 0) { - ret = LTTCOMM_UST_CONSUMER32_FAIL; + ret = LTTNG_ERR_UST_CONSUMER32_FAIL; uatomic_set(&ust_consumerd32_fd, -EINVAL); goto error; } @@ -2251,13 +2430,13 @@ skip_domain: switch (cmd_ctx->lsm->domain.type) { case LTTNG_DOMAIN_UST: if (uatomic_read(&ust_consumerd_state) != CONSUMER_STARTED) { - ret = LTTCOMM_NO_USTCONSUMERD; + ret = LTTNG_ERR_NO_USTCONSUMERD; goto error; } break; case LTTNG_DOMAIN_KERNEL: if (uatomic_read(&kernel_consumerd_state) != CONSUMER_STARTED) { - ret = LTTCOMM_NO_KERNCONSUMERD; + ret = LTTNG_ERR_NO_KERNCONSUMERD; goto error; } break; @@ -2272,7 +2451,7 @@ skip_domain: if (!session_access_ok(cmd_ctx->session, LTTNG_SOCK_GET_UID_CRED(&cmd_ctx->creds), LTTNG_SOCK_GET_GID_CRED(&cmd_ctx->creds))) { - ret = LTTCOMM_EPERM; + ret = LTTNG_ERR_EPERM; goto error; } } @@ -2283,8 +2462,7 @@ skip_domain: { ret = cmd_add_context(cmd_ctx->session, cmd_ctx->lsm->domain.type, cmd_ctx->lsm->u.context.channel_name, - cmd_ctx->lsm->u.context.event_name, - &cmd_ctx->lsm->u.context.ctx); + &cmd_ctx->lsm->u.context.ctx, kernel_poll_pipe[1]); break; } case LTTNG_DISABLE_CHANNEL: @@ -2326,7 +2504,7 @@ skip_domain: * be a DOMAIN enuam. */ ret = cmd_enable_consumer(cmd_ctx->lsm->domain.type, cmd_ctx->session); - if (ret != LTTCOMM_OK) { + if (ret != LTTNG_OK) { goto error; } @@ -2366,6 +2544,7 @@ skip_domain: nb_events = cmd_list_tracepoints(cmd_ctx->lsm->domain.type, &events); if (nb_events < 0) { + /* Return value is a negative lttng_error_code. 
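+			 * It is negated below to produce the positive LTTNG_ERR_*
+			 * value sent back in the reply to the client.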
*/ ret = -nb_events; goto error; } @@ -2386,7 +2565,7 @@ skip_domain: free(events); - ret = LTTCOMM_OK; + ret = LTTNG_OK; break; } case LTTNG_LIST_TRACEPOINT_FIELDS: @@ -2397,6 +2576,7 @@ skip_domain: nb_fields = cmd_list_tracepoint_fields(cmd_ctx->lsm->domain.type, &fields); if (nb_fields < 0) { + /* Return value is a negative lttng_error_code. */ ret = -nb_fields; goto error; } @@ -2418,7 +2598,7 @@ skip_domain: free(fields); - ret = LTTCOMM_OK; + ret = LTTNG_OK; break; } case LTTNG_SET_CONSUMER_URI: @@ -2430,13 +2610,13 @@ skip_domain: len = nb_uri * sizeof(struct lttng_uri); if (nb_uri == 0) { - ret = LTTCOMM_INVALID; + ret = LTTNG_ERR_INVALID; goto error; } uris = zmalloc(len); if (uris == NULL) { - ret = LTTCOMM_FATAL; + ret = LTTNG_ERR_FATAL; goto error; } @@ -2446,13 +2626,15 @@ skip_domain: if (ret <= 0) { DBG("No URIs received from client... continuing"); *sock_error = 1; - ret = LTTCOMM_SESSION_FAIL; + ret = LTTNG_ERR_SESSION_FAIL; + free(uris); goto error; } ret = cmd_set_consumer_uri(cmd_ctx->lsm->domain.type, cmd_ctx->session, nb_uri, uris); - if (ret != LTTCOMM_OK) { + if (ret != LTTNG_OK) { + free(uris); goto error; } @@ -2473,6 +2655,8 @@ skip_domain: } } + free(uris); + break; } case LTTNG_START_TRACE: @@ -2496,7 +2680,7 @@ skip_domain: if (nb_uri > 0) { uris = zmalloc(len); if (uris == NULL) { - ret = LTTCOMM_FATAL; + ret = LTTNG_ERR_FATAL; goto error; } @@ -2506,13 +2690,15 @@ skip_domain: if (ret <= 0) { DBG("No URIs received from client... continuing"); *sock_error = 1; - ret = LTTCOMM_SESSION_FAIL; + ret = LTTNG_ERR_SESSION_FAIL; + free(uris); goto error; } if (nb_uri == 1 && uris[0].dtype != LTTNG_DST_PATH) { DBG("Creating session with ONE network URI is a bad call"); - ret = LTTCOMM_SESSION_FAIL; + ret = LTTNG_ERR_SESSION_FAIL; + free(uris); goto error; } } @@ -2520,6 +2706,8 @@ skip_domain: ret = cmd_create_session_uri(cmd_ctx->lsm->session.name, uris, nb_uri, &cmd_ctx->creds); + free(uris); + break; } case LTTNG_DESTROY_SESSION: @@ -2537,6 +2725,7 @@ skip_domain: nb_dom = cmd_list_domains(cmd_ctx->session, &domains); if (nb_dom < 0) { + /* Return value is a negative lttng_error_code. */ ret = -nb_dom; goto error; } @@ -2552,7 +2741,7 @@ skip_domain: free(domains); - ret = LTTCOMM_OK; + ret = LTTNG_OK; break; } case LTTNG_LIST_CHANNELS: @@ -2563,6 +2752,7 @@ skip_domain: nb_chan = cmd_list_channels(cmd_ctx->lsm->domain.type, cmd_ctx->session, &channels); if (nb_chan < 0) { + /* Return value is a negative lttng_error_code. */ ret = -nb_chan; goto error; } @@ -2578,7 +2768,7 @@ skip_domain: free(channels); - ret = LTTCOMM_OK; + ret = LTTNG_OK; break; } case LTTNG_LIST_EVENTS: @@ -2589,6 +2779,7 @@ skip_domain: nb_event = cmd_list_events(cmd_ctx->lsm->domain.type, cmd_ctx->session, cmd_ctx->lsm->u.list.channel_name, &events); if (nb_event < 0) { + /* Return value is a negative lttng_error_code. 
*/ ret = -nb_event; goto error; } @@ -2604,7 +2795,7 @@ skip_domain: free(events); - ret = LTTCOMM_OK; + ret = LTTNG_OK; break; } case LTTNG_LIST_SESSIONS: @@ -2629,7 +2820,7 @@ skip_domain: session_unlock_list(); - ret = LTTCOMM_OK; + ret = LTTNG_OK; break; } case LTTNG_CALIBRATE: @@ -2647,7 +2838,7 @@ skip_domain: cdata = &kconsumer_data; break; default: - ret = LTTCOMM_UND; + ret = LTTNG_ERR_UND; goto error; } @@ -2659,13 +2850,13 @@ skip_domain: { struct lttng_filter_bytecode *bytecode; - if (cmd_ctx->lsm->u.filter.bytecode_len > 65336) { - ret = LTTCOMM_FILTER_INVAL; + if (cmd_ctx->lsm->u.filter.bytecode_len > LTTNG_FILTER_MAX_LEN) { + ret = LTTNG_ERR_FILTER_INVAL; goto error; } bytecode = zmalloc(cmd_ctx->lsm->u.filter.bytecode_len); if (!bytecode) { - ret = LTTCOMM_FILTER_NOMEM; + ret = LTTNG_ERR_FILTER_NOMEM; goto error; } /* Receive var. len. data */ @@ -2675,25 +2866,30 @@ skip_domain: if (ret <= 0) { DBG("Nothing recv() from client var len data... continuing"); *sock_error = 1; - ret = LTTCOMM_FILTER_INVAL; + ret = LTTNG_ERR_FILTER_INVAL; goto error; } if (bytecode->len + sizeof(*bytecode) != cmd_ctx->lsm->u.filter.bytecode_len) { free(bytecode); - ret = LTTCOMM_FILTER_INVAL; + ret = LTTNG_ERR_FILTER_INVAL; goto error; } ret = cmd_set_filter(cmd_ctx->session, cmd_ctx->lsm->domain.type, cmd_ctx->lsm->u.filter.channel_name, - cmd_ctx->lsm->u.filter.event_name, + &cmd_ctx->lsm->u.filter.event, bytecode); break; } + case LTTNG_DATA_PENDING: + { + ret = cmd_data_pending(cmd_ctx->session); + break; + } default: - ret = LTTCOMM_UND; + ret = LTTNG_ERR_UND; break; } @@ -2740,6 +2936,12 @@ static void *thread_manage_health(void *data) goto error; } + /* + * Set the CLOEXEC flag. Return code is useless because either way, the + * show must go on. + */ + (void) utils_set_fd_cloexec(sock); + ret = lttcomm_listen_unix_sock(sock); if (ret < 0) { goto error; @@ -2804,6 +3006,12 @@ restart: goto error; } + /* + * Set the CLOEXEC flag. Return code is useless because either way, the + * show must go on. + */ + (void) utils_set_fd_cloexec(new_sock); + DBG("Receiving data from client for health..."); ret = lttcomm_recv_unix_sock(new_sock, (void *)&msg, sizeof(msg)); if (ret <= 0) { @@ -2843,7 +3051,7 @@ restart: check_consumer_health(); break; default: - reply.ret_code = LTTCOMM_UND; + reply.ret_code = LTTNG_ERR_UND; break; } @@ -2912,13 +3120,15 @@ static void *thread_manage_clients(void *data) DBG("[thread] Manage client started"); + testpoint(thread_manage_clients); + rcu_register_thread(); health_code_update(&health_thread_cmd); ret = lttcomm_listen_unix_sock(client_sock); if (ret < 0) { - goto error; + goto error_listen; } /* @@ -2927,7 +3137,7 @@ static void *thread_manage_clients(void *data) */ ret = create_thread_poll_set(&events, 2); if (ret < 0) { - goto error; + goto error_create_poll; } /* Add the application registration socket */ @@ -2943,6 +3153,8 @@ static void *thread_manage_clients(void *data) kill(ppid, SIGUSR1); } + testpoint(thread_manage_clients_before_loop); + health_code_update(&health_thread_cmd); while (1) { @@ -2997,6 +3209,12 @@ static void *thread_manage_clients(void *data) goto error; } + /* + * Set the CLOEXEC flag. Return code is useless because either way, the + * show must go on. 
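+	 * Without the flag, this client socket would be inherited by any
+	 * process spawned through exec(), such as the consumer daemon.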
+ */ + (void) utils_set_fd_cloexec(sock); + /* Set socket option for credentials retrieval */ ret = lttcomm_setsockopt_creds_unix_sock(sock); if (ret < 0) { @@ -3098,13 +3316,18 @@ static void *thread_manage_clients(void *data) exit: error: - if (err) { - health_error(&health_thread_cmd); - ERR("Health error occurred in %s", __func__); + if (sock >= 0) { + ret = close(sock); + if (ret) { + PERROR("close"); + } } - health_exit(&health_thread_cmd); - DBG("Client thread dying"); + lttng_poll_clean(&events); + clean_command_ctx(&cmd_ctx); + +error_listen: +error_create_poll: unlink(client_unix_sock_path); if (client_sock >= 0) { ret = close(client_sock); @@ -3112,15 +3335,15 @@ error: PERROR("close"); } } - if (sock >= 0) { - ret = close(sock); - if (ret) { - PERROR("close"); - } + + if (err) { + health_error(&health_thread_cmd); + ERR("Health error occurred in %s", __func__); } - lttng_poll_clean(&events); - clean_command_ctx(&cmd_ctx); + health_exit(&health_thread_cmd); + + DBG("Client thread dying"); rcu_unregister_thread(); return NULL; @@ -3297,6 +3520,14 @@ static int init_daemon_socket(void) goto end; } + /* Set the cloexec flag */ + ret = utils_set_fd_cloexec(client_sock); + if (ret < 0) { + ERR("Unable to set CLOEXEC flag to the client Unix socket (fd: %d). " + "Continuing but note that the consumer daemon will have a " + "reference to this socket on exec()", client_sock); + } + /* File permission MUST be 660 */ ret = chmod(client_unix_sock_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); if (ret < 0) { @@ -3313,6 +3544,14 @@ static int init_daemon_socket(void) goto end; } + /* Set the cloexec flag */ + ret = utils_set_fd_cloexec(apps_sock); + if (ret < 0) { + ERR("Unable to set CLOEXEC flag to the app Unix socket (fd: %d). " + "Continuing but note that the consumer daemon will have a " + "reference to this socket on exec()", apps_sock); + } + /* File permission MUST be 666 */ ret = chmod(apps_unix_sock_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH); @@ -3322,6 +3561,9 @@ static int init_daemon_socket(void) goto end; } + DBG3("Session daemon client socket %d and application socket %d created", + client_sock, apps_sock); + end: umask(old_umask); return ret; @@ -3581,7 +3823,7 @@ int main(int argc, char **argv) { int ret = 0; void *status; - const char *home_path; + const char *home_path, *env_app_timeout; init_kernel_workarounds(); @@ -3818,8 +4060,10 @@ int main(int argc, char **argv) } /* Setup the kernel pipe for waking up the kernel thread */ - if ((ret = utils_create_pipe_cloexec(kernel_poll_pipe)) < 0) { - goto exit; + if (is_root && !opt_no_kernel) { + if ((ret = utils_create_pipe_cloexec(kernel_poll_pipe)) < 0) { + goto exit; + } } /* Setup the thread apps communication pipe. */ @@ -3850,7 +4094,9 @@ int main(int argc, char **argv) /* * Init health counters of the consumer thread. We do a quick hack here to * the state of the consumer health is fine even if the thread is not - * started. This is simply to ease our life and has no cost what so ever. + * started. Once the thread starts, the health state is updated with a poll + * value to set a health code path. This is simply to ease our life and has + * no cost what so ever. */ health_init(&kconsumer_data.health); health_poll_update(&kconsumer_data.health); @@ -3859,6 +4105,14 @@ int main(int argc, char **argv) health_init(&ustconsumer64_data.health); health_poll_update(&ustconsumer64_data.health); + /* Check for the application socket timeout env variable. 
*/ + env_app_timeout = getenv(DEFAULT_APP_SOCKET_TIMEOUT_ENV); + if (env_app_timeout) { + app_socket_timeout = atoi(env_app_timeout); + } else { + app_socket_timeout = DEFAULT_APP_SOCKET_RW_TIMEOUT; + } + /* Create thread to manage the client socket */ ret = pthread_create(&health_thread, NULL, thread_manage_health, (void *) NULL); @@ -3899,18 +4153,21 @@ int main(int argc, char **argv) goto exit_apps; } - /* Create kernel thread to manage kernel event */ - ret = pthread_create(&kernel_thread, NULL, - thread_manage_kernel, (void *) NULL); - if (ret != 0) { - PERROR("pthread_create kernel"); - goto exit_kernel; - } + /* Don't start this thread if kernel tracing is not requested nor root */ + if (is_root && !opt_no_kernel) { + /* Create kernel thread to manage kernel event */ + ret = pthread_create(&kernel_thread, NULL, + thread_manage_kernel, (void *) NULL); + if (ret != 0) { + PERROR("pthread_create kernel"); + goto exit_kernel; + } - ret = pthread_join(kernel_thread, &status); - if (ret != 0) { - PERROR("pthread_join"); - goto error; /* join error, exit without cleanup */ + ret = pthread_join(kernel_thread, &status); + if (ret != 0) { + PERROR("pthread_join"); + goto error; /* join error, exit without cleanup */ + } } exit_kernel: @@ -3947,7 +4204,25 @@ exit_dispatch: goto error; /* join error, exit without cleanup */ } + ret = join_consumer_thread(&ustconsumer32_data); + if (ret != 0) { + PERROR("join_consumer ust32"); + goto error; /* join error, exit without cleanup */ + } + + ret = join_consumer_thread(&ustconsumer64_data); + if (ret != 0) { + PERROR("join_consumer ust64"); + goto error; /* join error, exit without cleanup */ + } + exit_client: + ret = pthread_join(health_thread, &status); + if (ret != 0) { + PERROR("pthread_join health thread"); + goto error; /* join error, exit without cleanup */ + } + exit_health: exit: /*