X-Git-Url: https://git.lttng.org/?p=lttng-tools.git;a=blobdiff_plain;f=src%2Fbin%2Flttng-sessiond%2Fmain.c;h=0a156d8e7e1d024c218b03be6a6e81b29ed7979f;hp=85a20d795cb253123c0b2753a02a8fcad98e8436;hb=178191b3899f114001f000c2e7f46909969f9c6f;hpb=a23ec3a78becfaeca0041c4a6747d30b3ac44bfd

diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c
index 85a20d795..0a156d8e7 100644
--- a/src/bin/lttng-sessiond/main.c
+++ b/src/bin/lttng-sessiond/main.c
@@ -61,6 +61,7 @@
 #include "fd-limit.h"
 #include "filter.h"
 #include "health.h"
+#include "testpoint.h"
 
 #define CONSUMERD_FILE	"lttng-consumerd"
 
@@ -227,6 +228,11 @@ struct health_state health_thread_app_manage;
 struct health_state health_thread_app_reg;
 struct health_state health_thread_kernel;
 
+/*
+ * Socket timeout for receiving and sending in seconds.
+ */
+static int app_socket_timeout;
+
 static
 void setup_consumerd_path(void)
 {
@@ -391,7 +397,7 @@ static void stop_threads(void)
 static void cleanup(void)
 {
 	int ret;
-	char *cmd;
+	char *cmd = NULL;
 	struct ltt_session *sess, *stmp;
 
 	DBG("Cleaning up");
@@ -411,6 +417,7 @@ static void cleanup(void)
 		ERR("Unable to clean %s", rundir);
 	}
 	free(cmd);
+	free(rundir);
 
 	DBG("Cleaning up all sessions");
 
@@ -440,9 +447,6 @@ static void cleanup(void)
 		modprobe_remove_lttng_all();
 	}
 
-	utils_close_pipe(kernel_poll_pipe);
-	utils_close_pipe(apps_cmd_pipe);
-
 	/* <fun> */
 	DBG("%c[%d;%dm*** assert failed :-) *** ==> %c[%dm%c[%d;%dm"
 			"Matthew, BEET driven development works!%c[%dm",
@@ -688,8 +692,12 @@ static void *thread_manage_kernel(void *data)
 
 	DBG("Thread manage kernel started");
 
+	testpoint(thread_manage_kernel);
+
 	health_code_update(&health_thread_kernel);
 
+	testpoint(thread_manage_kernel_before_loop);
+
 	ret = create_thread_poll_set(&events, 2);
 	if (ret < 0) {
 		goto error_poll_create;
@@ -787,9 +795,13 @@ exit:
 error:
 	lttng_poll_clean(&events);
 error_poll_create:
+	utils_close_pipe(kernel_poll_pipe);
+	kernel_poll_pipe[0] = kernel_poll_pipe[1] = -1;
 	if (err) {
 		health_error(&health_thread_kernel);
 		ERR("Health error occurred in %s", __func__);
+		WARN("Kernel thread died unexpectedly. "
+				"Kernel tracing can continue but CPU hotplug is disabled.");
 	}
 	health_exit(&health_thread_kernel);
 	DBG("Kernel thread dying");
@@ -832,12 +844,24 @@ static void *thread_manage_consumer(void *data)
 
 	DBG("[thread] Manage consumer started");
 
-	health_code_update(&consumer_data->health);
-
-	ret = lttcomm_listen_unix_sock(consumer_data->err_sock);
-	if (ret < 0) {
-		goto error_listen;
-	}
+	/*
+	 * Since the consumer thread can be spawned at any moment in time, we init
+	 * the health to a poll status (1, which is a valid health over time).
+	 * When the thread starts, we update here the health to a "code" path being
+	 * an even value so this thread, when reaching a poll wait, does not
+	 * trigger an error with an even value.
+	 *
+	 * Here is the use case we avoid.
+	 *
+	 * +1: the first poll update during initialization (main())
+	 * +2 * x: multiple code updates once in this thread.
+	 * +1: poll wait in this thread (being a good health state).
+	 * == even number which after the wait period shows as a bad health.
+	 *
+	 * In a nutshell, the following poll update to the health state brings back
+	 * the state to an even value meaning a code path.
+	 */
+	health_poll_update(&consumer_data->health);
 
 	/*
 	 * Pass 2 as size here for the thread quit pipe and kconsumerd_err_sock.
@@ -848,6 +872,11 @@ static void *thread_manage_consumer(void *data)
 		goto error_poll;
 	}
 
+	/*
+	 * The error socket here is already in a listening state which was done
+	 * just before spawning this thread to avoid a race between the consumer
+	 * daemon exec trying to connect and the listen() call.
+	 */
 	ret = lttng_poll_add(&events, consumer_data->err_sock, LPOLLIN | LPOLLRDHUP);
 	if (ret < 0) {
 		goto error;
@@ -860,6 +889,9 @@ static void *thread_manage_consumer(void *data)
 	/* Inifinite blocking call, waiting for transmission */
 restart:
 	health_poll_update(&consumer_data->health);
+
+	testpoint(thread_manage_consumer);
+
 	ret = lttng_poll_wait(&events, -1);
 	health_poll_update(&consumer_data->health);
 	if (ret < 0) {
@@ -900,6 +932,12 @@ restart:
 		goto error;
 	}
 
+	/*
+	 * Set the CLOEXEC flag. Return code is useless because either way, the
+	 * show must go on.
+	 */
+	(void) utils_set_fd_cloexec(sock);
+
 	health_code_update(&consumer_data->health);
 
 	DBG2("Receiving code from consumer err_sock");
@@ -1034,7 +1072,6 @@ error:
 
 	lttng_poll_clean(&events);
 error_poll:
-error_listen:
 	if (err) {
 		health_error(&consumer_data->health);
 		ERR("Health error occurred in %s", __func__);
@@ -1057,6 +1094,8 @@ static void *thread_manage_apps(void *data)
 
 	DBG("[thread] Manage application started");
 
+	testpoint(thread_manage_apps);
+
 	rcu_register_thread();
 	rcu_thread_online();
 
@@ -1072,6 +1111,8 @@ static void *thread_manage_apps(void *data)
 		goto error;
 	}
 
+	testpoint(thread_manage_apps_before_loop);
+
 	health_code_update(&health_thread_app_manage);
 
 	while (1) {
@@ -1160,16 +1201,22 @@ static void *thread_manage_apps(void *data)
 						ust_app_unregister(ust_cmd.sock);
 					} else {
 						/*
-						 * We just need here to monitor the close of the UST
-						 * socket and poll set monitor those by default.
-						 * Listen on POLLIN (even if we never expect any
-						 * data) to ensure that hangup wakes us.
+						 * Monitor only the error/hangup events of the socket
+						 * (flags OR'ed into one mask). This thread does not
+						 * handle any incoming data from UST (POLLIN).
 						 */
-						ret = lttng_poll_add(&events, ust_cmd.sock, LPOLLIN);
+						ret = lttng_poll_add(&events, ust_cmd.sock,
+								LPOLLERR | LPOLLHUP | LPOLLRDHUP);
 						if (ret < 0) {
 							goto error;
 						}
 
+						/* Set socket timeout for both receiving and sending */
+						(void) lttcomm_setsockopt_rcv_timeout(ust_cmd.sock,
+								app_socket_timeout);
+						(void) lttcomm_setsockopt_snd_timeout(ust_cmd.sock,
+								app_socket_timeout);
+
 						DBG("Apps with sock %d added to poll set",
 								ust_cmd.sock);
 					}
@@ -1204,6 +1251,15 @@ exit:
 error:
 	lttng_poll_clean(&events);
 error_poll_create:
+	utils_close_pipe(apps_cmd_pipe);
+	apps_cmd_pipe[0] = apps_cmd_pipe[1] = -1;
+
+	/*
+	 * We don't clean the UST app hash table here since already registered
+	 * applications can still be controlled so let them be until the session
+	 * daemon dies or the applications stop.
+	 */
+
 	if (err) {
 		health_error(&health_thread_app_manage);
 		ERR("Health error occurred in %s", __func__);
@@ -1253,18 +1309,26 @@ static void *thread_dispatch_ust_registration(void *data)
 			 * call is blocking so we can be assured that the data will be read
 			 * at some point in time or wait to the end of the world :)
 			 */
-			ret = write(apps_cmd_pipe[1], ust_cmd,
-					sizeof(struct ust_command));
-			if (ret < 0) {
-				PERROR("write apps cmd pipe");
-				if (errno == EBADF) {
-					/*
-					 * We can't inform the application thread to process
-					 * registration. We will exit or else application
-					 * registration will not occur and tracing will never
-					 * start.
-					 */
-					goto error;
+			if (apps_cmd_pipe[1] >= 0) {
+				ret = write(apps_cmd_pipe[1], ust_cmd,
+						sizeof(struct ust_command));
+				if (ret < 0) {
+					PERROR("write apps cmd pipe");
+					if (errno == EBADF) {
+						/*
+						 * We can't inform the application thread to process
+						 * registration. We will exit or else application
+						 * registration will not occur and tracing will never
+						 * start.
+						 */
+						goto error;
+					}
+				}
+			} else {
+				/* Application manager thread is not available. */
+				ret = close(ust_cmd->sock);
+				if (ret < 0) {
+					PERROR("close ust_cmd sock");
 				}
 			}
 			free(ust_cmd);
@@ -1295,6 +1359,8 @@ static void *thread_registration_apps(void *data)
 
 	DBG("[thread] Manage application registration started");
 
+	testpoint(thread_registration_apps);
+
 	ret = lttcomm_listen_unix_sock(apps_sock);
 	if (ret < 0) {
 		goto error_listen;
@@ -1368,6 +1434,12 @@ static void *thread_registration_apps(void *data)
 						goto error;
 					}
 
+					/*
+					 * Set the CLOEXEC flag. Return code is useless because
+					 * either way, the show must go on.
+					 */
+					(void) utils_set_fd_cloexec(sock);
+
 					/* Create UST registration command for enqueuing */
 					ust_cmd = zmalloc(sizeof(struct ust_command));
 					if (ust_cmd == NULL) {
@@ -1780,7 +1852,17 @@ error:
  */
 static int start_consumerd(struct consumer_data *consumer_data)
 {
-	int ret;
+	int ret, err;
+
+	/*
+	 * Set the listen() state on the socket since there is a possible race
+	 * between the exec() of the consumer daemon and this call if it were
+	 * placed in the consumer thread. See bug #366 for more details.
+	 */
+	ret = lttcomm_listen_unix_sock(consumer_data->err_sock);
+	if (ret < 0) {
+		goto error;
+	}
 
 	pthread_mutex_lock(&consumer_data->pid_mutex);
 	if (consumer_data->pid != 0) {
@@ -1811,6 +1893,13 @@ end:
 	return 0;
 
 error:
+	/* Cleanup already created socket on error. */
+	if (consumer_data->err_sock >= 0) {
+		err = close(consumer_data->err_sock);
+		if (err < 0) {
+			PERROR("close consumer data error socket");
+		}
+	}
 	return ret;
 }
 
@@ -1912,6 +2001,15 @@ static int copy_session_consumer(int domain, struct ltt_session *session)
 	switch (domain) {
 	case LTTNG_DOMAIN_KERNEL:
 		DBG3("Copying tracing session consumer output in kernel session");
+		/*
+		 * XXX: We should audit the session creation and what this function
+		 * does "extra" in order to avoid a destroy since this function is used
+		 * in the domain session creation (kernel and ust) only. Same for UST
+		 * domain.
+		 */
+		if (session->kernel_session->consumer) {
+			consumer_destroy_output(session->kernel_session->consumer);
+		}
 		session->kernel_session->consumer =
 			consumer_copy_output(session->consumer);
 		/* Ease our life a bit for the next part */
@@ -1920,6 +2018,9 @@ static int copy_session_consumer(int domain, struct ltt_session *session)
 		break;
 	case LTTNG_DOMAIN_UST:
 		DBG3("Copying tracing session consumer output in UST session");
+		if (session->ust_session->consumer) {
+			consumer_destroy_output(session->ust_session->consumer);
+		}
 		session->ust_session->consumer =
 			consumer_copy_output(session->consumer);
 		/* Ease our life a bit for the next part */
@@ -2088,6 +2189,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock,
 	case LTTNG_LIST_DOMAINS:
 	case LTTNG_START_TRACE:
 	case LTTNG_STOP_TRACE:
+	case LTTNG_DATA_PENDING:
 		need_domain = 0;
 		break;
 	default:
@@ -2360,8 +2462,7 @@ skip_domain:
 	{
 		ret = cmd_add_context(cmd_ctx->session, cmd_ctx->lsm->domain.type,
 				cmd_ctx->lsm->u.context.channel_name,
-				cmd_ctx->lsm->u.context.event_name,
-				&cmd_ctx->lsm->u.context.ctx);
+				&cmd_ctx->lsm->u.context.ctx, kernel_poll_pipe[1]);
 		break;
 	}
 	case LTTNG_DISABLE_CHANNEL:
@@ -2526,12 +2627,14 @@ skip_domain:
 			DBG("No URIs received from client... continuing");
 			*sock_error = 1;
 			ret = LTTNG_ERR_SESSION_FAIL;
+			free(uris);
 			goto error;
 		}
 
 		ret = cmd_set_consumer_uri(cmd_ctx->lsm->domain.type, cmd_ctx->session,
 				nb_uri, uris);
 		if (ret != LTTNG_OK) {
+			free(uris);
 			goto error;
 		}
 
@@ -2552,6 +2655,8 @@ skip_domain:
 			}
 		}
 
+		free(uris);
+
 		break;
 	}
 	case LTTNG_START_TRACE:
@@ -2586,12 +2691,14 @@ skip_domain:
 				DBG("No URIs received from client... continuing");
 				*sock_error = 1;
 				ret = LTTNG_ERR_SESSION_FAIL;
+				free(uris);
 				goto error;
 			}
 
 			if (nb_uri == 1 && uris[0].dtype != LTTNG_DST_PATH) {
 				DBG("Creating session with ONE network URI is a bad call");
 				ret = LTTNG_ERR_SESSION_FAIL;
+				free(uris);
 				goto error;
 			}
 		}
@@ -2599,6 +2706,8 @@ skip_domain:
 		ret = cmd_create_session_uri(cmd_ctx->lsm->session.name, uris, nb_uri,
 			&cmd_ctx->creds);
 
+		free(uris);
+
 		break;
 	}
 	case LTTNG_DESTROY_SESSION:
@@ -2770,10 +2879,15 @@ skip_domain:
 
 		ret = cmd_set_filter(cmd_ctx->session, cmd_ctx->lsm->domain.type,
 				cmd_ctx->lsm->u.filter.channel_name,
-				cmd_ctx->lsm->u.filter.event_name,
+				&cmd_ctx->lsm->u.filter.event,
 				bytecode);
 		break;
 	}
+	case LTTNG_DATA_PENDING:
+	{
+		ret = cmd_data_pending(cmd_ctx->session);
+		break;
+	}
 	default:
 		ret = LTTNG_ERR_UND;
 		break;
@@ -2822,6 +2936,12 @@ static void *thread_manage_health(void *data)
 		goto error;
 	}
 
+	/*
+	 * Set the CLOEXEC flag. Return code is useless because either way, the
+	 * show must go on.
+	 */
+	(void) utils_set_fd_cloexec(sock);
+
 	ret = lttcomm_listen_unix_sock(sock);
 	if (ret < 0) {
 		goto error;
@@ -2886,6 +3006,12 @@ restart:
 			goto error;
 		}
 
+		/*
+		 * Set the CLOEXEC flag. Return code is useless because either way, the
+		 * show must go on.
+		 */
+		(void) utils_set_fd_cloexec(new_sock);
+
 		DBG("Receiving data from client for health...");
 		ret = lttcomm_recv_unix_sock(new_sock, (void *)&msg, sizeof(msg));
 		if (ret <= 0) {
@@ -2994,13 +3120,15 @@ static void *thread_manage_clients(void *data)
 
 	DBG("[thread] Manage client started");
 
+	testpoint(thread_manage_clients);
+
 	rcu_register_thread();
 
 	health_code_update(&health_thread_cmd);
 
 	ret = lttcomm_listen_unix_sock(client_sock);
 	if (ret < 0) {
-		goto error;
+		goto error_listen;
 	}
 
 	/*
@@ -3009,7 +3137,7 @@ static void *thread_manage_clients(void *data)
 	 */
 	ret = create_thread_poll_set(&events, 2);
 	if (ret < 0) {
-		goto error;
+		goto error_create_poll;
 	}
 
 	/* Add the application registration socket */
@@ -3025,6 +3153,8 @@ static void *thread_manage_clients(void *data)
 		kill(ppid, SIGUSR1);
 	}
 
+	testpoint(thread_manage_clients_before_loop);
+
 	health_code_update(&health_thread_cmd);
 
 	while (1) {
@@ -3079,6 +3209,12 @@ static void *thread_manage_clients(void *data)
 			goto error;
 		}
 
+		/*
+		 * Set the CLOEXEC flag. Return code is useless because either way, the
+		 * show must go on.
+		 */
+		(void) utils_set_fd_cloexec(sock);
+
 		/* Set socket option for credentials retrieval */
 		ret = lttcomm_setsockopt_creds_unix_sock(sock);
 		if (ret < 0) {
@@ -3180,13 +3316,18 @@ static void *thread_manage_clients(void *data)
 
 exit:
 error:
-	if (err) {
-		health_error(&health_thread_cmd);
-		ERR("Health error occurred in %s", __func__);
+	if (sock >= 0) {
+		ret = close(sock);
+		if (ret) {
+			PERROR("close");
+		}
 	}
-	health_exit(&health_thread_cmd);
 
-	DBG("Client thread dying");
+	lttng_poll_clean(&events);
+	clean_command_ctx(&cmd_ctx);
+
+error_listen:
+error_create_poll:
 	unlink(client_unix_sock_path);
 	if (client_sock >= 0) {
 		ret = close(client_sock);
@@ -3194,15 +3335,15 @@ error:
 			PERROR("close");
 		}
 	}
-	if (sock >= 0) {
-		ret = close(sock);
-		if (ret) {
-			PERROR("close");
-		}
+
+	if (err) {
+		health_error(&health_thread_cmd);
+		ERR("Health error occurred in %s", __func__);
 	}
 
-	lttng_poll_clean(&events);
-	clean_command_ctx(&cmd_ctx);
+	health_exit(&health_thread_cmd);
+
+	DBG("Client thread dying");
 
 	rcu_unregister_thread();
 	return NULL;
@@ -3379,6 +3520,14 @@ static int init_daemon_socket(void)
 		goto end;
 	}
 
+	/* Set the cloexec flag */
+	ret = utils_set_fd_cloexec(client_sock);
+	if (ret < 0) {
+		ERR("Unable to set CLOEXEC flag to the client Unix socket (fd: %d). "
+				"Continuing but note that the consumer daemon will have a "
+				"reference to this socket on exec()", client_sock);
+	}
+
 	/* File permission MUST be 660 */
 	ret = chmod(client_unix_sock_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
 	if (ret < 0) {
@@ -3395,6 +3544,14 @@ static int init_daemon_socket(void)
 		goto end;
 	}
 
+	/* Set the cloexec flag */
+	ret = utils_set_fd_cloexec(apps_sock);
+	if (ret < 0) {
+		ERR("Unable to set CLOEXEC flag to the app Unix socket (fd: %d). "
+				"Continuing but note that the consumer daemon will have a "
+				"reference to this socket on exec()", apps_sock);
+	}
+
 	/* File permission MUST be 666 */
 	ret = chmod(apps_unix_sock_path,
 			S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
@@ -3404,6 +3561,9 @@ static int init_daemon_socket(void)
 		goto end;
 	}
 
+	DBG3("Session daemon client socket %d and application socket %d created",
+			client_sock, apps_sock);
+
 end:
 	umask(old_umask);
 	return ret;
@@ -3663,7 +3823,7 @@ int main(int argc, char **argv)
 {
 	int ret = 0;
 	void *status;
-	const char *home_path;
+	const char *home_path, *env_app_timeout;
 
 	init_kernel_workarounds();
 
@@ -3900,8 +4060,10 @@ int main(int argc, char **argv)
 	}
 
 	/* Setup the kernel pipe for waking up the kernel thread */
-	if ((ret = utils_create_pipe_cloexec(kernel_poll_pipe)) < 0) {
-		goto exit;
+	if (is_root && !opt_no_kernel) {
+		if ((ret = utils_create_pipe_cloexec(kernel_poll_pipe)) < 0) {
+			goto exit;
+		}
 	}
 
 	/* Setup the thread apps communication pipe. */
@@ -3932,7 +4094,9 @@ int main(int argc, char **argv)
 	/*
 	 * Init health counters of the consumer thread. We do a quick hack here to
 	 * the state of the consumer health is fine even if the thread is not
-	 * started.  This is simply to ease our life and has no cost what so ever.
+	 * started. Once the thread starts, the health state is updated with a poll
+	 * value to set a health code path. This is simply to ease our life and has
+	 * no cost what so ever.
 	 */
 	health_init(&kconsumer_data.health);
 	health_poll_update(&kconsumer_data.health);
@@ -3941,6 +4105,14 @@ int main(int argc, char **argv)
 	health_init(&ustconsumer64_data.health);
 	health_poll_update(&ustconsumer64_data.health);
 
+	/* Check for the application socket timeout env variable. */
+	env_app_timeout = getenv(DEFAULT_APP_SOCKET_TIMEOUT_ENV);
+	if (env_app_timeout) {
+		app_socket_timeout = atoi(env_app_timeout);
+	} else {
+		app_socket_timeout = DEFAULT_APP_SOCKET_RW_TIMEOUT;
+	}
+
 	/* Create thread to manage the client socket */
 	ret = pthread_create(&health_thread, NULL,
 			thread_manage_health, (void *) NULL);
@@ -3981,18 +4153,21 @@ int main(int argc, char **argv)
 		goto exit_apps;
 	}
 
-	/* Create kernel thread to manage kernel event */
-	ret = pthread_create(&kernel_thread, NULL,
-			thread_manage_kernel, (void *) NULL);
-	if (ret != 0) {
-		PERROR("pthread_create kernel");
-		goto exit_kernel;
-	}
+	/* Don't start this thread if not root or kernel tracing is disabled */
+	if (is_root && !opt_no_kernel) {
+		/* Create kernel thread to manage kernel event */
+		ret = pthread_create(&kernel_thread, NULL,
+				thread_manage_kernel, (void *) NULL);
+		if (ret != 0) {
+			PERROR("pthread_create kernel");
+			goto exit_kernel;
+		}
 
-	ret = pthread_join(kernel_thread, &status);
-	if (ret != 0) {
-		PERROR("pthread_join");
-		goto error;	/* join error, exit without cleanup */
+		ret = pthread_join(kernel_thread, &status);
+		if (ret != 0) {
+			PERROR("pthread_join");
+			goto error;	/* join error, exit without cleanup */
+		}
 	}
 
 exit_kernel:
@@ -4029,7 +4204,25 @@ exit_dispatch:
 		goto error;	/* join error, exit without cleanup */
 	}
 
+	ret = join_consumer_thread(&ustconsumer32_data);
+	if (ret != 0) {
+		PERROR("join_consumer ust32");
+		goto error;	/* join error, exit without cleanup */
+	}
+
+	ret = join_consumer_thread(&ustconsumer64_data);
+	if (ret != 0) {
+		PERROR("join_consumer ust64");
+		goto error;	/* join error, exit without cleanup */
+	}
+
 exit_client:
+	ret = pthread_join(health_thread, &status);
+	if (ret != 0) {
+		PERROR("pthread_join health thread");
+		goto error;	/* join error, exit without cleanup */
+	}
+
 exit_health:
 exit:
 	/*