X-Git-Url: https://git.lttng.org/?a=blobdiff_plain;f=src%2Fbin%2Flttng-sessiond%2Fmain.c;h=022bbc6d1dc5b293974166f0a92bd04818ba29e2;hb=edb8b045f1abe76804200921e7b2eb42ec66b5e4;hp=41cdf112687c86fc60af8d832b07684dfc4c49bd;hpb=b1d41407438e92d79fa17cde7f6da0cc559386f9;p=lttng-tools.git

diff --git a/src/bin/lttng-sessiond/main.c b/src/bin/lttng-sessiond/main.c
index 41cdf1126..022bbc6d1 100644
--- a/src/bin/lttng-sessiond/main.c
+++ b/src/bin/lttng-sessiond/main.c
@@ -412,6 +412,7 @@ static void cleanup(void)
 		ERR("Unable to clean %s", rundir);
 	}
 	free(cmd);
+	free(rundir);
 
 	DBG("Cleaning up all sessions");
 
@@ -837,12 +838,24 @@ static void *thread_manage_consumer(void *data)
 
 	DBG("[thread] Manage consumer started");
 
-	health_code_update(&consumer_data->health);
-
-	ret = lttcomm_listen_unix_sock(consumer_data->err_sock);
-	if (ret < 0) {
-		goto error_listen;
-	}
+	/*
+	 * Since the consumer thread can be spawned at any moment in time, we init
+	 * the health to a poll status (1, which is a valid health over time).
+	 * When the thread starts, we update here the health to a "code" path being
+	 * an even value so this thread, when reaching a poll wait, does not
+	 * trigger an error with an even value.
+	 *
+	 * Here is the use case we avoid.
+	 *
+	 * +1: the first poll update during initialization (main())
+	 * +2 * x: multiple code update once in this thread.
+	 * +1: poll wait in this thread (being a good health state).
+	 * == even number which after the wait period shows as a bad health.
+	 *
+	 * In a nutshell, the following poll update to the health state brings back
+	 * the state to an even value meaning a code path.
+	 */
+	health_poll_update(&consumer_data->health);
 
 	/*
 	 * Pass 2 as size here for the thread quit pipe and kconsumerd_err_sock.
@@ -853,6 +866,11 @@
 		goto error_poll;
 	}
 
+	/*
+	 * The error socket here is already in a listening state which was done
+	 * just before spawning this thread to avoid a race between the consumer
+	 * daemon exec trying to connect and the listen() call.
+	 */
 	ret = lttng_poll_add(&events, consumer_data->err_sock, LPOLLIN | LPOLLRDHUP);
 	if (ret < 0) {
 		goto error;
 	}
@@ -908,6 +926,12 @@ restart:
 		goto error;
 	}
 
+	/*
+	 * Set the CLOEXEC flag. Return code is useless because either way, the
+	 * show must go on.
+	 */
+	(void) utils_set_fd_cloexec(sock);
+
 	health_code_update(&consumer_data->health);
 
 	DBG2("Receiving code from consumer err_sock");
@@ -1042,7 +1066,6 @@
 error:
 	lttng_poll_clean(&events);
 error_poll:
-error_listen:
 	if (err) {
 		health_error(&consumer_data->health);
 		ERR("Health error occurred in %s", __func__);
@@ -1172,12 +1195,12 @@ static void *thread_manage_apps(void *data)
 					ust_app_unregister(ust_cmd.sock);
 				} else {
 					/*
-					 * We just need here to monitor the close of the UST
-					 * socket and poll set monitor those by default.
-					 * Listen on POLLIN (even if we never expect any
-					 * data) to ensure that hangup wakes us.
+					 * We only monitor the error events of the socket. This
+					 * thread does not handle any incoming data from UST
+					 * (POLLIN).
 					 */
-					ret = lttng_poll_add(&events, ust_cmd.sock, LPOLLIN);
+					ret = lttng_poll_add(&events, ust_cmd.sock,
+							LPOLLERR & LPOLLHUP & LPOLLRDHUP);
 					if (ret < 0) {
 						goto error;
 					}
@@ -1382,6 +1405,12 @@ static void *thread_registration_apps(void *data)
 				goto error;
 			}
 
+			/*
+			 * Set the CLOEXEC flag. Return code is useless because
+			 * either way, the show must go on.
+			 */
+			(void) utils_set_fd_cloexec(sock);
+
 			/* Create UST registration command for enqueuing */
 			ust_cmd = zmalloc(sizeof(struct ust_command));
 			if (ust_cmd == NULL) {
@@ -1794,7 +1823,17 @@ error:
  */
 static int start_consumerd(struct consumer_data *consumer_data)
 {
-	int ret;
+	int ret, err;
+
+	/*
+	 * Set the listen() state on the socket since there is a possible race
+	 * between the exec() of the consumer daemon and this call if place in the
+	 * consumer thread. See bug #366 for more details.
+	 */
+	ret = lttcomm_listen_unix_sock(consumer_data->err_sock);
+	if (ret < 0) {
+		goto error;
+	}
 
 	pthread_mutex_lock(&consumer_data->pid_mutex);
 	if (consumer_data->pid != 0) {
@@ -1825,6 +1864,13 @@ end:
 	return 0;
 
 error:
+	/* Cleanup already created socket on error. */
+	if (consumer_data->err_sock >= 0) {
+		err = close(consumer_data->err_sock);
+		if (err < 0) {
+			PERROR("close consumer data error socket");
+		}
+	}
 	return ret;
 }
 
@@ -2114,7 +2160,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx, int sock,
 	case LTTNG_LIST_DOMAINS:
 	case LTTNG_START_TRACE:
 	case LTTNG_STOP_TRACE:
-	case LTTNG_DATA_AVAILABLE:
+	case LTTNG_DATA_PENDING:
 		need_domain = 0;
 		break;
 	default:
@@ -2388,7 +2434,7 @@ skip_domain:
 		ret = cmd_add_context(cmd_ctx->session, cmd_ctx->lsm->domain.type,
 				cmd_ctx->lsm->u.context.channel_name,
 				cmd_ctx->lsm->u.context.event_name,
-				&cmd_ctx->lsm->u.context.ctx);
+				&cmd_ctx->lsm->u.context.ctx, kernel_poll_pipe[1]);
 		break;
 	}
 	case LTTNG_DISABLE_CHANNEL:
@@ -2809,9 +2855,9 @@ skip_domain:
 				bytecode);
 		break;
 	}
-	case LTTNG_DATA_AVAILABLE:
+	case LTTNG_DATA_PENDING:
 	{
-		ret = cmd_data_available(cmd_ctx->session);
+		ret = cmd_data_pending(cmd_ctx->session);
 		break;
 	}
 	default:
@@ -2862,6 +2908,12 @@ static void *thread_manage_health(void *data)
 		goto error;
 	}
 
+	/*
+	 * Set the CLOEXEC flag. Return code is useless because either way, the
+	 * show must go on.
+	 */
+	(void) utils_set_fd_cloexec(sock);
+
 	ret = lttcomm_listen_unix_sock(sock);
 	if (ret < 0) {
 		goto error;
 	}
@@ -2926,6 +2978,12 @@ restart:
 		goto error;
 	}
 
+	/*
+	 * Set the CLOEXEC flag. Return code is useless because either way, the
+	 * show must go on.
+	 */
+	(void) utils_set_fd_cloexec(new_sock);
+
 	DBG("Receiving data from client for health...");
 	ret = lttcomm_recv_unix_sock(new_sock, (void *)&msg, sizeof(msg));
 	if (ret <= 0) {
@@ -3123,6 +3181,12 @@ static void *thread_manage_clients(void *data)
 		goto error;
 	}
 
+	/*
+	 * Set the CLOEXEC flag. Return code is useless because either way, the
+	 * show must go on.
+	 */
+	(void) utils_set_fd_cloexec(sock);
+
 	/* Set socket option for credentials retrieval */
 	ret = lttcomm_setsockopt_creds_unix_sock(sock);
 	if (ret < 0) {
@@ -3423,6 +3487,14 @@ static int init_daemon_socket(void)
 		goto end;
 	}
 
+	/* Set the cloexec flag */
+	ret = utils_set_fd_cloexec(client_sock);
+	if (ret < 0) {
+		ERR("Unable to set CLOEXEC flag to the client Unix socket (fd: %d). "
+				"Continuing but note that the consumer daemon will have a "
+				"reference to this socket on exec()", client_sock);
+	}
+
 	/* File permission MUST be 660 */
 	ret = chmod(client_unix_sock_path, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
 	if (ret < 0) {
@@ -3439,6 +3511,14 @@
 		goto end;
 	}
 
+	/* Set the cloexec flag */
+	ret = utils_set_fd_cloexec(apps_sock);
+	if (ret < 0) {
+		ERR("Unable to set CLOEXEC flag to the app Unix socket (fd: %d). "
+				"Continuing but note that the consumer daemon will have a "
+				"reference to this socket on exec()", apps_sock);
+	}
+
 	/* File permission MUST be 666 */
 	ret = chmod(apps_unix_sock_path,
 			S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
 	if (ret < 0) {
@@ -3448,6 +3528,9 @@
 		goto end;
 	}
 
+	DBG3("Session daemon client socket %d and application socket %d created",
+			client_sock, apps_sock);
+
 end:
 	umask(old_umask);
 	return ret;
@@ -3976,7 +4059,9 @@ int main(int argc, char **argv)
 	/*
 	 * Init health counters of the consumer thread. We do a quick hack here to
 	 * the state of the consumer health is fine even if the thread is not
-	 * started. This is simply to ease our life and has no cost what so ever.
+	 * started. Once the thread starts, the health state is updated with a poll
+	 * value to set a health code path. This is simply to ease our life and has
+	 * no cost what so ever.
 	 */
 	health_init(&kconsumer_data.health);
 	health_poll_update(&kconsumer_data.health);
@@ -4073,7 +4158,25 @@ exit_dispatch:
 		goto error;	/* join error, exit without cleanup */
 	}
 
+	ret = join_consumer_thread(&ustconsumer32_data);
+	if (ret != 0) {
+		PERROR("join_consumer ust32");
+		goto error;	/* join error, exit without cleanup */
+	}
+
+	ret = join_consumer_thread(&ustconsumer64_data);
+	if (ret != 0) {
+		PERROR("join_consumer ust64");
+		goto error;	/* join error, exit without cleanup */
+	}
+
 exit_client:
+	ret = pthread_join(health_thread, &status);
+	if (ret != 0) {
+		PERROR("pthread_join health thread");
+		goto error;	/* join error, exit without cleanup */
+	}
+
 exit_health:
 exit:
 	/*