Fix: report error to client on consumerd error
[lttng-tools.git] / src / bin / lttng-sessiond / main.c
index 0c4d4b1f9b544f3916caaec859cd6c2cff06a27b..7327c3cb2262ddf4337b46c08f2d02160f1136e7 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <urcu/futex.h>
+#include <urcu/uatomic.h>
 #include <unistd.h>
 #include <config.h>
 
@@ -54,6 +55,7 @@
 #include "shm.h"
 #include "ust-ctl.h"
 #include "utils.h"
+#include "fd-limit.h"
 
 #define CONSUMERD_FILE "lttng-consumerd"
 
@@ -178,6 +180,40 @@ static const char *consumerd64_bin = CONFIG_CONSUMERD64_BIN;
 static const char *consumerd32_libdir = CONFIG_CONSUMERD32_LIBDIR;
 static const char *consumerd64_libdir = CONFIG_CONSUMERD64_LIBDIR;
 
+/*
+ * Consumer daemon state which is changed when spawning it, killing it or in
+ * case of a fatal error.
+ */
+enum consumerd_state {
+       CONSUMER_STARTED = 1,
+       CONSUMER_STOPPED = 2,
+       CONSUMER_ERROR   = 3,
+};
+
+/*
+ * This consumer daemon state is used to validate if a client command will be
+ * able to reach the consumer. If not, the client is informed. For instance,
+ * doing a "lttng start" when the consumer state is set to ERROR will return an
+ * error to the client.
+ *
+ * The following example shows a possible race condition of this scheme:
+ *
+ * consumer thread error happens
+ *                                    client cmd arrives
+ *                                    client cmd checks state -> still OK
+ * consumer thread exits, sets error
+ *                                    client cmd tries to talk to consumer
+ *                                    ...
+ *
+ * However, since the consumer is a different daemon, we have no way of making
+ * sure the command will reach it safely even with this state flag. This is why
+ * we consider that up to the state validation during command processing, the
+ * command is safe. After that, we can not guarantee the correctness of the
+ * client request vis-a-vis the consumer.
+ */
+static enum consumerd_state ust_consumerd_state;
+static enum consumerd_state kernel_consumerd_state;
+
 static
 void setup_consumerd_path(void)
 {
@@ -449,7 +485,6 @@ static void cleanup(void)
                        if (ret) {
                                PERROR("close");
                        }
-                       
                }
        }
        for (i = 0; i < 2; i++) {
@@ -595,7 +630,7 @@ static int send_kconsumer_session_streams(struct consumer_data *consumer_data,
                lkm.u.channel.channel_key = session->metadata->fd;
                lkm.u.channel.max_sb_size = session->metadata->conf->attr.subbuf_size;
                lkm.u.channel.mmap_len = 0;     /* for kernel */
-               DBG("Sending metadata channel %d to consumer", lkm.u.stream.stream_key);
+               DBG("Sending metadata channel %d to consumer", lkm.u.channel.channel_key);
                ret = lttcomm_send_unix_sock(sock, &lkm, sizeof(lkm));
                if (ret < 0) {
                        PERROR("send consumer channel");
@@ -1089,6 +1124,17 @@ restart_poll:
        ERR("consumer return code : %s", lttcomm_get_readable_code(-code));
 
 error:
+       /* Immediately set the consumerd state to error */
+       if (consumer_data->type == LTTNG_CONSUMER_KERNEL) {
+               uatomic_set(&kernel_consumerd_state, CONSUMER_ERROR);
+       } else if (consumer_data->type == LTTNG_CONSUMER64_UST ||
+                       consumer_data->type == LTTNG_CONSUMER32_UST) {
+               uatomic_set(&ust_consumerd_state, CONSUMER_ERROR);
+       } else {
+               /* Code flow error... */
+               assert(0);
+       }
+
        if (consumer_data->err_sock >= 0) {
                ret = close(consumer_data->err_sock);
                if (ret) {
@@ -1424,6 +1470,17 @@ static void *thread_registration_apps(void *data)
                                         * Using message-based transmissions to ensure we don't
                                         * have to deal with partially received messages.
                                         */
+                                       ret = lttng_fd_get(LTTNG_FD_APPS, 1);
+                                       if (ret < 0) {
+                                               ERR("Exhausted file descriptors allowed for applications.");
+                                               free(ust_cmd);
+                                               ret = close(sock);
+                                               if (ret) {
+                                                       PERROR("close");
+                                               }
+                                               sock = -1;
+                                               continue;
+                                       }
                                        ret = lttcomm_recv_unix_sock(sock, &ust_cmd->reg_msg,
                                                        sizeof(struct ust_register_msg));
                                        if (ret < 0 || ret < sizeof(struct ust_register_msg)) {
@@ -1437,6 +1494,7 @@ static void *thread_registration_apps(void *data)
                                                if (ret) {
                                                        PERROR("close");
                                                }
+                                               lttng_fd_put(LTTNG_FD_APPS, 1);
                                                sock = -1;
                                                continue;
                                        }
@@ -1482,6 +1540,7 @@ error:
                if (ret) {
                        PERROR("close");
                }
+               lttng_fd_put(LTTNG_FD_APPS, 1);
        }
        unlink(apps_unix_sock_path);
 
@@ -3303,9 +3362,13 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                break;
        default:
                DBG("Getting session %s by name", cmd_ctx->lsm->session.name);
+               /*
+                * We keep the session list lock across _all_ commands
+                * for now, because the per-session lock does not
+                * handle teardown properly.
+                */
                session_lock_list();
                cmd_ctx->session = session_find_by_name(cmd_ctx->lsm->session.name);
-               session_unlock_list();
                if (cmd_ctx->session == NULL) {
                        if (cmd_ctx->lsm->session.name != NULL) {
                                ret = LTTCOMM_SESS_NOT_FOUND;
@@ -3343,6 +3406,12 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                        }
                }
 
+               /* Consumer is in an ERROR state. Report back to client */
+               if (uatomic_read(&kernel_consumerd_state) == CONSUMER_ERROR) {
+                       ret = LTTCOMM_NO_KERNCONSUMERD;
+                       goto error;
+               }
+
                /* Need a session for kernel command */
                if (need_tracing_session) {
                        if (cmd_ctx->session->kernel_session == NULL) {
@@ -3363,13 +3432,21 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                        ret = LTTCOMM_KERN_CONSUMER_FAIL;
                                        goto error;
                                }
+                               uatomic_set(&kernel_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&kconsumer_data.pid_mutex);
                        }
                }
+
                break;
        case LTTNG_DOMAIN_UST:
        {
+               /* Consumer is in an ERROR state. Report back to client */
+               if (uatomic_read(&ust_consumerd_state) == CONSUMER_ERROR) {
+                       ret = LTTCOMM_NO_USTCONSUMERD;
+                       goto error;
+               }
+
                if (need_tracing_session) {
                        if (cmd_ctx->session->ust_session == NULL) {
                                ret = create_ust_session(cmd_ctx->session,
@@ -3393,6 +3470,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                }
 
                                ust_consumerd64_fd = ustconsumer64_data.cmd_sock;
+                               uatomic_set(&ust_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&ustconsumer64_data.pid_mutex);
                        }
@@ -3407,7 +3485,9 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                        ust_consumerd32_fd = -EINVAL;
                                        goto error;
                                }
+
                                ust_consumerd32_fd = ustconsumer32_data.cmd_sock;
+                               uatomic_set(&ust_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&ustconsumer32_data.pid_mutex);
                        }
@@ -3419,6 +3499,25 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
        }
 skip_domain:
 
+       /* Validate consumer daemon state for start/stop trace commands */
+       if (cmd_ctx->lsm->cmd_type == LTTNG_START_TRACE ||
+                       cmd_ctx->lsm->cmd_type == LTTNG_STOP_TRACE) {
+               switch (cmd_ctx->lsm->domain.type) {
+               case LTTNG_DOMAIN_UST:
+                       if (uatomic_read(&ust_consumerd_state) != CONSUMER_STARTED) {
+                               ret = LTTCOMM_NO_USTCONSUMERD;
+                               goto error;
+                       }
+                       break;
+               case LTTNG_DOMAIN_KERNEL:
+                       if (uatomic_read(&kernel_consumerd_state) != CONSUMER_STARTED) {
+                               ret = LTTCOMM_NO_KERNCONSUMERD;
+                               goto error;
+                       }
+                       break;
+               }
+       }
+
        /*
         * Check that the UID or GID match that of the tracing session.
         * The root user can interact with all sessions.
@@ -3535,6 +3634,11 @@ skip_domain:
        {
                ret = cmd_destroy_session(cmd_ctx->session,
                                cmd_ctx->lsm->session.name);
+               /*
+                * Set session to NULL so we do not unlock it after
+                * free.
+                */
+               cmd_ctx->session = NULL;
                break;
        }
        case LTTNG_LIST_DOMAINS:
@@ -3669,6 +3773,9 @@ setup_error:
        if (cmd_ctx->session) {
                session_unlock(cmd_ctx->session);
        }
+       if (need_tracing_session) {
+               session_unlock_list();
+       }
 init_setup_error:
        return ret;
 }
@@ -4478,6 +4585,10 @@ int main(int argc, char **argv)
                }
        }
 
+       /* Set consumer initial state */
+       kernel_consumerd_state = CONSUMER_STOPPED;
+       ust_consumerd_state = CONSUMER_STOPPED;
+
        DBG("Client socket path %s", client_unix_sock_path);
        DBG("Application socket path %s", apps_unix_sock_path);
        DBG("LTTng run directory path: %s", rundir);
@@ -4544,6 +4655,8 @@ int main(int argc, char **argv)
                /* Set ulimit for open files */
                set_ulimit();
        }
+       /* init lttng_fd tracking must be done after set_ulimit. */
+       lttng_fd_init();
 
        ret = set_consumer_sockets(&ustconsumer64_data, rundir);
        if (ret < 0) {
This page took 0.026699 seconds and 4 git commands to generate.