Fix: report error to client on consumerd error
authorDavid Goulet <dgoulet@efficios.com>
Thu, 19 Apr 2012 14:58:42 +0000 (10:58 -0400)
committerDavid Goulet <dgoulet@efficios.com>
Thu, 19 Apr 2012 15:05:22 +0000 (11:05 -0400)
Adds a consumer daemon state flag that allows us to know the state of
the consumer during client command processing.

Please refer to the comment in lttng-sessiond/main.c above the
definition of those flags for more information and a possible race
condition issue.

This is a quick fix for bug #137 that will be merged in 2.0-stable
branch.

Signed-off-by: David Goulet <dgoulet@efficios.com>
doc/man/lttng.1
src/bin/lttng-sessiond/main.c
src/common/sessiond-comm/sessiond-comm.c
src/common/sessiond-comm/sessiond-comm.h

index 5c873d797cbb07ca2cc3d05cf68caa28e2c389f2..f13d5d341d608b8e6a3c77106bf5e9be9e5ff235 100644 (file)
@@ -639,6 +639,12 @@ Tracing already started
 
 .IP "81"
 Tracing already stopped
+
+.IP "98"
+No UST consumer detected
+
+.IP "99"
+No Kernel consumer detected
 .PP
 .SH "ENVIRONMENT VARIABLES"
 
index 3c917b25f51c87f484a33096873f441c0c2da92b..7327c3cb2262ddf4337b46c08f2d02160f1136e7 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <urcu/futex.h>
+#include <urcu/uatomic.h>
 #include <unistd.h>
 #include <config.h>
 
@@ -179,6 +180,40 @@ static const char *consumerd64_bin = CONFIG_CONSUMERD64_BIN;
 static const char *consumerd32_libdir = CONFIG_CONSUMERD32_LIBDIR;
 static const char *consumerd64_libdir = CONFIG_CONSUMERD64_LIBDIR;
 
+/*
+ * Consumer daemon state which is changed when spawning it, killing it or in
+ * case of a fatal error.
+ */
+enum consumerd_state {
+       CONSUMER_STARTED = 1,
+       CONSUMER_STOPPED = 2,
+       CONSUMER_ERROR   = 3,
+};
+
+/*
+ * This consumer daemon state is used to validate if a client command will be
+ * able to reach the consumer. If not, the client is informed. For instance,
+ * doing a "lttng start" when the consumer state is set to ERROR will return an
+ * error to the client.
+ *
+ * The following example shows a possible race condition of this scheme:
+ *
+ * consumer thread error happens
+ *                                    client cmd arrives
+ *                                    client cmd checks state -> still OK
+ * consumer thread exit, sets error
+ *                                    client cmd try to talk to consumer
+ *                                    ...
+ *
+ * However, since the consumer is a different daemon, we have no way of making
+ * sure the command will reach it safely even with this state flag. This is why
+ * we consider that up to the state validation during command processing, the
+ * command is safe. After that, we can not guarantee the correctness of the
+ * client request vis-a-vis the consumer.
+ */
+static enum consumerd_state ust_consumerd_state;
+static enum consumerd_state kernel_consumerd_state;
+
 static
 void setup_consumerd_path(void)
 {
@@ -450,7 +485,6 @@ static void cleanup(void)
                        if (ret) {
                                PERROR("close");
                        }
-                       
                }
        }
        for (i = 0; i < 2; i++) {
@@ -1090,6 +1124,17 @@ restart_poll:
        ERR("consumer return code : %s", lttcomm_get_readable_code(-code));
 
 error:
+       /* Immediately set the consumerd state to stopped */
+       if (consumer_data->type == LTTNG_CONSUMER_KERNEL) {
+               uatomic_set(&kernel_consumerd_state, CONSUMER_ERROR);
+       } else if (consumer_data->type == LTTNG_CONSUMER64_UST ||
+                       consumer_data->type == LTTNG_CONSUMER32_UST) {
+               uatomic_set(&ust_consumerd_state, CONSUMER_ERROR);
+       } else {
+               /* Code flow error... */
+               assert(0);
+       }
+
        if (consumer_data->err_sock >= 0) {
                ret = close(consumer_data->err_sock);
                if (ret) {
@@ -3361,6 +3406,12 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                        }
                }
 
+               /* Consumer is in an ERROR state. Report back to client */
+               if (uatomic_read(&kernel_consumerd_state) == CONSUMER_ERROR) {
+                       ret = LTTCOMM_NO_KERNCONSUMERD;
+                       goto error;
+               }
+
                /* Need a session for kernel command */
                if (need_tracing_session) {
                        if (cmd_ctx->session->kernel_session == NULL) {
@@ -3381,13 +3432,21 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                        ret = LTTCOMM_KERN_CONSUMER_FAIL;
                                        goto error;
                                }
+                               uatomic_set(&kernel_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&kconsumer_data.pid_mutex);
                        }
                }
+
                break;
        case LTTNG_DOMAIN_UST:
        {
+               /* Consumer is in an ERROR state. Report back to client */
+               if (uatomic_read(&ust_consumerd_state) == CONSUMER_ERROR) {
+                       ret = LTTCOMM_NO_USTCONSUMERD;
+                       goto error;
+               }
+
                if (need_tracing_session) {
                        if (cmd_ctx->session->ust_session == NULL) {
                                ret = create_ust_session(cmd_ctx->session,
@@ -3411,6 +3470,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                }
 
                                ust_consumerd64_fd = ustconsumer64_data.cmd_sock;
+                               uatomic_set(&ust_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&ustconsumer64_data.pid_mutex);
                        }
@@ -3425,7 +3485,9 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                        ust_consumerd32_fd = -EINVAL;
                                        goto error;
                                }
+
                                ust_consumerd32_fd = ustconsumer32_data.cmd_sock;
+                               uatomic_set(&ust_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&ustconsumer32_data.pid_mutex);
                        }
@@ -3437,6 +3499,25 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
        }
 skip_domain:
 
+       /* Validate consumer daemon state when start/stop trace command */
+       if (cmd_ctx->lsm->cmd_type == LTTNG_START_TRACE ||
+                       cmd_ctx->lsm->cmd_type == LTTNG_STOP_TRACE) {
+               switch (cmd_ctx->lsm->domain.type) {
+               case LTTNG_DOMAIN_UST:
+                       if (uatomic_read(&ust_consumerd_state) != CONSUMER_STARTED) {
+                               ret = LTTCOMM_NO_USTCONSUMERD;
+                               goto error;
+                       }
+                       break;
+               case LTTNG_DOMAIN_KERNEL:
+                       if (uatomic_read(&kernel_consumerd_state) != CONSUMER_STARTED) {
+                               ret = LTTCOMM_NO_KERNCONSUMERD;
+                               goto error;
+                       }
+                       break;
+               }
+       }
+
        /*
         * Check that the UID or GID match that of the tracing session.
         * The root user can interact with all sessions.
@@ -4504,6 +4585,10 @@ int main(int argc, char **argv)
                }
        }
 
+       /* Set consumer initial state */
+       kernel_consumerd_state = CONSUMER_STOPPED;
+       ust_consumerd_state = CONSUMER_STOPPED;
+
        DBG("Client socket path %s", client_unix_sock_path);
        DBG("Application socket path %s", apps_unix_sock_path);
        DBG("LTTng run directory path: %s", rundir);
index 11319f333e84f34d30a75132d981444d136f7e24..6ad4a3775198959b1ded8ffc04f50eb8bcbba7ea 100644 (file)
@@ -123,6 +123,8 @@ static const char *lttcomm_readable_code[] = {
        [ LTTCOMM_ERR_INDEX(CONSUMERD_SPLICE_ESPIPE) ] = "consumerd splice ESPIPE",
        [ LTTCOMM_ERR_INDEX(LTTCOMM_NO_EVENT) ] = "Event not found",
        [ LTTCOMM_ERR_INDEX(LTTCOMM_INVALID) ] = "Invalid parameter",
+       [ LTTCOMM_ERR_INDEX(LTTCOMM_NO_USTCONSUMERD) ] = "No UST consumer detected",
+       [ LTTCOMM_ERR_INDEX(LTTCOMM_NO_KERNCONSUMERD) ] = "No kernel consumer detected",
 };
 
 /*
index 6cc98d542e9a0b54797fb143cc588dbb49133348..4d31289a4682a932428cf37e0a11dd29e9b7712d 100644 (file)
@@ -157,6 +157,8 @@ enum lttcomm_return_code {
        CONSUMERD_SPLICE_ENOMEM,                /* ENOMEM from splice(2) */
        CONSUMERD_SPLICE_ESPIPE,                /* ESPIPE from splice(2) */
        LTTCOMM_INVALID,                        /* Invalid parameter */
+       LTTCOMM_NO_USTCONSUMERD,        /* No UST consumer detected */
+       LTTCOMM_NO_KERNCONSUMERD,       /* No Kernel consumer detected */
 
        /* MUST be last element */
        LTTCOMM_NR,                                             /* Last element */
This page took 0.030552 seconds and 4 git commands to generate.