Fix: report error to client on consumerd error
[lttng-tools.git] / src / bin / lttng-sessiond / main.c
index 2ef18ae09f5a9291073c6763c863a1e5a610048d..7327c3cb2262ddf4337b46c08f2d02160f1136e7 100644 (file)
@@ -2,18 +2,18 @@
  * Copyright (C) 2011 - David Goulet <david.goulet@polymtl.ca>
  *                      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; only version 2 of the License.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2 only,
+ * as published by the Free Software Foundation.
  *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA  02111-1307, USA.
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
 #define _GNU_SOURCE
@@ -34,6 +34,7 @@
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <urcu/futex.h>
+#include <urcu/uatomic.h>
 #include <unistd.h>
 #include <config.h>
 
@@ -54,6 +55,7 @@
 #include "shm.h"
 #include "ust-ctl.h"
 #include "utils.h"
+#include "fd-limit.h"
 
 #define CONSUMERD_FILE "lttng-consumerd"
 
@@ -81,14 +83,10 @@ const char default_tracing_group[] = DEFAULT_TRACING_GROUP;
 const char default_ust_sock_dir[] = DEFAULT_UST_SOCK_DIR;
 const char default_global_apps_pipe[] = DEFAULT_GLOBAL_APPS_PIPE;
 
-/* Variables */
-int opt_verbose;    /* Not static for lttngerr.h */
-int opt_verbose_consumer;    /* Not static for lttngerr.h */
-int opt_quiet;      /* Not static for lttngerr.h */
-
 const char *progname;
 const char *opt_tracing_group;
 static int opt_sig_parent;
+static int opt_verbose_consumer;
 static int opt_daemon;
 static int opt_no_kernel;
 static int is_root;                    /* Set to 1 if the daemon is running as root */
@@ -100,16 +98,22 @@ static struct consumer_data kconsumer_data = {
        .type = LTTNG_CONSUMER_KERNEL,
        .err_unix_sock_path = DEFAULT_KCONSUMERD_ERR_SOCK_PATH,
        .cmd_unix_sock_path = DEFAULT_KCONSUMERD_CMD_SOCK_PATH,
+       .err_sock = -1,
+       .cmd_sock = -1,
 };
 static struct consumer_data ustconsumer64_data = {
        .type = LTTNG_CONSUMER64_UST,
        .err_unix_sock_path = DEFAULT_USTCONSUMERD64_ERR_SOCK_PATH,
        .cmd_unix_sock_path = DEFAULT_USTCONSUMERD64_CMD_SOCK_PATH,
+       .err_sock = -1,
+       .cmd_sock = -1,
 };
 static struct consumer_data ustconsumer32_data = {
        .type = LTTNG_CONSUMER32_UST,
        .err_unix_sock_path = DEFAULT_USTCONSUMERD32_ERR_SOCK_PATH,
        .cmd_unix_sock_path = DEFAULT_USTCONSUMERD32_CMD_SOCK_PATH,
+       .err_sock = -1,
+       .cmd_sock = -1,
 };
 
 static int dispatch_thread_exit;
@@ -176,6 +180,40 @@ static const char *consumerd64_bin = CONFIG_CONSUMERD64_BIN;
 static const char *consumerd32_libdir = CONFIG_CONSUMERD32_LIBDIR;
 static const char *consumerd64_libdir = CONFIG_CONSUMERD64_LIBDIR;
 
+/*
+ * Consumer daemon state which is changed when spawning it, killing it or in
+ * case of a fatal error.
+ */
+enum consumerd_state {
+       CONSUMER_STARTED = 1,
+       CONSUMER_STOPPED = 2,
+       CONSUMER_ERROR   = 3,
+};
+
+/*
+ * This consumer daemon state is used to validate if a client command will be
+ * able to reach the consumer. If not, the client is informed. For instance,
+ * doing a "lttng start" when the consumer state is set to ERROR will return an
+ * error to the client.
+ *
+ * The following example shows a possible race condition of this scheme:
+ *
+ * consumer thread error happens
+ *                                    client cmd arrives
+ *                                    client cmd checks state -> still OK
+ * consumer thread exit, sets error
+ *                                    client cmd try to talk to consumer
+ *                                    ...
+ *
+ * However, since the consumer is a different daemon, we have no way of making
+ * sure the command will reach it safely even with this state flag. This is why
+ * we consider that up to the state validation during command processing, the
+ * command is safe. After that, we can not guarantee the correctness of the
+ * client request vis-a-vis the consumer.
+ */
+static enum consumerd_state ust_consumerd_state;
+static enum consumerd_state kernel_consumerd_state;
+
 static
 void setup_consumerd_path(void)
 {
@@ -332,7 +370,8 @@ static void teardown_kernel_session(struct ltt_session *session)
         * If a custom kernel consumer was registered, close the socket before
         * tearing down the complete kernel session structure
         */
-       if (session->kernel_session->consumer_fd != kconsumer_data.cmd_sock) {
+       if (kconsumer_data.cmd_sock >= 0 &&
+                       session->kernel_session->consumer_fd != kconsumer_data.cmd_sock) {
                lttcomm_close_unix_sock(session->kernel_session->consumer_fd);
        }
 
@@ -446,7 +485,6 @@ static void cleanup(void)
                        if (ret) {
                                PERROR("close");
                        }
-                       
                }
        }
        for (i = 0; i < 2; i++) {
@@ -581,18 +619,18 @@ static int send_kconsumer_session_streams(struct consumer_data *consumer_data,
 
        DBG("Sending metadata stream fd");
 
-       /* Extra protection. It's NOT supposed to be set to 0 at this point */
-       if (session->consumer_fd == 0) {
+       /* Extra protection. It's NOT supposed to be set to -1 at this point */
+       if (session->consumer_fd < 0) {
                session->consumer_fd = consumer_data->cmd_sock;
        }
 
-       if (session->metadata_stream_fd != 0) {
+       if (session->metadata_stream_fd >= 0) {
                /* Send metadata channel fd */
                lkm.cmd_type = LTTNG_CONSUMER_ADD_CHANNEL;
                lkm.u.channel.channel_key = session->metadata->fd;
                lkm.u.channel.max_sb_size = session->metadata->conf->attr.subbuf_size;
                lkm.u.channel.mmap_len = 0;     /* for kernel */
-               DBG("Sending metadata channel %d to consumer", lkm.u.stream.stream_key);
+               DBG("Sending metadata channel %d to consumer", lkm.u.channel.channel_key);
                ret = lttcomm_send_unix_sock(sock, &lkm, sizeof(lkm));
                if (ret < 0) {
                        PERROR("send consumer channel");
@@ -760,8 +798,8 @@ static int update_kernel_stream(struct consumer_data *consumer_data, int fd)
                        continue;
                }
 
-               /* This is not suppose to be 0 but this is an extra security check */
-               if (session->kernel_session->consumer_fd == 0) {
+               /* This is not suppose to be -1 but this is an extra security check */
+               if (session->kernel_session->consumer_fd < 0) {
                        session->kernel_session->consumer_fd = consumer_data->cmd_sock;
                }
 
@@ -1086,6 +1124,17 @@ restart_poll:
        ERR("consumer return code : %s", lttcomm_get_readable_code(-code));
 
 error:
+       /* Immediately set the consumerd state to stopped */
+       if (consumer_data->type == LTTNG_CONSUMER_KERNEL) {
+               uatomic_set(&kernel_consumerd_state, CONSUMER_ERROR);
+       } else if (consumer_data->type == LTTNG_CONSUMER64_UST ||
+                       consumer_data->type == LTTNG_CONSUMER32_UST) {
+               uatomic_set(&ust_consumerd_state, CONSUMER_ERROR);
+       } else {
+               /* Code flow error... */
+               assert(0);
+       }
+
        if (consumer_data->err_sock >= 0) {
                ret = close(consumer_data->err_sock);
                if (ret) {
@@ -1421,6 +1470,17 @@ static void *thread_registration_apps(void *data)
                                         * Using message-based transmissions to ensure we don't
                                         * have to deal with partially received messages.
                                         */
+                                       ret = lttng_fd_get(LTTNG_FD_APPS, 1);
+                                       if (ret < 0) {
+                                               ERR("Exhausted file descriptors allowed for applications.");
+                                               free(ust_cmd);
+                                               ret = close(sock);
+                                               if (ret) {
+                                                       PERROR("close");
+                                               }
+                                               sock = -1;
+                                               continue;
+                                       }
                                        ret = lttcomm_recv_unix_sock(sock, &ust_cmd->reg_msg,
                                                        sizeof(struct ust_register_msg));
                                        if (ret < 0 || ret < sizeof(struct ust_register_msg)) {
@@ -1434,11 +1494,13 @@ static void *thread_registration_apps(void *data)
                                                if (ret) {
                                                        PERROR("close");
                                                }
+                                               lttng_fd_put(LTTNG_FD_APPS, 1);
                                                sock = -1;
                                                continue;
                                        }
 
                                        ust_cmd->sock = sock;
+                                       sock = -1;
 
                                        DBG("UST registration received with pid:%d ppid:%d uid:%d"
                                                        " gid:%d sock:%d name:%s (version %d.%d)",
@@ -1473,11 +1535,12 @@ error:
                        PERROR("close");
                }
        }
-       if (clock >= 0) {
+       if (sock >= 0) {
                ret = close(sock);
                if (ret) {
                        PERROR("close");
                }
+               lttng_fd_put(LTTNG_FD_APPS, 1);
        }
        unlink(apps_unix_sock_path);
 
@@ -1845,7 +1908,11 @@ error_open:
 error:
        WARN("No kernel tracer available");
        kernel_tracer_fd = -1;
-       return LTTCOMM_KERN_NA;
+       if (!is_root) {
+               return LTTCOMM_NEED_ROOT_SESSIOND;
+       } else {
+               return LTTCOMM_KERN_NA;
+       }
 }
 
 /*
@@ -1858,10 +1925,10 @@ static int init_kernel_tracing(struct ltt_kernel_session *session)
        if (session->consumer_fds_sent == 0) {
                /*
                 * Assign default kernel consumer socket if no consumer assigned to the
-                * kernel session. At this point, it's NOT suppose to be 0 but this is
+                * kernel session. At this point, it's NOT supposed to be -1 but this is
                 * an extra security check.
                 */
-               if (session->consumer_fd == 0) {
+               if (session->consumer_fd < 0) {
                        session->consumer_fd = kconsumer_data.cmd_sock;
                }
 
@@ -1949,7 +2016,7 @@ static int create_kernel_session(struct ltt_session *session)
        }
 
        /* Set kernel consumer socket fd */
-       if (kconsumer_data.cmd_sock) {
+       if (kconsumer_data.cmd_sock >= 0) {
                session->kernel_session->consumer_fd = kconsumer_data.cmd_sock;
        }
 
@@ -2838,7 +2905,8 @@ static int cmd_start_trace(struct ltt_session *session)
        usess = session->ust_session;
 
        if (session->enabled) {
-               ret = LTTCOMM_UST_START_FAIL;
+               /* Already started. */
+               ret = LTTCOMM_TRACE_ALREADY_STARTED;
                goto error;
        }
 
@@ -2858,7 +2926,7 @@ static int cmd_start_trace(struct ltt_session *session)
                }
 
                /* Open kernel metadata stream */
-               if (ksession->metadata_stream_fd == 0) {
+               if (ksession->metadata_stream_fd < 0) {
                        ret = kernel_open_metadata_stream(ksession);
                        if (ret < 0) {
                                ERR("Kernel create metadata stream failed");
@@ -2930,7 +2998,7 @@ static int cmd_stop_trace(struct ltt_session *session)
        usess = session->ust_session;
 
        if (!session->enabled) {
-               ret = LTTCOMM_UST_STOP_FAIL;
+               ret = LTTCOMM_TRACE_ALREADY_STOPPED;
                goto error;
        }
 
@@ -3255,7 +3323,11 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
 
        if (opt_no_kernel && need_domain
                        && cmd_ctx->lsm->domain.type == LTTNG_DOMAIN_KERNEL) {
-               ret = LTTCOMM_KERN_NA;
+               if (!is_root) {
+                       ret = LTTCOMM_NEED_ROOT_SESSIOND;
+               } else {
+                       ret = LTTCOMM_KERN_NA;
+               }
                goto error;
        }
 
@@ -3290,9 +3362,13 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                break;
        default:
                DBG("Getting session %s by name", cmd_ctx->lsm->session.name);
+               /*
+                * We keep the session list lock across _all_ commands
+                * for now, because the per-session lock does not
+                * handle teardown properly.
+                */
                session_lock_list();
                cmd_ctx->session = session_find_by_name(cmd_ctx->lsm->session.name);
-               session_unlock_list();
                if (cmd_ctx->session == NULL) {
                        if (cmd_ctx->lsm->session.name != NULL) {
                                ret = LTTCOMM_SESS_NOT_FOUND;
@@ -3317,7 +3393,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
        switch (cmd_ctx->lsm->domain.type) {
        case LTTNG_DOMAIN_KERNEL:
                if (!is_root) {
-                       ret = LTTCOMM_KERN_NA;
+                       ret = LTTCOMM_NEED_ROOT_SESSIOND;
                        goto error;
                }
 
@@ -3330,6 +3406,12 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                        }
                }
 
+               /* Consumer is in an ERROR state. Report back to client */
+               if (uatomic_read(&kernel_consumerd_state) == CONSUMER_ERROR) {
+                       ret = LTTCOMM_NO_KERNCONSUMERD;
+                       goto error;
+               }
+
                /* Need a session for kernel command */
                if (need_tracing_session) {
                        if (cmd_ctx->session->kernel_session == NULL) {
@@ -3350,13 +3432,21 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                        ret = LTTCOMM_KERN_CONSUMER_FAIL;
                                        goto error;
                                }
+                               uatomic_set(&kernel_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&kconsumer_data.pid_mutex);
                        }
                }
+
                break;
        case LTTNG_DOMAIN_UST:
        {
+               /* Consumer is in an ERROR state. Report back to client */
+               if (uatomic_read(&ust_consumerd_state) == CONSUMER_ERROR) {
+                       ret = LTTCOMM_NO_USTCONSUMERD;
+                       goto error;
+               }
+
                if (need_tracing_session) {
                        if (cmd_ctx->session->ust_session == NULL) {
                                ret = create_ust_session(cmd_ctx->session,
@@ -3380,6 +3470,7 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                }
 
                                ust_consumerd64_fd = ustconsumer64_data.cmd_sock;
+                               uatomic_set(&ust_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&ustconsumer64_data.pid_mutex);
                        }
@@ -3394,7 +3485,9 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
                                        ust_consumerd32_fd = -EINVAL;
                                        goto error;
                                }
+
                                ust_consumerd32_fd = ustconsumer32_data.cmd_sock;
+                               uatomic_set(&ust_consumerd_state, CONSUMER_STARTED);
                        } else {
                                pthread_mutex_unlock(&ustconsumer32_data.pid_mutex);
                        }
@@ -3406,6 +3499,25 @@ static int process_client_msg(struct command_ctx *cmd_ctx)
        }
 skip_domain:
 
+       /* Validate consumer daemon state when start/stop trace command */
+       if (cmd_ctx->lsm->cmd_type == LTTNG_START_TRACE ||
+                       cmd_ctx->lsm->cmd_type == LTTNG_STOP_TRACE) {
+               switch (cmd_ctx->lsm->domain.type) {
+               case LTTNG_DOMAIN_UST:
+                       if (uatomic_read(&ust_consumerd_state) != CONSUMER_STARTED) {
+                               ret = LTTCOMM_NO_USTCONSUMERD;
+                               goto error;
+                       }
+                       break;
+               case LTTNG_DOMAIN_KERNEL:
+                       if (uatomic_read(&kernel_consumerd_state) != CONSUMER_STARTED) {
+                               ret = LTTCOMM_NO_KERNCONSUMERD;
+                               goto error;
+                       }
+                       break;
+               }
+       }
+
        /*
         * Check that the UID or GID match that of the tracing session.
         * The root user can interact with all sessions.
@@ -3522,6 +3634,11 @@ skip_domain:
        {
                ret = cmd_destroy_session(cmd_ctx->session,
                                cmd_ctx->lsm->session.name);
+               /*
+                * Set session to NULL so we do not unlock it after
+                * free.
+                */
+               cmd_ctx->session = NULL;
                break;
        }
        case LTTNG_LIST_DOMAINS:
@@ -3656,6 +3773,9 @@ setup_error:
        if (cmd_ctx->session) {
                session_unlock(cmd_ctx->session);
        }
+       if (need_tracing_session) {
+               session_unlock_list();
+       }
 init_setup_error:
        return ret;
 }
@@ -3785,7 +3905,7 @@ static void *thread_manage_clients(void *data)
                                PERROR("close");
                        }
                        sock = -1;
-                       free(cmd_ctx);
+                       clean_command_ctx(&cmd_ctx);
                        continue;
                }
 
@@ -3972,11 +4092,11 @@ static int parse_args(int argc, char **argv)
                        opt_no_kernel = 1;
                        break;
                case 'q':
-                       opt_quiet = 1;
+                       lttng_opt_quiet = 1;
                        break;
                case 'v':
                        /* Verbose level can increase using multiple -v */
-                       opt_verbose += 1;
+                       lttng_opt_verbose += 1;
                        break;
                case 'Z':
                        opt_verbose_consumer += 1;
@@ -4239,10 +4359,11 @@ static int set_consumer_sockets(struct consumer_data *consumer_data,
        ret = mkdir(path, S_IRWXU);
        if (ret < 0) {
                if (errno != EEXIST) {
+                       PERROR("mkdir");
                        ERR("Failed to create %s", path);
                        goto error;
                }
-               ret = 0;
+               ret = -1;
        }
 
        /* Create the kconsumerd error unix socket */
@@ -4464,6 +4585,10 @@ int main(int argc, char **argv)
                }
        }
 
+       /* Set consumer initial state */
+       kernel_consumerd_state = CONSUMER_STOPPED;
+       ust_consumerd_state = CONSUMER_STOPPED;
+
        DBG("Client socket path %s", client_unix_sock_path);
        DBG("Application socket path %s", apps_unix_sock_path);
        DBG("LTTng run directory path: %s", rundir);
@@ -4502,6 +4627,12 @@ int main(int argc, char **argv)
                goto error;
        }
 
+       /*
+        * Init UST app hash table. Alloc hash table before this point since
+        * cleanup() can get called after that point.
+        */
+       ust_app_ht_alloc();
+
        /* After this point, we can safely call cleanup() with "goto exit" */
 
        /*
@@ -4524,6 +4655,8 @@ int main(int argc, char **argv)
                /* Set ulimit for open files */
                set_ulimit();
        }
+       /* init lttng_fd tracking must be done after set_ulimit. */
+       lttng_fd_init();
 
        ret = set_consumer_sockets(&ustconsumer64_data, rundir);
        if (ret < 0) {
@@ -4567,9 +4700,6 @@ int main(int argc, char **argv)
        /* Init UST command queue. */
        cds_wfq_init(&ust_cmd_queue.queue);
 
-       /* Init UST app hash table */
-       ust_app_ht_alloc();
-
        /*
         * Get session list pointer. This pointer MUST NOT be free(). This list is
         * statically declared in session.c
This page took 0.03046 seconds and 4 git commands to generate.