Fix: ust-app: per-PID app unregister vs tracing stop races

Flush application buffers and push metadata for an unregistering app *before* removing it from the sessiond hash tables, so that a concurrent tracing stop and its data-pending check on per-PID buffers cannot miss the app's data. The change also moves the metadata channel attributes into the UST app session, enables events explicitly after creation (events are now created disabled), and zeroes newly reallocated memory in the event and event-field listing paths.
[lttng-tools.git] / src / bin / lttng-sessiond / ust-app.c
index 34d4c9b1558de816bd1935ba02eef50b1c90e511..2576111022505a18de261a5d5dca90a8f74090a7 100644
--- a/src/bin/lttng-sessiond/ust-app.c
+++ b/src/bin/lttng-sessiond/ust-app.c
@@ -40,6 +40,9 @@
 #include "ust-ctl.h"
 #include "utils.h"
 
+static
+int ust_app_flush_app_session(struct ust_app *app, struct ust_app_session *ua_sess);
+
 /* Next available channel key. Access under next_channel_key_lock. */
 static uint64_t _next_channel_key;
 static pthread_mutex_t next_channel_key_lock = PTHREAD_MUTEX_INITIALIZER;
@@ -825,6 +828,7 @@ struct ust_app_session *alloc_ust_app_session(struct ust_app *app)
 
        ua_sess->handle = -1;
        ua_sess->channels = lttng_ht_new(0, LTTNG_HT_TYPE_STRING);
+       ua_sess->metadata_attr.type = LTTNG_UST_CHAN_METADATA;
        pthread_mutex_init(&ua_sess->lock, NULL);
 
        return ua_sess;
@@ -1445,7 +1449,32 @@ int create_ust_event(struct ust_app *app, struct ust_app_session *ua_sess,
        }
 
        /* If event not enabled, disable it on the tracer */
-       if (ua_event->enabled == 0) {
+       if (ua_event->enabled) {
+               /*
+                * We now need to explicitly enable the event, since it
+                * is now disabled at creation.
+                */
+               ret = enable_ust_event(app, ua_sess, ua_event);
+               if (ret < 0) {
+                       /*
+                        * If we hit an EPERM, something is wrong with our enable call. If
+                        * we get an EEXIST, there is a problem on the tracer side since we
+                        * just created it.
+                        */
+                       switch (ret) {
+                       case -LTTNG_UST_ERR_PERM:
+                               /* Code flow problem */
+                               assert(0);
+                       case -LTTNG_UST_ERR_EXIST:
+                               /* It's OK for our use case. */
+                               ret = 0;
+                               break;
+                       default:
+                               break;
+                       }
+                       goto error;
+               }
+       } else {
                ret = disable_ust_event(app, ua_sess, ua_event);
                if (ret < 0) {
                        /*
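The rewritten branch above enables the event explicitly after creation (events are now created disabled) and normalizes the tracer's error codes: -LTTNG_UST_ERR_EXIST is benign immediately after creation, while -LTTNG_UST_ERR_PERM indicates a code-flow bug. Two subtleties are worth noting: the assert(0) case falls through into the EXIST case when asserts are compiled out, and because ret is reset to 0 before the goto, the EXIST path returns success through the error label. A minimal standalone sketch of the same normalization (hypothetical helper, not part of the patch, assuming the usual lttng/ust-error.h header for the error codes):

#include <assert.h>
#include <lttng/ust-error.h>	/* LTTNG_UST_ERR_* codes (assumed header) */

/* Hypothetical helper: decide what an enable failure right after
 * event creation should mean for the caller. */
static int normalize_enable_ret(int ret)
{
	if (ret >= 0) {
		return ret;
	}
	switch (ret) {
	case -LTTNG_UST_ERR_PERM:
		/* Code flow problem: we just created this event. */
		assert(0);
		/* Falls through when NDEBUG compiles the assert out. */
	case -LTTNG_UST_ERR_EXIST:
		/* Benign: the event already exists on the tracer side. */
		return 0;
	default:
		return ret;
	}
}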
@@ -1608,6 +1637,8 @@ static void shadow_copy_session(struct ust_app_session *ua_sess,
        ua_sess->consumer = usess->consumer;
        ua_sess->output_traces = usess->output_traces;
        ua_sess->live_timer_interval = usess->live_timer_interval;
+       copy_channel_attr_to_ustctl(&ua_sess->metadata_attr,
+                       &usess->metadata_attr);
 
        switch (ua_sess->buffer_type) {
        case LTTNG_BUFFER_PER_PID:
@@ -2714,8 +2745,7 @@ error:
  * Called with UST app session lock held and RCU read side lock.
  */
 static int create_ust_app_metadata(struct ust_app_session *ua_sess,
-               struct ust_app *app, struct consumer_output *consumer,
-               struct ustctl_consumer_channel_attr *attr)
+               struct ust_app *app, struct consumer_output *consumer)
 {
        int ret = 0;
        struct ust_app_channel *metadata;
@@ -2743,20 +2773,7 @@ static int create_ust_app_metadata(struct ust_app_session *ua_sess,
                goto error;
        }
 
-       if (!attr) {
-               /* Set default attributes for metadata. */
-               metadata->attr.overwrite = DEFAULT_CHANNEL_OVERWRITE;
-               metadata->attr.subbuf_size = default_get_metadata_subbuf_size();
-               metadata->attr.num_subbuf = DEFAULT_METADATA_SUBBUF_NUM;
-               metadata->attr.switch_timer_interval = DEFAULT_METADATA_SWITCH_TIMER;
-               metadata->attr.read_timer_interval = DEFAULT_METADATA_READ_TIMER;
-               metadata->attr.output = LTTNG_UST_MMAP;
-               metadata->attr.type = LTTNG_UST_CHAN_METADATA;
-       } else {
-               memcpy(&metadata->attr, attr, sizeof(metadata->attr));
-               metadata->attr.output = LTTNG_UST_MMAP;
-               metadata->attr.type = LTTNG_UST_CHAN_METADATA;
-       }
+       memcpy(&metadata->attr, &ua_sess->metadata_attr, sizeof(metadata->attr));
 
        /* Need one fd for the channel. */
        ret = lttng_fd_get(LTTNG_FD_APPS, 1);
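With the attr parameter gone, create_ust_app_metadata() now takes its attributes straight from ua_sess->metadata_attr, which shadow_copy_session() fills from the ltt_ust_session (see the hunk above). The defaults deleted here presumably move to wherever the session-level metadata_attr is first initialized; a sketch of that initialization, reusing the constants from the removed block (the helper name and its location are assumptions, this diff does not show them):

/* Sketch: session-level metadata channel defaults, mirroring the
 * block removed from create_ust_app_metadata() above. Where this
 * runs (likely at ltt_ust_session creation) is not shown here. */
static void set_default_metadata_attr(struct ustctl_consumer_channel_attr *attr)
{
	attr->overwrite = DEFAULT_CHANNEL_OVERWRITE;
	attr->subbuf_size = default_get_metadata_subbuf_size();
	attr->num_subbuf = DEFAULT_METADATA_SUBBUF_NUM;
	attr->switch_timer_interval = DEFAULT_METADATA_SWITCH_TIMER;
	attr->read_timer_interval = DEFAULT_METADATA_READ_TIMER;
	attr->output = LTTNG_UST_MMAP;
	attr->type = LTTNG_UST_CHAN_METADATA;
}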
@@ -2983,6 +3000,7 @@ void ust_app_unregister(int sock)
 {
        struct ust_app *lta;
        struct lttng_ht_node_ulong *node;
+       struct lttng_ht_iter ust_app_sock_iter;
        struct lttng_ht_iter iter;
        struct ust_app_session *ua_sess;
        int ret;
@@ -2990,39 +3008,19 @@ void ust_app_unregister(int sock)
        rcu_read_lock();
 
        /* Get the node reference for a call_rcu */
-       lttng_ht_lookup(ust_app_ht_by_sock, (void *)((unsigned long) sock), &iter);
-       node = lttng_ht_iter_get_node_ulong(&iter);
+       lttng_ht_lookup(ust_app_ht_by_sock, (void *)((unsigned long) sock), &ust_app_sock_iter);
+       node = lttng_ht_iter_get_node_ulong(&ust_app_sock_iter);
        assert(node);
 
        lta = caa_container_of(node, struct ust_app, sock_n);
        DBG("PID %d unregistering with sock %d", lta->pid, sock);
 
-       /* Remove application from PID hash table */
-       ret = lttng_ht_del(ust_app_ht_by_sock, &iter);
-       assert(!ret);
-
-       /*
-        * Remove application from notify hash table. The thread handling the
-        * notify socket could have deleted the node so ignore on error because
-        * either way it's valid. The close of that socket is handled by the other
-        * thread.
-        */
-       iter.iter.node = &lta->notify_sock_n.node;
-       (void) lttng_ht_del(ust_app_ht_by_notify_sock, &iter);
-
        /*
-        * Ignore return value since the node might have been removed before by an
-        * add replace during app registration because the PID can be reassigned by
-        * the OS.
+        * Perform "push metadata" and flush all application streams
+        * before removing app from hash tables, ensuring proper
+        * behavior of data_pending check.
+        * Remove sessions so they are not visible during deletion.
         */
-       iter.iter.node = &lta->pid_n.node;
-       ret = lttng_ht_del(ust_app_ht, &iter);
-       if (ret) {
-               DBG3("Unregister app by PID %d failed. This can happen on pid reuse",
-                               lta->pid);
-       }
-
-       /* Remove sessions so they are not visible during deletion.*/
        cds_lfht_for_each_entry(lta->sessions->ht, &iter.iter, ua_sess,
                        node.node) {
                struct ust_registry_session *registry;
@@ -3033,6 +3031,8 @@ void ust_app_unregister(int sock)
                        continue;
                }
 
+               (void) ust_app_flush_app_session(lta, ua_sess);
+
                /*
                 * Add session to list for teardown. This is safe since at this point we
                 * are the only one using this list.
@@ -3067,11 +3067,36 @@ void ust_app_unregister(int sock)
                                (void) close_metadata(registry, ua_sess->consumer);
                        }
                }
-
                cds_list_add(&ua_sess->teardown_node, &lta->teardown_head);
+
                pthread_mutex_unlock(&ua_sess->lock);
        }
 
+       /* Remove application from PID hash table */
+       ret = lttng_ht_del(ust_app_ht_by_sock, &ust_app_sock_iter);
+       assert(!ret);
+
+       /*
+        * Remove application from notify hash table. The thread handling the
+        * notify socket could have deleted the node so ignore on error because
+        * either way it's valid. The close of that socket is handled by the other
+        * thread.
+        */
+       iter.iter.node = &lta->notify_sock_n.node;
+       (void) lttng_ht_del(ust_app_ht_by_notify_sock, &iter);
+
+       /*
+        * Ignore return value since the node might have been removed before by an
+        * add replace during app registration because the PID can be reassigned by
+        * the OS.
+        */
+       iter.iter.node = &lta->pid_n.node;
+       ret = lttng_ht_del(ust_app_ht, &iter);
+       if (ret) {
+               DBG3("Unregister app by PID %d failed. This can happen on pid reuse",
+                               lta->pid);
+       }
+
        /* Free memory */
        call_rcu(&lta->pid_n.head, delete_ust_app_rcu);
 
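This reordering is the heart of the fix. A concurrent tracing stop runs a data-pending check that walks ust_app_ht; if the unregistering app is removed from the hash tables before its buffers are flushed and its metadata pushed, that check can miss per-PID data still in flight. The corrected ordering, condensed into a sketch (the types and helpers are illustrative stand-ins for the sessiond internals, built on the userspace-RCU API this file already uses):

#include <urcu.h>		/* rcu_read_lock(), call_rcu() */
#include <urcu/rculfhash.h>	/* lock-free RCU hash table */

struct app_session { struct cds_lfht_node node; };
struct app {
	struct cds_lfht_node sock_node;
	struct rcu_head rcu_head;
	struct cds_lfht *sessions;
};

void quiesce_session(struct app_session *s);	/* assumed: flush + push metadata */
void free_app_rcu(struct rcu_head *head);	/* assumed: frees the app */

void app_unregister(struct cds_lfht *app_ht_by_sock, struct app *app)
{
	struct cds_lfht_iter iter;
	struct app_session *s;

	rcu_read_lock();
	/* 1. Quiesce data while the app is still visible to data_pending. */
	cds_lfht_for_each_entry(app->sessions, &iter, s, node) {
		quiesce_session(s);
	}
	/* 2. Only now unpublish the app from the hash tables. */
	(void) cds_lfht_del(app_ht_by_sock, &app->sock_node);
	rcu_read_unlock();
	/* 3. Defer the free until every RCU reader is done with the app. */
	call_rcu(&app->rcu_head, free_app_rcu);
}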
@@ -3144,19 +3169,25 @@ int ust_app_list_events(struct lttng_event **events)
                        health_code_update();
                        if (count >= nbmem) {
                                /* In case the realloc fails, we free the memory */
-                               void *ptr;
-
-                               DBG2("Reallocating event list from %zu to %zu entries", nbmem,
-                                               2 * nbmem);
-                               nbmem *= 2;
-                               ptr = realloc(tmp_event, nbmem * sizeof(struct lttng_event));
-                               if (ptr == NULL) {
+                               struct lttng_event *new_tmp_event;
+                               size_t new_nbmem;
+
+                               new_nbmem = nbmem << 1;
+                               DBG2("Reallocating event list from %zu to %zu entries",
+                                               nbmem, new_nbmem);
+                               new_tmp_event = realloc(tmp_event,
+                                       new_nbmem * sizeof(struct lttng_event));
+                               if (new_tmp_event == NULL) {
                                        PERROR("realloc ust app events");
                                        free(tmp_event);
                                        ret = -ENOMEM;
                                        goto rcu_error;
                                }
-                               tmp_event = ptr;
+                               /* Zero the new memory */
+                               memset(new_tmp_event + nbmem, 0,
+                                       (new_nbmem - nbmem) * sizeof(struct lttng_event));
+                               nbmem = new_nbmem;
+                               tmp_event = new_tmp_event;
                        }
                        memcpy(tmp_event[count].name, uiter.name, LTTNG_UST_SYM_NAME_LEN);
                        tmp_event[count].loglevel = uiter.loglevel;
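Both listing loops (events here, event fields in the next hunk) adopt the same safer growth pattern: leave nbmem and tmp_event untouched until realloc() succeeds, and zero the newly added tail so uninitialized heap memory cannot leak into the reply sent to the client. The pattern as a generic helper, a sketch rather than code from the tree:

#include <stdlib.h>
#include <string.h>

/*
 * Hypothetical helper: double the capacity of an array of fixed-size
 * records, zeroing the new tail. On failure the original buffer and
 * *nbmem are left untouched, so the caller still owns exactly one
 * allocation and frees it exactly once.
 */
static void *grow_and_zero(void *buf, size_t elem_size, size_t *nbmem)
{
	size_t new_nbmem = *nbmem << 1;	/* same doubling as above */
	void *new_buf = realloc(buf, new_nbmem * elem_size);

	if (!new_buf) {
		return NULL;	/* caller still owns and frees buf */
	}
	/* Zero only the newly added second half. */
	memset((char *) new_buf + (*nbmem * elem_size), 0,
			(new_nbmem - *nbmem) * elem_size);
	*nbmem = new_nbmem;
	return new_buf;
}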
@@ -3244,19 +3275,25 @@ int ust_app_list_event_fields(struct lttng_event_field **fields)
                        health_code_update();
                        if (count >= nbmem) {
                                /* In case the realloc fails, we free the memory */
-                               void *ptr;
-
-                               DBG2("Reallocating event field list from %zu to %zu entries", nbmem,
-                                               2 * nbmem);
-                               nbmem *= 2;
-                               ptr = realloc(tmp_event, nbmem * sizeof(struct lttng_event_field));
-                               if (ptr == NULL) {
+                               struct lttng_event_field *new_tmp_event;
+                               size_t new_nbmem;
+
+                               new_nbmem = nbmem << 1;
+                               DBG2("Reallocating event field list from %zu to %zu entries",
+                                               nbmem, new_nbmem);
+                               new_tmp_event = realloc(tmp_event,
+                                       new_nbmem * sizeof(struct lttng_event_field));
+                               if (new_tmp_event == NULL) {
                                        PERROR("realloc ust app event fields");
                                        free(tmp_event);
                                        ret = -ENOMEM;
                                        goto rcu_error;
                                }
-                               tmp_event = ptr;
+                               /* Zero the new memory */
+                               memset(new_tmp_event + nbmem, 0,
+                                       (new_nbmem - nbmem) * sizeof(struct lttng_event_field));
+                               nbmem = new_nbmem;
+                               tmp_event = new_tmp_event;
                        }
 
                        memcpy(tmp_event[count].field_name, uiter.field_name, LTTNG_UST_SYM_NAME_LEN);
@@ -3570,10 +3607,8 @@ int ust_app_create_channel_glb(struct ltt_ust_session *usess,
                pthread_mutex_lock(&ua_sess->lock);
                if (!strncmp(uchan->name, DEFAULT_METADATA_NAME,
                                        sizeof(uchan->name))) {
-                       struct ustctl_consumer_channel_attr attr;
-                       copy_channel_attr_to_ustctl(&attr, &uchan->attr);
-                       ret = create_ust_app_metadata(ua_sess, app, usess->consumer,
-                                       &attr);
+                       copy_channel_attr_to_ustctl(&ua_sess->metadata_attr, &uchan->attr);
+                       ret = 0;
                } else {
                        /* Create channel onto application. We don't need the chan ref. */
                        ret = create_ust_app_channel(ua_sess, uchan, app,
@@ -3778,7 +3813,7 @@ int ust_app_start_trace(struct ltt_ust_session *usess, struct ust_app *app)
         * Create the metadata for the application. This returns gracefully if a
         * metadata was already set for the session.
         */
-       ret = create_ust_app_metadata(ua_sess, app, usess->consumer, NULL);
+       ret = create_ust_app_metadata(ua_sess, app, usess->consumer);
        if (ret < 0) {
                goto error_unlock;
        }
@@ -3919,28 +3954,21 @@ error_rcu_unlock:
        return -1;
 }
 
-/*
- * Flush buffers for a specific UST session and app.
- */
 static
-int ust_app_flush_trace(struct ltt_ust_session *usess, struct ust_app *app)
+int ust_app_flush_app_session(struct ust_app *app,
+               struct ust_app_session *ua_sess)
 {
-       int ret = 0;
+       int ret, retval = 0;
        struct lttng_ht_iter iter;
-       struct ust_app_session *ua_sess;
        struct ust_app_channel *ua_chan;
+       struct consumer_socket *socket;
 
-       DBG("Flushing buffers for ust app pid %d", app->pid);
+       DBG("Flushing app session buffers for ust app pid %d", app->pid);
 
        rcu_read_lock();
 
        if (!app->compatible) {
-               goto end_no_session;
-       }
-
-       ua_sess = lookup_session_by_app(usess, app);
-       if (ua_sess == NULL) {
-               goto end_no_session;
+               goto end_not_compatible;
        }
 
        pthread_mutex_lock(&ua_sess->lock);
@@ -3948,25 +3976,16 @@ int ust_app_flush_trace(struct ltt_ust_session *usess, struct ust_app *app)
        health_code_update();
 
        /* Flushing buffers */
+       socket = consumer_find_socket_by_bitness(app->bits_per_long,
+                       ua_sess->consumer);
        cds_lfht_for_each_entry(ua_sess->channels->ht, &iter.iter, ua_chan,
                        node.node) {
                health_code_update();
                assert(ua_chan->is_sent);
-               ret = ustctl_sock_flush_buffer(app->sock, ua_chan->obj);
-               if (ret < 0) {
-                       if (ret != -EPIPE && ret != -LTTNG_UST_ERR_EXITING) {
-                               ERR("UST app PID %d channel %s flush failed with ret %d",
-                                               app->pid, ua_chan->name, ret);
-                       } else {
-                               DBG3("UST app failed to flush %s. Application is dead.",
-                                               ua_chan->name);
-                               /*
-                                * This is normal behavior, an application can die during the
-                                * creation process. Don't report an error so the execution can
-                                * continue normally.
-                                */
-                       }
-                       /* Continuing flushing all buffers */
+               ret = consumer_flush_channel(socket, ua_chan->key);
+               if (ret) {
+                       ERR("Error flushing consumer channel");
+                       retval = -1;
                        continue;
                }
        }
@@ -3974,10 +3993,37 @@ int ust_app_flush_trace(struct ltt_ust_session *usess, struct ust_app *app)
        health_code_update();
 
        pthread_mutex_unlock(&ua_sess->lock);
+end_not_compatible:
+       rcu_read_unlock();
+       health_code_update();
+       return retval;
+}
+
+/*
+ * Flush buffers for a specific UST session and app.
+ */
+static
+int ust_app_flush_session(struct ust_app *app, struct ltt_ust_session *usess)
+
+{
+       int ret;
+       struct ust_app_session *ua_sess;
+
+       DBG("Flushing session buffers for ust app pid %d", app->pid);
+
+       rcu_read_lock();
+
+       ua_sess = lookup_session_by_app(usess, app);
+       if (ua_sess == NULL) {
+               ret = -1;
+               goto end_no_session;
+       }
+       ret = ust_app_flush_app_session(app, ua_sess);
+
 end_no_session:
        rcu_read_unlock();
        health_code_update();
-       return 0;
+       return ret;
 }
 
 /*
@@ -4111,7 +4157,7 @@ int ust_app_stop_trace_all(struct ltt_ust_session *usess)
        }
        case LTTNG_BUFFER_PER_PID:
                cds_lfht_for_each_entry(ust_app_ht->ht, &iter.iter, app, pid_n.node) {
-                       ret = ust_app_flush_trace(usess, app);
+                       ret = ust_app_flush_session(app, usess);
                        if (ret < 0) {
                                /* Continue to next apps even on error */
                                continue;
@@ -4205,31 +4251,14 @@ void ust_app_global_update(struct ltt_ust_session *usess, int sock)
         */
        cds_lfht_for_each_entry(ua_sess->channels->ht, &iter.iter, ua_chan,
                        node.node) {
-               /*
-                * For a metadata channel, handle it differently.
-                */
-               if (!strncmp(ua_chan->name, DEFAULT_METADATA_NAME,
-                                       sizeof(ua_chan->name))) {
-                       ret = create_ust_app_metadata(ua_sess, app, usess->consumer,
-                                       &ua_chan->attr);
-                       if (ret < 0) {
-                               goto error_unlock;
-                       }
-                       /* Remove it from the hash table and continue!. */
-                       ret = lttng_ht_del(ua_sess->channels, &iter);
-                       assert(!ret);
-                       delete_ust_app_channel(-1, ua_chan, app);
-                       continue;
-               } else {
-                       ret = do_create_channel(app, usess, ua_sess, ua_chan);
-                       if (ret < 0) {
-                               /*
-                                * Stop everything. On error, the application failed, no more
-                                * file descriptor are available or ENOMEM so stopping here is
-                                * the only thing we can do for now.
-                                */
-                               goto error_unlock;
-                       }
+               ret = do_create_channel(app, usess, ua_sess, ua_chan);
+               if (ret < 0) {
+                       /*
+                        * Stop everything. On error, the application failed, no more
+                        * file descriptors are available, or we hit ENOMEM, so stopping
+                        * here is the only thing we can do for now.
+                        */
+                       goto error_unlock;
                }
 
                /*
@@ -5117,10 +5146,12 @@ unsigned int ust_app_get_nb_stream(struct ltt_ust_session *usess)
                cds_list_for_each_entry(reg, &usess->buffer_reg_uid_list, lnode) {
                        struct buffer_reg_channel *reg_chan;
 
+                       rcu_read_lock();
                        cds_lfht_for_each_entry(reg->registry->channels->ht, &iter.iter,
                                        reg_chan, node.node) {
                                ret += reg_chan->stream_count;
                        }
+                       rcu_read_unlock();
                }
                break;
        }
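cds_lfht_for_each_entry() traverses an RCU-protected hash table, so the read-side lock must be held across the whole walk; this last hunk adds the lock that the registry channel iteration was missing. The invariant in isolation, as a minimal sketch against the userspace-RCU API:

#include <urcu.h>		/* rcu_read_lock()/rcu_read_unlock() */
#include <urcu/rculfhash.h>	/* cds_lfht_for_each_entry() */

struct counter {
	unsigned long value;
	struct cds_lfht_node node;
};

/* Any traversal of an RCU lock-free hash table must run entirely
 * inside a read-side critical section, exactly what the hunk above
 * now guarantees for the buffer registry channels. */
static unsigned long sum_counters(struct cds_lfht *ht)
{
	struct cds_lfht_iter iter;
	struct counter *c;
	unsigned long sum = 0;

	rcu_read_lock();
	cds_lfht_for_each_entry(ht, &iter, c, node) {
		sum += c->value;
	}
	rcu_read_unlock();
	return sum;
}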