Fix: release reference to created chunk if it can't be published
[lttng-tools.git] / src / common / consumer / consumer.c
index b2f4c2686d2f7f81b5bc21c4249a681e92a5bad3..322492631af40f22a8833224032fbc871f88cb34 100644 (file)
@@ -411,7 +411,8 @@ void consumer_del_channel(struct lttng_consumer_channel *channel)
                rcu_read_unlock();
        }
 
-        call_rcu(&channel->node.head, free_channel_rcu);
+       channel->is_deleted = true;
+       call_rcu(&channel->node.head, free_channel_rcu);
 end:
        pthread_mutex_unlock(&channel->lock);
        pthread_mutex_unlock(&consumer_data.lock);
@@ -1021,6 +1022,16 @@ int lttng_consumer_channel_set_trace_chunk(
        unsigned long channel_hash;
 
        pthread_mutex_lock(&channel->lock);
+       if (channel->is_deleted) {
+               /*
+                * The channel has been logically deleted and should no longer
+                * be used. It has released its reference to its current trace
+                * chunk and should not acquire a new one.
+                *
+                * Return success as there is nothing for the caller to do.
+                */
+               goto end;
+       }
        /*
         * A stream can transition to a state where it and its channel
         * no longer belong to a trace chunk. For instance, this happens when
@@ -1397,6 +1408,7 @@ void lttng_consumer_cleanup(void)
 {
        struct lttng_ht_iter iter;
        struct lttng_consumer_channel *channel;
+       unsigned int trace_chunks_left;
 
        rcu_read_lock();
 
@@ -1421,6 +1433,27 @@ void lttng_consumer_cleanup(void)
         */
        lttng_ht_destroy(consumer_data.stream_list_ht);
 
+       /*
+        * Trace chunks in the registry may still exist if the session
+        * daemon has encountered an internal error and could not
+        * tear down its sessions and/or trace chunks properly.
+        *
+        * Release the session daemon's implicit reference to any remaining
+        * trace chunk and print an error if any trace chunk was found. Note
+        * that there are _no_ legitimate cases for trace chunks to be left,
+        * it is a leak. However, it can happen following a crash of the
+        * session daemon and not emptying the registry would cause an assertion
+        * to hit.
+        */
+       trace_chunks_left = lttng_trace_chunk_registry_put_each_chunk(
+                       consumer_data.chunk_registry);
+       if (trace_chunks_left) {
+               ERR("%u trace chunks are leaked by lttng-consumerd. "
+                               "This can be caused by an internal error of the session daemon.",
+                               trace_chunks_left);
+       }
+       /* Run all callbacks freeing each chunk. */
+       rcu_barrier();
        lttng_trace_chunk_registry_destroy(consumer_data.chunk_registry);
 }
 
@@ -1727,9 +1760,8 @@ ssize_t lttng_consumer_on_read_subbuffer_mmap(
 
        /* RCU lock for the relayd pointer */
        rcu_read_lock();
-
        assert(stream->net_seq_idx != (uint64_t) -1ULL ||
-                       stream->chan->trace_chunk);
+                       stream->trace_chunk);
 
        /* Flag that the current stream if set for network streaming. */
        if (stream->net_seq_idx != (uint64_t) -1ULL) {
@@ -2254,7 +2286,8 @@ void lttng_consumer_close_all_metadata(void)
 void consumer_del_metadata_stream(struct lttng_consumer_stream *stream,
                struct lttng_ht *ht)
 {
-       struct lttng_consumer_channel *free_chan = NULL;
+       struct lttng_consumer_channel *channel = NULL;
+       bool free_channel = false;
 
        assert(stream);
        /*
@@ -2266,11 +2299,17 @@ void consumer_del_metadata_stream(struct lttng_consumer_stream *stream,
        DBG3("Consumer delete metadata stream %d", stream->wait_fd);
 
        pthread_mutex_lock(&consumer_data.lock);
-       pthread_mutex_lock(&stream->chan->lock);
+       /*
+        * Note that this assumes that a stream's channel is never changed and
+        * that the stream's lock doesn't need to be taken to sample its
+        * channel.
+        */
+       channel = stream->chan;
+       pthread_mutex_lock(&channel->lock);
        pthread_mutex_lock(&stream->lock);
-       if (stream->chan->metadata_cache) {
+       if (channel->metadata_cache) {
                /* Only applicable to userspace consumers. */
-               pthread_mutex_lock(&stream->chan->metadata_cache->lock);
+               pthread_mutex_lock(&channel->metadata_cache->lock);
        }
 
        /* Remove any reference to that stream. */
@@ -2282,28 +2321,29 @@ void consumer_del_metadata_stream(struct lttng_consumer_stream *stream,
        consumer_stream_destroy_buffers(stream);
 
        /* Atomically decrement channel refcount since other threads can use it. */
-       if (!uatomic_sub_return(&stream->chan->refcount, 1)
-                       && !uatomic_read(&stream->chan->nb_init_stream_left)) {
+       if (!uatomic_sub_return(&channel->refcount, 1)
+                       && !uatomic_read(&channel->nb_init_stream_left)) {
                /* Go for channel deletion! */
-               free_chan = stream->chan;
+               free_channel = true;
        }
+       stream->chan = NULL;
 
        /*
         * Nullify the stream reference so it is not used after deletion. The
         * channel lock MUST be acquired before being able to check for a NULL
         * pointer value.
         */
-       stream->chan->metadata_stream = NULL;
+       channel->metadata_stream = NULL;
 
-       if (stream->chan->metadata_cache) {
-               pthread_mutex_unlock(&stream->chan->metadata_cache->lock);
+       if (channel->metadata_cache) {
+               pthread_mutex_unlock(&channel->metadata_cache->lock);
        }
        pthread_mutex_unlock(&stream->lock);
-       pthread_mutex_unlock(&stream->chan->lock);
+       pthread_mutex_unlock(&channel->lock);
        pthread_mutex_unlock(&consumer_data.lock);
 
-       if (free_chan) {
-               consumer_del_channel(free_chan);
+       if (free_channel) {
+               consumer_del_channel(channel);
        }
 
        lttng_trace_chunk_put(stream->trace_chunk);
@@ -4073,6 +4113,10 @@ int lttng_consumer_rotate_channel(struct lttng_consumer_channel *channel,
                        stream->rotate_ready = true;
                }
 
+               /*
+                * Active flush; has no effect if the production position
+                * is at a packet boundary.
+                */
                ret = consumer_flush_buffer(stream, 1);
                if (ret < 0) {
                        ERR("Failed to flush stream %" PRIu64 " during channel rotation",
@@ -4081,10 +4125,34 @@ int lttng_consumer_rotate_channel(struct lttng_consumer_channel *channel,
                }
 
                if (!is_local_trace) {
+                       /*
+                        * The relay daemon control protocol expects a rotation
+                        * position as "the sequence number of the first packet
+                        * _after_ the current trace chunk".
+                        *
+                        * At the moment when the positions of the buffers are
+                        * sampled, the production position does not necessarily
+                        * sit at a packet boundary. The 'active' flush
+                        * operation above will push the production position to
+                        * the next packet boundary _if_ it is not already
+                        * sitting at such a boundary.
+                        *
+                        * Assuming a current production position that is not
+                        * on the bound of a packet, the 'target' sequence
+                        * number is
+                        *   (consumed_pos / subbuffer_size) + 1
+                        * Note the '+ 1' to ensure the current packet is
+                        * part of the current trace chunk.
+                        *
+                        * However, if the production position is already at
+                        * a packet boundary, the '+ 1' is not necessary as the
+                        * last packet of the current chunk is already
+                        * 'complete'.
+                        */
                        const struct relayd_stream_rotation_position position = {
                                .stream_id = stream->relayd_stream_id,
-                               .rotate_at_seq_num = (stream->rotate_position /
-                                               stream->max_sb_size) + 1,
+                               .rotate_at_seq_num = (stream->rotate_position / stream->max_sb_size) +
+                                       !!(stream->rotate_position % stream->max_sb_size),
                        };
 
                        ret = lttng_dynamic_array_add_element(
@@ -4512,11 +4580,13 @@ enum lttcomm_return_code lttng_consumer_create_trace_chunk(
                         * channels.
                         */
                        enum lttcomm_return_code close_ret;
+                       char path[LTTNG_PATH_MAX];
 
                        DBG("Failed to set new trace chunk on existing channels, rolling back");
                        close_ret = lttng_consumer_close_trace_chunk(relayd_id,
                                        session_id, chunk_id,
-                                       chunk_creation_timestamp, NULL);
+                                       chunk_creation_timestamp, NULL,
+                                       path);
                        if (close_ret != LTTCOMM_CONSUMERD_SUCCESS) {
                                ERR("Failed to roll-back the creation of new chunk: session_id = %" PRIu64 ", chunk_id = %" PRIu64,
                                                session_id, chunk_id);
@@ -4542,12 +4612,13 @@ enum lttcomm_return_code lttng_consumer_create_trace_chunk(
 
                if (!relayd || ret) {
                        enum lttcomm_return_code close_ret;
+                       char path[LTTNG_PATH_MAX];
 
                        close_ret = lttng_consumer_close_trace_chunk(relayd_id,
                                        session_id,
                                        chunk_id,
                                        chunk_creation_timestamp,
-                                       NULL);
+                                       NULL, path);
                        if (close_ret != LTTCOMM_CONSUMERD_SUCCESS) {
                                ERR("Failed to roll-back the creation of new chunk: session_id = %" PRIu64 ", chunk_id = %" PRIu64,
                                                session_id,
@@ -4562,6 +4633,7 @@ error:
        rcu_read_unlock();
        /* Release the reference returned by the "publish" operation. */
        lttng_trace_chunk_put(published_chunk);
+       lttng_trace_chunk_put(created_chunk);
 end:
        return ret_code;
 }
@@ -4569,7 +4641,8 @@ end:
 enum lttcomm_return_code lttng_consumer_close_trace_chunk(
                const uint64_t *relayd_id, uint64_t session_id,
                uint64_t chunk_id, time_t chunk_close_timestamp,
-               const enum lttng_trace_chunk_command_type *close_command)
+               const enum lttng_trace_chunk_command_type *close_command,
+               char *path)
 {
        enum lttcomm_return_code ret_code = LTTCOMM_CONSUMERD_SUCCESS;
        struct lttng_trace_chunk *chunk;
@@ -4667,7 +4740,8 @@ enum lttcomm_return_code lttng_consumer_close_trace_chunk(
                if (relayd) {
                        pthread_mutex_lock(&relayd->ctrl_sock_mutex);
                        ret = relayd_close_trace_chunk(
-                                       &relayd->control_sock, chunk);
+                                       &relayd->control_sock, chunk,
+                                       path);
                        pthread_mutex_unlock(&relayd->ctrl_sock_mutex);
                } else {
                        ERR("Failed to find relay daemon socket: relayd_id = %" PRIu64,
This page took 0.027861 seconds and 4 git commands to generate.