From a4beac76c09632fda439fac25fdb0f7191234191 Mon Sep 17 00:00:00 2001 From: Jonathan Rajotte Date: Wed, 1 Jun 2022 15:20:55 -0400 Subject: [PATCH] Fix: consumer: snapshot: assertion on subsequent snapshot MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Observed issue ============== While a snapshot is being taken, the containing folder can disappear unexpectedly. This can lead to the following errors, which are expected and mostly handled fine: PERROR - 14:47:32.002564464 [2922498/2922507]: Failed to open file relative to trace chunk file_path = "channel0_0", flags = 577, mode = 432: No such file or directory (in _lttng_trace_chunk_open_fs_handle_locked() at trace-chunk.cpp:1411) Error: Failed to open stream file "channel0_0" Error: Snapshot channel failed The problem happens on the subsequent snapshot for the session: #0 __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50 #1 0x00007fbbdadb3859 in __GI_abort () at abort.c:79 #2 0x00007fbbdadb3729 in __assert_fail_base (fmt=0x7fbbdaf49588 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", assertion=0x55c4212cfbb5 "!stream->trace_chunk", file=0x55c4212cf820 "kernel-co #3 0x00007fbbdadc5006 in __GI___assert_fail (assertion=0x55c4212cfbb5 "!stream->trace_chunk", file=0x55c4212cf820 "kernel-consumer/kernel-consumer.cpp", line=188, function=0x55c4212cfb00 " #4 0x000055c421268cc6 in lttng_kconsumer_snapshot_channel (channel=0x7fbbc4000b60, key=1, path=0x7fbbd37f8fd4 "", relayd_id=18446744073709551615, nb_packets_per_stream=0) at kernel-consume #5 0x000055c42126b39d in lttng_kconsumer_recv_cmd (ctx=0x55c421b80a90, sock=31, consumer_sockpoll=0x7fbbd37fd280) at kernel-consumer/kernel-consumer.cpp:986 #6 0x000055c4212546d1 in lttng_consumer_recv_cmd (ctx=0x55c421b80a90, sock=31, consumer_sockpoll=0x7fbbd37fd280) at consumer/consumer.cpp:2090 #7 0x000055c421259963 in consumer_thread_sessiond_poll (data=0x55c421b80a90) at consumer/consumer.cpp:3281 #8 0x00007fbbdaf8b609 in start_thread (arg=) at pthread_create.c:477 #9 0x00007fbbdaeb0163 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 How to reproduce: 1. Setting a breakpoint on snapshot_channel() inside src/common/ust-consumer/ust-consumer.cpp 2. When the breakpoint hits, remove the the complete lttng directory containing the session data. 3. Continue the lttng_consumerd process from gdb. 4. In that case you see a negative return value -1 from consumer_stream_create_output_files() inside snapshot_channel(). 5. Take another snapshot and lttng_consumerd crashes because of the `assert(!stream->trace_chunk)` in snapshot_channel(). This last action does not require any breakpoint intervention. Cause ===== During the snapshot, the stream is assigned the channel current chunk. It is expected that the stream does not have a chunk at this point. The error handling is faulty here, the stream chunk must be invalidated/reset on error to allow its reuse later on. The problem exists for both consumer domains (user/kernel). Solution ======== For the ust consumer, we can directly use the `error_close_stream` label. For the kernel consumer, the code path is slightly different since it does not uses `consumer_stream_close`. Note that `consumer_stream_close` cannot be used as is for the kernel consumer. The current implementation partially resembles `consumer_stream_close` at the end of the iteration. It is extracted to its own function for easier reuse from the new `error_finalize_stream` label. Known drawbacks ========= None. Fixes: #1352 Signed-off-by: Marcel Hamer Signed-off-by: Jonathan Rajotte Signed-off-by: Jérémie Galarneau Change-Id: I9fc81917b19aa436ed8e8679672648f2d5baf41a --- .../kernel-consumer/kernel-consumer.cpp | 58 +++++++++++-------- src/common/ust-consumer/ust-consumer.cpp | 10 ++-- 2 files changed, 40 insertions(+), 28 deletions(-) diff --git a/src/common/kernel-consumer/kernel-consumer.cpp b/src/common/kernel-consumer/kernel-consumer.cpp index 9bc66a478..d22aea183 100644 --- a/src/common/kernel-consumer/kernel-consumer.cpp +++ b/src/common/kernel-consumer/kernel-consumer.cpp @@ -135,6 +135,30 @@ error: return ret; } +static void finalize_snapshot_stream( + struct lttng_consumer_stream *stream, uint64_t relayd_id) +{ + ASSERT_LOCKED(stream->lock); + + if (relayd_id == (uint64_t) -1ULL) { + if (stream->out_fd >= 0) { + const int ret = close(stream->out_fd); + + if (ret < 0) { + PERROR("Failed to close stream snapshot output file descriptor"); + } + + stream->out_fd = -1; + } + } else { + close_relayd_stream(stream); + stream->net_seq_idx = (uint64_t) -1ULL; + } + + lttng_trace_chunk_put(stream->trace_chunk); + stream->trace_chunk = NULL; +} + /* * Take a snapshot of all the stream of a channel * RCU read-side lock must be held across this function to ensure existence of @@ -198,13 +222,13 @@ static int lttng_kconsumer_snapshot_channel( ret = consumer_send_relayd_stream(stream, path); if (ret < 0) { ERR("sending stream to relayd"); - goto end_unlock; + goto error_finalize_stream; } } else { ret = consumer_stream_create_output_files(stream, false); if (ret < 0) { - goto end_unlock; + goto error_finalize_stream; } DBG("Kernel consumer snapshot stream (%" PRIu64 ")", stream->key); @@ -222,7 +246,7 @@ static int lttng_kconsumer_snapshot_channel( ret = kernctl_buffer_flush(stream->wait_fd); if (ret < 0) { ERR("Failed to flush kernel stream"); - goto end_unlock; + goto error_finalize_stream; } goto end_unlock; } @@ -230,19 +254,19 @@ static int lttng_kconsumer_snapshot_channel( ret = lttng_kconsumer_take_snapshot(stream); if (ret < 0) { ERR("Taking kernel snapshot"); - goto end_unlock; + goto error_finalize_stream; } ret = lttng_kconsumer_get_produced_snapshot(stream, &produced_pos); if (ret < 0) { ERR("Produced kernel snapshot position"); - goto end_unlock; + goto error_finalize_stream; } ret = lttng_kconsumer_get_consumed_snapshot(stream, &consumed_pos); if (ret < 0) { ERR("Consumerd kernel snapshot position"); - goto end_unlock; + goto error_finalize_stream; } consumed_pos = consumer_get_consume_start_pos(consumed_pos, @@ -262,7 +286,7 @@ static int lttng_kconsumer_snapshot_channel( if (ret < 0) { if (ret != -EAGAIN) { PERROR("kernctl_get_subbuf snapshot"); - goto end_unlock; + goto error_finalize_stream; } DBG("Kernel consumer get subbuf failed. Skipping it."); consumed_pos += stream->max_sb_size; @@ -312,26 +336,12 @@ static int lttng_kconsumer_snapshot_channel( ret = kernctl_put_subbuf(stream->wait_fd); if (ret < 0) { ERR("Snapshot kernctl_put_subbuf"); - goto end_unlock; + goto error_finalize_stream; } consumed_pos += stream->max_sb_size; } - if (relayd_id == (uint64_t) -1ULL) { - if (stream->out_fd >= 0) { - ret = close(stream->out_fd); - if (ret < 0) { - PERROR("Kernel consumer snapshot close out_fd"); - goto end_unlock; - } - stream->out_fd = -1; - } - } else { - close_relayd_stream(stream); - stream->net_seq_idx = (uint64_t) -1ULL; - } - lttng_trace_chunk_put(stream->trace_chunk); - stream->trace_chunk = NULL; + finalize_snapshot_stream(stream, relayd_id); pthread_mutex_unlock(&stream->lock); } @@ -344,6 +354,8 @@ error_put_subbuf: if (ret < 0) { ERR("Snapshot kernctl_put_subbuf error path"); } +error_finalize_stream: + finalize_snapshot_stream(stream, relayd_id); end_unlock: pthread_mutex_unlock(&stream->lock); end: diff --git a/src/common/ust-consumer/ust-consumer.cpp b/src/common/ust-consumer/ust-consumer.cpp index 30d1f1021..b08cafdff 100644 --- a/src/common/ust-consumer/ust-consumer.cpp +++ b/src/common/ust-consumer/ust-consumer.cpp @@ -1089,13 +1089,13 @@ static int snapshot_channel(struct lttng_consumer_channel *channel, if (use_relayd) { ret = consumer_send_relayd_stream(stream, path); if (ret < 0) { - goto error_unlock; + goto error_close_stream; } } else { ret = consumer_stream_create_output_files(stream, false); if (ret < 0) { - goto error_unlock; + goto error_close_stream; } DBG("UST consumer snapshot stream (%" PRIu64 ")", stream->key); @@ -1117,19 +1117,19 @@ static int snapshot_channel(struct lttng_consumer_channel *channel, ret = lttng_ustconsumer_take_snapshot(stream); if (ret < 0) { ERR("Taking UST snapshot"); - goto error_unlock; + goto error_close_stream; } ret = lttng_ustconsumer_get_produced_snapshot(stream, &produced_pos); if (ret < 0) { ERR("Produced UST snapshot position"); - goto error_unlock; + goto error_close_stream; } ret = lttng_ustconsumer_get_consumed_snapshot(stream, &consumed_pos); if (ret < 0) { ERR("Consumerd UST snapshot position"); - goto error_unlock; + goto error_close_stream; } /* -- 2.34.1