From 9edaf114d28249f4740de16bc9f58c43cfe8042e Mon Sep 17 00:00:00 2001 From: =?utf8?q?J=C3=A9r=C3=A9mie=20Galarneau?= Date: Tue, 28 Jul 2020 11:29:18 -0400 Subject: [PATCH] Fix: relayd: live connection fails to open file during clear MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Issue observed ============== A `session clear` test occasionaly fails on the CI (very rarely, and more often on PowerPC executors for some reason) with babeltrace reporting that the live connection was closed by the remote side: PASS: tools/clear/test_ust 276 - Waiting for live viewers on url: net://localhost 07-21 16:17:07.058 23855 23855 E PLUGIN/SRC.CTF.LTTNG-LIVE/VIEWER lttng_live_recv@viewer-connection.c:198 [lttng-live] Remote side has closed connection 07-21 16:17:07.058 23855 23855 E PLUGIN/SRC.CTF.LTTNG-LIVE/VIEWER lttng_live_session_get_new_streams@viewer-connection.c:1701 [lttng-live] Error receiving get new streams reply 07-21 16:17:07.058 23855 23855 E PLUGIN/SRC.CTF.LTTNG-LIVE lttng_live_msg_iter_next@lttng-live.c:1665 [lttng-live] Error preparing the next batch of messages: live-iter-status=LTTNG_LIVE_ITERATOR_STATUS_ERROR 07-21 16:17:07.058 23855 23855 W LIB/MSG-ITER bt_message_iterator_next@iterator.c:864 Component input port message iterator's "next" method failed: iter-addr=0x1014d6a8, iter-upstream-comp-name="lttng-live", iter-upstream-comp-log-level=WARNING, iter-upstream-comp-class-type=SOURCE, iter-upstream-comp-class-name="lttng-live", iter-upstream-comp-class-partial-descr="Connect to an LTTng relay daemon", iter-upstream-port-type=OUTPUT, iter-upstream-port-name="out", status=ERROR 07-21 16:17:07.059 23855 23855 E PLUGIN/FLT.UTILS.MUXER muxer_upstream_msg_iter_next@muxer.c:454 [muxer] Upstream iterator's next method returned an error: status=ERROR 07-21 16:17:07.059 23855 23855 E PLUGIN/FLT.UTILS.MUXER validate_muxer_upstream_msg_iters@muxer.c:991 [muxer] Cannot validate muxer's upstream message iterator wrapper: muxer-msg-iter-addr=0x1014d668, muxer-upstream-msg-iter-wrap-addr=0x1014e210 07-21 16:17:07.059 23855 23855 E PLUGIN/FLT.UTILS.MUXER muxer_msg_iter_next@muxer.c:1415 [muxer] Cannot get next message: comp-addr=0x1014cca8, muxer-comp-addr=0x1015afd8, muxer-msg-iter-addr=0x1014d668, msg-iter-addr=0x1014d598, status=ERROR 07-21 16:17:07.059 23855 23855 W LIB/MSG-ITER bt_message_iterator_next@iterator.c:864 Component input port message iterator's "next" method failed: iter-addr=0x1014d598, iter-upstream-comp-name="muxer", iter-upstream-comp-log-level=WARNING, iter-upstream-comp-class-type=FILTER, iter-upstream-comp-class-name="muxer", iter-upstream-comp-class-partial-descr="Sort messages from multiple inpu", iter-upstream-port-type=OUTPUT, iter-upstream-port-name="out", status=ERROR 07-21 16:17:07.059 23855 23855 W LIB/GRAPH consume_graph_sink@graph.c:473 Component's "consume" method failed: status=ERROR, comp-addr=0x1014d128, comp-name="pretty", comp-log-level=WARNING, comp-class-type=SINK, comp-class-name="pretty", comp-class-partial-descr="Pretty-print messages (`text` fo", comp-class-is-frozen=1, comp-class-so-handle-addr=0x10159dd8, comp-class-so-handle-path="/home/jenkins/workspace/lttng-tools_stable-2.12_portbuild/arch/powerpc/babeltrace_version/stable-2.0/build/std/conf/agents/liburcu_version/stable-0.12/test_type/base/deps/build/lib/babeltrace2/plugins/babeltrace-plugin-text.so", comp-input-port-count=1, comp-output-port-count=0 07-21 16:17:07.059 23855 23855 E CLI cmd_run@babeltrace2.c:2548 Graph failed to complete successfully ERROR: [Babeltrace CLI] (babeltrace2.c:2548) Graph failed to complete successfully CAUSED BY [libbabeltrace2] (graph.c:473) Component's "consume" method failed: status=ERROR, comp-addr=0x1014d128, comp-name="pretty", comp-log-level=WARNING, comp-class-type=SINK, comp-class-name="pretty", comp-class-partial-descr="Pretty-print messages (`text` fo", comp-class-is-frozen=1, comp-class-so-handle-addr=0x10159dd8, comp-class-so-handle-path="/home/jenkins/workspace/lttng-tools_stable-2.12_portbuild/arch/powerpc/babeltrace_version/stable-2.0/build/std/conf/agents/liburcu_version/stable-0.12/test_type/base/deps/build/lib/babeltrace2/plugins/babeltrace-plugin-text.so", comp-input-port-count=1, comp-output-port-count=0 CAUSED BY [libbabeltrace2] (iterator.c:864) Component input port message iterator's "next" method failed: iter-addr=0x1014d598, iter-upstream-comp-name="muxer", iter-upstream-comp-log-level=WARNING, iter-upstream-comp-class-type=FILTER, iter-upstream-comp-class-name="muxer", iter-upstream-comp-class-partial-descr="Sort messages from multiple inpu", iter-upstream-port-type=OUTPUT, iter-upstream-port-name="out", status=ERROR CAUSED BY [muxer: 'filter.utils.muxer'] (muxer.c:991) Cannot validate muxer's upstream message iterator wrapper: muxer-msg-iter-addr=0x1014d668, muxer-upstream-msg-iter-wrap-addr=0x1014e210 CAUSED BY [muxer: 'filter.utils.muxer'] (muxer.c:454) Upstream iterator's next method returned an error: status=ERROR CAUSED BY [libbabeltrace2] (iterator.c:864) Component input port message iterator's "next" method failed: iter-addr=0x1014d6a8, iter-upstream-comp-name="lttng-live", iter-upstream-comp-log-level=WARNING, iter-upstream-comp-class-type=SOURCE, iter-upstream-comp-class-name="lttng-live", iter-upstream-comp-class-partial-descr="Connect to an LTTng relay daemon", iter-upstream-port-type=OUTPUT, iter-upstream-port-name="out", status=ERROR CAUSED BY [lttng-live: 'source.ctf.lttng-live'] (lttng-live.c:1665) Error preparing the next batch of messages: live-iter-status=LTTNG_LIVE_ITERATOR_STATUS_ERROR CAUSED BY [lttng-live: 'source.ctf.lttng-live'] (viewer-connection.c:1701) Error receiving get new streams reply CAUSED BY [lttng-live: 'source.ctf.lttng-live'] (viewer-connection.c:198) Remote side has closed connection ok 277 - Clear session J7WXjh7fmMleTCfE PASS: tools/clear/test_ust 277 - Clear session J7WXjh7fmMleTCfE ok 278 - Clear session J7WXjh7fmMleTCfE PASS: tools/clear/test_ust 278 - Clear session J7WXjh7fmMleTCfE ok 279 - Stop lttng tracing for session J7WXjh7fmMleTCfE PASS: tools/clear/test_ust 279 - Stop lttng tracing for session J7WXjh7fmMleTCfE ok 280 - Destroy session J7WXjh7fmMleTCfE PASS: tools/clear/test_ust 280 - Destroy session J7WXjh7fmMleTCfE # Wait for viewer to exit not ok 281 - Babeltrace succeeds Cause ===== Looking at the relay daemon logs, it appears that the live client requests an enumeration of the available streams while a rotation is ongoing (clear). Ultimately, this results in the relay daemon attempting to open a non-existing file: PERROR - 16:33:59.242388809 [734380/734387]: Failed to open fs handle to ust/uid/1000/64-bit/chan_0, open() returned: No such file or directory (in fd_tracker_open_fs_handle() at fd-tracker.c:550) The logs indicate that this file existed at some point. However, it was unlinked and its newest instance was created in a trace chunk named "20200727T163359-0400-1". This chunk name is a temporary name used until the original trace chunk can be unlinked (cleared) and the newest can be moved in its place. The file is being opened as part of the creation of a viewer stream when make_viewer_stream() fails to find it. This implies that, somehow, an outdated trace chunk is being used to open the viewer stream's file. The reason why is that make_viewer_stream is called with the viewer session's current trace chunk. During a rotation, the use of the viewer session's current trace chunk is touchy due to the way the switch-over to a new chunk is handled. How viewer session/stream trace chunks are rotated -------------------------------------------------- The viewer polls the relay daemon for new data/metadata to consume using the `GET_NEXT_INDEX` and `GET_METADATA` commands. Both commands will indicate to the viewer that it should try again later if a rotation is ongoing on the "side" of the relay session. When a rotation is not ongoing, the relay compares the `id` of the target viewer stream's trace chunk with the relay session's current trace chunk. If those `id`s don't match, the viewer session's current trace chunk is then updated to a copy of the relay session's trace chunk. The viewer stream's files are then closed and re-opened in the context of the viewer session's now-updated trace chunk. While the live protocol allows `GET_NEXT_INDEX` and `GET_METADATA` to communicate to the viewer that it should "retry" later, there is no such provisions made for the paths that lead to the creation of the viewer streams. This means viewer streams must be created even if a rotation is ongoing. Solution ======== If a rotation is ongoing at the moment of the creation of the viewer streams, we wish to use a copy of the trace chunk being used by the relay stream. This way, we can be sure that the streams' files exist. It is okay for viewer streams to hold references to different copies of the trace chunks since no user-visible actions are performed when the reference to those chunks is released. This is different from relay streams where the detection of the completion of a rotation is done when the last relay stream releases its reference to a specific trace chunk instance. Known drawbacks =============== None beyond a slight increase (temporary until the next rotation) in the number of FDs used when a client connects during a session rotation. Note ==== Since make_viewer_streams now acts on relay and viewer counterparts of the session and stream objects, the various variables are prefixed with `relay` and `viewer` to make the code easier to understand. The locking period of the relay stream is extended to most of the iteration in make_viewer_stream() rather than only in viewer_stream_create(). As a result, callers of viewer_stream_create() must hold the relay stream's lock. Signed-off-by: Jérémie Galarneau Change-Id: I3b9c52a57835134712eadba0f68f6a21e922356b --- src/bin/lttng-relayd/live.c | 130 ++++++++++++++++++-------- src/bin/lttng-relayd/viewer-session.h | 1 - src/bin/lttng-relayd/viewer-stream.c | 18 ++-- 3 files changed, 100 insertions(+), 49 deletions(-) diff --git a/src/bin/lttng-relayd/live.c b/src/bin/lttng-relayd/live.c index d5f045824..7d6dc1bc2 100644 --- a/src/bin/lttng-relayd/live.c +++ b/src/bin/lttng-relayd/live.c @@ -268,8 +268,8 @@ end_unlock: * * Return 0 on success or else a negative value. */ -static int make_viewer_streams(struct relay_session *session, - struct lttng_trace_chunk *viewer_trace_chunk, +static int make_viewer_streams(struct relay_session *relay_session, + struct relay_viewer_session *viewer_session, enum lttng_viewer_seek seek_t, uint32_t *nb_total, uint32_t *nb_unsent, @@ -279,18 +279,19 @@ static int make_viewer_streams(struct relay_session *session, int ret; struct lttng_ht_iter iter; struct ctf_trace *ctf_trace; + struct relay_stream *relay_stream = NULL; - assert(session); - ASSERT_LOCKED(session->lock); + assert(relay_session); + ASSERT_LOCKED(relay_session->lock); - if (!viewer_trace_chunk) { + if (!viewer_session->current_trace_chunk) { ERR("Internal error: viewer session associated with session \"%s\" has a NULL trace chunk", - session->session_name); + relay_session->session_name); ret = -1; goto error; } - if (session->connection_closed) { + if (relay_session->connection_closed) { *closed = true; } @@ -299,10 +300,9 @@ static int make_viewer_streams(struct relay_session *session, * used for a the given session id only. */ rcu_read_lock(); - cds_lfht_for_each_entry(session->ctf_traces_ht->ht, &iter.iter, ctf_trace, - node.node) { + cds_lfht_for_each_entry (relay_session->ctf_traces_ht->ht, &iter.iter, + ctf_trace, node.node) { bool trace_has_metadata_stream = false; - struct relay_stream *stream; health_code_update(); @@ -314,15 +314,23 @@ static int make_viewer_streams(struct relay_session *session, * Iterate over all the streams of the trace to see if we have a * metadata stream. */ - cds_list_for_each_entry_rcu( - stream, &ctf_trace->stream_list, stream_node) + cds_list_for_each_entry_rcu(relay_stream, + &ctf_trace->stream_list, stream_node) { - if (stream->is_metadata) { + bool is_metadata_stream; + + pthread_mutex_lock(&relay_stream->lock); + is_metadata_stream = relay_stream->is_metadata; + pthread_mutex_unlock(&relay_stream->lock); + + if (is_metadata_stream) { trace_has_metadata_stream = true; break; } } + relay_stream = NULL; + /* * If there is no metadata stream in this trace at the moment * and we never sent one to the viewer, skip the trace. We @@ -334,35 +342,72 @@ static int make_viewer_streams(struct relay_session *session, continue; } - cds_list_for_each_entry_rcu(stream, &ctf_trace->stream_list, stream_node) { - struct relay_viewer_stream *vstream; + cds_list_for_each_entry_rcu(relay_stream, + &ctf_trace->stream_list, stream_node) + { + struct relay_viewer_stream *viewer_stream; - if (!stream_get(stream)) { + if (!stream_get(relay_stream)) { continue; } + + pthread_mutex_lock(&relay_stream->lock); /* * stream published is protected by the session lock. */ - if (!stream->published) { + if (!relay_stream->published) { goto next; } - vstream = viewer_stream_get_by_id(stream->stream_handle); - if (!vstream) { + viewer_stream = viewer_stream_get_by_id( + relay_stream->stream_handle); + if (!viewer_stream) { + struct lttng_trace_chunk *viewer_stream_trace_chunk; + /* * Save that we sent the metadata stream to the * viewer. So that we know what trace the viewer * is aware of. */ - if (stream->is_metadata) { - ctf_trace->metadata_stream_sent_to_viewer = - true; + if (relay_stream->is_metadata) { + ctf_trace->metadata_stream_sent_to_viewer = true; } - vstream = viewer_stream_create(stream, - viewer_trace_chunk, seek_t); - if (!vstream) { + + /* + * If a rotation is ongoing, use a copy of the + * relay stream's chunk to ensure the stream + * files exist. + * + * Otherwise, the viewer session's current trace + * chunk can be used safely. + */ + if ((relay_stream->ongoing_rotation.is_set || + relay_session->ongoing_rotation) && + relay_stream->trace_chunk) { + viewer_stream_trace_chunk = lttng_trace_chunk_copy( + relay_stream->trace_chunk); + if (!viewer_stream_trace_chunk) { + ret = -1; + ctf_trace_put(ctf_trace); + goto error_unlock; + } + } else { + const bool reference_acquired = lttng_trace_chunk_get( + viewer_session->current_trace_chunk); + + assert(reference_acquired); + viewer_stream_trace_chunk = + viewer_session->current_trace_chunk; + } + + viewer_stream = viewer_stream_create( + relay_stream, + viewer_stream_trace_chunk, + seek_t); + lttng_trace_chunk_put(viewer_stream_trace_chunk); + viewer_stream_trace_chunk = NULL; + if (!viewer_stream) { ret = -1; ctf_trace_put(ctf_trace); - stream_put(stream); goto error_unlock; } @@ -374,36 +419,40 @@ static int make_viewer_streams(struct relay_session *session, * Ensure a self-reference is preserved even * after we have put our local reference. */ - if (!viewer_stream_get(vstream)) { + if (!viewer_stream_get(viewer_stream)) { ERR("Unable to get self-reference on viewer stream, logic error."); abort(); } } else { - if (!vstream->sent_flag && nb_unsent) { + if (!viewer_stream->sent_flag && nb_unsent) { /* Update number of unsent stream counter. */ (*nb_unsent)++; } } /* Update number of total stream counter. */ if (nb_total) { - if (stream->is_metadata) { - if (!stream->closed || - stream->metadata_received > vstream->metadata_sent) { + if (relay_stream->is_metadata) { + if (!relay_stream->closed || + relay_stream->metadata_received > + viewer_stream->metadata_sent) { (*nb_total)++; } } else { - if (!stream->closed || - !(((int64_t) (stream->prev_data_seq - stream->last_net_seq_num)) >= 0)) { - + if (!relay_stream->closed || + !(((int64_t)(relay_stream->prev_data_seq - + relay_stream->last_net_seq_num)) >= + 0)) { (*nb_total)++; } } } /* Put local reference. */ - viewer_stream_put(vstream); + viewer_stream_put(viewer_stream); next: - stream_put(stream); + pthread_mutex_unlock(&relay_stream->lock); + stream_put(relay_stream); } + relay_stream = NULL; ctf_trace_put(ctf_trace); } @@ -412,6 +461,11 @@ static int make_viewer_streams(struct relay_session *session, error_unlock: rcu_read_unlock(); error: + if (relay_stream) { + pthread_mutex_unlock(&relay_stream->lock); + stream_put(relay_stream); + } + return ret; } @@ -1088,7 +1142,7 @@ int viewer_get_new_streams(struct relay_connection *conn) pthread_mutex_lock(&session->lock); ret = make_viewer_streams(session, - conn->viewer_session->current_trace_chunk, + conn->viewer_session, LTTNG_VIEWER_SEEK_LAST, &nb_total, &nb_unsent, &nb_created, &closed); if (ret < 0) { @@ -1237,7 +1291,7 @@ int viewer_attach_session(struct relay_connection *conn) } ret = make_viewer_streams(session, - conn->viewer_session->current_trace_chunk, seek_type, + conn->viewer_session, seek_type, &nb_streams, NULL, NULL, &closed); if (ret < 0) { goto end_put_session; diff --git a/src/bin/lttng-relayd/viewer-session.h b/src/bin/lttng-relayd/viewer-session.h index ebde2dd1d..2895db0ef 100644 --- a/src/bin/lttng-relayd/viewer-session.h +++ b/src/bin/lttng-relayd/viewer-session.h @@ -30,7 +30,6 @@ struct relay_viewer_session { */ struct cds_list_head session_list; /* RCU list. */ pthread_mutex_t session_list_lock; /* Protects list updates. */ - /* Once set, the current trace chunk of a viewer must not change. */ struct lttng_trace_chunk *current_trace_chunk; }; diff --git a/src/bin/lttng-relayd/viewer-stream.c b/src/bin/lttng-relayd/viewer-stream.c index 05499215a..f07e062f8 100644 --- a/src/bin/lttng-relayd/viewer-stream.c +++ b/src/bin/lttng-relayd/viewer-stream.c @@ -34,14 +34,15 @@ static void viewer_stream_destroy_rcu(struct rcu_head *head) viewer_stream_destroy(vstream); } +/* Relay stream's lock must be held by the caller. */ struct relay_viewer_stream *viewer_stream_create(struct relay_stream *stream, - struct lttng_trace_chunk *viewer_trace_chunk, + struct lttng_trace_chunk *trace_chunk, enum lttng_viewer_seek seek_t) { struct relay_viewer_stream *vstream = NULL; - const bool acquired_reference = lttng_trace_chunk_get( - viewer_trace_chunk); + const bool acquired_reference = lttng_trace_chunk_get(trace_chunk); + ASSERT_LOCKED(stream->lock); if (!acquired_reference) { goto error; } @@ -52,8 +53,8 @@ struct relay_viewer_stream *viewer_stream_create(struct relay_stream *stream, goto error; } - vstream->stream_file.trace_chunk = viewer_trace_chunk; - viewer_trace_chunk = NULL; + vstream->stream_file.trace_chunk = trace_chunk; + trace_chunk = NULL; vstream->path_name = lttng_strndup(stream->path_name, LTTNG_VIEWER_PATH_MAX); if (vstream->path_name == NULL) { PERROR("relay viewer path_name alloc"); @@ -72,8 +73,6 @@ struct relay_viewer_stream *viewer_stream_create(struct relay_stream *stream, } vstream->stream = stream; - pthread_mutex_lock(&stream->lock); - if (stream->is_metadata && stream->trace->viewer_metadata_stream) { ERR("Cannot attach viewer metadata stream to trace (busy)."); goto error_unlock; @@ -183,7 +182,6 @@ struct relay_viewer_stream *viewer_stream_create(struct relay_stream *stream, rcu_assign_pointer(stream->trace->viewer_metadata_stream, vstream); } - pthread_mutex_unlock(&stream->lock); /* Globally visible after the add unique. */ lttng_ht_node_init_u64(&vstream->stream_n, stream->stream_handle); @@ -198,8 +196,8 @@ error: if (vstream) { viewer_stream_destroy(vstream); } - if (viewer_trace_chunk && acquired_reference) { - lttng_trace_chunk_put(viewer_trace_chunk); + if (trace_chunk && acquired_reference) { + lttng_trace_chunk_put(trace_chunk); } return NULL; } -- 2.34.1