X-Git-Url: https://git.lttng.org/?a=blobdiff_plain;f=libringbuffer%2Fring_buffer_frontend.c;h=f9bf5c8d7a00f9d5f08216cd99d69e227757eea2;hb=295097ea1074019c494918a976d76feef44e59c9;hp=eb4e48629ca3d26a08b3c015b069a0e3c29f7d68;hpb=46dba5ef6aa339f476a7c630818696ccf5f4574f;p=lttng-ust.git

diff --git a/libringbuffer/ring_buffer_frontend.c b/libringbuffer/ring_buffer_frontend.c
index eb4e4862..f9bf5c8d 100644
--- a/libringbuffer/ring_buffer_frontend.c
+++ b/libringbuffer/ring_buffer_frontend.c
@@ -84,6 +84,8 @@
 #define LTTNG_UST_RB_SIG_READ		SIGRTMIN + 1
 #define LTTNG_UST_RB_SIG_TEARDOWN	SIGRTMIN + 2
 #define CLOCKID		CLOCK_MONOTONIC
+#define LTTNG_UST_RING_BUFFER_GET_RETRY		10
+#define LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS	10
 
 /*
  * Use POSIX SHM: shm_open(3) and shm_unlink(3).
@@ -1067,8 +1069,7 @@ int lib_ring_buffer_get_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 	struct channel *chan = shmp(handle, buf->backend.chan);
 	const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 	unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
-	int ret;
-	int finalized;
+	int ret, finalized, nr_retry = LTTNG_UST_RING_BUFFER_GET_RETRY;
 
 retry:
 	finalized = CMM_ACCESS_ONCE(buf->finalized);
@@ -1103,14 +1104,66 @@ retry:
 
 	/*
 	 * Check that the subbuffer we are trying to consume has been
-	 * already fully committed.
+	 * already fully committed. There are a few causes that can make
+	 * this unavailability situation occur:
+	 *
+	 * Temporary (short-term) situation:
+	 * - Application is running on a different CPU, between reserve
+	 *   and commit ring buffer operations,
+	 * - Application is preempted between reserve and commit ring
+	 *   buffer operations,
+	 *
+	 * Long-term situation:
+	 * - Application is stopped (SIGSTOP) between reserve and commit
+	 *   ring buffer operations. Could eventually be resumed by
+	 *   SIGCONT.
+	 * - Application is killed (SIGTERM, SIGINT, SIGKILL) between
+	 *   reserve and commit ring buffer operation.
+	 *
+	 * From a consumer perspective, handling short-term
+	 * unavailability situations is performed by retrying a few
+	 * times after a delay. Handling long-term unavailability
+	 * situations is handled by failing to get the sub-buffer.
+	 *
+	 * In all of those situations, if the application is taking a
+	 * long time to perform its commit after ring buffer space
+	 * reservation, we can end up in a situation where the producer
+	 * will fill the ring buffer and try to write into the same
+	 * sub-buffer again (which has a missing commit). This is
+	 * handled by the producer in the sub-buffer switch handling
+	 * code of the reserve routine by detecting unbalanced
+	 * reserve/commit counters and discarding all further events
+	 * until the situation is resolved in those situations. Two
+	 * scenarios can occur:
+	 *
+	 * 1) The application causing the reserve/commit counters to be
+	 *    unbalanced has been terminated. In this situation, all
+	 *    further events will be discarded in the buffers, and no
+	 *    further buffer data will be readable by the consumer
+	 *    daemon. Tearing down the UST tracing session and starting
+	 *    anew is a work-around for those situations. Note that this
+	 *    only affects per-UID tracing. In per-PID tracing, the
+	 *    application vanishes with the termination, and therefore
+	 *    no more data needs to be written to the buffers.
+	 * 2) The application causing the unbalance has been delayed for
+	 *    a long time, but will eventually try to increment the
+	 *    commit counter after eventually writing to the sub-buffer.
+	 *    This situation can cause events to be discarded until the
+	 *    application resumes its operations.
 	 */
 	if (((commit_count - chan->backend.subbuf_size)
 	     & chan->commit_count_mask)
 	    - (buf_trunc(consumed, chan)
 	       >> chan->backend.num_subbuf_order)
-	    != 0)
-		goto nodata;
+	    != 0) {
+		if (nr_retry-- > 0) {
+			if (nr_retry <= (LTTNG_UST_RING_BUFFER_GET_RETRY >> 1))
+				(void) poll(NULL, 0, LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS);
+			goto retry;
+		} else {
+			goto nodata;
+		}
+	}
 
 	/*
 	 * Check that we are not about to read the same subbuffer in
@@ -1126,12 +1179,23 @@ retry:
 	 * the writer is getting access to a subbuffer we were trying to get
 	 * access to. Also checks that the "consumed" buffer count we are
 	 * looking for matches the one contained in the subbuffer id.
+	 *
+	 * The short-lived race window described here can be affected by
+	 * application signals and preemption, thus requiring to bound
+	 * the loop to a maximum number of retry.
 	 */
 	ret = update_read_sb_index(config, &buf->backend, &chan->backend,
 				   consumed_idx, buf_trunc_val(consumed, chan),
 				   handle);
-	if (ret)
-		goto retry;
+	if (ret) {
+		if (nr_retry-- > 0) {
+			if (nr_retry <= (LTTNG_UST_RING_BUFFER_GET_RETRY >> 1))
+				(void) poll(NULL, 0, LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS);
+			goto retry;
+		} else {
+			goto nodata;
+		}
+	}
 	subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);
 
 	buf->get_subbuf_consumed = consumed;