X-Git-Url: https://git.lttng.org/?a=blobdiff_plain;f=libringbuffer%2Fring_buffer_frontend.c;h=aeb7f7d94a10a9c4b63d098df1191ba4cdcef47e;hb=93dccd5a8a7657cf4d51bdd042055db17fd678ae;hp=80a0c5f438c371aeca4e3d5fee0bf0853081f066;hpb=1b7b0501cb18e5799d102c95592b208ca33f317a;p=lttng-ust.git diff --git a/libringbuffer/ring_buffer_frontend.c b/libringbuffer/ring_buffer_frontend.c index 80a0c5f4..aeb7f7d9 100644 --- a/libringbuffer/ring_buffer_frontend.c +++ b/libringbuffer/ring_buffer_frontend.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include "smp.h" @@ -84,6 +85,8 @@ #define LTTNG_UST_RB_SIG_READ SIGRTMIN + 1 #define LTTNG_UST_RB_SIG_TEARDOWN SIGRTMIN + 2 #define CLOCKID CLOCK_MONOTONIC +#define LTTNG_UST_RING_BUFFER_GET_RETRY 10 +#define LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS 10 /* * Use POSIX SHM: shm_open(3) and shm_unlink(3). @@ -1067,8 +1070,7 @@ int lib_ring_buffer_get_subbuf(struct lttng_ust_lib_ring_buffer *buf, struct channel *chan = shmp(handle, buf->backend.chan); const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config; unsigned long consumed_cur, consumed_idx, commit_count, write_offset; - int ret; - int finalized; + int ret, finalized, nr_retry = LTTNG_UST_RING_BUFFER_GET_RETRY; retry: finalized = CMM_ACCESS_ONCE(buf->finalized); @@ -1103,14 +1105,66 @@ retry: /* * Check that the subbuffer we are trying to consume has been - * already fully committed. + * already fully committed. There are a few causes that can make + * this unavailability situation occur: + * + * Temporary (short-term) situation: + * - Application is running on a different CPU, between reserve + * and commit ring buffer operations, + * - Application is preempted between reserve and commit ring + * buffer operations, + * + * Long-term situation: + * - Application is stopped (SIGSTOP) between reserve and commit + * ring buffer operations. Could eventually be resumed by + * SIGCONT. + * - Application is killed (SIGTERM, SIGINT, SIGKILL) between + * reserve and commit ring buffer operation. + * + * From a consumer perspective, handling short-term + * unavailability situations is performed by retrying a few + * times after a delay. Handling long-term unavailability + * situations is handled by failing to get the sub-buffer. + * + * In all of those situations, if the application is taking a + * long time to perform its commit after ring buffer space + * reservation, we can end up in a situation where the producer + * will fill the ring buffer and try to write into the same + * sub-buffer again (which has a missing commit). This is + * handled by the producer in the sub-buffer switch handling + * code of the reserve routine by detecting unbalanced + * reserve/commit counters and discarding all further events + * until the situation is resolved in those situations. Two + * scenarios can occur: + * + * 1) The application causing the reserve/commit counters to be + * unbalanced has been terminated. In this situation, all + * further events will be discarded in the buffers, and no + * further buffer data will be readable by the consumer + * daemon. Tearing down the UST tracing session and starting + * anew is a work-around for those situations. Note that this + * only affects per-UID tracing. In per-PID tracing, the + * application vanishes with the termination, and therefore + * no more data needs to be written to the buffers. + * 2) The application causing the unbalance has been delayed for + * a long time, but will eventually try to increment the + * commit counter after eventually writing to the sub-buffer. + * This situation can cause events to be discarded until the + * application resumes its operations. */ if (((commit_count - chan->backend.subbuf_size) & chan->commit_count_mask) - (buf_trunc(consumed, chan) >> chan->backend.num_subbuf_order) - != 0) - goto nodata; + != 0) { + if (nr_retry-- > 0) { + if (nr_retry <= (LTTNG_UST_RING_BUFFER_GET_RETRY >> 1)) + (void) poll(NULL, 0, LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS); + goto retry; + } else { + goto nodata; + } + } /* * Check that we are not about to read the same subbuffer in @@ -1126,12 +1180,23 @@ retry: * the writer is getting access to a subbuffer we were trying to get * access to. Also checks that the "consumed" buffer count we are * looking for matches the one contained in the subbuffer id. + * + * The short-lived race window described here can be affected by + * application signals and preemption, thus requiring to bound + * the loop to a maximum number of retry. */ ret = update_read_sb_index(config, &buf->backend, &chan->backend, consumed_idx, buf_trunc_val(consumed, chan), handle); - if (ret) - goto retry; + if (ret) { + if (nr_retry-- > 0) { + if (nr_retry <= (LTTNG_UST_RING_BUFFER_GET_RETRY >> 1)) + (void) poll(NULL, 0, LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS); + goto retry; + } else { + goto nodata; + } + } subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id); buf->get_subbuf_consumed = consumed; @@ -1335,9 +1400,8 @@ void lib_ring_buffer_switch_old_start(struct lttng_ust_lib_ring_buffer *buf, lib_ring_buffer_check_deliver(config, buf, chan, offsets->old, commit_count, oldidx, handle, tsc); lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx, - offsets->old, commit_count, - config->cb.subbuffer_header_size(), - handle); + offsets->old + config->cb.subbuffer_header_size(), + commit_count, handle); } /* @@ -1374,8 +1438,7 @@ void lib_ring_buffer_switch_old_end(struct lttng_ust_lib_ring_buffer *buf, lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1, commit_count, oldidx, handle, tsc); lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx, - offsets->old, commit_count, - padding_size, handle); + offsets->old + padding_size, commit_count, handle); } /* @@ -1410,9 +1473,8 @@ void lib_ring_buffer_switch_new_start(struct lttng_ust_lib_ring_buffer *buf, lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin, commit_count, beginidx, handle, tsc); lib_ring_buffer_write_commit_counter(config, buf, chan, beginidx, - offsets->begin, commit_count, - config->cb.subbuffer_header_size(), - handle); + offsets->begin + config->cb.subbuffer_header_size(), + commit_count, handle); } /*