libringbuffer/ring_buffer_frontend.c

   1 /*
   2  * ring_buffer_frontend.c
   3  *
   4  * Copyright (C) 2005-2012 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; only
   9  * version 2.1 of the License.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  *
  21  * Ring buffer wait-free buffer synchronization. Producer-consumer and flight
  22  * recorder (overwrite) modes. See thesis:
  23  *
  24  * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D.
  25  * dissertation, Ecole Polytechnique de Montreal.
  26  * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf
  27  *
  28  * - Algorithm presentation in Chapter 5:
  29  *     "Lockless Multi-Core High-Throughput Buffering".
  30  * - Algorithm formal verification in Section 8.6:
  31  *     "Formal verification of LTTng"
  32  *
  33  * Author:
  34  *      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  35  *
  36  * Inspired from LTT and RelayFS:
  37  *  Karim Yaghmour <karim@opersys.com>
  38  *  Tom Zanussi <zanussi@us.ibm.com>
  39  *  Bob Wisniewski <bob@watson.ibm.com>
  40  * And from K42 :
  41  *  Bob Wisniewski <bob@watson.ibm.com>
  42  *
  43  * Buffer reader semantic :
  44  *
  45  * - get_subbuf_size
  46  * while buffer is not finalized and empty
  47  *   - get_subbuf
  48  *     - if return value != 0, continue
  49  *   - splice one subbuffer worth of data to a pipe
  50  *   - splice the data from pipe to disk/network
  51  *   - put_subbuf
  52  */
  53
  54 #define _GNU_SOURCE
  55 #define _LGPL_SOURCE
  56 #include <sys/types.h>
  57 #include <sys/mman.h>
  58 #include <sys/stat.h>
  59 #include <unistd.h>
  60 #include <fcntl.h>
  61 #include <signal.h>
  62 #include <time.h>
  63 #include <stdbool.h>
  64 #include <urcu/compiler.h>
  65 #include <urcu/ref.h>
  66 #include <urcu/tls-compat.h>
  67 #include <poll.h>
  68 #include <helper.h>
  69
  70 #include "smp.h"
  71 #include <lttng/ringbuffer-config.h>
  72 #include "vatomic.h"
  73 #include "backend.h"
  74 #include "frontend.h"
  75 #include "shm.h"
  76 #include "rb-init.h"
  77 #include "../liblttng-ust/compat.h"     /* For ENODATA */
  78
  79 /* Print DBG() messages about events lost only every 1048576 hits */
  80 #define DBG_PRINT_NR_LOST       (1UL << 20)
  81
  82 #define LTTNG_UST_RB_SIG_FLUSH          SIGRTMIN
  83 #define LTTNG_UST_RB_SIG_READ           SIGRTMIN + 1
  84 #define LTTNG_UST_RB_SIG_TEARDOWN       SIGRTMIN + 2
  85 #define CLOCKID         CLOCK_MONOTONIC
  86 #define LTTNG_UST_RING_BUFFER_GET_RETRY         10
  87 #define LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS    10
  88 #define RETRY_DELAY_MS                          100     /* 100 ms. */
  89
/*
 * XOR-masked form of the crash dump ABI magic. init_crash_abi() XORs each
 * byte with 0xFF to produce the magic stored in a buffer's crash ABI
 * descriptor.
 *
 * Non-static to ensure the compiler does not optimize away the xor.
 */
uint8_t lttng_crash_magic_xor[] = RB_CRASH_DUMP_ABI_MAGIC_XOR;
  94
/*
 * Use POSIX SHM: shm_open(3) and shm_unlink(3).
 * close(2) to close the fd returned by shm_open.
 * shm_unlink releases the shared memory object name.
 * ftruncate(2) sets the size of the memory object.
 * mmap/munmap maps the shared memory obj to a virtual address in the
 * calling process (should be done both in libust and consumer).
 * See shm_overview(7) for details.
 * Pass file descriptor returned by shm_open(3) to ltt-sessiond through
 * a UNIX socket.
 *
 * Since we don't need to access the object using its name, we can
 * immediately shm_unlink(3) it, and only keep the handle with its file
 * descriptor.
 */
 110
 111 /*
 112  * Internal structure representing offsets to use at a sub-buffer switch.
 113  */
 114 struct switch_offsets {
 115         unsigned long begin, end, old;
 116         size_t pre_header_padding, size;
 117         unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1,
 118                      switch_old_end:1;
 119 };
 120
/*
 * Per-thread unsigned counter, defined through the liburcu TLS
 * compatibility macro.
 * NOTE(review): presumably tracks ring buffer nesting depth; it is not
 * used in this part of the file -- confirm against its users.
 */
DEFINE_URCU_TLS(unsigned int, lib_ring_buffer_nesting);
 122
/*
 * wakeup_fd_mutex protects wakeup fd use by timer from concurrent
 * close. Both timer paths in this file (switch timer and read timer)
 * hold it while they walk a channel's buffers.
 */
static pthread_mutex_t wakeup_fd_mutex = PTHREAD_MUTEX_INITIALIZER;
 128
/*
 * Forward declaration. The function is static, so its definition lives
 * elsewhere in this translation unit.
 */
static
void lib_ring_buffer_print_errors(struct channel *chan,
                                struct lttng_ust_lib_ring_buffer *buf, int cpu,
                                struct lttng_ust_shm_handle *handle);
 133
/*
 * Handles the timer teardown race with respect to freeing private data:
 * ring buffer signals are handled by a single thread, which provides a
 * synchronization point between the handling of each signal.
 * Protected by the lock within the structure.
 */
struct timer_signal_data {
        pthread_t tid;  /* thread id managing signals */
        int setup_done; /* set once the signal thread setup has run */
        int qs_done;    /* set by the signal thread on the teardown signal */
        pthread_mutex_t lock;   /* serializes thread setup and teardown waits */
};
 146
/*
 * Single file-scope instance shared between the signal-handling thread
 * and the code that sets up and tears down timers.
 */
static struct timer_signal_data timer_signal = {
        .tid = 0,
        .setup_done = 0,
        .qs_done = 0,
        .lock = PTHREAD_MUTEX_INITIALIZER,
};
 153
/*
 * Whether blocking is allowed. False until
 * lttng_ust_ringbuffer_set_allow_blocking() is called; while false,
 * lttng_ust_ringbuffer_get_timeout() reports a timeout of 0.
 */
static bool lttng_ust_allow_blocking;

/*
 * Allow blocking. Nothing in this part of the file clears the flag once
 * it has been set.
 */
void lttng_ust_ringbuffer_set_allow_blocking(void)
{
        lttng_ust_allow_blocking = true;
}
 160
 161 /* Get blocking timeout, in ms */
 162 static int lttng_ust_ringbuffer_get_timeout(struct channel *chan)
 163 {
 164         if (!lttng_ust_allow_blocking)
 165                 return 0;
 166         return chan->u.s.blocking_timeout_ms;
 167 }
 168
 169 /**
 170  * lib_ring_buffer_reset - Reset ring buffer to initial values.
 171  * @buf: Ring buffer.
 172  *
 173  * Effectively empty the ring buffer. Should be called when the buffer is not
 174  * used for writing. The ring buffer can be opened for reading, but the reader
 175  * should not be using the iterator concurrently with reset. The previous
 176  * current iterator record is reset.
 177  */
 178 void lib_ring_buffer_reset(struct lttng_ust_lib_ring_buffer *buf,
 179                            struct lttng_ust_shm_handle *handle)
 180 {
 181         struct channel *chan;
 182         const struct lttng_ust_lib_ring_buffer_config *config;
 183         unsigned int i;
 184
 185         chan = shmp(handle, buf->backend.chan);
 186         if (!chan)
 187                 return;
 188         config = &chan->backend.config;
 189         /*
 190          * Reset iterator first. It will put the subbuffer if it currently holds
 191          * it.
 192          */
 193         v_set(config, &buf->offset, 0);
 194         for (i = 0; i < chan->backend.num_subbuf; i++) {
 195                 struct commit_counters_hot *cc_hot;
 196                 struct commit_counters_cold *cc_cold;
 197
 198                 cc_hot = shmp_index(handle, buf->commit_hot, i);
 199                 if (!cc_hot)
 200                         return;
 201                 cc_cold = shmp_index(handle, buf->commit_cold, i);
 202                 if (!cc_cold)
 203                         return;
 204                 v_set(config, &cc_hot->cc, 0);
 205                 v_set(config, &cc_hot->seq, 0);
 206                 v_set(config, &cc_cold->cc_sb, 0);
 207         }
 208         uatomic_set(&buf->consumed, 0);
 209         uatomic_set(&buf->record_disabled, 0);
 210         v_set(config, &buf->last_tsc, 0);
 211         lib_ring_buffer_backend_reset(&buf->backend, handle);
 212         /* Don't reset number of active readers */
 213         v_set(config, &buf->records_lost_full, 0);
 214         v_set(config, &buf->records_lost_wrap, 0);
 215         v_set(config, &buf->records_lost_big, 0);
 216         v_set(config, &buf->records_count, 0);
 217         v_set(config, &buf->records_overrun, 0);
 218         buf->finalized = 0;
 219 }
 220
/**
 * channel_reset - Reset channel to initial values.
 * @chan: Channel.
 *
 * Effectively empty the channel. Should be called when the channel is not used
 * for writing. The channel can be opened for reading, but the reader should not
 * be using the iterator concurrently with reset. The previous current iterator
 * record is reset.
 *
 * NOTE(review): the code below only clears record_disabled and resets the
 * channel backend; no iterator is touched here -- confirm whether the
 * iterator wording above still applies.
 */
void channel_reset(struct channel *chan)
{
        /*
         * Clear the channel-wide record_disabled counter before resetting
         * the backend.
         */
        uatomic_set(&chan->record_disabled, 0);
        /* Don't reset commit_count_mask, still valid */
        channel_backend_reset(&chan->backend);
        /* Don't reset switch/read timer interval */
        /* Don't reset notifiers and notifier enable bits */
        /* Don't reset reader reference count */
}
 242
 243 static
 244 void init_crash_abi(const struct lttng_ust_lib_ring_buffer_config *config,
 245                 struct lttng_crash_abi *crash_abi,
 246                 struct lttng_ust_lib_ring_buffer *buf,
 247                 struct channel_backend *chanb,
 248                 struct shm_object *shmobj,
 249                 struct lttng_ust_shm_handle *handle)
 250 {
 251         int i;
 252
 253         for (i = 0; i < RB_CRASH_DUMP_ABI_MAGIC_LEN; i++)
 254                 crash_abi->magic[i] = lttng_crash_magic_xor[i] ^ 0xFF;
 255         crash_abi->mmap_length = shmobj->memory_map_size;
 256         crash_abi->endian = RB_CRASH_ENDIAN;
 257         crash_abi->major = RB_CRASH_DUMP_ABI_MAJOR;
 258         crash_abi->minor = RB_CRASH_DUMP_ABI_MINOR;
 259         crash_abi->word_size = sizeof(unsigned long);
 260         crash_abi->layout_type = LTTNG_CRASH_TYPE_UST;
 261
 262         /* Offset of fields */
 263         crash_abi->offset.prod_offset =
 264                 (uint32_t) ((char *) &buf->offset - (char *) buf);
 265         crash_abi->offset.consumed_offset =
 266                 (uint32_t) ((char *) &buf->consumed - (char *) buf);
 267         crash_abi->offset.commit_hot_array =
 268                 (uint32_t) ((char *) shmp(handle, buf->commit_hot) - (char *) buf);
 269         crash_abi->offset.commit_hot_seq =
 270                 offsetof(struct commit_counters_hot, seq);
 271         crash_abi->offset.buf_wsb_array =
 272                 (uint32_t) ((char *) shmp(handle, buf->backend.buf_wsb) - (char *) buf);
 273         crash_abi->offset.buf_wsb_id =
 274                 offsetof(struct lttng_ust_lib_ring_buffer_backend_subbuffer, id);
 275         crash_abi->offset.sb_array =
 276                 (uint32_t) ((char *) shmp(handle, buf->backend.array) - (char *) buf);
 277         crash_abi->offset.sb_array_shmp_offset =
 278                 offsetof(struct lttng_ust_lib_ring_buffer_backend_pages_shmp,
 279                         shmp._ref.offset);
 280         crash_abi->offset.sb_backend_p_offset =
 281                 offsetof(struct lttng_ust_lib_ring_buffer_backend_pages,
 282                         p._ref.offset);
 283
 284         /* Field length */
 285         crash_abi->length.prod_offset = sizeof(buf->offset);
 286         crash_abi->length.consumed_offset = sizeof(buf->consumed);
 287         crash_abi->length.commit_hot_seq =
 288                 sizeof(((struct commit_counters_hot *) NULL)->seq);
 289         crash_abi->length.buf_wsb_id =
 290                 sizeof(((struct lttng_ust_lib_ring_buffer_backend_subbuffer *) NULL)->id);
 291         crash_abi->length.sb_array_shmp_offset =
 292                 sizeof(((struct lttng_ust_lib_ring_buffer_backend_pages_shmp *) NULL)->shmp._ref.offset);
 293         crash_abi->length.sb_backend_p_offset =
 294                 sizeof(((struct lttng_ust_lib_ring_buffer_backend_pages *) NULL)->p._ref.offset);
 295
 296         /* Array stride */
 297         crash_abi->stride.commit_hot_array =
 298                 sizeof(struct commit_counters_hot);
 299         crash_abi->stride.buf_wsb_array =
 300                 sizeof(struct lttng_ust_lib_ring_buffer_backend_subbuffer);
 301         crash_abi->stride.sb_array =
 302                 sizeof(struct lttng_ust_lib_ring_buffer_backend_pages_shmp);
 303
 304         /* Buffer constants */
 305         crash_abi->buf_size = chanb->buf_size;
 306         crash_abi->subbuf_size = chanb->subbuf_size;
 307         crash_abi->num_subbuf = chanb->num_subbuf;
 308         crash_abi->mode = (uint32_t) chanb->config.mode;
 309
 310         if (config->cb.content_size_field) {
 311                 size_t offset, length;
 312
 313                 config->cb.content_size_field(config, &offset, &length);
 314                 crash_abi->offset.content_size = offset;
 315                 crash_abi->length.content_size = length;
 316         } else {
 317                 crash_abi->offset.content_size = 0;
 318                 crash_abi->length.content_size = 0;
 319         }
 320         if (config->cb.packet_size_field) {
 321                 size_t offset, length;
 322
 323                 config->cb.packet_size_field(config, &offset, &length);
 324                 crash_abi->offset.packet_size = offset;
 325                 crash_abi->length.packet_size = length;
 326         } else {
 327                 crash_abi->offset.packet_size = 0;
 328                 crash_abi->length.packet_size = 0;
 329         }
 330 }
 331
 332 /*
 333  * Must be called under cpu hotplug protection.
 334  */
 335 int lib_ring_buffer_create(struct lttng_ust_lib_ring_buffer *buf,
 336                            struct channel_backend *chanb, int cpu,
 337                            struct lttng_ust_shm_handle *handle,
 338                            struct shm_object *shmobj)
 339 {
 340         const struct lttng_ust_lib_ring_buffer_config *config = &chanb->config;
 341         struct channel *chan = caa_container_of(chanb, struct channel, backend);
 342         struct lttng_ust_lib_ring_buffer_backend_subbuffer *wsb;
 343         struct channel *shmp_chan;
 344         struct commit_counters_hot *cc_hot;
 345         void *priv = channel_get_private(chan);
 346         size_t subbuf_header_size;
 347         uint64_t tsc;
 348         int ret;
 349
 350         /* Test for cpu hotplug */
 351         if (buf->backend.allocated)
 352                 return 0;
 353
 354         align_shm(shmobj, __alignof__(struct commit_counters_hot));
 355         set_shmp(buf->commit_hot,
 356                  zalloc_shm(shmobj,
 357                         sizeof(struct commit_counters_hot) * chan->backend.num_subbuf));
 358         if (!shmp(handle, buf->commit_hot)) {
 359                 return -ENOMEM;
 360         }
 361
 362         align_shm(shmobj, __alignof__(struct commit_counters_cold));
 363         set_shmp(buf->commit_cold,
 364                  zalloc_shm(shmobj,
 365                         sizeof(struct commit_counters_cold) * chan->backend.num_subbuf));
 366         if (!shmp(handle, buf->commit_cold)) {
 367                 ret = -ENOMEM;
 368                 goto free_commit;
 369         }
 370
 371         ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend,
 372                         cpu, handle, shmobj);
 373         if (ret) {
 374                 goto free_init;
 375         }
 376
 377         /*
 378          * Write the subbuffer header for first subbuffer so we know the total
 379          * duration of data gathering.
 380          */
 381         subbuf_header_size = config->cb.subbuffer_header_size();
 382         v_set(config, &buf->offset, subbuf_header_size);
 383         wsb = shmp_index(handle, buf->backend.buf_wsb, 0);
 384         if (!wsb) {
 385                 ret = -EPERM;
 386                 goto free_chanbuf;
 387         }
 388         subbuffer_id_clear_noref(config, &wsb->id);
 389         shmp_chan = shmp(handle, buf->backend.chan);
 390         if (!shmp_chan) {
 391                 ret = -EPERM;
 392                 goto free_chanbuf;
 393         }
 394         tsc = config->cb.ring_buffer_clock_read(shmp_chan);
 395         config->cb.buffer_begin(buf, tsc, 0, handle);
 396         cc_hot = shmp_index(handle, buf->commit_hot, 0);
 397         if (!cc_hot) {
 398                 ret = -EPERM;
 399                 goto free_chanbuf;
 400         }
 401         v_add(config, subbuf_header_size, &cc_hot->cc);
 402         v_add(config, subbuf_header_size, &cc_hot->seq);
 403
 404         if (config->cb.buffer_create) {
 405                 ret = config->cb.buffer_create(buf, priv, cpu, chanb->name, handle);
 406                 if (ret)
 407                         goto free_chanbuf;
 408         }
 409
 410         init_crash_abi(config, &buf->crash_abi, buf, chanb, shmobj, handle);
 411
 412         buf->backend.allocated = 1;
 413         return 0;
 414
 415         /* Error handling */
 416 free_init:
 417         /* commit_cold will be freed by shm teardown */
 418 free_commit:
 419         /* commit_hot will be freed by shm teardown */
 420 free_chanbuf:
 421         return ret;
 422 }
 423
 424 static
 425 void lib_ring_buffer_channel_switch_timer(int sig, siginfo_t *si, void *uc)
 426 {
 427         const struct lttng_ust_lib_ring_buffer_config *config;
 428         struct lttng_ust_shm_handle *handle;
 429         struct channel *chan;
 430         int cpu;
 431
 432         assert(CMM_LOAD_SHARED(timer_signal.tid) == pthread_self());
 433
 434         chan = si->si_value.sival_ptr;
 435         handle = chan->handle;
 436         config = &chan->backend.config;
 437
 438         DBG("Switch timer for channel %p\n", chan);
 439
 440         /*
 441          * Only flush buffers periodically if readers are active.
 442          */
 443         pthread_mutex_lock(&wakeup_fd_mutex);
 444         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 445                 for_each_possible_cpu(cpu) {
 446                         struct lttng_ust_lib_ring_buffer *buf =
 447                                 shmp(handle, chan->backend.buf[cpu].shmp);
 448
 449                         if (!buf)
 450                                 goto end;
 451                         if (uatomic_read(&buf->active_readers))
 452                                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE,
 453                                         chan->handle);
 454                 }
 455         } else {
 456                 struct lttng_ust_lib_ring_buffer *buf =
 457                         shmp(handle, chan->backend.buf[0].shmp);
 458
 459                 if (!buf)
 460                         goto end;
 461                 if (uatomic_read(&buf->active_readers))
 462                         lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE,
 463                                 chan->handle);
 464         }
 465 end:
 466         pthread_mutex_unlock(&wakeup_fd_mutex);
 467         return;
 468 }
 469
/*
 * Check whether the sub-buffer at the current consumed position can be
 * delivered to a reader.
 *
 * Returns 1 when that sub-buffer is fully committed and the writer head is
 * in a different sub-buffer; 0 otherwise, including when the cold commit
 * counter cannot be resolved through @handle.
 */
static
int lib_ring_buffer_poll_deliver(const struct lttng_ust_lib_ring_buffer_config *config,
                                 struct lttng_ust_lib_ring_buffer *buf,
                                 struct channel *chan,
                                 struct lttng_ust_shm_handle *handle)
{
        unsigned long consumed_old, consumed_idx, commit_count, write_offset;
        struct commit_counters_cold *cc_cold;

        consumed_old = uatomic_read(&buf->consumed);
        consumed_idx = subbuf_index(consumed_old, chan);
        cc_cold = shmp_index(handle, buf->commit_cold, consumed_idx);
        if (!cc_cold)
                return 0;
        commit_count = v_read(config, &cc_cold->cc_sb);
        /*
         * No memory barrier here, since we are only interested
         * in a statistically correct polling result. The next poll will
         * get the data if we are racing. The mb() that ensures correct
         * memory order is in get_subbuf.
         */
        write_offset = v_read(config, &buf->offset);

        /*
         * Check that the subbuffer we are trying to consume has been
         * already fully committed.
         */

        if (((commit_count - chan->backend.subbuf_size)
             & chan->commit_count_mask)
            - (buf_trunc(consumed_old, chan)
               >> chan->backend.num_subbuf_order)
            != 0)
                return 0;

        /*
         * Check that we are not about to read the same subbuffer in
         * which the writer head is.
         */
        if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_old, chan)
            == 0)
                return 0;

        return 1;
}
 515
/*
 * Wake up the reader of @buf by writing one byte to the buffer's wakeup fd.
 *
 * Does nothing when no wakeup fd is available (negative fd). A SIGPIPE
 * raised by the write is consumed here so that it is not delivered, unless
 * a SIGPIPE was already pending before the write, in which case the signal
 * state is left untouched.
 */
static
void lib_ring_buffer_wakeup(struct lttng_ust_lib_ring_buffer *buf,
                struct lttng_ust_shm_handle *handle)
{
        int wakeup_fd = shm_get_wakeup_fd(handle, &buf->self._ref);
        sigset_t sigpipe_set, pending_set, old_set;
        int ret, sigpipe_was_pending = 0;

        if (wakeup_fd < 0)
                return;

        /*
         * Wake-up the other end by writing a null byte in the pipe
         * (non-blocking).  Important note: Because writing into the
         * pipe is non-blocking (and therefore we allow dropping wakeup
         * data, as long as there is wakeup data present in the pipe
         * buffer to wake up the consumer), the consumer should perform
         * the following sequence for waiting:
         * 1) empty the pipe (reads).
         * 2) check if there is data in the buffer.
         * 3) wait on the pipe (poll).
         *
         * Discard the SIGPIPE from write(), not disturbing any SIGPIPE
         * that might be already pending. If a bogus SIGPIPE is sent to
         * the entire process concurrently by a malicious user, it may
         * be simply discarded.
         */
        ret = sigemptyset(&pending_set);
        assert(!ret);
        /*
         * sigpending returns the mask of signals that are _both_
         * blocked for the thread _and_ pending for either the thread or
         * the entire process.
         */
        ret = sigpending(&pending_set);
        assert(!ret);
        sigpipe_was_pending = sigismember(&pending_set, SIGPIPE);
        /*
         * If sigpipe was pending, it means it was already blocked, so
         * no need to block it.
         */
        if (!sigpipe_was_pending) {
                ret = sigemptyset(&sigpipe_set);
                assert(!ret);
                ret = sigaddset(&sigpipe_set, SIGPIPE);
                assert(!ret);
                /* old_set keeps the caller's mask for the restore below. */
                ret = pthread_sigmask(SIG_BLOCK, &sigpipe_set, &old_set);
                assert(!ret);
        }
        /* Write the wakeup byte, retrying when interrupted by a signal. */
        do {
                ret = write(wakeup_fd, "", 1);
        } while (ret == -1L && errno == EINTR);
        if (ret == -1L && errno == EPIPE && !sigpipe_was_pending) {
                /*
                 * The write raised SIGPIPE: consume it with a zero
                 * timeout, so this never blocks.
                 */
                struct timespec timeout = { 0, 0 };
                do {
                        ret = sigtimedwait(&sigpipe_set, NULL,
                                &timeout);
                } while (ret == -1L && errno == EINTR);
        }
        if (!sigpipe_was_pending) {
                /* Restore the signal mask saved before blocking SIGPIPE. */
                ret = pthread_sigmask(SIG_SETMASK, &old_set, NULL);
                assert(!ret);
        }
}
 580
 581 static
 582 void lib_ring_buffer_channel_do_read(struct channel *chan)
 583 {
 584         const struct lttng_ust_lib_ring_buffer_config *config;
 585         struct lttng_ust_shm_handle *handle;
 586         int cpu;
 587
 588         handle = chan->handle;
 589         config = &chan->backend.config;
 590
 591         /*
 592          * Only flush buffers periodically if readers are active.
 593          */
 594         pthread_mutex_lock(&wakeup_fd_mutex);
 595         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 596                 for_each_possible_cpu(cpu) {
 597                         struct lttng_ust_lib_ring_buffer *buf =
 598                                 shmp(handle, chan->backend.buf[cpu].shmp);
 599
 600                         if (!buf)
 601                                 goto end;
 602                         if (uatomic_read(&buf->active_readers)
 603                             && lib_ring_buffer_poll_deliver(config, buf,
 604                                         chan, handle)) {
 605                                 lib_ring_buffer_wakeup(buf, handle);
 606                         }
 607                 }
 608         } else {
 609                 struct lttng_ust_lib_ring_buffer *buf =
 610                         shmp(handle, chan->backend.buf[0].shmp);
 611
 612                 if (!buf)
 613                         goto end;
 614                 if (uatomic_read(&buf->active_readers)
 615                     && lib_ring_buffer_poll_deliver(config, buf,
 616                                 chan, handle)) {
 617                         lib_ring_buffer_wakeup(buf, handle);
 618                 }
 619         }
 620 end:
 621         pthread_mutex_unlock(&wakeup_fd_mutex);
 622 }
 623
 624 static
 625 void lib_ring_buffer_channel_read_timer(int sig, siginfo_t *si, void *uc)
 626 {
 627         struct channel *chan;
 628
 629         assert(CMM_LOAD_SHARED(timer_signal.tid) == pthread_self());
 630         chan = si->si_value.sival_ptr;
 631         DBG("Read timer for channel %p\n", chan);
 632         lib_ring_buffer_channel_do_read(chan);
 633         return;
 634 }
 635
 636 static
 637 void rb_setmask(sigset_t *mask)
 638 {
 639         int ret;
 640
 641         ret = sigemptyset(mask);
 642         if (ret) {
 643                 PERROR("sigemptyset");
 644         }
 645         ret = sigaddset(mask, LTTNG_UST_RB_SIG_FLUSH);
 646         if (ret) {
 647                 PERROR("sigaddset");
 648         }
 649         ret = sigaddset(mask, LTTNG_UST_RB_SIG_READ);
 650         if (ret) {
 651                 PERROR("sigaddset");
 652         }
 653         ret = sigaddset(mask, LTTNG_UST_RB_SIG_TEARDOWN);
 654         if (ret) {
 655                 PERROR("sigaddset");
 656         }
 657 }
 658
 659 static
 660 void *sig_thread(void *arg)
 661 {
 662         sigset_t mask;
 663         siginfo_t info;
 664         int signr;
 665
 666         /* Only self thread will receive signal mask. */
 667         rb_setmask(&mask);
 668         CMM_STORE_SHARED(timer_signal.tid, pthread_self());
 669
 670         for (;;) {
 671                 signr = sigwaitinfo(&mask, &info);
 672                 if (signr == -1) {
 673                         if (errno != EINTR)
 674                                 PERROR("sigwaitinfo");
 675                         continue;
 676                 }
 677                 if (signr == LTTNG_UST_RB_SIG_FLUSH) {
 678                         lib_ring_buffer_channel_switch_timer(info.si_signo,
 679                                         &info, NULL);
 680                 } else if (signr == LTTNG_UST_RB_SIG_READ) {
 681                         lib_ring_buffer_channel_read_timer(info.si_signo,
 682                                         &info, NULL);
 683                 } else if (signr == LTTNG_UST_RB_SIG_TEARDOWN) {
 684                         cmm_smp_mb();
 685                         CMM_STORE_SHARED(timer_signal.qs_done, 1);
 686                         cmm_smp_mb();
 687                 } else {
 688                         ERR("Unexptected signal %d\n", info.si_signo);
 689                 }
 690         }
 691         return NULL;
 692 }
 693
 694 /*
 695  * Ensure only a single thread listens on the timer signal.
 696  */
 697 static
 698 void lib_ring_buffer_setup_timer_thread(void)
 699 {
 700         pthread_t thread;
 701         int ret;
 702
 703         pthread_mutex_lock(&timer_signal.lock);
 704         if (timer_signal.setup_done)
 705                 goto end;
 706
 707         ret = pthread_create(&thread, NULL, &sig_thread, NULL);
 708         if (ret) {
 709                 errno = ret;
 710                 PERROR("pthread_create");
 711         }
 712         ret = pthread_detach(thread);
 713         if (ret) {
 714                 errno = ret;
 715                 PERROR("pthread_detach");
 716         }
 717         timer_signal.setup_done = 1;
 718 end:
 719         pthread_mutex_unlock(&timer_signal.lock);
 720 }
 721
/*
 * Wait for signal-handling thread quiescent state.
 *
 * @signr: signal number that must no longer be pending before the wait.
 *
 * Busy-waits until @signr is not reported as pending, then clears
 * timer_signal.qs_done, sends the teardown signal to the process and
 * busy-waits until the signal-handling thread sets qs_done again. The whole
 * sequence runs with timer_signal.lock held.
 */
static
void lib_ring_buffer_wait_signal_thread_qs(unsigned int signr)
{
        sigset_t pending_set;
        int ret;

        /*
         * We need to be the only thread interacting with the thread
         * that manages signals for teardown synchronization.
         */
        pthread_mutex_lock(&timer_signal.lock);

        /*
         * Ensure we don't have any signal queued for this channel.
         */
        for (;;) {
                ret = sigemptyset(&pending_set);
                if (ret == -1) {
                        PERROR("sigemptyset");
                }
                ret = sigpending(&pending_set);
                if (ret == -1) {
                        PERROR("sigpending");
                }
                if (!sigismember(&pending_set, signr))
                        break;
                caa_cpu_relax();
        }

        /*
         * From this point, no new signal handler will be fired that
         * would try to access "chan". However, we still need to wait
         * for any currently executing handler to complete.
         */
        cmm_smp_mb();
        CMM_STORE_SHARED(timer_signal.qs_done, 0);
        cmm_smp_mb();

        /*
         * Kill with LTTNG_UST_RB_SIG_TEARDOWN, so signal management
         * thread wakes up.
         */
        kill(getpid(), LTTNG_UST_RB_SIG_TEARDOWN);

        /* Spin until the signal thread has handled the teardown signal. */
        while (!CMM_LOAD_SHARED(timer_signal.qs_done))
                caa_cpu_relax();
        cmm_smp_mb();

        pthread_mutex_unlock(&timer_signal.lock);
}
 775
 776 static
 777 void lib_ring_buffer_channel_switch_timer_start(struct channel *chan)
 778 {
 779         struct sigevent sev;
 780         struct itimerspec its;
 781         int ret;
 782
 783         if (!chan->switch_timer_interval || chan->switch_timer_enabled)
 784                 return;
 785
 786         chan->switch_timer_enabled = 1;
 787
 788         lib_ring_buffer_setup_timer_thread();
 789
 790         sev.sigev_notify = SIGEV_SIGNAL;
 791         sev.sigev_signo = LTTNG_UST_RB_SIG_FLUSH;
 792         sev.sigev_value.sival_ptr = chan;
 793         ret = timer_create(CLOCKID, &sev, &chan->switch_timer);
 794         if (ret == -1) {
 795                 PERROR("timer_create");
 796         }
 797
 798         its.it_value.tv_sec = chan->switch_timer_interval / 1000000;
 799         its.it_value.tv_nsec = (chan->switch_timer_interval % 1000000) * 1000;
 800         its.it_interval.tv_sec = its.it_value.tv_sec;
 801         its.it_interval.tv_nsec = its.it_value.tv_nsec;
 802
 803         ret = timer_settime(chan->switch_timer, 0, &its, NULL);
 804         if (ret == -1) {
 805                 PERROR("timer_settime");
 806         }
 807 }
 808
 809 static
 810 void lib_ring_buffer_channel_switch_timer_stop(struct channel *chan)
 811 {
 812         int ret;
 813
 814         if (!chan->switch_timer_interval || !chan->switch_timer_enabled)
 815                 return;
 816
 817         ret = timer_delete(chan->switch_timer);
 818         if (ret == -1) {
 819                 PERROR("timer_delete");
 820         }
 821
 822         lib_ring_buffer_wait_signal_thread_qs(LTTNG_UST_RB_SIG_FLUSH);
 823
 824         chan->switch_timer = 0;
 825         chan->switch_timer_enabled = 0;
 826 }
 827
 828 static
 829 void lib_ring_buffer_channel_read_timer_start(struct channel *chan)
 830 {
 831         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 832         struct sigevent sev;
 833         struct itimerspec its;
 834         int ret;
 835
 836         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 837                         || !chan->read_timer_interval || chan->read_timer_enabled)
 838                 return;
 839
 840         chan->read_timer_enabled = 1;
 841
 842         lib_ring_buffer_setup_timer_thread();
 843
 844         sev.sigev_notify = SIGEV_SIGNAL;
 845         sev.sigev_signo = LTTNG_UST_RB_SIG_READ;
 846         sev.sigev_value.sival_ptr = chan;
 847         ret = timer_create(CLOCKID, &sev, &chan->read_timer);
 848         if (ret == -1) {
 849                 PERROR("timer_create");
 850         }
 851
 852         its.it_value.tv_sec = chan->read_timer_interval / 1000000;
 853         its.it_value.tv_nsec = (chan->read_timer_interval % 1000000) * 1000;
 854         its.it_interval.tv_sec = its.it_value.tv_sec;
 855         its.it_interval.tv_nsec = its.it_value.tv_nsec;
 856
 857         ret = timer_settime(chan->read_timer, 0, &its, NULL);
 858         if (ret == -1) {
 859                 PERROR("timer_settime");
 860         }
 861 }
 862
 863 static
 864 void lib_ring_buffer_channel_read_timer_stop(struct channel *chan)
 865 {
 866         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 867         int ret;
 868
 869         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 870                         || !chan->read_timer_interval || !chan->read_timer_enabled)
 871                 return;
 872
 873         ret = timer_delete(chan->read_timer);
 874         if (ret == -1) {
 875                 PERROR("timer_delete");
 876         }
 877
 878         /*
 879          * do one more check to catch data that has been written in the last
 880          * timer period.
 881          */
 882         lib_ring_buffer_channel_do_read(chan);
 883
 884         lib_ring_buffer_wait_signal_thread_qs(LTTNG_UST_RB_SIG_READ);
 885
 886         chan->read_timer = 0;
 887         chan->read_timer_enabled = 0;
 888 }
 889
 /*
  * channel_unregister_notifiers - stop the periodic timers of a channel
  * @chan: channel to quiesce
  * @handle: shm handle of the channel (not used here)
  *
  * Stops the switch timer first, then the read timer.
  */
 static void channel_unregister_notifiers(struct channel *chan,
                            struct lttng_ust_shm_handle *handle)
 {
         (void) handle;

         lib_ring_buffer_channel_switch_timer_stop(chan);
         lib_ring_buffer_channel_read_timer_stop(chan);
 }
 896
 897 static void channel_print_errors(struct channel *chan,
 898                 struct lttng_ust_shm_handle *handle)
 899 {
 900         const struct lttng_ust_lib_ring_buffer_config *config =
 901                         &chan->backend.config;
 902         int cpu;
 903
 904         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 905                 for_each_possible_cpu(cpu) {
 906                         struct lttng_ust_lib_ring_buffer *buf =
 907                                 shmp(handle, chan->backend.buf[cpu].shmp);
 908                         if (buf)
 909                                 lib_ring_buffer_print_errors(chan, buf, cpu, handle);
 910                 }
 911         } else {
 912                 struct lttng_ust_lib_ring_buffer *buf =
 913                         shmp(handle, chan->backend.buf[0].shmp);
 914
 915                 if (buf)
 916                         lib_ring_buffer_print_errors(chan, buf, -1, handle);
 917         }
 918 }
 919
 920 static void channel_free(struct channel *chan,
 921                 struct lttng_ust_shm_handle *handle,
 922                 int consumer)
 923 {
 924         channel_backend_free(&chan->backend, handle);
 925         /* chan is freed by shm teardown */
 926         shm_object_table_destroy(handle->table, consumer);
 927         free(handle);
 928 }
 929
 930 /**
 931  * channel_create - Create channel.
 932  * @config: ring buffer instance configuration
 933  * @name: name of the channel
 934  * @priv_data: ring buffer client private data area pointer (output)
 935  * @priv_data_size: length, in bytes, of the private data area.
 936  * @priv_data_init: initialization data for private data.
 937  * @buf_addr: pointer the the beginning of the preallocated buffer contiguous
 938  *            address mapping. It is used only by RING_BUFFER_STATIC
 939  *            configuration. It can be set to NULL for other backends.
 940  * @subbuf_size: subbuffer size
 941  * @num_subbuf: number of subbuffers
 942  * @switch_timer_interval: Time interval (in us) to fill sub-buffers with
 943  *                         padding to let readers get those sub-buffers.
 944  *                         Used for live streaming.
 945  * @read_timer_interval: Time interval (in us) to wake up pending readers.
 946  * @stream_fds: array of stream file descriptors.
 947  * @nr_stream_fds: number of file descriptors in array.
 948  *
 949  * Holds cpu hotplug.
 950  * Returns NULL on failure.
 951  */
 952 struct lttng_ust_shm_handle *channel_create(const struct lttng_ust_lib_ring_buffer_config *config,
 953                    const char *name,
 954                    void **priv_data,
 955                    size_t priv_data_align,
 956                    size_t priv_data_size,
 957                    void *priv_data_init,
 958                    void *buf_addr, size_t subbuf_size,
 959                    size_t num_subbuf, unsigned int switch_timer_interval,
 960                    unsigned int read_timer_interval,
 961                    const int *stream_fds, int nr_stream_fds,
 962                    int64_t blocking_timeout)
 963 {
 964         int ret;
 965         size_t shmsize, chansize;
 966         struct channel *chan;
 967         struct lttng_ust_shm_handle *handle;
 968         struct shm_object *shmobj;
 969         unsigned int nr_streams;
 970         int64_t blocking_timeout_ms;
 971
 972         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 973                 nr_streams = num_possible_cpus();
 974         else
 975                 nr_streams = 1;
 976
 977         if (nr_stream_fds != nr_streams)
 978                 return NULL;
 979
 980         if (blocking_timeout < -1) {
 981                 return NULL;
 982         }
 983         /* usec to msec */
 984         if (blocking_timeout == -1) {
 985                 blocking_timeout_ms = -1;
 986         } else {
 987                 blocking_timeout_ms = blocking_timeout / 1000;
 988                 if (blocking_timeout_ms != (int32_t) blocking_timeout_ms) {
 989                         return NULL;
 990                 }
 991         }
 992
 993         if (lib_ring_buffer_check_config(config, switch_timer_interval,
 994                                          read_timer_interval))
 995                 return NULL;
 996
 997         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 998         if (!handle)
 999                 return NULL;
1000
1001         /* Allocate table for channel + per-cpu buffers */
1002         handle->table = shm_object_table_create(1 + num_possible_cpus());
1003         if (!handle->table)
1004                 goto error_table_alloc;
1005
1006         /* Calculate the shm allocation layout */
1007         shmsize = sizeof(struct channel);
1008         shmsize += offset_align(shmsize, __alignof__(struct lttng_ust_lib_ring_buffer_shmp));
1009         shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp) * nr_streams;
1010         chansize = shmsize;
1011         if (priv_data_align)
1012                 shmsize += offset_align(shmsize, priv_data_align);
1013         shmsize += priv_data_size;
1014
1015         /* Allocate normal memory for channel (not shared) */
1016         shmobj = shm_object_table_alloc(handle->table, shmsize, SHM_OBJECT_MEM,
1017                         -1, -1);
1018         if (!shmobj)
1019                 goto error_append;
1020         /* struct channel is at object 0, offset 0 (hardcoded) */
1021         set_shmp(handle->chan, zalloc_shm(shmobj, chansize));
1022         assert(handle->chan._ref.index == 0);
1023         assert(handle->chan._ref.offset == 0);
1024         chan = shmp(handle, handle->chan);
1025         if (!chan)
1026                 goto error_append;
1027         chan->nr_streams = nr_streams;
1028
1029         /* space for private data */
1030         if (priv_data_size) {
1031                 DECLARE_SHMP(void, priv_data_alloc);
1032
1033                 align_shm(shmobj, priv_data_align);
1034                 chan->priv_data_offset = shmobj->allocated_len;
1035                 set_shmp(priv_data_alloc, zalloc_shm(shmobj, priv_data_size));
1036                 if (!shmp(handle, priv_data_alloc))
1037                         goto error_append;
1038                 *priv_data = channel_get_private(chan);
1039                 memcpy(*priv_data, priv_data_init, priv_data_size);
1040         } else {
1041                 chan->priv_data_offset = -1;
1042                 if (priv_data)
1043                         *priv_data = NULL;
1044         }
1045
1046         chan->u.s.blocking_timeout_ms = (int32_t) blocking_timeout_ms;
1047
1048         ret = channel_backend_init(&chan->backend, name, config,
1049                                    subbuf_size, num_subbuf, handle,
1050                                    stream_fds);
1051         if (ret)
1052                 goto error_backend_init;
1053
1054         chan->handle = handle;
1055         chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order);
1056
1057         chan->switch_timer_interval = switch_timer_interval;
1058         chan->read_timer_interval = read_timer_interval;
1059         lib_ring_buffer_channel_switch_timer_start(chan);
1060         lib_ring_buffer_channel_read_timer_start(chan);
1061
1062         return handle;
1063
1064 error_backend_init:
1065 error_append:
1066         shm_object_table_destroy(handle->table, 1);
1067 error_table_alloc:
1068         free(handle);
1069         return NULL;
1070 }
1071
1072 struct lttng_ust_shm_handle *channel_handle_create(void *data,
1073                                         uint64_t memory_map_size,
1074                                         int wakeup_fd)
1075 {
1076         struct lttng_ust_shm_handle *handle;
1077         struct shm_object *object;
1078
1079         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
1080         if (!handle)
1081                 return NULL;
1082
1083         /* Allocate table for channel + per-cpu buffers */
1084         handle->table = shm_object_table_create(1 + num_possible_cpus());
1085         if (!handle->table)
1086                 goto error_table_alloc;
1087         /* Add channel object */
1088         object = shm_object_table_append_mem(handle->table, data,
1089                         memory_map_size, wakeup_fd);
1090         if (!object)
1091                 goto error_table_object;
1092         /* struct channel is at object 0, offset 0 (hardcoded) */
1093         handle->chan._ref.index = 0;
1094         handle->chan._ref.offset = 0;
1095         return handle;
1096
1097 error_table_object:
1098         shm_object_table_destroy(handle->table, 0);
1099 error_table_alloc:
1100         free(handle);
1101         return NULL;
1102 }
1103
1104 int channel_handle_add_stream(struct lttng_ust_shm_handle *handle,
1105                 int shm_fd, int wakeup_fd, uint32_t stream_nr,
1106                 uint64_t memory_map_size)
1107 {
1108         struct shm_object *object;
1109
1110         /* Add stream object */
1111         object = shm_object_table_append_shm(handle->table,
1112                         shm_fd, wakeup_fd, stream_nr,
1113                         memory_map_size);
1114         if (!object)
1115                 return -EINVAL;
1116         return 0;
1117 }
1118
1119 unsigned int channel_handle_get_nr_streams(struct lttng_ust_shm_handle *handle)
1120 {
1121         assert(handle->table);
1122         return handle->table->allocated_len - 1;
1123 }
1124
/*
 * channel_release - drop a channel; currently a plain wrapper which hands
 * all three arguments over to channel_free().
 */
static
void channel_release(struct channel *chan, struct lttng_ust_shm_handle *handle,
                int consumer)
{
        channel_free(chan, handle, consumer);
}
1131
/**
 * channel_destroy - Stop notifiers (consumer only) and release a channel.
 * @chan: channel to destroy
 * @handle: shm handle of the channel; released by this call
 * @consumer: non-zero when called on the consumer side
 *
 * On the consumer side, the channel timers are stopped and buffer errors
 * are printed before the channel is released; the consumer is also the one
 * which finalizes and switches the buffers. In every case the channel is
 * then released. No reference counting is done here: sessiond/consumer
 * hold a reference on the shm file descriptor directly.
 *
 * Note that when readers have completed data consumption of finalized
 * channels, get_subbuf() will return -ENODATA. They should release their
 * handle at that point.
 */
void channel_destroy(struct channel *chan, struct lttng_ust_shm_handle *handle,
                int consumer)
{
        if (consumer) {
                /* Quiesce the timers before reporting and tearing down. */
                channel_unregister_notifiers(chan, handle);
                /* Error reporting is the consumer's job. */
                channel_print_errors(chan, handle);
        }

        channel_release(chan, handle, consumer);
}
1164
1165 struct lttng_ust_lib_ring_buffer *channel_get_ring_buffer(
1166                                         const struct lttng_ust_lib_ring_buffer_config *config,
1167                                         struct channel *chan, int cpu,
1168                                         struct lttng_ust_shm_handle *handle,
1169                                         int *shm_fd, int *wait_fd,
1170                                         int *wakeup_fd,
1171                                         uint64_t *memory_map_size)
1172 {
1173         struct shm_ref *ref;
1174
1175         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
1176                 cpu = 0;
1177         } else {
1178                 if (cpu >= num_possible_cpus())
1179                         return NULL;
1180         }
1181         ref = &chan->backend.buf[cpu].shmp._ref;
1182         *shm_fd = shm_get_shm_fd(handle, ref);
1183         *wait_fd = shm_get_wait_fd(handle, ref);
1184         *wakeup_fd = shm_get_wakeup_fd(handle, ref);
1185         if (shm_get_shm_size(handle, ref, memory_map_size))
1186                 return NULL;
1187         return shmp(handle, chan->backend.buf[cpu].shmp);
1188 }
1189
1190 int ring_buffer_channel_close_wait_fd(const struct lttng_ust_lib_ring_buffer_config *config,
1191                         struct channel *chan,
1192                         struct lttng_ust_shm_handle *handle)
1193 {
1194         struct shm_ref *ref;
1195
1196         ref = &handle->chan._ref;
1197         return shm_close_wait_fd(handle, ref);
1198 }
1199
1200 int ring_buffer_channel_close_wakeup_fd(const struct lttng_ust_lib_ring_buffer_config *config,
1201                         struct channel *chan,
1202                         struct lttng_ust_shm_handle *handle)
1203 {
1204         struct shm_ref *ref;
1205
1206         ref = &handle->chan._ref;
1207         return shm_close_wakeup_fd(handle, ref);
1208 }
1209
1210 int ring_buffer_stream_close_wait_fd(const struct lttng_ust_lib_ring_buffer_config *config,
1211                         struct channel *chan,
1212                         struct lttng_ust_shm_handle *handle,
1213                         int cpu)
1214 {
1215         struct shm_ref *ref;
1216
1217         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
1218                 cpu = 0;
1219         } else {
1220                 if (cpu >= num_possible_cpus())
1221                         return -EINVAL;
1222         }
1223         ref = &chan->backend.buf[cpu].shmp._ref;
1224         return shm_close_wait_fd(handle, ref);
1225 }
1226
1227 int ring_buffer_stream_close_wakeup_fd(const struct lttng_ust_lib_ring_buffer_config *config,
1228                         struct channel *chan,
1229                         struct lttng_ust_shm_handle *handle,
1230                         int cpu)
1231 {
1232         struct shm_ref *ref;
1233         int ret;
1234
1235         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
1236                 cpu = 0;
1237         } else {
1238                 if (cpu >= num_possible_cpus())
1239                         return -EINVAL;
1240         }
1241         ref = &chan->backend.buf[cpu].shmp._ref;
1242         pthread_mutex_lock(&wakeup_fd_mutex);
1243         ret = shm_close_wakeup_fd(handle, ref);
1244         pthread_mutex_unlock(&wakeup_fd_mutex);
1245         return ret;
1246 }
1247
1248 int lib_ring_buffer_open_read(struct lttng_ust_lib_ring_buffer *buf,
1249                               struct lttng_ust_shm_handle *handle)
1250 {
1251         if (uatomic_cmpxchg(&buf->active_readers, 0, 1) != 0)
1252                 return -EBUSY;
1253         cmm_smp_mb();
1254         return 0;
1255 }
1256
1257 void lib_ring_buffer_release_read(struct lttng_ust_lib_ring_buffer *buf,
1258                                   struct lttng_ust_shm_handle *handle)
1259 {
1260         struct channel *chan = shmp(handle, buf->backend.chan);
1261
1262         if (!chan)
1263                 return;
1264         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1);
1265         cmm_smp_mb();
1266         uatomic_dec(&buf->active_readers);
1267 }
1268
1269 /**
1270  * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read)
1271  * @buf: ring buffer
1272  * @consumed: consumed count indicating the position where to read
1273  * @produced: produced count, indicates position when to stop reading
1274  *
1275  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
1276  * data to read at consumed position, or 0 if the get operation succeeds.
1277  */
1278
1279 int lib_ring_buffer_snapshot(struct lttng_ust_lib_ring_buffer *buf,
1280                              unsigned long *consumed, unsigned long *produced,
1281                              struct lttng_ust_shm_handle *handle)
1282 {
1283         struct channel *chan;
1284         const struct lttng_ust_lib_ring_buffer_config *config;
1285         unsigned long consumed_cur, write_offset;
1286         int finalized;
1287
1288         chan = shmp(handle, buf->backend.chan);
1289         if (!chan)
1290                 return -EPERM;
1291         config = &chan->backend.config;
1292         finalized = CMM_ACCESS_ONCE(buf->finalized);
1293         /*
1294          * Read finalized before counters.
1295          */
1296         cmm_smp_rmb();
1297         consumed_cur = uatomic_read(&buf->consumed);
1298         /*
1299          * No need to issue a memory barrier between consumed count read and
1300          * write offset read, because consumed count can only change
1301          * concurrently in overwrite mode, and we keep a sequence counter
1302          * identifier derived from the write offset to check we are getting
1303          * the same sub-buffer we are expecting (the sub-buffers are atomically
1304          * "tagged" upon writes, tags are checked upon read).
1305          */
1306         write_offset = v_read(config, &buf->offset);
1307
1308         /*
1309          * Check that we are not about to read the same subbuffer in
1310          * which the writer head is.
1311          */
1312         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
1313             == 0)
1314                 goto nodata;
1315
1316         *consumed = consumed_cur;
1317         *produced = subbuf_trunc(write_offset, chan);
1318
1319         return 0;
1320
1321 nodata:
1322         /*
1323          * The memory barriers __wait_event()/wake_up_interruptible() take care
1324          * of "raw_spin_is_locked" memory ordering.
1325          */
1326         if (finalized)
1327                 return -ENODATA;
1328         else
1329                 return -EAGAIN;
1330 }
1331
1332 /**
1333  * Performs the same function as lib_ring_buffer_snapshot(), but the positions
1334  * are saved regardless of whether the consumed and produced positions are
1335  * in the same subbuffer.
1336  * @buf: ring buffer
1337  * @consumed: consumed byte count indicating the last position read
1338  * @produced: produced byte count indicating the last position written
1339  *
1340  * This function is meant to provide information on the exact producer and
1341  * consumer positions without regard for the "snapshot" feature.
1342  */
1343 int lib_ring_buffer_snapshot_sample_positions(
1344                              struct lttng_ust_lib_ring_buffer *buf,
1345                              unsigned long *consumed, unsigned long *produced,
1346                              struct lttng_ust_shm_handle *handle)
1347 {
1348         struct channel *chan;
1349         const struct lttng_ust_lib_ring_buffer_config *config;
1350
1351         chan = shmp(handle, buf->backend.chan);
1352         if (!chan)
1353                 return -EPERM;
1354         config = &chan->backend.config;
1355         cmm_smp_rmb();
1356         *consumed = uatomic_read(&buf->consumed);
1357         /*
1358          * No need to issue a memory barrier between consumed count read and
1359          * write offset read, because consumed count can only change
1360          * concurrently in overwrite mode, and we keep a sequence counter
1361          * identifier derived from the write offset to check we are getting
1362          * the same sub-buffer we are expecting (the sub-buffers are atomically
1363          * "tagged" upon writes, tags are checked upon read).
1364          */
1365         *produced = v_read(config, &buf->offset);
1366         return 0;
1367 }
1368
1369 /**
1370  * lib_ring_buffer_move_consumer - move consumed counter forward
1371  * @buf: ring buffer
1372  * @consumed_new: new consumed count value
1373  */
1374 void lib_ring_buffer_move_consumer(struct lttng_ust_lib_ring_buffer *buf,
1375                                    unsigned long consumed_new,
1376                                    struct lttng_ust_shm_handle *handle)
1377 {
1378         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
1379         struct channel *chan;
1380         unsigned long consumed;
1381
1382         chan = shmp(handle, bufb->chan);
1383         if (!chan)
1384                 return;
1385         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1);
1386
1387         /*
1388          * Only push the consumed value forward.
1389          * If the consumed cmpxchg fails, this is because we have been pushed by
1390          * the writer in flight recorder mode.
1391          */
1392         consumed = uatomic_read(&buf->consumed);
1393         while ((long) consumed - (long) consumed_new < 0)
1394                 consumed = uatomic_cmpxchg(&buf->consumed, consumed,
1395                                            consumed_new);
1396 }
1397
1398 /**
1399  * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading
1400  * @buf: ring buffer
1401  * @consumed: consumed count indicating the position where to read
1402  *
1403  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
1404  * data to read at consumed position, or 0 if the get operation succeeds.
1405  */
1406 int lib_ring_buffer_get_subbuf(struct lttng_ust_lib_ring_buffer *buf,
1407                                unsigned long consumed,
1408                                struct lttng_ust_shm_handle *handle)
1409 {
1410         struct channel *chan;
1411         const struct lttng_ust_lib_ring_buffer_config *config;
1412         unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
1413         int ret, finalized, nr_retry = LTTNG_UST_RING_BUFFER_GET_RETRY;
1414         struct commit_counters_cold *cc_cold;
1415
1416         chan = shmp(handle, buf->backend.chan);
1417         if (!chan)
1418                 return -EPERM;
1419         config = &chan->backend.config;
1420 retry:
1421         finalized = CMM_ACCESS_ONCE(buf->finalized);
1422         /*
1423          * Read finalized before counters.
1424          */
1425         cmm_smp_rmb();
1426         consumed_cur = uatomic_read(&buf->consumed);
1427         consumed_idx = subbuf_index(consumed, chan);
1428         cc_cold = shmp_index(handle, buf->commit_cold, consumed_idx);
1429         if (!cc_cold)
1430                 return -EPERM;
1431         commit_count = v_read(config, &cc_cold->cc_sb);
1432         /*
1433          * Make sure we read the commit count before reading the buffer
1434          * data and the write offset. Correct consumed offset ordering
1435          * wrt commit count is insured by the use of cmpxchg to update
1436          * the consumed offset.
1437          */
1438         /*
1439          * Local rmb to match the remote wmb to read the commit count
1440          * before the buffer data and the write offset.
1441          */
1442         cmm_smp_rmb();
1443
1444         write_offset = v_read(config, &buf->offset);
1445
1446         /*
1447          * Check that the buffer we are getting is after or at consumed_cur
1448          * position.
1449          */
1450         if ((long) subbuf_trunc(consumed, chan)
1451             - (long) subbuf_trunc(consumed_cur, chan) < 0)
1452                 goto nodata;
1453
1454         /*
1455          * Check that the subbuffer we are trying to consume has been
1456          * already fully committed. There are a few causes that can make
1457          * this unavailability situation occur:
1458          *
1459          * Temporary (short-term) situation:
1460          * - Application is running on a different CPU, between reserve
1461          *   and commit ring buffer operations,
1462          * - Application is preempted between reserve and commit ring
1463          *   buffer operations,
1464          *
1465          * Long-term situation:
1466          * - Application is stopped (SIGSTOP) between reserve and commit
1467          *   ring buffer operations. Could eventually be resumed by
1468          *   SIGCONT.
1469          * - Application is killed (SIGTERM, SIGINT, SIGKILL) between
1470          *   reserve and commit ring buffer operation.
1471          *
1472          * From a consumer perspective, handling short-term
1473          * unavailability situations is performed by retrying a few
1474          * times after a delay. Handling long-term unavailability
1475          * situations is handled by failing to get the sub-buffer.
1476          *
1477          * In all of those situations, if the application is taking a
1478          * long time to perform its commit after ring buffer space
1479          * reservation, we can end up in a situation where the producer
1480          * will fill the ring buffer and try to write into the same
1481          * sub-buffer again (which has a missing commit). This is
1482          * handled by the producer in the sub-buffer switch handling
1483          * code of the reserve routine by detecting unbalanced
1484          * reserve/commit counters and discarding all further events
1485          * until the situation is resolved in those situations. Two
1486          * scenarios can occur:
1487          *
1488          * 1) The application causing the reserve/commit counters to be
1489          *    unbalanced has been terminated. In this situation, all
1490          *    further events will be discarded in the buffers, and no
1491          *    further buffer data will be readable by the consumer
1492          *    daemon. Tearing down the UST tracing session and starting
1493          *    anew is a work-around for those situations. Note that this
1494          *    only affects per-UID tracing. In per-PID tracing, the
1495          *    application vanishes with the termination, and therefore
1496          *    no more data needs to be written to the buffers.
1497          * 2) The application causing the unbalance has been delayed for
1498          *    a long time, but will eventually try to increment the
1499          *    commit counter after eventually writing to the sub-buffer.
1500          *    This situation can cause events to be discarded until the
1501          *    application resumes its operations.
1502          */
1503         if (((commit_count - chan->backend.subbuf_size)
1504              & chan->commit_count_mask)
1505             - (buf_trunc(consumed, chan)
1506                >> chan->backend.num_subbuf_order)
1507             != 0) {
1508                 if (nr_retry-- > 0) {
1509                         if (nr_retry <= (LTTNG_UST_RING_BUFFER_GET_RETRY >> 1))
1510                                 (void) poll(NULL, 0, LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS);
1511                         goto retry;
1512                 } else {
1513                         goto nodata;
1514                 }
1515         }
1516
1517         /*
1518          * Check that we are not about to read the same subbuffer in
1519          * which the writer head is.
1520          */
1521         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed, chan)
1522             == 0)
1523                 goto nodata;
1524
1525         /*
1526          * Failure to get the subbuffer causes a busy-loop retry without going
1527          * to a wait queue. These are caused by short-lived race windows where
1528          * the writer is getting access to a subbuffer we were trying to get
1529          * access to. Also checks that the "consumed" buffer count we are
1530          * looking for matches the one contained in the subbuffer id.
1531          *
1532          * The short-lived race window described here can be affected by
1533          * application signals and preemption, thus requiring to bound
1534          * the loop to a maximum number of retry.
1535          */
1536         ret = update_read_sb_index(config, &buf->backend, &chan->backend,
1537                                    consumed_idx, buf_trunc_val(consumed, chan),
1538                                    handle);
1539         if (ret) {
1540                 if (nr_retry-- > 0) {
1541                         if (nr_retry <= (LTTNG_UST_RING_BUFFER_GET_RETRY >> 1))
1542                                 (void) poll(NULL, 0, LTTNG_UST_RING_BUFFER_RETRY_DELAY_MS);
1543                         goto retry;
1544                 } else {
1545                         goto nodata;
1546                 }
1547         }
1548         subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);
1549
1550         buf->get_subbuf_consumed = consumed;
1551         buf->get_subbuf = 1;
1552
1553         return 0;
1554
1555 nodata:
1556         /*
1557          * The memory barriers __wait_event()/wake_up_interruptible() take care
1558          * of "raw_spin_is_locked" memory ordering.
1559          */
1560         if (finalized)
1561                 return -ENODATA;
1562         else
1563                 return -EAGAIN;
1564 }
1565
1566 /**
1567  * lib_ring_buffer_put_subbuf - release exclusive subbuffer access
1568  * @buf: ring buffer
1569  */
1570 void lib_ring_buffer_put_subbuf(struct lttng_ust_lib_ring_buffer *buf,
1571                                 struct lttng_ust_shm_handle *handle)
1572 {
1573         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
1574         struct channel *chan;
1575         const struct lttng_ust_lib_ring_buffer_config *config;
1576         unsigned long sb_bindex, consumed_idx, consumed;
1577         struct lttng_ust_lib_ring_buffer_backend_pages_shmp *rpages;
1578         struct lttng_ust_lib_ring_buffer_backend_pages *backend_pages;
1579
1580         chan = shmp(handle, bufb->chan);
1581         if (!chan)
1582                 return;
1583         config = &chan->backend.config;
1584         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1);
1585
1586         if (!buf->get_subbuf) {
1587                 /*
1588                  * Reader puts a subbuffer it did not get.
1589                  */
1590                 CHAN_WARN_ON(chan, 1);
1591                 return;
1592         }
1593         consumed = buf->get_subbuf_consumed;
1594         buf->get_subbuf = 0;
1595
1596         /*
1597          * Clear the records_unread counter. (overruns counter)
1598          * Can still be non-zero if a file reader simply grabbed the data
1599          * without using iterators.
1600          * Can be below zero if an iterator is used on a snapshot more than
1601          * once.
1602          */
1603         sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id);
1604         rpages = shmp_index(handle, bufb->array, sb_bindex);
1605         if (!rpages)
1606                 return;
1607         backend_pages = shmp(handle, rpages->shmp);
1608         if (!backend_pages)
1609                 return;
1610         v_add(config, v_read(config, &backend_pages->records_unread),
1611                         &bufb->records_read);
1612         v_set(config, &backend_pages->records_unread, 0);
1613         CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE
1614                      && subbuffer_id_is_noref(config, bufb->buf_rsb.id));
1615         subbuffer_id_set_noref(config, &bufb->buf_rsb.id);
1616
1617         /*
1618          * Exchange the reader subbuffer with the one we put in its place in the
1619          * writer subbuffer table. Expect the original consumed count. If
1620          * update_read_sb_index fails, this is because the writer updated the
1621          * subbuffer concurrently. We should therefore keep the subbuffer we
1622          * currently have: it has become invalid to try reading this sub-buffer
1623          * consumed count value anyway.
1624          */
1625         consumed_idx = subbuf_index(consumed, chan);
1626         update_read_sb_index(config, &buf->backend, &chan->backend,
1627                              consumed_idx, buf_trunc_val(consumed, chan),
1628                              handle);
1629         /*
1630          * update_read_sb_index return value ignored. Don't exchange sub-buffer
1631          * if the writer concurrently updated it.
1632          */
1633 }
1634
1635 /*
1636  * cons_offset is an iterator on all subbuffer offsets between the reader
1637  * position and the writer position. (inclusive)
1638  */
1639 static
1640 void lib_ring_buffer_print_subbuffer_errors(struct lttng_ust_lib_ring_buffer *buf,
1641                                             struct channel *chan,
1642                                             unsigned long cons_offset,
1643                                             int cpu,
1644                                             struct lttng_ust_shm_handle *handle)
1645 {
1646         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1647         unsigned long cons_idx, commit_count, commit_count_sb;
1648         struct commit_counters_hot *cc_hot;
1649         struct commit_counters_cold *cc_cold;
1650
1651         cons_idx = subbuf_index(cons_offset, chan);
1652         cc_hot = shmp_index(handle, buf->commit_hot, cons_idx);
1653         if (!cc_hot)
1654                 return;
1655         cc_cold = shmp_index(handle, buf->commit_cold, cons_idx);
1656         if (!cc_cold)
1657                 return;
1658         commit_count = v_read(config, &cc_hot->cc);
1659         commit_count_sb = v_read(config, &cc_cold->cc_sb);
1660
1661         if (subbuf_offset(commit_count, chan) != 0)
1662                 DBG("ring buffer %s, cpu %d: "
1663                        "commit count in subbuffer %lu,\n"
1664                        "expecting multiples of %lu bytes\n"
1665                        "  [ %lu bytes committed, %lu bytes reader-visible ]\n",
1666                        chan->backend.name, cpu, cons_idx,
1667                        chan->backend.subbuf_size,
1668                        commit_count, commit_count_sb);
1669
1670         DBG("ring buffer: %s, cpu %d: %lu bytes committed\n",
1671                chan->backend.name, cpu, commit_count);
1672 }
1673
1674 static
1675 void lib_ring_buffer_print_buffer_errors(struct lttng_ust_lib_ring_buffer *buf,
1676                                          struct channel *chan,
1677                                          void *priv, int cpu,
1678                                          struct lttng_ust_shm_handle *handle)
1679 {
1680         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1681         unsigned long write_offset, cons_offset;
1682
1683         /*
1684          * No need to order commit_count, write_offset and cons_offset reads
1685          * because we execute at teardown when no more writer nor reader
1686          * references are left.
1687          */
1688         write_offset = v_read(config, &buf->offset);
1689         cons_offset = uatomic_read(&buf->consumed);
1690         if (write_offset != cons_offset)
1691                 DBG("ring buffer %s, cpu %d: "
1692                        "non-consumed data\n"
1693                        "  [ %lu bytes written, %lu bytes read ]\n",
1694                        chan->backend.name, cpu, write_offset, cons_offset);
1695
1696         for (cons_offset = uatomic_read(&buf->consumed);
1697              (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset),
1698                                   chan)
1699                      - cons_offset) > 0;
1700              cons_offset = subbuf_align(cons_offset, chan))
1701                 lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset,
1702                                                        cpu, handle);
1703 }
1704
1705 static
1706 void lib_ring_buffer_print_errors(struct channel *chan,
1707                                 struct lttng_ust_lib_ring_buffer *buf, int cpu,
1708                                 struct lttng_ust_shm_handle *handle)
1709 {
1710         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1711         void *priv = channel_get_private(chan);
1712
1713         if (!strcmp(chan->backend.name, "relay-metadata-mmap")) {
1714                 DBG("ring buffer %s: %lu records written, "
1715                         "%lu records overrun\n",
1716                         chan->backend.name,
1717                         v_read(config, &buf->records_count),
1718                         v_read(config, &buf->records_overrun));
1719         } else {
1720                 DBG("ring buffer %s, cpu %d: %lu records written, "
1721                         "%lu records overrun\n",
1722                         chan->backend.name, cpu,
1723                         v_read(config, &buf->records_count),
1724                         v_read(config, &buf->records_overrun));
1725
1726                 if (v_read(config, &buf->records_lost_full)
1727                     || v_read(config, &buf->records_lost_wrap)
1728                     || v_read(config, &buf->records_lost_big))
1729                         DBG("ring buffer %s, cpu %d: records were lost. Caused by:\n"
1730                                 "  [ %lu buffer full, %lu nest buffer wrap-around, "
1731                                 "%lu event too big ]\n",
1732                                 chan->backend.name, cpu,
1733                                 v_read(config, &buf->records_lost_full),
1734                                 v_read(config, &buf->records_lost_wrap),
1735                                 v_read(config, &buf->records_lost_big));
1736         }
1737         lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu, handle);
1738 }
1739
1740 /*
1741  * lib_ring_buffer_switch_old_start: Populate old subbuffer header.
1742  *
1743  * Only executed by SWITCH_FLUSH, which can be issued while tracing is
1744  * active or at buffer finalization (destroy).
1745  */
1746 static
1747 void lib_ring_buffer_switch_old_start(struct lttng_ust_lib_ring_buffer *buf,
1748                                       struct channel *chan,
1749                                       struct switch_offsets *offsets,
1750                                       uint64_t tsc,
1751                                       struct lttng_ust_shm_handle *handle)
1752 {
1753         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1754         unsigned long oldidx = subbuf_index(offsets->old, chan);
1755         unsigned long commit_count;
1756         struct commit_counters_hot *cc_hot;
1757
1758         config->cb.buffer_begin(buf, tsc, oldidx, handle);
1759
1760         /*
1761          * Order all writes to buffer before the commit count update that will
1762          * determine that the subbuffer is full.
1763          */
1764         cmm_smp_wmb();
1765         cc_hot = shmp_index(handle, buf->commit_hot, oldidx);
1766         if (!cc_hot)
1767                 return;
1768         v_add(config, config->cb.subbuffer_header_size(),
1769               &cc_hot->cc);
1770         commit_count = v_read(config, &cc_hot->cc);
1771         /* Check if the written buffer has to be delivered */
1772         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old,
1773                                       commit_count, oldidx, handle, tsc);
1774         lib_ring_buffer_write_commit_counter(config, buf, chan,
1775                         offsets->old + config->cb.subbuffer_header_size(),
1776                         commit_count, handle, cc_hot);
1777 }
1778
1779 /*
1780  * lib_ring_buffer_switch_old_end: switch old subbuffer
1781  *
1782  * Note : offset_old should never be 0 here. It is ok, because we never perform
1783  * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller
1784  * increments the offset_old value when doing a SWITCH_FLUSH on an empty
1785  * subbuffer.
1786  */
1787 static
1788 void lib_ring_buffer_switch_old_end(struct lttng_ust_lib_ring_buffer *buf,
1789                                     struct channel *chan,
1790                                     struct switch_offsets *offsets,
1791                                     uint64_t tsc,
1792                                     struct lttng_ust_shm_handle *handle)
1793 {
1794         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1795         unsigned long oldidx = subbuf_index(offsets->old - 1, chan);
1796         unsigned long commit_count, padding_size, data_size;
1797         struct commit_counters_hot *cc_hot;
1798
1799         data_size = subbuf_offset(offsets->old - 1, chan) + 1;
1800         padding_size = chan->backend.subbuf_size - data_size;
1801         subbuffer_set_data_size(config, &buf->backend, oldidx, data_size,
1802                                 handle);
1803
1804         /*
1805          * Order all writes to buffer before the commit count update that will
1806          * determine that the subbuffer is full.
1807          */
1808         cmm_smp_wmb();
1809         cc_hot = shmp_index(handle, buf->commit_hot, oldidx);
1810         if (!cc_hot)
1811                 return;
1812         v_add(config, padding_size, &cc_hot->cc);
1813         commit_count = v_read(config, &cc_hot->cc);
1814         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1,
1815                                       commit_count, oldidx, handle, tsc);
1816         lib_ring_buffer_write_commit_counter(config, buf, chan,
1817                         offsets->old + padding_size, commit_count, handle,
1818                         cc_hot);
1819 }
1820
1821 /*
1822  * lib_ring_buffer_switch_new_start: Populate new subbuffer.
1823  *
1824  * This code can be executed unordered : writers may already have written to the
1825  * sub-buffer before this code gets executed, caution.  The commit makes sure
1826  * that this code is executed before the deliver of this sub-buffer.
1827  */
1828 static
1829 void lib_ring_buffer_switch_new_start(struct lttng_ust_lib_ring_buffer *buf,
1830                                       struct channel *chan,
1831                                       struct switch_offsets *offsets,
1832                                       uint64_t tsc,
1833                                       struct lttng_ust_shm_handle *handle)
1834 {
1835         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1836         unsigned long beginidx = subbuf_index(offsets->begin, chan);
1837         unsigned long commit_count;
1838         struct commit_counters_hot *cc_hot;
1839
1840         config->cb.buffer_begin(buf, tsc, beginidx, handle);
1841
1842         /*
1843          * Order all writes to buffer before the commit count update that will
1844          * determine that the subbuffer is full.
1845          */
1846         cmm_smp_wmb();
1847         cc_hot = shmp_index(handle, buf->commit_hot, beginidx);
1848         if (!cc_hot)
1849                 return;
1850         v_add(config, config->cb.subbuffer_header_size(), &cc_hot->cc);
1851         commit_count = v_read(config, &cc_hot->cc);
1852         /* Check if the written buffer has to be delivered */
1853         lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin,
1854                                       commit_count, beginidx, handle, tsc);
1855         lib_ring_buffer_write_commit_counter(config, buf, chan,
1856                         offsets->begin + config->cb.subbuffer_header_size(),
1857                         commit_count, handle, cc_hot);
1858 }
1859
1860 /*
1861  * lib_ring_buffer_switch_new_end: finish switching current subbuffer
1862  *
1863  * Calls subbuffer_set_data_size() to set the data size of the current
1864  * sub-buffer. We do not need to perform check_deliver nor commit here,
1865  * since this task will be done by the "commit" of the event for which
1866  * we are currently doing the space reservation.
1867  */
1868 static
1869 void lib_ring_buffer_switch_new_end(struct lttng_ust_lib_ring_buffer *buf,
1870                                     struct channel *chan,
1871                                     struct switch_offsets *offsets,
1872                                     uint64_t tsc,
1873                                     struct lttng_ust_shm_handle *handle)
1874 {
1875         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1876         unsigned long endidx, data_size;
1877
1878         endidx = subbuf_index(offsets->end - 1, chan);
1879         data_size = subbuf_offset(offsets->end - 1, chan) + 1;
1880         subbuffer_set_data_size(config, &buf->backend, endidx, data_size,
1881                                 handle);
1882 }
1883
1884 /*
1885  * Returns :
1886  * 0 if ok
1887  * !0 if execution must be aborted.
1888  */
1889 static
1890 int lib_ring_buffer_try_switch_slow(enum switch_mode mode,
1891                                     struct lttng_ust_lib_ring_buffer *buf,
1892                                     struct channel *chan,
1893                                     struct switch_offsets *offsets,
1894                                     uint64_t *tsc,
1895                                     struct lttng_ust_shm_handle *handle)
1896 {
1897         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1898         unsigned long off, reserve_commit_diff;
1899
1900         offsets->begin = v_read(config, &buf->offset);
1901         offsets->old = offsets->begin;
1902         offsets->switch_old_start = 0;
1903         off = subbuf_offset(offsets->begin, chan);
1904
1905         *tsc = config->cb.ring_buffer_clock_read(chan);
1906
1907         /*
1908          * Ensure we flush the header of an empty subbuffer when doing the
1909          * finalize (SWITCH_FLUSH). This ensures that we end up knowing the
1910          * total data gathering duration even if there were no records saved
1911          * after the last buffer switch.
1912          * In SWITCH_ACTIVE mode, switch the buffer when it contains events.
1913          * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of
1914          * subbuffer header as appropriate.
1915          * The next record that reserves space will be responsible for
1916          * populating the following subbuffer header. We choose not to populate
1917          * the next subbuffer header here because we want to be able to use
1918          * SWITCH_ACTIVE for periodical buffer flush, which must
1919          * guarantee that all the buffer content (records and header
1920          * timestamps) are visible to the reader. This is required for
1921          * quiescence guarantees for the fusion merge.
1922          */
1923         if (mode != SWITCH_FLUSH && !off)
1924                 return -1;      /* we do not have to switch : buffer is empty */
1925
1926         if (caa_unlikely(off == 0)) {
1927                 unsigned long sb_index, commit_count;
1928                 struct commit_counters_cold *cc_cold;
1929
1930                 /*
1931                  * We are performing a SWITCH_FLUSH. There may be concurrent
1932                  * writes into the buffer if e.g. invoked while performing a
1933                  * snapshot on an active trace.
1934                  *
1935                  * If the client does not save any header information
1936                  * (sub-buffer header size == 0), don't switch empty subbuffer
1937                  * on finalize, because it is invalid to deliver a completely
1938                  * empty subbuffer.
1939                  */
1940                 if (!config->cb.subbuffer_header_size())
1941                         return -1;
1942
1943                 /* Test new buffer integrity */
1944                 sb_index = subbuf_index(offsets->begin, chan);
1945                 cc_cold = shmp_index(handle, buf->commit_cold, sb_index);
1946                 if (!cc_cold)
1947                         return -1;
1948                 commit_count = v_read(config, &cc_cold->cc_sb);
1949                 reserve_commit_diff =
1950                   (buf_trunc(offsets->begin, chan)
1951                    >> chan->backend.num_subbuf_order)
1952                   - (commit_count & chan->commit_count_mask);
1953                 if (caa_likely(reserve_commit_diff == 0)) {
1954                         /* Next subbuffer not being written to. */
1955                         if (caa_unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1956                                 subbuf_trunc(offsets->begin, chan)
1957                                  - subbuf_trunc((unsigned long)
1958                                      uatomic_read(&buf->consumed), chan)
1959                                 >= chan->backend.buf_size)) {
1960                                 /*
1961                                  * We do not overwrite non consumed buffers
1962                                  * and we are full : don't switch.
1963                                  */
1964                                 return -1;
1965                         } else {
1966                                 /*
1967                                  * Next subbuffer not being written to, and we
1968                                  * are either in overwrite mode or the buffer is
1969                                  * not full. It's safe to write in this new
1970                                  * subbuffer.
1971                                  */
1972                         }
1973                 } else {
1974                         /*
1975                          * Next subbuffer reserve offset does not match the
1976                          * commit offset. Don't perform switch in
1977                          * producer-consumer and overwrite mode.  Caused by
1978                          * either a writer OOPS or too many nested writes over a
1979                          * reserve/commit pair.
1980                          */
1981                         return -1;
1982                 }
1983
1984                 /*
1985                  * Need to write the subbuffer start header on finalize.
1986                  */
1987                 offsets->switch_old_start = 1;
1988         }
1989         offsets->begin = subbuf_align(offsets->begin, chan);
1990         /* Note: old points to the next subbuf at offset 0 */
1991         offsets->end = offsets->begin;
1992         return 0;
1993 }
1994
1995 /*
1996  * Force a sub-buffer switch. This operation is completely reentrant : can be
1997  * called while tracing is active with absolutely no lock held.
1998  *
1999  * For RING_BUFFER_SYNC_PER_CPU ring buffers, as a v_cmpxchg is used for
2000  * some atomic operations, this function must be called from the CPU
2001  * which owns the buffer for a ACTIVE flush. However, for
2002  * RING_BUFFER_SYNC_GLOBAL ring buffers, this function can be called
2003  * from any CPU.
2004  */
2005 void lib_ring_buffer_switch_slow(struct lttng_ust_lib_ring_buffer *buf, enum switch_mode mode,
2006                                  struct lttng_ust_shm_handle *handle)
2007 {
2008         struct channel *chan;
2009         const struct lttng_ust_lib_ring_buffer_config *config;
2010         struct switch_offsets offsets;
2011         unsigned long oldidx;
2012         uint64_t tsc;
2013
2014         chan = shmp(handle, buf->backend.chan);
2015         if (!chan)
2016                 return;
2017         config = &chan->backend.config;
2018
2019         offsets.size = 0;
2020
2021         /*
2022          * Perform retryable operations.
2023          */
2024         do {
2025                 if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets,
2026                                                     &tsc, handle))
2027                         return; /* Switch not needed */
2028         } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end)
2029                  != offsets.old);
2030
2031         /*
2032          * Atomically update last_tsc. This update races against concurrent
2033          * atomic updates, but the race will always cause supplementary full TSC
2034          * records, never the opposite (missing a full TSC record when it would
2035          * be needed).
2036          */
2037         save_last_tsc(config, buf, tsc);
2038
2039         /*
2040          * Push the reader if necessary
2041          */
2042         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old);
2043
2044         oldidx = subbuf_index(offsets.old, chan);
2045         lib_ring_buffer_clear_noref(config, &buf->backend, oldidx, handle);
2046
2047         /*
2048          * May need to populate header start on SWITCH_FLUSH.
2049          */
2050         if (offsets.switch_old_start) {
2051                 lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc, handle);
2052                 offsets.old += config->cb.subbuffer_header_size();
2053         }
2054
2055         /*
2056          * Switch old subbuffer.
2057          */
2058         lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc, handle);
2059 }
2060
2061 static
2062 bool handle_blocking_retry(int *timeout_left_ms)
2063 {
2064         int timeout = *timeout_left_ms, delay;
2065
2066         if (caa_likely(!timeout))
2067                 return false;   /* Do not retry, discard event. */
2068         if (timeout < 0)        /* Wait forever. */
2069                 delay = RETRY_DELAY_MS;
2070         else
2071                 delay = min_t(int, timeout, RETRY_DELAY_MS);
2072         (void) poll(NULL, 0, delay);
2073         if (timeout > 0)
2074                 *timeout_left_ms -= delay;
2075         return true;    /* Retry. */
2076 }
2077
2078 /*
2079  * Returns :
2080  * 0 if ok
2081  * -ENOSPC if event size is too large for packet.
2082  * -ENOBUFS if there is currently not enough space in buffer for the event.
2083  * -EIO if data cannot be written into the buffer for any other reason.
2084  */
2085 static
2086 int lib_ring_buffer_try_reserve_slow(struct lttng_ust_lib_ring_buffer *buf,
2087                                      struct channel *chan,
2088                                      struct switch_offsets *offsets,
2089                                      struct lttng_ust_lib_ring_buffer_ctx *ctx,
2090                                      void *client_ctx)
2091 {
2092         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
2093         struct lttng_ust_shm_handle *handle = ctx->handle;
2094         unsigned long reserve_commit_diff, offset_cmp;
2095         int timeout_left_ms = lttng_ust_ringbuffer_get_timeout(chan);
2096
2097 retry:
2098         offsets->begin = offset_cmp = v_read(config, &buf->offset);
2099         offsets->old = offsets->begin;
2100         offsets->switch_new_start = 0;
2101         offsets->switch_new_end = 0;
2102         offsets->switch_old_end = 0;
2103         offsets->pre_header_padding = 0;
2104
2105         ctx->tsc = config->cb.ring_buffer_clock_read(chan);
2106         if ((int64_t) ctx->tsc == -EIO)
2107                 return -EIO;
2108
2109         if (last_tsc_overflow(config, buf, ctx->tsc))
2110                 ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC;
2111
2112         if (caa_unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) {
2113                 offsets->switch_new_start = 1;          /* For offsets->begin */
2114         } else {
2115                 offsets->size = config->cb.record_header_size(config, chan,
2116                                                 offsets->begin,
2117                                                 &offsets->pre_header_padding,
2118                                                 ctx, client_ctx);
2119                 offsets->size +=
2120                         lib_ring_buffer_align(offsets->begin + offsets->size,
2121                                               ctx->largest_align)
2122                         + ctx->data_size;
2123                 if (caa_unlikely(subbuf_offset(offsets->begin, chan) +
2124                              offsets->size > chan->backend.subbuf_size)) {
2125                         offsets->switch_old_end = 1;    /* For offsets->old */
2126                         offsets->switch_new_start = 1;  /* For offsets->begin */
2127                 }
2128         }
2129         if (caa_unlikely(offsets->switch_new_start)) {
2130                 unsigned long sb_index, commit_count;
2131                 struct commit_counters_cold *cc_cold;
2132
2133                 /*
2134                  * We are typically not filling the previous buffer completely.
2135                  */
2136                 if (caa_likely(offsets->switch_old_end))
2137                         offsets->begin = subbuf_align(offsets->begin, chan);
2138                 offsets->begin = offsets->begin
2139                                  + config->cb.subbuffer_header_size();
2140                 /* Test new buffer integrity */
2141                 sb_index = subbuf_index(offsets->begin, chan);
2142                 /*
2143                  * Read buf->offset before buf->commit_cold[sb_index].cc_sb.
2144                  * lib_ring_buffer_check_deliver() has the matching
2145                  * memory barriers required around commit_cold cc_sb
2146                  * updates to ensure reserve and commit counter updates
2147                  * are not seen reordered when updated by another CPU.
2148                  */
2149                 cmm_smp_rmb();
2150                 cc_cold = shmp_index(handle, buf->commit_cold, sb_index);
2151                 if (!cc_cold)
2152                         return -1;
2153                 commit_count = v_read(config, &cc_cold->cc_sb);
2154                 /* Read buf->commit_cold[sb_index].cc_sb before buf->offset. */
2155                 cmm_smp_rmb();
2156                 if (caa_unlikely(offset_cmp != v_read(config, &buf->offset))) {
2157                         /*
2158                          * The reserve counter have been concurrently updated
2159                          * while we read the commit counter. This means the
2160                          * commit counter we read might not match buf->offset
2161                          * due to concurrent update. We therefore need to retry.
2162                          */
2163                         goto retry;
2164                 }
2165                 reserve_commit_diff =
2166                   (buf_trunc(offsets->begin, chan)
2167                    >> chan->backend.num_subbuf_order)
2168                   - (commit_count & chan->commit_count_mask);
2169                 if (caa_likely(reserve_commit_diff == 0)) {
2170                         /* Next subbuffer not being written to. */
2171                         if (caa_unlikely(config->mode != RING_BUFFER_OVERWRITE &&
2172                                 subbuf_trunc(offsets->begin, chan)
2173                                  - subbuf_trunc((unsigned long)
2174                                      uatomic_read(&buf->consumed), chan)
2175                                 >= chan->backend.buf_size)) {
2176                                 unsigned long nr_lost;
2177
2178                                 if (handle_blocking_retry(&timeout_left_ms))
2179                                         goto retry;
2180
2181                                 /*
2182                                  * We do not overwrite non consumed buffers
2183                                  * and we are full : record is lost.
2184                                  */
2185                                 nr_lost = v_read(config, &buf->records_lost_full);
2186                                 v_inc(config, &buf->records_lost_full);
2187                                 if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
2188                                         DBG("%lu or more records lost in (%s:%d) (buffer full)\n",
2189                                                 nr_lost + 1, chan->backend.name,
2190                                                 buf->backend.cpu);
2191                                 }
2192                                 return -ENOBUFS;
2193                         } else {
2194                                 /*
2195                                  * Next subbuffer not being written to, and we
2196                                  * are either in overwrite mode or the buffer is
2197                                  * not full. It's safe to write in this new
2198                                  * subbuffer.
2199                                  */
2200                         }
2201                 } else {
2202                         unsigned long nr_lost;
2203
2204                         /*
2205                          * Next subbuffer reserve offset does not match the
2206                          * commit offset, and this did not involve update to the
2207                          * reserve counter. Drop record in producer-consumer and
2208                          * overwrite mode. Caused by either a writer OOPS or too
2209                          * many nested writes over a reserve/commit pair.
2210                          */
2211                         nr_lost = v_read(config, &buf->records_lost_wrap);
2212                         v_inc(config, &buf->records_lost_wrap);
2213                         if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
2214                                 DBG("%lu or more records lost in (%s:%d) (wrap-around)\n",
2215                                         nr_lost + 1, chan->backend.name,
2216                                         buf->backend.cpu);
2217                         }
2218                         return -EIO;
2219                 }
2220                 offsets->size =
2221                         config->cb.record_header_size(config, chan,
2222                                                 offsets->begin,
2223                                                 &offsets->pre_header_padding,
2224                                                 ctx, client_ctx);
2225                 offsets->size +=
2226                         lib_ring_buffer_align(offsets->begin + offsets->size,
2227                                               ctx->largest_align)
2228                         + ctx->data_size;
2229                 if (caa_unlikely(subbuf_offset(offsets->begin, chan)
2230                              + offsets->size > chan->backend.subbuf_size)) {
2231                         unsigned long nr_lost;
2232
2233                         /*
2234                          * Record too big for subbuffers, report error, don't
2235                          * complete the sub-buffer switch.
2236                          */
2237                         nr_lost = v_read(config, &buf->records_lost_big);
2238                         v_inc(config, &buf->records_lost_big);
2239                         if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
2240                                 DBG("%lu or more records lost in (%s:%d) record size "
2241                                         " of %zu bytes is too large for buffer\n",
2242                                         nr_lost + 1, chan->backend.name,
2243                                         buf->backend.cpu, offsets->size);
2244                         }
2245                         return -ENOSPC;
2246                 } else {
2247                         /*
2248                          * We just made a successful buffer switch and the
2249                          * record fits in the new subbuffer. Let's write.
2250                          */
2251                 }
2252         } else {
2253                 /*
2254                  * Record fits in the current buffer and we are not on a switch
2255                  * boundary. It's safe to write.
2256                  */
2257         }
2258         offsets->end = offsets->begin + offsets->size;
2259
2260         if (caa_unlikely(subbuf_offset(offsets->end, chan) == 0)) {
2261                 /*
2262                  * The offset_end will fall at the very beginning of the next
2263                  * subbuffer.
2264                  */
2265                 offsets->switch_new_end = 1;    /* For offsets->begin */
2266         }
2267         return 0;
2268 }
2269
2270 /**
2271  * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer.
2272  * @ctx: ring buffer context.
2273  *
2274  * Return : -NOBUFS if not enough space, -ENOSPC if event size too large,
2275  * -EIO for other errors, else returns 0.
2276  * It will take care of sub-buffer switching.
2277  */
2278 int lib_ring_buffer_reserve_slow(struct lttng_ust_lib_ring_buffer_ctx *ctx,
2279                 void *client_ctx)
2280 {
2281         struct channel *chan = ctx->chan;
2282         struct lttng_ust_shm_handle *handle = ctx->handle;
2283         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
2284         struct lttng_ust_lib_ring_buffer *buf;
2285         struct switch_offsets offsets;
2286         int ret;
2287
2288         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
2289                 buf = shmp(handle, chan->backend.buf[ctx->cpu].shmp);
2290         else
2291                 buf = shmp(handle, chan->backend.buf[0].shmp);
2292         if (!buf)
2293                 return -EIO;
2294         ctx->buf = buf;
2295
2296         offsets.size = 0;
2297
2298         do {
2299                 ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets,
2300                                                        ctx, client_ctx);
2301                 if (caa_unlikely(ret))
2302                         return ret;
2303         } while (caa_unlikely(v_cmpxchg(config, &buf->offset, offsets.old,
2304                                     offsets.end)
2305                           != offsets.old));
2306
2307         /*
2308          * Atomically update last_tsc. This update races against concurrent
2309          * atomic updates, but the race will always cause supplementary full TSC
2310          * records, never the opposite (missing a full TSC record when it would
2311          * be needed).
2312          */
2313         save_last_tsc(config, buf, ctx->tsc);
2314
2315         /*
2316          * Push the reader if necessary
2317          */
2318         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1);
2319
2320         /*
2321          * Clear noref flag for this subbuffer.
2322          */
2323         lib_ring_buffer_clear_noref(config, &buf->backend,
2324                                     subbuf_index(offsets.end - 1, chan),
2325                                     handle);
2326
2327         /*
2328          * Switch old subbuffer if needed.
2329          */
2330         if (caa_unlikely(offsets.switch_old_end)) {
2331                 lib_ring_buffer_clear_noref(config, &buf->backend,
2332                                             subbuf_index(offsets.old - 1, chan),
2333                                             handle);
2334                 lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc, handle);
2335         }
2336
2337         /*
2338          * Populate new subbuffer.
2339          */
2340         if (caa_unlikely(offsets.switch_new_start))
2341                 lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc, handle);
2342
2343         if (caa_unlikely(offsets.switch_new_end))
2344                 lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc, handle);
2345
2346         ctx->slot_size = offsets.size;
2347         ctx->pre_offset = offsets.begin;
2348         ctx->buf_offset = offsets.begin + offsets.pre_header_padding;
2349         return 0;
2350 }
2351
2352 static
2353 void lib_ring_buffer_vmcore_check_deliver(const struct lttng_ust_lib_ring_buffer_config *config,
2354                                           struct lttng_ust_lib_ring_buffer *buf,
2355                                           unsigned long commit_count,
2356                                           unsigned long idx,
2357                                           struct lttng_ust_shm_handle *handle)
2358 {
2359         struct commit_counters_hot *cc_hot;
2360
2361         if (config->oops != RING_BUFFER_OOPS_CONSISTENCY)
2362                 return;
2363         cc_hot = shmp_index(handle, buf->commit_hot, idx);
2364         if (!cc_hot)
2365                 return;
2366         v_set(config, &cc_hot->seq, commit_count);
2367 }
2368
/*
 * The ring buffer can count events recorded and overwritten per buffer,
 * but it is disabled by default due to its performance overhead.
 *
 * When LTTNG_RING_BUFFER_COUNT_EVENTS is defined, add the records count and
 * the records overrun count of sub-buffer @idx to the per-buffer
 * records_count and records_overrun counters. Otherwise this is a no-op.
 */
static
void deliver_count_events(const struct lttng_ust_lib_ring_buffer_config *config,
                struct lttng_ust_lib_ring_buffer *buf,
                unsigned long idx,
                struct lttng_ust_shm_handle *handle)
{
#ifdef LTTNG_RING_BUFFER_COUNT_EVENTS
        /* Records written into this sub-buffer. */
        v_add(config,
              subbuffer_get_records_count(config, &buf->backend, idx, handle),
              &buf->records_count);
        /* Records of this sub-buffer accounted as overrun. */
        v_add(config,
              subbuffer_count_records_overrun(config, &buf->backend, idx, handle),
              &buf->records_overrun);
#endif /* LTTNG_RING_BUFFER_COUNT_EVENTS */
}
2396
2397 void lib_ring_buffer_check_deliver_slow(const struct lttng_ust_lib_ring_buffer_config *config,
2398                                    struct lttng_ust_lib_ring_buffer *buf,
2399                                    struct channel *chan,
2400                                    unsigned long offset,
2401                                    unsigned long commit_count,
2402                                    unsigned long idx,
2403                                    struct lttng_ust_shm_handle *handle,
2404                                    uint64_t tsc)
2405 {
2406         unsigned long old_commit_count = commit_count
2407                                          - chan->backend.subbuf_size;
2408         struct commit_counters_cold *cc_cold;
2409
2410         /*
2411          * If we succeeded at updating cc_sb below, we are the subbuffer
2412          * writer delivering the subbuffer. Deals with concurrent
2413          * updates of the "cc" value without adding a add_return atomic
2414          * operation to the fast path.
2415          *
2416          * We are doing the delivery in two steps:
2417          * - First, we cmpxchg() cc_sb to the new value
2418          *   old_commit_count + 1. This ensures that we are the only
2419          *   subbuffer user successfully filling the subbuffer, but we
2420          *   do _not_ set the cc_sb value to "commit_count" yet.
2421          *   Therefore, other writers that would wrap around the ring
2422          *   buffer and try to start writing to our subbuffer would
2423          *   have to drop records, because it would appear as
2424          *   non-filled.
2425          *   We therefore have exclusive access to the subbuffer control
2426          *   structures.  This mutual exclusion with other writers is
2427          *   crucially important to perform record overruns count in
2428          *   flight recorder mode locklessly.
2429          * - When we are ready to release the subbuffer (either for
2430          *   reading or for overrun by other writers), we simply set the
2431          *   cc_sb value to "commit_count" and perform delivery.
2432          *
2433          * The subbuffer size is least 2 bytes (minimum size: 1 page).
2434          * This guarantees that old_commit_count + 1 != commit_count.
2435          */
2436
2437         /*
2438          * Order prior updates to reserve count prior to the
2439          * commit_cold cc_sb update.
2440          */
2441         cmm_smp_wmb();
2442         cc_cold = shmp_index(handle, buf->commit_cold, idx);
2443         if (!cc_cold)
2444                 return;
2445         if (caa_likely(v_cmpxchg(config, &cc_cold->cc_sb,
2446                                  old_commit_count, old_commit_count + 1)
2447                    == old_commit_count)) {
2448                 /*
2449                  * Start of exclusive subbuffer access. We are
2450                  * guaranteed to be the last writer in this subbuffer
2451                  * and any other writer trying to access this subbuffer
2452                  * in this state is required to drop records.
2453                  */
2454                 deliver_count_events(config, buf, idx, handle);
2455                 config->cb.buffer_end(buf, tsc, idx,
2456                                       lib_ring_buffer_get_data_size(config,
2457                                                                 buf,
2458                                                                 idx,
2459                                                                 handle),
2460                                       handle);
2461
2462                 /*
2463                  * Increment the packet counter while we have exclusive
2464                  * access.
2465                  */
2466                 subbuffer_inc_packet_count(config, &buf->backend, idx, handle);
2467
2468                 /*
2469                  * Set noref flag and offset for this subbuffer id.
2470                  * Contains a memory barrier that ensures counter stores
2471                  * are ordered before set noref and offset.
2472                  */
2473                 lib_ring_buffer_set_noref_offset(config, &buf->backend, idx,
2474                                                  buf_trunc_val(offset, chan), handle);
2475
2476                 /*
2477                  * Order set_noref and record counter updates before the
2478                  * end of subbuffer exclusive access. Orders with
2479                  * respect to writers coming into the subbuffer after
2480                  * wrap around, and also order wrt concurrent readers.
2481                  */
2482                 cmm_smp_mb();
2483                 /* End of exclusive subbuffer access */
2484                 v_set(config, &cc_cold->cc_sb, commit_count);
2485                 /*
2486                  * Order later updates to reserve count after
2487                  * the commit cold cc_sb update.
2488                  */
2489                 cmm_smp_wmb();
2490                 lib_ring_buffer_vmcore_check_deliver(config, buf,
2491                                          commit_count, idx, handle);
2492
2493                 /*
2494                  * RING_BUFFER_WAKEUP_BY_WRITER wakeup is not lock-free.
2495                  */
2496                 if (config->wakeup == RING_BUFFER_WAKEUP_BY_WRITER
2497                     && uatomic_read(&buf->active_readers)
2498                     && lib_ring_buffer_poll_deliver(config, buf, chan, handle)) {
2499                         lib_ring_buffer_wakeup(buf, handle);
2500                 }
2501         }
2502 }
2503
2504 /*
2505  * Force a read (imply TLS fixup for dlopen) of TLS variables.
2506  */
2507 void lttng_fixup_ringbuffer_tls(void)
2508 {
2509         asm volatile ("" : : "m" (URCU_TLS(lib_ring_buffer_nesting)));
2510 }
2511
2512 void lib_ringbuffer_signal_init(void)
2513 {
2514         sigset_t mask;
2515         int ret;
2516
2517         /*
2518          * Block signal for entire process, so only our thread processes
2519          * it.
2520          */
2521         rb_setmask(&mask);
2522         ret = pthread_sigmask(SIG_BLOCK, &mask, NULL);
2523         if (ret) {
2524                 errno = ret;
2525                 PERROR("pthread_sigmask");
2526         }
2527 }