libringbuffer/ring_buffer_frontend.c

   1 /*
   2  * ring_buffer_frontend.c
   3  *
   4  * (C) Copyright 2005-2010 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   5  *
   6  * Ring buffer wait-free buffer synchronization. Producer-consumer and flight
   7  * recorder (overwrite) modes. See thesis:
   8  *
   9  * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D.
  10  * dissertation, Ecole Polytechnique de Montreal.
  11  * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf
  12  *
  13  * - Algorithm presentation in Chapter 5:
  14  *     "Lockless Multi-Core High-Throughput Buffering".
  15  * - Algorithm formal verification in Section 8.6:
  16  *     "Formal verification of LTTng"
  17  *
  18  * Author:
  19  *      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  20  *
  21  * Inspired from LTT and RelayFS:
  22  *  Karim Yaghmour <karim@opersys.com>
  23  *  Tom Zanussi <zanussi@us.ibm.com>
  24  *  Bob Wisniewski <bob@watson.ibm.com>
  25  * And from K42 :
  26  *  Bob Wisniewski <bob@watson.ibm.com>
  27  *
  28  * Buffer reader semantic :
  29  *
  30  * - get_subbuf_size
  31  * while buffer is not finalized and empty
  32  *   - get_subbuf
  33  *     - if return value != 0, continue
  34  *   - splice one subbuffer worth of data to a pipe
  35  *   - splice the data from pipe to disk/network
  36  *   - put_subbuf
  37  *
  38  * Dual LGPL v2.1/GPL v2 license.
  39  */
  40
  41 #define _GNU_SOURCE
  42 #include <sys/types.h>
  43 #include <sys/mman.h>
  44 #include <sys/stat.h>
  45 #include <fcntl.h>
  46 #include <urcu/compiler.h>
  47 #include <urcu/ref.h>
  48 #include <helper.h>
  49
  50 #include "smp.h"
  51 #include <lttng/ringbuffer-config.h>
  52 #include "vatomic.h"
  53 #include "backend.h"
  54 #include "frontend.h"
  55 #include "shm.h"
  56 #include "tlsfixup.h"
  57 #include "../liblttng-ust/compat.h"     /* For ENODATA */
  58
/*
 * Fallback max() for builds where no earlier header already defines one.
 * This is a function-like macro: the larger argument is evaluated twice,
 * so do not pass expressions with side effects.
 */
#ifndef max
#define max(a, b)       ((a) > (b) ? (a) : (b))
#endif

/* Print DBG() messages about events lost only every 1048576 hits */
#define DBG_PRINT_NR_LOST       (1UL << 20)
  65
/*
 * Use POSIX SHM: shm_open(3) and shm_unlink(3).
 * close(2) closes the fd returned by shm_open.
 * shm_unlink releases the shared memory object name.
 * ftruncate(2) sets the size of the memory object.
 * mmap/munmap maps the shared memory object to a virtual address in the
 * calling process (should be done both in libust and consumer).
 * See shm_overview(7) for details.
 * Pass the file descriptor returned by shm_open(3) to ltt-sessiond through
 * a UNIX socket.
 *
 * Since we don't need to access the object using its name, we can
 * immediately shm_unlink(3) it, and only keep the handle with its file
 * descriptor.
 */
  81
  82 /*
  83  * Internal structure representing offsets to use at a sub-buffer switch.
  84  */
  85 struct switch_offsets {
  86         unsigned long begin, end, old;
  87         size_t pre_header_padding, size;
  88         unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1,
  89                      switch_old_end:1;
  90 };
  91
/*
 * Thread-local counter, one instance per thread.
 * NOTE(review): presumably tracks ring buffer call nesting depth; its
 * users are not visible in this part of the file -- confirm.
 */
__thread unsigned int lib_ring_buffer_nesting;
  93
/*
 * TODO: this is unused. Errors are saved within the ring buffer.
 * Eventually, allow consumerd to print these errors.
 *
 * Forward declaration only: it carries the "unused" attribute so that the
 * compiler does not warn about this static function never being called.
 */
static
void lib_ring_buffer_print_errors(struct channel *chan,
                                  struct lttng_ust_lib_ring_buffer *buf, int cpu,
                                  struct lttng_ust_shm_handle *handle)
        __attribute__((unused));
 103
 104 /**
 105  * lib_ring_buffer_reset - Reset ring buffer to initial values.
 106  * @buf: Ring buffer.
 107  *
 108  * Effectively empty the ring buffer. Should be called when the buffer is not
 109  * used for writing. The ring buffer can be opened for reading, but the reader
 110  * should not be using the iterator concurrently with reset. The previous
 111  * current iterator record is reset.
 112  */
 113 void lib_ring_buffer_reset(struct lttng_ust_lib_ring_buffer *buf,
 114                            struct lttng_ust_shm_handle *handle)
 115 {
 116         struct channel *chan = shmp(handle, buf->backend.chan);
 117         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 118         unsigned int i;
 119
 120         /*
 121          * Reset iterator first. It will put the subbuffer if it currently holds
 122          * it.
 123          */
 124         v_set(config, &buf->offset, 0);
 125         for (i = 0; i < chan->backend.num_subbuf; i++) {
 126                 v_set(config, &shmp_index(handle, buf->commit_hot, i)->cc, 0);
 127                 v_set(config, &shmp_index(handle, buf->commit_hot, i)->seq, 0);
 128                 v_set(config, &shmp_index(handle, buf->commit_cold, i)->cc_sb, 0);
 129         }
 130         uatomic_set(&buf->consumed, 0);
 131         uatomic_set(&buf->record_disabled, 0);
 132         v_set(config, &buf->last_tsc, 0);
 133         lib_ring_buffer_backend_reset(&buf->backend, handle);
 134         /* Don't reset number of active readers */
 135         v_set(config, &buf->records_lost_full, 0);
 136         v_set(config, &buf->records_lost_wrap, 0);
 137         v_set(config, &buf->records_lost_big, 0);
 138         v_set(config, &buf->records_count, 0);
 139         v_set(config, &buf->records_overrun, 0);
 140         buf->finalized = 0;
 141 }
 142
 143 /**
 144  * channel_reset - Reset channel to initial values.
 145  * @chan: Channel.
 146  *
 147  * Effectively empty the channel. Should be called when the channel is not used
 148  * for writing. The channel can be opened for reading, but the reader should not
 149  * be using the iterator concurrently with reset. The previous current iterator
 150  * record is reset.
 151  */
 152 void channel_reset(struct channel *chan)
 153 {
 154         /*
 155          * Reset iterators first. Will put the subbuffer if held for reading.
 156          */
 157         uatomic_set(&chan->record_disabled, 0);
 158         /* Don't reset commit_count_mask, still valid */
 159         channel_backend_reset(&chan->backend);
 160         /* Don't reset switch/read timer interval */
 161         /* Don't reset notifiers and notifier enable bits */
 162         /* Don't reset reader reference count */
 163 }
 164
 165 /*
 166  * Must be called under cpu hotplug protection.
 167  */
 168 int lib_ring_buffer_create(struct lttng_ust_lib_ring_buffer *buf,
 169                            struct channel_backend *chanb, int cpu,
 170                            struct lttng_ust_shm_handle *handle,
 171                            struct shm_object *shmobj)
 172 {
 173         const struct lttng_ust_lib_ring_buffer_config *config = &chanb->config;
 174         struct channel *chan = caa_container_of(chanb, struct channel, backend);
 175         void *priv = channel_get_private(chan);
 176         size_t subbuf_header_size;
 177         uint64_t tsc;
 178         int ret;
 179
 180         /* Test for cpu hotplug */
 181         if (buf->backend.allocated)
 182                 return 0;
 183
 184         ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend,
 185                         cpu, handle, shmobj);
 186         if (ret)
 187                 return ret;
 188
 189         align_shm(shmobj, __alignof__(struct commit_counters_hot));
 190         set_shmp(buf->commit_hot,
 191                  zalloc_shm(shmobj,
 192                         sizeof(struct commit_counters_hot) * chan->backend.num_subbuf));
 193         if (!shmp(handle, buf->commit_hot)) {
 194                 ret = -ENOMEM;
 195                 goto free_chanbuf;
 196         }
 197
 198         align_shm(shmobj, __alignof__(struct commit_counters_cold));
 199         set_shmp(buf->commit_cold,
 200                  zalloc_shm(shmobj,
 201                         sizeof(struct commit_counters_cold) * chan->backend.num_subbuf));
 202         if (!shmp(handle, buf->commit_cold)) {
 203                 ret = -ENOMEM;
 204                 goto free_commit;
 205         }
 206
 207         /*
 208          * Write the subbuffer header for first subbuffer so we know the total
 209          * duration of data gathering.
 210          */
 211         subbuf_header_size = config->cb.subbuffer_header_size();
 212         v_set(config, &buf->offset, subbuf_header_size);
 213         subbuffer_id_clear_noref(config, &shmp_index(handle, buf->backend.buf_wsb, 0)->id);
 214         tsc = config->cb.ring_buffer_clock_read(shmp(handle, buf->backend.chan));
 215         config->cb.buffer_begin(buf, tsc, 0, handle);
 216         v_add(config, subbuf_header_size, &shmp_index(handle, buf->commit_hot, 0)->cc);
 217
 218         if (config->cb.buffer_create) {
 219                 ret = config->cb.buffer_create(buf, priv, cpu, chanb->name, handle);
 220                 if (ret)
 221                         goto free_init;
 222         }
 223         buf->backend.allocated = 1;
 224         return 0;
 225
 226         /* Error handling */
 227 free_init:
 228         /* commit_cold will be freed by shm teardown */
 229 free_commit:
 230         /* commit_hot will be freed by shm teardown */
 231 free_chanbuf:
 232         return ret;
 233 }
 234
 235 #if 0
 236 static void switch_buffer_timer(unsigned long data)
 237 {
 238         struct lttng_ust_lib_ring_buffer *buf = (struct lttng_ust_lib_ring_buffer *)data;
 239         struct channel *chan = shmp(handle, buf->backend.chan);
 240         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 241
 242         /*
 243          * Only flush buffers periodically if readers are active.
 244          */
 245         if (uatomic_read(&buf->active_readers) || uatomic_read(&buf->active_shadow_readers))
 246                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE, handle);
 247
 248         //TODO timers
 249         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 250         //      mod_timer_pinned(&buf->switch_timer,
 251         //                       jiffies + chan->switch_timer_interval);
 252         //else
 253         //      mod_timer(&buf->switch_timer,
 254         //                jiffies + chan->switch_timer_interval);
 255 }
 256 #endif //0
 257
 258 static void lib_ring_buffer_start_switch_timer(struct lttng_ust_lib_ring_buffer *buf,
 259                            struct lttng_ust_shm_handle *handle)
 260 {
 261         struct channel *chan = shmp(handle, buf->backend.chan);
 262         //const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 263
 264         if (!chan->switch_timer_interval || buf->switch_timer_enabled)
 265                 return;
 266         //TODO
 267         //init_timer(&buf->switch_timer);
 268         //buf->switch_timer.function = switch_buffer_timer;
 269         //buf->switch_timer.expires = jiffies + chan->switch_timer_interval;
 270         //buf->switch_timer.data = (unsigned long)buf;
 271         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 272         //      add_timer_on(&buf->switch_timer, buf->backend.cpu);
 273         //else
 274         //      add_timer(&buf->switch_timer);
 275         buf->switch_timer_enabled = 1;
 276 }
 277
 278 static void lib_ring_buffer_stop_switch_timer(struct lttng_ust_lib_ring_buffer *buf,
 279                            struct lttng_ust_shm_handle *handle)
 280 {
 281         struct channel *chan = shmp(handle, buf->backend.chan);
 282
 283         if (!chan->switch_timer_interval || !buf->switch_timer_enabled)
 284                 return;
 285
 286         //TODO
 287         //del_timer_sync(&buf->switch_timer);
 288         buf->switch_timer_enabled = 0;
 289 }
 290
 291 #if 0
 292 /*
 293  * Polling timer to check the channels for data.
 294  */
 295 static void read_buffer_timer(unsigned long data)
 296 {
 297         struct lttng_ust_lib_ring_buffer *buf = (struct lttng_ust_lib_ring_buffer *)data;
 298         struct channel *chan = shmp(handle, buf->backend.chan);
 299         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 300
 301         CHAN_WARN_ON(chan, !buf->backend.allocated);
 302
 303         if (uatomic_read(&buf->active_readers) || uatomic_read(&buf->active_shadow_readers))
 304             && lib_ring_buffer_poll_deliver(config, buf, chan)) {
 305                 //TODO
 306                 //wake_up_interruptible(&buf->read_wait);
 307                 //wake_up_interruptible(&chan->read_wait);
 308         }
 309
 310         //TODO
 311         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 312         //      mod_timer_pinned(&buf->read_timer,
 313         //                       jiffies + chan->read_timer_interval);
 314         //else
 315         //      mod_timer(&buf->read_timer,
 316         //                jiffies + chan->read_timer_interval);
 317 }
 318 #endif //0
 319
 320 static void lib_ring_buffer_start_read_timer(struct lttng_ust_lib_ring_buffer *buf,
 321                            struct lttng_ust_shm_handle *handle)
 322 {
 323         struct channel *chan = shmp(handle, buf->backend.chan);
 324         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 325
 326         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 327             || !chan->read_timer_interval
 328             || buf->read_timer_enabled)
 329                 return;
 330
 331         //TODO
 332         //init_timer(&buf->read_timer);
 333         //buf->read_timer.function = read_buffer_timer;
 334         //buf->read_timer.expires = jiffies + chan->read_timer_interval;
 335         //buf->read_timer.data = (unsigned long)buf;
 336
 337         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 338         //      add_timer_on(&buf->read_timer, buf->backend.cpu);
 339         //else
 340         //      add_timer(&buf->read_timer);
 341         buf->read_timer_enabled = 1;
 342 }
 343
 344 static void lib_ring_buffer_stop_read_timer(struct lttng_ust_lib_ring_buffer *buf,
 345                            struct lttng_ust_shm_handle *handle)
 346 {
 347         struct channel *chan = shmp(handle, buf->backend.chan);
 348         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 349
 350         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 351             || !chan->read_timer_interval
 352             || !buf->read_timer_enabled)
 353                 return;
 354
 355         //TODO
 356         //del_timer_sync(&buf->read_timer);
 357         /*
 358          * do one more check to catch data that has been written in the last
 359          * timer period.
 360          */
 361         if (lib_ring_buffer_poll_deliver(config, buf, chan, handle)) {
 362                 //TODO
 363                 //wake_up_interruptible(&buf->read_wait);
 364                 //wake_up_interruptible(&chan->read_wait);
 365         }
 366         buf->read_timer_enabled = 0;
 367 }
 368
 369 static void channel_unregister_notifiers(struct channel *chan,
 370                            struct lttng_ust_shm_handle *handle)
 371 {
 372         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 373         int cpu;
 374
 375         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 376                 for_each_possible_cpu(cpu) {
 377                         struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[cpu].shmp);
 378
 379                         lib_ring_buffer_stop_switch_timer(buf, handle);
 380                         lib_ring_buffer_stop_read_timer(buf, handle);
 381                 }
 382         } else {
 383                 struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[0].shmp);
 384
 385                 lib_ring_buffer_stop_switch_timer(buf, handle);
 386                 lib_ring_buffer_stop_read_timer(buf, handle);
 387         }
 388         //channel_backend_unregister_notifiers(&chan->backend);
 389 }
 390
 391 static void channel_free(struct channel *chan, struct lttng_ust_shm_handle *handle,
 392                 int shadow)
 393 {
 394         if (!shadow)
 395                 channel_backend_free(&chan->backend, handle);
 396         /* chan is freed by shm teardown */
 397         shm_object_table_destroy(handle->table);
 398         free(handle);
 399 }
 400
 401 /**
 402  * channel_create - Create channel.
 403  * @config: ring buffer instance configuration
 404  * @name: name of the channel
 405  * @priv_data: ring buffer client private data area pointer (output)
 406  * @priv_data_size: length, in bytes, of the private data area.
 407  * @priv_data_init: initialization data for private data.
 408  * @buf_addr: pointer the the beginning of the preallocated buffer contiguous
 409  *            address mapping. It is used only by RING_BUFFER_STATIC
 410  *            configuration. It can be set to NULL for other backends.
 411  * @subbuf_size: subbuffer size
 412  * @num_subbuf: number of subbuffers
 413  * @switch_timer_interval: Time interval (in us) to fill sub-buffers with
 414  *                         padding to let readers get those sub-buffers.
 415  *                         Used for live streaming.
 416  * @read_timer_interval: Time interval (in us) to wake up pending readers.
 417  *
 418  * Holds cpu hotplug.
 419  * Returns NULL on failure.
 420  */
 421 struct lttng_ust_shm_handle *channel_create(const struct lttng_ust_lib_ring_buffer_config *config,
 422                    const char *name,
 423                    void **priv_data,
 424                    size_t priv_data_align,
 425                    size_t priv_data_size,
 426                    void *priv_data_init,
 427                    void *buf_addr, size_t subbuf_size,
 428                    size_t num_subbuf, unsigned int switch_timer_interval,
 429                    unsigned int read_timer_interval,
 430                    int **shm_fd, int **wait_fd, uint64_t **memory_map_size)
 431 {
 432         int ret, cpu;
 433         size_t shmsize, chansize;
 434         struct channel *chan;
 435         struct lttng_ust_shm_handle *handle;
 436         struct shm_object *shmobj;
 437         struct shm_ref *ref;
 438
 439         if (lib_ring_buffer_check_config(config, switch_timer_interval,
 440                                          read_timer_interval))
 441                 return NULL;
 442
 443         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 444         if (!handle)
 445                 return NULL;
 446
 447         /* Allocate table for channel + per-cpu buffers */
 448         handle->table = shm_object_table_create(1 + num_possible_cpus());
 449         if (!handle->table)
 450                 goto error_table_alloc;
 451
 452         /* Calculate the shm allocation layout */
 453         shmsize = sizeof(struct channel);
 454         shmsize += offset_align(shmsize, __alignof__(struct lttng_ust_lib_ring_buffer_shmp));
 455         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 456                 shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp) * num_possible_cpus();
 457         else
 458                 shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp);
 459         chansize = shmsize;
 460         shmsize += offset_align(shmsize, priv_data_align);
 461         shmsize += priv_data_size;
 462
 463         shmobj = shm_object_table_append(handle->table, shmsize);
 464         if (!shmobj)
 465                 goto error_append;
 466         /* struct channel is at object 0, offset 0 (hardcoded) */
 467         set_shmp(handle->chan, zalloc_shm(shmobj, chansize));
 468         assert(handle->chan._ref.index == 0);
 469         assert(handle->chan._ref.offset == 0);
 470         chan = shmp(handle, handle->chan);
 471         if (!chan)
 472                 goto error_append;
 473
 474         /* space for private data */
 475         if (priv_data_size) {
 476                 DECLARE_SHMP(void, priv_data_alloc);
 477
 478                 align_shm(shmobj, priv_data_align);
 479                 chan->priv_data_offset = shmobj->allocated_len;
 480                 set_shmp(priv_data_alloc, zalloc_shm(shmobj, priv_data_size));
 481                 if (!shmp(handle, priv_data_alloc))
 482                         goto error_append;
 483                 *priv_data = channel_get_private(chan);
 484                 memcpy(*priv_data, priv_data_init, priv_data_size);
 485         } else {
 486                 chan->priv_data_offset = -1;
 487                 *priv_data = NULL;
 488         }
 489
 490         ret = channel_backend_init(&chan->backend, name, config,
 491                                    subbuf_size, num_subbuf, handle);
 492         if (ret)
 493                 goto error_backend_init;
 494
 495         chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order);
 496         //TODO
 497         //chan->switch_timer_interval = usecs_to_jiffies(switch_timer_interval);
 498         //chan->read_timer_interval = usecs_to_jiffies(read_timer_interval);
 499         //TODO
 500         //init_waitqueue_head(&chan->read_wait);
 501         //init_waitqueue_head(&chan->hp_wait);
 502
 503         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 504                 /*
 505                  * In case of non-hotplug cpu, if the ring-buffer is allocated
 506                  * in early initcall, it will not be notified of secondary cpus.
 507                  * In that off case, we need to allocate for all possible cpus.
 508                  */
 509                 for_each_possible_cpu(cpu) {
 510                         struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[cpu].shmp);
 511                         lib_ring_buffer_start_switch_timer(buf, handle);
 512                         lib_ring_buffer_start_read_timer(buf, handle);
 513                 }
 514         } else {
 515                 struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[0].shmp);
 516
 517                 lib_ring_buffer_start_switch_timer(buf, handle);
 518                 lib_ring_buffer_start_read_timer(buf, handle);
 519         }
 520         ref = &handle->chan._ref;
 521         shm_get_object_data(handle, ref, shm_fd, wait_fd, memory_map_size);
 522         return handle;
 523
 524 error_backend_init:
 525 error_append:
 526         shm_object_table_destroy(handle->table);
 527 error_table_alloc:
 528         free(handle);
 529         return NULL;
 530 }
 531
 532 struct lttng_ust_shm_handle *channel_handle_create(int shm_fd, int wait_fd,
 533                                         uint64_t memory_map_size)
 534 {
 535         struct lttng_ust_shm_handle *handle;
 536         struct shm_object *object;
 537
 538         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 539         if (!handle)
 540                 return NULL;
 541
 542         /* Allocate table for channel + per-cpu buffers */
 543         handle->table = shm_object_table_create(1 + num_possible_cpus());
 544         if (!handle->table)
 545                 goto error_table_alloc;
 546         /* Add channel object */
 547         object = shm_object_table_append_shadow(handle->table,
 548                         shm_fd, wait_fd, memory_map_size);
 549         if (!object)
 550                 goto error_table_object;
 551         /* struct channel is at object 0, offset 0 (hardcoded) */
 552         handle->chan._ref.index = 0;
 553         handle->chan._ref.offset = 0;
 554         return handle;
 555
 556 error_table_object:
 557         shm_object_table_destroy(handle->table);
 558 error_table_alloc:
 559         free(handle);
 560         return NULL;
 561 }
 562
 563 int channel_handle_add_stream(struct lttng_ust_shm_handle *handle,
 564                 int shm_fd, int wait_fd, uint64_t memory_map_size)
 565 {
 566         struct shm_object *object;
 567
 568         /* Add stream object */
 569         object = shm_object_table_append_shadow(handle->table,
 570                         shm_fd, wait_fd, memory_map_size);
 571         if (!object)
 572                 return -1;
 573         return 0;
 574 }
 575
 576 static
 577 void channel_release(struct channel *chan, struct lttng_ust_shm_handle *handle,
 578                 int shadow)
 579 {
 580         channel_free(chan, handle, shadow);
 581 }
 582
 583 /**
 584  * channel_destroy - Finalize, wait for q.s. and destroy channel.
 585  * @chan: channel to destroy
 586  *
 587  * Holds cpu hotplug.
 588  * Call "destroy" callback, finalize channels, decrement the channel
 589  * reference count. Note that when readers have completed data
 590  * consumption of finalized channels, get_subbuf() will return -ENODATA.
 591  * They should release their handle at that point.
 592  */
 593 void channel_destroy(struct channel *chan, struct lttng_ust_shm_handle *handle,
 594                 int shadow)
 595 {
 596         if (shadow) {
 597                 channel_release(chan, handle, shadow);
 598                 return;
 599         }
 600
 601         channel_unregister_notifiers(chan, handle);
 602
 603         /*
 604          * Note: the consumer takes care of finalizing and switching the
 605          * buffers.
 606          */
 607
 608         /*
 609          * sessiond/consumer are keeping a reference on the shm file
 610          * descriptor directly. No need to refcount.
 611          */
 612         channel_release(chan, handle, shadow);
 613         return;
 614 }
 615
 616 struct lttng_ust_lib_ring_buffer *channel_get_ring_buffer(
 617                                         const struct lttng_ust_lib_ring_buffer_config *config,
 618                                         struct channel *chan, int cpu,
 619                                         struct lttng_ust_shm_handle *handle,
 620                                         int **shm_fd, int **wait_fd,
 621                                         uint64_t **memory_map_size)
 622 {
 623         struct shm_ref *ref;
 624
 625         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
 626                 ref = &chan->backend.buf[0].shmp._ref;
 627                 shm_get_object_data(handle, ref, shm_fd, wait_fd,
 628                         memory_map_size);
 629                 return shmp(handle, chan->backend.buf[0].shmp);
 630         } else {
 631                 if (cpu >= num_possible_cpus())
 632                         return NULL;
 633                 ref = &chan->backend.buf[cpu].shmp._ref;
 634                 shm_get_object_data(handle, ref, shm_fd, wait_fd,
 635                         memory_map_size);
 636                 return shmp(handle, chan->backend.buf[cpu].shmp);
 637         }
 638 }
 639
 640 int lib_ring_buffer_open_read(struct lttng_ust_lib_ring_buffer *buf,
 641                               struct lttng_ust_shm_handle *handle,
 642                               int shadow)
 643 {
 644         if (shadow) {
 645                 if (uatomic_cmpxchg(&buf->active_shadow_readers, 0, 1) != 0)
 646                         return -EBUSY;
 647                 cmm_smp_mb();
 648                 return 0;
 649         }
 650         if (uatomic_cmpxchg(&buf->active_readers, 0, 1) != 0)
 651                 return -EBUSY;
 652         cmm_smp_mb();
 653         return 0;
 654 }
 655
 656 void lib_ring_buffer_release_read(struct lttng_ust_lib_ring_buffer *buf,
 657                                   struct lttng_ust_shm_handle *handle,
 658                                   int shadow)
 659 {
 660         struct channel *chan = shmp(handle, buf->backend.chan);
 661
 662         if (shadow) {
 663                 CHAN_WARN_ON(chan, uatomic_read(&buf->active_shadow_readers) != 1);
 664                 cmm_smp_mb();
 665                 uatomic_dec(&buf->active_shadow_readers);
 666                 return;
 667         }
 668         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1);
 669         cmm_smp_mb();
 670         uatomic_dec(&buf->active_readers);
 671 }
 672
 673 /**
 674  * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read)
 675  * @buf: ring buffer
 676  * @consumed: consumed count indicating the position where to read
 677  * @produced: produced count, indicates position when to stop reading
 678  *
 679  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 680  * data to read at consumed position, or 0 if the get operation succeeds.
 681  */
 682
 683 int lib_ring_buffer_snapshot(struct lttng_ust_lib_ring_buffer *buf,
 684                              unsigned long *consumed, unsigned long *produced,
 685                              struct lttng_ust_shm_handle *handle)
 686 {
 687         struct channel *chan = shmp(handle, buf->backend.chan);
 688         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 689         unsigned long consumed_cur, write_offset;
 690         int finalized;
 691
 692         finalized = CMM_ACCESS_ONCE(buf->finalized);
 693         /*
 694          * Read finalized before counters.
 695          */
 696         cmm_smp_rmb();
 697         consumed_cur = uatomic_read(&buf->consumed);
 698         /*
 699          * No need to issue a memory barrier between consumed count read and
 700          * write offset read, because consumed count can only change
 701          * concurrently in overwrite mode, and we keep a sequence counter
 702          * identifier derived from the write offset to check we are getting
 703          * the same sub-buffer we are expecting (the sub-buffers are atomically
 704          * "tagged" upon writes, tags are checked upon read).
 705          */
 706         write_offset = v_read(config, &buf->offset);
 707
 708         /*
 709          * Check that we are not about to read the same subbuffer in
 710          * which the writer head is.
 711          */
 712         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
 713             == 0)
 714                 goto nodata;
 715
 716         *consumed = consumed_cur;
 717         *produced = subbuf_trunc(write_offset, chan);
 718
 719         return 0;
 720
 721 nodata:
 722         /*
 723          * The memory barriers __wait_event()/wake_up_interruptible() take care
 724          * of "raw_spin_is_locked" memory ordering.
 725          */
 726         if (finalized)
 727                 return -ENODATA;
 728         else
 729                 return -EAGAIN;
 730 }
 731
 732 /**
 733  * lib_ring_buffer_put_snapshot - move consumed counter forward
 734  * @buf: ring buffer
 735  * @consumed_new: new consumed count value
 736  */
 737 void lib_ring_buffer_move_consumer(struct lttng_ust_lib_ring_buffer *buf,
 738                                    unsigned long consumed_new,
 739                                    struct lttng_ust_shm_handle *handle)
 740 {
 741         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
 742         struct channel *chan = shmp(handle, bufb->chan);
 743         unsigned long consumed;
 744
 745         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1
 746                         && uatomic_read(&buf->active_shadow_readers) != 1);
 747
 748         /*
 749          * Only push the consumed value forward.
 750          * If the consumed cmpxchg fails, this is because we have been pushed by
 751          * the writer in flight recorder mode.
 752          */
 753         consumed = uatomic_read(&buf->consumed);
 754         while ((long) consumed - (long) consumed_new < 0)
 755                 consumed = uatomic_cmpxchg(&buf->consumed, consumed,
 756                                            consumed_new);
 757 }
 758
 759 /**
 760  * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading
 761  * @buf: ring buffer
 762  * @consumed: consumed count indicating the position where to read
 763  *
 764  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 765  * data to read at consumed position, or 0 if the get operation succeeds.
 766  */
 767 int lib_ring_buffer_get_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 768                                unsigned long consumed,
 769                                struct lttng_ust_shm_handle *handle)
 770 {
 771         struct channel *chan = shmp(handle, buf->backend.chan);
 772         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 773         unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
 774         int ret;
 775         int finalized;
 776
 777 retry:
 778         finalized = CMM_ACCESS_ONCE(buf->finalized);
 779         /*
 780          * Read finalized before counters.
 781          */
 782         cmm_smp_rmb();
 783         consumed_cur = uatomic_read(&buf->consumed);
 784         consumed_idx = subbuf_index(consumed, chan);
 785         commit_count = v_read(config, &shmp_index(handle, buf->commit_cold, consumed_idx)->cc_sb);
 786         /*
 787          * Make sure we read the commit count before reading the buffer
 788          * data and the write offset. Correct consumed offset ordering
 789          * wrt commit count is insured by the use of cmpxchg to update
 790          * the consumed offset.
 791          */
 792         /*
 793          * Local rmb to match the remote wmb to read the commit count
 794          * before the buffer data and the write offset.
 795          */
 796         cmm_smp_rmb();
 797
 798         write_offset = v_read(config, &buf->offset);
 799
 800         /*
 801          * Check that the buffer we are getting is after or at consumed_cur
 802          * position.
 803          */
 804         if ((long) subbuf_trunc(consumed, chan)
 805             - (long) subbuf_trunc(consumed_cur, chan) < 0)
 806                 goto nodata;
 807
 808         /*
 809          * Check that the subbuffer we are trying to consume has been
 810          * already fully committed.
 811          */
 812         if (((commit_count - chan->backend.subbuf_size)
 813              & chan->commit_count_mask)
 814             - (buf_trunc(consumed_cur, chan)
 815                >> chan->backend.num_subbuf_order)
 816             != 0)
 817                 goto nodata;
 818
 819         /*
 820          * Check that we are not about to read the same subbuffer in
 821          * which the writer head is.
 822          */
 823         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
 824             == 0)
 825                 goto nodata;
 826
 827         /*
 828          * Failure to get the subbuffer causes a busy-loop retry without going
 829          * to a wait queue. These are caused by short-lived race windows where
 830          * the writer is getting access to a subbuffer we were trying to get
 831          * access to. Also checks that the "consumed" buffer count we are
 832          * looking for matches the one contained in the subbuffer id.
 833          */
 834         ret = update_read_sb_index(config, &buf->backend, &chan->backend,
 835                                    consumed_idx, buf_trunc_val(consumed, chan),
 836                                    handle);
 837         if (ret)
 838                 goto retry;
 839         subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);
 840
 841         buf->get_subbuf_consumed = consumed;
 842         buf->get_subbuf = 1;
 843
 844         return 0;
 845
 846 nodata:
 847         /*
 848          * The memory barriers __wait_event()/wake_up_interruptible() take care
 849          * of "raw_spin_is_locked" memory ordering.
 850          */
 851         if (finalized)
 852                 return -ENODATA;
 853         else
 854                 return -EAGAIN;
 855 }
 856
 857 /**
 858  * lib_ring_buffer_put_subbuf - release exclusive subbuffer access
 859  * @buf: ring buffer
 860  */
 861 void lib_ring_buffer_put_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 862                                 struct lttng_ust_shm_handle *handle)
 863 {
 864         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
 865         struct channel *chan = shmp(handle, bufb->chan);
 866         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 867         unsigned long read_sb_bindex, consumed_idx, consumed;
 868
 869         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1
 870                         && uatomic_read(&buf->active_shadow_readers) != 1);
 871
 872         if (!buf->get_subbuf) {
 873                 /*
 874                  * Reader puts a subbuffer it did not get.
 875                  */
 876                 CHAN_WARN_ON(chan, 1);
 877                 return;
 878         }
 879         consumed = buf->get_subbuf_consumed;
 880         buf->get_subbuf = 0;
 881
 882         /*
 883          * Clear the records_unread counter. (overruns counter)
 884          * Can still be non-zero if a file reader simply grabbed the data
 885          * without using iterators.
 886          * Can be below zero if an iterator is used on a snapshot more than
 887          * once.
 888          */
 889         read_sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id);
 890         v_add(config, v_read(config,
 891                              &shmp(handle, shmp_index(handle, bufb->array, read_sb_bindex)->shmp)->records_unread),
 892               &bufb->records_read);
 893         v_set(config, &shmp(handle, shmp_index(handle, bufb->array, read_sb_bindex)->shmp)->records_unread, 0);
 894         CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE
 895                      && subbuffer_id_is_noref(config, bufb->buf_rsb.id));
 896         subbuffer_id_set_noref(config, &bufb->buf_rsb.id);
 897
 898         /*
 899          * Exchange the reader subbuffer with the one we put in its place in the
 900          * writer subbuffer table. Expect the original consumed count. If
 901          * update_read_sb_index fails, this is because the writer updated the
 902          * subbuffer concurrently. We should therefore keep the subbuffer we
 903          * currently have: it has become invalid to try reading this sub-buffer
 904          * consumed count value anyway.
 905          */
 906         consumed_idx = subbuf_index(consumed, chan);
 907         update_read_sb_index(config, &buf->backend, &chan->backend,
 908                              consumed_idx, buf_trunc_val(consumed, chan),
 909                              handle);
 910         /*
 911          * update_read_sb_index return value ignored. Don't exchange sub-buffer
 912          * if the writer concurrently updated it.
 913          */
 914 }
 915
 916 /*
 917  * cons_offset is an iterator on all subbuffer offsets between the reader
 918  * position and the writer position. (inclusive)
 919  */
 920 static
 921 void lib_ring_buffer_print_subbuffer_errors(struct lttng_ust_lib_ring_buffer *buf,
 922                                             struct channel *chan,
 923                                             unsigned long cons_offset,
 924                                             int cpu,
 925                                             struct lttng_ust_shm_handle *handle)
 926 {
 927         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 928         unsigned long cons_idx, commit_count, commit_count_sb;
 929
 930         cons_idx = subbuf_index(cons_offset, chan);
 931         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, cons_idx)->cc);
 932         commit_count_sb = v_read(config, &shmp_index(handle, buf->commit_cold, cons_idx)->cc_sb);
 933
 934         if (subbuf_offset(commit_count, chan) != 0)
 935                 DBG("ring buffer %s, cpu %d: "
 936                        "commit count in subbuffer %lu,\n"
 937                        "expecting multiples of %lu bytes\n"
 938                        "  [ %lu bytes committed, %lu bytes reader-visible ]\n",
 939                        chan->backend.name, cpu, cons_idx,
 940                        chan->backend.subbuf_size,
 941                        commit_count, commit_count_sb);
 942
 943         DBG("ring buffer: %s, cpu %d: %lu bytes committed\n",
 944                chan->backend.name, cpu, commit_count);
 945 }
 946
 947 static
 948 void lib_ring_buffer_print_buffer_errors(struct lttng_ust_lib_ring_buffer *buf,
 949                                          struct channel *chan,
 950                                          void *priv, int cpu,
 951                                          struct lttng_ust_shm_handle *handle)
 952 {
 953         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 954         unsigned long write_offset, cons_offset;
 955
 956         /*
 957          * No need to order commit_count, write_offset and cons_offset reads
 958          * because we execute at teardown when no more writer nor reader
 959          * references are left.
 960          */
 961         write_offset = v_read(config, &buf->offset);
 962         cons_offset = uatomic_read(&buf->consumed);
 963         if (write_offset != cons_offset)
 964                 DBG("ring buffer %s, cpu %d: "
 965                        "non-consumed data\n"
 966                        "  [ %lu bytes written, %lu bytes read ]\n",
 967                        chan->backend.name, cpu, write_offset, cons_offset);
 968
 969         for (cons_offset = uatomic_read(&buf->consumed);
 970              (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset),
 971                                   chan)
 972                      - cons_offset) > 0;
 973              cons_offset = subbuf_align(cons_offset, chan))
 974                 lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset,
 975                                                        cpu, handle);
 976 }
 977
 978 static
 979 void lib_ring_buffer_print_errors(struct channel *chan,
 980                                   struct lttng_ust_lib_ring_buffer *buf, int cpu,
 981                                   struct lttng_ust_shm_handle *handle)
 982 {
 983         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 984         void *priv = channel_get_private(chan);
 985
 986         if (!strcmp(chan->backend.name, "relay-metadata-mmap")) {
 987                 DBG("ring buffer %s: %lu records written, "
 988                         "%lu records overrun\n",
 989                         chan->backend.name,
 990                         v_read(config, &buf->records_count),
 991                         v_read(config, &buf->records_overrun));
 992         } else {
 993                 DBG("ring buffer %s, cpu %d: %lu records written, "
 994                         "%lu records overrun\n",
 995                         chan->backend.name, cpu,
 996                         v_read(config, &buf->records_count),
 997                         v_read(config, &buf->records_overrun));
 998
 999                 if (v_read(config, &buf->records_lost_full)
1000                     || v_read(config, &buf->records_lost_wrap)
1001                     || v_read(config, &buf->records_lost_big))
1002                         DBG("ring buffer %s, cpu %d: records were lost. Caused by:\n"
1003                                 "  [ %lu buffer full, %lu nest buffer wrap-around, "
1004                                 "%lu event too big ]\n",
1005                                 chan->backend.name, cpu,
1006                                 v_read(config, &buf->records_lost_full),
1007                                 v_read(config, &buf->records_lost_wrap),
1008                                 v_read(config, &buf->records_lost_big));
1009         }
1010         lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu, handle);
1011 }
1012
1013 /*
1014  * lib_ring_buffer_switch_old_start: Populate old subbuffer header.
1015  *
1016  * Only executed when the buffer is finalized, in SWITCH_FLUSH.
1017  */
1018 static
1019 void lib_ring_buffer_switch_old_start(struct lttng_ust_lib_ring_buffer *buf,
1020                                       struct channel *chan,
1021                                       struct switch_offsets *offsets,
1022                                       uint64_t tsc,
1023                                       struct lttng_ust_shm_handle *handle)
1024 {
1025         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1026         unsigned long oldidx = subbuf_index(offsets->old, chan);
1027         unsigned long commit_count;
1028
1029         config->cb.buffer_begin(buf, tsc, oldidx, handle);
1030
1031         /*
1032          * Order all writes to buffer before the commit count update that will
1033          * determine that the subbuffer is full.
1034          */
1035         cmm_smp_wmb();
1036         v_add(config, config->cb.subbuffer_header_size(),
1037               &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1038         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1039         /* Check if the written buffer has to be delivered */
1040         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old,
1041                                       commit_count, oldidx, handle);
1042         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1043                                              offsets->old, commit_count,
1044                                              config->cb.subbuffer_header_size(),
1045                                              handle);
1046 }
1047
1048 /*
1049  * lib_ring_buffer_switch_old_end: switch old subbuffer
1050  *
1051  * Note : offset_old should never be 0 here. It is ok, because we never perform
1052  * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller
1053  * increments the offset_old value when doing a SWITCH_FLUSH on an empty
1054  * subbuffer.
1055  */
1056 static
1057 void lib_ring_buffer_switch_old_end(struct lttng_ust_lib_ring_buffer *buf,
1058                                     struct channel *chan,
1059                                     struct switch_offsets *offsets,
1060                                     uint64_t tsc,
1061                                     struct lttng_ust_shm_handle *handle)
1062 {
1063         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1064         unsigned long oldidx = subbuf_index(offsets->old - 1, chan);
1065         unsigned long commit_count, padding_size, data_size;
1066
1067         data_size = subbuf_offset(offsets->old - 1, chan) + 1;
1068         padding_size = chan->backend.subbuf_size - data_size;
1069         subbuffer_set_data_size(config, &buf->backend, oldidx, data_size,
1070                                 handle);
1071
1072         /*
1073          * Order all writes to buffer before the commit count update that will
1074          * determine that the subbuffer is full.
1075          */
1076         cmm_smp_wmb();
1077         v_add(config, padding_size, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1078         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1079         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1,
1080                                       commit_count, oldidx, handle);
1081         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1082                                              offsets->old, commit_count,
1083                                              padding_size, handle);
1084 }
1085
1086 /*
1087  * lib_ring_buffer_switch_new_start: Populate new subbuffer.
1088  *
1089  * This code can be executed unordered : writers may already have written to the
1090  * sub-buffer before this code gets executed, caution.  The commit makes sure
1091  * that this code is executed before the deliver of this sub-buffer.
1092  */
1093 static
1094 void lib_ring_buffer_switch_new_start(struct lttng_ust_lib_ring_buffer *buf,
1095                                       struct channel *chan,
1096                                       struct switch_offsets *offsets,
1097                                       uint64_t tsc,
1098                                       struct lttng_ust_shm_handle *handle)
1099 {
1100         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1101         unsigned long beginidx = subbuf_index(offsets->begin, chan);
1102         unsigned long commit_count;
1103
1104         config->cb.buffer_begin(buf, tsc, beginidx, handle);
1105
1106         /*
1107          * Order all writes to buffer before the commit count update that will
1108          * determine that the subbuffer is full.
1109          */
1110         cmm_smp_wmb();
1111         v_add(config, config->cb.subbuffer_header_size(),
1112               &shmp_index(handle, buf->commit_hot, beginidx)->cc);
1113         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, beginidx)->cc);
1114         /* Check if the written buffer has to be delivered */
1115         lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin,
1116                                       commit_count, beginidx, handle);
1117         lib_ring_buffer_write_commit_counter(config, buf, chan, beginidx,
1118                                              offsets->begin, commit_count,
1119                                              config->cb.subbuffer_header_size(),
1120                                              handle);
1121 }
1122
1123 /*
1124  * lib_ring_buffer_switch_new_end: finish switching current subbuffer
1125  *
1126  * The only remaining threads could be the ones with pending commits. They will
1127  * have to do the deliver themselves.
1128  */
1129 static
1130 void lib_ring_buffer_switch_new_end(struct lttng_ust_lib_ring_buffer *buf,
1131                                     struct channel *chan,
1132                                     struct switch_offsets *offsets,
1133                                     uint64_t tsc,
1134                                     struct lttng_ust_shm_handle *handle)
1135 {
1136         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1137         unsigned long endidx = subbuf_index(offsets->end - 1, chan);
1138         unsigned long commit_count, padding_size, data_size;
1139
1140         data_size = subbuf_offset(offsets->end - 1, chan) + 1;
1141         padding_size = chan->backend.subbuf_size - data_size;
1142         subbuffer_set_data_size(config, &buf->backend, endidx, data_size,
1143                                 handle);
1144
1145         /*
1146          * Order all writes to buffer before the commit count update that will
1147          * determine that the subbuffer is full.
1148          */
1149         cmm_smp_wmb();
1150         v_add(config, padding_size, &shmp_index(handle, buf->commit_hot, endidx)->cc);
1151         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, endidx)->cc);
1152         lib_ring_buffer_check_deliver(config, buf, chan, offsets->end - 1,
1153                                   commit_count, endidx, handle);
1154         lib_ring_buffer_write_commit_counter(config, buf, chan, endidx,
1155                                              offsets->end, commit_count,
1156                                              padding_size, handle);
1157 }
1158
1159 /*
1160  * Returns :
1161  * 0 if ok
1162  * !0 if execution must be aborted.
1163  */
1164 static
1165 int lib_ring_buffer_try_switch_slow(enum switch_mode mode,
1166                                     struct lttng_ust_lib_ring_buffer *buf,
1167                                     struct channel *chan,
1168                                     struct switch_offsets *offsets,
1169                                     uint64_t *tsc)
1170 {
1171         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1172         unsigned long off;
1173
1174         offsets->begin = v_read(config, &buf->offset);
1175         offsets->old = offsets->begin;
1176         offsets->switch_old_start = 0;
1177         off = subbuf_offset(offsets->begin, chan);
1178
1179         *tsc = config->cb.ring_buffer_clock_read(chan);
1180
1181         /*
1182          * Ensure we flush the header of an empty subbuffer when doing the
1183          * finalize (SWITCH_FLUSH). This ensures that we end up knowing the
1184          * total data gathering duration even if there were no records saved
1185          * after the last buffer switch.
1186          * In SWITCH_ACTIVE mode, switch the buffer when it contains events.
1187          * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of
1188          * subbuffer header as appropriate.
1189          * The next record that reserves space will be responsible for
1190          * populating the following subbuffer header. We choose not to populate
1191          * the next subbuffer header here because we want to be able to use
1192          * SWITCH_ACTIVE for periodical buffer flush, which must
1193          * guarantee that all the buffer content (records and header
1194          * timestamps) are visible to the reader. This is required for
1195          * quiescence guarantees for the fusion merge.
1196          */
1197         if (mode == SWITCH_FLUSH || off > 0) {
1198                 if (caa_unlikely(off == 0)) {
1199                         /*
1200                          * The client does not save any header information.
1201                          * Don't switch empty subbuffer on finalize, because it
1202                          * is invalid to deliver a completely empty subbuffer.
1203                          */
1204                         if (!config->cb.subbuffer_header_size())
1205                                 return -1;
1206                         /*
1207                          * Need to write the subbuffer start header on finalize.
1208                          */
1209                         offsets->switch_old_start = 1;
1210                 }
1211                 offsets->begin = subbuf_align(offsets->begin, chan);
1212         } else
1213                 return -1;      /* we do not have to switch : buffer is empty */
1214         /* Note: old points to the next subbuf at offset 0 */
1215         offsets->end = offsets->begin;
1216         return 0;
1217 }
1218
1219 /*
1220  * Force a sub-buffer switch. This operation is completely reentrant : can be
1221  * called while tracing is active with absolutely no lock held.
1222  *
1223  * Note, however, that as a v_cmpxchg is used for some atomic
1224  * operations, this function must be called from the CPU which owns the buffer
1225  * for a ACTIVE flush.
1226  */
1227 void lib_ring_buffer_switch_slow(struct lttng_ust_lib_ring_buffer *buf, enum switch_mode mode,
1228                                  struct lttng_ust_shm_handle *handle)
1229 {
1230         struct channel *chan = shmp(handle, buf->backend.chan);
1231         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1232         struct switch_offsets offsets;
1233         unsigned long oldidx;
1234         uint64_t tsc;
1235
1236         offsets.size = 0;
1237
1238         /*
1239          * Perform retryable operations.
1240          */
1241         do {
1242                 if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets,
1243                                                     &tsc))
1244                         return; /* Switch not needed */
1245         } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end)
1246                  != offsets.old);
1247
1248         /*
1249          * Atomically update last_tsc. This update races against concurrent
1250          * atomic updates, but the race will always cause supplementary full TSC
1251          * records, never the opposite (missing a full TSC record when it would
1252          * be needed).
1253          */
1254         save_last_tsc(config, buf, tsc);
1255
1256         /*
1257          * Push the reader if necessary
1258          */
1259         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old);
1260
1261         oldidx = subbuf_index(offsets.old, chan);
1262         lib_ring_buffer_clear_noref(config, &buf->backend, oldidx, handle);
1263
1264         /*
1265          * May need to populate header start on SWITCH_FLUSH.
1266          */
1267         if (offsets.switch_old_start) {
1268                 lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc, handle);
1269                 offsets.old += config->cb.subbuffer_header_size();
1270         }
1271
1272         /*
1273          * Switch old subbuffer.
1274          */
1275         lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc, handle);
1276 }
1277
1278 /*
1279  * Returns :
1280  * 0 if ok
1281  * -ENOSPC if event size is too large for packet.
1282  * -ENOBUFS if there is currently not enough space in buffer for the event.
1283  * -EIO if data cannot be written into the buffer for any other reason.
1284  */
1285 static
1286 int lib_ring_buffer_try_reserve_slow(struct lttng_ust_lib_ring_buffer *buf,
1287                                      struct channel *chan,
1288                                      struct switch_offsets *offsets,
1289                                      struct lttng_ust_lib_ring_buffer_ctx *ctx)
1290 {
1291         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1292         struct lttng_ust_shm_handle *handle = ctx->handle;
1293         unsigned long reserve_commit_diff;
1294
1295         offsets->begin = v_read(config, &buf->offset);
1296         offsets->old = offsets->begin;
1297         offsets->switch_new_start = 0;
1298         offsets->switch_new_end = 0;
1299         offsets->switch_old_end = 0;
1300         offsets->pre_header_padding = 0;
1301
1302         ctx->tsc = config->cb.ring_buffer_clock_read(chan);
1303         if ((int64_t) ctx->tsc == -EIO)
1304                 return -EIO;
1305
1306         if (last_tsc_overflow(config, buf, ctx->tsc))
1307                 ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC;
1308
1309         if (caa_unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) {
1310                 offsets->switch_new_start = 1;          /* For offsets->begin */
1311         } else {
1312                 offsets->size = config->cb.record_header_size(config, chan,
1313                                                 offsets->begin,
1314                                                 &offsets->pre_header_padding,
1315                                                 ctx);
1316                 offsets->size +=
1317                         lib_ring_buffer_align(offsets->begin + offsets->size,
1318                                               ctx->largest_align)
1319                         + ctx->data_size;
1320                 if (caa_unlikely(subbuf_offset(offsets->begin, chan) +
1321                              offsets->size > chan->backend.subbuf_size)) {
1322                         offsets->switch_old_end = 1;    /* For offsets->old */
1323                         offsets->switch_new_start = 1;  /* For offsets->begin */
1324                 }
1325         }
1326         if (caa_unlikely(offsets->switch_new_start)) {
1327                 unsigned long sb_index;
1328
1329                 /*
1330                  * We are typically not filling the previous buffer completely.
1331                  */
1332                 if (caa_likely(offsets->switch_old_end))
1333                         offsets->begin = subbuf_align(offsets->begin, chan);
1334                 offsets->begin = offsets->begin
1335                                  + config->cb.subbuffer_header_size();
1336                 /* Test new buffer integrity */
1337                 sb_index = subbuf_index(offsets->begin, chan);
1338                 reserve_commit_diff =
1339                   (buf_trunc(offsets->begin, chan)
1340                    >> chan->backend.num_subbuf_order)
1341                   - ((unsigned long) v_read(config,
1342                                             &shmp_index(handle, buf->commit_cold, sb_index)->cc_sb)
1343                      & chan->commit_count_mask);
1344                 if (caa_likely(reserve_commit_diff == 0)) {
1345                         /* Next subbuffer not being written to. */
1346                         if (caa_unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1347                                 subbuf_trunc(offsets->begin, chan)
1348                                  - subbuf_trunc((unsigned long)
1349                                      uatomic_read(&buf->consumed), chan)
1350                                 >= chan->backend.buf_size)) {
1351                                 unsigned long nr_lost;
1352
1353                                 /*
1354                                  * We do not overwrite non consumed buffers
1355                                  * and we are full : record is lost.
1356                                  */
1357                                 nr_lost = v_read(config, &buf->records_lost_full);
1358                                 v_inc(config, &buf->records_lost_full);
1359                                 if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1360                                         DBG("%lu or more records lost in (%s:%d) (buffer full)\n",
1361                                                 nr_lost + 1, chan->backend.name,
1362                                                 buf->backend.cpu);
1363                                 }
1364                                 return -ENOBUFS;
1365                         } else {
1366                                 /*
1367                                  * Next subbuffer not being written to, and we
1368                                  * are either in overwrite mode or the buffer is
1369                                  * not full. It's safe to write in this new
1370                                  * subbuffer.
1371                                  */
1372                         }
1373                 } else {
1374                         unsigned long nr_lost;
1375
1376                         /*
1377                          * Next subbuffer reserve offset does not match the
1378                          * commit offset. Drop record in producer-consumer and
1379                          * overwrite mode. Caused by either a writer OOPS or too
1380                          * many nested writes over a reserve/commit pair.
1381                          */
1382                         nr_lost = v_read(config, &buf->records_lost_wrap);
1383                         v_inc(config, &buf->records_lost_wrap);
1384                         if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1385                                 DBG("%lu or more records lost in (%s:%d) (wrap-around)\n",
1386                                         nr_lost + 1, chan->backend.name,
1387                                         buf->backend.cpu);
1388                         }
1389                         return -EIO;
1390                 }
1391                 offsets->size =
1392                         config->cb.record_header_size(config, chan,
1393                                                 offsets->begin,
1394                                                 &offsets->pre_header_padding,
1395                                                 ctx);
1396                 offsets->size +=
1397                         lib_ring_buffer_align(offsets->begin + offsets->size,
1398                                               ctx->largest_align)
1399                         + ctx->data_size;
1400                 if (caa_unlikely(subbuf_offset(offsets->begin, chan)
1401                              + offsets->size > chan->backend.subbuf_size)) {
1402                         unsigned long nr_lost;
1403
1404                         /*
1405                          * Record too big for subbuffers, report error, don't
1406                          * complete the sub-buffer switch.
1407                          */
1408                         nr_lost = v_read(config, &buf->records_lost_big);
1409                         v_inc(config, &buf->records_lost_big);
1410                         if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1411                                 DBG("%lu or more records lost in (%s:%d) record size "
1412                                         " of %zu bytes is too large for buffer\n",
1413                                         nr_lost + 1, chan->backend.name,
1414                                         buf->backend.cpu, offsets->size);
1415                         }
1416                         return -ENOSPC;
1417                 } else {
1418                         /*
1419                          * We just made a successful buffer switch and the
1420                          * record fits in the new subbuffer. Let's write.
1421                          */
1422                 }
1423         } else {
1424                 /*
1425                  * Record fits in the current buffer and we are not on a switch
1426                  * boundary. It's safe to write.
1427                  */
1428         }
1429         offsets->end = offsets->begin + offsets->size;
1430
1431         if (caa_unlikely(subbuf_offset(offsets->end, chan) == 0)) {
1432                 /*
1433                  * The offset_end will fall at the very beginning of the next
1434                  * subbuffer.
1435                  */
1436                 offsets->switch_new_end = 1;    /* For offsets->begin */
1437         }
1438         return 0;
1439 }
1440
1441 /**
1442  * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer.
1443  * @ctx: ring buffer context.
1444  *
1445  * Return : -NOBUFS if not enough space, -ENOSPC if event size too large,
1446  * -EIO for other errors, else returns 0.
1447  * It will take care of sub-buffer switching.
1448  */
1449 int lib_ring_buffer_reserve_slow(struct lttng_ust_lib_ring_buffer_ctx *ctx)
1450 {
1451         struct channel *chan = ctx->chan;
1452         struct lttng_ust_shm_handle *handle = ctx->handle;
1453         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1454         struct lttng_ust_lib_ring_buffer *buf;
1455         struct switch_offsets offsets;
1456         int ret;
1457
1458         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
1459                 buf = shmp(handle, chan->backend.buf[ctx->cpu].shmp);
1460         else
1461                 buf = shmp(handle, chan->backend.buf[0].shmp);
1462         ctx->buf = buf;
1463
1464         offsets.size = 0;
1465
1466         do {
1467                 ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets,
1468                                                        ctx);
1469                 if (caa_unlikely(ret))
1470                         return ret;
1471         } while (caa_unlikely(v_cmpxchg(config, &buf->offset, offsets.old,
1472                                     offsets.end)
1473                           != offsets.old));
1474
1475         /*
1476          * Atomically update last_tsc. This update races against concurrent
1477          * atomic updates, but the race will always cause supplementary full TSC
1478          * records, never the opposite (missing a full TSC record when it would
1479          * be needed).
1480          */
1481         save_last_tsc(config, buf, ctx->tsc);
1482
1483         /*
1484          * Push the reader if necessary
1485          */
1486         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1);
1487
1488         /*
1489          * Clear noref flag for this subbuffer.
1490          */
1491         lib_ring_buffer_clear_noref(config, &buf->backend,
1492                                     subbuf_index(offsets.end - 1, chan),
1493                                     handle);
1494
1495         /*
1496          * Switch old subbuffer if needed.
1497          */
1498         if (caa_unlikely(offsets.switch_old_end)) {
1499                 lib_ring_buffer_clear_noref(config, &buf->backend,
1500                                             subbuf_index(offsets.old - 1, chan),
1501                                             handle);
1502                 lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc, handle);
1503         }
1504
1505         /*
1506          * Populate new subbuffer.
1507          */
1508         if (caa_unlikely(offsets.switch_new_start))
1509                 lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc, handle);
1510
1511         if (caa_unlikely(offsets.switch_new_end))
1512                 lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc, handle);
1513
1514         ctx->slot_size = offsets.size;
1515         ctx->pre_offset = offsets.begin;
1516         ctx->buf_offset = offsets.begin + offsets.pre_header_padding;
1517         return 0;
1518 }
1519
1520 /*
1521  * Force a read (imply TLS fixup for dlopen) of TLS variables.
1522  */
1523 void lttng_fixup_ringbuffer_tls(void)
1524 {
1525         asm volatile ("" : : "m" (lib_ring_buffer_nesting));
1526 }