libringbuffer/ring_buffer_frontend.c

   1 /*
   2  * ring_buffer_frontend.c
   3  *
   4  * (C) Copyright 2005-2010 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   5  *
   6  * Ring buffer wait-free buffer synchronization. Producer-consumer and flight
   7  * recorder (overwrite) modes. See thesis:
   8  *
   9  * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D.
  10  * dissertation, Ecole Polytechnique de Montreal.
  11  * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf
  12  *
  13  * - Algorithm presentation in Chapter 5:
  14  *     "Lockless Multi-Core High-Throughput Buffering".
  15  * - Algorithm formal verification in Section 8.6:
  16  *     "Formal verification of LTTng"
  17  *
  18  * Author:
  19  *      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  20  *
  21  * Inspired from LTT and RelayFS:
  22  *  Karim Yaghmour <karim@opersys.com>
  23  *  Tom Zanussi <zanussi@us.ibm.com>
  24  *  Bob Wisniewski <bob@watson.ibm.com>
  25  * And from K42 :
  26  *  Bob Wisniewski <bob@watson.ibm.com>
  27  *
  28  * Buffer reader semantic :
  29  *
  30  * - get_subbuf_size
  31  * while buffer is not finalized and empty
  32  *   - get_subbuf
  33  *     - if return value != 0, continue
  34  *   - splice one subbuffer worth of data to a pipe
  35  *   - splice the data from pipe to disk/network
  36  *   - put_subbuf
  37  *
  38  * Dual LGPL v2.1/GPL v2 license.
  39  */
  40
  41 #define _GNU_SOURCE
  42 #include <sys/types.h>
  43 #include <sys/mman.h>
  44 #include <sys/stat.h>
  45 #include <fcntl.h>
  46 #include <urcu/compiler.h>
  47 #include <urcu/ref.h>
  48 #include <helper.h>
  49
  50 #include "smp.h"
  51 #include <lttng/ringbuffer-config.h>
  52 #include "vatomic.h"
  53 #include "backend.h"
  54 #include "frontend.h"
  55 #include "shm.h"
  56 #include "tlsfixup.h"
  57
  58 #ifndef max
  59 #define max(a, b)       ((a) > (b) ? (a) : (b))
  60 #endif
  61
  62 /* Print DBG() messages about events lost only every 1048576 hits */
  63 #define DBG_PRINT_NR_LOST       (1UL << 20)
  64
  65 /*
  66  * Use POSIX SHM: shm_open(3) and shm_unlink(3).
  67  * close(2) to close the fd returned by shm_open.
  68  * shm_unlink releases the shared memory object name.
  69  * ftruncate(2) sets the size of the memory object.
  70  * mmap/munmap maps the shared memory obj to a virtual address in the
   71  * calling process (should be done both in libust and consumer).
  72  * See shm_overview(7) for details.
  73  * Pass file descriptor returned by shm_open(3) to ltt-sessiond through
  74  * a UNIX socket.
  75  *
  76  * Since we don't need to access the object using its name, we can
  77  * immediately shm_unlink(3) it, and only keep the handle with its file
  78  * descriptor.
  79  */
  80
  81 /*
  82  * Internal structure representing offsets to use at a sub-buffer switch.
  83  */
  84 struct switch_offsets {
  85         unsigned long begin, end, old;
  86         size_t pre_header_padding, size;
  87         unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1,
  88                      switch_old_end:1;
  89 };
  90
  91 __thread unsigned int lib_ring_buffer_nesting;
  92
  93 /*
  94  * TODO: this is unused. Errors are saved within the ring buffer.
  95  * Eventually, allow consumerd to print these errors.
  96  */
  97 static
  98 void lib_ring_buffer_print_errors(struct channel *chan,
  99                                   struct lttng_ust_lib_ring_buffer *buf, int cpu,
 100                                   struct lttng_ust_shm_handle *handle)
 101         __attribute__((unused));
 102
 103 /**
 104  * lib_ring_buffer_reset - Reset ring buffer to initial values.
 105  * @buf: Ring buffer.
 106  *
 107  * Effectively empty the ring buffer. Should be called when the buffer is not
 108  * used for writing. The ring buffer can be opened for reading, but the reader
 109  * should not be using the iterator concurrently with reset. The previous
 110  * current iterator record is reset.
 111  */
 112 void lib_ring_buffer_reset(struct lttng_ust_lib_ring_buffer *buf,
 113                            struct lttng_ust_shm_handle *handle)
 114 {
 115         struct channel *chan = shmp(handle, buf->backend.chan);
 116         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 117         unsigned int i;
 118
 119         /*
 120          * Reset iterator first. It will put the subbuffer if it currently holds
 121          * it.
 122          */
 123         v_set(config, &buf->offset, 0);
 124         for (i = 0; i < chan->backend.num_subbuf; i++) {
 125                 v_set(config, &shmp_index(handle, buf->commit_hot, i)->cc, 0);
 126                 v_set(config, &shmp_index(handle, buf->commit_hot, i)->seq, 0);
 127                 v_set(config, &shmp_index(handle, buf->commit_cold, i)->cc_sb, 0);
 128         }
 129         uatomic_set(&buf->consumed, 0);
 130         uatomic_set(&buf->record_disabled, 0);
 131         v_set(config, &buf->last_tsc, 0);
 132         lib_ring_buffer_backend_reset(&buf->backend, handle);
 133         /* Don't reset number of active readers */
 134         v_set(config, &buf->records_lost_full, 0);
 135         v_set(config, &buf->records_lost_wrap, 0);
 136         v_set(config, &buf->records_lost_big, 0);
 137         v_set(config, &buf->records_count, 0);
 138         v_set(config, &buf->records_overrun, 0);
 139         buf->finalized = 0;
 140 }
 141
 142 /**
 143  * channel_reset - Reset channel to initial values.
 144  * @chan: Channel.
 145  *
 146  * Effectively empty the channel. Should be called when the channel is not used
 147  * for writing. The channel can be opened for reading, but the reader should not
 148  * be using the iterator concurrently with reset. The previous current iterator
 149  * record is reset.
 150  */
 151 void channel_reset(struct channel *chan)
 152 {
 153         /*
 154          * Reset iterators first. Will put the subbuffer if held for reading.
 155          */
 156         uatomic_set(&chan->record_disabled, 0);
 157         /* Don't reset commit_count_mask, still valid */
 158         channel_backend_reset(&chan->backend);
 159         /* Don't reset switch/read timer interval */
 160         /* Don't reset notifiers and notifier enable bits */
 161         /* Don't reset reader reference count */
 162 }
 163
 164 /*
 165  * Must be called under cpu hotplug protection.
 166  */
 167 int lib_ring_buffer_create(struct lttng_ust_lib_ring_buffer *buf,
 168                            struct channel_backend *chanb, int cpu,
 169                            struct lttng_ust_shm_handle *handle,
 170                            struct shm_object *shmobj)
 171 {
 172         const struct lttng_ust_lib_ring_buffer_config *config = &chanb->config;
 173         struct channel *chan = caa_container_of(chanb, struct channel, backend);
 174         void *priv = channel_get_private(chan);
 175         size_t subbuf_header_size;
 176         uint64_t tsc;
 177         int ret;
 178
 179         /* Test for cpu hotplug */
 180         if (buf->backend.allocated)
 181                 return 0;
 182
 183         ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend,
 184                         cpu, handle, shmobj);
 185         if (ret)
 186                 return ret;
 187
 188         align_shm(shmobj, __alignof__(struct commit_counters_hot));
 189         set_shmp(buf->commit_hot,
 190                  zalloc_shm(shmobj,
 191                         sizeof(struct commit_counters_hot) * chan->backend.num_subbuf));
 192         if (!shmp(handle, buf->commit_hot)) {
 193                 ret = -ENOMEM;
 194                 goto free_chanbuf;
 195         }
 196
 197         align_shm(shmobj, __alignof__(struct commit_counters_cold));
 198         set_shmp(buf->commit_cold,
 199                  zalloc_shm(shmobj,
 200                         sizeof(struct commit_counters_cold) * chan->backend.num_subbuf));
 201         if (!shmp(handle, buf->commit_cold)) {
 202                 ret = -ENOMEM;
 203                 goto free_commit;
 204         }
 205
 206         /*
 207          * Write the subbuffer header for first subbuffer so we know the total
 208          * duration of data gathering.
 209          */
 210         subbuf_header_size = config->cb.subbuffer_header_size();
 211         v_set(config, &buf->offset, subbuf_header_size);
 212         subbuffer_id_clear_noref(config, &shmp_index(handle, buf->backend.buf_wsb, 0)->id);
 213         tsc = config->cb.ring_buffer_clock_read(shmp(handle, buf->backend.chan));
 214         config->cb.buffer_begin(buf, tsc, 0, handle);
 215         v_add(config, subbuf_header_size, &shmp_index(handle, buf->commit_hot, 0)->cc);
 216
 217         if (config->cb.buffer_create) {
 218                 ret = config->cb.buffer_create(buf, priv, cpu, chanb->name, handle);
 219                 if (ret)
 220                         goto free_init;
 221         }
 222         buf->backend.allocated = 1;
 223         return 0;
 224
 225         /* Error handling */
 226 free_init:
 227         /* commit_cold will be freed by shm teardown */
 228 free_commit:
 229         /* commit_hot will be freed by shm teardown */
 230 free_chanbuf:
 231         return ret;
 232 }
 233
 234 #if 0
 235 static void switch_buffer_timer(unsigned long data)
 236 {
 237         struct lttng_ust_lib_ring_buffer *buf = (struct lttng_ust_lib_ring_buffer *)data;
 238         struct channel *chan = shmp(handle, buf->backend.chan);
 239         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 240
 241         /*
 242          * Only flush buffers periodically if readers are active.
 243          */
 244         if (uatomic_read(&buf->active_readers) || uatomic_read(&buf->active_shadow_readers))
 245                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE, handle);
 246
 247         //TODO timers
 248         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 249         //      mod_timer_pinned(&buf->switch_timer,
 250         //                       jiffies + chan->switch_timer_interval);
 251         //else
 252         //      mod_timer(&buf->switch_timer,
 253         //                jiffies + chan->switch_timer_interval);
 254 }
 255 #endif //0
 256
 257 static void lib_ring_buffer_start_switch_timer(struct lttng_ust_lib_ring_buffer *buf,
 258                            struct lttng_ust_shm_handle *handle)
 259 {
 260         struct channel *chan = shmp(handle, buf->backend.chan);
 261         //const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 262
 263         if (!chan->switch_timer_interval || buf->switch_timer_enabled)
 264                 return;
 265         //TODO
 266         //init_timer(&buf->switch_timer);
 267         //buf->switch_timer.function = switch_buffer_timer;
 268         //buf->switch_timer.expires = jiffies + chan->switch_timer_interval;
 269         //buf->switch_timer.data = (unsigned long)buf;
 270         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 271         //      add_timer_on(&buf->switch_timer, buf->backend.cpu);
 272         //else
 273         //      add_timer(&buf->switch_timer);
 274         buf->switch_timer_enabled = 1;
 275 }
 276
 277 static void lib_ring_buffer_stop_switch_timer(struct lttng_ust_lib_ring_buffer *buf,
 278                            struct lttng_ust_shm_handle *handle)
 279 {
 280         struct channel *chan = shmp(handle, buf->backend.chan);
 281
 282         if (!chan->switch_timer_interval || !buf->switch_timer_enabled)
 283                 return;
 284
 285         //TODO
 286         //del_timer_sync(&buf->switch_timer);
 287         buf->switch_timer_enabled = 0;
 288 }
 289
 290 #if 0
 291 /*
 292  * Polling timer to check the channels for data.
 293  */
 294 static void read_buffer_timer(unsigned long data)
 295 {
 296         struct lttng_ust_lib_ring_buffer *buf = (struct lttng_ust_lib_ring_buffer *)data;
 297         struct channel *chan = shmp(handle, buf->backend.chan);
 298         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 299
 300         CHAN_WARN_ON(chan, !buf->backend.allocated);
 301
 302         if (uatomic_read(&buf->active_readers) || uatomic_read(&buf->active_shadow_readers))
 303             && lib_ring_buffer_poll_deliver(config, buf, chan)) {
 304                 //TODO
 305                 //wake_up_interruptible(&buf->read_wait);
 306                 //wake_up_interruptible(&chan->read_wait);
 307         }
 308
 309         //TODO
 310         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 311         //      mod_timer_pinned(&buf->read_timer,
 312         //                       jiffies + chan->read_timer_interval);
 313         //else
 314         //      mod_timer(&buf->read_timer,
 315         //                jiffies + chan->read_timer_interval);
 316 }
 317 #endif //0
 318
 319 static void lib_ring_buffer_start_read_timer(struct lttng_ust_lib_ring_buffer *buf,
 320                            struct lttng_ust_shm_handle *handle)
 321 {
 322         struct channel *chan = shmp(handle, buf->backend.chan);
 323         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 324
 325         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 326             || !chan->read_timer_interval
 327             || buf->read_timer_enabled)
 328                 return;
 329
 330         //TODO
 331         //init_timer(&buf->read_timer);
 332         //buf->read_timer.function = read_buffer_timer;
 333         //buf->read_timer.expires = jiffies + chan->read_timer_interval;
 334         //buf->read_timer.data = (unsigned long)buf;
 335
 336         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 337         //      add_timer_on(&buf->read_timer, buf->backend.cpu);
 338         //else
 339         //      add_timer(&buf->read_timer);
 340         buf->read_timer_enabled = 1;
 341 }
 342
 343 static void lib_ring_buffer_stop_read_timer(struct lttng_ust_lib_ring_buffer *buf,
 344                            struct lttng_ust_shm_handle *handle)
 345 {
 346         struct channel *chan = shmp(handle, buf->backend.chan);
 347         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 348
 349         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 350             || !chan->read_timer_interval
 351             || !buf->read_timer_enabled)
 352                 return;
 353
 354         //TODO
 355         //del_timer_sync(&buf->read_timer);
 356         /*
 357          * do one more check to catch data that has been written in the last
 358          * timer period.
 359          */
 360         if (lib_ring_buffer_poll_deliver(config, buf, chan, handle)) {
 361                 //TODO
 362                 //wake_up_interruptible(&buf->read_wait);
 363                 //wake_up_interruptible(&chan->read_wait);
 364         }
 365         buf->read_timer_enabled = 0;
 366 }
 367
 368 static void channel_unregister_notifiers(struct channel *chan,
 369                            struct lttng_ust_shm_handle *handle)
 370 {
 371         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 372         int cpu;
 373
 374         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 375                 for_each_possible_cpu(cpu) {
 376                         struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[cpu].shmp);
 377
 378                         lib_ring_buffer_stop_switch_timer(buf, handle);
 379                         lib_ring_buffer_stop_read_timer(buf, handle);
 380                 }
 381         } else {
 382                 struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[0].shmp);
 383
 384                 lib_ring_buffer_stop_switch_timer(buf, handle);
 385                 lib_ring_buffer_stop_read_timer(buf, handle);
 386         }
 387         //channel_backend_unregister_notifiers(&chan->backend);
 388 }
 389
 390 static void channel_free(struct channel *chan, struct lttng_ust_shm_handle *handle,
 391                 int shadow)
 392 {
 393         if (!shadow)
 394                 channel_backend_free(&chan->backend, handle);
 395         /* chan is freed by shm teardown */
 396         shm_object_table_destroy(handle->table);
 397         free(handle);
 398 }
 399
 400 /**
 401  * channel_create - Create channel.
 402  * @config: ring buffer instance configuration
 403  * @name: name of the channel
 404  * @priv_data: ring buffer client private data area pointer (output)
 405  * @priv_data_size: length, in bytes, of the private data area.
 406  * @priv_data_init: initialization data for private data.
 407  * @buf_addr: pointer the the beginning of the preallocated buffer contiguous
 408  *            address mapping. It is used only by RING_BUFFER_STATIC
 409  *            configuration. It can be set to NULL for other backends.
 410  * @subbuf_size: subbuffer size
 411  * @num_subbuf: number of subbuffers
 412  * @switch_timer_interval: Time interval (in us) to fill sub-buffers with
 413  *                         padding to let readers get those sub-buffers.
 414  *                         Used for live streaming.
 415  * @read_timer_interval: Time interval (in us) to wake up pending readers.
 416  *
 417  * Holds cpu hotplug.
 418  * Returns NULL on failure.
 419  */
 420 struct lttng_ust_shm_handle *channel_create(const struct lttng_ust_lib_ring_buffer_config *config,
 421                    const char *name,
 422                    void **priv_data,
 423                    size_t priv_data_align,
 424                    size_t priv_data_size,
 425                    void *priv_data_init,
 426                    void *buf_addr, size_t subbuf_size,
 427                    size_t num_subbuf, unsigned int switch_timer_interval,
 428                    unsigned int read_timer_interval,
 429                    int **shm_fd, int **wait_fd, uint64_t **memory_map_size)
 430 {
 431         int ret, cpu;
 432         size_t shmsize, chansize;
 433         struct channel *chan;
 434         struct lttng_ust_shm_handle *handle;
 435         struct shm_object *shmobj;
 436         struct shm_ref *ref;
 437
 438         if (lib_ring_buffer_check_config(config, switch_timer_interval,
 439                                          read_timer_interval))
 440                 return NULL;
 441
 442         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 443         if (!handle)
 444                 return NULL;
 445
 446         /* Allocate table for channel + per-cpu buffers */
 447         handle->table = shm_object_table_create(1 + num_possible_cpus());
 448         if (!handle->table)
 449                 goto error_table_alloc;
 450
 451         /* Calculate the shm allocation layout */
 452         shmsize = sizeof(struct channel);
 453         shmsize += offset_align(shmsize, __alignof__(struct lttng_ust_lib_ring_buffer_shmp));
 454         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 455                 shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp) * num_possible_cpus();
 456         else
 457                 shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp);
 458         chansize = shmsize;
 459         shmsize += offset_align(shmsize, priv_data_align);
 460         shmsize += priv_data_size;
 461
 462         shmobj = shm_object_table_append(handle->table, shmsize);
 463         if (!shmobj)
 464                 goto error_append;
 465         /* struct channel is at object 0, offset 0 (hardcoded) */
 466         set_shmp(handle->chan, zalloc_shm(shmobj, chansize));
 467         assert(handle->chan._ref.index == 0);
 468         assert(handle->chan._ref.offset == 0);
 469         chan = shmp(handle, handle->chan);
 470         if (!chan)
 471                 goto error_append;
 472
 473         /* space for private data */
 474         if (priv_data_size) {
 475                 DECLARE_SHMP(void, priv_data_alloc);
 476
 477                 align_shm(shmobj, priv_data_align);
 478                 chan->priv_data_offset = shmobj->allocated_len;
 479                 set_shmp(priv_data_alloc, zalloc_shm(shmobj, priv_data_size));
 480                 if (!shmp(handle, priv_data_alloc))
 481                         goto error_append;
 482                 *priv_data = channel_get_private(chan);
 483                 memcpy(*priv_data, priv_data_init, priv_data_size);
 484         } else {
 485                 chan->priv_data_offset = -1;
 486                 *priv_data = NULL;
 487         }
 488
 489         ret = channel_backend_init(&chan->backend, name, config,
 490                                    subbuf_size, num_subbuf, handle);
 491         if (ret)
 492                 goto error_backend_init;
 493
 494         chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order);
 495         //TODO
 496         //chan->switch_timer_interval = usecs_to_jiffies(switch_timer_interval);
 497         //chan->read_timer_interval = usecs_to_jiffies(read_timer_interval);
 498         //TODO
 499         //init_waitqueue_head(&chan->read_wait);
 500         //init_waitqueue_head(&chan->hp_wait);
 501
 502         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 503                 /*
 504                  * In case of non-hotplug cpu, if the ring-buffer is allocated
 505                  * in early initcall, it will not be notified of secondary cpus.
 506                  * In that off case, we need to allocate for all possible cpus.
 507                  */
 508                 for_each_possible_cpu(cpu) {
 509                         struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[cpu].shmp);
 510                         lib_ring_buffer_start_switch_timer(buf, handle);
 511                         lib_ring_buffer_start_read_timer(buf, handle);
 512                 }
 513         } else {
 514                 struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[0].shmp);
 515
 516                 lib_ring_buffer_start_switch_timer(buf, handle);
 517                 lib_ring_buffer_start_read_timer(buf, handle);
 518         }
 519         ref = &handle->chan._ref;
 520         shm_get_object_data(handle, ref, shm_fd, wait_fd, memory_map_size);
 521         return handle;
 522
 523 error_backend_init:
 524 error_append:
 525         shm_object_table_destroy(handle->table);
 526 error_table_alloc:
 527         free(handle);
 528         return NULL;
 529 }
 530
 531 struct lttng_ust_shm_handle *channel_handle_create(int shm_fd, int wait_fd,
 532                                         uint64_t memory_map_size)
 533 {
 534         struct lttng_ust_shm_handle *handle;
 535         struct shm_object *object;
 536
 537         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 538         if (!handle)
 539                 return NULL;
 540
 541         /* Allocate table for channel + per-cpu buffers */
 542         handle->table = shm_object_table_create(1 + num_possible_cpus());
 543         if (!handle->table)
 544                 goto error_table_alloc;
 545         /* Add channel object */
 546         object = shm_object_table_append_shadow(handle->table,
 547                         shm_fd, wait_fd, memory_map_size);
 548         if (!object)
 549                 goto error_table_object;
 550         /* struct channel is at object 0, offset 0 (hardcoded) */
 551         handle->chan._ref.index = 0;
 552         handle->chan._ref.offset = 0;
 553         return handle;
 554
 555 error_table_object:
 556         shm_object_table_destroy(handle->table);
 557 error_table_alloc:
 558         free(handle);
 559         return NULL;
 560 }
 561
 562 int channel_handle_add_stream(struct lttng_ust_shm_handle *handle,
 563                 int shm_fd, int wait_fd, uint64_t memory_map_size)
 564 {
 565         struct shm_object *object;
 566
 567         /* Add stream object */
 568         object = shm_object_table_append_shadow(handle->table,
 569                         shm_fd, wait_fd, memory_map_size);
 570         if (!object)
 571                 return -1;
 572         return 0;
 573 }
 574
 575 static
 576 void channel_release(struct channel *chan, struct lttng_ust_shm_handle *handle,
 577                 int shadow)
 578 {
 579         channel_free(chan, handle, shadow);
 580 }
 581
 582 /**
 583  * channel_destroy - Finalize, wait for q.s. and destroy channel.
 584  * @chan: channel to destroy
 585  *
 586  * Holds cpu hotplug.
 587  * Call "destroy" callback, finalize channels, decrement the channel
 588  * reference count. Note that when readers have completed data
 589  * consumption of finalized channels, get_subbuf() will return -ENODATA.
 590  * They should release their handle at that point.
 591  */
 592 void channel_destroy(struct channel *chan, struct lttng_ust_shm_handle *handle,
 593                 int shadow)
 594 {
 595         if (shadow) {
 596                 channel_release(chan, handle, shadow);
 597                 return;
 598         }
 599
 600         channel_unregister_notifiers(chan, handle);
 601
 602         /*
 603          * Note: the consumer takes care of finalizing and switching the
 604          * buffers.
 605          */
 606
 607         /*
 608          * sessiond/consumer are keeping a reference on the shm file
 609          * descriptor directly. No need to refcount.
 610          */
 611         channel_release(chan, handle, shadow);
 612         return;
 613 }
 614
 615 struct lttng_ust_lib_ring_buffer *channel_get_ring_buffer(
 616                                         const struct lttng_ust_lib_ring_buffer_config *config,
 617                                         struct channel *chan, int cpu,
 618                                         struct lttng_ust_shm_handle *handle,
 619                                         int **shm_fd, int **wait_fd,
 620                                         uint64_t **memory_map_size)
 621 {
 622         struct shm_ref *ref;
 623
 624         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
 625                 ref = &chan->backend.buf[0].shmp._ref;
 626                 shm_get_object_data(handle, ref, shm_fd, wait_fd,
 627                         memory_map_size);
 628                 return shmp(handle, chan->backend.buf[0].shmp);
 629         } else {
 630                 if (cpu >= num_possible_cpus())
 631                         return NULL;
 632                 ref = &chan->backend.buf[cpu].shmp._ref;
 633                 shm_get_object_data(handle, ref, shm_fd, wait_fd,
 634                         memory_map_size);
 635                 return shmp(handle, chan->backend.buf[cpu].shmp);
 636         }
 637 }
 638
 639 int lib_ring_buffer_open_read(struct lttng_ust_lib_ring_buffer *buf,
 640                               struct lttng_ust_shm_handle *handle,
 641                               int shadow)
 642 {
 643         if (shadow) {
 644                 if (uatomic_cmpxchg(&buf->active_shadow_readers, 0, 1) != 0)
 645                         return -EBUSY;
 646                 cmm_smp_mb();
 647                 return 0;
 648         }
 649         if (uatomic_cmpxchg(&buf->active_readers, 0, 1) != 0)
 650                 return -EBUSY;
 651         cmm_smp_mb();
 652         return 0;
 653 }
 654
 655 void lib_ring_buffer_release_read(struct lttng_ust_lib_ring_buffer *buf,
 656                                   struct lttng_ust_shm_handle *handle,
 657                                   int shadow)
 658 {
 659         struct channel *chan = shmp(handle, buf->backend.chan);
 660
 661         if (shadow) {
 662                 CHAN_WARN_ON(chan, uatomic_read(&buf->active_shadow_readers) != 1);
 663                 cmm_smp_mb();
 664                 uatomic_dec(&buf->active_shadow_readers);
 665                 return;
 666         }
 667         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1);
 668         cmm_smp_mb();
 669         uatomic_dec(&buf->active_readers);
 670 }
 671
 672 /**
 673  * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read)
 674  * @buf: ring buffer
 675  * @consumed: consumed count indicating the position where to read
 676  * @produced: produced count, indicates position when to stop reading
 677  *
 678  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 679  * data to read at consumed position, or 0 if the get operation succeeds.
 680  */
 681
 682 int lib_ring_buffer_snapshot(struct lttng_ust_lib_ring_buffer *buf,
 683                              unsigned long *consumed, unsigned long *produced,
 684                              struct lttng_ust_shm_handle *handle)
 685 {
 686         struct channel *chan = shmp(handle, buf->backend.chan);
 687         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 688         unsigned long consumed_cur, write_offset;
 689         int finalized;
 690
 691         finalized = CMM_ACCESS_ONCE(buf->finalized);
 692         /*
 693          * Read finalized before counters.
 694          */
 695         cmm_smp_rmb();
 696         consumed_cur = uatomic_read(&buf->consumed);
 697         /*
 698          * No need to issue a memory barrier between consumed count read and
 699          * write offset read, because consumed count can only change
 700          * concurrently in overwrite mode, and we keep a sequence counter
 701          * identifier derived from the write offset to check we are getting
 702          * the same sub-buffer we are expecting (the sub-buffers are atomically
 703          * "tagged" upon writes, tags are checked upon read).
 704          */
 705         write_offset = v_read(config, &buf->offset);
 706
 707         /*
 708          * Check that we are not about to read the same subbuffer in
 709          * which the writer head is.
 710          */
 711         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
 712             == 0)
 713                 goto nodata;
 714
 715         *consumed = consumed_cur;
 716         *produced = subbuf_trunc(write_offset, chan);
 717
 718         return 0;
 719
 720 nodata:
 721         /*
 722          * The memory barriers __wait_event()/wake_up_interruptible() take care
 723          * of "raw_spin_is_locked" memory ordering.
 724          */
 725         if (finalized)
 726                 return -ENODATA;
 727         else
 728                 return -EAGAIN;
 729 }
 730
 731 /**
 732  * lib_ring_buffer_put_snapshot - move consumed counter forward
 733  * @buf: ring buffer
 734  * @consumed_new: new consumed count value
 735  */
 736 void lib_ring_buffer_move_consumer(struct lttng_ust_lib_ring_buffer *buf,
 737                                    unsigned long consumed_new,
 738                                    struct lttng_ust_shm_handle *handle)
 739 {
 740         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
 741         struct channel *chan = shmp(handle, bufb->chan);
 742         unsigned long consumed;
 743
 744         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1
 745                         && uatomic_read(&buf->active_shadow_readers) != 1);
 746
 747         /*
 748          * Only push the consumed value forward.
 749          * If the consumed cmpxchg fails, this is because we have been pushed by
 750          * the writer in flight recorder mode.
 751          */
 752         consumed = uatomic_read(&buf->consumed);
 753         while ((long) consumed - (long) consumed_new < 0)
 754                 consumed = uatomic_cmpxchg(&buf->consumed, consumed,
 755                                            consumed_new);
 756 }
 757
 758 /**
 759  * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading
 760  * @buf: ring buffer
 761  * @consumed: consumed count indicating the position where to read
 762  *
 763  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 764  * data to read at consumed position, or 0 if the get operation succeeds.
 765  */
 766 int lib_ring_buffer_get_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 767                                unsigned long consumed,
 768                                struct lttng_ust_shm_handle *handle)
 769 {
 770         struct channel *chan = shmp(handle, buf->backend.chan);
 771         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 772         unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
 773         int ret;
 774         int finalized;
 775
 776 retry:
 777         finalized = CMM_ACCESS_ONCE(buf->finalized);
 778         /*
 779          * Read finalized before counters.
 780          */
 781         cmm_smp_rmb();
 782         consumed_cur = uatomic_read(&buf->consumed);
 783         consumed_idx = subbuf_index(consumed, chan);
 784         commit_count = v_read(config, &shmp_index(handle, buf->commit_cold, consumed_idx)->cc_sb);
 785         /*
 786          * Make sure we read the commit count before reading the buffer
 787          * data and the write offset. Correct consumed offset ordering
 788          * wrt commit count is insured by the use of cmpxchg to update
 789          * the consumed offset.
 790          */
 791         /*
 792          * Local rmb to match the remote wmb to read the commit count
 793          * before the buffer data and the write offset.
 794          */
 795         cmm_smp_rmb();
 796
 797         write_offset = v_read(config, &buf->offset);
 798
 799         /*
 800          * Check that the buffer we are getting is after or at consumed_cur
 801          * position.
 802          */
 803         if ((long) subbuf_trunc(consumed, chan)
 804             - (long) subbuf_trunc(consumed_cur, chan) < 0)
 805                 goto nodata;
 806
 807         /*
 808          * Check that the subbuffer we are trying to consume has been
 809          * already fully committed.
 810          */
 811         if (((commit_count - chan->backend.subbuf_size)
 812              & chan->commit_count_mask)
 813             - (buf_trunc(consumed_cur, chan)
 814                >> chan->backend.num_subbuf_order)
 815             != 0)
 816                 goto nodata;
 817
 818         /*
 819          * Check that we are not about to read the same subbuffer in
 820          * which the writer head is.
 821          */
 822         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
 823             == 0)
 824                 goto nodata;
 825
 826         /*
 827          * Failure to get the subbuffer causes a busy-loop retry without going
 828          * to a wait queue. These are caused by short-lived race windows where
 829          * the writer is getting access to a subbuffer we were trying to get
 830          * access to. Also checks that the "consumed" buffer count we are
 831          * looking for matches the one contained in the subbuffer id.
 832          */
 833         ret = update_read_sb_index(config, &buf->backend, &chan->backend,
 834                                    consumed_idx, buf_trunc_val(consumed, chan),
 835                                    handle);
 836         if (ret)
 837                 goto retry;
 838         subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);
 839
 840         buf->get_subbuf_consumed = consumed;
 841         buf->get_subbuf = 1;
 842
 843         return 0;
 844
 845 nodata:
 846         /*
 847          * The memory barriers __wait_event()/wake_up_interruptible() take care
 848          * of "raw_spin_is_locked" memory ordering.
 849          */
 850         if (finalized)
 851                 return -ENODATA;
 852         else
 853                 return -EAGAIN;
 854 }
 855
 856 /**
 857  * lib_ring_buffer_put_subbuf - release exclusive subbuffer access
 858  * @buf: ring buffer
 859  */
 860 void lib_ring_buffer_put_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 861                                 struct lttng_ust_shm_handle *handle)
 862 {
 863         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
 864         struct channel *chan = shmp(handle, bufb->chan);
 865         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 866         unsigned long read_sb_bindex, consumed_idx, consumed;
 867
 868         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1
 869                         && uatomic_read(&buf->active_shadow_readers) != 1);
 870
 871         if (!buf->get_subbuf) {
 872                 /*
 873                  * Reader puts a subbuffer it did not get.
 874                  */
 875                 CHAN_WARN_ON(chan, 1);
 876                 return;
 877         }
 878         consumed = buf->get_subbuf_consumed;
 879         buf->get_subbuf = 0;
 880
 881         /*
 882          * Clear the records_unread counter. (overruns counter)
 883          * Can still be non-zero if a file reader simply grabbed the data
 884          * without using iterators.
 885          * Can be below zero if an iterator is used on a snapshot more than
 886          * once.
 887          */
 888         read_sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id);
 889         v_add(config, v_read(config,
 890                              &shmp(handle, shmp_index(handle, bufb->array, read_sb_bindex)->shmp)->records_unread),
 891               &bufb->records_read);
 892         v_set(config, &shmp(handle, shmp_index(handle, bufb->array, read_sb_bindex)->shmp)->records_unread, 0);
 893         CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE
 894                      && subbuffer_id_is_noref(config, bufb->buf_rsb.id));
 895         subbuffer_id_set_noref(config, &bufb->buf_rsb.id);
 896
 897         /*
 898          * Exchange the reader subbuffer with the one we put in its place in the
 899          * writer subbuffer table. Expect the original consumed count. If
 900          * update_read_sb_index fails, this is because the writer updated the
 901          * subbuffer concurrently. We should therefore keep the subbuffer we
 902          * currently have: it has become invalid to try reading this sub-buffer
 903          * consumed count value anyway.
 904          */
 905         consumed_idx = subbuf_index(consumed, chan);
 906         update_read_sb_index(config, &buf->backend, &chan->backend,
 907                              consumed_idx, buf_trunc_val(consumed, chan),
 908                              handle);
 909         /*
 910          * update_read_sb_index return value ignored. Don't exchange sub-buffer
 911          * if the writer concurrently updated it.
 912          */
 913 }
 914
 915 /*
 916  * cons_offset is an iterator on all subbuffer offsets between the reader
 917  * position and the writer position. (inclusive)
 918  */
 919 static
 920 void lib_ring_buffer_print_subbuffer_errors(struct lttng_ust_lib_ring_buffer *buf,
 921                                             struct channel *chan,
 922                                             unsigned long cons_offset,
 923                                             int cpu,
 924                                             struct lttng_ust_shm_handle *handle)
 925 {
 926         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 927         unsigned long cons_idx, commit_count, commit_count_sb;
 928
 929         cons_idx = subbuf_index(cons_offset, chan);
 930         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, cons_idx)->cc);
 931         commit_count_sb = v_read(config, &shmp_index(handle, buf->commit_cold, cons_idx)->cc_sb);
 932
 933         if (subbuf_offset(commit_count, chan) != 0)
 934                 DBG("ring buffer %s, cpu %d: "
 935                        "commit count in subbuffer %lu,\n"
 936                        "expecting multiples of %lu bytes\n"
 937                        "  [ %lu bytes committed, %lu bytes reader-visible ]\n",
 938                        chan->backend.name, cpu, cons_idx,
 939                        chan->backend.subbuf_size,
 940                        commit_count, commit_count_sb);
 941
 942         DBG("ring buffer: %s, cpu %d: %lu bytes committed\n",
 943                chan->backend.name, cpu, commit_count);
 944 }
 945
 946 static
 947 void lib_ring_buffer_print_buffer_errors(struct lttng_ust_lib_ring_buffer *buf,
 948                                          struct channel *chan,
 949                                          void *priv, int cpu,
 950                                          struct lttng_ust_shm_handle *handle)
 951 {
 952         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 953         unsigned long write_offset, cons_offset;
 954
 955         /*
 956          * No need to order commit_count, write_offset and cons_offset reads
 957          * because we execute at teardown when no more writer nor reader
 958          * references are left.
 959          */
 960         write_offset = v_read(config, &buf->offset);
 961         cons_offset = uatomic_read(&buf->consumed);
 962         if (write_offset != cons_offset)
 963                 DBG("ring buffer %s, cpu %d: "
 964                        "non-consumed data\n"
 965                        "  [ %lu bytes written, %lu bytes read ]\n",
 966                        chan->backend.name, cpu, write_offset, cons_offset);
 967
 968         for (cons_offset = uatomic_read(&buf->consumed);
 969              (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset),
 970                                   chan)
 971                      - cons_offset) > 0;
 972              cons_offset = subbuf_align(cons_offset, chan))
 973                 lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset,
 974                                                        cpu, handle);
 975 }
 976
 977 static
 978 void lib_ring_buffer_print_errors(struct channel *chan,
 979                                   struct lttng_ust_lib_ring_buffer *buf, int cpu,
 980                                   struct lttng_ust_shm_handle *handle)
 981 {
 982         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 983         void *priv = channel_get_private(chan);
 984
 985         DBG("ring buffer %s, cpu %d: %lu records written, "
 986                           "%lu records overrun\n",
 987                           chan->backend.name, cpu,
 988                           v_read(config, &buf->records_count),
 989                           v_read(config, &buf->records_overrun));
 990
 991         if (v_read(config, &buf->records_lost_full)
 992             || v_read(config, &buf->records_lost_wrap)
 993             || v_read(config, &buf->records_lost_big))
 994                 DBG("ring buffer %s, cpu %d: records were lost. Caused by:\n"
 995                        "  [ %lu buffer full, %lu nest buffer wrap-around, "
 996                        "%lu event too big ]\n",
 997                        chan->backend.name, cpu,
 998                        v_read(config, &buf->records_lost_full),
 999                        v_read(config, &buf->records_lost_wrap),
1000                        v_read(config, &buf->records_lost_big));
1001
1002         lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu, handle);
1003 }
1004
1005 /*
1006  * lib_ring_buffer_switch_old_start: Populate old subbuffer header.
1007  *
1008  * Only executed when the buffer is finalized, in SWITCH_FLUSH.
1009  */
1010 static
1011 void lib_ring_buffer_switch_old_start(struct lttng_ust_lib_ring_buffer *buf,
1012                                       struct channel *chan,
1013                                       struct switch_offsets *offsets,
1014                                       uint64_t tsc,
1015                                       struct lttng_ust_shm_handle *handle)
1016 {
1017         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1018         unsigned long oldidx = subbuf_index(offsets->old, chan);
1019         unsigned long commit_count;
1020
1021         config->cb.buffer_begin(buf, tsc, oldidx, handle);
1022
1023         /*
1024          * Order all writes to buffer before the commit count update that will
1025          * determine that the subbuffer is full.
1026          */
1027         cmm_smp_wmb();
1028         v_add(config, config->cb.subbuffer_header_size(),
1029               &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1030         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1031         /* Check if the written buffer has to be delivered */
1032         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old,
1033                                       commit_count, oldidx, handle);
1034         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1035                                              offsets->old, commit_count,
1036                                              config->cb.subbuffer_header_size(),
1037                                              handle);
1038 }
1039
1040 /*
1041  * lib_ring_buffer_switch_old_end: switch old subbuffer
1042  *
1043  * Note : offset_old should never be 0 here. It is ok, because we never perform
1044  * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller
1045  * increments the offset_old value when doing a SWITCH_FLUSH on an empty
1046  * subbuffer.
1047  */
1048 static
1049 void lib_ring_buffer_switch_old_end(struct lttng_ust_lib_ring_buffer *buf,
1050                                     struct channel *chan,
1051                                     struct switch_offsets *offsets,
1052                                     uint64_t tsc,
1053                                     struct lttng_ust_shm_handle *handle)
1054 {
1055         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1056         unsigned long oldidx = subbuf_index(offsets->old - 1, chan);
1057         unsigned long commit_count, padding_size, data_size;
1058
1059         data_size = subbuf_offset(offsets->old - 1, chan) + 1;
1060         padding_size = chan->backend.subbuf_size - data_size;
1061         subbuffer_set_data_size(config, &buf->backend, oldidx, data_size,
1062                                 handle);
1063
1064         /*
1065          * Order all writes to buffer before the commit count update that will
1066          * determine that the subbuffer is full.
1067          */
1068         cmm_smp_wmb();
1069         v_add(config, padding_size, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1070         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1071         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1,
1072                                       commit_count, oldidx, handle);
1073         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1074                                              offsets->old, commit_count,
1075                                              padding_size, handle);
1076 }
1077
1078 /*
1079  * lib_ring_buffer_switch_new_start: Populate new subbuffer.
1080  *
1081  * This code can be executed unordered : writers may already have written to the
1082  * sub-buffer before this code gets executed, caution.  The commit makes sure
1083  * that this code is executed before the deliver of this sub-buffer.
1084  */
1085 static
1086 void lib_ring_buffer_switch_new_start(struct lttng_ust_lib_ring_buffer *buf,
1087                                       struct channel *chan,
1088                                       struct switch_offsets *offsets,
1089                                       uint64_t tsc,
1090                                       struct lttng_ust_shm_handle *handle)
1091 {
1092         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1093         unsigned long beginidx = subbuf_index(offsets->begin, chan);
1094         unsigned long commit_count;
1095
1096         config->cb.buffer_begin(buf, tsc, beginidx, handle);
1097
1098         /*
1099          * Order all writes to buffer before the commit count update that will
1100          * determine that the subbuffer is full.
1101          */
1102         cmm_smp_wmb();
1103         v_add(config, config->cb.subbuffer_header_size(),
1104               &shmp_index(handle, buf->commit_hot, beginidx)->cc);
1105         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, beginidx)->cc);
1106         /* Check if the written buffer has to be delivered */
1107         lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin,
1108                                       commit_count, beginidx, handle);
1109         lib_ring_buffer_write_commit_counter(config, buf, chan, beginidx,
1110                                              offsets->begin, commit_count,
1111                                              config->cb.subbuffer_header_size(),
1112                                              handle);
1113 }
1114
1115 /*
1116  * lib_ring_buffer_switch_new_end: finish switching current subbuffer
1117  *
1118  * The only remaining threads could be the ones with pending commits. They will
1119  * have to do the deliver themselves.
1120  */
1121 static
1122 void lib_ring_buffer_switch_new_end(struct lttng_ust_lib_ring_buffer *buf,
1123                                     struct channel *chan,
1124                                     struct switch_offsets *offsets,
1125                                     uint64_t tsc,
1126                                     struct lttng_ust_shm_handle *handle)
1127 {
1128         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1129         unsigned long endidx = subbuf_index(offsets->end - 1, chan);
1130         unsigned long commit_count, padding_size, data_size;
1131
1132         data_size = subbuf_offset(offsets->end - 1, chan) + 1;
1133         padding_size = chan->backend.subbuf_size - data_size;
1134         subbuffer_set_data_size(config, &buf->backend, endidx, data_size,
1135                                 handle);
1136
1137         /*
1138          * Order all writes to buffer before the commit count update that will
1139          * determine that the subbuffer is full.
1140          */
1141         cmm_smp_wmb();
1142         v_add(config, padding_size, &shmp_index(handle, buf->commit_hot, endidx)->cc);
1143         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, endidx)->cc);
1144         lib_ring_buffer_check_deliver(config, buf, chan, offsets->end - 1,
1145                                   commit_count, endidx, handle);
1146         lib_ring_buffer_write_commit_counter(config, buf, chan, endidx,
1147                                              offsets->end, commit_count,
1148                                              padding_size, handle);
1149 }
1150
1151 /*
1152  * Returns :
1153  * 0 if ok
1154  * !0 if execution must be aborted.
1155  */
1156 static
1157 int lib_ring_buffer_try_switch_slow(enum switch_mode mode,
1158                                     struct lttng_ust_lib_ring_buffer *buf,
1159                                     struct channel *chan,
1160                                     struct switch_offsets *offsets,
1161                                     uint64_t *tsc)
1162 {
1163         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1164         unsigned long off;
1165
1166         offsets->begin = v_read(config, &buf->offset);
1167         offsets->old = offsets->begin;
1168         offsets->switch_old_start = 0;
1169         off = subbuf_offset(offsets->begin, chan);
1170
1171         *tsc = config->cb.ring_buffer_clock_read(chan);
1172
1173         /*
1174          * Ensure we flush the header of an empty subbuffer when doing the
1175          * finalize (SWITCH_FLUSH). This ensures that we end up knowing the
1176          * total data gathering duration even if there were no records saved
1177          * after the last buffer switch.
1178          * In SWITCH_ACTIVE mode, switch the buffer when it contains events.
1179          * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of
1180          * subbuffer header as appropriate.
1181          * The next record that reserves space will be responsible for
1182          * populating the following subbuffer header. We choose not to populate
1183          * the next subbuffer header here because we want to be able to use
1184          * SWITCH_ACTIVE for periodical buffer flush, which must
1185          * guarantee that all the buffer content (records and header
1186          * timestamps) are visible to the reader. This is required for
1187          * quiescence guarantees for the fusion merge.
1188          */
1189         if (mode == SWITCH_FLUSH || off > 0) {
1190                 if (caa_unlikely(off == 0)) {
1191                         /*
1192                          * The client does not save any header information.
1193                          * Don't switch empty subbuffer on finalize, because it
1194                          * is invalid to deliver a completely empty subbuffer.
1195                          */
1196                         if (!config->cb.subbuffer_header_size())
1197                                 return -1;
1198                         /*
1199                          * Need to write the subbuffer start header on finalize.
1200                          */
1201                         offsets->switch_old_start = 1;
1202                 }
1203                 offsets->begin = subbuf_align(offsets->begin, chan);
1204         } else
1205                 return -1;      /* we do not have to switch : buffer is empty */
1206         /* Note: old points to the next subbuf at offset 0 */
1207         offsets->end = offsets->begin;
1208         return 0;
1209 }
1210
1211 /*
1212  * Force a sub-buffer switch. This operation is completely reentrant : can be
1213  * called while tracing is active with absolutely no lock held.
1214  *
1215  * Note, however, that as a v_cmpxchg is used for some atomic
1216  * operations, this function must be called from the CPU which owns the buffer
1217  * for a ACTIVE flush.
1218  */
1219 void lib_ring_buffer_switch_slow(struct lttng_ust_lib_ring_buffer *buf, enum switch_mode mode,
1220                                  struct lttng_ust_shm_handle *handle)
1221 {
1222         struct channel *chan = shmp(handle, buf->backend.chan);
1223         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1224         struct switch_offsets offsets;
1225         unsigned long oldidx;
1226         uint64_t tsc;
1227
1228         offsets.size = 0;
1229
1230         /*
1231          * Perform retryable operations.
1232          */
1233         do {
1234                 if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets,
1235                                                     &tsc))
1236                         return; /* Switch not needed */
1237         } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end)
1238                  != offsets.old);
1239
1240         /*
1241          * Atomically update last_tsc. This update races against concurrent
1242          * atomic updates, but the race will always cause supplementary full TSC
1243          * records, never the opposite (missing a full TSC record when it would
1244          * be needed).
1245          */
1246         save_last_tsc(config, buf, tsc);
1247
1248         /*
1249          * Push the reader if necessary
1250          */
1251         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old);
1252
1253         oldidx = subbuf_index(offsets.old, chan);
1254         lib_ring_buffer_clear_noref(config, &buf->backend, oldidx, handle);
1255
1256         /*
1257          * May need to populate header start on SWITCH_FLUSH.
1258          */
1259         if (offsets.switch_old_start) {
1260                 lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc, handle);
1261                 offsets.old += config->cb.subbuffer_header_size();
1262         }
1263
1264         /*
1265          * Switch old subbuffer.
1266          */
1267         lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc, handle);
1268 }
1269
1270 /*
1271  * Returns :
1272  * 0 if ok
1273  * -ENOSPC if event size is too large for packet.
1274  * -ENOBUFS if there is currently not enough space in buffer for the event.
1275  * -EIO if data cannot be written into the buffer for any other reason.
1276  */
1277 static
1278 int lib_ring_buffer_try_reserve_slow(struct lttng_ust_lib_ring_buffer *buf,
1279                                      struct channel *chan,
1280                                      struct switch_offsets *offsets,
1281                                      struct lttng_ust_lib_ring_buffer_ctx *ctx)
1282 {
1283         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1284         struct lttng_ust_shm_handle *handle = ctx->handle;
1285         unsigned long reserve_commit_diff;
1286
1287         offsets->begin = v_read(config, &buf->offset);
1288         offsets->old = offsets->begin;
1289         offsets->switch_new_start = 0;
1290         offsets->switch_new_end = 0;
1291         offsets->switch_old_end = 0;
1292         offsets->pre_header_padding = 0;
1293
1294         ctx->tsc = config->cb.ring_buffer_clock_read(chan);
1295         if ((int64_t) ctx->tsc == -EIO)
1296                 return -EIO;
1297
1298         if (last_tsc_overflow(config, buf, ctx->tsc))
1299                 ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC;
1300
1301         if (caa_unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) {
1302                 offsets->switch_new_start = 1;          /* For offsets->begin */
1303         } else {
1304                 offsets->size = config->cb.record_header_size(config, chan,
1305                                                 offsets->begin,
1306                                                 &offsets->pre_header_padding,
1307                                                 ctx);
1308                 offsets->size +=
1309                         lib_ring_buffer_align(offsets->begin + offsets->size,
1310                                               ctx->largest_align)
1311                         + ctx->data_size;
1312                 if (caa_unlikely(subbuf_offset(offsets->begin, chan) +
1313                              offsets->size > chan->backend.subbuf_size)) {
1314                         offsets->switch_old_end = 1;    /* For offsets->old */
1315                         offsets->switch_new_start = 1;  /* For offsets->begin */
1316                 }
1317         }
1318         if (caa_unlikely(offsets->switch_new_start)) {
1319                 unsigned long sb_index;
1320
1321                 /*
1322                  * We are typically not filling the previous buffer completely.
1323                  */
1324                 if (caa_likely(offsets->switch_old_end))
1325                         offsets->begin = subbuf_align(offsets->begin, chan);
1326                 offsets->begin = offsets->begin
1327                                  + config->cb.subbuffer_header_size();
1328                 /* Test new buffer integrity */
1329                 sb_index = subbuf_index(offsets->begin, chan);
1330                 reserve_commit_diff =
1331                   (buf_trunc(offsets->begin, chan)
1332                    >> chan->backend.num_subbuf_order)
1333                   - ((unsigned long) v_read(config,
1334                                             &shmp_index(handle, buf->commit_cold, sb_index)->cc_sb)
1335                      & chan->commit_count_mask);
1336                 if (caa_likely(reserve_commit_diff == 0)) {
1337                         /* Next subbuffer not being written to. */
1338                         if (caa_unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1339                                 subbuf_trunc(offsets->begin, chan)
1340                                  - subbuf_trunc((unsigned long)
1341                                      uatomic_read(&buf->consumed), chan)
1342                                 >= chan->backend.buf_size)) {
1343                                 unsigned long nr_lost;
1344
1345                                 /*
1346                                  * We do not overwrite non consumed buffers
1347                                  * and we are full : record is lost.
1348                                  */
1349                                 nr_lost = v_read(config, &buf->records_lost_full);
1350                                 v_inc(config, &buf->records_lost_full);
1351                                 if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1352                                         DBG("%lu or more records lost in (%s:%d) (buffer full)\n",
1353                                                 nr_lost + 1, chan->backend.name,
1354                                                 buf->backend.cpu);
1355                                 }
1356                                 return -ENOBUFS;
1357                         } else {
1358                                 /*
1359                                  * Next subbuffer not being written to, and we
1360                                  * are either in overwrite mode or the buffer is
1361                                  * not full. It's safe to write in this new
1362                                  * subbuffer.
1363                                  */
1364                         }
1365                 } else {
1366                         unsigned long nr_lost;
1367
1368                         /*
1369                          * Next subbuffer reserve offset does not match the
1370                          * commit offset. Drop record in producer-consumer and
1371                          * overwrite mode. Caused by either a writer OOPS or too
1372                          * many nested writes over a reserve/commit pair.
1373                          */
1374                         nr_lost = v_read(config, &buf->records_lost_wrap);
1375                         v_inc(config, &buf->records_lost_wrap);
1376                         if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1377                                 DBG("%lu or more records lost in (%s:%d) (wrap-around)\n",
1378                                         nr_lost + 1, chan->backend.name,
1379                                         buf->backend.cpu);
1380                         }
1381                         return -EIO;
1382                 }
1383                 offsets->size =
1384                         config->cb.record_header_size(config, chan,
1385                                                 offsets->begin,
1386                                                 &offsets->pre_header_padding,
1387                                                 ctx);
1388                 offsets->size +=
1389                         lib_ring_buffer_align(offsets->begin + offsets->size,
1390                                               ctx->largest_align)
1391                         + ctx->data_size;
1392                 if (caa_unlikely(subbuf_offset(offsets->begin, chan)
1393                              + offsets->size > chan->backend.subbuf_size)) {
1394                         unsigned long nr_lost;
1395
1396                         /*
1397                          * Record too big for subbuffers, report error, don't
1398                          * complete the sub-buffer switch.
1399                          */
1400                         nr_lost = v_read(config, &buf->records_lost_big);
1401                         v_inc(config, &buf->records_lost_big);
1402                         if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1403                                 DBG("%lu or more records lost in (%s:%d) record size "
1404                                         " of %zu bytes is too large for buffer\n",
1405                                         nr_lost + 1, chan->backend.name,
1406                                         buf->backend.cpu, offsets->size);
1407                         }
1408                         return -ENOSPC;
1409                 } else {
1410                         /*
1411                          * We just made a successful buffer switch and the
1412                          * record fits in the new subbuffer. Let's write.
1413                          */
1414                 }
1415         } else {
1416                 /*
1417                  * Record fits in the current buffer and we are not on a switch
1418                  * boundary. It's safe to write.
1419                  */
1420         }
1421         offsets->end = offsets->begin + offsets->size;
1422
1423         if (caa_unlikely(subbuf_offset(offsets->end, chan) == 0)) {
1424                 /*
1425                  * The offset_end will fall at the very beginning of the next
1426                  * subbuffer.
1427                  */
1428                 offsets->switch_new_end = 1;    /* For offsets->begin */
1429         }
1430         return 0;
1431 }
1432
1433 /**
1434  * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer.
1435  * @ctx: ring buffer context.
1436  *
1437  * Return : -NOBUFS if not enough space, -ENOSPC if event size too large,
1438  * -EIO for other errors, else returns 0.
1439  * It will take care of sub-buffer switching.
1440  */
1441 int lib_ring_buffer_reserve_slow(struct lttng_ust_lib_ring_buffer_ctx *ctx)
1442 {
1443         struct channel *chan = ctx->chan;
1444         struct lttng_ust_shm_handle *handle = ctx->handle;
1445         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1446         struct lttng_ust_lib_ring_buffer *buf;
1447         struct switch_offsets offsets;
1448         int ret;
1449
1450         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
1451                 buf = shmp(handle, chan->backend.buf[ctx->cpu].shmp);
1452         else
1453                 buf = shmp(handle, chan->backend.buf[0].shmp);
1454         ctx->buf = buf;
1455
1456         offsets.size = 0;
1457
1458         do {
1459                 ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets,
1460                                                        ctx);
1461                 if (caa_unlikely(ret))
1462                         return ret;
1463         } while (caa_unlikely(v_cmpxchg(config, &buf->offset, offsets.old,
1464                                     offsets.end)
1465                           != offsets.old));
1466
1467         /*
1468          * Atomically update last_tsc. This update races against concurrent
1469          * atomic updates, but the race will always cause supplementary full TSC
1470          * records, never the opposite (missing a full TSC record when it would
1471          * be needed).
1472          */
1473         save_last_tsc(config, buf, ctx->tsc);
1474
1475         /*
1476          * Push the reader if necessary
1477          */
1478         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1);
1479
1480         /*
1481          * Clear noref flag for this subbuffer.
1482          */
1483         lib_ring_buffer_clear_noref(config, &buf->backend,
1484                                     subbuf_index(offsets.end - 1, chan),
1485                                     handle);
1486
1487         /*
1488          * Switch old subbuffer if needed.
1489          */
1490         if (caa_unlikely(offsets.switch_old_end)) {
1491                 lib_ring_buffer_clear_noref(config, &buf->backend,
1492                                             subbuf_index(offsets.old - 1, chan),
1493                                             handle);
1494                 lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc, handle);
1495         }
1496
1497         /*
1498          * Populate new subbuffer.
1499          */
1500         if (caa_unlikely(offsets.switch_new_start))
1501                 lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc, handle);
1502
1503         if (caa_unlikely(offsets.switch_new_end))
1504                 lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc, handle);
1505
1506         ctx->slot_size = offsets.size;
1507         ctx->pre_offset = offsets.begin;
1508         ctx->buf_offset = offsets.begin + offsets.pre_header_padding;
1509         return 0;
1510 }
1511
1512 /*
1513  * Force a read (imply TLS fixup for dlopen) of TLS variables.
1514  */
1515 void lttng_fixup_ringbuffer_tls(void)
1516 {
1517         asm volatile ("" : : "m" (lib_ring_buffer_nesting));
1518 }