libringbuffer/ring_buffer_frontend.c

   1 /*
   2  * ring_buffer_frontend.c
   3  *
   4  * (C) Copyright 2005-2010 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   5  *
   6  * Ring buffer wait-free buffer synchronization. Producer-consumer and flight
   7  * recorder (overwrite) modes. See thesis:
   8  *
   9  * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D.
  10  * dissertation, Ecole Polytechnique de Montreal.
  11  * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf
  12  *
  13  * - Algorithm presentation in Chapter 5:
  14  *     "Lockless Multi-Core High-Throughput Buffering".
  15  * - Algorithm formal verification in Section 8.6:
  16  *     "Formal verification of LTTng"
  17  *
  18  * Author:
  19  *      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  20  *
  21  * Inspired from LTT and RelayFS:
  22  *  Karim Yaghmour <karim@opersys.com>
  23  *  Tom Zanussi <zanussi@us.ibm.com>
  24  *  Bob Wisniewski <bob@watson.ibm.com>
  25  * And from K42 :
  26  *  Bob Wisniewski <bob@watson.ibm.com>
  27  *
  28  * Buffer reader semantic :
  29  *
  30  * - get_subbuf_size
  31  * while buffer is not finalized and empty
  32  *   - get_subbuf
  33  *     - if return value != 0, continue
  34  *   - splice one subbuffer worth of data to a pipe
  35  *   - splice the data from pipe to disk/network
  36  *   - put_subbuf
  37  *
  38  * Dual LGPL v2.1/GPL v2 license.
  39  */
  40
  41 #define _GNU_SOURCE
  42 #include <sys/types.h>
  43 #include <sys/mman.h>
  44 #include <sys/stat.h>
  45 #include <fcntl.h>
  46 #include <urcu/compiler.h>
  47 #include <urcu/ref.h>
  48 #include <helper.h>
  49
  50 #include "smp.h"
  51 #include <lttng/ringbuffer-config.h>
  52 #include "vatomic.h"
  53 #include "backend.h"
  54 #include "frontend.h"
  55 #include "shm.h"
  56 #include "../liblttng-ust/compat.h"     /* For ENODATA */
  57
  58 #ifndef max
  59 #define max(a, b)       ((a) > (b) ? (a) : (b))
  60 #endif
  61
  62 /*
  63  * Use POSIX SHM: shm_open(3) and shm_unlink(3).
  64  * close(2) to close the fd returned by shm_open.
  65  * shm_unlink releases the shared memory object name.
  66  * ftruncate(2) sets the size of the memory object.
  67  * mmap/munmap maps the shared memory obj to a virtual address in the
  68  * calling process (should be done both in libust and consumer).
  69  * See shm_overview(7) for details.
  70  * Pass file descriptor returned by shm_open(3) to ltt-sessiond through
  71  * a UNIX socket.
  72  *
  73  * Since we don't need to access the object using its name, we can
  74  * immediately shm_unlink(3) it, and only keep the handle with its file
  75  * descriptor.
  76  */
  77
  78 /*
  79  * Internal structure representing offsets to use at a sub-buffer switch.
  80  */
  81 struct switch_offsets {
  82         unsigned long begin, end, old;
  83         size_t pre_header_padding, size;
  84         unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1,
  85                      switch_old_end:1;
  86 };
  87
  88 __thread unsigned int lib_ring_buffer_nesting;
  89
  90 /*
  91  * TODO: this is unused. Errors are saved within the ring buffer.
  92  * Eventually, allow consumerd to print these errors.
  93  */
  94 static
  95 void lib_ring_buffer_print_errors(struct channel *chan,
  96                                   struct lttng_ust_lib_ring_buffer *buf, int cpu,
  97                                   struct lttng_ust_shm_handle *handle)
  98         __attribute__((unused));
  99
 100 /**
 101  * lib_ring_buffer_reset - Reset ring buffer to initial values.
 102  * @buf: Ring buffer.
 103  *
 104  * Effectively empty the ring buffer. Should be called when the buffer is not
 105  * used for writing. The ring buffer can be opened for reading, but the reader
 106  * should not be using the iterator concurrently with reset. The previous
 107  * current iterator record is reset.
 108  */
 109 void lib_ring_buffer_reset(struct lttng_ust_lib_ring_buffer *buf,
 110                            struct lttng_ust_shm_handle *handle)
 111 {
 112         struct channel *chan = shmp(handle, buf->backend.chan);
 113         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 114         unsigned int i;
 115
 116         /*
 117          * Reset iterator first. It will put the subbuffer if it currently holds
 118          * it.
 119          */
 120         v_set(config, &buf->offset, 0);
 121         for (i = 0; i < chan->backend.num_subbuf; i++) {
 122                 v_set(config, &shmp_index(handle, buf->commit_hot, i)->cc, 0);
 123                 v_set(config, &shmp_index(handle, buf->commit_hot, i)->seq, 0);
 124                 v_set(config, &shmp_index(handle, buf->commit_cold, i)->cc_sb, 0);
 125         }
 126         uatomic_set(&buf->consumed, 0);
 127         uatomic_set(&buf->record_disabled, 0);
 128         v_set(config, &buf->last_tsc, 0);
 129         lib_ring_buffer_backend_reset(&buf->backend, handle);
 130         /* Don't reset number of active readers */
 131         v_set(config, &buf->records_lost_full, 0);
 132         v_set(config, &buf->records_lost_wrap, 0);
 133         v_set(config, &buf->records_lost_big, 0);
 134         v_set(config, &buf->records_count, 0);
 135         v_set(config, &buf->records_overrun, 0);
 136         buf->finalized = 0;
 137 }
 138
 139 /**
 140  * channel_reset - Reset channel to initial values.
 141  * @chan: Channel.
 142  *
 143  * Effectively empty the channel. Should be called when the channel is not used
 144  * for writing. The channel can be opened for reading, but the reader should not
 145  * be using the iterator concurrently with reset. The previous current iterator
 146  * record is reset.
 147  */
 148 void channel_reset(struct channel *chan)
 149 {
 150         /*
 151          * Reset iterators first. Will put the subbuffer if held for reading.
 152          */
 153         uatomic_set(&chan->record_disabled, 0);
 154         /* Don't reset commit_count_mask, still valid */
 155         channel_backend_reset(&chan->backend);
 156         /* Don't reset switch/read timer interval */
 157         /* Don't reset notifiers and notifier enable bits */
 158         /* Don't reset reader reference count */
 159 }
 160
 161 /*
 162  * Must be called under cpu hotplug protection.
 163  */
 164 int lib_ring_buffer_create(struct lttng_ust_lib_ring_buffer *buf,
 165                            struct channel_backend *chanb, int cpu,
 166                            struct lttng_ust_shm_handle *handle,
 167                            struct shm_object *shmobj)
 168 {
 169         const struct lttng_ust_lib_ring_buffer_config *config = &chanb->config;
 170         struct channel *chan = caa_container_of(chanb, struct channel, backend);
 171         void *priv = channel_get_private(chan);
 172         size_t subbuf_header_size;
 173         uint64_t tsc;
 174         int ret;
 175
 176         /* Test for cpu hotplug */
 177         if (buf->backend.allocated)
 178                 return 0;
 179
 180         ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend,
 181                         cpu, handle, shmobj);
 182         if (ret)
 183                 return ret;
 184
 185         align_shm(shmobj, __alignof__(struct commit_counters_hot));
 186         set_shmp(buf->commit_hot,
 187                  zalloc_shm(shmobj,
 188                         sizeof(struct commit_counters_hot) * chan->backend.num_subbuf));
 189         if (!shmp(handle, buf->commit_hot)) {
 190                 ret = -ENOMEM;
 191                 goto free_chanbuf;
 192         }
 193
 194         align_shm(shmobj, __alignof__(struct commit_counters_cold));
 195         set_shmp(buf->commit_cold,
 196                  zalloc_shm(shmobj,
 197                         sizeof(struct commit_counters_cold) * chan->backend.num_subbuf));
 198         if (!shmp(handle, buf->commit_cold)) {
 199                 ret = -ENOMEM;
 200                 goto free_commit;
 201         }
 202
 203         /*
 204          * Write the subbuffer header for first subbuffer so we know the total
 205          * duration of data gathering.
 206          */
 207         subbuf_header_size = config->cb.subbuffer_header_size();
 208         v_set(config, &buf->offset, subbuf_header_size);
 209         subbuffer_id_clear_noref(config, &shmp_index(handle, buf->backend.buf_wsb, 0)->id);
 210         tsc = config->cb.ring_buffer_clock_read(shmp(handle, buf->backend.chan));
 211         config->cb.buffer_begin(buf, tsc, 0, handle);
 212         v_add(config, subbuf_header_size, &shmp_index(handle, buf->commit_hot, 0)->cc);
 213
 214         if (config->cb.buffer_create) {
 215                 ret = config->cb.buffer_create(buf, priv, cpu, chanb->name, handle);
 216                 if (ret)
 217                         goto free_init;
 218         }
 219         buf->backend.allocated = 1;
 220         return 0;
 221
 222         /* Error handling */
 223 free_init:
 224         /* commit_cold will be freed by shm teardown */
 225 free_commit:
 226         /* commit_hot will be freed by shm teardown */
 227 free_chanbuf:
 228         return ret;
 229 }
 230
 231 #if 0
 232 static void switch_buffer_timer(unsigned long data)
 233 {
 234         struct lttng_ust_lib_ring_buffer *buf = (struct lttng_ust_lib_ring_buffer *)data;
 235         struct channel *chan = shmp(handle, buf->backend.chan);
 236         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 237
 238         /*
 239          * Only flush buffers periodically if readers are active.
 240          */
 241         if (uatomic_read(&buf->active_readers) || uatomic_read(&buf->active_shadow_readers))
 242                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE, handle);
 243
 244         //TODO timers
 245         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 246         //      mod_timer_pinned(&buf->switch_timer,
 247         //                       jiffies + chan->switch_timer_interval);
 248         //else
 249         //      mod_timer(&buf->switch_timer,
 250         //                jiffies + chan->switch_timer_interval);
 251 }
 252 #endif //0
 253
 254 static void lib_ring_buffer_start_switch_timer(struct lttng_ust_lib_ring_buffer *buf,
 255                            struct lttng_ust_shm_handle *handle)
 256 {
 257         struct channel *chan = shmp(handle, buf->backend.chan);
 258         //const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 259
 260         if (!chan->switch_timer_interval || buf->switch_timer_enabled)
 261                 return;
 262         //TODO
 263         //init_timer(&buf->switch_timer);
 264         //buf->switch_timer.function = switch_buffer_timer;
 265         //buf->switch_timer.expires = jiffies + chan->switch_timer_interval;
 266         //buf->switch_timer.data = (unsigned long)buf;
 267         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 268         //      add_timer_on(&buf->switch_timer, buf->backend.cpu);
 269         //else
 270         //      add_timer(&buf->switch_timer);
 271         buf->switch_timer_enabled = 1;
 272 }
 273
 274 static void lib_ring_buffer_stop_switch_timer(struct lttng_ust_lib_ring_buffer *buf,
 275                            struct lttng_ust_shm_handle *handle)
 276 {
 277         struct channel *chan = shmp(handle, buf->backend.chan);
 278
 279         if (!chan->switch_timer_interval || !buf->switch_timer_enabled)
 280                 return;
 281
 282         //TODO
 283         //del_timer_sync(&buf->switch_timer);
 284         buf->switch_timer_enabled = 0;
 285 }
 286
 287 #if 0
 288 /*
 289  * Polling timer to check the channels for data.
 290  */
 291 static void read_buffer_timer(unsigned long data)
 292 {
 293         struct lttng_ust_lib_ring_buffer *buf = (struct lttng_ust_lib_ring_buffer *)data;
 294         struct channel *chan = shmp(handle, buf->backend.chan);
 295         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 296
 297         CHAN_WARN_ON(chan, !buf->backend.allocated);
 298
 299         if (uatomic_read(&buf->active_readers) || uatomic_read(&buf->active_shadow_readers))
 300             && lib_ring_buffer_poll_deliver(config, buf, chan)) {
 301                 //TODO
 302                 //wake_up_interruptible(&buf->read_wait);
 303                 //wake_up_interruptible(&chan->read_wait);
 304         }
 305
 306         //TODO
 307         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 308         //      mod_timer_pinned(&buf->read_timer,
 309         //                       jiffies + chan->read_timer_interval);
 310         //else
 311         //      mod_timer(&buf->read_timer,
 312         //                jiffies + chan->read_timer_interval);
 313 }
 314 #endif //0
 315
 316 static void lib_ring_buffer_start_read_timer(struct lttng_ust_lib_ring_buffer *buf,
 317                            struct lttng_ust_shm_handle *handle)
 318 {
 319         struct channel *chan = shmp(handle, buf->backend.chan);
 320         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 321
 322         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 323             || !chan->read_timer_interval
 324             || buf->read_timer_enabled)
 325                 return;
 326
 327         //TODO
 328         //init_timer(&buf->read_timer);
 329         //buf->read_timer.function = read_buffer_timer;
 330         //buf->read_timer.expires = jiffies + chan->read_timer_interval;
 331         //buf->read_timer.data = (unsigned long)buf;
 332
 333         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 334         //      add_timer_on(&buf->read_timer, buf->backend.cpu);
 335         //else
 336         //      add_timer(&buf->read_timer);
 337         buf->read_timer_enabled = 1;
 338 }
 339
 340 static void lib_ring_buffer_stop_read_timer(struct lttng_ust_lib_ring_buffer *buf,
 341                            struct lttng_ust_shm_handle *handle)
 342 {
 343         struct channel *chan = shmp(handle, buf->backend.chan);
 344         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 345
 346         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 347             || !chan->read_timer_interval
 348             || !buf->read_timer_enabled)
 349                 return;
 350
 351         //TODO
 352         //del_timer_sync(&buf->read_timer);
 353         /*
 354          * do one more check to catch data that has been written in the last
 355          * timer period.
 356          */
 357         if (lib_ring_buffer_poll_deliver(config, buf, chan, handle)) {
 358                 //TODO
 359                 //wake_up_interruptible(&buf->read_wait);
 360                 //wake_up_interruptible(&chan->read_wait);
 361         }
 362         buf->read_timer_enabled = 0;
 363 }
 364
 365 static void channel_unregister_notifiers(struct channel *chan,
 366                            struct lttng_ust_shm_handle *handle)
 367 {
 368         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 369         int cpu;
 370
 371         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 372                 for_each_possible_cpu(cpu) {
 373                         struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[cpu].shmp);
 374
 375                         lib_ring_buffer_stop_switch_timer(buf, handle);
 376                         lib_ring_buffer_stop_read_timer(buf, handle);
 377                 }
 378         } else {
 379                 struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[0].shmp);
 380
 381                 lib_ring_buffer_stop_switch_timer(buf, handle);
 382                 lib_ring_buffer_stop_read_timer(buf, handle);
 383         }
 384         //channel_backend_unregister_notifiers(&chan->backend);
 385 }
 386
 387 static void channel_free(struct channel *chan, struct lttng_ust_shm_handle *handle,
 388                 int shadow)
 389 {
 390         if (!shadow)
 391                 channel_backend_free(&chan->backend, handle);
 392         /* chan is freed by shm teardown */
 393         shm_object_table_destroy(handle->table);
 394         free(handle);
 395 }
 396
 397 /**
 398  * channel_create - Create channel.
 399  * @config: ring buffer instance configuration
 400  * @name: name of the channel
 401  * @priv_data: ring buffer client private data area pointer (output)
 402  * @priv_data_size: length, in bytes, of the private data area.
 403  * @priv_data_init: initialization data for private data.
 404  * @buf_addr: pointer the the beginning of the preallocated buffer contiguous
 405  *            address mapping. It is used only by RING_BUFFER_STATIC
 406  *            configuration. It can be set to NULL for other backends.
 407  * @subbuf_size: subbuffer size
 408  * @num_subbuf: number of subbuffers
 409  * @switch_timer_interval: Time interval (in us) to fill sub-buffers with
 410  *                         padding to let readers get those sub-buffers.
 411  *                         Used for live streaming.
 412  * @read_timer_interval: Time interval (in us) to wake up pending readers.
 413  *
 414  * Holds cpu hotplug.
 415  * Returns NULL on failure.
 416  */
 417 struct lttng_ust_shm_handle *channel_create(const struct lttng_ust_lib_ring_buffer_config *config,
 418                    const char *name,
 419                    void **priv_data,
 420                    size_t priv_data_align,
 421                    size_t priv_data_size,
 422                    void *priv_data_init,
 423                    void *buf_addr, size_t subbuf_size,
 424                    size_t num_subbuf, unsigned int switch_timer_interval,
 425                    unsigned int read_timer_interval,
 426                    int **shm_fd, int **wait_fd, uint64_t **memory_map_size)
 427 {
 428         int ret, cpu;
 429         size_t shmsize, chansize;
 430         struct channel *chan;
 431         struct lttng_ust_shm_handle *handle;
 432         struct shm_object *shmobj;
 433         struct shm_ref *ref;
 434
 435         if (lib_ring_buffer_check_config(config, switch_timer_interval,
 436                                          read_timer_interval))
 437                 return NULL;
 438
 439         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 440         if (!handle)
 441                 return NULL;
 442
 443         /* Allocate table for channel + per-cpu buffers */
 444         handle->table = shm_object_table_create(1 + num_possible_cpus());
 445         if (!handle->table)
 446                 goto error_table_alloc;
 447
 448         /* Calculate the shm allocation layout */
 449         shmsize = sizeof(struct channel);
 450         shmsize += offset_align(shmsize, __alignof__(struct lttng_ust_lib_ring_buffer_shmp));
 451         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 452                 shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp) * num_possible_cpus();
 453         else
 454                 shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp);
 455         chansize = shmsize;
 456         shmsize += offset_align(shmsize, priv_data_align);
 457         shmsize += priv_data_size;
 458
 459         shmobj = shm_object_table_append(handle->table, shmsize);
 460         if (!shmobj)
 461                 goto error_append;
 462         /* struct channel is at object 0, offset 0 (hardcoded) */
 463         set_shmp(handle->chan, zalloc_shm(shmobj, chansize));
 464         assert(handle->chan._ref.index == 0);
 465         assert(handle->chan._ref.offset == 0);
 466         chan = shmp(handle, handle->chan);
 467         if (!chan)
 468                 goto error_append;
 469
 470         /* space for private data */
 471         if (priv_data_size) {
 472                 DECLARE_SHMP(void, priv_data_alloc);
 473
 474                 align_shm(shmobj, priv_data_align);
 475                 chan->priv_data_offset = shmobj->allocated_len;
 476                 set_shmp(priv_data_alloc, zalloc_shm(shmobj, priv_data_size));
 477                 if (!shmp(handle, priv_data_alloc))
 478                         goto error_append;
 479                 *priv_data = channel_get_private(chan);
 480                 memcpy(*priv_data, priv_data_init, priv_data_size);
 481         } else {
 482                 chan->priv_data_offset = -1;
 483                 *priv_data = NULL;
 484         }
 485
 486         ret = channel_backend_init(&chan->backend, name, config,
 487                                    subbuf_size, num_subbuf, handle);
 488         if (ret)
 489                 goto error_backend_init;
 490
 491         chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order);
 492         //TODO
 493         //chan->switch_timer_interval = usecs_to_jiffies(switch_timer_interval);
 494         //chan->read_timer_interval = usecs_to_jiffies(read_timer_interval);
 495         //TODO
 496         //init_waitqueue_head(&chan->read_wait);
 497         //init_waitqueue_head(&chan->hp_wait);
 498
 499         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 500                 /*
 501                  * In case of non-hotplug cpu, if the ring-buffer is allocated
 502                  * in early initcall, it will not be notified of secondary cpus.
 503                  * In that off case, we need to allocate for all possible cpus.
 504                  */
 505                 for_each_possible_cpu(cpu) {
 506                         struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[cpu].shmp);
 507                         lib_ring_buffer_start_switch_timer(buf, handle);
 508                         lib_ring_buffer_start_read_timer(buf, handle);
 509                 }
 510         } else {
 511                 struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[0].shmp);
 512
 513                 lib_ring_buffer_start_switch_timer(buf, handle);
 514                 lib_ring_buffer_start_read_timer(buf, handle);
 515         }
 516         ref = &handle->chan._ref;
 517         shm_get_object_data(handle, ref, shm_fd, wait_fd, memory_map_size);
 518         return handle;
 519
 520 error_backend_init:
 521 error_append:
 522         shm_object_table_destroy(handle->table);
 523 error_table_alloc:
 524         free(handle);
 525         return NULL;
 526 }
 527
 528 struct lttng_ust_shm_handle *channel_handle_create(int shm_fd, int wait_fd,
 529                                         uint64_t memory_map_size)
 530 {
 531         struct lttng_ust_shm_handle *handle;
 532         struct shm_object *object;
 533
 534         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 535         if (!handle)
 536                 return NULL;
 537
 538         /* Allocate table for channel + per-cpu buffers */
 539         handle->table = shm_object_table_create(1 + num_possible_cpus());
 540         if (!handle->table)
 541                 goto error_table_alloc;
 542         /* Add channel object */
 543         object = shm_object_table_append_shadow(handle->table,
 544                         shm_fd, wait_fd, memory_map_size);
 545         if (!object)
 546                 goto error_table_object;
 547         /* struct channel is at object 0, offset 0 (hardcoded) */
 548         handle->chan._ref.index = 0;
 549         handle->chan._ref.offset = 0;
 550         return handle;
 551
 552 error_table_object:
 553         shm_object_table_destroy(handle->table);
 554 error_table_alloc:
 555         free(handle);
 556         return NULL;
 557 }
 558
 559 int channel_handle_add_stream(struct lttng_ust_shm_handle *handle,
 560                 int shm_fd, int wait_fd, uint64_t memory_map_size)
 561 {
 562         struct shm_object *object;
 563
 564         /* Add stream object */
 565         object = shm_object_table_append_shadow(handle->table,
 566                         shm_fd, wait_fd, memory_map_size);
 567         if (!object)
 568                 return -1;
 569         return 0;
 570 }
 571
 572 static
 573 void channel_release(struct channel *chan, struct lttng_ust_shm_handle *handle,
 574                 int shadow)
 575 {
 576         channel_free(chan, handle, shadow);
 577 }
 578
 579 /**
 580  * channel_destroy - Finalize, wait for q.s. and destroy channel.
 581  * @chan: channel to destroy
 582  *
 583  * Holds cpu hotplug.
 584  * Call "destroy" callback, finalize channels, decrement the channel
 585  * reference count. Note that when readers have completed data
 586  * consumption of finalized channels, get_subbuf() will return -ENODATA.
 587  * They should release their handle at that point.
 588  */
 589 void channel_destroy(struct channel *chan, struct lttng_ust_shm_handle *handle,
 590                 int shadow)
 591 {
 592         if (shadow) {
 593                 channel_release(chan, handle, shadow);
 594                 return;
 595         }
 596
 597         channel_unregister_notifiers(chan, handle);
 598
 599         /*
 600          * Note: the consumer takes care of finalizing and switching the
 601          * buffers.
 602          */
 603
 604         /*
 605          * sessiond/consumer are keeping a reference on the shm file
 606          * descriptor directly. No need to refcount.
 607          */
 608         channel_release(chan, handle, shadow);
 609         return;
 610 }
 611
 612 struct lttng_ust_lib_ring_buffer *channel_get_ring_buffer(
 613                                         const struct lttng_ust_lib_ring_buffer_config *config,
 614                                         struct channel *chan, int cpu,
 615                                         struct lttng_ust_shm_handle *handle,
 616                                         int **shm_fd, int **wait_fd,
 617                                         uint64_t **memory_map_size)
 618 {
 619         struct shm_ref *ref;
 620
 621         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
 622                 ref = &chan->backend.buf[0].shmp._ref;
 623                 shm_get_object_data(handle, ref, shm_fd, wait_fd,
 624                         memory_map_size);
 625                 return shmp(handle, chan->backend.buf[0].shmp);
 626         } else {
 627                 if (cpu >= num_possible_cpus())
 628                         return NULL;
 629                 ref = &chan->backend.buf[cpu].shmp._ref;
 630                 shm_get_object_data(handle, ref, shm_fd, wait_fd,
 631                         memory_map_size);
 632                 return shmp(handle, chan->backend.buf[cpu].shmp);
 633         }
 634 }
 635
 636 int lib_ring_buffer_open_read(struct lttng_ust_lib_ring_buffer *buf,
 637                               struct lttng_ust_shm_handle *handle,
 638                               int shadow)
 639 {
 640         if (shadow) {
 641                 if (uatomic_cmpxchg(&buf->active_shadow_readers, 0, 1) != 0)
 642                         return -EBUSY;
 643                 cmm_smp_mb();
 644                 return 0;
 645         }
 646         if (uatomic_cmpxchg(&buf->active_readers, 0, 1) != 0)
 647                 return -EBUSY;
 648         cmm_smp_mb();
 649         return 0;
 650 }
 651
 652 void lib_ring_buffer_release_read(struct lttng_ust_lib_ring_buffer *buf,
 653                                   struct lttng_ust_shm_handle *handle,
 654                                   int shadow)
 655 {
 656         struct channel *chan = shmp(handle, buf->backend.chan);
 657
 658         if (shadow) {
 659                 CHAN_WARN_ON(chan, uatomic_read(&buf->active_shadow_readers) != 1);
 660                 cmm_smp_mb();
 661                 uatomic_dec(&buf->active_shadow_readers);
 662                 return;
 663         }
 664         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1);
 665         cmm_smp_mb();
 666         uatomic_dec(&buf->active_readers);
 667 }
 668
 669 /**
 670  * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read)
 671  * @buf: ring buffer
 672  * @consumed: consumed count indicating the position where to read
 673  * @produced: produced count, indicates position when to stop reading
 674  *
 675  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 676  * data to read at consumed position, or 0 if the get operation succeeds.
 677  */
 678
 679 int lib_ring_buffer_snapshot(struct lttng_ust_lib_ring_buffer *buf,
 680                              unsigned long *consumed, unsigned long *produced,
 681                              struct lttng_ust_shm_handle *handle)
 682 {
 683         struct channel *chan = shmp(handle, buf->backend.chan);
 684         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 685         unsigned long consumed_cur, write_offset;
 686         int finalized;
 687
 688         finalized = CMM_ACCESS_ONCE(buf->finalized);
 689         /*
 690          * Read finalized before counters.
 691          */
 692         cmm_smp_rmb();
 693         consumed_cur = uatomic_read(&buf->consumed);
 694         /*
 695          * No need to issue a memory barrier between consumed count read and
 696          * write offset read, because consumed count can only change
 697          * concurrently in overwrite mode, and we keep a sequence counter
 698          * identifier derived from the write offset to check we are getting
 699          * the same sub-buffer we are expecting (the sub-buffers are atomically
 700          * "tagged" upon writes, tags are checked upon read).
 701          */
 702         write_offset = v_read(config, &buf->offset);
 703
 704         /*
 705          * Check that we are not about to read the same subbuffer in
 706          * which the writer head is.
 707          */
 708         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
 709             == 0)
 710                 goto nodata;
 711
 712         *consumed = consumed_cur;
 713         *produced = subbuf_trunc(write_offset, chan);
 714
 715         return 0;
 716
 717 nodata:
 718         /*
 719          * The memory barriers __wait_event()/wake_up_interruptible() take care
 720          * of "raw_spin_is_locked" memory ordering.
 721          */
 722         if (finalized)
 723                 return -ENODATA;
 724         else
 725                 return -EAGAIN;
 726 }
 727
 728 /**
 729  * lib_ring_buffer_put_snapshot - move consumed counter forward
 730  * @buf: ring buffer
 731  * @consumed_new: new consumed count value
 732  */
 733 void lib_ring_buffer_move_consumer(struct lttng_ust_lib_ring_buffer *buf,
 734                                    unsigned long consumed_new,
 735                                    struct lttng_ust_shm_handle *handle)
 736 {
 737         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
 738         struct channel *chan = shmp(handle, bufb->chan);
 739         unsigned long consumed;
 740
 741         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1
 742                         && uatomic_read(&buf->active_shadow_readers) != 1);
 743
 744         /*
 745          * Only push the consumed value forward.
 746          * If the consumed cmpxchg fails, this is because we have been pushed by
 747          * the writer in flight recorder mode.
 748          */
 749         consumed = uatomic_read(&buf->consumed);
 750         while ((long) consumed - (long) consumed_new < 0)
 751                 consumed = uatomic_cmpxchg(&buf->consumed, consumed,
 752                                            consumed_new);
 753 }
 754
 755 /**
 756  * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading
 757  * @buf: ring buffer
 758  * @consumed: consumed count indicating the position where to read
      * @handle: shm handle used to resolve shared-memory references
 759  *
 760  * Returns 0 if the get operation succeeds. When there is currently no data
 761  * to read at consumed position, returns -ENODATA if the buffer is finalized,
      * -EAGAIN otherwise.
      *
      * The requested position must not be behind the current consumed counter.
      * The "fully committed" and "writer head" checks are performed against the
      * requested @consumed position, which designates the sub-buffer whose
      * commit count is sampled, so that a reader asking for a position ahead of
      * the current consumed counter is validated against the sub-buffer it will
      * actually read.
      * On success, buf->get_subbuf_consumed and buf->get_subbuf record the
      * grabbed position for the matching lib_ring_buffer_put_subbuf().
 762  */
 763 int lib_ring_buffer_get_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 764                                unsigned long consumed,
 765                                struct lttng_ust_shm_handle *handle)
 766 {
 767         struct channel *chan = shmp(handle, buf->backend.chan);
 768         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 769         unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
 770         int ret;
 771         int finalized;
 772
 773 retry:
 774         finalized = CMM_ACCESS_ONCE(buf->finalized);
 775         /*
 776          * Read finalized before counters.
 777          */
 778         cmm_smp_rmb();
 779         consumed_cur = uatomic_read(&buf->consumed);
 780         consumed_idx = subbuf_index(consumed, chan);
 781         commit_count = v_read(config, &shmp_index(handle, buf->commit_cold, consumed_idx)->cc_sb);
 782         /*
 783          * Make sure we read the commit count before reading the buffer
 784          * data and the write offset. Correct consumed offset ordering
 785          * wrt commit count is ensured by the use of cmpxchg to update
 786          * the consumed offset.
 787          */
 788         /*
 789          * Local rmb to match the remote wmb to read the commit count
 790          * before the buffer data and the write offset.
 791          */
 792         cmm_smp_rmb();
 793
 794         write_offset = v_read(config, &buf->offset);
 795
 796         /*
 797          * Check that the buffer we are getting is after or at consumed_cur
 798          * position.
 799          */
 800         if ((long) subbuf_trunc(consumed, chan)
 801             - (long) subbuf_trunc(consumed_cur, chan) < 0)
 802                 goto nodata;
 803
 804         /*
 805          * Check that the subbuffer we are trying to consume has been
 806          * already fully committed.
              * commit_count was sampled from the sub-buffer indexed by the
              * requested "consumed" position, so it is compared against that
              * same position.
 807          */
 808         if (((commit_count - chan->backend.subbuf_size)
 809              & chan->commit_count_mask)
 810             - (buf_trunc(consumed, chan)
 811                >> chan->backend.num_subbuf_order)
 812             != 0)
 813                 goto nodata;
 814
 815         /*
 816          * Check that we are not about to read the same subbuffer in
 817          * which the writer head is.
 818          */
 819         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed, chan)
 820             == 0)
 821                 goto nodata;
 822
 823         /*
 824          * Failure to get the subbuffer causes a busy-loop retry without going
 825          * to a wait queue. These are caused by short-lived race windows where
 826          * the writer is getting access to a subbuffer we were trying to get
 827          * access to. Also checks that the "consumed" buffer count we are
 828          * looking for matches the one contained in the subbuffer id.
 829          */
 830         ret = update_read_sb_index(config, &buf->backend, &chan->backend,
 831                                    consumed_idx, buf_trunc_val(consumed, chan),
 832                                    handle);
 833         if (ret)
 834                 goto retry;
 835         subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);
 836
             /* Remember what was grabbed for lib_ring_buffer_put_subbuf(). */
 837         buf->get_subbuf_consumed = consumed;
 838         buf->get_subbuf = 1;
 839
 840         return 0;
 841
 842 nodata:
 843         /*
 844          * The memory barriers __wait_event()/wake_up_interruptible() take care
 845          * of "raw_spin_is_locked" memory ordering.
              * NOTE(review): no wait queue is used in this function; this
              * wording looks inherited from another implementation - confirm.
              * "finalized" was sampled before the counters (see the rmb after
              * "retry"), and selects between the two "no data" error codes.
 846          */
 847         if (finalized)
 848                 return -ENODATA;
 849         else
 850                 return -EAGAIN;
 851 }
 852
 853 /**
 854  * lib_ring_buffer_put_subbuf - release exclusive subbuffer access
 855  * @buf: ring buffer
      * @handle: shm handle used to resolve shared-memory references
      *
      * Counterpart of lib_ring_buffer_get_subbuf(): if buf->get_subbuf is not
      * set, a warning is raised and nothing else is done. Otherwise the
      * records_unread count of the reader sub-buffer is added to records_read
      * and reset to 0, the reader sub-buffer id is flagged "noref", and
      * update_read_sb_index() is invoked for the consumed position recorded
      * by the get operation.
 856  */
 857 void lib_ring_buffer_put_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 858                                 struct lttng_ust_shm_handle *handle)
 859 {
 860         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
 861         struct channel *chan = shmp(handle, bufb->chan);
 862         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 863         unsigned long read_sb_bindex, consumed_idx, consumed;
 864
 865         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1
 866                         && uatomic_read(&buf->active_shadow_readers) != 1);
 867
 868         if (!buf->get_subbuf) {
 869                 /*
 870                  * Reader puts a subbuffer it did not get.
 871                  */
 872                 CHAN_WARN_ON(chan, 1);
 873                 return;
 874         }
             /* Consume the state saved by lib_ring_buffer_get_subbuf(). */
 875         consumed = buf->get_subbuf_consumed;
 876         buf->get_subbuf = 0;
 877
 878         /*
 879          * Clear the records_unread counter. (overruns counter)
 880          * Can still be non-zero if a file reader simply grabbed the data
 881          * without using iterators.
 882          * Can be below zero if an iterator is used on a snapshot more than
 883          * once.
 884          */
 885         read_sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id);
 886         v_add(config, v_read(config,
 887                              &shmp(handle, shmp_index(handle, bufb->array, read_sb_bindex)->shmp)->records_unread),
 888               &bufb->records_read);
 889         v_set(config, &shmp(handle, shmp_index(handle, bufb->array, read_sb_bindex)->shmp)->records_unread, 0);
             /* In overwrite mode, the reader sub-buffer must not already be noref. */
 890         CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE
 891                      && subbuffer_id_is_noref(config, bufb->buf_rsb.id));
 892         subbuffer_id_set_noref(config, &bufb->buf_rsb.id);
 893
 894         /*
 895          * Exchange the reader subbuffer with the one we put in its place in the
 896          * writer subbuffer table. Expect the original consumed count. If
 897          * update_read_sb_index fails, this is because the writer updated the
 898          * subbuffer concurrently. We should therefore keep the subbuffer we
 899          * currently have: it has become invalid to try reading this sub-buffer
 900          * consumed count value anyway.
 901          */
 902         consumed_idx = subbuf_index(consumed, chan);
 903         update_read_sb_index(config, &buf->backend, &chan->backend,
 904                              consumed_idx, buf_trunc_val(consumed, chan),
 905                              handle);
 906         /*
 907          * update_read_sb_index return value ignored. Don't exchange sub-buffer
 908          * if the writer concurrently updated it.
 909          */
 910 }
 911
 912 /*
      * lib_ring_buffer_print_subbuffer_errors - debug-log the commit counters
      * of one sub-buffer.
      *
 913  * cons_offset is an iterator on all subbuffer offsets between the reader
 914  * position and the writer position. (inclusive)
      *
      * A detailed message is logged when subbuf_offset() of the hot commit
      * count is non-zero (the message states that multiples of the sub-buffer
      * size are expected). The final "bytes committed" message is logged
      * unconditionally: it is not part of the if statement.
      * cpu is only used to label the messages.
 915  */
 916 static
 917 void lib_ring_buffer_print_subbuffer_errors(struct lttng_ust_lib_ring_buffer *buf,
 918                                             struct channel *chan,
 919                                             unsigned long cons_offset,
 920                                             int cpu,
 921                                             struct lttng_ust_shm_handle *handle)
 922 {
 923         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 924         unsigned long cons_idx, commit_count, commit_count_sb;
 925
 926         cons_idx = subbuf_index(cons_offset, chan);
             /* Hot counter (cc) and cold, reader-visible counter (cc_sb). */
 927         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, cons_idx)->cc);
 928         commit_count_sb = v_read(config, &shmp_index(handle, buf->commit_cold, cons_idx)->cc_sb);
 929
 930         if (subbuf_offset(commit_count, chan) != 0)
 931                 DBG("ring buffer %s, cpu %d: "
 932                        "commit count in subbuffer %lu,\n"
 933                        "expecting multiples of %lu bytes\n"
 934                        "  [ %lu bytes committed, %lu bytes reader-visible ]\n",
 935                        chan->backend.name, cpu, cons_idx,
 936                        chan->backend.subbuf_size,
 937                        commit_count, commit_count_sb);
 938
 939         DBG("ring buffer: %s, cpu %d: %lu bytes committed\n",
 940                chan->backend.name, cpu, commit_count);
 941 }
 942
     /*
      * lib_ring_buffer_print_buffer_errors - debug-log unread data of a buffer.
      *
      * Logs a message when the write offset differs from the consumed offset,
      * then calls lib_ring_buffer_print_subbuffer_errors() for each position
      * starting at the consumed offset, stepping with subbuf_align(), for as
      * long as the position is before subbuf_trunc() of the write offset.
      * priv is not used by this function.
      */
 943 static
 944 void lib_ring_buffer_print_buffer_errors(struct lttng_ust_lib_ring_buffer *buf,
 945                                          struct channel *chan,
 946                                          void *priv, int cpu,
 947                                          struct lttng_ust_shm_handle *handle)
 948 {
 949         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 950         unsigned long write_offset, cons_offset;
 951
 952         /*
 953          * No need to order commit_count, write_offset and cons_offset reads
 954          * because we execute at teardown when no more writer nor reader
 955          * references are left.
 956          */
 957         write_offset = v_read(config, &buf->offset);
 958         cons_offset = uatomic_read(&buf->consumed);
 959         if (write_offset != cons_offset)
 960                 DBG("ring buffer %s, cpu %d: "
 961                        "non-consumed data\n"
 962                        "  [ %lu bytes written, %lu bytes read ]\n",
 963                        chan->backend.name, cpu, write_offset, cons_offset);
 964
             /*
              * The signed cast of the difference is what terminates the loop
              * once cons_offset reaches the writer's sub-buffer.
              */
 965         for (cons_offset = uatomic_read(&buf->consumed);
 966              (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset),
 967                                   chan)
 968                      - cons_offset) > 0;
 969              cons_offset = subbuf_align(cons_offset, chan))
 970                 lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset,
 971                                                        cpu, handle);
 972 }
 973
     /*
      * lib_ring_buffer_print_errors - debug-log per-buffer statistics.
      *
      * Logs the records written / records overrun counters, then the three
      * lost-record counters when at least one of them is non-zero, and
      * finally delegates to lib_ring_buffer_print_buffer_errors() with the
      * channel private data.
      */
 974 static
 975 void lib_ring_buffer_print_errors(struct channel *chan,
 976                                   struct lttng_ust_lib_ring_buffer *buf, int cpu,
 977                                   struct lttng_ust_shm_handle *handle)
 978 {
 979         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 980         void *priv = channel_get_private(chan);
 981
 982         DBG("ring buffer %s, cpu %d: %lu records written, "
 983                           "%lu records overrun\n",
 984                           chan->backend.name, cpu,
 985                           v_read(config, &buf->records_count),
 986                           v_read(config, &buf->records_overrun));
 987
 988         if (v_read(config, &buf->records_lost_full)
 989             || v_read(config, &buf->records_lost_wrap)
 990             || v_read(config, &buf->records_lost_big))
 991                 DBG("ring buffer %s, cpu %d: records were lost. Caused by:\n"
 992                        "  [ %lu buffer full, %lu nest buffer wrap-around, "
 993                        "%lu event too big ]\n",
 994                        chan->backend.name, cpu,
 995                        v_read(config, &buf->records_lost_full),
 996                        v_read(config, &buf->records_lost_wrap),
 997                        v_read(config, &buf->records_lost_big));
 998
 999         lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu, handle);
1000 }
1001
1002 /*
1003  * lib_ring_buffer_switch_old_start: Populate old subbuffer header.
1004  *
1005  * Only executed when the buffer is finalized, in SWITCH_FLUSH.
      *
      * Invokes the client buffer_begin callback for the sub-buffer containing
      * offsets->old, then adds the client sub-buffer header size to that
      * sub-buffer's hot commit counter, runs the delivery check and updates
      * the commit counter bookkeeping with the resulting count.
1006  */
1007 static
1008 void lib_ring_buffer_switch_old_start(struct lttng_ust_lib_ring_buffer *buf,
1009                                       struct channel *chan,
1010                                       struct switch_offsets *offsets,
1011                                       uint64_t tsc,
1012                                       struct lttng_ust_shm_handle *handle)
1013 {
1014         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1015         unsigned long oldidx = subbuf_index(offsets->old, chan);
1016         unsigned long commit_count;
1017
             /* Let the client write the sub-buffer header, stamped with tsc. */
1018         config->cb.buffer_begin(buf, tsc, oldidx, handle);
1019
1020         /*
1021          * Order all writes to buffer before the commit count update that will
1022          * determine that the subbuffer is full.
1023          */
1024         cmm_smp_wmb();
1025         v_add(config, config->cb.subbuffer_header_size(),
1026               &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1027         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1028         /* Check if the written buffer has to be delivered */
1029         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old,
1030                                       commit_count, oldidx, handle);
1031         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1032                                              offsets->old, commit_count,
1033                                              config->cb.subbuffer_header_size(),
1034                                              handle);
1035 }
1036
1037 /*
1038  * lib_ring_buffer_switch_old_end: switch old subbuffer
1039  *
1040  * Note : offset_old should never be 0 here. It is ok, because we never perform
1041  * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller
1042  * increments the offset_old value when doing a SWITCH_FLUSH on an empty
1043  * subbuffer.
      *
      * The sub-buffer is identified from the last used offset
      * (offsets->old - 1). Its data size is recorded in the backend, and the
      * remainder of the sub-buffer is added as padding to the hot commit
      * counter before the delivery check.
      * tsc is not used by this function.
1044  */
1045 static
1046 void lib_ring_buffer_switch_old_end(struct lttng_ust_lib_ring_buffer *buf,
1047                                     struct channel *chan,
1048                                     struct switch_offsets *offsets,
1049                                     uint64_t tsc,
1050                                     struct lttng_ust_shm_handle *handle)
1051 {
1052         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1053         unsigned long oldidx = subbuf_index(offsets->old - 1, chan);
1054         unsigned long commit_count, padding_size, data_size;
1055
             /* Bytes used in the sub-buffer, and what is left up to its size. */
1056         data_size = subbuf_offset(offsets->old - 1, chan) + 1;
1057         padding_size = chan->backend.subbuf_size - data_size;
1058         subbuffer_set_data_size(config, &buf->backend, oldidx, data_size,
1059                                 handle);
1060
1061         /*
1062          * Order all writes to buffer before the commit count update that will
1063          * determine that the subbuffer is full.
1064          */
1065         cmm_smp_wmb();
1066         v_add(config, padding_size, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1067         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1068         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1,
1069                                       commit_count, oldidx, handle);
1070         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1071                                              offsets->old, commit_count,
1072                                              padding_size, handle);
1073 }
1074
1075 /*
1076  * lib_ring_buffer_switch_new_start: Populate new subbuffer.
1077  *
1078  * This code can be executed unordered : writers may already have written to the
1079  * sub-buffer before this code gets executed, caution.  The commit makes sure
1080  * that this code is executed before the deliver of this sub-buffer.
      *
      * Invokes the client buffer_begin callback for the sub-buffer containing
      * offsets->begin, then adds the client sub-buffer header size to that
      * sub-buffer's hot commit counter and runs the delivery check.
1081  */
1082 static
1083 void lib_ring_buffer_switch_new_start(struct lttng_ust_lib_ring_buffer *buf,
1084                                       struct channel *chan,
1085                                       struct switch_offsets *offsets,
1086                                       uint64_t tsc,
1087                                       struct lttng_ust_shm_handle *handle)
1088 {
1089         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1090         unsigned long beginidx = subbuf_index(offsets->begin, chan);
1091         unsigned long commit_count;
1092
             /* Let the client write the sub-buffer header, stamped with tsc. */
1093         config->cb.buffer_begin(buf, tsc, beginidx, handle);
1094
1095         /*
1096          * Order all writes to buffer before the commit count update that will
1097          * determine that the subbuffer is full.
1098          */
1099         cmm_smp_wmb();
1100         v_add(config, config->cb.subbuffer_header_size(),
1101               &shmp_index(handle, buf->commit_hot, beginidx)->cc);
1102         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, beginidx)->cc);
1103         /* Check if the written buffer has to be delivered */
1104         lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin,
1105                                       commit_count, beginidx, handle);
1106         lib_ring_buffer_write_commit_counter(config, buf, chan, beginidx,
1107                                              offsets->begin, commit_count,
1108                                              config->cb.subbuffer_header_size(),
1109                                              handle);
1110 }
1111
1112 /*
1113  * lib_ring_buffer_switch_new_end: finish switching current subbuffer
1114  *
1115  * The only remaining threads could be the ones with pending commits. They will
1116  * have to do the deliver themselves.
      *
      * The sub-buffer is identified from the last reserved offset
      * (offsets->end - 1). Its data size is recorded in the backend, and the
      * remainder of the sub-buffer is added as padding to the hot commit
      * counter before the delivery check.
      * tsc is not used by this function.
1117  */
1118 static
1119 void lib_ring_buffer_switch_new_end(struct lttng_ust_lib_ring_buffer *buf,
1120                                     struct channel *chan,
1121                                     struct switch_offsets *offsets,
1122                                     uint64_t tsc,
1123                                     struct lttng_ust_shm_handle *handle)
1124 {
1125         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1126         unsigned long endidx = subbuf_index(offsets->end - 1, chan);
1127         unsigned long commit_count, padding_size, data_size;
1128
             /* Bytes used in the sub-buffer, and what is left up to its size. */
1129         data_size = subbuf_offset(offsets->end - 1, chan) + 1;
1130         padding_size = chan->backend.subbuf_size - data_size;
1131         subbuffer_set_data_size(config, &buf->backend, endidx, data_size,
1132                                 handle);
1133
1134         /*
1135          * Order all writes to buffer before the commit count update that will
1136          * determine that the subbuffer is full.
1137          */
1138         cmm_smp_wmb();
1139         v_add(config, padding_size, &shmp_index(handle, buf->commit_hot, endidx)->cc);
1140         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, endidx)->cc);
1141         lib_ring_buffer_check_deliver(config, buf, chan, offsets->end - 1,
1142                                   commit_count, endidx, handle);
1143         lib_ring_buffer_write_commit_counter(config, buf, chan, endidx,
1144                                              offsets->end, commit_count,
1145                                              padding_size, handle);
1146 }
1147
1148 /*
      * lib_ring_buffer_try_switch_slow - compute the offsets of a forced
      * sub-buffer switch.
      *
      * Samples the current write offset into offsets->begin and offsets->old,
      * reads the clock into *tsc and, when a switch is needed, moves
      * offsets->begin and offsets->end to subbuf_align() of the sampled
      * offset. offsets->switch_old_start is set when the sampled offset is at
      * the very start of a sub-buffer and the client has a sub-buffer header.
      * lib_ring_buffer_switch_slow() then publishes offsets->end with a
      * cmpxchg against offsets->old and calls this function again on failure.
      *
1149  * Returns :
1150  * 0 if ok
1151  * !0 if execution must be aborted.
      * (-1 when there is nothing to switch: empty sub-buffer outside of
      * SWITCH_FLUSH, or empty sub-buffer and a client sub-buffer header size
      * of 0.)
1152  */
1153 static
1154 int lib_ring_buffer_try_switch_slow(enum switch_mode mode,
1155                                     struct lttng_ust_lib_ring_buffer *buf,
1156                                     struct channel *chan,
1157                                     struct switch_offsets *offsets,
1158                                     uint64_t *tsc)
1159 {
1160         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1161         unsigned long off;
1162
1163         offsets->begin = v_read(config, &buf->offset);
1164         offsets->old = offsets->begin;
1165         offsets->switch_old_start = 0;
1166         off = subbuf_offset(offsets->begin, chan);
1167
1168         *tsc = config->cb.ring_buffer_clock_read(chan);
1169
1170         /*
1171          * Ensure we flush the header of an empty subbuffer when doing the
1172          * finalize (SWITCH_FLUSH). This ensures that we end up knowing the
1173          * total data gathering duration even if there were no records saved
1174          * after the last buffer switch.
1175          * In SWITCH_ACTIVE mode, switch the buffer when it contains events.
1176          * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of
1177          * subbuffer header as appropriate.
1178          * The next record that reserves space will be responsible for
1179          * populating the following subbuffer header. We choose not to populate
1180          * the next subbuffer header here because we want to be able to use
1181          * SWITCH_ACTIVE for periodical buffer flush, which must
1182          * guarantee that all the buffer content (records and header
1183          * timestamps) are visible to the reader. This is required for
1184          * quiescence guarantees for the fusion merge.
1185          */
1186         if (mode == SWITCH_FLUSH || off > 0) {
                     /* off == 0 can only be reached here in SWITCH_FLUSH mode. */
1187                 if (caa_unlikely(off == 0)) {
1188                         /*
1189                          * The client does not save any header information.
1190                          * Don't switch empty subbuffer on finalize, because it
1191                          * is invalid to deliver a completely empty subbuffer.
1192                          */
1193                         if (!config->cb.subbuffer_header_size())
1194                                 return -1;
1195                         /*
1196                          * Need to write the subbuffer start header on finalize.
1197                          */
1198                         offsets->switch_old_start = 1;
1199                 }
1200                 offsets->begin = subbuf_align(offsets->begin, chan);
1201         } else
1202                 return -1;      /* we do not have to switch : buffer is empty */
1203         /* Note: old points to the next subbuf at offset 0 */
             /*
              * NOTE(review): at this point it is offsets->begin that was moved
              * by subbuf_align(); offsets->old still holds the offset sampled
              * at the top of the function. Confirm the intent of the note
              * above.
              */
1204         offsets->end = offsets->begin;
1205         return 0;
1206 }
1207
1208 /*
1209  * Force a sub-buffer switch. This operation is completely reentrant : can be
1210  * called while tracing is active with absolutely no lock held.
1211  *
1212  * Note, however, that as a v_cmpxchg is used for some atomic
1213  * operations, this function must be called from the CPU which owns the buffer
1214  * for a ACTIVE flush.
      *
      * Steps: compute the switch offsets and publish the new write offset
      * with a cmpxchg (retried until it succeeds, or abandoned when no switch
      * is needed), save the timestamp, push the reader if necessary, clear
      * the noref flag of the old sub-buffer, write the old sub-buffer start
      * header when requested (SWITCH_FLUSH on an empty sub-buffer), then
      * close the old sub-buffer.
1215  */
1216 void lib_ring_buffer_switch_slow(struct lttng_ust_lib_ring_buffer *buf, enum switch_mode mode,
1217                                  struct lttng_ust_shm_handle *handle)
1218 {
1219         struct channel *chan = shmp(handle, buf->backend.chan);
1220         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1221         struct switch_offsets offsets;
1222         unsigned long oldidx;
1223         uint64_t tsc;
1224
1225         offsets.size = 0;
1226
1227         /*
1228          * Perform retryable operations.
1229          */
1230         do {
1231                 if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets,
1232                                                     &tsc))
1233                         return; /* Switch not needed */
1234         } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end)
1235                  != offsets.old);
1236
1237         /*
1238          * Atomically update last_tsc. This update races against concurrent
1239          * atomic updates, but the race will always cause supplementary full TSC
1240          * records, never the opposite (missing a full TSC record when it would
1241          * be needed).
1242          */
1243         save_last_tsc(config, buf, tsc);
1244
1245         /*
1246          * Push the reader if necessary
1247          */
1248         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old);
1249
1250         oldidx = subbuf_index(offsets.old, chan);
1251         lib_ring_buffer_clear_noref(config, &buf->backend, oldidx, handle);
1252
1253         /*
1254          * May need to populate header start on SWITCH_FLUSH.
              * offsets.old is then moved past the header, so that the old
              * sub-buffer end handling below sees a non-empty sub-buffer.
1255          */
1256         if (offsets.switch_old_start) {
1257                 lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc, handle);
1258                 offsets.old += config->cb.subbuffer_header_size();
1259         }
1260
1261         /*
1262          * Switch old subbuffer.
1263          */
1264         lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc, handle);
1265 }
1266
1267 /*
      * lib_ring_buffer_try_reserve_slow - compute the offsets of a record
      * reservation, including any sub-buffer switch it requires.
      *
      * Samples the current write offset into offsets->begin and offsets->old,
      * reads the clock into ctx->tsc, and fills offsets->size, offsets->end,
      * offsets->pre_header_padding and the switch_old_end / switch_new_start /
      * switch_new_end flags. Nothing is published here: the caller installs
      * offsets->end with a cmpxchg against offsets->old.
      * On error, the matching records_lost_* counter is incremented (except
      * for the clock read failure).
      *
1268  * Returns :
1269  * 0 if ok
1270  * -ENOSPC if event size is too large for packet.
1271  * -ENOBUFS if there is currently not enough space in buffer for the event.
1272  * -EIO if data cannot be written into the buffer for any other reason.
1273  */
1274 static
1275 int lib_ring_buffer_try_reserve_slow(struct lttng_ust_lib_ring_buffer *buf,
1276                                      struct channel *chan,
1277                                      struct switch_offsets *offsets,
1278                                      struct lttng_ust_lib_ring_buffer_ctx *ctx)
1279 {
1280         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1281         struct lttng_ust_shm_handle *handle = ctx->handle;
1282         unsigned long reserve_commit_diff;
1283
1284         offsets->begin = v_read(config, &buf->offset);
1285         offsets->old = offsets->begin;
1286         offsets->switch_new_start = 0;
1287         offsets->switch_new_end = 0;
1288         offsets->switch_old_end = 0;
1289         offsets->pre_header_padding = 0;
1290
             /* A clock value equal to -EIO is treated as a clock read failure. */
1291         ctx->tsc = config->cb.ring_buffer_clock_read(chan);
1292         if ((int64_t) ctx->tsc == -EIO)
1293                 return -EIO;
1294
1295         if (last_tsc_overflow(config, buf, ctx->tsc))
1296                 ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC;
1297
             /*
              * Either we sit exactly on a sub-buffer boundary (a new sub-buffer
              * must be started), or we compute the record size at the current
              * position and check whether it overflows the current sub-buffer.
              * NOTE(review): this test uses ctx->chan while the rest of the
              * function uses chan; presumably the same channel - confirm.
              */
1298         if (caa_unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) {
1299                 offsets->switch_new_start = 1;          /* For offsets->begin */
1300         } else {
1301                 offsets->size = config->cb.record_header_size(config, chan,
1302                                                 offsets->begin,
1303                                                 &offsets->pre_header_padding,
1304                                                 ctx);
1305                 offsets->size +=
1306                         lib_ring_buffer_align(offsets->begin + offsets->size,
1307                                               ctx->largest_align)
1308                         + ctx->data_size;
1309                 if (caa_unlikely(subbuf_offset(offsets->begin, chan) +
1310                              offsets->size > chan->backend.subbuf_size)) {
1311                         offsets->switch_old_end = 1;    /* For offsets->old */
1312                         offsets->switch_new_start = 1;  /* For offsets->begin */
1313                 }
1314         }
1315         if (caa_unlikely(offsets->switch_new_start)) {
1316                 unsigned long sb_index;
1317
1318                 /*
1319                  * We are typically not filling the previous buffer completely.
1320                  */
1321                 if (caa_likely(offsets->switch_old_end))
1322                         offsets->begin = subbuf_align(offsets->begin, chan);
                     /* Skip over the header of the new sub-buffer. */
1323                 offsets->begin = offsets->begin
1324                                  + config->cb.subbuffer_header_size();
1325                 /* Test new buffer integrity */
1326                 sb_index = subbuf_index(offsets->begin, chan);
1327                 reserve_commit_diff =
1328                   (buf_trunc(offsets->begin, chan)
1329                    >> chan->backend.num_subbuf_order)
1330                   - ((unsigned long) v_read(config,
1331                                             &shmp_index(handle, buf->commit_cold, sb_index)->cc_sb)
1332                      & chan->commit_count_mask);
1333                 if (caa_likely(reserve_commit_diff == 0)) {
1334                         /* Next subbuffer not being written to. */
1335                         if (caa_unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1336                                 subbuf_trunc(offsets->begin, chan)
1337                                  - subbuf_trunc((unsigned long)
1338                                      uatomic_read(&buf->consumed), chan)
1339                                 >= chan->backend.buf_size)) {
1340                                 /*
1341                                  * We do not overwrite non consumed buffers
1342                                  * and we are full : record is lost.
1343                                  */
1344                                 v_inc(config, &buf->records_lost_full);
1345                                 return -ENOBUFS;
1346                         } else {
1347                                 /*
1348                                  * Next subbuffer not being written to, and we
1349                                  * are either in overwrite mode or the buffer is
1350                                  * not full. It's safe to write in this new
1351                                  * subbuffer.
1352                                  */
1353                         }
1354                 } else {
1355                         /*
1356                          * Next subbuffer reserve offset does not match the
1357                          * commit offset. Drop record in producer-consumer and
1358                          * overwrite mode. Caused by either a writer OOPS or too
1359                          * many nested writes over a reserve/commit pair.
1360                          */
1361                         v_inc(config, &buf->records_lost_wrap);
1362                         return -EIO;
1363                 }
                     /* Recompute the record size at its new position. */
1364                 offsets->size =
1365                         config->cb.record_header_size(config, chan,
1366                                                 offsets->begin,
1367                                                 &offsets->pre_header_padding,
1368                                                 ctx);
1369                 offsets->size +=
1370                         lib_ring_buffer_align(offsets->begin + offsets->size,
1371                                               ctx->largest_align)
1372                         + ctx->data_size;
1373                 if (caa_unlikely(subbuf_offset(offsets->begin, chan)
1374                              + offsets->size > chan->backend.subbuf_size)) {
1375                         /*
1376                          * Record too big for subbuffers, report error, don't
1377                          * complete the sub-buffer switch.
1378                          */
1379                         v_inc(config, &buf->records_lost_big);
1380                         return -ENOSPC;
1381                 } else {
1382                         /*
1383                          * We just made a successful buffer switch and the
1384                          * record fits in the new subbuffer. Let's write.
1385                          */
1386                 }
1387         } else {
1388                 /*
1389                  * Record fits in the current buffer and we are not on a switch
1390                  * boundary. It's safe to write.
1391                  */
1392         }
1393         offsets->end = offsets->begin + offsets->size;
1394
1395         if (caa_unlikely(subbuf_offset(offsets->end, chan) == 0)) {
1396                 /*
1397                  * The offset_end will fall at the very beginning of the next
1398                  * subbuffer.
1399                  */
1400                 offsets->switch_new_end = 1;    /* For offsets->begin */
1401         }
1402         return 0;
1403 }
1404
1405 /**
1406  * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer.
1407  * @ctx: ring buffer context.
1408  *
1409  * Return : -ENOBUFS if not enough space, -ENOSPC if event size too large,
1410  * -EIO for other errors, else returns 0.
1411  * It will take care of sub-buffer switching.
1412  */
1413 int lib_ring_buffer_reserve_slow(struct lttng_ust_lib_ring_buffer_ctx *ctx)
1414 {
1415         struct channel *chan = ctx->chan;
1416         struct lttng_ust_shm_handle *handle = ctx->handle;
1417         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1418         struct lttng_ust_lib_ring_buffer *buf;
1419         struct switch_offsets offsets;
1420         int ret;
1421
1422         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
1423                 buf = shmp(handle, chan->backend.buf[ctx->cpu].shmp);
1424         else
1425                 buf = shmp(handle, chan->backend.buf[0].shmp);
1426         ctx->buf = buf;
1427
1428         offsets.size = 0;
1429
1430         do {
1431                 ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets,
1432                                                        ctx);
1433                 if (caa_unlikely(ret))
1434                         return ret;
1435         } while (caa_unlikely(v_cmpxchg(config, &buf->offset, offsets.old,
1436                                     offsets.end)
1437                           != offsets.old));
1438
1439         /*
1440          * Atomically update last_tsc. This update races against concurrent
1441          * atomic updates, but the race will always cause supplementary full TSC
1442          * records, never the opposite (missing a full TSC record when it would
1443          * be needed).
1444          */
1445         save_last_tsc(config, buf, ctx->tsc);
1446
1447         /*
1448          * Push the reader if necessary
1449          */
1450         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1);
1451
1452         /*
1453          * Clear noref flag for this subbuffer.
1454          */
1455         lib_ring_buffer_clear_noref(config, &buf->backend,
1456                                     subbuf_index(offsets.end - 1, chan),
1457                                     handle);
1458
1459         /*
1460          * Switch old subbuffer if needed.
1461          */
1462         if (caa_unlikely(offsets.switch_old_end)) {
1463                 lib_ring_buffer_clear_noref(config, &buf->backend,
1464                                             subbuf_index(offsets.old - 1, chan),
1465                                             handle);
1466                 lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc, handle);
1467         }
1468
1469         /*
1470          * Populate new subbuffer.
1471          */
1472         if (caa_unlikely(offsets.switch_new_start))
1473                 lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc, handle);
1474
1475         if (caa_unlikely(offsets.switch_new_end))
1476                 lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc, handle);
1477
1478         ctx->slot_size = offsets.size;
1479         ctx->pre_offset = offsets.begin;
1480         ctx->buf_offset = offsets.begin + offsets.pre_header_padding;
1481         return 0;
1482 }