libringbuffer/ring_buffer_frontend.c

   1 /*
   2  * ring_buffer_frontend.c
   3  *
   4  * Copyright (C) 2005-2012 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; only
   9  * version 2.1 of the License.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  *
  21  * Ring buffer wait-free buffer synchronization. Producer-consumer and flight
  22  * recorder (overwrite) modes. See thesis:
  23  *
  24  * Desnoyers, Mathieu (2009), "Low-Impact Operating System Tracing", Ph.D.
  25  * dissertation, Ecole Polytechnique de Montreal.
  26  * http://www.lttng.org/pub/thesis/desnoyers-dissertation-2009-12.pdf
  27  *
  28  * - Algorithm presentation in Chapter 5:
  29  *     "Lockless Multi-Core High-Throughput Buffering".
  30  * - Algorithm formal verification in Section 8.6:
  31  *     "Formal verification of LTTng"
  32  *
  33  * Author:
  34  *      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
  35  *
  36  * Inspired from LTT and RelayFS:
  37  *  Karim Yaghmour <karim@opersys.com>
  38  *  Tom Zanussi <zanussi@us.ibm.com>
  39  *  Bob Wisniewski <bob@watson.ibm.com>
  40  * And from K42 :
  41  *  Bob Wisniewski <bob@watson.ibm.com>
  42  *
  43  * Buffer reader semantic :
  44  *
  45  * - get_subbuf_size
  46  * while buffer is not finalized and empty
  47  *   - get_subbuf
  48  *     - if return value != 0, continue
  49  *   - splice one subbuffer worth of data to a pipe
  50  *   - splice the data from pipe to disk/network
  51  *   - put_subbuf
  52  */
  53
  54 #define _GNU_SOURCE
  55 #include <sys/types.h>
  56 #include <sys/mman.h>
  57 #include <sys/stat.h>
  58 #include <fcntl.h>
  59 #include <urcu/compiler.h>
  60 #include <urcu/ref.h>
  61 #include <helper.h>
  62
  63 #include "smp.h"
  64 #include <lttng/ringbuffer-config.h>
  65 #include "vatomic.h"
  66 #include "backend.h"
  67 #include "frontend.h"
  68 #include "shm.h"
  69 #include "tlsfixup.h"
  70
  71 #ifndef max
  72 #define max(a, b)       ((a) > (b) ? (a) : (b))
  73 #endif
  74
  75 /* Print DBG() messages about events lost only every 1048576 hits */
  76 #define DBG_PRINT_NR_LOST       (1UL << 20)
  77
  78 /*
  79  * Use POSIX SHM: shm_open(3) and shm_unlink(3).
  80  * close(2) to close the fd returned by shm_open.
  81  * shm_unlink releases the shared memory object name.
  82  * ftruncate(2) sets the size of the memory object.
  83  * mmap/munmap maps the shared memory obj to a virtual address in the
   84  * calling process (should be done both in libust and consumer).
  85  * See shm_overview(7) for details.
  86  * Pass file descriptor returned by shm_open(3) to ltt-sessiond through
  87  * a UNIX socket.
  88  *
  89  * Since we don't need to access the object using its name, we can
  90  * immediately shm_unlink(3) it, and only keep the handle with its file
  91  * descriptor.
  92  */
  93
  94 /*
  95  * Internal structure representing offsets to use at a sub-buffer switch.
  96  */
  97 struct switch_offsets {
  98         unsigned long begin, end, old;
  99         size_t pre_header_padding, size;
 100         unsigned int switch_new_start:1, switch_new_end:1, switch_old_start:1,
 101                      switch_old_end:1;
 102 };
 103
 104 __thread unsigned int lib_ring_buffer_nesting;
 105
 106 /*
 107  * TODO: this is unused. Errors are saved within the ring buffer.
 108  * Eventually, allow consumerd to print these errors.
 109  */
 110 static
 111 void lib_ring_buffer_print_errors(struct channel *chan,
 112                                   struct lttng_ust_lib_ring_buffer *buf, int cpu,
 113                                   struct lttng_ust_shm_handle *handle)
 114         __attribute__((unused));
 115
 116 /**
 117  * lib_ring_buffer_reset - Reset ring buffer to initial values.
 118  * @buf: Ring buffer.
 119  *
 120  * Effectively empty the ring buffer. Should be called when the buffer is not
 121  * used for writing. The ring buffer can be opened for reading, but the reader
 122  * should not be using the iterator concurrently with reset. The previous
 123  * current iterator record is reset.
 124  */
 125 void lib_ring_buffer_reset(struct lttng_ust_lib_ring_buffer *buf,
 126                            struct lttng_ust_shm_handle *handle)
 127 {
 128         struct channel *chan = shmp(handle, buf->backend.chan);
 129         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 130         unsigned int i;
 131
 132         /*
 133          * Reset iterator first. It will put the subbuffer if it currently holds
 134          * it.
 135          */
 136         v_set(config, &buf->offset, 0);
 137         for (i = 0; i < chan->backend.num_subbuf; i++) {
 138                 v_set(config, &shmp_index(handle, buf->commit_hot, i)->cc, 0);
 139                 v_set(config, &shmp_index(handle, buf->commit_hot, i)->seq, 0);
 140                 v_set(config, &shmp_index(handle, buf->commit_cold, i)->cc_sb, 0);
 141         }
 142         uatomic_set(&buf->consumed, 0);
 143         uatomic_set(&buf->record_disabled, 0);
 144         v_set(config, &buf->last_tsc, 0);
 145         lib_ring_buffer_backend_reset(&buf->backend, handle);
 146         /* Don't reset number of active readers */
 147         v_set(config, &buf->records_lost_full, 0);
 148         v_set(config, &buf->records_lost_wrap, 0);
 149         v_set(config, &buf->records_lost_big, 0);
 150         v_set(config, &buf->records_count, 0);
 151         v_set(config, &buf->records_overrun, 0);
 152         buf->finalized = 0;
 153 }
 154
 155 /**
 156  * channel_reset - Reset channel to initial values.
 157  * @chan: Channel.
 158  *
 159  * Effectively empty the channel. Should be called when the channel is not used
 160  * for writing. The channel can be opened for reading, but the reader should not
 161  * be using the iterator concurrently with reset. The previous current iterator
 162  * record is reset.
 163  */
 164 void channel_reset(struct channel *chan)
 165 {
 166         /*
 167          * Reset iterators first. Will put the subbuffer if held for reading.
 168          */
 169         uatomic_set(&chan->record_disabled, 0);
 170         /* Don't reset commit_count_mask, still valid */
 171         channel_backend_reset(&chan->backend);
 172         /* Don't reset switch/read timer interval */
 173         /* Don't reset notifiers and notifier enable bits */
 174         /* Don't reset reader reference count */
 175 }
 176
 177 /*
 178  * Must be called under cpu hotplug protection.
 179  */
 180 int lib_ring_buffer_create(struct lttng_ust_lib_ring_buffer *buf,
 181                            struct channel_backend *chanb, int cpu,
 182                            struct lttng_ust_shm_handle *handle,
 183                            struct shm_object *shmobj)
 184 {
 185         const struct lttng_ust_lib_ring_buffer_config *config = &chanb->config;
 186         struct channel *chan = caa_container_of(chanb, struct channel, backend);
 187         void *priv = channel_get_private(chan);
 188         size_t subbuf_header_size;
 189         uint64_t tsc;
 190         int ret;
 191
 192         /* Test for cpu hotplug */
 193         if (buf->backend.allocated)
 194                 return 0;
 195
 196         ret = lib_ring_buffer_backend_create(&buf->backend, &chan->backend,
 197                         cpu, handle, shmobj);
 198         if (ret)
 199                 return ret;
 200
 201         align_shm(shmobj, __alignof__(struct commit_counters_hot));
 202         set_shmp(buf->commit_hot,
 203                  zalloc_shm(shmobj,
 204                         sizeof(struct commit_counters_hot) * chan->backend.num_subbuf));
 205         if (!shmp(handle, buf->commit_hot)) {
 206                 ret = -ENOMEM;
 207                 goto free_chanbuf;
 208         }
 209
 210         align_shm(shmobj, __alignof__(struct commit_counters_cold));
 211         set_shmp(buf->commit_cold,
 212                  zalloc_shm(shmobj,
 213                         sizeof(struct commit_counters_cold) * chan->backend.num_subbuf));
 214         if (!shmp(handle, buf->commit_cold)) {
 215                 ret = -ENOMEM;
 216                 goto free_commit;
 217         }
 218
 219         /*
 220          * Write the subbuffer header for first subbuffer so we know the total
 221          * duration of data gathering.
 222          */
 223         subbuf_header_size = config->cb.subbuffer_header_size();
 224         v_set(config, &buf->offset, subbuf_header_size);
 225         subbuffer_id_clear_noref(config, &shmp_index(handle, buf->backend.buf_wsb, 0)->id);
 226         tsc = config->cb.ring_buffer_clock_read(shmp(handle, buf->backend.chan));
 227         config->cb.buffer_begin(buf, tsc, 0, handle);
 228         v_add(config, subbuf_header_size, &shmp_index(handle, buf->commit_hot, 0)->cc);
 229
 230         if (config->cb.buffer_create) {
 231                 ret = config->cb.buffer_create(buf, priv, cpu, chanb->name, handle);
 232                 if (ret)
 233                         goto free_init;
 234         }
 235         buf->backend.allocated = 1;
 236         return 0;
 237
 238         /* Error handling */
 239 free_init:
 240         /* commit_cold will be freed by shm teardown */
 241 free_commit:
 242         /* commit_hot will be freed by shm teardown */
 243 free_chanbuf:
 244         return ret;
 245 }
 246
 247 #if 0
 248 static void switch_buffer_timer(unsigned long data)
 249 {
 250         struct lttng_ust_lib_ring_buffer *buf = (struct lttng_ust_lib_ring_buffer *)data;
 251         struct channel *chan = shmp(handle, buf->backend.chan);
 252         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 253
 254         /*
 255          * Only flush buffers periodically if readers are active.
 256          */
 257         if (uatomic_read(&buf->active_readers) || uatomic_read(&buf->active_shadow_readers))
 258                 lib_ring_buffer_switch_slow(buf, SWITCH_ACTIVE, handle);
 259
 260         //TODO timers
 261         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 262         //      mod_timer_pinned(&buf->switch_timer,
 263         //                       jiffies + chan->switch_timer_interval);
 264         //else
 265         //      mod_timer(&buf->switch_timer,
 266         //                jiffies + chan->switch_timer_interval);
 267 }
 268 #endif //0
 269
 270 static void lib_ring_buffer_start_switch_timer(struct lttng_ust_lib_ring_buffer *buf,
 271                            struct lttng_ust_shm_handle *handle)
 272 {
 273         struct channel *chan = shmp(handle, buf->backend.chan);
 274         //const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 275
 276         if (!chan->switch_timer_interval || buf->switch_timer_enabled)
 277                 return;
 278         //TODO
 279         //init_timer(&buf->switch_timer);
 280         //buf->switch_timer.function = switch_buffer_timer;
 281         //buf->switch_timer.expires = jiffies + chan->switch_timer_interval;
 282         //buf->switch_timer.data = (unsigned long)buf;
 283         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 284         //      add_timer_on(&buf->switch_timer, buf->backend.cpu);
 285         //else
 286         //      add_timer(&buf->switch_timer);
 287         buf->switch_timer_enabled = 1;
 288 }
 289
 290 static void lib_ring_buffer_stop_switch_timer(struct lttng_ust_lib_ring_buffer *buf,
 291                            struct lttng_ust_shm_handle *handle)
 292 {
 293         struct channel *chan = shmp(handle, buf->backend.chan);
 294
 295         if (!chan->switch_timer_interval || !buf->switch_timer_enabled)
 296                 return;
 297
 298         //TODO
 299         //del_timer_sync(&buf->switch_timer);
 300         buf->switch_timer_enabled = 0;
 301 }
 302
 303 #if 0
 304 /*
 305  * Polling timer to check the channels for data.
 306  */
 307 static void read_buffer_timer(unsigned long data)
 308 {
 309         struct lttng_ust_lib_ring_buffer *buf = (struct lttng_ust_lib_ring_buffer *)data;
 310         struct channel *chan = shmp(handle, buf->backend.chan);
 311         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 312
 313         CHAN_WARN_ON(chan, !buf->backend.allocated);
 314
 315         if (uatomic_read(&buf->active_readers) || uatomic_read(&buf->active_shadow_readers))
 316             && lib_ring_buffer_poll_deliver(config, buf, chan)) {
 317                 //TODO
 318                 //wake_up_interruptible(&buf->read_wait);
 319                 //wake_up_interruptible(&chan->read_wait);
 320         }
 321
 322         //TODO
 323         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 324         //      mod_timer_pinned(&buf->read_timer,
 325         //                       jiffies + chan->read_timer_interval);
 326         //else
 327         //      mod_timer(&buf->read_timer,
 328         //                jiffies + chan->read_timer_interval);
 329 }
 330 #endif //0
 331
 332 static void lib_ring_buffer_start_read_timer(struct lttng_ust_lib_ring_buffer *buf,
 333                            struct lttng_ust_shm_handle *handle)
 334 {
 335         struct channel *chan = shmp(handle, buf->backend.chan);
 336         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 337
 338         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 339             || !chan->read_timer_interval
 340             || buf->read_timer_enabled)
 341                 return;
 342
 343         //TODO
 344         //init_timer(&buf->read_timer);
 345         //buf->read_timer.function = read_buffer_timer;
 346         //buf->read_timer.expires = jiffies + chan->read_timer_interval;
 347         //buf->read_timer.data = (unsigned long)buf;
 348
 349         //if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 350         //      add_timer_on(&buf->read_timer, buf->backend.cpu);
 351         //else
 352         //      add_timer(&buf->read_timer);
 353         buf->read_timer_enabled = 1;
 354 }
 355
 356 static void lib_ring_buffer_stop_read_timer(struct lttng_ust_lib_ring_buffer *buf,
 357                            struct lttng_ust_shm_handle *handle)
 358 {
 359         struct channel *chan = shmp(handle, buf->backend.chan);
 360         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 361
 362         if (config->wakeup != RING_BUFFER_WAKEUP_BY_TIMER
 363             || !chan->read_timer_interval
 364             || !buf->read_timer_enabled)
 365                 return;
 366
 367         //TODO
 368         //del_timer_sync(&buf->read_timer);
 369         /*
 370          * do one more check to catch data that has been written in the last
 371          * timer period.
 372          */
 373         if (lib_ring_buffer_poll_deliver(config, buf, chan, handle)) {
 374                 //TODO
 375                 //wake_up_interruptible(&buf->read_wait);
 376                 //wake_up_interruptible(&chan->read_wait);
 377         }
 378         buf->read_timer_enabled = 0;
 379 }
 380
 381 static void channel_unregister_notifiers(struct channel *chan,
 382                            struct lttng_ust_shm_handle *handle)
 383 {
 384         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 385         int cpu;
 386
 387         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 388                 for_each_possible_cpu(cpu) {
 389                         struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[cpu].shmp);
 390
 391                         lib_ring_buffer_stop_switch_timer(buf, handle);
 392                         lib_ring_buffer_stop_read_timer(buf, handle);
 393                 }
 394         } else {
 395                 struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[0].shmp);
 396
 397                 lib_ring_buffer_stop_switch_timer(buf, handle);
 398                 lib_ring_buffer_stop_read_timer(buf, handle);
 399         }
 400         //channel_backend_unregister_notifiers(&chan->backend);
 401 }
 402
 403 static void channel_free(struct channel *chan, struct lttng_ust_shm_handle *handle,
 404                 int shadow)
 405 {
 406         if (!shadow)
 407                 channel_backend_free(&chan->backend, handle);
 408         /* chan is freed by shm teardown */
 409         shm_object_table_destroy(handle->table);
 410         free(handle);
 411 }
 412
 413 /**
 414  * channel_create - Create channel.
 415  * @config: ring buffer instance configuration
 416  * @name: name of the channel
 417  * @priv_data: ring buffer client private data area pointer (output)
 418  * @priv_data_size: length, in bytes, of the private data area.
 419  * @priv_data_init: initialization data for private data.
 420  * @buf_addr: pointer the the beginning of the preallocated buffer contiguous
 421  *            address mapping. It is used only by RING_BUFFER_STATIC
 422  *            configuration. It can be set to NULL for other backends.
 423  * @subbuf_size: subbuffer size
 424  * @num_subbuf: number of subbuffers
 425  * @switch_timer_interval: Time interval (in us) to fill sub-buffers with
 426  *                         padding to let readers get those sub-buffers.
 427  *                         Used for live streaming.
 428  * @read_timer_interval: Time interval (in us) to wake up pending readers.
 429  *
 430  * Holds cpu hotplug.
 431  * Returns NULL on failure.
 432  */
 433 struct lttng_ust_shm_handle *channel_create(const struct lttng_ust_lib_ring_buffer_config *config,
 434                    const char *name,
 435                    void **priv_data,
 436                    size_t priv_data_align,
 437                    size_t priv_data_size,
 438                    void *priv_data_init,
 439                    void *buf_addr, size_t subbuf_size,
 440                    size_t num_subbuf, unsigned int switch_timer_interval,
 441                    unsigned int read_timer_interval,
 442                    int **shm_fd, int **wait_fd, uint64_t **memory_map_size)
 443 {
 444         int ret, cpu;
 445         size_t shmsize, chansize;
 446         struct channel *chan;
 447         struct lttng_ust_shm_handle *handle;
 448         struct shm_object *shmobj;
 449         struct shm_ref *ref;
 450
 451         if (lib_ring_buffer_check_config(config, switch_timer_interval,
 452                                          read_timer_interval))
 453                 return NULL;
 454
 455         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 456         if (!handle)
 457                 return NULL;
 458
 459         /* Allocate table for channel + per-cpu buffers */
 460         handle->table = shm_object_table_create(1 + num_possible_cpus());
 461         if (!handle->table)
 462                 goto error_table_alloc;
 463
 464         /* Calculate the shm allocation layout */
 465         shmsize = sizeof(struct channel);
 466         shmsize += offset_align(shmsize, __alignof__(struct lttng_ust_lib_ring_buffer_shmp));
 467         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
 468                 shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp) * num_possible_cpus();
 469         else
 470                 shmsize += sizeof(struct lttng_ust_lib_ring_buffer_shmp);
 471         chansize = shmsize;
 472         shmsize += offset_align(shmsize, priv_data_align);
 473         shmsize += priv_data_size;
 474
 475         shmobj = shm_object_table_append(handle->table, shmsize);
 476         if (!shmobj)
 477                 goto error_append;
 478         /* struct channel is at object 0, offset 0 (hardcoded) */
 479         set_shmp(handle->chan, zalloc_shm(shmobj, chansize));
 480         assert(handle->chan._ref.index == 0);
 481         assert(handle->chan._ref.offset == 0);
 482         chan = shmp(handle, handle->chan);
 483         if (!chan)
 484                 goto error_append;
 485
 486         /* space for private data */
 487         if (priv_data_size) {
 488                 DECLARE_SHMP(void, priv_data_alloc);
 489
 490                 align_shm(shmobj, priv_data_align);
 491                 chan->priv_data_offset = shmobj->allocated_len;
 492                 set_shmp(priv_data_alloc, zalloc_shm(shmobj, priv_data_size));
 493                 if (!shmp(handle, priv_data_alloc))
 494                         goto error_append;
 495                 *priv_data = channel_get_private(chan);
 496                 memcpy(*priv_data, priv_data_init, priv_data_size);
 497         } else {
 498                 chan->priv_data_offset = -1;
 499                 *priv_data = NULL;
 500         }
 501
 502         ret = channel_backend_init(&chan->backend, name, config,
 503                                    subbuf_size, num_subbuf, handle);
 504         if (ret)
 505                 goto error_backend_init;
 506
 507         chan->commit_count_mask = (~0UL >> chan->backend.num_subbuf_order);
 508         //TODO
 509         //chan->switch_timer_interval = usecs_to_jiffies(switch_timer_interval);
 510         //chan->read_timer_interval = usecs_to_jiffies(read_timer_interval);
 511         //TODO
 512         //init_waitqueue_head(&chan->read_wait);
 513         //init_waitqueue_head(&chan->hp_wait);
 514
 515         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU) {
 516                 /*
 517                  * In case of non-hotplug cpu, if the ring-buffer is allocated
 518                  * in early initcall, it will not be notified of secondary cpus.
 519                  * In that off case, we need to allocate for all possible cpus.
 520                  */
 521                 for_each_possible_cpu(cpu) {
 522                         struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[cpu].shmp);
 523                         lib_ring_buffer_start_switch_timer(buf, handle);
 524                         lib_ring_buffer_start_read_timer(buf, handle);
 525                 }
 526         } else {
 527                 struct lttng_ust_lib_ring_buffer *buf = shmp(handle, chan->backend.buf[0].shmp);
 528
 529                 lib_ring_buffer_start_switch_timer(buf, handle);
 530                 lib_ring_buffer_start_read_timer(buf, handle);
 531         }
 532         ref = &handle->chan._ref;
 533         shm_get_object_data(handle, ref, shm_fd, wait_fd, memory_map_size);
 534         return handle;
 535
 536 error_backend_init:
 537 error_append:
 538         shm_object_table_destroy(handle->table);
 539 error_table_alloc:
 540         free(handle);
 541         return NULL;
 542 }
 543
 544 struct lttng_ust_shm_handle *channel_handle_create(int shm_fd, int wait_fd,
 545                                         uint64_t memory_map_size)
 546 {
 547         struct lttng_ust_shm_handle *handle;
 548         struct shm_object *object;
 549
 550         handle = zmalloc(sizeof(struct lttng_ust_shm_handle));
 551         if (!handle)
 552                 return NULL;
 553
 554         /* Allocate table for channel + per-cpu buffers */
 555         handle->table = shm_object_table_create(1 + num_possible_cpus());
 556         if (!handle->table)
 557                 goto error_table_alloc;
 558         /* Add channel object */
 559         object = shm_object_table_append_shadow(handle->table,
 560                         shm_fd, wait_fd, memory_map_size);
 561         if (!object)
 562                 goto error_table_object;
 563         /* struct channel is at object 0, offset 0 (hardcoded) */
 564         handle->chan._ref.index = 0;
 565         handle->chan._ref.offset = 0;
 566         return handle;
 567
 568 error_table_object:
 569         shm_object_table_destroy(handle->table);
 570 error_table_alloc:
 571         free(handle);
 572         return NULL;
 573 }
 574
 575 int channel_handle_add_stream(struct lttng_ust_shm_handle *handle,
 576                 int shm_fd, int wait_fd, uint64_t memory_map_size)
 577 {
 578         struct shm_object *object;
 579
 580         /* Add stream object */
 581         object = shm_object_table_append_shadow(handle->table,
 582                         shm_fd, wait_fd, memory_map_size);
 583         if (!object)
 584                 return -1;
 585         return 0;
 586 }
 587
 588 static
 589 void channel_release(struct channel *chan, struct lttng_ust_shm_handle *handle,
 590                 int shadow)
 591 {
 592         channel_free(chan, handle, shadow);
 593 }
 594
 595 /**
 596  * channel_destroy - Finalize, wait for q.s. and destroy channel.
 597  * @chan: channel to destroy
 598  *
 599  * Holds cpu hotplug.
 600  * Call "destroy" callback, finalize channels, decrement the channel
 601  * reference count. Note that when readers have completed data
 602  * consumption of finalized channels, get_subbuf() will return -ENODATA.
 603  * They should release their handle at that point.
 604  */
 605 void channel_destroy(struct channel *chan, struct lttng_ust_shm_handle *handle,
 606                 int shadow)
 607 {
 608         if (shadow) {
 609                 channel_release(chan, handle, shadow);
 610                 return;
 611         }
 612
 613         channel_unregister_notifiers(chan, handle);
 614
 615         /*
 616          * Note: the consumer takes care of finalizing and switching the
 617          * buffers.
 618          */
 619
 620         /*
 621          * sessiond/consumer are keeping a reference on the shm file
 622          * descriptor directly. No need to refcount.
 623          */
 624         channel_release(chan, handle, shadow);
 625         return;
 626 }
 627
 628 struct lttng_ust_lib_ring_buffer *channel_get_ring_buffer(
 629                                         const struct lttng_ust_lib_ring_buffer_config *config,
 630                                         struct channel *chan, int cpu,
 631                                         struct lttng_ust_shm_handle *handle,
 632                                         int **shm_fd, int **wait_fd,
 633                                         uint64_t **memory_map_size)
 634 {
 635         struct shm_ref *ref;
 636
 637         if (config->alloc == RING_BUFFER_ALLOC_GLOBAL) {
 638                 ref = &chan->backend.buf[0].shmp._ref;
 639                 shm_get_object_data(handle, ref, shm_fd, wait_fd,
 640                         memory_map_size);
 641                 return shmp(handle, chan->backend.buf[0].shmp);
 642         } else {
 643                 if (cpu >= num_possible_cpus())
 644                         return NULL;
 645                 ref = &chan->backend.buf[cpu].shmp._ref;
 646                 shm_get_object_data(handle, ref, shm_fd, wait_fd,
 647                         memory_map_size);
 648                 return shmp(handle, chan->backend.buf[cpu].shmp);
 649         }
 650 }
 651
 652 int lib_ring_buffer_open_read(struct lttng_ust_lib_ring_buffer *buf,
 653                               struct lttng_ust_shm_handle *handle,
 654                               int shadow)
 655 {
 656         if (shadow) {
 657                 if (uatomic_cmpxchg(&buf->active_shadow_readers, 0, 1) != 0)
 658                         return -EBUSY;
 659                 cmm_smp_mb();
 660                 return 0;
 661         }
 662         if (uatomic_cmpxchg(&buf->active_readers, 0, 1) != 0)
 663                 return -EBUSY;
 664         cmm_smp_mb();
 665         return 0;
 666 }
 667
 668 void lib_ring_buffer_release_read(struct lttng_ust_lib_ring_buffer *buf,
 669                                   struct lttng_ust_shm_handle *handle,
 670                                   int shadow)
 671 {
 672         struct channel *chan = shmp(handle, buf->backend.chan);
 673
 674         if (shadow) {
 675                 CHAN_WARN_ON(chan, uatomic_read(&buf->active_shadow_readers) != 1);
 676                 cmm_smp_mb();
 677                 uatomic_dec(&buf->active_shadow_readers);
 678                 return;
 679         }
 680         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1);
 681         cmm_smp_mb();
 682         uatomic_dec(&buf->active_readers);
 683 }
 684
 685 /**
 686  * lib_ring_buffer_snapshot - save subbuffer position snapshot (for read)
 687  * @buf: ring buffer
 688  * @consumed: consumed count indicating the position where to read
 689  * @produced: produced count, indicates position when to stop reading
 690  *
 691  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 692  * data to read at consumed position, or 0 if the get operation succeeds.
 693  */
 694
 695 int lib_ring_buffer_snapshot(struct lttng_ust_lib_ring_buffer *buf,
 696                              unsigned long *consumed, unsigned long *produced,
 697                              struct lttng_ust_shm_handle *handle)
 698 {
 699         struct channel *chan = shmp(handle, buf->backend.chan);
 700         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 701         unsigned long consumed_cur, write_offset;
 702         int finalized;
 703
 704         finalized = CMM_ACCESS_ONCE(buf->finalized);
 705         /*
 706          * Read finalized before counters.
 707          */
 708         cmm_smp_rmb();
 709         consumed_cur = uatomic_read(&buf->consumed);
 710         /*
 711          * No need to issue a memory barrier between consumed count read and
 712          * write offset read, because consumed count can only change
 713          * concurrently in overwrite mode, and we keep a sequence counter
 714          * identifier derived from the write offset to check we are getting
 715          * the same sub-buffer we are expecting (the sub-buffers are atomically
 716          * "tagged" upon writes, tags are checked upon read).
 717          */
 718         write_offset = v_read(config, &buf->offset);
 719
 720         /*
 721          * Check that we are not about to read the same subbuffer in
 722          * which the writer head is.
 723          */
 724         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
 725             == 0)
 726                 goto nodata;
 727
 728         *consumed = consumed_cur;
 729         *produced = subbuf_trunc(write_offset, chan);
 730
 731         return 0;
 732
 733 nodata:
 734         /*
 735          * The memory barriers __wait_event()/wake_up_interruptible() take care
 736          * of "raw_spin_is_locked" memory ordering.
 737          */
 738         if (finalized)
 739                 return -ENODATA;
 740         else
 741                 return -EAGAIN;
 742 }
 743
 744 /**
 745  * lib_ring_buffer_put_snapshot - move consumed counter forward
 746  * @buf: ring buffer
 747  * @consumed_new: new consumed count value
 748  */
 749 void lib_ring_buffer_move_consumer(struct lttng_ust_lib_ring_buffer *buf,
 750                                    unsigned long consumed_new,
 751                                    struct lttng_ust_shm_handle *handle)
 752 {
 753         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
 754         struct channel *chan = shmp(handle, bufb->chan);
 755         unsigned long consumed;
 756
 757         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1
 758                         && uatomic_read(&buf->active_shadow_readers) != 1);
 759
 760         /*
 761          * Only push the consumed value forward.
 762          * If the consumed cmpxchg fails, this is because we have been pushed by
 763          * the writer in flight recorder mode.
 764          */
 765         consumed = uatomic_read(&buf->consumed);
 766         while ((long) consumed - (long) consumed_new < 0)
 767                 consumed = uatomic_cmpxchg(&buf->consumed, consumed,
 768                                            consumed_new);
 769 }
 770
 771 /**
 772  * lib_ring_buffer_get_subbuf - get exclusive access to subbuffer for reading
 773  * @buf: ring buffer
 774  * @consumed: consumed count indicating the position where to read
 775  *
 776  * Returns -ENODATA if buffer is finalized, -EAGAIN if there is currently no
 777  * data to read at consumed position, or 0 if the get operation succeeds.
 778  */
 779 int lib_ring_buffer_get_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 780                                unsigned long consumed,
 781                                struct lttng_ust_shm_handle *handle)
 782 {
 783         struct channel *chan = shmp(handle, buf->backend.chan);
 784         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 785         unsigned long consumed_cur, consumed_idx, commit_count, write_offset;
 786         int ret;
 787         int finalized;
 788
 789 retry:
 790         finalized = CMM_ACCESS_ONCE(buf->finalized);
 791         /*
 792          * Read finalized before counters.
 793          */
 794         cmm_smp_rmb();
 795         consumed_cur = uatomic_read(&buf->consumed);
 796         consumed_idx = subbuf_index(consumed, chan);
 797         commit_count = v_read(config, &shmp_index(handle, buf->commit_cold, consumed_idx)->cc_sb);
 798         /*
 799          * Make sure we read the commit count before reading the buffer
 800          * data and the write offset. Correct consumed offset ordering
 801          * wrt commit count is insured by the use of cmpxchg to update
 802          * the consumed offset.
 803          */
 804         /*
 805          * Local rmb to match the remote wmb to read the commit count
 806          * before the buffer data and the write offset.
 807          */
 808         cmm_smp_rmb();
 809
 810         write_offset = v_read(config, &buf->offset);
 811
 812         /*
 813          * Check that the buffer we are getting is after or at consumed_cur
 814          * position.
 815          */
 816         if ((long) subbuf_trunc(consumed, chan)
 817             - (long) subbuf_trunc(consumed_cur, chan) < 0)
 818                 goto nodata;
 819
 820         /*
 821          * Check that the subbuffer we are trying to consume has been
 822          * already fully committed.
 823          */
 824         if (((commit_count - chan->backend.subbuf_size)
 825              & chan->commit_count_mask)
 826             - (buf_trunc(consumed_cur, chan)
 827                >> chan->backend.num_subbuf_order)
 828             != 0)
 829                 goto nodata;
 830
 831         /*
 832          * Check that we are not about to read the same subbuffer in
 833          * which the writer head is.
 834          */
 835         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_cur, chan)
 836             == 0)
 837                 goto nodata;
 838
 839         /*
 840          * Failure to get the subbuffer causes a busy-loop retry without going
 841          * to a wait queue. These are caused by short-lived race windows where
 842          * the writer is getting access to a subbuffer we were trying to get
 843          * access to. Also checks that the "consumed" buffer count we are
 844          * looking for matches the one contained in the subbuffer id.
 845          */
 846         ret = update_read_sb_index(config, &buf->backend, &chan->backend,
 847                                    consumed_idx, buf_trunc_val(consumed, chan),
 848                                    handle);
 849         if (ret)
 850                 goto retry;
 851         subbuffer_id_clear_noref(config, &buf->backend.buf_rsb.id);
 852
 853         buf->get_subbuf_consumed = consumed;
 854         buf->get_subbuf = 1;
 855
 856         return 0;
 857
 858 nodata:
 859         /*
 860          * The memory barriers __wait_event()/wake_up_interruptible() take care
 861          * of "raw_spin_is_locked" memory ordering.
 862          */
 863         if (finalized)
 864                 return -ENODATA;
 865         else
 866                 return -EAGAIN;
 867 }
 868
 869 /**
 870  * lib_ring_buffer_put_subbuf - release exclusive subbuffer access
 871  * @buf: ring buffer
 872  */
 873 void lib_ring_buffer_put_subbuf(struct lttng_ust_lib_ring_buffer *buf,
 874                                 struct lttng_ust_shm_handle *handle)
 875 {
 876         struct lttng_ust_lib_ring_buffer_backend *bufb = &buf->backend;
 877         struct channel *chan = shmp(handle, bufb->chan);
 878         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 879         unsigned long read_sb_bindex, consumed_idx, consumed;
 880
 881         CHAN_WARN_ON(chan, uatomic_read(&buf->active_readers) != 1
 882                         && uatomic_read(&buf->active_shadow_readers) != 1);
 883
 884         if (!buf->get_subbuf) {
 885                 /*
 886                  * Reader puts a subbuffer it did not get.
 887                  */
 888                 CHAN_WARN_ON(chan, 1);
 889                 return;
 890         }
 891         consumed = buf->get_subbuf_consumed;
 892         buf->get_subbuf = 0;
 893
 894         /*
 895          * Clear the records_unread counter. (overruns counter)
 896          * Can still be non-zero if a file reader simply grabbed the data
 897          * without using iterators.
 898          * Can be below zero if an iterator is used on a snapshot more than
 899          * once.
 900          */
 901         read_sb_bindex = subbuffer_id_get_index(config, bufb->buf_rsb.id);
 902         v_add(config, v_read(config,
 903                              &shmp(handle, shmp_index(handle, bufb->array, read_sb_bindex)->shmp)->records_unread),
 904               &bufb->records_read);
 905         v_set(config, &shmp(handle, shmp_index(handle, bufb->array, read_sb_bindex)->shmp)->records_unread, 0);
 906         CHAN_WARN_ON(chan, config->mode == RING_BUFFER_OVERWRITE
 907                      && subbuffer_id_is_noref(config, bufb->buf_rsb.id));
 908         subbuffer_id_set_noref(config, &bufb->buf_rsb.id);
 909
 910         /*
 911          * Exchange the reader subbuffer with the one we put in its place in the
 912          * writer subbuffer table. Expect the original consumed count. If
 913          * update_read_sb_index fails, this is because the writer updated the
 914          * subbuffer concurrently. We should therefore keep the subbuffer we
 915          * currently have: it has become invalid to try reading this sub-buffer
 916          * consumed count value anyway.
 917          */
 918         consumed_idx = subbuf_index(consumed, chan);
 919         update_read_sb_index(config, &buf->backend, &chan->backend,
 920                              consumed_idx, buf_trunc_val(consumed, chan),
 921                              handle);
 922         /*
 923          * update_read_sb_index return value ignored. Don't exchange sub-buffer
 924          * if the writer concurrently updated it.
 925          */
 926 }
 927
 928 /*
 929  * cons_offset is an iterator on all subbuffer offsets between the reader
 930  * position and the writer position. (inclusive)
 931  */
 932 static
 933 void lib_ring_buffer_print_subbuffer_errors(struct lttng_ust_lib_ring_buffer *buf,
 934                                             struct channel *chan,
 935                                             unsigned long cons_offset,
 936                                             int cpu,
 937                                             struct lttng_ust_shm_handle *handle)
 938 {
 939         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 940         unsigned long cons_idx, commit_count, commit_count_sb;
 941
 942         cons_idx = subbuf_index(cons_offset, chan);
 943         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, cons_idx)->cc);
 944         commit_count_sb = v_read(config, &shmp_index(handle, buf->commit_cold, cons_idx)->cc_sb);
 945
 946         if (subbuf_offset(commit_count, chan) != 0)
 947                 DBG("ring buffer %s, cpu %d: "
 948                        "commit count in subbuffer %lu,\n"
 949                        "expecting multiples of %lu bytes\n"
 950                        "  [ %lu bytes committed, %lu bytes reader-visible ]\n",
 951                        chan->backend.name, cpu, cons_idx,
 952                        chan->backend.subbuf_size,
 953                        commit_count, commit_count_sb);
 954
 955         DBG("ring buffer: %s, cpu %d: %lu bytes committed\n",
 956                chan->backend.name, cpu, commit_count);
 957 }
 958
 959 static
 960 void lib_ring_buffer_print_buffer_errors(struct lttng_ust_lib_ring_buffer *buf,
 961                                          struct channel *chan,
 962                                          void *priv, int cpu,
 963                                          struct lttng_ust_shm_handle *handle)
 964 {
 965         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 966         unsigned long write_offset, cons_offset;
 967
 968         /*
 969          * No need to order commit_count, write_offset and cons_offset reads
 970          * because we execute at teardown when no more writer nor reader
 971          * references are left.
 972          */
 973         write_offset = v_read(config, &buf->offset);
 974         cons_offset = uatomic_read(&buf->consumed);
 975         if (write_offset != cons_offset)
 976                 DBG("ring buffer %s, cpu %d: "
 977                        "non-consumed data\n"
 978                        "  [ %lu bytes written, %lu bytes read ]\n",
 979                        chan->backend.name, cpu, write_offset, cons_offset);
 980
 981         for (cons_offset = uatomic_read(&buf->consumed);
 982              (long) (subbuf_trunc((unsigned long) v_read(config, &buf->offset),
 983                                   chan)
 984                      - cons_offset) > 0;
 985              cons_offset = subbuf_align(cons_offset, chan))
 986                 lib_ring_buffer_print_subbuffer_errors(buf, chan, cons_offset,
 987                                                        cpu, handle);
 988 }
 989
 990 static
 991 void lib_ring_buffer_print_errors(struct channel *chan,
 992                                   struct lttng_ust_lib_ring_buffer *buf, int cpu,
 993                                   struct lttng_ust_shm_handle *handle)
 994 {
 995         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
 996         void *priv = channel_get_private(chan);
 997
 998         if (!strcmp(chan->backend.name, "relay-metadata-mmap")) {
 999                 DBG("ring buffer %s: %lu records written, "
1000                         "%lu records overrun\n",
1001                         chan->backend.name,
1002                         v_read(config, &buf->records_count),
1003                         v_read(config, &buf->records_overrun));
1004         } else {
1005                 DBG("ring buffer %s, cpu %d: %lu records written, "
1006                         "%lu records overrun\n",
1007                         chan->backend.name, cpu,
1008                         v_read(config, &buf->records_count),
1009                         v_read(config, &buf->records_overrun));
1010
1011                 if (v_read(config, &buf->records_lost_full)
1012                     || v_read(config, &buf->records_lost_wrap)
1013                     || v_read(config, &buf->records_lost_big))
1014                         DBG("ring buffer %s, cpu %d: records were lost. Caused by:\n"
1015                                 "  [ %lu buffer full, %lu nest buffer wrap-around, "
1016                                 "%lu event too big ]\n",
1017                                 chan->backend.name, cpu,
1018                                 v_read(config, &buf->records_lost_full),
1019                                 v_read(config, &buf->records_lost_wrap),
1020                                 v_read(config, &buf->records_lost_big));
1021         }
1022         lib_ring_buffer_print_buffer_errors(buf, chan, priv, cpu, handle);
1023 }
1024
1025 /*
1026  * lib_ring_buffer_switch_old_start: Populate old subbuffer header.
1027  *
1028  * Only executed when the buffer is finalized, in SWITCH_FLUSH.
1029  */
1030 static
1031 void lib_ring_buffer_switch_old_start(struct lttng_ust_lib_ring_buffer *buf,
1032                                       struct channel *chan,
1033                                       struct switch_offsets *offsets,
1034                                       uint64_t tsc,
1035                                       struct lttng_ust_shm_handle *handle)
1036 {
1037         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1038         unsigned long oldidx = subbuf_index(offsets->old, chan);
1039         unsigned long commit_count;
1040
1041         config->cb.buffer_begin(buf, tsc, oldidx, handle);
1042
1043         /*
1044          * Order all writes to buffer before the commit count update that will
1045          * determine that the subbuffer is full.
1046          */
1047         cmm_smp_wmb();
1048         v_add(config, config->cb.subbuffer_header_size(),
1049               &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1050         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1051         /* Check if the written buffer has to be delivered */
1052         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old,
1053                                       commit_count, oldidx, handle);
1054         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1055                                              offsets->old, commit_count,
1056                                              config->cb.subbuffer_header_size(),
1057                                              handle);
1058 }
1059
1060 /*
1061  * lib_ring_buffer_switch_old_end: switch old subbuffer
1062  *
1063  * Note : offset_old should never be 0 here. It is ok, because we never perform
1064  * buffer switch on an empty subbuffer in SWITCH_ACTIVE mode. The caller
1065  * increments the offset_old value when doing a SWITCH_FLUSH on an empty
1066  * subbuffer.
1067  */
1068 static
1069 void lib_ring_buffer_switch_old_end(struct lttng_ust_lib_ring_buffer *buf,
1070                                     struct channel *chan,
1071                                     struct switch_offsets *offsets,
1072                                     uint64_t tsc,
1073                                     struct lttng_ust_shm_handle *handle)
1074 {
1075         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1076         unsigned long oldidx = subbuf_index(offsets->old - 1, chan);
1077         unsigned long commit_count, padding_size, data_size;
1078
1079         data_size = subbuf_offset(offsets->old - 1, chan) + 1;
1080         padding_size = chan->backend.subbuf_size - data_size;
1081         subbuffer_set_data_size(config, &buf->backend, oldidx, data_size,
1082                                 handle);
1083
1084         /*
1085          * Order all writes to buffer before the commit count update that will
1086          * determine that the subbuffer is full.
1087          */
1088         cmm_smp_wmb();
1089         v_add(config, padding_size, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1090         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, oldidx)->cc);
1091         lib_ring_buffer_check_deliver(config, buf, chan, offsets->old - 1,
1092                                       commit_count, oldidx, handle);
1093         lib_ring_buffer_write_commit_counter(config, buf, chan, oldidx,
1094                                              offsets->old, commit_count,
1095                                              padding_size, handle);
1096 }
1097
1098 /*
1099  * lib_ring_buffer_switch_new_start: Populate new subbuffer.
1100  *
1101  * This code can be executed unordered : writers may already have written to the
1102  * sub-buffer before this code gets executed, caution.  The commit makes sure
1103  * that this code is executed before the deliver of this sub-buffer.
1104  */
1105 static
1106 void lib_ring_buffer_switch_new_start(struct lttng_ust_lib_ring_buffer *buf,
1107                                       struct channel *chan,
1108                                       struct switch_offsets *offsets,
1109                                       uint64_t tsc,
1110                                       struct lttng_ust_shm_handle *handle)
1111 {
1112         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1113         unsigned long beginidx = subbuf_index(offsets->begin, chan);
1114         unsigned long commit_count;
1115
1116         config->cb.buffer_begin(buf, tsc, beginidx, handle);
1117
1118         /*
1119          * Order all writes to buffer before the commit count update that will
1120          * determine that the subbuffer is full.
1121          */
1122         cmm_smp_wmb();
1123         v_add(config, config->cb.subbuffer_header_size(),
1124               &shmp_index(handle, buf->commit_hot, beginidx)->cc);
1125         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, beginidx)->cc);
1126         /* Check if the written buffer has to be delivered */
1127         lib_ring_buffer_check_deliver(config, buf, chan, offsets->begin,
1128                                       commit_count, beginidx, handle);
1129         lib_ring_buffer_write_commit_counter(config, buf, chan, beginidx,
1130                                              offsets->begin, commit_count,
1131                                              config->cb.subbuffer_header_size(),
1132                                              handle);
1133 }
1134
1135 /*
1136  * lib_ring_buffer_switch_new_end: finish switching current subbuffer
1137  *
1138  * The only remaining threads could be the ones with pending commits. They will
1139  * have to do the deliver themselves.
1140  */
1141 static
1142 void lib_ring_buffer_switch_new_end(struct lttng_ust_lib_ring_buffer *buf,
1143                                     struct channel *chan,
1144                                     struct switch_offsets *offsets,
1145                                     uint64_t tsc,
1146                                     struct lttng_ust_shm_handle *handle)
1147 {
1148         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1149         unsigned long endidx = subbuf_index(offsets->end - 1, chan);
1150         unsigned long commit_count, padding_size, data_size;
1151
1152         data_size = subbuf_offset(offsets->end - 1, chan) + 1;
1153         padding_size = chan->backend.subbuf_size - data_size;
1154         subbuffer_set_data_size(config, &buf->backend, endidx, data_size,
1155                                 handle);
1156
1157         /*
1158          * Order all writes to buffer before the commit count update that will
1159          * determine that the subbuffer is full.
1160          */
1161         cmm_smp_wmb();
1162         v_add(config, padding_size, &shmp_index(handle, buf->commit_hot, endidx)->cc);
1163         commit_count = v_read(config, &shmp_index(handle, buf->commit_hot, endidx)->cc);
1164         lib_ring_buffer_check_deliver(config, buf, chan, offsets->end - 1,
1165                                   commit_count, endidx, handle);
1166         lib_ring_buffer_write_commit_counter(config, buf, chan, endidx,
1167                                              offsets->end, commit_count,
1168                                              padding_size, handle);
1169 }
1170
1171 /*
1172  * Returns :
1173  * 0 if ok
1174  * !0 if execution must be aborted.
1175  */
1176 static
1177 int lib_ring_buffer_try_switch_slow(enum switch_mode mode,
1178                                     struct lttng_ust_lib_ring_buffer *buf,
1179                                     struct channel *chan,
1180                                     struct switch_offsets *offsets,
1181                                     uint64_t *tsc)
1182 {
1183         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1184         unsigned long off;
1185
1186         offsets->begin = v_read(config, &buf->offset);
1187         offsets->old = offsets->begin;
1188         offsets->switch_old_start = 0;
1189         off = subbuf_offset(offsets->begin, chan);
1190
1191         *tsc = config->cb.ring_buffer_clock_read(chan);
1192
1193         /*
1194          * Ensure we flush the header of an empty subbuffer when doing the
1195          * finalize (SWITCH_FLUSH). This ensures that we end up knowing the
1196          * total data gathering duration even if there were no records saved
1197          * after the last buffer switch.
1198          * In SWITCH_ACTIVE mode, switch the buffer when it contains events.
1199          * SWITCH_ACTIVE only flushes the current subbuffer, dealing with end of
1200          * subbuffer header as appropriate.
1201          * The next record that reserves space will be responsible for
1202          * populating the following subbuffer header. We choose not to populate
1203          * the next subbuffer header here because we want to be able to use
1204          * SWITCH_ACTIVE for periodical buffer flush, which must
1205          * guarantee that all the buffer content (records and header
1206          * timestamps) are visible to the reader. This is required for
1207          * quiescence guarantees for the fusion merge.
1208          */
1209         if (mode == SWITCH_FLUSH || off > 0) {
1210                 if (caa_unlikely(off == 0)) {
1211                         /*
1212                          * The client does not save any header information.
1213                          * Don't switch empty subbuffer on finalize, because it
1214                          * is invalid to deliver a completely empty subbuffer.
1215                          */
1216                         if (!config->cb.subbuffer_header_size())
1217                                 return -1;
1218                         /*
1219                          * Need to write the subbuffer start header on finalize.
1220                          */
1221                         offsets->switch_old_start = 1;
1222                 }
1223                 offsets->begin = subbuf_align(offsets->begin, chan);
1224         } else
1225                 return -1;      /* we do not have to switch : buffer is empty */
1226         /* Note: old points to the next subbuf at offset 0 */
1227         offsets->end = offsets->begin;
1228         return 0;
1229 }
1230
1231 /*
1232  * Force a sub-buffer switch. This operation is completely reentrant : can be
1233  * called while tracing is active with absolutely no lock held.
1234  *
1235  * Note, however, that as a v_cmpxchg is used for some atomic
1236  * operations, this function must be called from the CPU which owns the buffer
1237  * for a ACTIVE flush.
1238  */
1239 void lib_ring_buffer_switch_slow(struct lttng_ust_lib_ring_buffer *buf, enum switch_mode mode,
1240                                  struct lttng_ust_shm_handle *handle)
1241 {
1242         struct channel *chan = shmp(handle, buf->backend.chan);
1243         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1244         struct switch_offsets offsets;
1245         unsigned long oldidx;
1246         uint64_t tsc;
1247
1248         offsets.size = 0;
1249
1250         /*
1251          * Perform retryable operations.
1252          */
1253         do {
1254                 if (lib_ring_buffer_try_switch_slow(mode, buf, chan, &offsets,
1255                                                     &tsc))
1256                         return; /* Switch not needed */
1257         } while (v_cmpxchg(config, &buf->offset, offsets.old, offsets.end)
1258                  != offsets.old);
1259
1260         /*
1261          * Atomically update last_tsc. This update races against concurrent
1262          * atomic updates, but the race will always cause supplementary full TSC
1263          * records, never the opposite (missing a full TSC record when it would
1264          * be needed).
1265          */
1266         save_last_tsc(config, buf, tsc);
1267
1268         /*
1269          * Push the reader if necessary
1270          */
1271         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.old);
1272
1273         oldidx = subbuf_index(offsets.old, chan);
1274         lib_ring_buffer_clear_noref(config, &buf->backend, oldidx, handle);
1275
1276         /*
1277          * May need to populate header start on SWITCH_FLUSH.
1278          */
1279         if (offsets.switch_old_start) {
1280                 lib_ring_buffer_switch_old_start(buf, chan, &offsets, tsc, handle);
1281                 offsets.old += config->cb.subbuffer_header_size();
1282         }
1283
1284         /*
1285          * Switch old subbuffer.
1286          */
1287         lib_ring_buffer_switch_old_end(buf, chan, &offsets, tsc, handle);
1288 }
1289
1290 /*
1291  * Returns :
1292  * 0 if ok
1293  * -ENOSPC if event size is too large for packet.
1294  * -ENOBUFS if there is currently not enough space in buffer for the event.
1295  * -EIO if data cannot be written into the buffer for any other reason.
1296  */
1297 static
1298 int lib_ring_buffer_try_reserve_slow(struct lttng_ust_lib_ring_buffer *buf,
1299                                      struct channel *chan,
1300                                      struct switch_offsets *offsets,
1301                                      struct lttng_ust_lib_ring_buffer_ctx *ctx)
1302 {
1303         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1304         struct lttng_ust_shm_handle *handle = ctx->handle;
1305         unsigned long reserve_commit_diff;
1306
1307         offsets->begin = v_read(config, &buf->offset);
1308         offsets->old = offsets->begin;
1309         offsets->switch_new_start = 0;
1310         offsets->switch_new_end = 0;
1311         offsets->switch_old_end = 0;
1312         offsets->pre_header_padding = 0;
1313
1314         ctx->tsc = config->cb.ring_buffer_clock_read(chan);
1315         if ((int64_t) ctx->tsc == -EIO)
1316                 return -EIO;
1317
1318         if (last_tsc_overflow(config, buf, ctx->tsc))
1319                 ctx->rflags |= RING_BUFFER_RFLAG_FULL_TSC;
1320
1321         if (caa_unlikely(subbuf_offset(offsets->begin, ctx->chan) == 0)) {
1322                 offsets->switch_new_start = 1;          /* For offsets->begin */
1323         } else {
1324                 offsets->size = config->cb.record_header_size(config, chan,
1325                                                 offsets->begin,
1326                                                 &offsets->pre_header_padding,
1327                                                 ctx);
1328                 offsets->size +=
1329                         lib_ring_buffer_align(offsets->begin + offsets->size,
1330                                               ctx->largest_align)
1331                         + ctx->data_size;
1332                 if (caa_unlikely(subbuf_offset(offsets->begin, chan) +
1333                              offsets->size > chan->backend.subbuf_size)) {
1334                         offsets->switch_old_end = 1;    /* For offsets->old */
1335                         offsets->switch_new_start = 1;  /* For offsets->begin */
1336                 }
1337         }
1338         if (caa_unlikely(offsets->switch_new_start)) {
1339                 unsigned long sb_index;
1340
1341                 /*
1342                  * We are typically not filling the previous buffer completely.
1343                  */
1344                 if (caa_likely(offsets->switch_old_end))
1345                         offsets->begin = subbuf_align(offsets->begin, chan);
1346                 offsets->begin = offsets->begin
1347                                  + config->cb.subbuffer_header_size();
1348                 /* Test new buffer integrity */
1349                 sb_index = subbuf_index(offsets->begin, chan);
1350                 reserve_commit_diff =
1351                   (buf_trunc(offsets->begin, chan)
1352                    >> chan->backend.num_subbuf_order)
1353                   - ((unsigned long) v_read(config,
1354                                             &shmp_index(handle, buf->commit_cold, sb_index)->cc_sb)
1355                      & chan->commit_count_mask);
1356                 if (caa_likely(reserve_commit_diff == 0)) {
1357                         /* Next subbuffer not being written to. */
1358                         if (caa_unlikely(config->mode != RING_BUFFER_OVERWRITE &&
1359                                 subbuf_trunc(offsets->begin, chan)
1360                                  - subbuf_trunc((unsigned long)
1361                                      uatomic_read(&buf->consumed), chan)
1362                                 >= chan->backend.buf_size)) {
1363                                 unsigned long nr_lost;
1364
1365                                 /*
1366                                  * We do not overwrite non consumed buffers
1367                                  * and we are full : record is lost.
1368                                  */
1369                                 nr_lost = v_read(config, &buf->records_lost_full);
1370                                 v_inc(config, &buf->records_lost_full);
1371                                 if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1372                                         DBG("%lu or more records lost in (%s:%d) (buffer full)\n",
1373                                                 nr_lost + 1, chan->backend.name,
1374                                                 buf->backend.cpu);
1375                                 }
1376                                 return -ENOBUFS;
1377                         } else {
1378                                 /*
1379                                  * Next subbuffer not being written to, and we
1380                                  * are either in overwrite mode or the buffer is
1381                                  * not full. It's safe to write in this new
1382                                  * subbuffer.
1383                                  */
1384                         }
1385                 } else {
1386                         unsigned long nr_lost;
1387
1388                         /*
1389                          * Next subbuffer reserve offset does not match the
1390                          * commit offset. Drop record in producer-consumer and
1391                          * overwrite mode. Caused by either a writer OOPS or too
1392                          * many nested writes over a reserve/commit pair.
1393                          */
1394                         nr_lost = v_read(config, &buf->records_lost_wrap);
1395                         v_inc(config, &buf->records_lost_wrap);
1396                         if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1397                                 DBG("%lu or more records lost in (%s:%d) (wrap-around)\n",
1398                                         nr_lost + 1, chan->backend.name,
1399                                         buf->backend.cpu);
1400                         }
1401                         return -EIO;
1402                 }
1403                 offsets->size =
1404                         config->cb.record_header_size(config, chan,
1405                                                 offsets->begin,
1406                                                 &offsets->pre_header_padding,
1407                                                 ctx);
1408                 offsets->size +=
1409                         lib_ring_buffer_align(offsets->begin + offsets->size,
1410                                               ctx->largest_align)
1411                         + ctx->data_size;
1412                 if (caa_unlikely(subbuf_offset(offsets->begin, chan)
1413                              + offsets->size > chan->backend.subbuf_size)) {
1414                         unsigned long nr_lost;
1415
1416                         /*
1417                          * Record too big for subbuffers, report error, don't
1418                          * complete the sub-buffer switch.
1419                          */
1420                         nr_lost = v_read(config, &buf->records_lost_big);
1421                         v_inc(config, &buf->records_lost_big);
1422                         if ((nr_lost & (DBG_PRINT_NR_LOST - 1)) == 0) {
1423                                 DBG("%lu or more records lost in (%s:%d) record size "
1424                                         " of %zu bytes is too large for buffer\n",
1425                                         nr_lost + 1, chan->backend.name,
1426                                         buf->backend.cpu, offsets->size);
1427                         }
1428                         return -ENOSPC;
1429                 } else {
1430                         /*
1431                          * We just made a successful buffer switch and the
1432                          * record fits in the new subbuffer. Let's write.
1433                          */
1434                 }
1435         } else {
1436                 /*
1437                  * Record fits in the current buffer and we are not on a switch
1438                  * boundary. It's safe to write.
1439                  */
1440         }
1441         offsets->end = offsets->begin + offsets->size;
1442
1443         if (caa_unlikely(subbuf_offset(offsets->end, chan) == 0)) {
1444                 /*
1445                  * The offset_end will fall at the very beginning of the next
1446                  * subbuffer.
1447                  */
1448                 offsets->switch_new_end = 1;    /* For offsets->begin */
1449         }
1450         return 0;
1451 }
1452
1453 /**
1454  * lib_ring_buffer_reserve_slow - Atomic slot reservation in a buffer.
1455  * @ctx: ring buffer context.
1456  *
1457  * Return : -NOBUFS if not enough space, -ENOSPC if event size too large,
1458  * -EIO for other errors, else returns 0.
1459  * It will take care of sub-buffer switching.
1460  */
1461 int lib_ring_buffer_reserve_slow(struct lttng_ust_lib_ring_buffer_ctx *ctx)
1462 {
1463         struct channel *chan = ctx->chan;
1464         struct lttng_ust_shm_handle *handle = ctx->handle;
1465         const struct lttng_ust_lib_ring_buffer_config *config = &chan->backend.config;
1466         struct lttng_ust_lib_ring_buffer *buf;
1467         struct switch_offsets offsets;
1468         int ret;
1469
1470         if (config->alloc == RING_BUFFER_ALLOC_PER_CPU)
1471                 buf = shmp(handle, chan->backend.buf[ctx->cpu].shmp);
1472         else
1473                 buf = shmp(handle, chan->backend.buf[0].shmp);
1474         ctx->buf = buf;
1475
1476         offsets.size = 0;
1477
1478         do {
1479                 ret = lib_ring_buffer_try_reserve_slow(buf, chan, &offsets,
1480                                                        ctx);
1481                 if (caa_unlikely(ret))
1482                         return ret;
1483         } while (caa_unlikely(v_cmpxchg(config, &buf->offset, offsets.old,
1484                                     offsets.end)
1485                           != offsets.old));
1486
1487         /*
1488          * Atomically update last_tsc. This update races against concurrent
1489          * atomic updates, but the race will always cause supplementary full TSC
1490          * records, never the opposite (missing a full TSC record when it would
1491          * be needed).
1492          */
1493         save_last_tsc(config, buf, ctx->tsc);
1494
1495         /*
1496          * Push the reader if necessary
1497          */
1498         lib_ring_buffer_reserve_push_reader(buf, chan, offsets.end - 1);
1499
1500         /*
1501          * Clear noref flag for this subbuffer.
1502          */
1503         lib_ring_buffer_clear_noref(config, &buf->backend,
1504                                     subbuf_index(offsets.end - 1, chan),
1505                                     handle);
1506
1507         /*
1508          * Switch old subbuffer if needed.
1509          */
1510         if (caa_unlikely(offsets.switch_old_end)) {
1511                 lib_ring_buffer_clear_noref(config, &buf->backend,
1512                                             subbuf_index(offsets.old - 1, chan),
1513                                             handle);
1514                 lib_ring_buffer_switch_old_end(buf, chan, &offsets, ctx->tsc, handle);
1515         }
1516
1517         /*
1518          * Populate new subbuffer.
1519          */
1520         if (caa_unlikely(offsets.switch_new_start))
1521                 lib_ring_buffer_switch_new_start(buf, chan, &offsets, ctx->tsc, handle);
1522
1523         if (caa_unlikely(offsets.switch_new_end))
1524                 lib_ring_buffer_switch_new_end(buf, chan, &offsets, ctx->tsc, handle);
1525
1526         ctx->slot_size = offsets.size;
1527         ctx->pre_offset = offsets.begin;
1528         ctx->buf_offset = offsets.begin + offsets.pre_header_padding;
1529         return 0;
1530 }
1531
1532 /*
1533  * Force a read (imply TLS fixup for dlopen) of TLS variables.
1534  */
1535 void lttng_fixup_ringbuffer_tls(void)
1536 {
1537         asm volatile ("" : : "m" (lib_ring_buffer_nesting));
1538 }