libust/buffers.c

   1 /*
   2  * buffers.c
   3  * LTTng userspace tracer buffering system
   4  *
   5  * Copyright (C) 2009 - Pierre-Marc Fournier (pierre-marc dot fournier at polymtl dot ca)
   6  * Copyright (C) 2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
  21  */
  22
  23 #include <unistd.h>
  24 #include <sys/mman.h>
  25 #include <sys/ipc.h>
  26 #include <sys/shm.h>
  27 #include <fcntl.h>
  28 #include <stdlib.h>
  29
  30 #include <ust/clock.h>
  31
  32 #include "buffers.h"
  33 #include "channels.h"
  34 #include "tracer.h"
  35 #include "tracercore.h"
  36 #include "usterr.h"
  37
  38 struct ltt_reserve_switch_offsets {
  39         long begin, end, old;
  40         long begin_switch, end_switch_current, end_switch_old;
  41         size_t before_hdr_pad, size;
  42 };
  43
  44
/*
 * ust_buffers_channels lists the channels opened by open_channel();
 * close_channel() removes them again. ust_buffers_channels_mutex is held
 * around every insertion into and removal from the list, and while the
 * per-cpu buffers of a channel are opened or closed.
 */
static DEFINE_MUTEX(ust_buffers_channels_mutex);
static CDS_LIST_HEAD(ust_buffers_channels);
  47
  48 static int get_n_cpus(void)
  49 {
  50         int result;
  51         static int n_cpus = 0;
  52
  53         if(!n_cpus) {
  54                 /* On Linux, when some processors are offline
  55                  * _SC_NPROCESSORS_CONF counts the offline
  56                  * processors, whereas _SC_NPROCESSORS_ONLN
  57                  * does not. If we used _SC_NPROCESSORS_ONLN,
  58                  * getcpu() could return a value greater than
  59                  * this sysconf, in which case the arrays
  60                  * indexed by processor would overflow.
  61                  */
  62                 result = sysconf(_SC_NPROCESSORS_CONF);
  63                 if(result == -1) {
  64                         return -1;
  65                 }
  66
  67                 n_cpus = result;
  68         }
  69
  70         return n_cpus;
  71 }
  72
  73 /**
  74  * _ust_buffers_strncpy_fixup - Fix an incomplete string in a ltt_relay buffer.
  75  * @buf : buffer
  76  * @offset : offset within the buffer
  77  * @len : length to write
  78  * @copied: string actually copied
  79  * @terminated: does string end with \0
  80  *
  81  * Fills string with "X" if incomplete.
  82  */
  83 void _ust_buffers_strncpy_fixup(struct ust_buffer *buf, size_t offset,
  84                                 size_t len, size_t copied, int terminated)
  85 {
  86         size_t buf_offset, cpy;
  87
  88         if (copied == len) {
  89                 /*
  90                  * Deal with non-terminated string.
  91                  */
  92                 assert(!terminated);
  93                 offset += copied - 1;
  94                 buf_offset = BUFFER_OFFSET(offset, buf->chan);
  95                 /*
  96                  * Underlying layer should never ask for writes across
  97                  * subbuffers.
  98                  */
  99                 assert(buf_offset
 100                        < buf->chan->subbuf_size*buf->chan->subbuf_cnt);
 101                 ust_buffers_do_memset(buf->buf_data + buf_offset, '\0', 1);
 102                 return;
 103         }
 104
 105         /*
 106          * Deal with incomplete string.
 107          * Overwrite string's \0 with X too.
 108          */
 109         cpy = copied - 1;
 110         assert(terminated);
 111         len -= cpy;
 112         offset += cpy;
 113         buf_offset = BUFFER_OFFSET(offset, buf->chan);
 114
 115         /*
 116          * Underlying layer should never ask for writes across subbuffers.
 117          */
 118         assert(buf_offset
 119                < buf->chan->subbuf_size*buf->chan->subbuf_cnt);
 120
 121         ust_buffers_do_memset(buf->buf_data + buf_offset,
 122                               'X', len);
 123
 124         /*
 125          * Overwrite last 'X' with '\0'.
 126          */
 127         offset += len - 1;
 128         buf_offset = BUFFER_OFFSET(offset, buf->chan);
 129         /*
 130          * Underlying layer should never ask for writes across subbuffers.
 131          */
 132         assert(buf_offset
 133                < buf->chan->subbuf_size*buf->chan->subbuf_cnt);
 134         ust_buffers_do_memset(buf->buf_data + buf_offset, '\0', 1);
 135 }
 136
 137 static void ltt_buffer_begin(struct ust_buffer *buf,
 138                              u64 tsc, unsigned int subbuf_idx)
 139 {
 140         struct ust_channel *channel = buf->chan;
 141         struct ltt_subbuffer_header *header =
 142                 (struct ltt_subbuffer_header *)
 143                         ust_buffers_offset_address(buf,
 144                                 subbuf_idx * buf->chan->subbuf_size);
 145
 146         header->cycle_count_begin = tsc;
 147         header->data_size = 0xFFFFFFFF; /* for recognizing crashed buffers */
 148         header->sb_size = 0xFFFFFFFF; /* for recognizing crashed buffers */
 149         /*
 150          * No memory barrier needed to order data_data/sb_size vs commit count
 151          * update, because commit count update contains a compiler barrier that
 152          * ensures the order of the writes are OK from a program POV. It only
 153          * matters for crash dump recovery which is not executed concurrently,
 154          * so memory write order does not matter.
 155          */
 156         ltt_write_trace_header(channel->trace, header);
 157 }
 158
 159 static int map_buf_data(struct ust_buffer *buf, size_t *size)
 160 {
 161         void *ptr;
 162         int result;
 163
 164         *size = PAGE_ALIGN(*size);
 165
 166         result = buf->shmid = shmget(getpid(), *size, IPC_CREAT | IPC_EXCL | 0700);
 167         if (result < 0 && errno == EINVAL) {
 168                 ERR("shmget() returned EINVAL; maybe /proc/sys/kernel/shmmax should be increased.");
 169                 return -1;
 170         } else if (result < 0) {
 171                 PERROR("shmget");
 172                 return -1;
 173         }
 174
 175         ptr = shmat(buf->shmid, NULL, 0);
 176         if (ptr == (void *) -1) {
 177                 perror("shmat");
 178                 goto destroy_shmem;
 179         }
 180
 181         /* Already mark the shared memory for destruction. This will occur only
 182          * when all users have detached.
 183          */
 184         result = shmctl(buf->shmid, IPC_RMID, NULL);
 185         if(result == -1) {
 186                 perror("shmctl");
 187                 return -1;
 188         }
 189
 190         buf->buf_data = ptr;
 191         buf->buf_size = *size;
 192
 193         return 0;
 194
 195 destroy_shmem:
 196         result = shmctl(buf->shmid, IPC_RMID, NULL);
 197         if(result == -1) {
 198                 perror("shmctl");
 199         }
 200
 201         return -1;
 202 }
 203
 204 static int open_buf(struct ust_channel *chan, int cpu)
 205 {
 206         int result, fds[2];
 207         unsigned int j;
 208         struct ust_trace *trace = chan->trace;
 209         struct ust_buffer *buf = chan->buf[cpu];
 210         unsigned int n_subbufs = chan->subbuf_cnt;
 211
 212
 213         result = map_buf_data(buf, &chan->alloc_size);
 214         if (result < 0)
 215                 return -1;
 216
 217         buf->commit_count =
 218                 zmalloc(sizeof(*buf->commit_count) * n_subbufs);
 219         if (!buf->commit_count)
 220                 goto unmap_buf;
 221
 222         result = pipe(fds);
 223         if (result < 0) {
 224                 PERROR("pipe");
 225                 goto free_commit_count;
 226         }
 227         buf->data_ready_fd_read = fds[0];
 228         buf->data_ready_fd_write = fds[1];
 229
 230         buf->cpu = cpu;
 231         buf->chan = chan;
 232
 233         uatomic_set(&buf->offset, ltt_subbuffer_header_size());
 234         uatomic_set(&buf->consumed, 0);
 235         uatomic_set(&buf->active_readers, 0);
 236         for (j = 0; j < n_subbufs; j++) {
 237                 uatomic_set(&buf->commit_count[j].cc, 0);
 238                 uatomic_set(&buf->commit_count[j].cc_sb, 0);
 239         }
 240
 241         ltt_buffer_begin(buf, trace->start_tsc, 0);
 242
 243         uatomic_add(&buf->commit_count[0].cc, ltt_subbuffer_header_size());
 244
 245         uatomic_set(&buf->events_lost, 0);
 246         uatomic_set(&buf->corrupted_subbuffers, 0);
 247
 248         memset(buf->commit_seq, 0, sizeof(buf->commit_seq[0]) * n_subbufs);
 249
 250         return 0;
 251
 252 free_commit_count:
 253         free(buf->commit_count);
 254
 255 unmap_buf:
 256         if (shmdt(buf->buf_data) < 0) {
 257                 PERROR("shmdt failed");
 258         }
 259
 260         return -1;
 261 }
 262
 263 static void close_buf(struct ust_buffer *buf)
 264 {
 265         int result;
 266
 267         result = shmdt(buf->buf_data);
 268         if (result < 0) {
 269                 PERROR("shmdt");
 270         }
 271
 272         result = close(buf->data_ready_fd_read);
 273         if (result < 0) {
 274                 PERROR("close");
 275         }
 276
 277         result = close(buf->data_ready_fd_write);
 278         if (result < 0 && errno != EBADF) {
 279                 PERROR("close");
 280         }
 281 }
 282
 283
 284 static int open_channel(struct ust_channel *chan, size_t subbuf_size,
 285                         size_t subbuf_cnt)
 286 {
 287         int i;
 288         int result;
 289
 290         if(subbuf_size == 0 || subbuf_cnt == 0)
 291                 return -1;
 292
 293         /* Check that the subbuffer size is larger than a page. */
 294         WARN_ON_ONCE(subbuf_size < PAGE_SIZE);
 295
 296         /*
 297          * Make sure the number of subbuffers and subbuffer size are power of 2.
 298          */
 299         WARN_ON_ONCE(hweight32(subbuf_size) != 1);
 300         WARN_ON(hweight32(subbuf_cnt) != 1);
 301
 302         chan->version = UST_CHANNEL_VERSION;
 303         chan->subbuf_cnt = subbuf_cnt;
 304         chan->subbuf_size = subbuf_size;
 305         chan->subbuf_size_order = get_count_order(subbuf_size);
 306         chan->alloc_size = subbuf_size * subbuf_cnt;
 307
 308         pthread_mutex_lock(&ust_buffers_channels_mutex);
 309         for (i=0; i < chan->n_cpus; i++) {
 310                 result = open_buf(chan, i);
 311                 if (result == -1)
 312                         goto error;
 313         }
 314         cds_list_add(&chan->list, &ust_buffers_channels);
 315         pthread_mutex_unlock(&ust_buffers_channels_mutex);
 316
 317         return 0;
 318
 319         /* Error handling */
 320 error:
 321         for(i--; i >= 0; i--)
 322                 close_buf(chan->buf[i]);
 323
 324         pthread_mutex_unlock(&ust_buffers_channels_mutex);
 325         return -1;
 326 }
 327
 328 static void close_channel(struct ust_channel *chan)
 329 {
 330         int i;
 331         if(!chan)
 332                 return;
 333
 334         pthread_mutex_lock(&ust_buffers_channels_mutex);
 335         for(i=0; i<chan->n_cpus; i++) {
 336         /* FIXME: if we make it here, then all buffers were necessarily allocated. Moreover, we don't
 337          * initialize to NULL so we cannot use this check. Should we? */
 338 //ust//         if (chan->buf[i])
 339                         close_buf(chan->buf[i]);
 340         }
 341
 342         cds_list_del(&chan->list);
 343
 344         pthread_mutex_unlock(&ust_buffers_channels_mutex);
 345 }
 346
 347 static void ltt_force_switch(struct ust_buffer *buf,
 348                 enum force_switch_mode mode);
 349
 350
 351
 352 /*
 353  * offset is assumed to never be 0 here : never deliver a completely empty
 354  * subbuffer. The lost size is between 0 and subbuf_size-1.
 355  */
 356 static notrace void ltt_buffer_end(struct ust_buffer *buf,
 357                 u64 tsc, unsigned int offset, unsigned int subbuf_idx)
 358 {
 359         struct ltt_subbuffer_header *header =
 360                 (struct ltt_subbuffer_header *)
 361                         ust_buffers_offset_address(buf,
 362                                 subbuf_idx * buf->chan->subbuf_size);
 363         u32 data_size = SUBBUF_OFFSET(offset - 1, buf->chan) + 1;
 364
 365         header->sb_size = PAGE_ALIGN(data_size);
 366         header->cycle_count_end = tsc;
 367         header->events_lost = uatomic_read(&buf->events_lost);
 368         header->subbuf_corrupt = uatomic_read(&buf->corrupted_subbuffers);
 369         if(unlikely(header->events_lost > 0)) {
 370                 DBG("Some events (%d) were lost in %s_%d", header->events_lost, buf->chan->channel_name, buf->cpu);
 371         }
 372         /*
 373          * Makes sure data_size write happens after write of the rest of the
 374          * buffer end data, because data_size is used to identify a completely
 375          * written subbuffer in a crash dump.
 376          */
 377         cmm_barrier();
 378         header->data_size = data_size;
 379 }
 380
/*
 * Hook called by ust_buffers_put_subbuf() once a sub-buffer has been
 * released by the reader. The body is currently empty: all three parameters
 * are ignored.
 *
 * This function should not be called from NMI interrupt context
 */
static notrace void ltt_buf_unfull(struct ust_buffer *buf,
                unsigned int subbuf_idx,
                long offset)
{
}
 389
 390 /*
 391  * Promote compiler cmm_barrier to a smp_mb().
 392  * For the specific LTTng case, this IPI call should be removed if the
 393  * architecture does not reorder writes.  This should eventually be provided by
 394  * a separate architecture-specific infrastructure.
 395  */
 396 //ust// static void remote_mb(void *info)
 397 //ust// {
 398 //ust//         smp_mb();
 399 //ust// }
 400
/*
 * Try to grab the sub-buffer located at the current consumed offset for
 * reading.
 *
 * @buf: buffer to read from
 * @consumed: on success, receives the consumed offset that was sampled; the
 *            visible code does not write it on failure
 *
 * Returns 0 on success, or -EAGAIN when the sub-buffer is not fully
 * committed yet or when the writer head is still inside it.
 */
int ust_buffers_get_subbuf(struct ust_buffer *buf, long *consumed)
{
        struct ust_channel *channel = buf->chan;
        long consumed_old, consumed_idx, commit_count, write_offset;
//ust// int retval;

        consumed_old = uatomic_read(&buf->consumed);
        consumed_idx = SUBBUF_INDEX(consumed_old, buf->chan);
        commit_count = uatomic_read(&buf->commit_count[consumed_idx].cc_sb);
        /*
         * Make sure we read the commit count before reading the buffer
         * data and the write offset. Correct consumed offset ordering
         * wrt commit count is ensured by the use of cmpxchg to update
         * the consumed offset.
         *
         * The remainder of this comment describes the kernel (IPI based)
         * variant that is commented out below:
         *
         * smp_call_function_single can fail if the remote CPU is offline,
         * this is OK because then there is no wmb to execute there.
         * If our thread is executing on the same CPU as the one the buffer
         * belongs to, we don't have to synchronize it at all. If we are
         * migrated, the scheduler will take care of the memory barriers.
         * Normally, smp_call_function_single() should ensure program order when
         * executing the remote function, which implies that it surrounds the
         * function execution with :
         * smp_mb()
         * send IPI
         * csd_lock_wait
         *                recv IPI
         *                smp_mb()
         *                exec. function
         *                smp_mb()
         *                csd unlock
         * smp_mb()
         *
         * However, smp_call_function_single() does not seem to clearly execute
         * such barriers. It depends on spinlock semantic to provide the barrier
         * before executing the IPI and, when busy-looping, csd_lock_wait only
         * executes smp_mb() when it has to wait for the other CPU.
         *
         * I don't trust this code. Therefore, let's add the smp_mb() sequence
         * required ourselves, even if duplicated. It has no performance impact
         * anyway.
         *
         * smp_mb() is needed because cmm_smp_rmb() and cmm_smp_wmb() only order read vs
         * read and write vs write. They do not ensure core synchronization. We
         * really have to ensure total order between the 3 barriers running on
         * the 2 CPUs.
         */
//ust// #ifdef LTT_NO_IPI_BARRIER
        /*
         * Local rmb to match the remote wmb to read the commit count before the
         * buffer data and the write offset.
         */
        cmm_smp_rmb();
//ust// #else
//ust//         if (raw_smp_processor_id() != buf->cpu) {
//ust//                 smp_mb();       /* Total order with IPI handler smp_mb() */
//ust//                 smp_call_function_single(buf->cpu, remote_mb, NULL, 1);
//ust//                 smp_mb();       /* Total order with IPI handler smp_mb() */
//ust//         }
//ust// #endif

        write_offset = uatomic_read(&buf->offset);
        /*
         * Check that the subbuffer we are trying to consume has been
         * already fully committed.
         */
        if (((commit_count - buf->chan->subbuf_size)
             & channel->commit_count_mask)
            - (BUFFER_TRUNC(consumed_old, buf->chan)
               >> channel->n_subbufs_order)
            != 0) {
                return -EAGAIN;
        }
        /*
         * Check that we are not about to read the same subbuffer in
         * which the writer head is.
         */
        if ((SUBBUF_TRUNC(write_offset, buf->chan)
           - SUBBUF_TRUNC(consumed_old, buf->chan))
           == 0) {
                return -EAGAIN;
        }

        /* FIXME: is this ok to disable the reading feature? */
//ust// retval = update_read_sb_index(buf, consumed_idx);
//ust// if (retval)
//ust//         return retval;

        /* Hand the sampled consumed offset back to the caller. */
        *consumed = consumed_old;

        return 0;
}
 492
 493 int ust_buffers_put_subbuf(struct ust_buffer *buf, unsigned long uconsumed_old)
 494 {
 495         long consumed_new, consumed_old;
 496
 497         consumed_old = uatomic_read(&buf->consumed);
 498         consumed_old = consumed_old & (~0xFFFFFFFFL);
 499         consumed_old = consumed_old | uconsumed_old;
 500         consumed_new = SUBBUF_ALIGN(consumed_old, buf->chan);
 501
 502 //ust// spin_lock(&ltt_buf->full_lock);
 503         if (uatomic_cmpxchg(&buf->consumed, consumed_old,
 504                                 consumed_new)
 505             != consumed_old) {
 506                 /* We have been pushed by the writer : the last
 507                  * buffer read _is_ corrupted! It can also
 508                  * happen if this is a buffer we never got. */
 509 //ust//         spin_unlock(&ltt_buf->full_lock);
 510                 return -EIO;
 511         } else {
 512                 /* tell the client that buffer is now unfull */
 513                 int index;
 514                 long data;
 515                 index = SUBBUF_INDEX(consumed_old, buf->chan);
 516                 data = BUFFER_OFFSET(consumed_old, buf->chan);
 517                 ltt_buf_unfull(buf, index, data);
 518 //ust//         spin_unlock(&ltt_buf->full_lock);
 519         }
 520         return 0;
 521 }
 522
 523 static int map_buf_structs(struct ust_channel *chan)
 524 {
 525         void *ptr;
 526         int result;
 527         size_t size;
 528         int i;
 529
 530         size = PAGE_ALIGN(1);
 531
 532         for(i=0; i<chan->n_cpus; i++) {
 533
 534                 result = chan->buf_struct_shmids[i] = shmget(getpid(), size, IPC_CREAT | IPC_EXCL | 0700);
 535                 if(result == -1) {
 536                         PERROR("shmget");
 537                         goto destroy_previous;
 538                 }
 539
 540                 ptr = shmat(chan->buf_struct_shmids[i], NULL, 0);
 541                 if(ptr == (void *) -1) {
 542                         perror("shmat");
 543                         goto destroy_shm;
 544                 }
 545
 546                 /* Already mark the shared memory for destruction. This will occur only
 547                  * when all users have detached.
 548                  */
 549                 result = shmctl(chan->buf_struct_shmids[i], IPC_RMID, NULL);
 550                 if(result == -1) {
 551                         perror("shmctl");
 552                         goto destroy_previous;
 553                 }
 554
 555                 chan->buf[i] = ptr;
 556         }
 557
 558         return 0;
 559
 560         /* Jumping inside this loop occurs from within the other loop above with i as
 561          * counter, so it unallocates the structures for the cpu = current_i down to
 562          * zero. */
 563         for(; i>=0; i--) {
 564                 destroy_shm:
 565                 result = shmctl(chan->buf_struct_shmids[i], IPC_RMID, NULL);
 566                 if(result == -1) {
 567                         perror("shmctl");
 568                 }
 569
 570                 destroy_previous:
 571                 continue;
 572         }
 573
 574         return -1;
 575 }
 576
 577 static int unmap_buf_structs(struct ust_channel *chan)
 578 {
 579         int i;
 580
 581         for (i=0; i < chan->n_cpus; i++) {
 582                 if (shmdt(chan->buf[i]) < 0) {
 583                         PERROR("shmdt");
 584                 }
 585         }
 586         return 0;
 587 }
 588
 589 /*
 590  * Create channel.
 591  */
 592 static int create_channel(const char *trace_name, struct ust_trace *trace,
 593         const char *channel_name, struct ust_channel *chan,
 594         unsigned int subbuf_size, unsigned int n_subbufs, int overwrite)
 595 {
 596         int i, result;
 597
 598         chan->trace = trace;
 599         chan->overwrite = overwrite;
 600         chan->n_subbufs_order = get_count_order(n_subbufs);
 601         chan->commit_count_mask = (~0UL >> chan->n_subbufs_order);
 602         chan->n_cpus = get_n_cpus();
 603
 604         /* These mappings should ideall be per-cpu, if somebody can do that
 605          * from userspace, that would be cool!
 606          */
 607         chan->buf = (void *) zmalloc(chan->n_cpus * sizeof(void *));
 608         if(chan->buf == NULL) {
 609                 goto error;
 610         }
 611         chan->buf_struct_shmids = (int *) zmalloc(chan->n_cpus * sizeof(int));
 612         if(chan->buf_struct_shmids == NULL)
 613                 goto free_buf;
 614
 615         result = map_buf_structs(chan);
 616         if(result != 0) {
 617                 goto free_buf_struct_shmids;
 618         }
 619
 620         result = open_channel(chan, subbuf_size, n_subbufs);
 621         if (result != 0) {
 622                 ERR("Cannot open channel for trace %s", trace_name);
 623                 goto unmap_buf_structs;
 624         }
 625
 626         return 0;
 627
 628 unmap_buf_structs:
 629         for (i=0; i < chan->n_cpus; i++) {
 630                 if (shmdt(chan->buf[i]) < 0) {
 631                         PERROR("shmdt bufstruct");
 632                 }
 633         }
 634
 635 free_buf_struct_shmids:
 636         free(chan->buf_struct_shmids);
 637
 638 free_buf:
 639         free(chan->buf);
 640
 641 error:
 642         return -1;
 643 }
 644
 645
/*
 * Tear down a channel: close its buffers and unlist it, detach the per-cpu
 * buffer structures, then free the shmid and buffer pointer arrays.
 */
static void remove_channel(struct ust_channel *chan)
{
        /* Closes every per-cpu buffer and removes chan from the channel list. */
        close_channel(chan);

        /* Detach the shared memory holding the buffer structures. */
        unmap_buf_structs(chan);

        free(chan->buf_struct_shmids);

        free(chan->buf);
}
 656
/*
 * Currently a no-op: the whole body is commented out, so @ltt_channel is
 * ignored. The disabled code walked the per-cpu buffers and woke up readers
 * whose wakeup_readers flag was set.
 */
static void ltt_relay_async_wakeup_chan(struct ust_channel *ltt_channel)
{
//ust// unsigned int i;
//ust// struct rchan *rchan = ltt_channel->trans_channel_data;
//ust//
//ust// for_each_possible_cpu(i) {
//ust//         struct ltt_channel_buf_struct *ltt_buf =
//ust//                 percpu_ptr(ltt_channel->buf, i);
//ust//
//ust//         if (uatomic_read(&ltt_buf->wakeup_readers) == 1) {
//ust//                 uatomic_set(&ltt_buf->wakeup_readers, 0);
//ust//                 wake_up_interruptible(&rchan->buf[i]->read_wait);
//ust//         }
//ust// }
}
 672
 673 static void ltt_relay_finish_buffer(struct ust_channel *channel, unsigned int cpu)
 674 {
 675 //      int result;
 676
 677         if (channel->buf[cpu]) {
 678                 struct ust_buffer *buf = channel->buf[cpu];
 679                 ltt_force_switch(buf, FORCE_FLUSH);
 680
 681                 /* closing the pipe tells the consumer the buffer is finished */
 682                 close(buf->data_ready_fd_write);
 683         }
 684 }
 685
 686
 687 static void finish_channel(struct ust_channel *channel)
 688 {
 689         unsigned int i;
 690
 691         for(i=0; i<channel->n_cpus; i++) {
 692                 ltt_relay_finish_buffer(channel, i);
 693         }
 694 }
 695
 696
 697 /*
 698  * ltt_reserve_switch_old_subbuf: switch old subbuffer
 699  *
 700  * Concurrency safe because we are the last and only thread to alter this
 701  * sub-buffer. As long as it is not delivered and read, no other thread can
 702  * alter the offset, alter the reserve_count or call the
 703  * client_buffer_end_callback on this sub-buffer.
 704  *
 705  * The only remaining threads could be the ones with pending commits. They will
 706  * have to do the deliver themselves.  Not concurrency safe in overwrite mode.
 707  * We detect corrupted subbuffers with commit and reserve counts. We keep a
 708  * corrupted sub-buffers count and push the readers across these sub-buffers.
 709  *
 710  * Not concurrency safe if a writer is stalled in a subbuffer and another writer
 711  * switches in, finding out it's corrupted.  The result will be than the old
 712  * (uncommited) subbuffer will be declared corrupted, and that the new subbuffer
 713  * will be declared corrupted too because of the commit count adjustment.
 714  *
 715  * Note : offset_old should never be 0 here.
 716  */
 717 static void ltt_reserve_switch_old_subbuf(
 718                 struct ust_channel *chan, struct ust_buffer *buf,
 719                 struct ltt_reserve_switch_offsets *offsets, u64 *tsc)
 720 {
 721         long oldidx = SUBBUF_INDEX(offsets->old - 1, chan);
 722         long commit_count, padding_size;
 723
 724         padding_size = chan->subbuf_size
 725                         - (SUBBUF_OFFSET(offsets->old - 1, chan) + 1);
 726         ltt_buffer_end(buf, *tsc, offsets->old, oldidx);
 727
 728         /*
 729          * Must write slot data before incrementing commit count.
 730          * This compiler barrier is upgraded into a cmm_smp_wmb() by the IPI
 731          * sent by get_subbuf() when it does its cmm_smp_rmb().
 732          */
 733         cmm_smp_wmb();
 734         uatomic_add(&buf->commit_count[oldidx].cc, padding_size);
 735         commit_count = uatomic_read(&buf->commit_count[oldidx].cc);
 736         ltt_check_deliver(chan, buf, offsets->old - 1, commit_count, oldidx);
 737         ltt_write_commit_counter(chan, buf, oldidx,
 738                 offsets->old, commit_count, padding_size);
 739 }
 740
 741 /*
 742  * ltt_reserve_switch_new_subbuf: Populate new subbuffer.
 743  *
 744  * This code can be executed unordered : writers may already have written to the
 745  * sub-buffer before this code gets executed, caution.  The commit makes sure
 746  * that this code is executed before the deliver of this sub-buffer.
 747  */
 748 static void ltt_reserve_switch_new_subbuf(
 749                 struct ust_channel *chan, struct ust_buffer *buf,
 750                 struct ltt_reserve_switch_offsets *offsets, u64 *tsc)
 751 {
 752         long beginidx = SUBBUF_INDEX(offsets->begin, chan);
 753         long commit_count;
 754
 755         ltt_buffer_begin(buf, *tsc, beginidx);
 756
 757         /*
 758          * Must write slot data before incrementing commit count.
 759          * This compiler barrier is upgraded into a cmm_smp_wmb() by the IPI
 760          * sent by get_subbuf() when it does its cmm_smp_rmb().
 761          */
 762         cmm_smp_wmb();
 763         uatomic_add(&buf->commit_count[beginidx].cc, ltt_subbuffer_header_size());
 764         commit_count = uatomic_read(&buf->commit_count[beginidx].cc);
 765         /* Check if the written buffer has to be delivered */
 766         ltt_check_deliver(chan, buf, offsets->begin, commit_count, beginidx);
 767         ltt_write_commit_counter(chan, buf, beginidx,
 768                 offsets->begin, commit_count, ltt_subbuffer_header_size());
 769 }
 770
 771 /*
 772  * ltt_reserve_end_switch_current: finish switching current subbuffer
 773  *
 774  * Concurrency safe because we are the last and only thread to alter this
 775  * sub-buffer. As long as it is not delivered and read, no other thread can
 776  * alter the offset, alter the reserve_count or call the
 777  * client_buffer_end_callback on this sub-buffer.
 778  *
 779  * The only remaining threads could be the ones with pending commits. They will
 780  * have to do the deliver themselves.  Not concurrency safe in overwrite mode.
 781  * We detect corrupted subbuffers with commit and reserve counts. We keep a
 782  * corrupted sub-buffers count and push the readers across these sub-buffers.
 783  *
 784  * Not concurrency safe if a writer is stalled in a subbuffer and another writer
 785  * switches in, finding out it's corrupted.  The result will be than the old
 786  * (uncommited) subbuffer will be declared corrupted, and that the new subbuffer
 787  * will be declared corrupted too because of the commit count adjustment.
 788  */
 789 static void ltt_reserve_end_switch_current(
 790                 struct ust_channel *chan,
 791                 struct ust_buffer *buf,
 792                 struct ltt_reserve_switch_offsets *offsets, u64 *tsc)
 793 {
 794         long endidx = SUBBUF_INDEX(offsets->end - 1, chan);
 795         long commit_count, padding_size;
 796
 797         padding_size = chan->subbuf_size
 798                         - (SUBBUF_OFFSET(offsets->end - 1, chan) + 1);
 799
 800         ltt_buffer_end(buf, *tsc, offsets->end, endidx);
 801
 802         /*
 803          * Must write slot data before incrementing commit count.
 804          * This compiler barrier is upgraded into a cmm_smp_wmb() by the IPI
 805          * sent by get_subbuf() when it does its cmm_smp_rmb().
 806          */
 807         cmm_smp_wmb();
 808         uatomic_add(&buf->commit_count[endidx].cc, padding_size);
 809         commit_count = uatomic_read(&buf->commit_count[endidx].cc);
 810         ltt_check_deliver(chan, buf,
 811                 offsets->end - 1, commit_count, endidx);
 812         ltt_write_commit_counter(chan, buf, endidx,
 813                 offsets->end, commit_count, padding_size);
 814 }
 815
 816 /*
 817  * Returns :
 818  * 0 if ok
 819  * !0 if execution must be aborted.
 820  */
 821 static int ltt_relay_try_switch_slow(
 822                 enum force_switch_mode mode,
 823                 struct ust_channel *chan,
 824                 struct ust_buffer *buf,
 825                 struct ltt_reserve_switch_offsets *offsets,
 826                 u64 *tsc)
 827 {
 828         long subbuf_index;
 829         long reserve_commit_diff;
 830
 831         offsets->begin = uatomic_read(&buf->offset);
 832         offsets->old = offsets->begin;
 833         offsets->begin_switch = 0;
 834         offsets->end_switch_old = 0;
 835
 836         *tsc = trace_clock_read64();
 837
 838         if (SUBBUF_OFFSET(offsets->begin, buf->chan) != 0) {
 839                 offsets->begin = SUBBUF_ALIGN(offsets->begin, buf->chan);
 840                 offsets->end_switch_old = 1;
 841         } else {
 842                 /* we do not have to switch : buffer is empty */
 843                 return -1;
 844         }
 845         if (mode == FORCE_ACTIVE)
 846                 offsets->begin += ltt_subbuffer_header_size();
 847         /*
 848          * Always begin_switch in FORCE_ACTIVE mode.
 849          * Test new buffer integrity
 850          */
 851         subbuf_index = SUBBUF_INDEX(offsets->begin, buf->chan);
 852         reserve_commit_diff =
 853                 (BUFFER_TRUNC(offsets->begin, buf->chan)
 854                  >> chan->n_subbufs_order)
 855                 - (uatomic_read(&buf->commit_count[subbuf_index].cc_sb)
 856                         & chan->commit_count_mask);
 857         if (reserve_commit_diff == 0) {
 858                 /* Next buffer not corrupted. */
 859                 if (mode == FORCE_ACTIVE
 860                     && !chan->overwrite
 861                     && offsets->begin - uatomic_read(&buf->consumed)
 862                        >= chan->alloc_size) {
 863                         /*
 864                          * We do not overwrite non consumed buffers and we are
 865                          * full : ignore switch while tracing is active.
 866                          */
 867                         return -1;
 868                 }
 869         } else {
 870                 /*
 871                  * Next subbuffer corrupted. Force pushing reader even in normal
 872                  * mode
 873                  */
 874         }
 875         offsets->end = offsets->begin;
 876         return 0;
 877 }
 878
 879 /*
 880  * Force a sub-buffer switch for a per-cpu buffer. This operation is
 881  * completely reentrant : can be called while tracing is active with
 882  * absolutely no lock held.
 883  */
 884 void ltt_force_switch_lockless_slow(struct ust_buffer *buf,
 885                 enum force_switch_mode mode)
 886 {
 887         struct ust_channel *chan = buf->chan;
 888         struct ltt_reserve_switch_offsets offsets;
 889         u64 tsc;
 890
 891         offsets.size = 0;
 892
 893         DBG("Switching (forced) %s_%d", chan->channel_name, buf->cpu);
 894         /*
 895          * Perform retryable operations.
 896          */
 897         do {
 898                 if (ltt_relay_try_switch_slow(mode, chan, buf,
 899                                 &offsets, &tsc))
 900                         return;
 901         } while (uatomic_cmpxchg(&buf->offset, offsets.old,
 902                         offsets.end) != offsets.old);
 903
 904         /*
 905          * Atomically update last_tsc. This update races against concurrent
 906          * atomic updates, but the race will always cause supplementary full TSC
 907          * events, never the opposite (missing a full TSC event when it would be
 908          * needed).
 909          */
 910         save_last_tsc(buf, tsc);
 911
 912         /*
 913          * Push the reader if necessary
 914          */
 915         if (mode == FORCE_ACTIVE) {
 916                 ltt_reserve_push_reader(chan, buf, offsets.end - 1);
 917 //ust//         ltt_clear_noref_flag(chan, buf, SUBBUF_INDEX(offsets.end - 1, chan));
 918         }
 919
 920         /*
 921          * Switch old subbuffer if needed.
 922          */
 923         if (offsets.end_switch_old) {
 924 //ust//         ltt_clear_noref_flag(rchan, buf, SUBBUF_INDEX(offsets.old - 1, rchan));
 925                 ltt_reserve_switch_old_subbuf(chan, buf, &offsets, &tsc);
 926         }
 927
 928         /*
 929          * Populate new subbuffer.
 930          */
 931         if (mode == FORCE_ACTIVE)
 932                 ltt_reserve_switch_new_subbuf(chan, buf, &offsets, &tsc);
 933 }
 934
 935 /*
 936  * Returns :
 937  * 0 if ok
 938  * !0 if execution must be aborted.
 939  */
 940 static int ltt_relay_try_reserve_slow(struct ust_channel *chan, struct ust_buffer *buf,
 941                 struct ltt_reserve_switch_offsets *offsets, size_t data_size,
 942                 u64 *tsc, unsigned int *rflags, int largest_align)
 943 {
 944         long reserve_commit_diff;
 945
 946         offsets->begin = uatomic_read(&buf->offset);
 947         offsets->old = offsets->begin;
 948         offsets->begin_switch = 0;
 949         offsets->end_switch_current = 0;
 950         offsets->end_switch_old = 0;
 951
 952         *tsc = trace_clock_read64();
 953         if (last_tsc_overflow(buf, *tsc))
 954                 *rflags = LTT_RFLAG_ID_SIZE_TSC;
 955
 956         if (unlikely(SUBBUF_OFFSET(offsets->begin, buf->chan) == 0)) {
 957                 offsets->begin_switch = 1;              /* For offsets->begin */
 958         } else {
 959                 offsets->size = ust_get_header_size(chan,
 960                                         offsets->begin, data_size,
 961                                         &offsets->before_hdr_pad, *rflags);
 962                 offsets->size += ltt_align(offsets->begin + offsets->size,
 963                                            largest_align)
 964                                  + data_size;
 965                 if (unlikely((SUBBUF_OFFSET(offsets->begin, buf->chan) +
 966                              offsets->size) > buf->chan->subbuf_size)) {
 967                         offsets->end_switch_old = 1;    /* For offsets->old */
 968                         offsets->begin_switch = 1;      /* For offsets->begin */
 969                 }
 970         }
 971         if (unlikely(offsets->begin_switch)) {
 972                 long subbuf_index;
 973
 974                 /*
 975                  * We are typically not filling the previous buffer completely.
 976                  */
 977                 if (likely(offsets->end_switch_old))
 978                         offsets->begin = SUBBUF_ALIGN(offsets->begin,
 979                                                       buf->chan);
 980                 offsets->begin = offsets->begin + ltt_subbuffer_header_size();
 981                 /* Test new buffer integrity */
 982                 subbuf_index = SUBBUF_INDEX(offsets->begin, buf->chan);
 983                 reserve_commit_diff =
 984                   (BUFFER_TRUNC(offsets->begin, buf->chan)
 985                    >> chan->n_subbufs_order)
 986                   - (uatomic_read(&buf->commit_count[subbuf_index].cc_sb)
 987                                 & chan->commit_count_mask);
 988                 if (likely(reserve_commit_diff == 0)) {
 989                         /* Next buffer not corrupted. */
 990                         if (unlikely(!chan->overwrite &&
 991                                 (SUBBUF_TRUNC(offsets->begin, buf->chan)
 992                                  - SUBBUF_TRUNC(uatomic_read(
 993                                                         &buf->consumed),
 994                                                 buf->chan))
 995                                 >= chan->alloc_size)) {
 996                                 /*
 997                                  * We do not overwrite non consumed buffers
 998                                  * and we are full : event is lost.
 999                                  */
1000                                 uatomic_inc(&buf->events_lost);
1001                                 return -1;
1002                         } else {
1003                                 /*
1004                                  * next buffer not corrupted, we are either in
1005                                  * overwrite mode or the buffer is not full.
1006                                  * It's safe to write in this new subbuffer.
1007                                  */
1008                         }
1009                 } else {
1010                         /*
1011                          * Next subbuffer corrupted. Drop event in normal and
1012                          * overwrite mode. Caused by either a writer OOPS or
1013                          * too many nested writes over a reserve/commit pair.
1014                          */
1015                         uatomic_inc(&buf->events_lost);
1016                         return -1;
1017                 }
1018                 offsets->size = ust_get_header_size(chan,
1019                                         offsets->begin, data_size,
1020                                         &offsets->before_hdr_pad, *rflags);
1021                 offsets->size += ltt_align(offsets->begin + offsets->size,
1022                                            largest_align)
1023                                  + data_size;
1024                 if (unlikely((SUBBUF_OFFSET(offsets->begin, buf->chan)
1025                              + offsets->size) > buf->chan->subbuf_size)) {
1026                         /*
1027                          * Event too big for subbuffers, report error, don't
1028                          * complete the sub-buffer switch.
1029                          */
1030                         uatomic_inc(&buf->events_lost);
1031                         return -1;
1032                 } else {
1033                         /*
1034                          * We just made a successful buffer switch and the event
1035                          * fits in the new subbuffer. Let's write.
1036                          */
1037                 }
1038         } else {
1039                 /*
1040                  * Event fits in the current buffer and we are not on a switch
1041                  * boundary. It's safe to write.
1042                  */
1043         }
1044         offsets->end = offsets->begin + offsets->size;
1045
1046         if (unlikely((SUBBUF_OFFSET(offsets->end, buf->chan)) == 0)) {
1047                 /*
1048                  * The offset_end will fall at the very beginning of the next
1049                  * subbuffer.
1050                  */
1051                 offsets->end_switch_current = 1;        /* For offsets->begin */
1052         }
1053         return 0;
1054 }
1055
1056 /**
1057  * ltt_relay_reserve_slot_lockless_slow - Atomic slot reservation in a buffer.
1058  * @trace: the trace structure to log to.
1059  * @ltt_channel: channel structure
1060  * @transport_data: data structure specific to ltt relay
1061  * @data_size: size of the variable length data to log.
1062  * @slot_size: pointer to total size of the slot (out)
1063  * @buf_offset : pointer to reserved buffer offset (out)
1064  * @tsc: pointer to the tsc at the slot reservation (out)
1065  * @cpu: cpuid
1066  *
1067  * Return : -ENOSPC if not enough space, else returns 0.
1068  * It will take care of sub-buffer switching.
1069  */
1070 int ltt_reserve_slot_lockless_slow(struct ust_channel *chan,
1071                 struct ust_trace *trace, size_t data_size,
1072                 int largest_align, int cpu,
1073                 struct ust_buffer **ret_buf,
1074                 size_t *slot_size, long *buf_offset,
1075                 u64 *tsc, unsigned int *rflags)
1076 {
1077         struct ust_buffer *buf = *ret_buf = chan->buf[cpu];
1078         struct ltt_reserve_switch_offsets offsets;
1079
1080         offsets.size = 0;
1081
1082         do {
1083                 if (unlikely(ltt_relay_try_reserve_slow(chan, buf, &offsets,
1084                                 data_size, tsc, rflags, largest_align)))
1085                         return -ENOSPC;
1086         } while (unlikely(uatomic_cmpxchg(&buf->offset, offsets.old,
1087                         offsets.end) != offsets.old));
1088
1089         /*
1090          * Atomically update last_tsc. This update races against concurrent
1091          * atomic updates, but the race will always cause supplementary full TSC
1092          * events, never the opposite (missing a full TSC event when it would be
1093          * needed).
1094          */
1095         save_last_tsc(buf, *tsc);
1096
1097         /*
1098          * Push the reader if necessary
1099          */
1100         ltt_reserve_push_reader(chan, buf, offsets.end - 1);
1101
1102         /*
1103          * Clear noref flag for this subbuffer.
1104          */
1105 //ust// ltt_clear_noref_flag(chan, buf, SUBBUF_INDEX(offsets.end - 1, chan));
1106
1107         /*
1108          * Switch old subbuffer if needed.
1109          */
1110         if (unlikely(offsets.end_switch_old)) {
1111 //ust//         ltt_clear_noref_flag(chan, buf, SUBBUF_INDEX(offsets.old - 1, chan));
1112                 ltt_reserve_switch_old_subbuf(chan, buf, &offsets, tsc);
1113                 DBG("Switching %s_%d", chan->channel_name, cpu);
1114         }
1115
1116         /*
1117          * Populate new subbuffer.
1118          */
1119         if (unlikely(offsets.begin_switch))
1120                 ltt_reserve_switch_new_subbuf(chan, buf, &offsets, tsc);
1121
1122         if (unlikely(offsets.end_switch_current))
1123                 ltt_reserve_end_switch_current(chan, buf, &offsets, tsc);
1124
1125         *slot_size = offsets.size;
1126         *buf_offset = offsets.begin + offsets.before_hdr_pad;
1127         return 0;
1128 }
1129
1130 static struct ltt_transport ust_relay_transport = {
1131         .name = "ustrelay",
1132         .ops = {
1133                 .create_channel = create_channel,
1134                 .finish_channel = finish_channel,
1135                 .remove_channel = remove_channel,
1136                 .wakeup_channel = ltt_relay_async_wakeup_chan,
1137         },
1138 };
1139
1140 static char initialized = 0;
1141
1142 void __attribute__((constructor)) init_ustrelay_transport(void)
1143 {
1144         if(!initialized) {
1145                 ltt_transport_register(&ust_relay_transport);
1146                 initialized = 1;
1147         }
1148 }
1149
1150 static void __attribute__((destructor)) ust_buffers_exit(void)
1151 {
1152         ltt_transport_unregister(&ust_relay_transport);
1153 }
1154
1155 size_t ltt_write_event_header_slow(struct ust_channel *channel,
1156                 struct ust_buffer *buf, long buf_offset,
1157                 u16 eID, u32 event_size,
1158                 u64 tsc, unsigned int rflags)
1159 {
1160         struct ltt_event_header header;
1161         u16 small_size;
1162
1163         switch (rflags) {
1164         case LTT_RFLAG_ID_SIZE_TSC:
1165                 header.id_time = 29 << LTT_TSC_BITS;
1166                 break;
1167         case LTT_RFLAG_ID_SIZE:
1168                 header.id_time = 30 << LTT_TSC_BITS;
1169                 break;
1170         case LTT_RFLAG_ID:
1171                 header.id_time = 31 << LTT_TSC_BITS;
1172                 break;
1173         default:
1174                 WARN_ON_ONCE(1);
1175                 header.id_time = 0;
1176                 break;
1177         }
1178
1179         header.id_time |= (u32)tsc & LTT_TSC_MASK;
1180         ust_buffers_write(buf, buf_offset, &header, sizeof(header));
1181         buf_offset += sizeof(header);
1182
1183         switch (rflags) {
1184         case LTT_RFLAG_ID_SIZE_TSC:
1185                 small_size = (u16)min_t(u32, event_size, LTT_MAX_SMALL_SIZE);
1186                 ust_buffers_write(buf, buf_offset,
1187                         &eID, sizeof(u16));
1188                 buf_offset += sizeof(u16);
1189                 ust_buffers_write(buf, buf_offset,
1190                         &small_size, sizeof(u16));
1191                 buf_offset += sizeof(u16);
1192                 if (small_size == LTT_MAX_SMALL_SIZE) {
1193                         ust_buffers_write(buf, buf_offset,
1194                                 &event_size, sizeof(u32));
1195                         buf_offset += sizeof(u32);
1196                 }
1197                 buf_offset += ltt_align(buf_offset, sizeof(u64));
1198                 ust_buffers_write(buf, buf_offset,
1199                         &tsc, sizeof(u64));
1200                 buf_offset += sizeof(u64);
1201                 break;
1202         case LTT_RFLAG_ID_SIZE:
1203                 small_size = (u16)min_t(u32, event_size, LTT_MAX_SMALL_SIZE);
1204                 ust_buffers_write(buf, buf_offset,
1205                         &eID, sizeof(u16));
1206                 buf_offset += sizeof(u16);
1207                 ust_buffers_write(buf, buf_offset,
1208                         &small_size, sizeof(u16));
1209                 buf_offset += sizeof(u16);
1210                 if (small_size == LTT_MAX_SMALL_SIZE) {
1211                         ust_buffers_write(buf, buf_offset,
1212                                 &event_size, sizeof(u32));
1213                         buf_offset += sizeof(u32);
1214                 }
1215                 break;
1216         case LTT_RFLAG_ID:
1217                 ust_buffers_write(buf, buf_offset,
1218                         &eID, sizeof(u16));
1219                 buf_offset += sizeof(u16);
1220                 break;
1221         }
1222
1223         return buf_offset;
1224 }