#ifndef _LTT_LTT_RELAY_LOCKLESS_H
#define _LTT_LTT_RELAY_LOCKLESS_H

/*
 * ltt/ltt-relay-lockless.h
 *
 * (C) Copyright 2005-2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
 *
 * LTTng lockless buffer space management (reader/writer).
 *
 * Author:
 *	Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
 *
 * Inspired from LTT :
 *  Karim Yaghmour (karim@opersys.com)
 *  Tom Zanussi (zanussi@us.ibm.com)
 *  Bob Wisniewski (bob@watson.ibm.com)
 * And from K42 :
 *  Bob Wisniewski (bob@watson.ibm.com)
 *
 * Changelog:
 *  19/10/05, Complete lockless mechanism.
 *  27/05/05, Modular redesign and rewrite.
 *
 * Userspace reader semantic :
 * while (poll fd != POLLHUP) {
 *   - ioctl RELAY_GET_SUBBUF_SIZE
 *   while (a full subbuffer is available) {
 *     - ioctl GET_SUBBUF, which starts consuming one subbuffer
 *     - splice 1 subbuffer worth of data to a pipe
 *     - splice the data from pipe to disk/network
 *     - ioctl PUT_SUBBUF, check error value
 *       if err val < 0, previous subbuffer was corrupted.
 *   }
 * }
 *
 * Dual LGPL v2.1/GPL v2 license.
 */
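/*
 * Illustrative userspace reader loop matching the semantic above. This is a
 * sketch only: the exact ioctl request names and the file descriptors come
 * from the LTTng userspace ABI, not from this header.
 *
 *	for (;;) {
 *		struct pollfd p = { .fd = buf_fd, .events = POLLIN };
 *
 *		poll(&p, 1, -1);
 *		if (p.revents & POLLHUP)
 *			break;
 *		while (a full subbuffer is available) {
 *			splice(buf_fd, NULL, pipe_fd, NULL, sb_size, 0);
 *			splice(pipe_fd, NULL, out_fd, NULL, sb_size, 0);
 *			if (ioctl(buf_fd, PUT_SUBBUF) < 0)
 *				... previous subbuffer was corrupted ...
 *		}
 *	}
 */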
#include <linux/cache.h>
#include <linux/time.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/smp_lock.h>
#include <linux/debugfs.h>
#include <linux/stat.h>
#include <linux/cpu.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <asm/atomic.h>
#include <asm/local.h>

#include "ltt-tracer.h"
#include "ltt-relay.h"
#if 0
#define printk_dbg(fmt, args...) printk(fmt, args)
#else
#define printk_dbg(fmt, args...)
#endif
struct commit_counters {
	local_t cc;			/* Commit counter */
	local_t cc_sb;			/* Incremented _once_ at sb switch */
	local_t events;			/* Event count */
};
/* LTTng lockless logging buffer info */
struct ltt_chanbuf {
	struct ltt_chanbuf_alloc a;	/* Parent. First field. */
	/* First 32 bytes cache-hot cacheline */
	local_t offset;			/* Current offset in the buffer */
	struct commit_counters *commit_count;
					/* Commit count per sub-buffer */
	atomic_long_t consumed;		/*
					 * Current offset in the buffer
					 * standard atomic access (shared)
					 */
	unsigned long last_tsc;		/*
					 * Last timestamp written in the buffer.
					 */
	/* End of first 32 bytes cacheline */
	local_t *commit_seq;		/* Consecutive commits */
	atomic_long_t active_readers;	/*
					 * Active readers count
					 * standard atomic access (shared)
					 */
	local_t events_lost;
	local_t corrupted_subbuffers;
	spinlock_t full_lock;		/*
					 * buffer full condition spinlock, only
					 * for userspace tracing blocking mode
					 * synchronization with reader.
					 */
	wait_queue_head_t write_wait;	/*
					 * Wait queue for blocking user space
					 * writers
					 */
	wait_queue_head_t read_wait;	/* reader wait queue */
	unsigned int finalized;		/* buffer has been finalized */
	struct timer_list switch_timer;	/* timer for periodical switch */
};
/*
 * A switch is done during tracing or as a final flush after tracing (so it
 * won't write in the new sub-buffer).
 */
enum force_switch_mode { FORCE_ACTIVE, FORCE_FLUSH };
extern int ltt_reserve_slot_lockless_slow(struct ltt_chan *chan,
					  struct ltt_trace *trace,
					  size_t data_size,
					  int largest_align, int cpu,
					  struct ltt_chanbuf **ret_buf,
					  size_t *slot_size, long *buf_offset,
					  u64 *tsc, unsigned int *rflags);

extern void ltt_force_switch_lockless_slow(struct ltt_chanbuf *buf,
					   enum force_switch_mode mode);
/*
 * Last TSC comparison functions. Check if the current TSC overflows
 * LTT_TSC_BITS bits from the last TSC read. Reads and writes last_tsc
 * atomically.
 */

#if (BITS_PER_LONG == 32)
static __inline__
void save_last_tsc(struct ltt_chanbuf *buf, u64 tsc)
{
	buf->last_tsc = (unsigned long)(tsc >> LTT_TSC_BITS);
}
static __inline__
int last_tsc_overflow(struct ltt_chanbuf *buf, u64 tsc)
{
	unsigned long tsc_shifted = (unsigned long)(tsc >> LTT_TSC_BITS);

	if (unlikely((tsc_shifted - buf->last_tsc)))
		return 1;
	else
		return 0;
}
#else
static __inline__
void save_last_tsc(struct ltt_chanbuf *buf, u64 tsc)
{
	buf->last_tsc = (unsigned long)tsc;
}
static __inline__
int last_tsc_overflow(struct ltt_chanbuf *buf, u64 tsc)
{
	if (unlikely((tsc - buf->last_tsc) >> LTT_TSC_BITS))
		return 1;
	else
		return 0;
}
#endif
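/*
 * Worked example (illustrative only; the actual LTT_TSC_BITS value is
 * defined in ltt-tracer.h): assuming LTT_TSC_BITS == 27, a compact event
 * header carries only the low 27 TSC bits. On 64-bit, last_tsc_overflow()
 * fires when the delta since the previous event is >= 2^27 cycles; on
 * 32-bit it fires whenever the 2^27-cycle window changes. A spurious
 * positive merely emits a supplementary full-TSC header
 * (LTT_RFLAG_ID_SIZE_TSC); timestamp precision is never lost.
 */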
extern int ltt_chanbuf_create(struct ltt_chanbuf *buf,
			      struct ltt_chan_alloc *chana,
			      int cpu);
extern void ltt_chanbuf_free(struct ltt_chanbuf *buf);
extern int ltt_chan_create(const char *base_filename, struct ltt_chan *chan,
			   struct dentry *parent, size_t sb_size, size_t n_sb,
			   int overwrite, struct ltt_trace *trace);
extern void ltt_chan_free(struct kref *kref);
extern void ltt_chan_remove_files(struct ltt_chan *chan);
/* Buffer access operations */

extern int ltt_chanbuf_open_read(struct ltt_chanbuf *buf);
extern void ltt_chanbuf_release_read(struct ltt_chanbuf *buf);
extern int ltt_chanbuf_get_subbuf(struct ltt_chanbuf *buf,
				  unsigned long *consumed);
extern int ltt_chanbuf_put_subbuf(struct ltt_chanbuf *buf,
				  unsigned long consumed);
extern void ltt_chan_start_switch_timer(struct ltt_chan *chan);
extern void ltt_chan_stop_switch_timer(struct ltt_chan *chan);

extern int ltt_relay_init(void);
extern void ltt_relay_exit(void);
static __inline__
unsigned long ltt_chanbuf_get_offset(struct ltt_chanbuf *buf)
{
	return local_read(&buf->offset);
}
static __inline__
unsigned long ltt_chanbuf_get_consumed(struct ltt_chanbuf *buf)
{
	return atomic_long_read(&buf->consumed);
}
static __inline__
int ltt_chanbuf_is_finalized(struct ltt_chanbuf *buf)
{
	return buf->finalized;
}
static __inline__
void ltt_reserve_push_reader(struct ltt_chanbuf *buf, struct ltt_chan *chan,
			     long offset)
{
	long consumed_old, consumed_new;

	do {
		consumed_old = atomic_long_read(&buf->consumed);
		/*
		 * If buffer is in overwrite mode, push the reader consumed
		 * count if the write position has reached it and we are not
		 * at the first iteration (don't push the reader farther than
		 * the writer). This operation can be done concurrently by many
		 * writers in the same buffer, the writer being at the farthest
		 * write position sub-buffer index in the buffer being the one
		 * which will win this loop.
		 * If the buffer is not in overwrite mode, pushing the reader
		 * only happens if a sub-buffer is corrupted.
		 */
		if (unlikely((SUBBUF_TRUNC(offset, chan)
			      - SUBBUF_TRUNC(consumed_old, chan))
			     >= chan->a.buf_size))
			consumed_new = SUBBUF_ALIGN(consumed_old, chan);
		else
			return;
	} while (unlikely(atomic_long_cmpxchg(&buf->consumed, consumed_old,
					      consumed_new) != consumed_old));
}
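/*
 * Example (illustrative numbers): with 4 sub-buffers of 4 KiB each
 * (chan->a.buf_size == 16 KiB), a reader whose consumed count still points
 * into sub-buffer 0 while a writer reserves past offset 16 KiB has been
 * overtaken by a full buffer; the cmpxchg loop above then bumps the
 * consumed count to the next sub-buffer boundary so the writer can reuse
 * sub-buffer 0 (overwrite, i.e. flight recorder, mode).
 */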
#ifdef CONFIG_LTT_VMCORE
static __inline__
void ltt_vmcore_check_deliver(struct ltt_chanbuf *buf, long commit_count,
			      long idx)
{
	local_set(&buf->commit_seq[idx], commit_count);
}
#else
static __inline__
void ltt_vmcore_check_deliver(struct ltt_chanbuf *buf, long commit_count,
			      long idx)
{
}
#endif
static __inline__
void ltt_check_deliver(struct ltt_chanbuf *buf, struct ltt_chan *chan,
		       long offset, long commit_count, long idx)
{
	long old_commit_count = commit_count - chan->a.sb_size;

	/* Check if all commits have been done */
	if (unlikely((BUFFER_TRUNC(offset, chan) >> chan->a.n_sb_order)
		     - (old_commit_count & chan->commit_count_mask) == 0)) {
		/*
		 * If we succeeded in updating the cc_sb, we are delivering
		 * the subbuffer. Deals with concurrent updates of the "cc"
		 * value without adding a add_return atomic operation to the
		 * fast path.
		 */
		if (likely(local_cmpxchg(&buf->commit_count[idx].cc_sb,
					 old_commit_count, commit_count)
			   == old_commit_count)) {
			/*
			 * Set noref flag for this subbuffer.
			 */
			ltt_set_noref_flag(&buf->a, idx);
			ltt_vmcore_check_deliver(buf, commit_count, idx);
		}
	}
}
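/*
 * Concurrency note (example): two writers can both read a commit count that
 * completes a sub-buffer, but only the one whose local_cmpxchg() on cc_sb
 * succeeds performs the delivery (noref flag and vmcore bookkeeping), so a
 * sub-buffer is never delivered twice for the same fill generation.
 */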
static __inline__
int ltt_poll_deliver(struct ltt_chanbuf *buf, struct ltt_chan *chan)
{
	long consumed_old, consumed_idx, commit_count, write_offset;

	consumed_old = atomic_long_read(&buf->consumed);
	consumed_idx = SUBBUF_INDEX(consumed_old, chan);
	commit_count = local_read(&buf->commit_count[consumed_idx].cc_sb);
	/*
	 * No memory barrier here, since we are only interested
	 * in a statistically correct polling result. The next poll will
	 * get the data if we are racing. The mb() that ensures correct
	 * memory order is in get_subbuf.
	 */
	write_offset = local_read(&buf->offset);

	/*
	 * Check that the subbuffer we are trying to consume has been
	 * already fully committed.
	 */
	if (((commit_count - chan->a.sb_size)
	     & chan->commit_count_mask)
	    - (BUFFER_TRUNC(consumed_old, chan)
	       >> chan->a.n_sb_order)
	    != 0)
		return 0;

	/*
	 * Check that we are not about to read the same subbuffer in
	 * which the writer head is.
	 */
	if ((SUBBUF_TRUNC(write_offset, chan)
	     - SUBBUF_TRUNC(consumed_old, chan))
	    == 0)
		return 0;

	return 1;
}
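/*
 * Typical use (sketch): a channel file's poll handler reports POLLIN when
 * ltt_poll_deliver() returns 1, letting the userspace reader block in
 * poll() until at least one sub-buffer is fully committed and is not the
 * one the writer head currently occupies.
 */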
static __inline__
u32 get_read_sb_size(struct ltt_chanbuf *buf)
{
	struct ltt_subbuffer_header *header =
		(struct ltt_subbuffer_header *)
			ltt_relay_read_offset_address(&buf->a, 0);
	return header->sb_size;
}
/*
 * returns 0 if reserve ok, or 1 if the slow path must be taken.
 */
static __inline__
int ltt_relay_try_reserve(struct ltt_chanbuf *buf, struct ltt_chan *chan,
			  size_t data_size, u64 *tsc, unsigned int *rflags,
			  int largest_align, long *o_begin, long *o_end,
			  long *o_old, size_t *before_hdr_pad, size_t *size)
{
	*o_begin = local_read(&buf->offset);
	*o_old = *o_begin;

	*tsc = trace_clock_read64();

#ifdef CONFIG_LTT_VMCORE
	prefetch(&buf->commit_count[SUBBUF_INDEX(*o_begin, chan)]);
	prefetch(&buf->commit_seq[SUBBUF_INDEX(*o_begin, chan)]);
#else
	prefetchw(&buf->commit_count[SUBBUF_INDEX(*o_begin, chan)]);
#endif
	if (last_tsc_overflow(buf, *tsc))
		*rflags = LTT_RFLAG_ID_SIZE_TSC;

	if (unlikely(SUBBUF_OFFSET(*o_begin, chan) == 0))
		return 1;

	*size = ltt_get_header_size(chan, *o_begin, data_size, before_hdr_pad,
				    *rflags);
	*size += ltt_align(*o_begin + *size, largest_align) + data_size;
	if (unlikely((SUBBUF_OFFSET(*o_begin, chan) + *size) > chan->a.sb_size))
		return 1;

	/*
	 * Event fits in the current buffer and we are not on a switch
	 * boundary. It's safe to write.
	 */
	*o_end = *o_begin + *size;

	if (unlikely((SUBBUF_OFFSET(*o_end, chan)) == 0))
		/*
		 * The offset_end will fall at the very beginning of the next
		 * subbuffer.
		 */
		return 1;

	return 0;
}
static __inline__
int ltt_reserve_slot(struct ltt_chan *chan,
		     struct ltt_trace *trace, size_t data_size,
		     int largest_align, int cpu,
		     struct ltt_chanbuf **ret_buf,
		     size_t *slot_size, long *buf_offset, u64 *tsc,
		     unsigned int *rflags)
{
	struct ltt_chanbuf *buf = *ret_buf = per_cpu_ptr(chan->a.buf, cpu);
	long o_begin, o_end, o_old;
	size_t before_hdr_pad;

	/*
	 * Perform retryable operations.
	 */
	if (unlikely(__get_cpu_var(ltt_nesting) > 4)) {
		local_inc(&buf->events_lost);
		return -EPERM;
	}

	if (unlikely(ltt_relay_try_reserve(buf, chan, data_size, tsc, rflags,
					   largest_align, &o_begin, &o_end,
					   &o_old, &before_hdr_pad, slot_size)))
		goto slow_path;

	if (unlikely(local_cmpxchg(&buf->offset, o_old, o_end) != o_old))
		goto slow_path;

	/*
	 * Atomically update last_tsc. This update races against concurrent
	 * atomic updates, but the race will always cause supplementary full TSC
	 * events, never the opposite (missing a full TSC event when it would be
	 * needed).
	 */
	save_last_tsc(buf, *tsc);

	/*
	 * Push the reader if necessary
	 */
	ltt_reserve_push_reader(buf, chan, o_end - 1);

	/*
	 * Clear noref flag for this subbuffer.
	 */
	ltt_clear_noref_flag(&buf->a, SUBBUF_INDEX(o_end - 1, chan));

	*buf_offset = o_begin + before_hdr_pad;
	return 0;

slow_path:
	return ltt_reserve_slot_lockless_slow(chan, trace, data_size,
					      largest_align, cpu, ret_buf,
					      slot_size, buf_offset, tsc,
					      rflags);
}
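/*
 * Usage sketch (illustrative only; the event-header helpers live in
 * ltt-tracer.h and are shown schematically here):
 *
 *	struct ltt_chanbuf *buf;
 *	size_t slot_size;
 *	long buf_offset;
 *	u64 tsc;
 *	unsigned int rflags = 0;
 *
 *	if (ltt_reserve_slot(chan, trace, data_size, largest_align,
 *			     smp_processor_id(), &buf, &slot_size,
 *			     &buf_offset, &tsc, &rflags))
 *		return;		event lost (nesting too deep) or slow path failed
 *
 *	... write the event header at buf_offset, advance buf_offset past
 *	the header, copy data_size bytes of payload, then:
 *
 *	ltt_commit_slot(buf, chan, buf_offset, data_size, slot_size);
 */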
/*
 * Force a sub-buffer switch for a per-cpu buffer. This operation is
 * completely reentrant : can be called while tracing is active with
 * absolutely no lock held.
 *
 * Note, however, that as a local_cmpxchg is used for some atomic
 * operations, this function must be called from the CPU which owns the buffer
 * for an ACTIVE flush.
 */
static __inline__
void ltt_force_switch(struct ltt_chanbuf *buf, enum force_switch_mode mode)
{
	return ltt_force_switch_lockless_slow(buf, mode);
}
/*
 * For flight recording. Must be called after relay_commit.
 * This function increments the subbuffer's commit_seq counter each time the
 * commit count reaches back the reserve offset (modulo subbuffer size). It is
 * useful for crash dump.
 */
#ifdef CONFIG_LTT_VMCORE
static __inline__
void ltt_write_commit_counter(struct ltt_chanbuf *buf, struct ltt_chan *chan,
			      long idx, long buf_offset, long commit_count,
			      size_t data_size)
{
	long offset;
	long commit_seq_old;

	offset = buf_offset + data_size;

	/*
	 * SUBBUF_OFFSET includes commit_count_mask. We can simply
	 * compare the offsets within the subbuffer without caring about
	 * buffer full/empty mismatch because offset is never zero here
	 * (subbuffer header and event headers have non-zero length).
	 */
	if (unlikely(SUBBUF_OFFSET(offset - commit_count, chan)))
		return;

	commit_seq_old = local_read(&buf->commit_seq[idx]);
	while (commit_seq_old < commit_count)
		commit_seq_old = local_cmpxchg(&buf->commit_seq[idx],
					       commit_seq_old, commit_count);
}
#else
static __inline__
void ltt_write_commit_counter(struct ltt_chanbuf *buf, struct ltt_chan *chan,
			      long idx, long buf_offset, long commit_count,
			      size_t data_size)
{
}
#endif
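/*
 * Example: two slots in the same sub-buffer may be committed out of order.
 * commit_seq only advances when SUBBUF_OFFSET(offset - commit_count, chan)
 * == 0, i.e. when the commit count has caught back up with the reserve
 * offset, and the cmpxchg loop keeps it monotonic. A crash-dump extractor
 * reading the vmcore therefore sees a consistent "fully committed up to
 * here" watermark per sub-buffer.
 */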
/*
 * Atomic unordered slot commit. Increments the commit count in the
 * specified sub-buffer, and delivers it if necessary.
 *
 * Parameters:
 *
 * @buf : buffer to commit to.
 * @chan : channel owning the buffer.
 * @buf_offset : offset following the event header.
 * @data_size : size of the event data.
 * @slot_size : size of the reserved slot.
 */
static __inline__
void ltt_commit_slot(struct ltt_chanbuf *buf, struct ltt_chan *chan,
		     long buf_offset, size_t data_size, size_t slot_size)
{
	long offset_end = buf_offset;
	long endidx = SUBBUF_INDEX(offset_end - 1, chan);
	long commit_count;

#ifdef LTT_NO_IPI_BARRIER
	smp_wmb();
#else
	/*
	 * Must write slot data before incrementing commit count.
	 * This compiler barrier is upgraded into a smp_mb() by the IPI
	 * sent by get_subbuf().
	 */
	barrier();
#endif
	local_add(slot_size, &buf->commit_count[endidx].cc);
	local_inc(&buf->commit_count[endidx].events);
	/*
	 * commit count read can race with concurrent OOO commit count updates.
	 * This is only needed for ltt_check_deliver (for non-polling delivery
	 * only) and for ltt_write_commit_counter. The race can only cause the
	 * counter to be read with the same value more than once, which could
	 * cause :
	 * - Multiple delivery for the same sub-buffer (which is handled
	 *   gracefully by the reader code) if the value is for a full
	 *   sub-buffer. It's important that we can never miss a sub-buffer
	 *   delivery. Re-reading the value after the local_add ensures this.
	 * - Reading a commit_count with a higher value than what was actually
	 *   added to it for the ltt_write_commit_counter call (again caused by
	 *   a concurrent committer). It does not matter, because this function
	 *   is interested in the fact that the commit count reaches back the
	 *   reserve offset for a specific sub-buffer, which is completely
	 *   independent of the order.
	 */
	commit_count = local_read(&buf->commit_count[endidx].cc);

	ltt_check_deliver(buf, chan, offset_end - 1, commit_count, endidx);

	/*
	 * Update data_size for each commit. It's needed only for extracting
	 * ltt buffers from vmcore, after crash.
	 */
	ltt_write_commit_counter(buf, chan, endidx, buf_offset,
				 commit_count, data_size);
}

#endif /* _LTT_LTT_RELAY_LOCKLESS_H */