lttng-modules v0.19-stable: setup_trace_write: Fix recursive locking
[lttng-modules.git] / ltt-relay-lockless.h
#ifndef _LTT_LTT_RELAY_LOCKLESS_H
#define _LTT_LTT_RELAY_LOCKLESS_H

/*
 * ltt/ltt-relay-lockless.h
 *
 * (C) Copyright 2005-2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
 *
 * LTTng lockless buffer space management (reader/writer).
 *
 * Author:
 *	Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
 *
 * Inspired from LTT:
 *	Karim Yaghmour (karim@opersys.com)
 *	Tom Zanussi (zanussi@us.ibm.com)
 *	Bob Wisniewski (bob@watson.ibm.com)
 * And from K42:
 *	Bob Wisniewski (bob@watson.ibm.com)
 *
 * Changelog:
 *	08/10/08, Cleanup.
 *	19/10/05, Complete lockless mechanism.
 *	27/05/05, Modular redesign and rewrite.
 *
 * Userspace reader semantics:
 *	while (poll fd != POLLHUP) {
 *		- ioctl RELAY_GET_SUBBUF_SIZE
 *		while (1) {
 *			- ioctl GET_SUBBUF
 *			- splice 1 subbuffer worth of data to a pipe
 *			- splice the data from pipe to disk/network
 *			- ioctl PUT_SUBBUF, check error value
 *			  if err val < 0, previous subbuffer was corrupted.
 *		}
 *	}
 *
 * Dual LGPL v2.1/GPL v2 license.
 */
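
/*
 * Illustrative expansion of the reader loop above (a sketch only, not part of
 * the API described by this header): the ioctl request names follow the
 * comment; the fd/pipe setup, argument types and error handling are assumed.
 *
 *	while (poll(&pfd, 1, -1) >= 0 && !(pfd.revents & POLLHUP)) {
 *		ioctl(buf_fd, RELAY_GET_SUBBUF_SIZE, &sb_size);
 *		for (;;) {
 *			if (ioctl(buf_fd, GET_SUBBUF, &consumed) < 0)
 *				break;		(no sub-buffer ready yet)
 *			splice(buf_fd, NULL, pipe_w, NULL, sb_size, 0);
 *			splice(pipe_r, NULL, out_fd, NULL, sb_size, 0);
 *			if (ioctl(buf_fd, PUT_SUBBUF, &consumed) < 0)
 *				warn();		(previous sub-buffer corrupted)
 *		}
 *	}
 */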

#include <linux/cache.h>
#include <linux/time.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/fs.h>
#include <linux/smp_lock.h>
#include <linux/debugfs.h>
#include <linux/stat.h>
#include <linux/cpu.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <asm/atomic.h>
#include <asm/local.h>

#include "ltt-tracer.h"
#include "ltt-relay.h"

#if 0
#define printk_dbg(fmt, args...) printk(fmt, args)
#else
#define printk_dbg(fmt, args...)
#endif

struct commit_counters {
        local_t cc;
        local_t cc_sb;                  /* Incremented _once_ at sb switch */
        local_t events;                 /* Event count */
};

/* LTTng lockless logging buffer info */
struct ltt_chanbuf {
        struct ltt_chanbuf_alloc a;     /* Parent. First field. */
        /* First 32 bytes cache-hot cacheline */
        local_t offset;                 /* Current offset in the buffer */
        struct commit_counters *commit_count;
                                        /* Commit count per sub-buffer */
        atomic_long_t consumed;         /*
                                         * Current offset in the buffer
                                         * standard atomic access (shared)
                                         */
        unsigned long last_tsc;         /*
                                         * Last timestamp written in the buffer.
                                         */
        /* End of first 32 bytes cacheline */
#ifdef LTT_VMCORE
        local_t *commit_seq;            /* Consecutive commits */
#endif
        atomic_long_t active_readers;   /*
                                         * Active readers count
                                         * standard atomic access (shared)
                                         */
        local_t events_lost;
        local_t corrupted_subbuffers;
        spinlock_t full_lock;           /*
                                         * buffer full condition spinlock, only
                                         * for userspace tracing blocking mode
                                         * synchronization with reader.
                                         */
        wait_queue_head_t write_wait;   /*
                                         * Wait queue for blocking user space
                                         * writers
                                         */
        wait_queue_head_t read_wait;    /* reader wait queue */
        unsigned int finalized;         /* buffer has been finalized */
        struct timer_list switch_timer; /* timer for periodical switch */
};

/*
 * A switch is done during tracing or as a final flush after tracing (so it
 * won't write in the new sub-buffer).
 */
enum force_switch_mode { FORCE_ACTIVE, FORCE_FLUSH };

extern
int ltt_reserve_slot_lockless_slow(struct ltt_chan *chan,
                                   struct ltt_trace *trace, size_t data_size,
                                   int largest_align, int cpu,
                                   struct ltt_chanbuf **ret_buf,
                                   size_t *slot_size, long *buf_offset,
                                   u64 *tsc, unsigned int *rflags);

extern void ltt_force_switch_lockless_slow(struct ltt_chanbuf *buf,
                                           enum force_switch_mode mode);

/*
 * Last TSC comparison functions. Check if the current TSC overflows
 * LTT_TSC_BITS bits from the last TSC read. Reads and writes last_tsc
 * atomically.
 */

#if (BITS_PER_LONG == 32)
static __inline__ void save_last_tsc(struct ltt_chanbuf *buf, u64 tsc)
{
        buf->last_tsc = (unsigned long)(tsc >> LTT_TSC_BITS);
}

static __inline__ int last_tsc_overflow(struct ltt_chanbuf *buf, u64 tsc)
{
        unsigned long tsc_shifted = (unsigned long)(tsc >> LTT_TSC_BITS);

        if (unlikely((tsc_shifted - buf->last_tsc)))
                return 1;
        else
                return 0;
}
#else
static __inline__ void save_last_tsc(struct ltt_chanbuf *buf, u64 tsc)
{
        buf->last_tsc = (unsigned long)tsc;
}

static __inline__ int last_tsc_overflow(struct ltt_chanbuf *buf, u64 tsc)
{
        if (unlikely((tsc - buf->last_tsc) >> LTT_TSC_BITS))
                return 1;
        else
                return 0;
}
#endif
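
/*
 * Worked example (illustrative only; the exact LTT_TSC_BITS value comes from
 * ltt-tracer.h, e.g. 27): compact event headers only carry the low
 * LTT_TSC_BITS bits of the timestamp.  On 64-bit, last_tsc_overflow() checks
 * whether the delta since the last recorded TSC still fits in LTT_TSC_BITS
 * bits; on 32-bit it compares the bits above LTT_TSC_BITS directly.  As soon
 * as the check fires, the caller sets LTT_RFLAG_ID_SIZE_TSC (see
 * ltt_relay_try_reserve() below) so the event is written with a full 64-bit
 * timestamp and the reader can resynchronize its clock.
 */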

extern
int ltt_chanbuf_create(struct ltt_chanbuf *buf, struct ltt_chan_alloc *chana,
                       int cpu);
extern void ltt_chanbuf_free(struct ltt_chanbuf *buf);
extern int ltt_chan_create(const char *base_filename, struct ltt_chan *chan,
                           struct dentry *parent, size_t sb_size, size_t n_sb,
                           int overwrite, struct ltt_trace *trace);
extern void ltt_chan_free(struct kref *kref);
extern void ltt_chan_remove_files(struct ltt_chan *chan);

/* Buffer access operations */

extern int ltt_chanbuf_open_read(struct ltt_chanbuf *buf);
extern void ltt_chanbuf_release_read(struct ltt_chanbuf *buf);
extern int ltt_chanbuf_get_subbuf(struct ltt_chanbuf *buf,
                                  unsigned long *consumed);
extern int ltt_chanbuf_put_subbuf(struct ltt_chanbuf *buf,
                                  unsigned long consumed);
extern void ltt_chan_start_switch_timer(struct ltt_chan *chan);
extern void ltt_chan_stop_switch_timer(struct ltt_chan *chan);

extern int ltt_relay_init(void);
extern void ltt_relay_exit(void);

static __inline__
unsigned long ltt_chanbuf_get_offset(struct ltt_chanbuf *buf)
{
        return local_read(&buf->offset);
}

static __inline__
unsigned long ltt_chanbuf_get_consumed(struct ltt_chanbuf *buf)
{
        return atomic_long_read(&buf->consumed);
}

static __inline__
int ltt_chanbuf_is_finalized(struct ltt_chanbuf *buf)
{
        return buf->finalized;
}

static __inline__
void ltt_reserve_push_reader(struct ltt_chanbuf *buf, struct ltt_chan *chan,
                             long offset)
{
        long consumed_old, consumed_new;

        do {
                consumed_old = atomic_long_read(&buf->consumed);
                /*
                 * If the buffer is in overwrite mode, push the reader consumed
                 * count if the write position has reached it and we are not
                 * at the first iteration (don't push the reader farther than
                 * the writer). This operation can be done concurrently by many
                 * writers in the same buffer; the writer at the farthest write
                 * position sub-buffer index in the buffer is the one whose
                 * update will win this loop.
                 * If the buffer is not in overwrite mode, pushing the reader
                 * only happens if a sub-buffer is corrupted.
                 */
                if (unlikely((SUBBUF_TRUNC(offset, chan)
                              - SUBBUF_TRUNC(consumed_old, chan))
                             >= chan->a.buf_size))
                        consumed_new = SUBBUF_ALIGN(consumed_old, chan);
                else
                        return;
        } while (unlikely(atomic_long_cmpxchg(&buf->consumed, consumed_old,
                                              consumed_new) != consumed_old));
}
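
/*
 * Worked example of the push condition above (numbers for illustration,
 * assuming the usual power-of-two sub-buffer macros): with 4 sub-buffers of
 * 4096 bytes (buf_size == 16384) and free-running offsets, a write position
 * landing in the sub-buffer starting at offset 20480 while the reader is
 * still at consumed == 4096 gives SUBBUF_TRUNC(20480) - SUBBUF_TRUNC(4096)
 * == 16384 >= buf_size: the writer has wrapped around onto the reader's
 * sub-buffer, so consumed is pushed to the next boundary,
 * SUBBUF_ALIGN(4096) == 8192.
 */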

#ifdef LTT_VMCORE
static __inline__
void ltt_vmcore_check_deliver(struct ltt_chanbuf *buf, long commit_count,
                              long idx)
{
        local_set(&buf->commit_seq[idx], commit_count);
}
#else
static __inline__
void ltt_vmcore_check_deliver(struct ltt_chanbuf *buf, long commit_count,
                              long idx)
{
}
#endif

static __inline__
void ltt_check_deliver(struct ltt_chanbuf *buf, struct ltt_chan *chan,
                       long offset, long commit_count, long idx)
{
        long old_commit_count = commit_count - chan->a.sb_size;

        /* Check if all commits have been done */
        if (unlikely((BUFFER_TRUNC(offset, chan) >> chan->a.n_sb_order)
                     - (old_commit_count & chan->commit_count_mask) == 0)) {
                /*
                 * If we succeed in updating cc_sb, we are delivering the
                 * sub-buffer. This deals with concurrent updates of the "cc"
                 * value without adding an add_return atomic operation to the
                 * fast path.
                 */
                if (likely(local_cmpxchg(&buf->commit_count[idx].cc_sb,
                                         old_commit_count, commit_count)
                           == old_commit_count)) {
                        /*
                         * Set noref flag for this subbuffer.
                         */
                        ltt_set_noref_flag(&buf->a, idx);
                        ltt_vmcore_check_deliver(buf, commit_count, idx);
                }
        }
}

static __inline__
int ltt_poll_deliver(struct ltt_chanbuf *buf, struct ltt_chan *chan)
{
        long consumed_old, consumed_idx, commit_count, write_offset;

        consumed_old = atomic_long_read(&buf->consumed);
        consumed_idx = SUBBUF_INDEX(consumed_old, chan);
        commit_count = local_read(&buf->commit_count[consumed_idx].cc_sb);
        /*
         * No memory barrier here, since we are only interested
         * in a statistically correct polling result. The next poll will
         * get the data if we are racing. The mb() that ensures correct
         * memory order is in get_subbuf.
         */
        write_offset = local_read(&buf->offset);

        /*
         * Check that the subbuffer we are trying to consume has already been
         * fully committed.
         */
        if (((commit_count - chan->a.sb_size)
             & chan->commit_count_mask)
            - (BUFFER_TRUNC(consumed_old, chan)
               >> chan->a.n_sb_order)
            != 0)
                return 0;

        /*
         * Check that we are not about to read the same subbuffer in
         * which the writer head is.
         */
        if ((SUBBUF_TRUNC(write_offset, chan)
             - SUBBUF_TRUNC(consumed_old, chan))
            == 0)
                return 0;

        return 1;
}

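/*
 * Usage note (illustrative; the actual file operations live in the .c files):
 * the reader-side poll() callback is expected to map this predicate onto
 * POLLIN, along the lines of:
 *
 *	poll_wait(filp, &buf->read_wait, wait);
 *	if (ltt_poll_deliver(buf, chan))
 *		mask |= POLLIN | POLLRDNORM;
 *
 * A false negative only delays readiness until the next poll, which is why no
 * memory barrier is needed in ltt_poll_deliver().
 */
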
static __inline__
u32 get_read_sb_size(struct ltt_chanbuf *buf)
{
        struct ltt_subbuffer_header *header =
                (struct ltt_subbuffer_header *)
                        ltt_relay_read_offset_address(&buf->a, 0);
        return header->sb_size;
}

/*
 * returns 0 if reserve ok, or 1 if the slow path must be taken.
 */
static __inline__
int ltt_relay_try_reserve(struct ltt_chanbuf *buf, struct ltt_chan *chan,
                          size_t data_size, u64 *tsc, unsigned int *rflags,
                          int largest_align, long *o_begin, long *o_end,
                          long *o_old, size_t *before_hdr_pad, size_t *size)
{
        *o_begin = local_read(&buf->offset);
        *o_old = *o_begin;

        *tsc = trace_clock_read64();

#ifdef LTT_VMCORE
        prefetch(&buf->commit_count[SUBBUF_INDEX(*o_begin, chan)]);
        prefetch(&buf->commit_seq[SUBBUF_INDEX(*o_begin, chan)]);
#else
        prefetchw(&buf->commit_count[SUBBUF_INDEX(*o_begin, chan)]);
#endif
        if (last_tsc_overflow(buf, *tsc))
                *rflags = LTT_RFLAG_ID_SIZE_TSC;

        if (unlikely(SUBBUF_OFFSET(*o_begin, chan) == 0))
                return 1;

        *size = ltt_get_header_size(chan, *o_begin, data_size, before_hdr_pad,
                                    *rflags);
        *size += ltt_align(*o_begin + *size, largest_align) + data_size;
        if (unlikely((SUBBUF_OFFSET(*o_begin, chan) + *size) > chan->a.sb_size))
                return 1;

        /*
         * Event fits in the current buffer and we are not on a switch
         * boundary. It's safe to write.
         */
        *o_end = *o_begin + *size;

        if (unlikely((SUBBUF_OFFSET(*o_end, chan)) == 0))
                /*
                 * The offset_end will fall at the very beginning of the next
                 * subbuffer.
                 */
                return 1;

        return 0;
}

static __inline__
int ltt_reserve_slot(struct ltt_chan *chan,
                     struct ltt_trace *trace, size_t data_size,
                     int largest_align, int cpu,
                     struct ltt_chanbuf **ret_buf,
                     size_t *slot_size, long *buf_offset, u64 *tsc,
                     unsigned int *rflags)
{
        struct ltt_chanbuf *buf = *ret_buf = per_cpu_ptr(chan->a.buf, cpu);
        long o_begin, o_end, o_old;
        size_t before_hdr_pad;

        /*
         * Perform retryable operations.
         */
        if (unlikely(__get_cpu_var(ltt_nesting) > 4)) {
                local_inc(&buf->events_lost);
                return -EPERM;
        }

        if (unlikely(ltt_relay_try_reserve(buf, chan, data_size, tsc, rflags,
                                           largest_align, &o_begin, &o_end,
                                           &o_old, &before_hdr_pad, slot_size)))
                goto slow_path;

        if (unlikely(local_cmpxchg(&buf->offset, o_old, o_end) != o_old))
                goto slow_path;

        /*
         * Atomically update last_tsc. This update races against concurrent
         * atomic updates, but the race will always cause supplementary full TSC
         * events, never the opposite (missing a full TSC event when it would be
         * needed).
         */
        save_last_tsc(buf, *tsc);

        /*
         * Push the reader if necessary
         */
        ltt_reserve_push_reader(buf, chan, o_end - 1);

        /*
         * Clear noref flag for this subbuffer.
         */
        ltt_clear_noref_flag(&buf->a, SUBBUF_INDEX(o_end - 1, chan));

        *buf_offset = o_begin + before_hdr_pad;
        return 0;
slow_path:
        return ltt_reserve_slot_lockless_slow(chan, trace, data_size,
                                              largest_align, cpu, ret_buf,
                                              slot_size, buf_offset, tsc,
                                              rflags);
}

/*
 * Force a sub-buffer switch for a per-cpu buffer. This operation is
 * completely reentrant: it can be called while tracing is active with
 * absolutely no lock held.
 *
 * Note, however, that as a local_cmpxchg is used for some atomic operations,
 * this function must be called from the CPU which owns the buffer for an
 * ACTIVE flush.
 */
static __inline__
void ltt_force_switch(struct ltt_chanbuf *buf, enum force_switch_mode mode)
{
        return ltt_force_switch_lockless_slow(buf, mode);
}

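/*
 * Usage note (illustrative; the call sites live in the .c files): an ACTIVE
 * switch, e.g. from the periodical switch timer, would be issued on the CPU
 * owning the buffer as ltt_force_switch(buf, FORCE_ACTIVE), while the final
 * flush at trace stop would use ltt_force_switch(buf, FORCE_FLUSH) on each
 * per-cpu buffer so the last, partially filled sub-buffer gets delivered.
 */
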
/*
 * For flight recording. Must be called after relay_commit.
 * This function increments the subbuffer's commit_seq counter each time the
 * commit count reaches back the reserve offset (modulo subbuffer size). It is
 * useful for crash dump.
 */
#ifdef LTT_VMCORE
static __inline__
void ltt_write_commit_counter(struct ltt_chanbuf *buf, struct ltt_chan *chan,
                              long idx, long buf_offset, long commit_count,
                              size_t data_size)
{
        long offset;
        long commit_seq_old;

        offset = buf_offset + data_size;

        /*
         * SUBBUF_OFFSET includes commit_count_mask. We can simply
         * compare the offsets within the subbuffer without caring about
         * buffer full/empty mismatch because offset is never zero here
         * (subbuffer header and event headers have non-zero length).
         */
        if (unlikely(SUBBUF_OFFSET(offset - commit_count, chan)))
                return;

        commit_seq_old = local_read(&buf->commit_seq[idx]);
        while (commit_seq_old < commit_count)
                commit_seq_old = local_cmpxchg(&buf->commit_seq[idx],
                                               commit_seq_old, commit_count);
}
#else
static __inline__
void ltt_write_commit_counter(struct ltt_chanbuf *buf, struct ltt_chan *chan,
                              long idx, long buf_offset, long commit_count,
                              size_t data_size)
{
}
#endif

/*
 * Atomic unordered slot commit. Increments the commit count in the
 * specified sub-buffer, and delivers it if necessary.
 *
 * Parameters:
 *
 * @buf: buffer.
 * @chan: channel.
 * @buf_offset: offset following the event header.
 * @data_size: size of the event data.
 * @slot_size: size of the reserved slot.
 */
static __inline__
void ltt_commit_slot(struct ltt_chanbuf *buf, struct ltt_chan *chan,
                     long buf_offset, size_t data_size, size_t slot_size)
{
        long offset_end = buf_offset;
        long endidx = SUBBUF_INDEX(offset_end - 1, chan);
        long commit_count;

#ifdef LTT_NO_IPI_BARRIER
        smp_wmb();
#else
        /*
         * Must write slot data before incrementing commit count.
         * This compiler barrier is upgraded into a smp_mb() by the IPI
         * sent by get_subbuf().
         */
        barrier();
#endif
        local_add(slot_size, &buf->commit_count[endidx].cc);
        local_inc(&buf->commit_count[endidx].events);
        /*
         * The commit count read can race with concurrent out-of-order commit
         * count updates. This is only needed for ltt_check_deliver (for
         * non-polling delivery only) and for ltt_write_commit_counter. The
         * race can only cause the counter to be read with the same value more
         * than once, which could cause:
         * - Multiple delivery for the same sub-buffer (which is handled
         *   gracefully by the reader code) if the value is for a full
         *   sub-buffer. It's important that we can never miss a sub-buffer
         *   delivery. Re-reading the value after the local_add ensures this.
         * - Reading a commit_count with a higher value than what was actually
         *   added to it for the ltt_write_commit_counter call (again caused by
         *   a concurrent committer). It does not matter, because this function
         *   is interested in the fact that the commit count reaches back the
         *   reserve offset for a specific sub-buffer, which is completely
         *   independent of the order.
         */
        commit_count = local_read(&buf->commit_count[endidx].cc);

        ltt_check_deliver(buf, chan, offset_end - 1, commit_count, endidx);
        /*
         * Update data_size for each commit. It's needed only for extracting
         * ltt buffers from vmcore, after crash.
         */
        ltt_write_commit_counter(buf, chan, endidx, buf_offset,
                                 commit_count, data_size);
}

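/*
 * Hedged usage sketch (illustration only, not part of the original API
 * surface): how a tracer probe fast path is expected to pair
 * ltt_reserve_slot() with ltt_commit_slot().  The write_event_header() and
 * write_event_payload() helpers below are placeholders; the real header and
 * payload serialization helpers live in ltt-tracer.h and the serializer.
 *
 *	struct ltt_chanbuf *buf;
 *	size_t slot_size;
 *	long buf_offset;
 *	u64 tsc;
 *	unsigned int rflags = 0;
 *
 *	if (ltt_reserve_slot(chan, trace, data_size, largest_align, cpu,
 *			     &buf, &slot_size, &buf_offset, &tsc, &rflags) < 0)
 *		return;		(reservation failed; event dropped)
 *
 *	buf_offset = write_event_header(buf, chan, buf_offset, data_size,
 *					tsc, rflags);	(placeholder)
 *	write_event_payload(buf, buf_offset, data, data_size);	(placeholder)
 *
 *	ltt_commit_slot(buf, chan, buf_offset, data_size, slot_size);
 */
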
#endif //_LTT_LTT_RELAY_LOCKLESS_H