LTTng modularization, import of lttng 0.226
author Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Fri, 3 Sep 2010 12:08:18 +0000 (08:08 -0400)
committer Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Fri, 3 Sep 2010 12:08:18 +0000 (08:08 -0400)
Will match kernel tree "lttng 0.227".

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
38 files changed:
Makefile [new file with mode: 0644]
ltt-ascii.c [new file with mode: 0644]
ltt-channels.c [new file with mode: 0644]
ltt-core.c [new file with mode: 0644]
ltt-event-header.c [new file with mode: 0644]
ltt-filter.c [new file with mode: 0644]
ltt-kprobes.c [new file with mode: 0644]
ltt-marker-control.c [new file with mode: 0644]
ltt-relay-alloc.c [new file with mode: 0644]
ltt-relay-lockless.c [new file with mode: 0644]
ltt-relay-lockless.h [new file with mode: 0644]
ltt-relay-splice.c [new file with mode: 0644]
ltt-relay-vfs.c [new file with mode: 0644]
ltt-relay.h [new file with mode: 0644]
ltt-serialize.c [new file with mode: 0644]
ltt-statedump.c [new file with mode: 0644]
ltt-trace-control.c [new file with mode: 0644]
ltt-tracer-core.h [new file with mode: 0644]
ltt-tracer.c [new file with mode: 0644]
ltt-tracer.h [new file with mode: 0644]
ltt-type-serializer.c [new file with mode: 0644]
ltt-type-serializer.h [new file with mode: 0644]
ltt-userspace-event.c [new file with mode: 0644]
probes/Makefile [new file with mode: 0644]
probes/block-trace.c [new file with mode: 0644]
probes/ext4-trace.c [new file with mode: 0644]
probes/fs-trace.c [new file with mode: 0644]
probes/ipc-trace.c [new file with mode: 0644]
probes/jbd2-trace.c [new file with mode: 0644]
probes/kernel-trace.c [new file with mode: 0644]
probes/lockdep-trace.c [new file with mode: 0644]
probes/mm-trace.c [new file with mode: 0644]
probes/net-extended-trace.c [new file with mode: 0644]
probes/net-trace.c [new file with mode: 0644]
probes/pm-trace.c [new file with mode: 0644]
probes/rcu-trace.c [new file with mode: 0644]
probes/syscall-trace.c [new file with mode: 0644]
probes/trap-trace.c [new file with mode: 0644]

diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..b9f48ca
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,42 @@
+#
+# Makefile for the LTT objects.
+#
+
+ifneq ($(KERNELRELEASE),)
+ifneq ($(CONFIG_MARKERS),)
+
+obj-m += ltt-core.o
+obj-m += ltt-tracer.o
+obj-m += ltt-marker-control.o
+
+obj-m += ltt-relay.o
+ltt-relay-objs := ltt-relay-lockless.o ltt-relay-alloc.o ltt-relay-splice.o \
+                 ltt-relay-vfs.o ltt-event-header.o
+
+obj-m += ltt-serialize.o
+obj-m += ltt-statedump.o
+obj-m += ltt-type-serializer.o
+obj-m += ltt-trace-control.o
+obj-m += ltt-userspace-event.o
+obj-m += ltt-filter.o
+obj-m += ltt-kprobes.o
+obj-m += probes/
+obj-m += ltt-ascii.o
+
+endif
+
+else
+       KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+       PWD := $(shell pwd)
+
+default:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) modules
+
+modules_install:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) modules_install
+       /sbin/depmod -a
+
+clean:
+       $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
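
The Makefile follows the usual dual-invocation kbuild pattern for external
modules: when kbuild itself includes it (KERNELRELEASE set), it only lists
objects, and the CONFIG_MARKERS check keeps the modules from being built
against a kernel without the markers infrastructure; when invoked directly,
it re-enters the kernel build system. In practice, "make" builds against the
running kernel's tree, "make KERNELDIR=/path/to/tree" targets another tree,
and "make modules_install" installs the modules and runs depmod.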
diff --git a/ltt-ascii.c b/ltt-ascii.c
new file mode 100644 (file)
index 0000000..975f94a
--- /dev/null
+++ b/ltt-ascii.c
@@ -0,0 +1,586 @@
+/*
+ * LTT ascii binary buffer to ascii converter.
+ *
+ * Copyright       2008 - 2009   Lai Jiangshan (laijs@cn.fujitsu.com)
+ * Copyright       2009 -        Mathieu Desnoyers mathieu.desnoyers@polymtl.ca
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+/*
+ * TODO
+ *
+ * Move to new switch behavior: wait for data for the duration of the
+ * timer interval plus a safety margin; if none arrives, consider that no
+ * activity occurred in the buffer.
+ *
+ * Fix case when having a text file open and destroying trace.
+ *
+ * - Automate periodical switch:
+ *
+ * The debugfs file "switch_timer" receives a timer period as parameter
+ * (e.g. echo 100 > switch_timer) to activate the timer per channel. This can
+ * also be accessed through the internal API _before the trace session starts_.
+ * This timer will ensure that we periodically have subbuffers to read, and
+ * therefore that the merge-sort does not wait endlessly for a subbuffer.
+ *
+ * - If a channel is switched and read without data, make sure it is still
+ * considered afterward (not removed from the queue).
+ *
+ * - Create an ascii/tracename/ALL file to merge-sort all active channels.
+ * - Create an ascii/tracename/README file to contain the text output legend.
+ * - Remove leading zeroes from timestamps.
+ * - Enhance pretty-printing to make sure all types used for addresses are
+ * output in the form 0xAB00000000 (not decimal). This is true for %p and 0x%...X.
+ * - Hotplug support
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/fs.h>
+
+#include "ltt-tracer.h"
+#include "ltt-relay.h"
+#include "ltt-relay-lockless.h"
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt, a...)
+#endif
+
+struct dentry *ltt_ascii_dir_dentry;
+EXPORT_SYMBOL_GPL(ltt_ascii_dir_dentry);
+
+struct ltt_relay_iter;
+
+struct ltt_relay_cpu_iter {
+       /* cpu buffer information */
+       struct ltt_chanbuf *buf;
+       struct ltt_relay_iter *iter;
+       int sb_ref;             /* holding a reference to a subbuffer */
+       long read_sb_offset;    /* offset of the subbuffer read */
+
+       /* current event information */
+       struct ltt_subbuffer_header *header;
+       long hdr_offset;        /* event header offset */
+       long payload_offset;    /* event payload offset */
+       u64 tsc;        /* full 64-bits timestamp value */
+       u32 data_size;
+       u16 chID;       /* channel ID, const */
+       u16 eID;
+};
+
+struct ltt_relay_iter {
+       struct ltt_relay_cpu_iter iter_cpu[NR_CPUS];
+       struct ltt_chan *chan;
+       loff_t pos;
+       int cpu;
+       int nr_refs;
+};
+
+/*
+ * offset of 0 in subbuffer means "subbuf size" (filled subbuffer).
+ */
+static int is_subbuffer_offset_end(struct ltt_relay_cpu_iter *citer,
+                                  long offset)
+{
+       struct ltt_chan *chan = container_of(citer->buf->a.chan,
+                                            struct ltt_chan, a);
+       long sub_offset = SUBBUF_OFFSET(offset - 1, chan) + 1;
+
+       return (sub_offset <= citer->header->data_size);
+}
+
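+/*
+ * Rebuild the full 64-bit TSC from the truncated timestamp found in most
+ * event headers (an illustrative walk-through of the function below, not a
+ * format definition): headers carry only the low LTT_TSC_BITS of the TSC,
+ * so the missing high bits are taken from the previous event's TSC. If the
+ * truncated value read is smaller than the previous event's low bits, the
+ * counter wrapped since the last event and one full period
+ * (1 << LTT_TSC_BITS) is added.
+ */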
+static u64 calculate_tsc(u64 pre_tsc, u64 read_tsc, unsigned int rflags)
+{
+       u64 new_tsc = read_tsc;
+
+       if (rflags != LTT_RFLAG_ID_SIZE_TSC) {
+               BUG_ON(read_tsc >> LTT_TSC_BITS);
+
+               new_tsc = (pre_tsc & ~LTT_TSC_MASK) + read_tsc;
+               if (read_tsc < (pre_tsc & LTT_TSC_MASK))
+                       new_tsc += 1UL << LTT_TSC_BITS;
+       }
+
+       return new_tsc;
+}
+
+/* calculate payload offset */
+static inline long calculate_payload_offset(long offset, u16 chID, u16 eID)
+{
+       const char *fmt;
+
+       if (!ltt_get_alignment())
+               return offset;
+
+       fmt = marker_get_fmt_from_id(chID, eID);
+       BUG_ON(!fmt);
+
+       return offset + ltt_fmt_largest_align(offset, fmt);
+}
+
+static void update_new_event(struct ltt_relay_cpu_iter *citer, long hdr_offset)
+{
+       u64 read_tsc;
+       unsigned int rflags;
+       long tmp_offset;
+
+       WARN_ON_ONCE(hdr_offset != citer->hdr_offset);
+
+       tmp_offset = ltt_read_event_header(&citer->buf->a, hdr_offset,
+                                          &read_tsc, &citer->data_size,
+                                          &citer->eID, &rflags);
+       citer->payload_offset = calculate_payload_offset(tmp_offset,
+                                                        citer->chID,
+                                                        citer->eID);
+
+       citer->tsc = calculate_tsc(citer->tsc, read_tsc, rflags);
+}
+
+static void update_event_size(struct ltt_relay_cpu_iter *citer, long hdr_offset)
+{
+       char output[1];
+       const char *fmt;
+       size_t data_size;
+
+       if (citer->data_size != INT_MAX)
+               return;
+
+       fmt = marker_get_fmt_from_id(citer->chID, citer->eID);
+       BUG_ON(!fmt);
+       ltt_serialize_printf(citer->buf, citer->payload_offset,
+                            &data_size, output, 0, fmt);
+       citer->data_size = data_size;
+}
+
+static void update_cpu_iter(struct ltt_relay_cpu_iter *citer, long hdr_offset)
+{
+       if (unlikely((!citer->sb_ref)
+                    || is_subbuffer_offset_end(citer, hdr_offset))) {
+               citer->header = NULL;
+               return;
+       }
+       update_new_event(citer, hdr_offset);
+       update_event_size(citer, hdr_offset);
+}
+
+/*
+ * Returns 0 if we get a subbuffer reference; otherwise the buffer has no
+ * available data, try again later.
+ */
+static int subbuffer_start(struct ltt_relay_cpu_iter *citer, long *offset)
+{
+       int ret;
+       struct ltt_relay_iter *iter = citer->iter;
+
+       ret = ltt_chanbuf_get_subbuf(citer->buf, offset);
+       if (!ret) {
+               citer->header = ltt_relay_read_offset_address(&citer->buf->a,
+                                                             *offset);
+               citer->hdr_offset = (*offset) + ltt_sb_header_size();
+               citer->tsc = citer->header->cycle_count_begin;
+               iter->nr_refs++;
+               citer->sb_ref = 1;
+               return 0;
+       } else {
+               if (ltt_chanbuf_is_finalized(citer->buf))
+                       return -ENODATA;
+               else
+                       return -EAGAIN;
+       }
+}
+
+static void subbuffer_stop(struct ltt_relay_cpu_iter *citer,
+                          long offset)
+{
+       int ret;
+       struct ltt_relay_iter *iter = citer->iter;
+
+       WARN_ON_ONCE(!citer->sb_ref);
+       ret = ltt_chanbuf_put_subbuf(citer->buf, offset);
+       WARN_ON_ONCE(ret);
+       citer->sb_ref = 0;
+       iter->nr_refs--;
+}
+
+static void ltt_relay_advance_cpu_iter(struct ltt_relay_cpu_iter *citer)
+{
+       long old_offset = citer->payload_offset;
+       long new_offset = citer->payload_offset;
+       int ret;
+
+       /* check whether we have read all the data in this subbuffer */
+       if (unlikely(is_subbuffer_offset_end(citer,
+                                            old_offset + citer->data_size))) {
+               DEBUGP(KERN_DEBUG "LTT ASCII stop cpu %d offset %lX\n",
+                      citer->buf->a.cpu, citer->read_sb_offset);
+               subbuffer_stop(citer, citer->read_sb_offset);
+               for (;;) {
+                       ret = subbuffer_start(citer, &citer->read_sb_offset);
+                       DEBUGP(KERN_DEBUG
+                              "LTT ASCII start cpu %d ret %d offset %lX\n",
+                              citer->buf->a.cpu, ret, citer->read_sb_offset);
+                       if (!ret || ret == -ENODATA) {
+                               break;  /* got data, or finalized */
+                       } else {        /* -EAGAIN */
+                               if (signal_pending(current))
+                                       break;
+                               schedule_timeout_interruptible(1);
+                               /* TODO: check for no-data delay. take ref. break */
+                       }
+               }
+       } else {
+               new_offset += citer->data_size;
+               citer->hdr_offset = new_offset +
+                       ltt_align(new_offset, sizeof(struct ltt_event_header));
+               DEBUGP(KERN_DEBUG
+                      "LTT ASCII old_offset %lX new_offset %lX cpu %d\n",
+                      old_offset, new_offset, citer->buf->a.cpu);
+       }
+
+       update_cpu_iter(citer, citer->hdr_offset);
+}
+
+static int cpu_iter_eof(struct ltt_relay_cpu_iter *citer)
+{
+       return !citer->sb_ref;
+}
+
+static int ltt_relay_iter_eof(struct ltt_relay_iter *iter)
+{
+       return iter->nr_refs == 0;
+}
+
+static void ltt_relay_advance_iter(struct ltt_relay_iter *iter)
+{
+       int i;
+       struct ltt_relay_cpu_iter *curr, *min = NULL;
+       iter->cpu = -1;
+
+       /*
+        * Find the event with the minimum TSC.
+        * TODO: use a min-heap for 4096 CPUs.
+        */
+       for_each_possible_cpu(i) {
+               curr = &iter->iter_cpu[i];
+
+               if (!curr->buf->a.allocated || !curr->header)
+                       continue;
+
+               if (cpu_iter_eof(curr))
+                       continue;
+
+               if (!min || curr->tsc < min->tsc) {
+                       min = curr;
+                       iter->cpu = i;
+               }
+       }
+
+       /* update cpu_iter for next ltt_relay_advance_iter() */
+       if (min)
+               ltt_relay_advance_cpu_iter(min);
+}
+
+static void *ascii_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+       struct ltt_relay_iter *iter = m->private;
+
+       WARN_ON_ONCE(!iter->nr_refs);
+       BUG_ON(v != iter);
+
+       ltt_relay_advance_iter(iter);
+       return (ltt_relay_iter_eof(iter) || signal_pending(current))
+               ? NULL : iter;
+}
+
+static void *ascii_start(struct seq_file *m, loff_t *ppos)
+{
+       struct ltt_relay_iter *iter = m->private;
+
+       ltt_relay_advance_iter(iter);
+       return (ltt_relay_iter_eof(iter) || signal_pending(current))
+               ? NULL : iter;
+}
+
+static void ascii_stop(struct seq_file *m, void *v)
+{
+}
+
+static
+int seq_serialize(struct seq_file *m, struct ltt_chanbuf *buf,
+                 size_t buf_offset, const char *fmt, size_t *data_size)
+{
+       int len;
+
+       if (m->count < m->size) {
+               len = ltt_serialize_printf(buf, buf_offset, data_size,
+                                          m->buf + m->count,
+                                          m->size - m->count, fmt);
+               if (m->count + len < m->size) {
+                       m->count += len;
+                       return 0;
+               }
+       }
+
+       m->count = m->size;
+       return -1;
+}
+
+static int ascii_show(struct seq_file *m, void *v)
+{
+       struct ltt_relay_iter *iter = v;
+       struct ltt_relay_cpu_iter *citer;
+       const char *name;
+       const char *fmt;
+       unsigned long long tsc;
+       size_t data_size;
+
+       if (iter->cpu == -1)
+               return 0;
+
+       citer = &iter->iter_cpu[iter->cpu];
+       WARN_ON_ONCE(!citer->sb_ref);
+       /*
+        * Nothing to show, we are at the end of the last subbuffer currently
+        * having data.
+        */
+       if (!citer->header)
+               return 0;
+
+       tsc = citer->tsc;
+       name = marker_get_name_from_id(citer->chID, citer->eID);
+       fmt = marker_get_fmt_from_id(citer->chID, citer->eID);
+
+       if (!name || !fmt)
+               return 0;
+
+       seq_printf(m, "event:%16.16s: cpu:%2d time:%20.20llu ",
+                  name, iter->cpu, tsc);
+       seq_serialize(m, citer->buf, citer->payload_offset, fmt, &data_size);
+       seq_puts(m, "\n");
+       if (citer->data_size == INT_MAX)
+               citer->data_size = data_size;
+
+       return 0;
+}
+
+static const struct seq_operations ascii_seq_ops = {
+       .start          = ascii_start,
+       .next           = ascii_next,
+       .stop           = ascii_stop,
+       .show           = ascii_show,
+};
+
+/* FIXME : cpu hotplug support */
+static int ltt_relay_iter_open_channel(struct ltt_relay_iter *iter,
+                                      struct ltt_chan *chan)
+{
+       int i, ret;
+       u16 chID = ltt_channels_get_index_from_name(chan->a.filename);
+
+       /* we don't need lock relay_channels_mutex */
+       for_each_possible_cpu(i) {
+               struct ltt_relay_cpu_iter *citer = &iter->iter_cpu[i];
+
+               citer->buf = per_cpu_ptr(chan->a.buf, i);
+               if (!citer->buf->a.allocated)
+                       continue;
+
+               citer->iter = iter;     /* easy lazy parent info */
+               citer->chID = chID;
+
+               ret = ltt_chanbuf_open_read(citer->buf);
+               if (ret) {
+                       /* Failed to open a percpu buffer, close everything. */
+                       citer->buf = NULL;
+                       goto error;
+               }
+
+               for (;;) {
+                       ret = subbuffer_start(citer,
+                                             &citer->read_sb_offset);
+                       DEBUGP(KERN_DEBUG
+                               "LTT ASCII open start "
+                               "cpu %d ret %d offset %lX\n",
+                               citer->buf->a.cpu, ret, citer->read_sb_offset);
+                       if (!ret || ret == -ENODATA) {
+                               break;  /* got data, or finalized */
+                       } else {        /* -EAGAIN */
+                               if (signal_pending(current))
+                                       break;
+                               schedule_timeout_interruptible(1);
+                       }
+               }
+               update_cpu_iter(citer, citer->hdr_offset);
+       }
+       if (!iter->nr_refs) {
+               ret = -ENODATA;
+               goto error;
+       }
+
+       return 0;
+
+error:
+       for_each_possible_cpu(i) {
+               struct ltt_relay_cpu_iter *citer = &iter->iter_cpu[i];
+
+               if (!citer->buf)
+                       break;
+
+               if (citer->buf->a.allocated)
+                       ltt_chanbuf_release_read(citer->buf);
+       }
+       return ret;
+}
+
+/* FIXME : cpu hotplug support */
+static int ltt_relay_iter_release_channel(struct ltt_relay_iter *iter)
+{
+       int i;
+
+       for_each_possible_cpu(i) {
+               struct ltt_relay_cpu_iter *citer = &iter->iter_cpu[i];
+
+               if (citer->sb_ref) {
+                       WARN_ON_ONCE(!citer->buf->a.allocated);
+                       DEBUGP(KERN_DEBUG
+                               "LTT ASCII release stop cpu %d offset %lX\n",
+                               citer->buf->a.cpu, citer->read_sb_offset);
+                       subbuffer_stop(&iter->iter_cpu[i],
+                                      citer->read_sb_offset);
+               }
+               if (citer->buf->a.allocated)
+                       ltt_chanbuf_release_read(citer->buf);
+       }
+       WARN_ON_ONCE(iter->nr_refs);
+       return 0;
+}
+
+static int ltt_relay_ascii_open(struct inode *inode, struct file *file)
+{
+       int ret;
+       struct ltt_chan *chan = inode->i_private;
+       struct ltt_relay_iter *iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter)
+               return -ENOMEM;
+
+       iter->chan = chan;
+       ret = ltt_relay_iter_open_channel(iter, chan);
+       if (ret)
+               goto error_free_alloc;
+
+       ret = seq_open(file, &ascii_seq_ops);
+       if (ret)
+               goto error_release_channel;
+       ((struct seq_file *)file->private_data)->private = iter;
+       return 0;
+
+error_release_channel:
+       ltt_relay_iter_release_channel(iter);
+error_free_alloc:
+       kfree(iter);
+       return ret;
+}
+
+static int ltt_relay_ascii_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+       struct ltt_relay_iter *iter = seq->private;
+
+       ltt_relay_iter_release_channel(iter);
+       kfree(iter);
+       return 0;
+}
+
+static const struct file_operations ltt_ascii_fops = {
+       .read = seq_read,
+       .open = ltt_relay_ascii_open,
+       .release = ltt_relay_ascii_release,
+       .llseek = no_llseek,
+       .owner = THIS_MODULE,
+};
+
+int ltt_ascii_create(struct ltt_chan *chan)
+{
+       struct dentry *dentry;
+
+       dentry = debugfs_create_file(chan->a.filename,
+                                    S_IRUSR | S_IRGRP,
+                                    chan->a.trace->dentry.ascii_root,
+                                    chan, &ltt_ascii_fops);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+
+       if (!dentry)
+               return -EEXIST;
+
+       chan->a.ascii_dentry = dentry;
+       dentry->d_inode->i_private = chan;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ltt_ascii_create);
+
+void ltt_ascii_remove(struct ltt_chan *chan)
+{
+       struct dentry *dentry;
+
+       dentry = dget(chan->a.ascii_dentry);
+       debugfs_remove(dentry);
+       /* TODO: wait / wakeup instead */
+       /*
+        * Wait for every reference to the dentry to be gone,
+        * except us.
+        */
+       while (atomic_read(&dentry->d_count) != 1)
+               msleep(100);
+       dput(dentry);
+}
+EXPORT_SYMBOL_GPL(ltt_ascii_remove);
+
+int ltt_ascii_create_dir(struct ltt_trace *new_trace)
+{
+       new_trace->dentry.ascii_root = debugfs_create_dir(new_trace->trace_name,
+                                                         ltt_ascii_dir_dentry);
+       if (!new_trace->dentry.ascii_root)
+               return -EEXIST;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ltt_ascii_create_dir);
+
+void ltt_ascii_remove_dir(struct ltt_trace *trace)
+{
+       debugfs_remove(trace->dentry.ascii_root);
+}
+EXPORT_SYMBOL_GPL(ltt_ascii_remove_dir);
+
+static __init int ltt_ascii_init(void)
+{
+       ltt_ascii_dir_dentry = debugfs_create_dir(LTT_ASCII, get_ltt_root());
+       put_ltt_root();
+
+       return ltt_ascii_dir_dentry ? 0 : -EFAULT;
+}
+
+static __exit void ltt_ascii_exit(void)
+{
+       debugfs_remove(ltt_ascii_dir_dentry);
+}
+
+module_init(ltt_ascii_init);
+module_exit(ltt_ascii_exit);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Lai Jiangshan@FNST and Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Ascii Converter");
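
The ascii output is consumed as an ordinary debugfs text file. A minimal
userspace reader, shown as a sketch only; the mount point, trace name
"trace1" and channel name "cpu" are illustrative assumptions, not defined by
this patch:

	#include <stdio.h>

	int main(void)
	{
		/* Assumed path: debugfs mounted at /sys/kernel/debug. */
		FILE *f = fopen("/sys/kernel/debug/ltt/ascii/trace1/cpu", "r");
		char line[4096];

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* One event per line, merge-sorted across per-CPU buffers by TSC. */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}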
diff --git a/ltt-channels.c b/ltt-channels.c
new file mode 100644 (file)
index 0000000..c1cee26
--- /dev/null
+++ b/ltt-channels.c
@@ -0,0 +1,388 @@
+/*
+ * ltt/ltt-channels.c
+ *
+ * (C) Copyright 2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * LTTng channel management.
+ *
+ * Author:
+ *     Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/ltt-channels.h>
+
+/*
+ * ltt_channel_mutex may be nested inside the LTT trace mutex.
+ * ltt_channel_mutex may be nested inside the markers mutex.
+ */
+static DEFINE_MUTEX(ltt_channel_mutex);
+static LIST_HEAD(ltt_channels);
+/*
+ * Index of next channel in array. Makes sure that as long as a trace channel is
+ * allocated, no array index will be re-used when a channel is freed and then
+ * another channel is allocated. This index is cleared and the array indexes
+ * get reassigned when the index_kref goes back to 0, which indicates that no
+ * more trace channels are allocated.
+ */
+static unsigned int free_index;
+/* index_kref is protected by both ltt_channel_mutex and lock_markers */
+static struct kref index_kref; /* Keeps track of allocated trace channels */
+
+static struct ltt_channel_setting *lookup_channel(const char *name)
+{
+       struct ltt_channel_setting *iter;
+
+       list_for_each_entry(iter, &ltt_channels, list)
+               if (strcmp(name, iter->name) == 0)
+                       return iter;
+       return NULL;
+}
+
+/*
+ * Must be called when channel refcount falls to 0 _and_ also when the last
+ * trace is freed. This function is responsible for compacting the channel and
+ * event IDs when no users are active.
+ *
+ * Called with lock_markers() and channels mutex held.
+ */
+static void release_channel_setting(struct kref *kref)
+{
+       struct ltt_channel_setting *setting = container_of(kref,
+               struct ltt_channel_setting, kref);
+       struct ltt_channel_setting *iter;
+
+       if (atomic_read(&index_kref.refcount) == 0
+           && atomic_read(&setting->kref.refcount) == 0) {
+               list_del(&setting->list);
+               kfree(setting);
+
+               free_index = 0;
+               list_for_each_entry(iter, &ltt_channels, list) {
+                       iter->index = free_index++;
+                       iter->free_event_id = 0;
+               }
+       }
+}
+
+/*
+ * Perform channel index compaction when the last trace channel is freed.
+ *
+ * Called with lock_markers() and channels mutex held.
+ */
+static void release_trace_channel(struct kref *kref)
+{
+       struct ltt_channel_setting *iter, *n;
+
+       list_for_each_entry_safe(iter, n, &ltt_channels, list)
+               release_channel_setting(&iter->kref);
+       if (atomic_read(&index_kref.refcount) == 0)
+               markers_compact_event_ids();
+}
+
+/*
+ * ltt_channels_trace_ref - Is there an existing trace session?
+ *
+ * Must be called with lock_markers() held.
+ */
+int ltt_channels_trace_ref(void)
+{
+       return !!atomic_read(&index_kref.refcount);
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_ref);
+
+/**
+ * ltt_channels_register - Register a trace channel.
+ * @name: channel name
+ *
+ * Uses refcounting.
+ */
+int ltt_channels_register(const char *name)
+{
+       struct ltt_channel_setting *setting;
+       int ret = 0;
+
+       mutex_lock(&ltt_channel_mutex);
+       setting = lookup_channel(name);
+       if (setting) {
+               if (atomic_read(&setting->kref.refcount) == 0)
+                       goto init_kref;
+               else {
+                       kref_get(&setting->kref);
+                       goto end;
+               }
+       }
+       setting = kzalloc(sizeof(*setting), GFP_KERNEL);
+       if (!setting) {
+               ret = -ENOMEM;
+               goto end;
+       }
+       list_add(&setting->list, &ltt_channels);
+       strncpy(setting->name, name, PATH_MAX-1);
+       setting->index = free_index++;
+init_kref:
+       kref_init(&setting->kref);
+end:
+       mutex_unlock(&ltt_channel_mutex);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_register);
+
+/**
+ * ltt_channels_unregister - Unregister a trace channel.
+ * @name: channel name
+ * @compacting: performing compaction
+ *
+ * Must be called with markers mutex held.
+ */
+int ltt_channels_unregister(const char *name, int compacting)
+{
+       struct ltt_channel_setting *setting;
+       int ret = 0;
+
+       if (!compacting)
+               mutex_lock(&ltt_channel_mutex);
+       setting = lookup_channel(name);
+       if (!setting || atomic_read(&setting->kref.refcount) == 0) {
+               ret = -ENOENT;
+               goto end;
+       }
+       kref_put(&setting->kref, release_channel_setting);
+       if (!compacting && atomic_read(&index_kref.refcount) == 0)
+               markers_compact_event_ids();
+end:
+       if (!compacting)
+               mutex_unlock(&ltt_channel_mutex);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_unregister);
+
+/**
+ * ltt_channels_set_default - Set channel default behavior.
+ * @name: default channel name
+ * @sb_size: size of the subbuffers
+ * @n_sb: number of subbuffers
+ */
+int ltt_channels_set_default(const char *name,
+                            unsigned int sb_size,
+                            unsigned int n_sb)
+{
+       struct ltt_channel_setting *setting;
+       int ret = 0;
+
+       mutex_lock(&ltt_channel_mutex);
+       setting = lookup_channel(name);
+       if (!setting || atomic_read(&setting->kref.refcount) == 0) {
+               ret = -ENOENT;
+               goto end;
+       }
+       setting->sb_size = sb_size;
+       setting->n_sb = n_sb;
+end:
+       mutex_unlock(&ltt_channel_mutex);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_set_default);
+
+/**
+ * ltt_channels_get_name_from_index - get channel name from channel index
+ * @index: channel index
+ *
+ * Allows looking up the channel name given its index. Done to keep the name
+ * information outside of each trace channel instance.
+ */
+const char *ltt_channels_get_name_from_index(unsigned int index)
+{
+       struct ltt_channel_setting *iter;
+
+       list_for_each_entry(iter, &ltt_channels, list)
+               if (iter->index == index && atomic_read(&iter->kref.refcount))
+                       return iter->name;
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_get_name_from_index);
+
+static struct ltt_channel_setting *
+ltt_channels_get_setting_from_name(const char *name)
+{
+       struct ltt_channel_setting *iter;
+
+       list_for_each_entry(iter, &ltt_channels, list)
+               if (!strcmp(iter->name, name)
+                   && atomic_read(&iter->kref.refcount))
+                       return iter;
+       return NULL;
+}
+
+/**
+ * ltt_channels_get_index_from_name - get channel index from channel name
+ * @name: channel name
+ *
+ * Allows looking up the channel index given its name. Done to keep the name
+ * information outside of each trace channel instance.
+ * Returns -1 if not found.
+ */
+int ltt_channels_get_index_from_name(const char *name)
+{
+       struct ltt_channel_setting *setting;
+
+       setting = ltt_channels_get_setting_from_name(name);
+       if (setting)
+               return setting->index;
+       else
+               return -1;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_get_index_from_name);
+
+/**
+ * ltt_channels_trace_alloc - Allocate channel structures for a trace
+ * @nr_channels: output parameter; set to the number of channels allocated
+ * @overwrite: overwrite mode for the trace's channels
+ * @active: whether the trace's channels are active
+ *
+ * Use the current channel list to allocate the channels for a trace.
+ * Called with trace lock held. Does not perform the trace buffer allocation,
+ * because we must let the user overwrite specific channel sizes.
+ */
+struct ltt_chan *ltt_channels_trace_alloc(unsigned int *nr_channels,
+                                         int overwrite, int active)
+{
+       struct ltt_chan *chan = NULL;
+       struct ltt_channel_setting *iter;
+
+       lock_markers();
+       mutex_lock(&ltt_channel_mutex);
+       if (!free_index)
+               goto end;
+       if (!atomic_read(&index_kref.refcount))
+               kref_init(&index_kref);
+       else
+               kref_get(&index_kref);
+       *nr_channels = free_index;
+       chan = kzalloc(sizeof(struct ltt_chan) * free_index, GFP_KERNEL);
+       if (!chan)
+               goto end;
+       list_for_each_entry(iter, &ltt_channels, list) {
+               if (!atomic_read(&iter->kref.refcount))
+                       continue;
+               chan[iter->index].a.sb_size = iter->sb_size;
+               chan[iter->index].a.n_sb = iter->n_sb;
+               chan[iter->index].overwrite = overwrite;
+               chan[iter->index].active = active;
+               strncpy(chan[iter->index].a.filename, iter->name, NAME_MAX - 1);
+               chan[iter->index].switch_timer_interval = 0;
+       }
+end:
+       mutex_unlock(&ltt_channel_mutex);
+       unlock_markers();
+       return chan;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_alloc);
+
+/**
+ * ltt_channels_trace_free - Free one trace's channels
+ * @channels: channels to free
+ *
+ * Called with trace lock held. The actual channel buffers must be freed before
+ * this function is called.
+ */
+void ltt_channels_trace_free(struct ltt_chan *channels,
+                            unsigned int nr_channels)
+{
+       lock_markers();
+       mutex_lock(&ltt_channel_mutex);
+       kfree(channels);
+       kref_put(&index_kref, release_trace_channel);
+       mutex_unlock(&ltt_channel_mutex);
+       unlock_markers();
+       marker_update_probes();
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_free);
+
+/**
+ * ltt_channels_trace_set_timer - set switch timer
+ * @chan: channel
+ * @interval: interval of timer interrupt, in jiffies. 0 inhibits timer.
+ */
+void ltt_channels_trace_set_timer(struct ltt_chan *chan,
+                                 unsigned long interval)
+{
+       chan->switch_timer_interval = interval;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_set_timer);
+
+/**
+ * _ltt_channels_get_event_id - get next event ID for a marker
+ * @channel: channel name
+ * @name: event name
+ *
+ * Returns a unique event ID (for this channel) or < 0 on error.
+ * Must be called with channels mutex held.
+ */
+int _ltt_channels_get_event_id(const char *channel, const char *name)
+{
+       struct ltt_channel_setting *setting;
+       int ret;
+
+       setting = ltt_channels_get_setting_from_name(channel);
+       if (!setting) {
+               ret = -ENOENT;
+               goto end;
+       }
+       if (strcmp(channel, "metadata") == 0) {
+               if (strcmp(name, "core_marker_id") == 0)
+                       ret = 0;
+               else if (strcmp(name, "core_marker_format") == 0)
+                       ret = 1;
+               else
+                       ret = -ENOENT;
+               goto end;
+       }
+       if (setting->free_event_id == EVENTS_PER_CHANNEL - 1) {
+               ret = -ENOSPC;
+               goto end;
+       }
+       ret = setting->free_event_id++;
+end:
+       return ret;
+}
+
+/**
+ * ltt_channels_get_event_id - get next event ID for a marker
+ * @channel: channel name
+ * @name: event name
+ *
+ * Returns a unique event ID (for this channel) or < 0 on error.
+ */
+int ltt_channels_get_event_id(const char *channel, const char *name)
+{
+       int ret;
+
+       mutex_lock(&ltt_channel_mutex);
+       ret = _ltt_channels_get_event_id(channel, name);
+       mutex_unlock(&ltt_channel_mutex);
+       return ret;
+}
+
+/**
+ * _ltt_channels_reset_event_ids - reset event IDs at compaction
+ *
+ * Called with lock_markers() and channel mutex held.
+ */
+void _ltt_channels_reset_event_ids(void)
+{
+       struct ltt_channel_setting *iter;
+
+       list_for_each_entry(iter, &ltt_channels, list)
+               iter->free_event_id = 0;
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Channel Management");
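
A probe or tracer module uses this channel API roughly as follows; a sketch
under assumptions (module name and channel parameters are illustrative, and
only the exported entry points shown above are used):

	#include <linux/module.h>
	#include <linux/ltt-channels.h>

	static int __init my_probe_init(void)
	{
		int ret, idx;

		/* Take a reference on the "cpu" channel, creating it if needed. */
		ret = ltt_channels_register("cpu");
		if (ret)
			return ret;

		/* Illustrative sizing: 16 subbuffers of 4 kB each. */
		ltt_channels_set_default("cpu", 4096, 16);

		idx = ltt_channels_get_index_from_name("cpu");	/* -1 if absent */
		pr_info("cpu channel index: %d\n", idx);
		return 0;
	}

	static void __exit my_probe_exit(void)
	{
		/* Drop the channel reference; 0: not called from compaction. */
		ltt_channels_unregister("cpu", 0);
	}

	module_init(my_probe_init);
	module_exit(my_probe_exit);
	MODULE_LICENSE("GPL");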
diff --git a/ltt-core.c b/ltt-core.c
new file mode 100644 (file)
index 0000000..13d517f
--- /dev/null
+++ b/ltt-core.c
@@ -0,0 +1,108 @@
+/*
+ * LTT core in-kernel infrastructure.
+ *
+ * Copyright 2006 - Mathieu Desnoyers mathieu.desnoyers@polymtl.ca
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/kref.h>
+#include <linux/cpu.h>
+
+#include "ltt-tracer-core.h"
+
+/* Traces structures */
+struct ltt_traces ltt_traces = {
+       .setup_head = LIST_HEAD_INIT(ltt_traces.setup_head),
+       .head = LIST_HEAD_INIT(ltt_traces.head),
+};
+EXPORT_SYMBOL(ltt_traces);
+
+/* Traces list writer locking */
+static DEFINE_MUTEX(ltt_traces_mutex);
+
+/* root dentry mutex */
+static DEFINE_MUTEX(ltt_root_mutex);
+/* dentry of ltt's root dir */
+static struct dentry *ltt_root_dentry;
+static struct kref ltt_root_kref = {
+       .refcount = ATOMIC_INIT(0),
+};
+
+static void ltt_root_release(struct kref *ref)
+{
+       debugfs_remove(ltt_root_dentry);
+       ltt_root_dentry = NULL;
+}
+
+void put_ltt_root(void)
+{
+       mutex_lock(&ltt_root_mutex);
+       if (ltt_root_dentry)
+               kref_put(&ltt_root_kref, ltt_root_release);
+       mutex_unlock(&ltt_root_mutex);
+}
+EXPORT_SYMBOL_GPL(put_ltt_root);
+
+struct dentry *get_ltt_root(void)
+{
+       mutex_lock(&ltt_root_mutex);
+       if (!ltt_root_dentry) {
+               ltt_root_dentry = debugfs_create_dir(LTT_ROOT, NULL);
+               if (!ltt_root_dentry) {
+                       printk(KERN_ERR "LTT : create ltt root dir failed\n");
+                       goto out;
+               }
+               kref_init(&ltt_root_kref);
+               goto out;
+       }
+       kref_get(&ltt_root_kref);
+out:
+       mutex_unlock(&ltt_root_mutex);
+       return ltt_root_dentry;
+}
+EXPORT_SYMBOL_GPL(get_ltt_root);
+
+/*
+ * ltt_lock_traces/ltt_unlock_traces also disables cpu hotplug.
+ */
+void ltt_lock_traces(void)
+{
+       mutex_lock(&ltt_traces_mutex);
+       get_online_cpus();
+}
+EXPORT_SYMBOL_GPL(ltt_lock_traces);
+
+void ltt_unlock_traces(void)
+{
+       put_online_cpus();
+       mutex_unlock(&ltt_traces_mutex);
+}
+EXPORT_SYMBOL_GPL(ltt_unlock_traces);
+
+DEFINE_PER_CPU(unsigned int, ltt_nesting);
+EXPORT_PER_CPU_SYMBOL(ltt_nesting);
+
+int ltt_run_filter_default(void *trace, uint16_t eID)
+{
+       return 1;
+}
+
+/* This function pointer is protected by a trace activation check */
+ltt_run_filter_functor ltt_run_filter = ltt_run_filter_default;
+EXPORT_SYMBOL_GPL(ltt_run_filter);
+
+void ltt_filter_register(ltt_run_filter_functor func)
+{
+       ltt_run_filter = func;
+}
+EXPORT_SYMBOL_GPL(ltt_filter_register);
+
+void ltt_filter_unregister(void)
+{
+       ltt_run_filter = ltt_run_filter_default;
+}
+EXPORT_SYMBOL_GPL(ltt_filter_unregister);
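
Clients that add debugfs entries under the shared LTT root pair
get_ltt_root() with put_ltt_root() around the lifetime of their entries, so
the root directory disappears with its last user. A sketch; the directory
name is hypothetical and the declarations from ltt-tracer-core.h and
<linux/debugfs.h> are assumed:

	static struct dentry *my_dir;

	static int my_feature_init(void)
	{
		struct dentry *root = get_ltt_root();	/* takes a reference */

		if (!root)
			return -ENOENT;
		my_dir = debugfs_create_dir("my-feature", root);
		if (!my_dir) {
			put_ltt_root();
			return -ENOMEM;
		}
		return 0;
	}

	static void my_feature_exit(void)
	{
		debugfs_remove(my_dir);
		put_ltt_root();		/* root goes away with its last user */
	}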
diff --git a/ltt-event-header.c b/ltt-event-header.c
new file mode 100644 (file)
index 0000000..4f049d3
--- /dev/null
+++ b/ltt-event-header.c
@@ -0,0 +1,92 @@
+/*
+ * ltt/ltt-event-header.c
+ *
+ * (C) Copyright 2010 - Mathieu Desnoyers (mathieu.desnoyers@efficios.com)
+ *
+ * LTTng event header.
+ *
+ * Author:
+ *     Mathieu Desnoyers (mathieu.desnoyers@efficios.com)
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+
+#include "ltt-tracer.h"
+#include "ltt-relay.h"
+
+size_t ltt_write_event_header_slow(struct ltt_chanbuf_alloc *bufa,
+                                  struct ltt_chan_alloc *chana,
+                                  long buf_offset, u16 eID, u32 event_size,
+                                  u64 tsc, unsigned int rflags)
+{
+       struct ltt_event_header header;
+       u16 small_size;
+
+       switch (rflags) {
+       case LTT_RFLAG_ID_SIZE_TSC:
+               header.id_time = 29 << LTT_TSC_BITS;
+               break;
+       case LTT_RFLAG_ID_SIZE:
+               header.id_time = 30 << LTT_TSC_BITS;
+               break;
+       case LTT_RFLAG_ID:
+               header.id_time = 31 << LTT_TSC_BITS;
+               break;
+       default:
+               WARN_ON_ONCE(1);
+               header.id_time = 0;
+       }
+
+       header.id_time |= (u32)tsc & LTT_TSC_MASK;
+       ltt_relay_write(bufa, chana, buf_offset, &header, sizeof(header));
+       buf_offset += sizeof(header);
+
+       switch (rflags) {
+       case LTT_RFLAG_ID_SIZE_TSC:
+               small_size = (u16)min_t(u32, event_size, LTT_MAX_SMALL_SIZE);
+               ltt_relay_write(bufa, chana, buf_offset,
+                       &eID, sizeof(u16));
+               buf_offset += sizeof(u16);
+               ltt_relay_write(bufa, chana, buf_offset,
+                       &small_size, sizeof(u16));
+               buf_offset += sizeof(u16);
+               if (small_size == LTT_MAX_SMALL_SIZE) {
+                       ltt_relay_write(bufa, chana, buf_offset,
+                               &event_size, sizeof(u32));
+                       buf_offset += sizeof(u32);
+               }
+               buf_offset += ltt_align(buf_offset, sizeof(u64));
+               ltt_relay_write(bufa, chana, buf_offset,
+                       &tsc, sizeof(u64));
+               buf_offset += sizeof(u64);
+               break;
+       case LTT_RFLAG_ID_SIZE:
+               small_size = (u16)min_t(u32, event_size, LTT_MAX_SMALL_SIZE);
+               ltt_relay_write(bufa, chana, buf_offset,
+                       &eID, sizeof(u16));
+               buf_offset += sizeof(u16);
+               ltt_relay_write(bufa, chana, buf_offset,
+                       &small_size, sizeof(u16));
+               buf_offset += sizeof(u16);
+               if (small_size == LTT_MAX_SMALL_SIZE) {
+                       ltt_relay_write(bufa, chana, buf_offset,
+                               &event_size, sizeof(u32));
+                       buf_offset += sizeof(u32);
+               }
+               break;
+       case LTT_RFLAG_ID:
+               ltt_relay_write(bufa, chana, buf_offset,
+                       &eID, sizeof(u16));
+               buf_offset += sizeof(u16);
+               break;
+       }
+
+       return buf_offset;
+}
+EXPORT_SYMBOL_GPL(ltt_write_event_header_slow);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Event Header");
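
The id_time word packs an ID selector in its top bits and the truncated
timestamp in the low LTT_TSC_BITS; the reserved selectors 29, 30 and 31 flag
the three extended header layouts written above. A decoding sketch, mirroring
(not defining) the format; treating values below 29 as compact event IDs is
an assumption about the fast path, which this file does not show:

	/* assumes LTT_TSC_BITS and LTT_TSC_MASK from ltt-tracer.h */
	static void decode_id_time(u32 id_time)
	{
		u32 id = id_time >> LTT_TSC_BITS;	/* selector or event ID */
		u32 tsc_low = id_time & LTT_TSC_MASK;	/* truncated timestamp */

		switch (id) {
		case 29:	/* eID, size and full 64-bit TSC follow */
			break;
		case 30:	/* eID and size follow */
			break;
		case 31:	/* only eID follows */
			break;
		default:	/* assumed: compact event, id is the event ID */
			break;
		}
		(void)tsc_low;
	}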
diff --git a/ltt-filter.c b/ltt-filter.c
new file mode 100644 (file)
index 0000000..ec113af
--- /dev/null
+++ b/ltt-filter.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2008 Mathieu Desnoyers
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+
+#include "ltt-tracer.h"
+
+#define LTT_FILTER_DIR "filter"
+
+/*
+ * Protects the ltt_filter_dir allocation.
+ */
+static DEFINE_MUTEX(ltt_filter_mutex);
+
+static struct dentry *ltt_filter_dir;
+
+struct dentry *get_filter_root(void)
+{
+       struct dentry *ltt_root_dentry;
+
+       mutex_lock(&ltt_filter_mutex);
+       if (!ltt_filter_dir) {
+               ltt_root_dentry = get_ltt_root();
+               if (!ltt_root_dentry)
+                       goto err_no_root;
+
+               ltt_filter_dir = debugfs_create_dir(LTT_FILTER_DIR,
+                                                   ltt_root_dentry);
+               if (!ltt_filter_dir)
+                       printk(KERN_ERR
+                              "ltt_filter_init: failed to create dir %s\n",
+                              LTT_FILTER_DIR);
+       }
+err_no_root:
+       mutex_unlock(&ltt_filter_mutex);
+       return ltt_filter_dir;
+}
+EXPORT_SYMBOL_GPL(get_filter_root);
+
+static void __exit ltt_filter_exit(void)
+{
+       debugfs_remove(ltt_filter_dir);
+}
+
+module_exit(ltt_filter_exit);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>");
+MODULE_DESCRIPTION("Linux Trace Toolkit Filter");
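
A filter module hooks into the tracer by registering a functor with the same
signature as ltt_run_filter_default() in ltt-core.c. A sketch; the eID cutoff
is an arbitrary example policy:

	#include <linux/module.h>
	#include "ltt-tracer.h"

	/* Return non-zero to record the event, 0 to drop it. */
	static int my_filter(void *trace, uint16_t eID)
	{
		return eID < 128;	/* arbitrary: keep low event IDs only */
	}

	static int __init my_filter_init(void)
	{
		ltt_filter_register(my_filter);
		return 0;
	}

	static void __exit my_filter_exit(void)
	{
		ltt_filter_unregister();
	}

	module_init(my_filter_init);
	module_exit(my_filter_exit);
	MODULE_LICENSE("GPL");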
diff --git a/ltt-kprobes.c b/ltt-kprobes.c
new file mode 100644 (file)
index 0000000..7539381
--- /dev/null
+++ b/ltt-kprobes.c
@@ -0,0 +1,493 @@
+/*
+ * (C) Copyright       2009 -
+ *             Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * LTTng kprobes integration module.
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/kprobes.h>
+#include <linux/marker.h>
+#include <linux/mutex.h>
+#include <linux/jhash.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/kallsyms.h>
+
+#include "ltt-type-serializer.h"
+#include "ltt-tracer.h"
+
+#define LTT_KPROBES_DIR        "kprobes"
+#define LTT_KPROBES_ENABLE     "enable"
+#define LTT_KPROBES_DISABLE    "disable"
+#define LTT_KPROBES_LIST       "list"
+
+/* Active LTTng kprobes hash table */
+static DEFINE_MUTEX(ltt_kprobes_mutex);
+
+#define LTT_KPROBE_HASH_BITS   6
+#define LTT_KPROBE_TABLE_SIZE  (1 << LTT_KPROBE_HASH_BITS)
+static struct hlist_head ltt_kprobe_table[LTT_KPROBE_TABLE_SIZE];
+
+struct kprobe_entry {
+       struct hlist_node hlist;
+       struct kprobe kp;
+       char key[0];
+};
+
+static struct dentry *ltt_kprobes_dir,
+                    *ltt_kprobes_enable_dentry,
+                    *ltt_kprobes_disable_dentry,
+                    *ltt_kprobes_list_dentry;
+
+static int module_exit;	/* set once module unload has begun */
+
+
+static void trace_kprobe_table_entry(void *call_data, struct kprobe_entry *e)
+{
+       unsigned long addr;
+       char *namebuf = (char *)__get_free_page(GFP_KERNEL);
+
+       if (!namebuf)
+               return;
+
+       if (e->kp.addr) {
+               sprint_symbol(namebuf, (unsigned long)e->kp.addr);
+               addr = (unsigned long)e->kp.addr;
+       } else {
+               strncpy(namebuf, e->kp.symbol_name, PAGE_SIZE - 1);
+               /* TODO : add offset */
+               addr = kallsyms_lookup_name(namebuf);
+       }
+       if (addr)
+               __trace_mark(0, kprobe_state, kprobe_table, call_data,
+                            "ip 0x%lX symbol %s", addr, namebuf);
+       free_page((unsigned long)namebuf);
+}
+
+DEFINE_MARKER(kernel, kprobe, "ip %lX");
+
+static int ltt_kprobe_handler_pre(struct kprobe *p, struct pt_regs *regs)
+{
+       struct marker *marker;
+       unsigned long data;
+
+       data = (unsigned long)p->addr;
+       marker = &GET_MARKER(kernel, kprobe);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+                             &data, sizeof(data), sizeof(data));
+       return 0;
+}
+
+static int ltt_register_kprobe(const char *key)
+{
+       struct hlist_head *head;
+       struct hlist_node *node;
+       struct kprobe_entry *e = NULL;
+       char *symbol_name = NULL;
+       unsigned long addr;
+       unsigned int offset = 0;
+       u32 hash;
+       size_t key_len = strlen(key) + 1;
+       int ret;
+
+       if (key_len == 1)
+               return -ENOENT; /* only \0 */
+
+       if (sscanf(key, "%li", &addr) != 1)
+               addr = 0;
+
+       if (!addr) {
+               const char *symbol_end = NULL;
+               unsigned int symbol_len;        /* includes final \0 */
+
+               symbol_end = strchr(key, ' ');
+               if (symbol_end)
+                       symbol_len = symbol_end - key + 1;
+               else
+                       symbol_len = key_len;
+               symbol_name = kmalloc(symbol_len, GFP_KERNEL);
+               if (!symbol_name) {
+                       ret = -ENOMEM;
+                       goto error;
+               }
+               memcpy(symbol_name, key, symbol_len - 1);
+               symbol_name[symbol_len-1] = '\0';
+               if (symbol_end) {
+                       symbol_end++;   /* start of offset */
+                       if (sscanf(symbol_end, "%i", &offset) != 1)
+                               offset = 0;
+               }
+       }
+
+       hash = jhash(key, key_len-1, 0);
+       head = &ltt_kprobe_table[hash & ((1 << LTT_KPROBE_HASH_BITS)-1)];
+       hlist_for_each_entry(e, node, head, hlist) {
+               if (!strcmp(key, e->key)) {
+                       printk(KERN_NOTICE "Kprobe %s busy\n", key);
+                       ret = -EBUSY;
+                       goto error;
+               }
+       }
+       /*
+        * Using kzalloc here to allocate a variable length element. Could
+        * cause some memory fragmentation if overused.
+        */
+       e = kzalloc(sizeof(struct kprobe_entry) + key_len, GFP_KERNEL);
+       if (!e) {
+               ret = -ENOMEM;
+               goto error;
+       }
+       memcpy(e->key, key, key_len);
+       hlist_add_head(&e->hlist, head);
+       e->kp.pre_handler = ltt_kprobe_handler_pre;
+       e->kp.symbol_name = symbol_name;
+       e->kp.offset = offset;
+       e->kp.addr = (void *)addr;
+       ret = register_kprobe(&e->kp);
+       if (ret < 0)
+               goto error_list_del;
+       trace_kprobe_table_entry(NULL, e);
+       return 0;
+
+error_list_del:
+       hlist_del(&e->hlist);
+error:
+       kfree(symbol_name);
+       kfree(e);
+       return ret;
+}
+
+static int ltt_unregister_kprobe(const char *key)
+{
+       struct hlist_head *head;
+       struct hlist_node *node;
+       struct kprobe_entry *e;
+       int found = 0;
+       size_t key_len = strlen(key) + 1;
+       u32 hash;
+
+       hash = jhash(key, key_len-1, 0);
+       head = &ltt_kprobe_table[hash & ((1 << LTT_KPROBE_HASH_BITS)-1)];
+       hlist_for_each_entry(e, node, head, hlist) {
+               if (!strcmp(key, e->key)) {
+                       found = 1;
+                       break;
+               }
+       }
+       if (!found)
+               return -ENOENT;
+       hlist_del(&e->hlist);
+       unregister_kprobe(&e->kp);
+       kfree(e->kp.symbol_name);
+       kfree(e);
+       return 0;
+}
+
+static void ltt_unregister_all_kprobes(void)
+{
+       struct kprobe_entry *e;
+       struct hlist_head *head;
+       struct hlist_node *node, *tmp;
+       unsigned int i;
+
+       for (i = 0; i < LTT_KPROBE_TABLE_SIZE; i++) {
+               head = &ltt_kprobe_table[i];
+               hlist_for_each_entry_safe(e, node, tmp, head, hlist) {
+                       hlist_del(&e->hlist);
+                       unregister_kprobe(&e->kp);
+                       kfree(e->kp.symbol_name);
+                       kfree(e);
+               }
+       }
+}
+
+/*
+ * Allows specifying either
+ * - symbol
+ * - symbol offset
+ * - address
+ */
+static ssize_t enable_op_write(struct file *file,
+       const char __user *user_buf, size_t count, loff_t *ppos)
+{
+       int err, buf_size;
+       char *end;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+
+       if (!buf)
+               return -ENOMEM;
+
+       mutex_lock(&ltt_kprobes_mutex);
+       if (module_exit) {
+               err = -EPERM;
+               goto error;
+       }
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       if (copy_from_user(buf, user_buf, buf_size)) {
+               err = -EFAULT;
+               goto error;
+       }
+       buf[buf_size] = '\0';
+       end = strchr(buf, '\n');
+       if (end)
+               *end = '\0';
+       err = ltt_register_kprobe(buf);
+       if (err)
+               goto error;
+
+       mutex_unlock(&ltt_kprobes_mutex);
+       free_page((unsigned long)buf);
+       return count;
+error:
+       mutex_unlock(&ltt_kprobes_mutex);
+       free_page((unsigned long)buf);
+       return err;
+}
+
+static const struct file_operations ltt_kprobes_enable = {
+       .write = enable_op_write,
+};
+
+static ssize_t disable_op_write(struct file *file,
+       const char __user *user_buf, size_t count, loff_t *ppos)
+{
+       int err, buf_size;
+       char *end;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+
+       if (!buf)
+               return -ENOMEM;
+
+       mutex_lock(&ltt_kprobes_mutex);
+       if (module_exit)
+               goto end;
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       if (copy_from_user(buf, user_buf, buf_size)) {
+               err = -EFAULT;
+               goto error;
+       }
+       buf[buf_size] = '\0';
+       end = strchr(buf, '\n');
+       if (end)
+               *end = '\0';
+       err = ltt_unregister_kprobe(buf);
+       if (err)
+               goto error;
+end:
+       mutex_unlock(&ltt_kprobes_mutex);
+       free_page((unsigned long)buf);
+       return count;
+error:
+       mutex_unlock(&ltt_kprobes_mutex);
+       free_page((unsigned long)buf);
+       return err;
+}
+
+static const struct file_operations ltt_kprobes_disable = {
+       .write = disable_op_write,
+};
+
+/*
+ * This seqfile read is not perfectly safe, as a kprobe could be removed from
+ * the hash table between two reads. This will result in an incomplete output.
+ */
+static struct kprobe_entry *ltt_find_next_kprobe(struct kprobe_entry *prev)
+{
+       struct kprobe_entry *e;
+       struct hlist_head *head;
+       struct hlist_node *node;
+       unsigned int i;
+       int found = 0;
+
+       if (prev == (void *)-1UL)
+               return NULL;
+
+       if (!prev)
+               found = 1;
+
+       for (i = 0; i < LTT_KPROBE_TABLE_SIZE; i++) {
+               head = &ltt_kprobe_table[i];
+               hlist_for_each_entry(e, node, head, hlist) {
+                       if (found)
+                               return e;
+                       if (e == prev)
+                               found = 1;
+               }
+       }
+       return NULL;
+}
+
+static void *lk_next(struct seq_file *m, void *p, loff_t *pos)
+{
+       m->private = ltt_find_next_kprobe(m->private);
+       if (!m->private) {
+               m->private = (void *)-1UL;
+               return NULL;
+       }
+       return m->private;
+}
+
+static void *lk_start(struct seq_file *m, loff_t *pos)
+{
+       mutex_lock(&ltt_kprobes_mutex);
+       if (!*pos)
+               m->private = NULL;
+       m->private = ltt_find_next_kprobe(m->private);
+       if (!m->private) {
+               m->private = (void *)-1UL;
+               return NULL;
+       }
+       return m->private;
+}
+
+static void lk_stop(struct seq_file *m, void *p)
+{
+       mutex_unlock(&ltt_kprobes_mutex);
+}
+
+static int lk_show(struct seq_file *m, void *p)
+{
+       struct kprobe_entry *e = m->private;
+       seq_printf(m, "%s\n", e->key);
+       return 0;
+}
+
+static const struct seq_operations ltt_kprobes_list_op = {
+       .start = lk_start,
+       .next = lk_next,
+       .stop = lk_stop,
+       .show = lk_show,
+};
+
+static int ltt_kprobes_list_open(struct inode *inode, struct file *file)
+{
+       int ret;
+
+       ret = seq_open(file, &ltt_kprobes_list_op);
+       if (ret == 0)
+               ((struct seq_file *)file->private_data)->private = NULL;
+       return ret;
+}
+
+static int ltt_kprobes_list_release(struct inode *inode, struct file *file)
+{
+       struct seq_file *seq = file->private_data;
+
+       seq->private = NULL;
+       return seq_release(inode, file);
+}
+
+static const struct file_operations ltt_kprobes_list = {
+       .open = ltt_kprobes_list_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = ltt_kprobes_list_release,
+};
+
+/*
+ * kprobes table dump. Callback invoked by ltt-statedump. ltt-statedump must
+ * take a reference to this module before calling this callback.
+ */
+void ltt_dump_kprobes_table(void *call_data)
+{
+       struct kprobe_entry *e;
+       struct hlist_head *head;
+       struct hlist_node *node;
+       unsigned int i;
+
+       for (i = 0; i < LTT_KPROBE_TABLE_SIZE; i++) {
+               head = &ltt_kprobe_table[i];
+               hlist_for_each_entry(e, node, head, hlist)
+                       trace_kprobe_table_entry(call_data, e);
+       }
+}
+EXPORT_SYMBOL_GPL(ltt_dump_kprobes_table);
+
+static int __init ltt_kprobes_init(void)
+{
+       struct dentry *ltt_root_dentry;
+       int ret = 0;
+
+       printk(KERN_INFO "LTT : ltt-kprobes init\n");
+       mutex_lock(&ltt_kprobes_mutex);
+
+       ltt_root_dentry = get_ltt_root();
+       if (!ltt_root_dentry) {
+               ret = -ENOENT;
+               goto err_no_root;
+       }
+
+       ltt_kprobes_dir = debugfs_create_dir(LTT_KPROBES_DIR, ltt_root_dentry);
+       if (!ltt_kprobes_dir) {
+               printk(KERN_ERR
+                      "ltt_kprobes_init: failed to create dir %s\n",
+                       LTT_KPROBES_DIR);
+               ret = -ENOMEM;
+               goto err_no_dir;
+       }
+
+       ltt_kprobes_enable_dentry = debugfs_create_file(LTT_KPROBES_ENABLE,
+                                                       S_IWUSR,
+                                                       ltt_kprobes_dir, NULL,
+                                                       &ltt_kprobes_enable);
+       if (IS_ERR(ltt_kprobes_enable_dentry) || !ltt_kprobes_enable_dentry) {
+               printk(KERN_ERR
+                      "ltt_kprobes_init: failed to create file %s\n",
+                       LTT_KPROBES_ENABLE);
+               ret = -ENOMEM;
+               goto err_no_enable;
+       }
+
+       ltt_kprobes_disable_dentry = debugfs_create_file(LTT_KPROBES_DISABLE,
+                                                        S_IWUSR,
+                                                        ltt_kprobes_dir, NULL,
+                                                        &ltt_kprobes_disable);
+       if (IS_ERR(ltt_kprobes_disable_dentry) || !ltt_kprobes_disable_dentry) {
+               printk(KERN_ERR
+                      "ltt_kprobes_init: failed to create file %s\n",
+                       LTT_KPROBES_DISABLE);
+               ret = -ENOMEM;
+               goto err_no_disable;
+       }
+
+       ltt_kprobes_list_dentry = debugfs_create_file(LTT_KPROBES_LIST,
+                                                     S_IRUSR, ltt_kprobes_dir,
+                                                     NULL, &ltt_kprobes_list);
+       if (IS_ERR(ltt_kprobes_list_dentry) || !ltt_kprobes_list_dentry) {
+               printk(KERN_ERR
+                      "ltt_kprobes_init: failed to create file %s\n",
+                       LTT_KPROBES_LIST);
+               ret = -ENOMEM;
+               goto err_no_list;
+       }
+       ltt_statedump_register_kprobes_dump(ltt_dump_kprobes_table);
+
+       mutex_unlock(&ltt_kprobes_mutex);
+       return ret;
+
+err_no_list:
+       debugfs_remove(ltt_kprobes_disable_dentry);
+err_no_disable:
+       debugfs_remove(ltt_kprobes_enable_dentry);
+err_no_enable:
+       debugfs_remove(ltt_kprobes_dir);
+err_no_dir:
+err_no_root:
+       mutex_unlock(&ltt_kprobes_mutex);
+       return ret;
+}
+module_init(ltt_kprobes_init);
+
+static void __exit ltt_kprobes_exit(void)
+{
+       printk(KERN_INFO "LTT : ltt-kprobes exit\n");
+       mutex_lock(&ltt_kprobes_mutex);
+       module_exit = 1;
+       ltt_statedump_unregister_kprobes_dump(ltt_dump_kprobes_table);
+       debugfs_remove(ltt_kprobes_list_dentry);
+       debugfs_remove(ltt_kprobes_disable_dentry);
+       debugfs_remove(ltt_kprobes_enable_dentry);
+       debugfs_remove(ltt_kprobes_dir);
+       ltt_unregister_all_kprobes();
+       mutex_unlock(&ltt_kprobes_mutex);
+}
+module_exit(ltt_kprobes_exit);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Kprobes Support");
diff --git a/ltt-marker-control.c b/ltt-marker-control.c
new file mode 100644 (file)
index 0000000..6142430
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * Copyright (C) 2007 Mathieu Desnoyers
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/vmalloc.h>
+#include <linux/marker.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include "ltt-tracer.h"
+
+#define DEFAULT_CHANNEL "cpu"
+#define DEFAULT_PROBE "default"
+
+LIST_HEAD(probes_list);
+
+/*
+ * Mutex protecting the probe slab cache.
+ * Nests inside the traces mutex.
+ */
+DEFINE_MUTEX(probes_mutex);
+
+struct ltt_available_probe default_probe = {
+       .name = "default",
+       .format = NULL,
+       .probe_func = ltt_vtrace,
+       .callbacks[0] = ltt_serialize_data,
+};
+
+static struct kmem_cache *markers_loaded_cachep;
+static LIST_HEAD(markers_loaded_list);
+/*
+ * List sorted by name strcmp order.
+ */
+static LIST_HEAD(probes_registered_list);
+
+static struct ltt_available_probe *get_probe_from_name(const char *pname)
+{
+       struct ltt_available_probe *iter;
+       int comparison, found = 0;
+
+       if (!pname)
+               pname = DEFAULT_PROBE;
+       list_for_each_entry(iter, &probes_registered_list, node) {
+               comparison = strcmp(pname, iter->name);
+               if (!comparison)
+                       found = 1;
+               if (comparison <= 0)
+                       break;
+       }
+       if (found)
+               return iter;
+       else
+               return NULL;
+}
+
+int ltt_probe_register(struct ltt_available_probe *pdata)
+{
+       int ret = 0;
+       int comparison;
+       struct ltt_available_probe *iter;
+
+       mutex_lock(&probes_mutex);
+       list_for_each_entry_reverse(iter, &probes_registered_list, node) {
+               comparison = strcmp(pdata->name, iter->name);
+               if (!comparison) {
+                       ret = -EBUSY;
+                       goto end;
+               } else if (comparison > 0) {
+                       /* We belong to the location right after iter. */
+                       list_add(&pdata->node, &iter->node);
+                       goto end;
+               }
+       }
+       /* Should be added at the head of the list */
+       list_add(&pdata->node, &probes_registered_list);
+end:
+       mutex_unlock(&probes_mutex);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_probe_register);
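+
+/*
+ * Illustrative sketch (not part of the original sources): a client module
+ * would typically register its probe by filling an ltt_available_probe and
+ * calling ltt_probe_register(). The "my_probe" name and format string
+ * below are hypothetical.
+ *
+ *	static struct ltt_available_probe my_probe = {
+ *		.name = "my_probe",
+ *		.format = "val %d",
+ *		.probe_func = ltt_vtrace,
+ *		.callbacks[0] = ltt_serialize_data,
+ *	};
+ *
+ *	ret = ltt_probe_register(&my_probe); // 0, or -EBUSY if name is taken
+ */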
+
+/*
+ * Called when a probe does not want to be called anymore.
+ */
+int ltt_probe_unregister(struct ltt_available_probe *pdata)
+{
+       int ret = 0;
+       struct ltt_active_marker *amark, *tmp;
+
+       mutex_lock(&probes_mutex);
+       list_for_each_entry_safe(amark, tmp, &markers_loaded_list, node) {
+               if (amark->probe == pdata) {
+                       ret = marker_probe_unregister_private_data(
+                               pdata->probe_func, amark);
+                       if (ret)
+                               goto end;
+                       list_del(&amark->node);
+                       kmem_cache_free(markers_loaded_cachep, amark);
+               }
+       }
+       list_del(&pdata->node);
+end:
+       mutex_unlock(&probes_mutex);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_probe_unregister);
+
+/*
+ * Connect marker "mname" to probe "pname".
+ * Only allow a _single_ probe instance to be connected to a marker.
+ */
+int ltt_marker_connect(const char *channel, const char *mname,
+                      const char *pname)
+{
+       int ret;
+       struct ltt_active_marker *pdata;
+       struct ltt_available_probe *probe;
+
+       ltt_lock_traces();
+       mutex_lock(&probes_mutex);
+       probe = get_probe_from_name(pname);
+       if (!probe) {
+               ret = -ENOENT;
+               goto end;
+       }
+       pdata = marker_get_private_data(channel, mname, probe->probe_func, 0);
+       if (pdata && !IS_ERR(pdata)) {
+               ret = -EEXIST;
+               goto end;
+       }
+       pdata = kmem_cache_zalloc(markers_loaded_cachep, GFP_KERNEL);
+       if (!pdata) {
+               ret = -ENOMEM;
+               goto end;
+       }
+       pdata->probe = probe;
+       /*
+        * ID has priority over channel in case of conflict.
+        */
+       ret = marker_probe_register(channel, mname, NULL,
+               probe->probe_func, pdata);
+       if (ret)
+               kmem_cache_free(markers_loaded_cachep, pdata);
+       else
+               list_add(&pdata->node, &markers_loaded_list);
+end:
+       mutex_unlock(&probes_mutex);
+       ltt_unlock_traces();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_marker_connect);
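+
+/*
+ * Illustrative sketch (assumed usage, not from the original sources):
+ * connecting a marker to the built-in "default" probe, then disconnecting
+ * it. The "cpu" channel and "my_event" marker names are hypothetical.
+ *
+ *	ret = ltt_marker_connect("cpu", "my_event", "default");
+ *	...
+ *	ret = ltt_marker_disconnect("cpu", "my_event", "default");
+ */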
+
+/*
+ * Disconnect marker "mname", probe "pname".
+ */
+int ltt_marker_disconnect(const char *channel, const char *mname,
+                         const char *pname)
+{
+       struct ltt_active_marker *pdata;
+       struct ltt_available_probe *probe;
+       int ret = 0;
+
+       mutex_lock(&probes_mutex);
+       probe = get_probe_from_name(pname);
+       if (!probe) {
+               ret = -ENOENT;
+               goto end;
+       }
+       pdata = marker_get_private_data(channel, mname, probe->probe_func, 0);
+       if (IS_ERR(pdata)) {
+               ret = PTR_ERR(pdata);
+               goto end;
+       } else if (!pdata) {
+               /*
+                * Not registered by us.
+                */
+               ret = -EPERM;
+               goto end;
+       }
+       ret = marker_probe_unregister(channel, mname, probe->probe_func, pdata);
+       if (ret)
+               goto end;
+       list_del(&pdata->node);
+       kmem_cache_free(markers_loaded_cachep, pdata);
+end:
+       mutex_unlock(&probes_mutex);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_marker_disconnect);
+
+static void disconnect_all_markers(void)
+{
+       struct ltt_active_marker *pdata, *tmp;
+
+       list_for_each_entry_safe(pdata, tmp, &markers_loaded_list, node) {
+               marker_probe_unregister_private_data(pdata->probe->probe_func,
+                       pdata);
+               list_del(&pdata->node);
+               kmem_cache_free(markers_loaded_cachep, pdata);
+       }
+}
+
+static int __init marker_control_init(void)
+{
+       int ret;
+
+       markers_loaded_cachep = KMEM_CACHE(ltt_active_marker, 0);
+
+       ret = ltt_probe_register(&default_probe);
+       BUG_ON(ret);
+       ret = ltt_marker_connect("metadata", "core_marker_format",
+                                DEFAULT_PROBE);
+       BUG_ON(ret);
+       ret = ltt_marker_connect("metadata", "core_marker_id", DEFAULT_PROBE);
+       BUG_ON(ret);
+
+       return 0;
+}
+module_init(marker_control_init);
+
+static void __exit marker_control_exit(void)
+{
+       int ret;
+
+       ret = ltt_marker_disconnect("metadata", "core_marker_format",
+                                   DEFAULT_PROBE);
+       BUG_ON(ret);
+       ret = ltt_marker_disconnect("metadata", "core_marker_id",
+                                   DEFAULT_PROBE);
+       BUG_ON(ret);
+       ret = ltt_probe_unregister(&default_probe);
+       BUG_ON(ret);
+       disconnect_all_markers();
+       kmem_cache_destroy(markers_loaded_cachep);
+       marker_synchronize_unregister();
+}
+module_exit(marker_control_exit);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Marker Control");
diff --git a/ltt-relay-alloc.c b/ltt-relay-alloc.c
new file mode 100644 (file)
index 0000000..a6697de
--- /dev/null
@@ -0,0 +1,732 @@
+/*
+ * ltt-relay-alloc.c
+ *
+ * Copyright (C) 2008,2009 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+
+#include "ltt-relay.h"
+#include "ltt-tracer.h"
+#include "ltt-relay-lockless.h"        /* for cpu hotplug */
+
+/**
+ * ltt_chanbuf_allocate - allocate a channel buffer
+ * @buf: the buffer struct
+ * @size: total size of the buffer
+ * @n_sb: number of subbuffers
+ * @extra_reader_sb: need extra subbuffer for reader
+ */
+static
+int ltt_chanbuf_allocate(struct ltt_chanbuf_alloc *buf, size_t size,
+                        size_t n_sb, int extra_reader_sb)
+{
+       long i, j, n_pages, n_pages_per_sb, page_idx = 0;
+       struct page **pages;
+       void **virt;
+
+       n_pages = size >> PAGE_SHIFT;
+       n_pages_per_sb = n_pages >> get_count_order(n_sb);
+       if (extra_reader_sb)
+               n_pages += n_pages_per_sb;      /* Add pages for reader */
+
+       pages = kmalloc_node(max_t(size_t, sizeof(*pages) * n_pages,
+                                  1 << INTERNODE_CACHE_SHIFT),
+                       GFP_KERNEL, cpu_to_node(buf->cpu));
+       if (unlikely(!pages))
+               goto pages_error;
+
+       virt = kmalloc_node(ALIGN(sizeof(*virt) * n_pages,
+                                 1 << INTERNODE_CACHE_SHIFT),
+                       GFP_KERNEL, cpu_to_node(buf->cpu));
+       if (unlikely(!virt))
+               goto virt_error;
+
+       for (i = 0; i < n_pages; i++) {
+               pages[i] = alloc_pages_node(cpu_to_node(buf->cpu),
+                       GFP_KERNEL | __GFP_ZERO, 0);
+               if (unlikely(!pages[i]))
+                       goto depopulate;
+               virt[i] = page_address(pages[i]);
+       }
+       buf->nr_pages = n_pages;
+       buf->_pages = pages;
+       buf->_virt = virt;
+
+       /* Allocate write-side page index */
+       buf->buf_wsb = kzalloc_node(max_t(size_t,
+                               sizeof(struct chanbuf_sb) * n_sb,
+                               1 << INTERNODE_CACHE_SHIFT),
+                               GFP_KERNEL, cpu_to_node(buf->cpu));
+       if (unlikely(!buf->buf_wsb))
+               goto depopulate;
+
+       for (i = 0; i < n_sb; i++) {
+               buf->buf_wsb[i].pages =
+                       kzalloc_node(max_t(size_t,
+                               sizeof(struct chanbuf_page) * n_pages_per_sb,
+                               1 << INTERNODE_CACHE_SHIFT),
+                               GFP_KERNEL, cpu_to_node(buf->cpu));
+               if (!buf->buf_wsb[i].pages)
+                       goto free_buf_wsb;
+       }
+
+       if (extra_reader_sb) {
+               /* Allocate read-side page index */
+               buf->buf_rsb.pages =
+                       kzalloc_node(max_t(size_t,
+                               sizeof(struct chanbuf_page) * n_pages_per_sb,
+                               1 << INTERNODE_CACHE_SHIFT),
+                               GFP_KERNEL, cpu_to_node(buf->cpu));
+               if (unlikely(!buf->buf_rsb.pages))
+                       goto free_buf_wsb;
+       } else {
+               buf->buf_rsb.pages = buf->buf_wsb[0].pages;
+       }
+
+       /* Assign pages to write-side page index */
+       for (i = 0; i < n_sb; i++) {
+               for (j = 0; j < n_pages_per_sb; j++) {
+                       WARN_ON(page_idx >= n_pages);
+                       buf->buf_wsb[i].pages[j].virt = virt[page_idx];
+                       buf->buf_wsb[i].pages[j].page = pages[page_idx];
+                       page_idx++;
+               }
+               RCHAN_SB_SET_NOREF(buf->buf_wsb[i].pages);
+       }
+
+       if (extra_reader_sb) {
+               for (j = 0; j < n_pages_per_sb; j++) {
+                       WARN_ON(page_idx >= n_pages);
+                       buf->buf_rsb.pages[j].virt = virt[page_idx];
+                       buf->buf_rsb.pages[j].page = pages[page_idx];
+                       page_idx++;
+               }
+               RCHAN_SB_SET_NOREF(buf->buf_rsb.pages);
+       }
+
+       /*
+        * If kmalloc ever uses vmalloc underneath, make sure the buffer pages
+        * will not fault.
+        */
+       vmalloc_sync_all();
+       return 0;
+
+free_buf_wsb:
+       for (i = 0; i < n_sb; i++) {
+               RCHAN_SB_CLEAR_NOREF(buf->buf_wsb[i].pages);
+               kfree(buf->buf_wsb[i].pages);
+       }
+       kfree(buf->buf_wsb);
+depopulate:
+       /*
+        * Free all pages from [ i - 1 down to 0 ].
+        * If i = 0, don't free anything.
+        */
+       for (i--; i >= 0; i--)
+               __free_page(pages[i]);
+       kfree(virt);
+virt_error:
+       kfree(pages);
+pages_error:
+       return -ENOMEM;
+}
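+
+/*
+ * Worked example for ltt_chanbuf_allocate() (hypothetical numbers,
+ * assuming 4 KiB pages): with size = 128 KiB and n_sb = 4,
+ * n_pages = 32 and n_pages_per_sb = 8. If extra_reader_sb is set,
+ * 8 extra pages are allocated, so 40 pages back the 4 write-side
+ * subbuffers plus the 1 read-side subbuffer.
+ */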
+
+int ltt_chanbuf_alloc_create(struct ltt_chanbuf_alloc *buf,
+                            struct ltt_chan_alloc *chan, int cpu)
+{
+       int ret = 0;
+
+       ret = ltt_chanbuf_allocate(buf, chan->buf_size, chan->n_sb,
+                                  chan->extra_reader_sb);
+       if (ret)
+               goto end;
+
+       buf->chan = chan;
+       buf->cpu = cpu;
+end:
+       return ret;
+}
+
+void ltt_chanbuf_alloc_free(struct ltt_chanbuf_alloc *buf)
+{
+       struct ltt_chan_alloc *chan = buf->chan;
+       struct page **pages;
+       long i;
+
+       /* Destroy index */
+       if (chan->extra_reader_sb) {
+               RCHAN_SB_CLEAR_NOREF(buf->buf_rsb.pages);
+               kfree(buf->buf_rsb.pages);
+       }
+       for (i = 0; i < chan->n_sb; i++) {
+               RCHAN_SB_CLEAR_NOREF(buf->buf_wsb[i].pages);
+               kfree(buf->buf_wsb[i].pages);
+       }
+       kfree(buf->buf_wsb);
+
+       /* Destroy pages */
+       pages = buf->_pages;
+       for (i = 0; i < buf->nr_pages; i++)
+               __free_page(pages[i]);
+       kfree(buf->_pages);
+       kfree(buf->_virt);
+       buf->allocated = 0;
+}
+
+/**
+ *     ltt_relay_hotcpu_callback - CPU hotplug callback
+ *     @nb: notifier block
+ *     @action: hotplug action to take
+ *     @hcpu: CPU number
+ *
+ *     Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
+ */
+static
+int __cpuinit ltt_relay_hotcpu_callback(struct notifier_block *nb,
+                                       unsigned long action,
+                                       void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+       struct ltt_trace *trace;
+       struct ltt_chan *chan;
+       struct ltt_chanbuf *buf;
+       int ret, i;
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+       case CPU_UP_PREPARE_FROZEN:
+               /*
+                * CPU hotplug lock protects trace lock from this callback.
+                */
+               __list_for_each_entry_rcu(trace, &ltt_traces.head, list) {
+                       for (i = 0; i < trace->nr_channels; i++) {
+                               chan = &trace->channels[i];
+                               buf = per_cpu_ptr(chan->a.buf, cpu);
+                               ret = ltt_chanbuf_create(buf, &chan->a, cpu);
+                               if (ret) {
+                                       printk(KERN_ERR
+                                         "ltt_relay_hotcpu_callback: cpu %d "
+                                         "buffer creation failed\n", cpu);
+                                       return NOTIFY_BAD;
+                               }
+                       }
+               }
+               break;
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               /*
+                * No need to do a buffer switch here, because it will happen
+                * when tracing is stopped, or will be done by the switch
+                * timer CPU_DEAD callback.
+                */
+               break;
+       }
+       return NOTIFY_OK;
+}
+
+/*
+ * Must be called with either trace lock or rcu read lock sched held.
+ */
+void ltt_chan_for_each_channel(void (*cb) (struct ltt_chanbuf *buf), int cpu)
+{
+       struct ltt_trace *trace;
+       struct ltt_chan *chan;
+       struct ltt_chanbuf *buf;
+       int i;
+
+       __list_for_each_entry_rcu(trace, &ltt_traces.head, list) {
+               for (i = 0; i < trace->nr_channels; i++) {
+                       chan = &trace->channels[i];
+                       if (!chan->active)
+                               continue;
+                       buf = per_cpu_ptr(chan->a.buf, cpu);
+                       cb(buf);
+               }
+       }
+}
+
+/**
+ * ltt_chan_alloc_init - create a new relay channel
+ * @chan: channel
+ * @trace: trace
+ * @base_filename: base name of files to create
+ * @parent: dentry of parent directory, %NULL for root directory
+ * @sb_size: size of sub-buffers (>= PAGE_SIZE, power of 2)
+ * @n_sb: number of sub-buffers (power of 2)
+ * @extra_reader_sb: allocate an extra subbuffer for the reader
+ * @overwrite: channel is in overwrite mode
+ *
+ * Returns 0 if successful, a negative error value otherwise.
+ *
+ * Creates per-cpu channel buffers using the sizes and attributes
+ * specified.  The created channel buffer files will be named
+ * base_filename_0...base_filename_N-1.  File permissions will
+ * be %S_IRUSR.
+ */
+int ltt_chan_alloc_init(struct ltt_chan_alloc *chan, struct ltt_trace *trace,
+                       const char *base_filename,
+                       struct dentry *parent, size_t sb_size,
+                       size_t n_sb, int extra_reader_sb, int overwrite)
+{
+       unsigned int i;
+       int ret;
+
+       if (!base_filename)
+               return -EPERM;
+
+       if (!(sb_size && n_sb))
+               return -EPERM;
+
+       /* Check that the subbuffer size is at least a page. */
+       WARN_ON_ONCE(sb_size < PAGE_SIZE);
+
+       /*
+        * Make sure the number of subbuffers and subbuffer size are power of 2.
+        */
+       WARN_ON_ONCE(hweight32(sb_size) != 1);
+       WARN_ON(hweight32(n_sb) != 1);
+
+       chan->trace = trace;
+       chan->buf_size = n_sb * sb_size;
+       chan->sb_size = sb_size;
+       chan->sb_size_order = get_count_order(sb_size);
+       chan->n_sb_order = get_count_order(n_sb);
+       chan->extra_reader_sb = extra_reader_sb;
+       chan->n_sb = n_sb;
+       chan->parent = parent;
+       strlcpy(chan->filename, base_filename, NAME_MAX);
+       kref_init(&chan->kref);
+       kref_get(&chan->trace->kref);
+
+       /* Allocating the child structure */
+       chan->buf = alloc_percpu(struct ltt_chanbuf);
+       if (!chan->buf)
+               goto free_chan;
+
+       for_each_online_cpu(i) {
+               ret = ltt_chanbuf_create(per_cpu_ptr(chan->buf, i), chan, i);
+               if (ret)
+                       goto free_bufs;
+       }
+
+       return 0;
+
+free_bufs:
+       for_each_possible_cpu(i) {
+               struct ltt_chanbuf *buf = per_cpu_ptr(chan->buf, i);
+
+               if (!buf->a.allocated)
+                       continue;
+               ltt_chanbuf_remove_file(buf);
+               ltt_chanbuf_free(buf);
+       }
+       free_percpu(chan->buf);
+free_chan:
+       kref_put(&chan->kref, ltt_chan_free);
+       return -ENOMEM;
+}
+
+/**
+ * ltt_chan_alloc_remove_files - remove channel files.
+ * @chan: the channel
+ *
+ * Remove all channel files and wait for dentry use counts to become zero.
+ */
+void ltt_chan_alloc_remove_files(struct ltt_chan_alloc *chan)
+{
+       unsigned int i;
+       struct dentry *dentry;
+
+       for_each_possible_cpu(i) {
+               struct ltt_chanbuf *buf = per_cpu_ptr(chan->buf, i);
+
+               if (!buf->a.allocated)
+                       continue;
+               dentry = dget(buf->a.dentry);
+               ltt_chanbuf_remove_file(buf);
+               /* TODO: wait / wakeup instead */
+               /*
+                * Wait for every reference to the dentry to be gone,
+                * except us.
+                */
+               while (atomic_read(&dentry->d_count) != 1)
+                       msleep(100);
+               dput(dentry);
+       }
+}
+
+/**
+ * ltt_chan_alloc_free - destroy the channel
+ * @chan: the channel
+ *
+ * Destroys all channel buffers and frees the channel.
+ */
+void ltt_chan_alloc_free(struct ltt_chan_alloc *chan)
+{
+       unsigned int i;
+
+       for_each_possible_cpu(i) {
+               struct ltt_chanbuf *buf = per_cpu_ptr(chan->buf, i);
+
+               if (!buf->a.allocated)
+                       continue;
+               ltt_chanbuf_free(buf);
+       }
+       free_percpu(chan->buf);
+       kref_put(&chan->trace->kref, ltt_release_trace);
+       wake_up_interruptible(&chan->trace->kref_wq);
+}
+
+/**
+ * _ltt_relay_write - write data to an ltt_relay buffer.
+ * @bufa : buffer
+ * @offset : offset within the buffer
+ * @src : source address
+ * @len : length to write
+ * @pagecpy : number of bytes already copied within the current page
+ */
+void _ltt_relay_write(struct ltt_chanbuf_alloc *bufa, size_t offset,
+                     const void *src, size_t len, ssize_t pagecpy)
+{
+       struct ltt_chan_alloc *chana = bufa->chan;
+       size_t sbidx, index;
+       struct chanbuf_page *rpages;
+
+       do {
+               len -= pagecpy;
+               src += pagecpy;
+               offset += pagecpy;
+               sbidx = offset >> chana->sb_size_order;
+               index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+
+               /*
+                * Underlying layer should never ask for writes across
+                * subbuffers.
+                */
+               WARN_ON(offset >= chana->buf_size);
+
+               pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
+               rpages = bufa->buf_wsb[sbidx].pages;
+               WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+               ltt_relay_do_copy(rpages[index].virt + (offset & ~PAGE_MASK),
+                                 src, pagecpy);
+       } while (unlikely(len != pagecpy));
+}
+EXPORT_SYMBOL_GPL(_ltt_relay_write);
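+
+/*
+ * Worked example of the offset arithmetic above (hypothetical numbers,
+ * assuming 4 KiB pages): with sb_size = 64 KiB (sb_size_order = 16), an
+ * offset of 70000 selects subbuffer sbidx = 70000 >> 16 = 1 and page
+ * index = (70000 & 0xffff) >> 12 = 1 within it; at most
+ * PAGE_SIZE - (offset & ~PAGE_MASK) bytes are copied per iteration.
+ */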
+
+/**
+ * _ltt_relay_strncpy_fixup - fix an incomplete string in an ltt_relay buffer.
+ * @bufa : buffer
+ * @offset : offset within the buffer
+ * @len : length to write
+ * @copied: number of bytes actually copied
+ * @terminated: whether the copied string ends with \0
+ *
+ * Pads an incompletely copied string with 'X' characters and makes sure it
+ * is '\0'-terminated.
+ */
+void _ltt_relay_strncpy_fixup(struct ltt_chanbuf_alloc *bufa, size_t offset,
+                             size_t len, size_t copied, int terminated)
+{
+       struct ltt_chan_alloc *chana = bufa->chan;
+       size_t sbidx, index;
+       ssize_t pagecpy;
+       struct chanbuf_page *rpages;
+
+       if (copied == len) {
+               /*
+                * Deal with non-terminated string.
+                */
+               WARN_ON_ONCE(terminated);
+               offset += copied - 1;
+               sbidx = offset >> chana->sb_size_order;
+               index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+               /*
+                * Underlying layer should never ask for writes across
+                * subbuffers.
+                */
+               WARN_ON(offset >= chana->buf_size);
+               rpages = bufa->buf_wsb[sbidx].pages;
+               WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+               ltt_relay_do_memset(rpages[index].virt + (offset & ~PAGE_MASK),
+                                   '\0', 1);
+               return;
+       }
+
+       /*
+        * Deal with incomplete string.
+        * Overwrite string's \0 with X too.
+        */
+       pagecpy = copied - 1;
+       do {
+               WARN_ON_ONCE(!terminated);
+               len -= pagecpy;
+               offset += pagecpy;
+               sbidx = offset >> chana->sb_size_order;
+               index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+
+               /*
+                * Underlying layer should never ask for writes across
+                * subbuffers.
+                */
+               WARN_ON(offset >= chana->buf_size);
+
+               pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
+               rpages = bufa->buf_wsb[sbidx].pages;
+               WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+               ltt_relay_do_memset(rpages[index].virt + (offset & ~PAGE_MASK),
+                                   'X', pagecpy);
+       } while (unlikely(len != pagecpy));
+       /*
+        * Overwrite last 'X' with '\0'.
+        */
+       offset += pagecpy - 1;
+       sbidx = offset >> chana->sb_size_order;
+       index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+       /*
+        * Underlying layer should never ask for writes across subbuffers.
+        */
+       WARN_ON(offset >= chana->buf_size);
+       rpages = bufa->buf_wsb[sbidx].pages;
+       WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+       ltt_relay_do_memset(rpages[index].virt + (offset & ~PAGE_MASK),
+                           '\0', 1);
+}
+EXPORT_SYMBOL_GPL(_ltt_relay_strncpy_fixup);
+
+/**
+ * _ltt_relay_strncpy - copy a string to an ltt_relay buffer.
+ * @bufa : buffer
+ * @offset : offset within the buffer
+ * @src : source address
+ * @len : length to write
+ * @pagecpy : number of bytes already copied within the current page
+ */
+void _ltt_relay_strncpy(struct ltt_chanbuf_alloc *bufa, size_t offset,
+                       const void *src, size_t len, ssize_t pagecpy)
+{
+       struct ltt_chan_alloc *chana = bufa->chan;
+       size_t sbidx, index, copied;
+       struct chanbuf_page *rpages;
+       int terminated;
+
+       do {
+               len -= pagecpy;
+               src += pagecpy;
+               offset += pagecpy;
+               sbidx = offset >> chana->sb_size_order;
+               index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+
+               /*
+                * Underlying layer should never ask for writes across
+                * subbuffers.
+                */
+               WARN_ON(offset >= chana->buf_size);
+
+               pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
+               rpages = bufa->buf_wsb[sbidx].pages;
+               WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+               copied = ltt_relay_do_strncpy(rpages[index].virt
+                                             + (offset & ~PAGE_MASK),
+                                             src, pagecpy, &terminated);
+               if (copied < pagecpy || ((len == pagecpy) && !terminated)) {
+                       _ltt_relay_strncpy_fixup(bufa, offset, len, copied,
+                                                terminated);
+                       break;
+               }
+       } while (unlikely(len != pagecpy));
+}
+EXPORT_SYMBOL_GPL(_ltt_relay_strncpy);
+
+/**
+ * ltt_relay_read - read data from an ltt_relay buffer.
+ * @bufa : buffer
+ * @offset : offset within the buffer
+ * @dest : destination address
+ * @len : length to read
+ *
+ * Should be protected by get_subbuf/put_subbuf.
+ */
+int ltt_relay_read(struct ltt_chanbuf_alloc *bufa, size_t offset, void *dest,
+                  size_t len)
+{
+       struct ltt_chan_alloc *chana = bufa->chan;
+       size_t index;
+       ssize_t pagecpy, orig_len;
+       struct chanbuf_page *rpages;
+
+       orig_len = len;
+       offset &= chana->buf_size - 1;
+       index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+       if (unlikely(!len))
+               return 0;
+       for (;;) {
+               pagecpy = min_t(size_t, len, PAGE_SIZE - (offset & ~PAGE_MASK));
+               rpages = bufa->buf_rsb.pages;
+               WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+               memcpy(dest, rpages[index].virt + (offset & ~PAGE_MASK),
+                      pagecpy);
+               len -= pagecpy;
+               if (likely(!len))
+                       break;
+               dest += pagecpy;
+               offset += pagecpy;
+               index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+               /*
+                * Underlying layer should never ask for reads across
+                * subbuffers.
+                */
+               WARN_ON(offset >= chana->buf_size);
+       }
+       return orig_len;
+}
+EXPORT_SYMBOL_GPL(ltt_relay_read);
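+
+/*
+ * Illustrative sketch (assumed usage, not from the original sources): a
+ * kernel-side reader copies data out of the pinned read-side subbuffer
+ * between get_subbuf and put_subbuf, e.g.:
+ *
+ *	ret = ltt_chanbuf_get_subbuf(buf, &consumed);
+ *	if (!ret) {
+ *		ltt_relay_read(&buf->a, consumed, dest, len);
+ *		ltt_chanbuf_put_subbuf(buf, consumed);
+ *	}
+ */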
+
+/**
+ * ltt_relay_read_cstr - read a C-style string from an ltt_relay buffer.
+ * @bufa : buffer
+ * @offset : offset within the buffer
+ * @dest : destination address
+ * @len : destination's length
+ *
+ * Returns the string's length.
+ * Should be protected by get_subbuf/put_subbuf.
+ */
+int ltt_relay_read_cstr(struct ltt_chanbuf_alloc *bufa, size_t offset,
+                       void *dest, size_t len)
+{
+       struct ltt_chan_alloc *chana = bufa->chan;
+       size_t index;
+       ssize_t pagecpy, pagelen, strpagelen, orig_offset;
+       char *str;
+       struct chanbuf_page *rpages;
+
+       offset &= chana->buf_size - 1;
+       index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+       orig_offset = offset;
+       for (;;) {
+               rpages = bufa->buf_rsb.pages;
+               WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+               str = (char *)rpages[index].virt + (offset & ~PAGE_MASK);
+               pagelen = PAGE_SIZE - (offset & ~PAGE_MASK);
+               strpagelen = strnlen(str, pagelen);
+               if (len) {
+                       pagecpy = min_t(size_t, len, strpagelen);
+                       if (dest) {
+                               memcpy(dest, str, pagecpy);
+                               dest += pagecpy;
+                       }
+                       len -= pagecpy;
+               }
+               offset += strpagelen;
+               index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+               if (strpagelen < pagelen)
+                       break;
+               /*
+                * Underlying layer should never ask for reads across
+                * subbuffers.
+                */
+               WARN_ON(offset >= chana->buf_size);
+       }
+       if (dest && len)
+               ((char *)dest)[0] = 0;
+       return offset - orig_offset;
+}
+EXPORT_SYMBOL_GPL(ltt_relay_read_cstr);
+
+/**
+ * ltt_relay_read_get_page - Get a whole page to read from
+ * @bufa : buffer
+ * @offset : offset within the buffer
+ *
+ * Should be protected by get_subbuf/put_subbuf.
+ */
+struct page *ltt_relay_read_get_page(struct ltt_chanbuf_alloc *bufa,
+                                    size_t offset)
+{
+       size_t index;
+       struct chanbuf_page *rpages;
+       struct ltt_chan_alloc *chana = bufa->chan;
+
+       offset &= chana->buf_size - 1;
+       index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+       rpages = bufa->buf_rsb.pages;
+       WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+       return rpages[index].page;
+}
+EXPORT_SYMBOL_GPL(ltt_relay_read_get_page);
+
+/**
+ * ltt_relay_read_offset_address - get address of a location within the buffer
+ * @bufa : buffer
+ * @offset : offset within the buffer.
+ *
+ * Return the address where a given offset is located (for read).
+ * Should be used to get the current subbuffer header pointer. Given we know
+ * it's never on a page boundary, it's safe to write directly to this address,
+ * as long as the write is never bigger than a page size.
+ */
+void *ltt_relay_read_offset_address(struct ltt_chanbuf_alloc *bufa,
+                                   size_t offset)
+{
+       size_t index;
+       struct chanbuf_page *rpages;
+       struct ltt_chan_alloc *chana = bufa->chan;
+
+       offset &= chana->buf_size - 1;
+       index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+       rpages = bufa->buf_rsb.pages;
+       WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+       return rpages[index].virt + (offset & ~PAGE_MASK);
+}
+EXPORT_SYMBOL_GPL(ltt_relay_read_offset_address);
+
+/**
+ * ltt_relay_offset_address - get address of a location within the buffer
+ * @bufa : buffer
+ * @offset : offset within the buffer.
+ *
+ * Return the address where a given offset is located.
+ * Should be used to get the current subbuffer header pointer. Given we know
+ * it's never on a page boundary, it's safe to write directly to this address,
+ * as long as the write is never bigger than a page size.
+ */
+void *ltt_relay_offset_address(struct ltt_chanbuf_alloc *bufa, size_t offset)
+{
+       size_t sbidx, index;
+       struct chanbuf_page *rpages;
+       struct ltt_chan_alloc *chana = bufa->chan;
+
+       offset &= chana->buf_size - 1;
+       sbidx = offset >> chana->sb_size_order;
+       index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+       rpages = bufa->buf_wsb[sbidx].pages;
+       WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+       return rpages[index].virt + (offset & ~PAGE_MASK);
+}
+EXPORT_SYMBOL_GPL(ltt_relay_offset_address);
+
+static __init int ltt_relay_alloc_init(void)
+{
+       hotcpu_notifier(ltt_relay_hotcpu_callback, 5);
+       ltt_relay_init();
+       return 0;
+}
+
+static void __exit ltt_relay_alloc_exit(void)
+{
+       ltt_relay_exit();
+}
+
+module_init(ltt_relay_alloc_init);
+module_exit(ltt_relay_alloc_exit);
diff --git a/ltt-relay-lockless.c b/ltt-relay-lockless.c
new file mode 100644 (file)
index 0000000..0c4c83c
--- /dev/null
@@ -0,0 +1,1366 @@
+/*
+ * ltt/ltt-relay-lockless.c
+ *
+ * (C) Copyright 2005-2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * LTTng lockless buffer space management (reader/writer).
+ *
+ * Author:
+ *     Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Inspired from LTT :
+ *  Karim Yaghmour (karim@opersys.com)
+ *  Tom Zanussi (zanussi@us.ibm.com)
+ *  Bob Wisniewski (bob@watson.ibm.com)
+ * And from K42 :
+ *  Bob Wisniewski (bob@watson.ibm.com)
+ *
+ * Changelog:
+ *  08/10/08, Cleanup.
+ *  19/10/05, Complete lockless mechanism.
+ *  27/05/05, Modular redesign and rewrite.
+ *
+ * Userspace reader semantics:
+ * while (poll fd != POLLHUP) {
+ *   - ioctl RELAY_GET_SUBBUF_SIZE
+ *   while (1) {
+ *     - ioctl GET_SUBBUF
+ *     - splice 1 subbuffer worth of data to a pipe
+ *     - splice the data from pipe to disk/network
+ *     - ioctl PUT_SUBBUF, check error value
+ *       if err val < 0, previous subbuffer was corrupted.
+ *   }
+ * }
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/time.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/smp_lock.h>
+#include <linux/stat.h>
+#include <linux/cpu.h>
+#include <linux/idle.h>
+#include <linux/delay.h>
+#include <linux/notifier.h>
+#include <asm/atomic.h>
+#include <asm/local.h>
+
+#include "ltt-tracer.h"
+#include "ltt-relay.h"
+#include "ltt-relay-lockless.h"
+
+#if 0
+#define printk_dbg(fmt, args...) printk(fmt, args)
+#else
+#define printk_dbg(fmt, args...)
+#endif
+
+struct ltt_reserve_switch_offsets {
+       long begin, end, old;
+       long begin_switch, end_switch_current, end_switch_old;
+       size_t before_hdr_pad, size;
+};
+
+static
+void ltt_force_switch(struct ltt_chanbuf *buf, enum force_switch_mode mode);
+
+static
+void ltt_relay_print_buffer_errors(struct ltt_chan *chan, unsigned int cpu);
+
+static const struct file_operations ltt_file_operations;
+
+static
+void ltt_buffer_begin(struct ltt_chanbuf *buf, u64 tsc, unsigned int subbuf_idx)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+       struct ltt_subbuffer_header *header =
+               (struct ltt_subbuffer_header *)
+                       ltt_relay_offset_address(&buf->a,
+                               subbuf_idx * chan->a.sb_size);
+
+       header->cycle_count_begin = tsc;
+       header->data_size = 0xFFFFFFFF; /* for debugging */
+       ltt_write_trace_header(chan->a.trace, header);
+}
+
+/*
+ * offset is assumed never to be 0 here: we never deliver a completely empty
+ * subbuffer. The lost size is between 0 and subbuf_size-1.
+ */
+static
+void ltt_buffer_end(struct ltt_chanbuf *buf, u64 tsc, unsigned int offset,
+                   unsigned int subbuf_idx)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+       struct ltt_subbuffer_header *header =
+               (struct ltt_subbuffer_header *)
+                       ltt_relay_offset_address(&buf->a,
+                               subbuf_idx * chan->a.sb_size);
+       u32 data_size = SUBBUF_OFFSET(offset - 1, chan) + 1;
+
+       header->data_size = data_size;
+       header->sb_size = PAGE_ALIGN(data_size);
+       header->cycle_count_end = tsc;
+       header->events_lost = local_read(&buf->events_lost);
+       header->subbuf_corrupt = local_read(&buf->corrupted_subbuffers);
+}
+
+/*
+ * Must be called under trace lock or cpu hotplug protection.
+ */
+void ltt_chanbuf_free(struct ltt_chanbuf *buf)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+
+       ltt_relay_print_buffer_errors(chan, buf->a.cpu);
+#ifdef CONFIG_LTT_VMCORE
+       kfree(buf->commit_seq);
+#endif
+       kfree(buf->commit_count);
+
+       ltt_chanbuf_alloc_free(&buf->a);
+}
+
+/*
+ * Must be called under trace lock or cpu hotplug protection.
+ */
+int ltt_chanbuf_create(struct ltt_chanbuf *buf, struct ltt_chan_alloc *chana,
+                      int cpu)
+{
+       struct ltt_chan *chan = container_of(chana, struct ltt_chan, a);
+       struct ltt_trace *trace = chana->trace;
+       unsigned int j, n_sb;
+       int ret;
+
+       /* Test for cpu hotplug */
+       if (buf->a.allocated)
+               return 0;
+
+       ret = ltt_chanbuf_alloc_create(&buf->a, &chan->a, cpu);
+       if (ret)
+               return ret;
+
+       buf->commit_count =
+               kzalloc_node(ALIGN(sizeof(*buf->commit_count) * chan->a.n_sb,
+                                  1 << INTERNODE_CACHE_SHIFT),
+                       GFP_KERNEL, cpu_to_node(cpu));
+       if (!buf->commit_count) {
+               ret = -ENOMEM;
+               goto free_chanbuf;
+       }
+
+#ifdef CONFIG_LTT_VMCORE
+       buf->commit_seq =
+               kzalloc_node(ALIGN(sizeof(*buf->commit_seq) * chan->a.n_sb,
+                                  1 << INTERNODE_CACHE_SHIFT),
+                       GFP_KERNEL, cpu_to_node(cpu));
+       if (!buf->commit_seq) {
+               kfree(buf->commit_count);
+               ret = -ENOMEM;
+               goto free_commit;
+       }
+#endif
+
+       local_set(&buf->offset, ltt_sb_header_size());
+       atomic_long_set(&buf->consumed, 0);
+       atomic_long_set(&buf->active_readers, 0);
+       n_sb = chan->a.n_sb;
+       for (j = 0; j < n_sb; j++) {
+               local_set(&buf->commit_count[j].cc, 0);
+               local_set(&buf->commit_count[j].cc_sb, 0);
+               local_set(&buf->commit_count[j].events, 0);
+       }
+       init_waitqueue_head(&buf->write_wait);
+       init_waitqueue_head(&buf->read_wait);
+       spin_lock_init(&buf->full_lock);
+
+       RCHAN_SB_CLEAR_NOREF(buf->a.buf_wsb[0].pages);
+       ltt_buffer_begin(buf, trace->start_tsc, 0);
+       /*
+        * local_add() made on data that may belong to another CPU: safe,
+        * because tracing is not started yet (for this CPU).
+        */
+       local_add(ltt_sb_header_size(), &buf->commit_count[0].cc);
+
+       local_set(&buf->events_lost, 0);
+       local_set(&buf->corrupted_subbuffers, 0);
+       buf->finalized = 0;
+
+       ret = ltt_chanbuf_create_file(chan->a.filename, chan->a.parent,
+                                     S_IRUSR, buf);
+       if (ret)
+               goto free_init;
+
+       /*
+        * Ensure the buffer is ready before setting it to allocated.
+        * Used for cpu hotplug vs async wakeup.
+        */
+       smp_wmb();
+       buf->a.allocated = 1;
+
+       return 0;
+
+       /* Error handling */
+free_init:
+#ifdef CONFIG_LTT_VMCORE
+       kfree(buf->commit_seq);
+free_commit:
+#endif
+       kfree(buf->commit_count);
+free_chanbuf:
+       ltt_chanbuf_alloc_free(&buf->a);
+       return ret;
+}
+
+void ltt_chan_remove_files(struct ltt_chan *chan)
+{
+       ltt_ascii_remove(chan);
+       ltt_chan_alloc_remove_files(&chan->a);
+}
+EXPORT_SYMBOL_GPL(ltt_chan_remove_files);
+
+void ltt_chan_free(struct kref *kref)
+{
+       struct ltt_chan *chan = container_of(kref, struct ltt_chan, a.kref);
+
+       ltt_chan_alloc_free(&chan->a);
+}
+EXPORT_SYMBOL_GPL(ltt_chan_free);
+
+/**
+ * ltt_chan_create - Create channel.
+ */
+int ltt_chan_create(const char *base_filename,
+                   struct ltt_chan *chan, struct dentry *parent,
+                   size_t sb_size, size_t n_sb,
+                   int overwrite, struct ltt_trace *trace)
+{
+       int ret;
+
+       chan->overwrite = overwrite;
+
+       ret = ltt_chan_alloc_init(&chan->a, trace, base_filename, parent,
+                                 sb_size, n_sb, overwrite, overwrite);
+       if (ret)
+               goto error;
+
+       chan->commit_count_mask = (~0UL >> chan->a.n_sb_order);
+
+       ret = ltt_ascii_create(chan);
+       if (ret)
+               goto error_chan_alloc_free;
+
+       return ret;
+
+error_chan_alloc_free:
+       ltt_chan_alloc_free(&chan->a);
+error:
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_chan_create);
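+
+/*
+ * Illustrative sketch (hypothetical sizes, not from the original sources):
+ * creating a "cpu" channel with four 8 KiB subbuffers in non-overwrite
+ * mode.
+ *
+ *	ret = ltt_chan_create("cpu", chan, parent_dentry, 8192, 4, 0, trace);
+ */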
+
+int ltt_chanbuf_open_read(struct ltt_chanbuf *buf)
+{
+       kref_get(&buf->a.chan->kref);
+       if (!atomic_long_add_unless(&buf->active_readers, 1, 1)) {
+               kref_put(&buf->a.chan->kref, ltt_chan_free);
+               return -EBUSY;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ltt_chanbuf_open_read);
+
+void ltt_chanbuf_release_read(struct ltt_chanbuf *buf)
+{
+       //ltt_relay_destroy_buffer(&buf->a.chan->a, buf->a.cpu);
+       WARN_ON(atomic_long_read(&buf->active_readers) != 1);
+       atomic_long_dec(&buf->active_readers);
+       kref_put(&buf->a.chan->kref, ltt_chan_free);
+}
+EXPORT_SYMBOL_GPL(ltt_chanbuf_release_read);
+
+/*
+ * Wake writers:
+ *
+ * This must be done after the trace is removed from the RCU list so that there
+ * are no stalled writers.
+ */
+static void ltt_relay_wake_writers(struct ltt_chanbuf *buf)
+{
+       if (waitqueue_active(&buf->write_wait))
+               wake_up_interruptible(&buf->write_wait);
+}
+
+/*
+ * This function should not be called from NMI interrupt context
+ */
+static void ltt_buf_unfull(struct ltt_chanbuf *buf)
+{
+       ltt_relay_wake_writers(buf);
+}
+
+/*
+ * Promote compiler barrier to a smp_mb().
+ * For the specific LTTng case, this IPI call should be removed if the
+ * architecture does not reorder writes.  This should eventually be provided by
+ * a separate architecture-specific infrastructure.
+ */
+static void remote_mb(void *info)
+{
+       smp_mb();
+}
+
+int ltt_chanbuf_get_subbuf(struct ltt_chanbuf *buf, unsigned long *consumed)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+       long consumed_old, consumed_idx, commit_count, write_offset;
+       int ret;
+
+       consumed_old = atomic_long_read(&buf->consumed);
+       consumed_idx = SUBBUF_INDEX(consumed_old, chan);
+       commit_count = local_read(&buf->commit_count[consumed_idx].cc_sb);
+       /*
+        * Make sure we read the commit count before reading the buffer
+        * data and the write offset. Correct consumed offset ordering
+        * wrt commit count is ensured by the use of cmpxchg to update
+        * the consumed offset.
+        * smp_call_function_single can fail if the remote CPU is offline;
+        * this is OK because then there is no wmb to execute there.
+        * If our thread is executing on the same CPU as the one the buffer
+        * belongs to, we don't have to synchronize it at all. If we are
+        * migrated, the scheduler will take care of the memory barriers.
+        * Normally, smp_call_function_single() should ensure program order when
+        * executing the remote function, which implies that it surrounds the
+        * function execution with:
+        * smp_mb()
+        * send IPI
+        * csd_lock_wait
+        *                recv IPI
+        *                smp_mb()
+        *                exec. function
+        *                smp_mb()
+        *                csd unlock
+        * smp_mb()
+        *
+        * However, smp_call_function_single() does not seem to clearly execute
+        * such barriers. It depends on spinlock semantic to provide the barrier
+        * before executing the IPI and, when busy-looping, csd_lock_wait only
+        * executes smp_mb() when it has to wait for the other CPU.
+        *
+        * I don't trust this code. Therefore, let's add the smp_mb() sequence
+        * required ourselves, even if duplicated. It has no performance impact
+        * anyway.
+        *
+        * smp_mb() is needed because smp_rmb() and smp_wmb() only order read vs
+        * read and write vs write. They do not ensure core synchronization. We
+        * really have to ensure total order between the 3 barriers running on
+        * the 2 CPUs.
+        */
+#ifdef LTT_NO_IPI_BARRIER
+       /*
+        * Local rmb to match the remote wmb to read the commit count before the
+        * buffer data and the write offset.
+        */
+       smp_rmb();
+#else
+       if (raw_smp_processor_id() != buf->a.cpu) {
+               smp_mb();       /* Total order with IPI handler smp_mb() */
+               smp_call_function_single(buf->a.cpu, remote_mb, NULL, 1);
+               smp_mb();       /* Total order with IPI handler smp_mb() */
+       }
+#endif
+       write_offset = local_read(&buf->offset);
+       /*
+        * Check that the subbuffer we are trying to consume has been
+        * already fully committed.
+        */
+       if (((commit_count - chan->a.sb_size)
+            & chan->commit_count_mask)
+           - (BUFFER_TRUNC(consumed_old, chan)
+              >> chan->a.n_sb_order)
+           != 0) {
+               return -EAGAIN;
+       }
+       /*
+        * Check that we are not about to read the same subbuffer in
+        * which the writer head is.
+        */
+       if ((SUBBUF_TRUNC(write_offset, chan)
+          - SUBBUF_TRUNC(consumed_old, chan))
+          == 0) {
+               return -EAGAIN;
+       }
+
+       ret = update_read_sb_index(&buf->a, &chan->a, consumed_idx);
+       if (ret)
+               return ret;
+
+       *consumed = consumed_old;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ltt_chanbuf_get_subbuf);
+
+int ltt_chanbuf_put_subbuf(struct ltt_chanbuf *buf, unsigned long consumed)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+       long consumed_new, consumed_old;
+
+       WARN_ON(atomic_long_read(&buf->active_readers) != 1);
+
+       consumed_old = consumed;
+       consumed_new = SUBBUF_ALIGN(consumed_old, chan);
+       WARN_ON_ONCE(RCHAN_SB_IS_NOREF(buf->a.buf_rsb.pages));
+       RCHAN_SB_SET_NOREF(buf->a.buf_rsb.pages);
+
+       spin_lock(&buf->full_lock);
+       if (atomic_long_cmpxchg(&buf->consumed, consumed_old, consumed_new)
+           != consumed_old) {
+               /* We have been pushed by the writer. */
+               spin_unlock(&buf->full_lock);
+               /*
+                * We exchanged the subbuffer pages. No corruption possible
+                * even if the writer did push us. No more -EIO possible.
+                */
+               return 0;
+       } else {
+               /* tell the client that the buffer is now unfull */
+               ltt_buf_unfull(buf);
+               spin_unlock(&buf->full_lock);
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ltt_chanbuf_put_subbuf);
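+
+/*
+ * Illustrative consumer loop (assumed usage, not from the original
+ * sources), mirroring the userspace reader semantics described at the top
+ * of this file:
+ *
+ *	for (;;) {
+ *		ret = ltt_chanbuf_get_subbuf(buf, &consumed);
+ *		if (ret == -EAGAIN)
+ *			break;	// nothing fully committed yet
+ *		// ... read or splice one subbuffer worth of data ...
+ *		ltt_chanbuf_put_subbuf(buf, consumed);
+ *	}
+ */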
+
+static void switch_buffer(unsigned long data)
+{
+       struct ltt_chanbuf *buf = (struct ltt_chanbuf *)data;
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+
+       /*
+        * Only flush buffers periodically if readers are active.
+        */
+       if (atomic_long_read(&buf->active_readers))
+               ltt_force_switch(buf, FORCE_ACTIVE);
+
+       mod_timer_pinned(&buf->switch_timer,
+                        jiffies + chan->switch_timer_interval);
+}
+
+static void ltt_chanbuf_start_switch_timer(struct ltt_chanbuf *buf)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+
+       if (!chan->switch_timer_interval)
+               return;
+
+       init_timer_deferrable(&buf->switch_timer);
+       buf->switch_timer.function = switch_buffer;
+       buf->switch_timer.expires = jiffies + chan->switch_timer_interval;
+       buf->switch_timer.data = (unsigned long)buf;
+       add_timer_on(&buf->switch_timer, buf->a.cpu);
+}
+
+/*
+ * called with ltt traces lock held.
+ */
+void ltt_chan_start_switch_timer(struct ltt_chan *chan)
+{
+       int cpu;
+
+       if (!chan->switch_timer_interval)
+               return;
+
+       for_each_online_cpu(cpu) {
+               struct ltt_chanbuf *buf;
+
+               buf = per_cpu_ptr(chan->a.buf, cpu);
+               ltt_chanbuf_start_switch_timer(buf);
+       }
+}
+
+static void ltt_chanbuf_stop_switch_timer(struct ltt_chanbuf *buf)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+
+       if (!chan->switch_timer_interval)
+               return;
+
+       del_timer_sync(&buf->switch_timer);
+}
+
+/*
+ * called with ltt traces lock held.
+ */
+void ltt_chan_stop_switch_timer(struct ltt_chan *chan)
+{
+       int cpu;
+
+       if (!chan->switch_timer_interval)
+               return;
+
+       for_each_online_cpu(cpu) {
+               struct ltt_chanbuf *buf;
+
+               buf = per_cpu_ptr(chan->a.buf, cpu);
+               ltt_chanbuf_stop_switch_timer(buf);
+       }
+}
+
+static void ltt_chanbuf_idle_switch(struct ltt_chanbuf *buf)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+
+       if (chan->switch_timer_interval)
+               ltt_force_switch(buf, FORCE_ACTIVE);
+}
+
+/*
+ * ltt_chanbuf_switch is called from a remote CPU to ensure that the buffers of
+ * a cpu which went down are flushed. Note that if we execute concurrently
+ * with trace allocation, a buffer might appear to be unallocated (because it
+ * detects that the target CPU is offline).
+ */
+static void ltt_chanbuf_switch(struct ltt_chanbuf *buf)
+{
+       if (buf->a.allocated)
+               ltt_force_switch(buf, FORCE_ACTIVE);
+}
+
+/**
+ *     ltt_chanbuf_hotcpu_callback - CPU hotplug callback
+ *     @nb: notifier block
+ *     @action: hotplug action to take
+ *     @hcpu: CPU number
+ *
+ *     Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
+ */
+static
+int ltt_chanbuf_hotcpu_callback(struct notifier_block *nb,
+                                         unsigned long action,
+                                         void *hcpu)
+{
+       unsigned int cpu = (unsigned long)hcpu;
+
+       switch (action) {
+       case CPU_DOWN_FAILED:
+       case CPU_DOWN_FAILED_FROZEN:
+       case CPU_ONLINE:
+       case CPU_ONLINE_FROZEN:
+               /*
+                * CPU hotplug lock protects trace lock from this callback.
+                */
+               ltt_chan_for_each_channel(ltt_chanbuf_start_switch_timer, cpu);
+               return NOTIFY_OK;
+
+       case CPU_DOWN_PREPARE:
+       case CPU_DOWN_PREPARE_FROZEN:
+               /*
+                * Performs an IPI to delete the timer locally on the target
+                * CPU. CPU hotplug lock protects trace lock from this
+                * callback.
+                */
+               ltt_chan_for_each_channel(ltt_chanbuf_stop_switch_timer, cpu);
+               return NOTIFY_OK;
+
+       case CPU_DEAD:
+       case CPU_DEAD_FROZEN:
+               /*
+                * Performing a buffer switch on a remote CPU. Performed by
+                * the CPU responsible for doing the hotunplug after the target
+                * CPU stopped running completely. Ensures that all data
+                * from that remote CPU is flushed. CPU hotplug lock protects
+                * trace lock from this callback.
+                */
+               ltt_chan_for_each_channel(ltt_chanbuf_switch, cpu);
+               return NOTIFY_OK;
+
+       default:
+               return NOTIFY_DONE;
+       }
+}
+
+static int pm_idle_entry_callback(struct notifier_block *self,
+                                 unsigned long val, void *data)
+{
+       if (val == IDLE_START) {
+               rcu_read_lock_sched_notrace();
+               ltt_chan_for_each_channel(ltt_chanbuf_idle_switch,
+                                         smp_processor_id());
+               rcu_read_unlock_sched_notrace();
+       }
+       return 0;
+}
+
+struct notifier_block pm_idle_entry_notifier = {
+       .notifier_call = pm_idle_entry_callback,
+       .priority = ~0U,        /* smallest prio, run after tracing events */
+};
+
+static
+void ltt_relay_print_written(struct ltt_chan *chan, long cons_off,
+                            unsigned int cpu)
+{
+       struct ltt_chanbuf *buf = per_cpu_ptr(chan->a.buf, cpu);
+       long cons_idx, events_count;
+
+       cons_idx = SUBBUF_INDEX(cons_off, chan);
+       events_count = local_read(&buf->commit_count[cons_idx].events);
+
+       if (events_count)
+               printk(KERN_INFO
+                       "LTT: %lu events written in channel %s "
+                       "(cpu %u, index %lu)\n",
+                       events_count, chan->a.filename, cpu, cons_idx);
+}
+
+static
+void ltt_relay_print_subbuffer_errors(struct ltt_chanbuf *buf,
+                                     struct ltt_chan *chan, long cons_off,
+                                     unsigned int cpu)
+{
+       long cons_idx, commit_count, commit_count_sb, write_offset;
+
+       cons_idx = SUBBUF_INDEX(cons_off, chan);
+       commit_count = local_read(&buf->commit_count[cons_idx].cc);
+       commit_count_sb = local_read(&buf->commit_count[cons_idx].cc_sb);
+       /*
+        * No need to order commit_count and write_offset reads because we
+        * execute after trace is stopped when there are no readers left.
+        */
+       write_offset = local_read(&buf->offset);
+       printk(KERN_WARNING
+              "LTT : unread channel %s offset is %ld "
+              "and cons_off : %ld (cpu %u)\n",
+              chan->a.filename, write_offset, cons_off, cpu);
+       /* Check each sub-buffer for a non-filled commit count */
+       if (((commit_count - chan->a.sb_size) & chan->commit_count_mask)
+           - (BUFFER_TRUNC(cons_off, chan) >> chan->a.n_sb_order)
+           != 0)
+               printk(KERN_ALERT
+                      "LTT : %s : subbuffer %lu has non filled "
+                      "commit count [cc, cc_sb] [%lu,%lu].\n",
+                      chan->a.filename, cons_idx, commit_count,
+                      commit_count_sb);
+       printk(KERN_ALERT "LTT : %s : commit count : %lu, subbuf size %lu\n",
+              chan->a.filename, commit_count, chan->a.sb_size);
+}
+
+static
+void ltt_relay_print_errors(struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                           struct ltt_trace *trace, int cpu)
+{
+       long cons_off;
+
+       /*
+        * Can be called in the error path of allocation when
+        * trans_channel_data is not yet set.
+        */
+       if (!chan)
+               return;
+       for (cons_off = 0; cons_off < chan->a.buf_size;
+            cons_off = SUBBUF_ALIGN(cons_off, chan))
+               ltt_relay_print_written(chan, cons_off, cpu);
+       for (cons_off = atomic_long_read(&buf->consumed);
+                       (SUBBUF_TRUNC(local_read(&buf->offset), chan)
+                        - cons_off) > 0;
+                       cons_off = SUBBUF_ALIGN(cons_off, chan))
+               ltt_relay_print_subbuffer_errors(buf, chan, cons_off, cpu);
+}
+
+static
+void ltt_relay_print_buffer_errors(struct ltt_chan *chan, unsigned int cpu)
+{
+       struct ltt_trace *trace = chan->a.trace;
+       struct ltt_chanbuf *buf = per_cpu_ptr(chan->a.buf, cpu);
+
+       if (local_read(&buf->events_lost))
+               printk(KERN_ALERT
+                      "LTT : %s : %ld events lost "
+                      "in %s channel (cpu %u).\n",
+                      chan->a.filename, local_read(&buf->events_lost),
+                      chan->a.filename, cpu);
+       if (local_read(&buf->corrupted_subbuffers))
+               printk(KERN_ALERT
+                      "LTT : %s : %ld corrupted subbuffers "
+                      "in %s channel (cpu %u).\n",
+                      chan->a.filename,
+                      local_read(&buf->corrupted_subbuffers),
+                      chan->a.filename, cpu);
+
+       ltt_relay_print_errors(buf, chan, trace, cpu);
+}
+
+static void ltt_relay_remove_dirs(struct ltt_trace *trace)
+{
+       ltt_ascii_remove_dir(trace);
+       debugfs_remove(trace->dentry.trace_root);
+}
+
+static int ltt_relay_create_dirs(struct ltt_trace *new_trace)
+{
+       struct dentry *ltt_root_dentry;
+       int ret;
+
+       ltt_root_dentry = get_ltt_root();
+       if (!ltt_root_dentry)
+               return -ENOENT;
+
+       new_trace->dentry.trace_root = debugfs_create_dir(new_trace->trace_name,
+                                                         ltt_root_dentry);
+       put_ltt_root();
+       if (new_trace->dentry.trace_root == NULL) {
+               printk(KERN_ERR "LTT : Trace directory name %s already taken\n",
+                      new_trace->trace_name);
+               return -EEXIST;
+       }
+       ret = ltt_ascii_create_dir(new_trace);
+       if (ret)
+               printk(KERN_WARNING "LTT : Unable to create ascii output file "
+                                   "for trace %s\n", new_trace->trace_name);
+
+       return 0;
+}
+
+/*
+ * LTTng channel flush function.
+ *
+ * Must be called when no tracing is active in the channel, because of
+ * accesses across CPUs.
+ */
+static notrace void ltt_relay_buffer_flush(struct ltt_chanbuf *buf)
+{
+       buf->finalized = 1;
+       ltt_force_switch(buf, FORCE_FLUSH);
+}
+
+static void ltt_relay_async_wakeup_chan(struct ltt_chan *chan)
+{
+       unsigned int i;
+
+       for_each_possible_cpu(i) {
+               struct ltt_chanbuf *buf;
+
+               buf = per_cpu_ptr(chan->a.buf, i);
+               if (!buf->a.allocated)
+                       continue;
+               /*
+                * Ensure the buffer has been allocated before reading its
+                * content. Sync cpu hotplug vs async wakeup.
+                */
+               smp_rmb();
+               if (ltt_poll_deliver(buf, chan))
+                       wake_up_interruptible(&buf->read_wait);
+       }
+}
+
+static void ltt_relay_finish_buffer(struct ltt_chan *chan, unsigned int cpu)
+{
+       struct ltt_chanbuf *buf = per_cpu_ptr(chan->a.buf, cpu);
+
+       if (buf->a.allocated) {
+               ltt_relay_buffer_flush(buf);
+               ltt_relay_wake_writers(buf);
+       }
+}
+
+
+static void ltt_relay_finish_channel(struct ltt_chan *chan)
+{
+       unsigned int i;
+
+       for_each_possible_cpu(i)
+               ltt_relay_finish_buffer(chan, i);
+}
+
+/*
+ * This is called with preemption disabled when user space has requested
+ * blocking mode.  If one of the active traces has free space below a
+ * specific threshold value, we reenable preemption and block.
+ */
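+/*
+ * Caller-side sketch (illustrative; the actual caller is the userspace
+ * event write path): with preemption disabled, retry the reservation
+ * while this returns 1 (we blocked, were woken up, and may now run on
+ * another CPU), abort on -ERESTARTSYS, and proceed once it returns 0.
+ */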
+static
+int ltt_relay_user_blocking(struct ltt_trace *trace, unsigned int chan_index,
+                           size_t data_size, struct user_dbg_data *dbg)
+{
+       struct ltt_chanbuf *buf;
+       struct ltt_chan *chan;
+       int cpu;
+       DECLARE_WAITQUEUE(wait, current);
+
+       chan = &trace->channels[chan_index];
+       cpu = smp_processor_id();
+       buf = per_cpu_ptr(chan->a.buf, cpu);
+
+       /*
+        * Check if data is too big for the channel : do not
+        * block for it.
+        */
+       if (LTT_RESERVE_CRITICAL + data_size > chan->a.sb_size)
+               return 0;
+
+       /*
+        * If free space is too low, we block. We restart from the
+        * beginning after we resume (the cpu id may have changed
+        * while we were preemptible).
+        */
+       spin_lock(&buf->full_lock);
+       if (!chan->overwrite) {
+               dbg->write = local_read(&buf->offset);
+               dbg->read = atomic_long_read(&buf->consumed);
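+               /*
+                * avail_size is really the projected occupancy if this
+                * event were reserved now: write position plus worst-case
+                * reservation, minus the consumed position truncated to a
+                * sub-buffer boundary. Exceeding the buffer size means not
+                * enough free space is left, so we block.
+                */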
+               dbg->avail_size = dbg->write + LTT_RESERVE_CRITICAL + data_size
+                                 - SUBBUF_TRUNC(dbg->read, chan);
+               if (dbg->avail_size > chan->a.buf_size) {
+                       __set_current_state(TASK_INTERRUPTIBLE);
+                       add_wait_queue(&buf->write_wait, &wait);
+                       spin_unlock(&buf->full_lock);
+                       preempt_enable();
+                       schedule();
+                       __set_current_state(TASK_RUNNING);
+                       remove_wait_queue(&buf->write_wait, &wait);
+                       if (signal_pending(current))
+                               return -ERESTARTSYS;
+                       preempt_disable();
+                       return 1;
+               }
+       }
+       spin_unlock(&buf->full_lock);
+       return 0;
+}
+
+static
+void ltt_relay_print_user_errors(struct ltt_trace *trace,
+                                unsigned int chan_index, size_t data_size,
+                                struct user_dbg_data *dbg, int cpu)
+{
+       struct ltt_chanbuf *buf;
+       struct ltt_chan *chan;
+
+       chan = &trace->channels[chan_index];
+       buf = per_cpu_ptr(chan->a.buf, cpu);
+
+       printk(KERN_ERR "Error in LTT usertrace : "
+              "buffer full : event lost in blocking "
+              "mode. Increase LTT_RESERVE_CRITICAL.\n");
+       printk(KERN_ERR "LTT nesting level is %u.\n",
+              per_cpu(ltt_nesting, cpu));
+       printk(KERN_ERR "LTT available size %lu.\n",
+              dbg->avail_size);
+       printk(KERN_ERR "available write : %lu, read : %lu\n",
+              dbg->write, dbg->read);
+
+       dbg->write = local_read(&buf->offset);
+       dbg->read = atomic_long_read(&buf->consumed);
+
+       printk(KERN_ERR "LTT current size %lu.\n",
+               dbg->write + LTT_RESERVE_CRITICAL + data_size
+               - SUBBUF_TRUNC(dbg->read, chan));
+       printk(KERN_ERR "current write : %lu, read : %lu\n",
+                       dbg->write, dbg->read);
+}
+
+/*
+ * ltt_reserve_switch_old_subbuf: switch old subbuffer
+ *
+ * Concurrency safe because we are the last and only thread to alter this
+ * sub-buffer. As long as it is not delivered and read, no other thread can
+ * alter the offset, alter the reserve_count or call the
+ * client_buffer_end_callback on this sub-buffer.
+ *
+ * The only remaining threads could be the ones with pending commits. They will
+ * have to do the deliver themselves.  Not concurrency safe in overwrite mode.
+ * We detect corrupted subbuffers with commit and reserve counts. We keep a
+ * corrupted sub-buffers count and push the readers across these sub-buffers.
+ *
+ * Not concurrency safe if a writer is stalled in a subbuffer and another writer
+ * switches in, finding out it's corrupted.  The result will be that the old
+ * (uncommitted) subbuffer will be declared corrupted, and that the new subbuffer
+ * will be declared corrupted too because of the commit count adjustment.
+ *
+ * Note : offset_old should never be 0 here.
+ */
+static
+void ltt_reserve_switch_old_subbuf(struct ltt_chanbuf *buf,
+                                  struct ltt_chan *chan,
+                                  struct ltt_reserve_switch_offsets *offsets,
+                                  u64 *tsc)
+{
+       long oldidx = SUBBUF_INDEX(offsets->old - 1, chan);
+       long commit_count, padding_size;
+
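+       /* Bytes left unused between the last event and the sub-buffer end. */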
+       padding_size = chan->a.sb_size
+                       - (SUBBUF_OFFSET(offsets->old - 1, chan) + 1);
+       ltt_buffer_end(buf, *tsc, offsets->old, oldidx);
+
+       /*
+        * Must write slot data before incrementing commit count.
+        * This compiler barrier is upgraded into a smp_wmb() by the IPI
+        * sent by get_subbuf() when it does its smp_rmb().
+        */
+       barrier();
+       local_add(padding_size, &buf->commit_count[oldidx].cc);
+       commit_count = local_read(&buf->commit_count[oldidx].cc);
+       ltt_check_deliver(buf, chan, offsets->old - 1, commit_count, oldidx);
+       ltt_write_commit_counter(buf, chan, oldidx, offsets->old, commit_count,
+                                padding_size);
+}
+
+/*
+ * ltt_reserve_switch_new_subbuf: Populate new subbuffer.
+ *
+ * This code can be executed unordered : writers may already have written to the
+ * sub-buffer before this code gets executed, caution.  The commit makes sure
+ * that this code is executed before the deliver of this sub-buffer.
+ */
+static
+void ltt_reserve_switch_new_subbuf(struct ltt_chanbuf *buf,
+                                  struct ltt_chan *chan,
+                                  struct ltt_reserve_switch_offsets *offsets,
+                                  u64 *tsc)
+{
+       long beginidx = SUBBUF_INDEX(offsets->begin, chan);
+       long commit_count;
+
+       ltt_buffer_begin(buf, *tsc, beginidx);
+
+       /*
+        * Must write slot data before incrementing commit count.
+        * This compiler barrier is upgraded into a smp_wmb() by the IPI
+        * sent by get_subbuf() when it does its smp_rmb().
+        */
+       barrier();
+       local_add(ltt_sb_header_size(), &buf->commit_count[beginidx].cc);
+       commit_count = local_read(&buf->commit_count[beginidx].cc);
+       /* Check if the written buffer has to be delivered */
+       ltt_check_deliver(buf, chan, offsets->begin, commit_count, beginidx);
+       ltt_write_commit_counter(buf, chan, beginidx, offsets->begin,
+                                commit_count, ltt_sb_header_size());
+}
+
+
+/*
+ * ltt_reserve_end_switch_current: finish switching current subbuffer
+ *
+ * Concurrency safe because we are the last and only thread to alter this
+ * sub-buffer. As long as it is not delivered and read, no other thread can
+ * alter the offset, alter the reserve_count or call the
+ * client_buffer_end_callback on this sub-buffer.
+ *
+ * The only remaining threads could be the ones with pending commits. They will
+ * have to do the deliver themselves.  Not concurrency safe in overwrite mode.
+ * We detect corrupted subbuffers with commit and reserve counts. We keep a
+ * corrupted sub-buffers count and push the readers across these sub-buffers.
+ *
+ * Not concurrency safe if a writer is stalled in a subbuffer and another writer
+ * switches in, finding out it's corrupted.  The result will be that the old
+ * (uncommitted) subbuffer will be declared corrupted, and that the new subbuffer
+ * will be declared corrupted too because of the commit count adjustment.
+ */
+static
+void ltt_reserve_end_switch_current(struct ltt_chanbuf *buf,
+                                   struct ltt_chan *chan,
+                                   struct ltt_reserve_switch_offsets *offsets,
+                                   u64 *tsc)
+{
+       long endidx = SUBBUF_INDEX(offsets->end - 1, chan);
+       long commit_count, padding_size;
+
+       padding_size = chan->a.sb_size
+                       - (SUBBUF_OFFSET(offsets->end - 1, chan) + 1);
+
+       ltt_buffer_end(buf, *tsc, offsets->end, endidx);
+
+       /*
+        * Must write slot data before incrementing commit count.
+        * This compiler barrier is upgraded into a smp_wmb() by the IPI
+        * sent by get_subbuf() when it does its smp_rmb().
+        */
+       barrier();
+       local_add(padding_size, &buf->commit_count[endidx].cc);
+       commit_count = local_read(&buf->commit_count[endidx].cc);
+       ltt_check_deliver(buf, chan, offsets->end - 1, commit_count, endidx);
+       ltt_write_commit_counter(buf, chan, endidx, offsets->end, commit_count,
+                                padding_size);
+}
+
+/*
+ * Returns :
+ * 0 if ok
+ * !0 if execution must be aborted.
+ */
+static
+int ltt_relay_try_switch_slow(enum force_switch_mode mode,
+                             struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                             struct ltt_reserve_switch_offsets *offsets,
+                             u64 *tsc)
+{
+       long sb_index;
+       long reserve_commit_diff;
+       long off;
+
+       offsets->begin = local_read(&buf->offset);
+       offsets->old = offsets->begin;
+       offsets->begin_switch = 0;
+       offsets->end_switch_old = 0;
+
+       *tsc = trace_clock_read64();
+
+       off = SUBBUF_OFFSET(offsets->begin, chan);
+       if ((mode != FORCE_ACTIVE && off > 0) || off > ltt_sb_header_size()) {
+               offsets->begin = SUBBUF_ALIGN(offsets->begin, chan);
+               offsets->end_switch_old = 1;
+       } else {
+               /* we do not have to switch : buffer is empty */
+               return -1;
+       }
+       if (mode == FORCE_ACTIVE)
+               offsets->begin += ltt_sb_header_size();
+       /*
+        * Always begin_switch in FORCE_ACTIVE mode.
+        * Test new buffer integrity
+        */
+       sb_index = SUBBUF_INDEX(offsets->begin, chan);
+       reserve_commit_diff =
+               (BUFFER_TRUNC(offsets->begin, chan)
+                >> chan->a.n_sb_order)
+               - (local_read(&buf->commit_count[sb_index].cc_sb)
+                       & chan->commit_count_mask);
+       if (reserve_commit_diff == 0) {
+               /* Next buffer not corrupted. */
+               if (mode == FORCE_ACTIVE
+                   && !chan->overwrite
+                   && offsets->begin - atomic_long_read(&buf->consumed)
+                      >= chan->a.buf_size) {
+                       /*
+                        * We do not overwrite non consumed buffers and we are
+                        * full : ignore switch while tracing is active.
+                        */
+                       return -1;
+               }
+       } else {
+               /*
+                * Next subbuffer corrupted. Force pushing reader even in normal
+                * mode
+                */
+       }
+       offsets->end = offsets->begin;
+       return 0;
+}
+
+/*
+ * Force a sub-buffer switch for a per-cpu buffer. This operation is
+ * completely reentrant : can be called while tracing is active with
+ * absolutely no lock held.
+ *
+ * Note, however, that as a local_cmpxchg is used for some atomic
+ * operations, this function must be called from the CPU which owns the buffer
+ * for a ACTIVE flush.
+ */
+void ltt_force_switch_lockless_slow(struct ltt_chanbuf *buf,
+                                   enum force_switch_mode mode)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+       struct ltt_reserve_switch_offsets offsets;
+       u64 tsc;
+
+       offsets.size = 0;
+
+       /*
+        * Perform retryable operations.
+        */
+       do {
+               if (ltt_relay_try_switch_slow(mode, buf, chan, &offsets, &tsc))
+                       return;
+       } while (local_cmpxchg(&buf->offset, offsets.old, offsets.end)
+                != offsets.old);
+
+       /*
+        * Atomically update last_tsc. This update races against concurrent
+        * atomic updates, but the race will always cause supplementary full TSC
+        * events, never the opposite (missing a full TSC event when it would be
+        * needed).
+        */
+       save_last_tsc(buf, tsc);
+
+       /*
+        * Push the reader if necessary
+        */
+       if (mode == FORCE_ACTIVE) {
+               ltt_reserve_push_reader(buf, chan, offsets.end - 1);
+               ltt_clear_noref_flag(&buf->a, SUBBUF_INDEX(offsets.end - 1,
+                                                          chan));
+       }
+
+       /*
+        * Switch old subbuffer if needed.
+        */
+       if (offsets.end_switch_old) {
+               ltt_clear_noref_flag(&buf->a, SUBBUF_INDEX(offsets.old - 1,
+                                                          chan));
+               ltt_reserve_switch_old_subbuf(buf, chan, &offsets, &tsc);
+       }
+
+       /*
+        * Populate new subbuffer.
+        */
+       if (mode == FORCE_ACTIVE)
+               ltt_reserve_switch_new_subbuf(buf, chan, &offsets, &tsc);
+}
+EXPORT_SYMBOL_GPL(ltt_force_switch_lockless_slow);
+
+/*
+ * Returns :
+ * 0 if ok
+ * !0 if execution must be aborted.
+ */
+static
+int ltt_relay_try_reserve_slow(struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                              struct ltt_reserve_switch_offsets *offsets,
+                              size_t data_size, u64 *tsc, unsigned int *rflags,
+                              int largest_align)
+{
+       long reserve_commit_diff;
+
+       offsets->begin = local_read(&buf->offset);
+       offsets->old = offsets->begin;
+       offsets->begin_switch = 0;
+       offsets->end_switch_current = 0;
+       offsets->end_switch_old = 0;
+
+       *tsc = trace_clock_read64();
+       if (last_tsc_overflow(buf, *tsc))
+               *rflags = LTT_RFLAG_ID_SIZE_TSC;
+
+       if (unlikely(SUBBUF_OFFSET(offsets->begin, chan) == 0)) {
+               offsets->begin_switch = 1;              /* For offsets->begin */
+       } else {
+               offsets->size = ltt_get_header_size(chan, offsets->begin,
+                                                   data_size,
+                                                   &offsets->before_hdr_pad,
+                                                   *rflags);
+               offsets->size += ltt_align(offsets->begin + offsets->size,
+                                          largest_align)
+                                + data_size;
+               if (unlikely((SUBBUF_OFFSET(offsets->begin, chan) +
+                            offsets->size) > chan->a.sb_size)) {
+                       offsets->end_switch_old = 1;    /* For offsets->old */
+                       offsets->begin_switch = 1;      /* For offsets->begin */
+               }
+       }
+       if (unlikely(offsets->begin_switch)) {
+               long sb_index;
+
+               /*
+                * We are typically not filling the previous buffer completely.
+                */
+               if (likely(offsets->end_switch_old))
+                       offsets->begin = SUBBUF_ALIGN(offsets->begin, chan);
+               offsets->begin = offsets->begin + ltt_sb_header_size();
+               /* Test new buffer integrity */
+               sb_index = SUBBUF_INDEX(offsets->begin, chan);
+               reserve_commit_diff =
+                 (BUFFER_TRUNC(offsets->begin, chan)
+                  >> chan->a.n_sb_order)
+                 - (local_read(&buf->commit_count[sb_index].cc_sb)
+                               & chan->commit_count_mask);
+               if (likely(reserve_commit_diff == 0)) {
+                       /* Next buffer not corrupted. */
+                       if (unlikely(!chan->overwrite &&
+                               (SUBBUF_TRUNC(offsets->begin, chan)
+                                - SUBBUF_TRUNC(atomic_long_read(&buf->consumed),
+                                               chan))
+                               >= chan->a.buf_size)) {
+                               /*
+                                * We do not overwrite non consumed buffers
+                                * and we are full : event is lost.
+                                */
+                               local_inc(&buf->events_lost);
+                               return -1;
+                       } else {
+                               /*
+                                * next buffer not corrupted, we are either in
+                                * overwrite mode or the buffer is not full.
+                                * It's safe to write in this new subbuffer.
+                                */
+                       }
+               } else {
+                       /*
+                        * Next subbuffer corrupted. Drop event in normal and
+                        * overwrite mode. Caused by either a writer OOPS or
+                        * too many nested writes over a reserve/commit pair.
+                        */
+                       local_inc(&buf->events_lost);
+                       return -1;
+               }
+               offsets->size = ltt_get_header_size(chan, offsets->begin,
+                                                   data_size,
+                                                   &offsets->before_hdr_pad,
+                                                   *rflags);
+               offsets->size += ltt_align(offsets->begin + offsets->size,
+                                          largest_align)
+                                + data_size;
+               if (unlikely((SUBBUF_OFFSET(offsets->begin, chan)
+                            + offsets->size) > chan->a.sb_size)) {
+                       /*
+                        * Event too big for subbuffers, report error, don't
+                        * complete the sub-buffer switch.
+                        */
+                       local_inc(&buf->events_lost);
+                       return -1;
+               } else {
+                       /*
+                        * We just made a successful buffer switch and the event
+                        * fits in the new subbuffer. Let's write.
+                        */
+               }
+       } else {
+               /*
+                * Event fits in the current buffer and we are not on a switch
+                * boundary. It's safe to write.
+                */
+       }
+       offsets->end = offsets->begin + offsets->size;
+
+       if (unlikely((SUBBUF_OFFSET(offsets->end, chan)) == 0)) {
+               /*
+                * The offset_end will fall at the very beginning of the next
+                * subbuffer.
+                */
+               offsets->end_switch_current = 1;        /* For offsets->begin */
+       }
+       return 0;
+}
+
+/**
+ * ltt_reserve_slot_lockless_slow - Atomic slot reservation in a buffer.
+ * @chan: channel structure
+ * @trace: the trace structure to log to.
+ * @data_size: size of the variable length data to log.
+ * @largest_align: largest alignment within the data to log.
+ * @cpu: cpuid
+ * @ret_buf: pointer to the channel buffer used (out)
+ * @slot_size: pointer to total size of the slot (out)
+ * @buf_offset: pointer to reserved buffer offset (out)
+ * @tsc: pointer to the tsc at the slot reservation (out)
+ * @rflags: pointer to the reservation flags (in/out)
+ *
+ * Return : -ENOSPC if not enough space, else returns 0.
+ * It will take care of sub-buffer switching.
+ */
+int ltt_reserve_slot_lockless_slow(struct ltt_chan *chan,
+                                  struct ltt_trace *trace, size_t data_size,
+                                  int largest_align, int cpu,
+                                  struct ltt_chanbuf **ret_buf,
+                                  size_t *slot_size, long *buf_offset,
+                                  u64 *tsc, unsigned int *rflags)
+{
+       struct ltt_chanbuf *buf = *ret_buf = per_cpu_ptr(chan->a.buf, cpu);
+       struct ltt_reserve_switch_offsets offsets;
+
+       offsets.size = 0;
+
+       do {
+               if (unlikely(ltt_relay_try_reserve_slow(buf, chan, &offsets,
+                                                       data_size, tsc, rflags,
+                                                       largest_align)))
+                       return -ENOSPC;
+       } while (unlikely(local_cmpxchg(&buf->offset, offsets.old, offsets.end)
+                         != offsets.old));
+
+       /*
+        * Atomically update last_tsc. This update races against concurrent
+        * atomic updates, but the race will always cause supplementary full TSC
+        * events, never the opposite (missing a full TSC event when it would be
+        * needed).
+        */
+       save_last_tsc(buf, *tsc);
+
+       /*
+        * Push the reader if necessary
+        */
+       ltt_reserve_push_reader(buf, chan, offsets.end - 1);
+
+       /*
+        * Clear noref flag for this subbuffer.
+        */
+       ltt_clear_noref_flag(&buf->a, SUBBUF_INDEX(offsets.end - 1, chan));
+
+       /*
+        * Switch old subbuffer if needed.
+        */
+       if (unlikely(offsets.end_switch_old)) {
+               ltt_clear_noref_flag(&buf->a, SUBBUF_INDEX(offsets.old - 1,
+                                                         chan));
+               ltt_reserve_switch_old_subbuf(buf, chan, &offsets, tsc);
+       }
+
+       /*
+        * Populate new subbuffer.
+        */
+       if (unlikely(offsets.begin_switch))
+               ltt_reserve_switch_new_subbuf(buf, chan, &offsets, tsc);
+
+       if (unlikely(offsets.end_switch_current))
+               ltt_reserve_end_switch_current(buf, chan, &offsets, tsc);
+
+       *slot_size = offsets.size;
+       *buf_offset = offsets.begin + offsets.before_hdr_pad;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ltt_reserve_slot_lockless_slow);
+
+static struct ltt_transport ltt_relay_transport = {
+       .name = "relay",
+       .owner = THIS_MODULE,
+       .ops = {
+               .create_dirs = ltt_relay_create_dirs,
+               .remove_dirs = ltt_relay_remove_dirs,
+               .create_channel = ltt_chan_create,
+               .finish_channel = ltt_relay_finish_channel,
+               .remove_channel = ltt_chan_free,
+               .remove_channel_files = ltt_chan_remove_files,
+               .wakeup_channel = ltt_relay_async_wakeup_chan,
+               .user_blocking = ltt_relay_user_blocking,
+               .user_errors = ltt_relay_print_user_errors,
+               .start_switch_timer = ltt_chan_start_switch_timer,
+               .stop_switch_timer = ltt_chan_stop_switch_timer,
+       },
+};
+
+static struct notifier_block fn_ltt_chanbuf_hotcpu_callback = {
+       .notifier_call = ltt_chanbuf_hotcpu_callback,
+       .priority = 6,
+};
+
+int __init ltt_relay_init(void)
+{
+       printk(KERN_INFO "LTT : ltt-relay init\n");
+
+       ltt_transport_register(&ltt_relay_transport);
+       register_cpu_notifier(&fn_ltt_chanbuf_hotcpu_callback);
+       register_idle_notifier(&pm_idle_entry_notifier);
+
+       return 0;
+}
+
+void __exit ltt_relay_exit(void)
+{
+       printk(KERN_INFO "LTT : ltt-relay exit\n");
+
+       unregister_idle_notifier(&pm_idle_entry_notifier);
+       unregister_cpu_notifier(&fn_ltt_chanbuf_hotcpu_callback);
+       ltt_transport_unregister(&ltt_relay_transport);
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Lockless Relay");
diff --git a/ltt-relay-lockless.h b/ltt-relay-lockless.h
new file mode 100644 (file)
index 0000000..62fc515
--- /dev/null
@@ -0,0 +1,549 @@
+#ifndef _LTT_LTT_RELAY_LOCKLESS_H
+#define _LTT_LTT_RELAY_LOCKLESS_H
+
+/*
+ * ltt/ltt-relay-lockless.h
+ *
+ * (C) Copyright 2005-2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * LTTng lockless buffer space management (reader/writer).
+ *
+ * Author:
+ *     Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Inspired from LTT :
+ *  Karim Yaghmour (karim@opersys.com)
+ *  Tom Zanussi (zanussi@us.ibm.com)
+ *  Bob Wisniewski (bob@watson.ibm.com)
+ * And from K42 :
+ *  Bob Wisniewski (bob@watson.ibm.com)
+ *
+ * Changelog:
+ *  08/10/08, Cleanup.
+ *  19/10/05, Complete lockless mechanism.
+ *  27/05/05, Modular redesign and rewrite.
+ *
+ * Userspace reader semantic :
+ * while (poll fd != POLLHUP) {
+ *   - ioctl RELAY_GET_SB_SIZE
+ *   while (1) {
+ *     - ioctl RELAY_GET_SB
+ *     - splice 1 subbuffer worth of data to a pipe
+ *     - splice the data from pipe to disk/network
+ *     - ioctl RELAY_PUT_SB, check error value
+ *       if err val < 0, previous subbuffer was corrupted.
+ *   }
+ * }
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/cache.h>
+#include <linux/time.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/fs.h>
+#include <linux/smp_lock.h>
+#include <linux/debugfs.h>
+#include <linux/stat.h>
+#include <linux/cpu.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/splice.h>
+#include <asm/atomic.h>
+#include <asm/local.h>
+
+#include "ltt-tracer.h"
+#include "ltt-relay.h"
+
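+/*
+ * The offset arithmetic below relies on the SUBBUF_INDEX()/SUBBUF_OFFSET()/
+ * SUBBUF_ALIGN()/SUBBUF_TRUNC()/BUFFER_TRUNC() helpers from the included
+ * tracer headers: given a free-running write offset, they yield the
+ * sub-buffer index, the offset within the current sub-buffer, the offset
+ * aligned to the next sub-buffer boundary, the offset truncated to the
+ * current sub-buffer boundary, and the offset truncated to the buffer size.
+ */
+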
+#if 0
+#define printk_dbg(fmt, args...) printk(fmt, args)
+#else
+#define printk_dbg(fmt, args...)
+#endif
+
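+/*
+ * cc accumulates the bytes committed by writers within a sub-buffer;
+ * cc_sb only catches up with it when the sub-buffer is delivered (see
+ * ltt_check_deliver()), which lets readers tell fully-committed
+ * sub-buffers from those with writes still in flight.
+ */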
+struct commit_counters {
+       local_t cc;
+       local_t cc_sb;                  /* Incremented _once_ at sb switch */
+       local_t events;                 /* Event count */
+};
+
+/* LTTng lockless logging buffer info */
+struct ltt_chanbuf {
+       struct ltt_chanbuf_alloc a;     /* Parent. First field. */
+       /* First 32 bytes cache-hot cacheline */
+       local_t offset;                 /* Current offset in the buffer */
+       struct commit_counters *commit_count;
+                                       /* Commit count per sub-buffer */
+       atomic_long_t consumed;         /*
+                                        * Current offset in the buffer
+                                        * standard atomic access (shared)
+                                        */
+       unsigned long last_tsc;         /*
+                                        * Last timestamp written in the buffer.
+                                        */
+       /* End of first 32 bytes cacheline */
+#ifdef CONFIG_LTT_VMCORE
+       local_t *commit_seq;            /* Consecutive commits */
+#endif
+       atomic_long_t active_readers;   /*
+                                        * Active readers count
+                                        * standard atomic access (shared)
+                                        */
+       local_t events_lost;
+       local_t corrupted_subbuffers;
+       spinlock_t full_lock;           /*
+                                        * buffer full condition spinlock, only
+                                        * for userspace tracing blocking mode
+                                        * synchronization with reader.
+                                        */
+       wait_queue_head_t write_wait;   /*
+                                        * Wait queue for blocking user space
+                                        * writers
+                                        */
+       wait_queue_head_t read_wait;    /* reader wait queue */
+       unsigned int finalized;         /* buffer has been finalized */
+       struct timer_list switch_timer; /* timer for periodical switch */
+};
+
+/*
+ * A switch is done during tracing or as a final flush after tracing (so it
+ * won't write in the new sub-buffer).
+ */
+enum force_switch_mode { FORCE_ACTIVE, FORCE_FLUSH };
+
+extern
+int ltt_reserve_slot_lockless_slow(struct ltt_chan *chan,
+                                  struct ltt_trace *trace, size_t data_size,
+                                  int largest_align, int cpu,
+                                  struct ltt_chanbuf **ret_buf,
+                                  size_t *slot_size, long *buf_offset,
+                                  u64 *tsc, unsigned int *rflags);
+
+extern void ltt_force_switch_lockless_slow(struct ltt_chanbuf *buf,
+                                          enum force_switch_mode mode);
+
+/*
+ * Last TSC comparison functions. Check if the current TSC overflows
+ * LTT_TSC_BITS bits from the last TSC read. Reads and writes last_tsc
+ * atomically.
+ */
+
+#if (BITS_PER_LONG == 32)
+static __inline__ void save_last_tsc(struct ltt_chanbuf *buf, u64 tsc)
+{
+       buf->last_tsc = (unsigned long)(tsc >> LTT_TSC_BITS);
+}
+
+static __inline__ int last_tsc_overflow(struct ltt_chanbuf *buf, u64 tsc)
+{
+       unsigned long tsc_shifted = (unsigned long)(tsc >> LTT_TSC_BITS);
+
+       if (unlikely((tsc_shifted - buf->last_tsc)))
+               return 1;
+       else
+               return 0;
+}
+#else
+static __inline__ void save_last_tsc(struct ltt_chanbuf *buf, u64 tsc)
+{
+       buf->last_tsc = (unsigned long)tsc;
+}
+
+static __inline__ int last_tsc_overflow(struct ltt_chanbuf *buf, u64 tsc)
+{
+       if (unlikely((tsc - buf->last_tsc) >> LTT_TSC_BITS))
+               return 1;
+       else
+               return 0;
+}
+#endif
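+
+/*
+ * Worked example (64-bit case): two timestamps at least 2^LTT_TSC_BITS
+ * cycles apart (e.g. 2^27 if LTT_TSC_BITS is 27) differ in the bits above
+ * LTT_TSC_BITS, so (tsc - last_tsc) >> LTT_TSC_BITS is non-zero and the
+ * writer emits a full TSC event header instead of a compact one.
+ */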
+
+extern
+int ltt_chanbuf_create(struct ltt_chanbuf *buf, struct ltt_chan_alloc *chana,
+                      int cpu);
+extern void ltt_chanbuf_free(struct ltt_chanbuf *buf);
+extern int ltt_chan_create(const char *base_filename, struct ltt_chan *chan,
+                          struct dentry *parent, size_t sb_size, size_t n_sb,
+                          int overwrite, struct ltt_trace *trace);
+extern void ltt_chan_free(struct kref *kref);
+extern void ltt_chan_remove_files(struct ltt_chan *chan);
+
+/* Buffer access operations */
+
+extern int ltt_chanbuf_open_read(struct ltt_chanbuf *buf);
+extern void ltt_chanbuf_release_read(struct ltt_chanbuf *buf);
+extern int ltt_chanbuf_get_subbuf(struct ltt_chanbuf *buf,
+                                 unsigned long *consumed);
+extern int ltt_chanbuf_put_subbuf(struct ltt_chanbuf *buf,
+                                 unsigned long consumed);
+extern void ltt_chan_start_switch_timer(struct ltt_chan *chan);
+extern void ltt_chan_stop_switch_timer(struct ltt_chan *chan);
+
+extern int ltt_relay_init(void);
+extern void ltt_relay_exit(void);
+
+static __inline__
+unsigned long ltt_chanbuf_get_offset(struct ltt_chanbuf *buf)
+{
+       return local_read(&buf->offset);
+}
+
+static __inline__
+unsigned long ltt_chanbuf_get_consumed(struct ltt_chanbuf *buf)
+{
+       return atomic_long_read(&buf->consumed);
+}
+
+static __inline__
+int ltt_chanbuf_is_finalized(struct ltt_chanbuf *buf)
+{
+       return buf->finalized;
+}
+
+static __inline__
+void ltt_reserve_push_reader(struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                            long offset)
+{
+       long consumed_old, consumed_new;
+
+       do {
+               consumed_old = atomic_long_read(&buf->consumed);
+               /*
+                * If buffer is in overwrite mode, push the reader consumed
+                * count if the write position has reached it and we are not
+                * at the first iteration (don't push the reader farther than
+                * the writer). This operation can be done concurrently by many
+                * writers in the same buffer; the writer at the farthest
+                * write position in the buffer is the one which wins this
+                * loop.
+                * If the buffer is not in overwrite mode, pushing the reader
+                * only happens if a sub-buffer is corrupted.
+                */
+               if (unlikely((SUBBUF_TRUNC(offset, chan)
+                             - SUBBUF_TRUNC(consumed_old, chan))
+                            >= chan->a.buf_size))
+                       consumed_new = SUBBUF_ALIGN(consumed_old, chan);
+               else
+                       return;
+       } while (unlikely(atomic_long_cmpxchg(&buf->consumed, consumed_old,
+                                             consumed_new) != consumed_old));
+}
+
+#ifdef CONFIG_LTT_VMCORE
+static __inline__
+void ltt_vmcore_check_deliver(struct ltt_chanbuf *buf, long commit_count,
+                             long idx)
+{
+       local_set(&buf->commit_seq[idx], commit_count);
+}
+#else
+static __inline__
+void ltt_vmcore_check_deliver(struct ltt_chanbuf *buf, long commit_count,
+                             long idx)
+{
+}
+#endif
+
+static __inline__
+void ltt_check_deliver(struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                      long offset, long commit_count, long idx)
+{
+       long old_commit_count = commit_count - chan->a.sb_size;
+
+       /* Check if all commits have been done */
+       if (unlikely((BUFFER_TRUNC(offset, chan) >> chan->a.n_sb_order)
+                    - (old_commit_count & chan->commit_count_mask) == 0)) {
+               /*
+                * If we succeeded in updating the cc_sb, we are delivering
+                * the subbuffer. Deals with concurrent updates of the "cc"
+                * value without adding an add_return atomic operation to the
+                * fast path.
+                */
+               if (likely(local_cmpxchg(&buf->commit_count[idx].cc_sb,
+                                        old_commit_count, commit_count)
+                          == old_commit_count)) {
+                       /*
+                        * Set noref flag for this subbuffer.
+                        */
+                       ltt_set_noref_flag(&buf->a, idx);
+                       ltt_vmcore_check_deliver(buf, commit_count, idx);
+               }
+       }
+}
+
+
+static __inline__
+int ltt_poll_deliver(struct ltt_chanbuf *buf, struct ltt_chan *chan)
+{
+       long consumed_old, consumed_idx, commit_count, write_offset;
+
+       consumed_old = atomic_long_read(&buf->consumed);
+       consumed_idx = SUBBUF_INDEX(consumed_old, chan);
+       commit_count = local_read(&buf->commit_count[consumed_idx].cc_sb);
+       /*
+        * No memory barrier here, since we are only interested
+        * in a statistically correct polling result. The next poll will
+        * get the data if we are racing. The mb() that ensures correct
+        * memory order is in get_subbuf.
+        */
+       write_offset = local_read(&buf->offset);
+
+       /*
+        * Check that the subbuffer we are trying to consume has been
+        * already fully committed.
+        */
+
+       if (((commit_count - chan->a.sb_size)
+            & chan->commit_count_mask)
+           - (BUFFER_TRUNC(consumed_old, chan)
+              >> chan->a.n_sb_order)
+           != 0)
+               return 0;
+
+       /*
+        * Check that we are not about to read the same subbuffer in
+        * which the writer head is.
+        */
+       if ((SUBBUF_TRUNC(write_offset, chan)
+          - SUBBUF_TRUNC(consumed_old, chan))
+          == 0)
+               return 0;
+
+       return 1;
+
+}
+
+static __inline__
+u32 get_read_sb_size(struct ltt_chanbuf *buf)
+{
+       struct ltt_subbuffer_header *header =
+               (struct ltt_subbuffer_header *)
+                       ltt_relay_read_offset_address(&buf->a, 0);
+       return header->sb_size;
+}
+
+/*
+ * returns 0 if reserve ok, or 1 if the slow path must be taken.
+ */
+static __inline__
+int ltt_relay_try_reserve(struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                         size_t data_size, u64 *tsc, unsigned int *rflags,
+                         int largest_align, long *o_begin, long *o_end,
+                         long *o_old, size_t *before_hdr_pad, size_t *size)
+{
+       *o_begin = local_read(&buf->offset);
+       *o_old = *o_begin;
+
+       *tsc = trace_clock_read64();
+
+#ifdef CONFIG_LTT_VMCORE
+       prefetch(&buf->commit_count[SUBBUF_INDEX(*o_begin, chan)]);
+       prefetch(&buf->commit_seq[SUBBUF_INDEX(*o_begin, chan)]);
+#else
+       prefetchw(&buf->commit_count[SUBBUF_INDEX(*o_begin, chan)]);
+#endif
+       if (last_tsc_overflow(buf, *tsc))
+               *rflags = LTT_RFLAG_ID_SIZE_TSC;
+
+       if (unlikely(SUBBUF_OFFSET(*o_begin, chan) == 0))
+               return 1;
+
+       *size = ltt_get_header_size(chan, *o_begin, data_size, before_hdr_pad,
+                                   *rflags);
+       *size += ltt_align(*o_begin + *size, largest_align) + data_size;
+       if (unlikely((SUBBUF_OFFSET(*o_begin, chan) + *size) > chan->a.sb_size))
+               return 1;
+
+       /*
+        * Event fits in the current buffer and we are not on a switch
+        * boundary. It's safe to write.
+        */
+       *o_end = *o_begin + *size;
+
+       if (unlikely((SUBBUF_OFFSET(*o_end, chan)) == 0))
+               /*
+                * The offset_end will fall at the very beginning of the next
+                * subbuffer.
+                */
+               return 1;
+
+       return 0;
+}
+
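+/*
+ * Reservation fast path: one trace clock read, one header size
+ * computation and a single local cmpxchg in the common case; any
+ * sub-buffer boundary condition makes ltt_relay_try_reserve() return 1,
+ * falling back to ltt_reserve_slot_lockless_slow().
+ */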
+static __inline__
+int ltt_reserve_slot(struct ltt_chan *chan,
+                    struct ltt_trace *trace, size_t data_size,
+                    int largest_align, int cpu,
+                    struct ltt_chanbuf **ret_buf,
+                    size_t *slot_size, long *buf_offset, u64 *tsc,
+                    unsigned int *rflags)
+{
+       struct ltt_chanbuf *buf = *ret_buf = per_cpu_ptr(chan->a.buf, cpu);
+       long o_begin, o_end, o_old;
+       size_t before_hdr_pad;
+
+       /*
+        * Perform retryable operations.
+        */
+       if (unlikely(__get_cpu_var(ltt_nesting) > 4)) {
+               local_inc(&buf->events_lost);
+               return -EPERM;
+       }
+
+       if (unlikely(ltt_relay_try_reserve(buf, chan, data_size, tsc, rflags,
+                                          largest_align, &o_begin, &o_end,
+                                          &o_old, &before_hdr_pad, slot_size)))
+               goto slow_path;
+
+       if (unlikely(local_cmpxchg(&buf->offset, o_old, o_end) != o_old))
+               goto slow_path;
+
+       /*
+        * Atomically update last_tsc. This update races against concurrent
+        * atomic updates, but the race will always cause supplementary full TSC
+        * events, never the opposite (missing a full TSC event when it would be
+        * needed).
+        */
+       save_last_tsc(buf, *tsc);
+
+       /*
+        * Push the reader if necessary
+        */
+       ltt_reserve_push_reader(buf, chan, o_end - 1);
+
+       /*
+        * Clear noref flag for this subbuffer.
+        */
+       ltt_clear_noref_flag(&buf->a, SUBBUF_INDEX(o_end - 1, chan));
+
+       *buf_offset = o_begin + before_hdr_pad;
+       return 0;
+slow_path:
+       return ltt_reserve_slot_lockless_slow(chan, trace, data_size,
+                                             largest_align, cpu, ret_buf,
+                                             slot_size, buf_offset, tsc,
+                                             rflags);
+}
+
+/*
+ * Force a sub-buffer switch for a per-cpu buffer. This operation is
+ * completely reentrant : can be called while tracing is active with
+ * absolutely no lock held.
+ *
+ * Note, however, that as a local_cmpxchg is used for some atomic
+ * operations, this function must be called from the CPU which owns the buffer
+ * for a ACTIVE flush.
+ */
+static __inline__
+void ltt_force_switch(struct ltt_chanbuf *buf, enum force_switch_mode mode)
+{
+       return ltt_force_switch_lockless_slow(buf, mode);
+}
+
+/*
+ * For flight recording. Must be called after the slot commit.
+ * This function increments the subbuffer's commit_seq counter each time the
+ * commit count reaches back the reserve offset (modulo subbuffer size). It is
+ * useful for crash dump.
+ */
+#ifdef CONFIG_LTT_VMCORE
+static __inline__
+void ltt_write_commit_counter(struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                             long idx, long buf_offset, long commit_count,
+                             size_t data_size)
+{
+       long offset;
+       long commit_seq_old;
+
+       offset = buf_offset + data_size;
+
+       /*
+        * SUBBUF_OFFSET includes commit_count_mask. We can simply
+        * compare the offsets within the subbuffer without caring about
+        * buffer full/empty mismatch because offset is never zero here
+        * (subbuffer header and event headers have non-zero length).
+        */
+       if (unlikely(SUBBUF_OFFSET(offset - commit_count, chan)))
+               return;
+
+       commit_seq_old = local_read(&buf->commit_seq[idx]);
+       while (commit_seq_old < commit_count)
+               commit_seq_old = local_cmpxchg(&buf->commit_seq[idx],
+                                        commit_seq_old, commit_count);
+}
+#else
+static __inline__
+void ltt_write_commit_counter(struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                             long idx, long buf_offset, long commit_count,
+                             size_t data_size)
+{
+}
+#endif
+
+/*
+ * Atomic unordered slot commit. Increments the commit count in the
+ * specified sub-buffer, and delivers it if necessary.
+ *
+ * Parameters:
+ *
+ * @buf: buffer.
+ * @chan: channel.
+ * @buf_offset : offset following the event header.
+ * @data_size : size of the event data.
+ * @slot_size : size of the reserved slot.
+ */
+static __inline__
+void ltt_commit_slot(struct ltt_chanbuf *buf, struct ltt_chan *chan,
+                    long buf_offset, size_t data_size, size_t slot_size)
+{
+       long offset_end = buf_offset;
+       long endidx = SUBBUF_INDEX(offset_end - 1, chan);
+       long commit_count;
+
+#ifdef LTT_NO_IPI_BARRIER
+       smp_wmb();
+#else
+       /*
+        * Must write slot data before incrementing commit count.
+        * This compiler barrier is upgraded into a smp_mb() by the IPI
+        * sent by get_subbuf().
+        */
+       barrier();
+#endif
+       local_add(slot_size, &buf->commit_count[endidx].cc);
+       local_inc(&buf->commit_count[endidx].events);
+       /*
+        * commit count read can race with concurrent OOO commit count updates.
+        * This is only needed for ltt_check_deliver (for non-polling delivery
+        * only) and for ltt_write_commit_counter. The race can only cause the
+        * counter to be read with the same value more than once, which could
+        * cause :
+        * - Multiple delivery for the same sub-buffer (which is handled
+        *   gracefully by the reader code) if the value is for a full
+        *   sub-buffer. It's important that we can never miss a sub-buffer
+        *   delivery. Re-reading the value after the local_add ensures this.
+        * - Reading a commit_count with a higher value than what was actually
+        *   added to it for the ltt_write_commit_counter call (again caused by
+        *   a concurrent committer). It does not matter, because this function
+        *   is interested in the fact that the commit count reaches back the
+        *   reserve offset for a specific sub-buffer, which is completely
+        *   independent of the order.
+        */
+       commit_count = local_read(&buf->commit_count[endidx].cc);
+
+       ltt_check_deliver(buf, chan, offset_end - 1, commit_count, endidx);
+       /*
+        * Update data_size for each commit. It's needed only for extracting
+        * ltt buffers from vmcore, after crash.
+        */
+       ltt_write_commit_counter(buf, chan, endidx, buf_offset,
+                                commit_count, data_size);
+}
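+
+/*
+ * Writer-side sketch (illustrative; the real callers live in
+ * ltt-serialize.c and ltt-type-serializer.c):
+ *   - ltt_reserve_slot(chan, trace, data_size, largest_align, cpu,
+ *                      &buf, &slot_size, &buf_offset, &tsc, &rflags)
+ *   - on success, write the event header and payload into the slot
+ *   - ltt_commit_slot(buf, chan, buf_offset, data_size, slot_size)
+ */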
+
+#endif /* _LTT_LTT_RELAY_LOCKLESS_H */
diff --git a/ltt-relay-splice.c b/ltt-relay-splice.c
new file mode 100644 (file)
index 0000000..e4694c1
--- /dev/null
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ * Copyright (C) 2008-2009 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Re-using content from kernel/relay.c
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/splice.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/bitops.h>
+
+#include "ltt-relay.h"
+#include "ltt-relay-lockless.h"
+
+loff_t ltt_relay_no_llseek(struct file *file, loff_t offset, int origin)
+{
+       return -ESPIPE;
+}
+
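+/*
+ * No-op release: the spliced pages belong to the channel buffer and are
+ * recycled by the writer, so the pipe must not free them.
+ */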
+static void ltt_relay_pipe_buf_release(struct pipe_inode_info *pipe,
+                                      struct pipe_buffer *pbuf)
+{
+}
+
+static struct pipe_buf_operations ltt_relay_pipe_buf_ops = {
+       .can_merge = 0,
+       .map = generic_pipe_buf_map,
+       .unmap = generic_pipe_buf_unmap,
+       .confirm = generic_pipe_buf_confirm,
+       .release = ltt_relay_pipe_buf_release,
+       .steal = generic_pipe_buf_steal,
+       .get = generic_pipe_buf_get,
+};
+
+static void ltt_relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+}
+
+/*
+ *     subbuf_splice_actor - splice up to one subbuf's worth of data
+ */
+static int subbuf_splice_actor(struct file *in,
+                              loff_t *ppos,
+                              struct pipe_inode_info *pipe,
+                              size_t len,
+                              unsigned int flags)
+{
+       struct ltt_chanbuf *buf = in->private_data;
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+       unsigned int poff, subbuf_pages, nr_pages;
+       struct page *pages[PIPE_DEF_BUFFERS];
+       struct partial_page partial[PIPE_DEF_BUFFERS];
+       struct splice_pipe_desc spd = {
+               .pages = pages,
+               .nr_pages = 0,
+               .partial = partial,
+               .flags = flags,
+               .ops = &ltt_relay_pipe_buf_ops,
+               .spd_release = ltt_relay_page_release,
+       };
+       long consumed_old, consumed_idx, roffset;
+       unsigned long bytes_avail;
+
+       /*
+        * Check that a GET_SUBBUF ioctl has been done before.
+        */
+       WARN_ON(atomic_long_read(&buf->active_readers) != 1);
+       consumed_old = atomic_long_read(&buf->consumed);
+       consumed_old += *ppos;
+       consumed_idx = SUBBUF_INDEX(consumed_old, chan);
+
+       /*
+        * Adjust read len, if longer than what is available.
+        * The maximum read size is one sub-buffer, since reads are
+        * protected by the get_subbuf/put_subbuf pair.
+        */
+       bytes_avail = chan->a.sb_size;
+       WARN_ON(bytes_avail > chan->a.buf_size);
+       len = min_t(size_t, len, bytes_avail);
+       subbuf_pages = bytes_avail >> PAGE_SHIFT;
+       nr_pages = min_t(unsigned int, subbuf_pages, PIPE_DEF_BUFFERS);
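+       /* Page-aligned read position and offset within that first page. */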
+       roffset = consumed_old & PAGE_MASK;
+       poff = consumed_old & ~PAGE_MASK;
+       printk_dbg(KERN_DEBUG "SPLICE actor len %zu pos %zd write_pos %ld\n",
+                  len, (ssize_t)*ppos, local_read(&buf->offset));
+
+       for (; spd.nr_pages < nr_pages; spd.nr_pages++) {
+               unsigned int this_len;
+               struct page *page;
+
+               if (!len)
+                       break;
+               printk_dbg(KERN_DEBUG "SPLICE actor loop len %zu roffset %ld\n",
+                          len, roffset);
+
+               this_len = PAGE_SIZE - poff;
+               page = ltt_relay_read_get_page(&buf->a, roffset);
+               spd.pages[spd.nr_pages] = page;
+               spd.partial[spd.nr_pages].offset = poff;
+               spd.partial[spd.nr_pages].len = this_len;
+
+               poff = 0;
+               roffset += PAGE_SIZE;
+               len -= this_len;
+       }
+
+       if (!spd.nr_pages)
+               return 0;
+
+       return splice_to_pipe(pipe, &spd);
+}
+
+ssize_t ltt_relay_file_splice_read(struct file *in, loff_t *ppos,
+                                  struct pipe_inode_info *pipe, size_t len,
+                                  unsigned int flags)
+{
+       ssize_t spliced;
+       int ret;
+
+       ret = 0;
+       spliced = 0;
+
+       printk_dbg(KERN_DEBUG "SPLICE read len %zu pos %zd\n", len,
+                  (ssize_t)*ppos);
+       while (len && !spliced) {
+               ret = subbuf_splice_actor(in, ppos, pipe, len, flags);
+               printk_dbg(KERN_DEBUG "SPLICE read loop ret %d\n", ret);
+               if (ret < 0)
+                       break;
+               else if (!ret) {
+                       if (flags & SPLICE_F_NONBLOCK)
+                               ret = -EAGAIN;
+                       break;
+               }
+
+               *ppos += ret;
+               if (ret > len)
+                       len = 0;
+               else
+                       len -= ret;
+               spliced += ret;
+       }
+
+       if (spliced)
+               return spliced;
+
+       return ret;
+}
diff --git a/ltt-relay-vfs.c b/ltt-relay-vfs.c
new file mode 100644 (file)
index 0000000..defbe2d
--- /dev/null
@@ -0,0 +1,244 @@
+/*
+ * ltt/ltt-relay-vfs.c
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * LTTng VFS interface.
+ *
+ * Author:
+ *     Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ltt-channels.h>
+#include <asm/atomic.h>
+
+#include "ltt-tracer.h"
+#include "ltt-relay.h"
+#include "ltt-relay-lockless.h"
+
+/**
+ *     ltt_open - open file op for ltt files
+ *     @inode: opened inode
+ *     @file: opened file
+ *
+ *     Open implementation. Makes sure only one open instance of a buffer is
+ *     done at a given moment.
+ */
+static int ltt_open(struct inode *inode, struct file *file)
+{
+       struct ltt_chanbuf *buf = inode->i_private;
+       int ret;
+
+       ret = ltt_chanbuf_open_read(buf);
+       if (ret)
+               goto end;
+
+       file->private_data = buf;
+       ret = nonseekable_open(inode, file);
+       /*
+        * The LTTng splice operation must believe that the file descriptor
+        * is seekable. This is a temporary fix to follow new checks added
+        * to splice.c. We should probably do the proper thing and implement
+        * an llseek function eventually, which involves modifying the lttng
+        * splice actors accordingly. TODO
+        */
+       file->f_mode |= FMODE_PREAD;
+end:
+       return ret;
+}
+
+/**
+ *     ltt_release - release file op for ltt files
+ *     @inode: opened inode
+ *     @file: opened file
+ *
+ *     Release implementation.
+ */
+static int ltt_release(struct inode *inode, struct file *file)
+{
+       struct ltt_chanbuf *buf = inode->i_private;
+
+       ltt_chanbuf_release_read(buf);
+
+       return 0;
+}
+
+/**
+ *     ltt_poll - file op for ltt files
+ *     @filp: the file
+ *     @wait: poll table
+ *
+ *     Poll implementation.
+ */
+static unsigned int ltt_poll(struct file *filp, poll_table *wait)
+{
+       unsigned int mask = 0;
+       struct inode *inode = filp->f_dentry->d_inode;
+       struct ltt_chanbuf *buf = inode->i_private;
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+
+       if (filp->f_mode & FMODE_READ) {
+               poll_wait_set_exclusive(wait);
+               poll_wait(filp, &buf->read_wait, wait);
+
+               WARN_ON(atomic_long_read(&buf->active_readers) != 1);
+               if (SUBBUF_TRUNC(ltt_chanbuf_get_offset(buf), chan)
+                 - SUBBUF_TRUNC(ltt_chanbuf_get_consumed(buf), chan)
+                 == 0) {
+                       if (buf->finalized)
+                               return POLLHUP;
+                       else
+                               return 0;
+               } else {
+                       if (SUBBUF_TRUNC(ltt_chanbuf_get_offset(buf), chan)
+                         - SUBBUF_TRUNC(ltt_chanbuf_get_consumed(buf), chan)
+                         >= chan->a.buf_size)
+                               return POLLPRI | POLLRDBAND;
+                       else
+                               return POLLIN | POLLRDNORM;
+               }
+       }
+       return mask;
+}
+
+/**
+ *     ltt_ioctl - control on the debugfs file
+ *
+ *     @inode: the inode
+ *     @filp: the file
+ *     @cmd: the command
+ *     @arg: command arg
+ *
+ *     This ioctl implements the five commands necessary for a minimal
+ *     producer/consumer implementation :
+ *     RELAY_GET_SB
+ *             Get the next sub-buffer that can be read. It never blocks.
+ *     RELAY_PUT_SB
+ *             Release the currently read sub-buffer. Parameter is the
+ *             consumed count returned by RELAY_GET_SB.
+ *     RELAY_GET_N_SB
+ *             returns the number of sub-buffers in the per cpu channel.
+ *     RELAY_GET_SB_SIZE
+ *             returns the size of the current sub-buffer.
+ *     RELAY_GET_MAX_SB_SIZE
+ *             returns the maximum size for sub-buffers.
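+ *
+ *     A minimal consumer does, per sub-buffer (illustrative sketch,
+ *     error handling elided):
+ *             ioctl(fd, RELAY_GET_SB, &consumed);
+ *             splice(fd, NULL, pipe_wr, NULL, sb_size, SPLICE_F_MOVE);
+ *             splice(pipe_rd, NULL, out_fd, NULL, sb_size, SPLICE_F_MOVE);
+ *             ioctl(fd, RELAY_PUT_SB, &consumed);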
+ */
+static
+int ltt_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+             unsigned long arg)
+{
+       struct ltt_chanbuf *buf = inode->i_private;
+       u32 __user *argp = (u32 __user *)arg;
+
+       switch (cmd) {
+       case RELAY_GET_SB:
+       {
+               unsigned long consumed;
+               int ret;
+
+               ret = ltt_chanbuf_get_subbuf(buf, &consumed);
+               if (ret)
+                       return ret;
+               else
+                       return put_user((u32)consumed, argp);
+               break;
+       }
+       case RELAY_PUT_SB:
+       {
+               u32 uconsumed_old;
+               int ret;
+               long consumed_old;
+
+               ret = get_user(uconsumed_old, argp);
+               if (ret)
+                       return ret; /* will return -EFAULT */
+
+               consumed_old = ltt_chanbuf_get_consumed(buf);
+               consumed_old = consumed_old & (~0xFFFFFFFFL);
+               consumed_old = consumed_old | uconsumed_old;
+               ret = ltt_chanbuf_put_subbuf(buf, consumed_old);
+               if (ret)
+                       return ret;
+               break;
+       }
+       case RELAY_GET_N_SB:
+               return put_user((u32)buf->a.chan->n_sb, argp);
+       case RELAY_GET_SB_SIZE:
+               return put_user(get_read_sb_size(buf), argp);
+       case RELAY_GET_MAX_SB_SIZE:
+               return put_user((u32)buf->a.chan->sb_size, argp);
+       default:
+               return -ENOIOCTLCMD;
+       }
+       return 0;
+}
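+
+/*
+ * For illustration, a minimal sketch of the user-space consumer loop these
+ * commands enable. It assumes the RELAY_* ioctl numbers are shared with
+ * user space through a common header, that the buffer file has been opened
+ * read-only as fd, and that each sub-buffer is spliced into a pipe
+ * (pipe_fd); error handling is elided:
+ *
+ *     u32 consumed, n_sb, sb_size;
+ *
+ *     ioctl(fd, RELAY_GET_N_SB, &n_sb);
+ *     ioctl(fd, RELAY_GET_SB_SIZE, &sb_size);
+ *     while (!ioctl(fd, RELAY_GET_SB, &consumed)) {
+ *             splice(fd, NULL, pipe_fd, NULL, sb_size, SPLICE_F_MOVE);
+ *             ioctl(fd, RELAY_PUT_SB, &consumed);
+ *     }
+ */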
+
+#ifdef CONFIG_COMPAT
+static
+long ltt_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       long ret = -ENOIOCTLCMD;
+
+       lock_kernel();
+       ret = ltt_ioctl(file->f_dentry->d_inode, file, cmd, arg);
+       unlock_kernel();
+
+       return ret;
+}
+#endif
+
+static const struct file_operations ltt_file_operations = {
+       .open = ltt_open,
+       .release = ltt_release,
+       .poll = ltt_poll,
+       .splice_read = ltt_relay_file_splice_read,
+       .ioctl = ltt_ioctl,
+       .llseek = ltt_relay_no_llseek,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl = ltt_compat_ioctl,
+#endif
+};
+
+int ltt_chanbuf_create_file(const char *filename, struct dentry *parent,
+                           int mode, struct ltt_chanbuf *buf)
+{
+       struct ltt_chan *chan = container_of(buf->a.chan, struct ltt_chan, a);
+       char *tmpname;
+       int ret = 0;
+
+       tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
+       if (!tmpname) {
+               ret = -ENOMEM;
+               goto end;
+       }
+
+       snprintf(tmpname, NAME_MAX, "%s%s_%d",
+                chan->overwrite ? LTT_FLIGHT_PREFIX : "",
+                chan->a.filename, buf->a.cpu);
+
+       buf->a.dentry = debugfs_create_file(tmpname, mode, parent, buf,
+                                           &ltt_file_operations);
+       if (!buf->a.dentry) {
+               ret = -ENOMEM;
+               goto free_name;
+       }
+free_name:
+       kfree(tmpname);
+end:
+       return ret;
+}
+
+int ltt_chanbuf_remove_file(struct ltt_chanbuf *buf)
+{
+       debugfs_remove(buf->a.dentry);
+
+       return 0;
+}
diff --git a/ltt-relay.h b/ltt-relay.h
new file mode 100644 (file)
index 0000000..c79403b
--- /dev/null
@@ -0,0 +1,377 @@
+/*
+ * include/linux/ltt-relay.h
+ *
+ * Copyright (C) 2008,2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ *
+ * Credits to Steven Rostedt for proposing the use of an extra subbuffer owned
+ * by the reader in flight recorder mode.
+ */
+
+#ifndef _LINUX_LTT_RELAY_H
+#define _LINUX_LTT_RELAY_H
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/ltt-channels.h>
+
+#include "ltt-tracer-core.h"
+
+/* Use lowest pointer bit to show the sub-buffer has no reference. */
+#define RCHAN_NOREF_FLAG       0x1UL
+
+#define RCHAN_SB_IS_NOREF(x)   ((unsigned long)(x) & RCHAN_NOREF_FLAG)
+#define RCHAN_SB_SET_NOREF(x)  \
+       (x = (struct chanbuf_page *)((unsigned long)(x) | RCHAN_NOREF_FLAG))
+#define RCHAN_SB_CLEAR_NOREF(x)        \
+       (x = (struct chanbuf_page *)((unsigned long)(x) & ~RCHAN_NOREF_FLAG))
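+
+/*
+ * The flag travels in the pages pointer itself, so it must be cleared
+ * before the pointer is dereferenced. A minimal sketch (the real writer
+ * path uses cmpxchg, see ltt_clear_noref_flag() below):
+ *
+ *     struct chanbuf_page *pages = bufa->buf_wsb[idx].pages;
+ *
+ *     if (RCHAN_SB_IS_NOREF(pages))
+ *             RCHAN_SB_CLEAR_NOREF(pages);
+ *     va = pages[0].virt;
+ *
+ * Only after the flag has been cleared is the pages pointer valid to
+ * dereference.
+ */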
+
+struct ltt_trace;
+
+struct chanbuf_page {
+       void *virt;                     /* page virtual address (cached) */
+       struct page *page;              /* pointer to page structure */
+};
+
+struct chanbuf_sb {
+       struct chanbuf_page *pages;     /* Pointer to chanbuf pages for subbuf */
+};
+
+struct ltt_chanbuf_alloc {
+       struct chanbuf_sb *buf_wsb;     /* Array of chanbuf_sb for writer */
+       struct chanbuf_sb buf_rsb;      /* chanbuf_sb for reader */
+       void **_virt;                   /* Array of pointers to page addr */
+       struct page **_pages;           /* Array of pointers to pages */
+       struct dentry *dentry;          /* Associated file dentry */
+       unsigned int nr_pages;          /* Number of pages in buffer */
+
+       struct ltt_chan_alloc *chan;    /* Associated channel */
+       unsigned int cpu;               /* This buffer's cpu */
+       unsigned int allocated:1;       /* Bool: is buffer allocated? */
+};
+
+int ltt_chanbuf_alloc_create(struct ltt_chanbuf_alloc *buf,
+                            struct ltt_chan_alloc *chan, int cpu);
+void ltt_chanbuf_alloc_free(struct ltt_chanbuf_alloc *buf);
+int ltt_chan_alloc_init(struct ltt_chan_alloc *chan, struct ltt_trace *trace,
+                       const char *base_filename,
+                       struct dentry *parent, size_t sb_size,
+                       size_t n_sb, int extra_reader_sb, int overwrite);
+void ltt_chan_alloc_free(struct ltt_chan_alloc *chan);
+void ltt_chan_alloc_remove_files(struct ltt_chan_alloc *chan);
+int ltt_chanbuf_create_file(const char *filename, struct dentry *parent,
+                           int mode, struct ltt_chanbuf *buf);
+int ltt_chanbuf_remove_file(struct ltt_chanbuf *buf);
+
+void ltt_chan_for_each_channel(void (*cb) (struct ltt_chanbuf *buf), int cpu);
+
+extern void _ltt_relay_write(struct ltt_chanbuf_alloc *bufa,
+                            size_t offset, const void *src, size_t len,
+                            ssize_t pagecpy);
+
+extern void _ltt_relay_strncpy(struct ltt_chanbuf_alloc *bufa,
+                              size_t offset, const void *src, size_t len,
+                              ssize_t pagecpy);
+
+extern void _ltt_relay_strncpy_fixup(struct ltt_chanbuf_alloc *bufa,
+                                    size_t offset, size_t len, size_t copied,
+                                    int terminated);
+
+extern int ltt_relay_read(struct ltt_chanbuf_alloc *bufa,
+                         size_t offset, void *dest, size_t len);
+
+extern int ltt_relay_read_cstr(struct ltt_chanbuf_alloc *bufa,
+                              size_t offset, void *dest, size_t len);
+
+extern struct page *ltt_relay_read_get_page(struct ltt_chanbuf_alloc *bufa,
+                                           size_t offset);
+
+/*
+ * Return the address where a given offset is located.
+ * Should be used to get the current sub-buffer header pointer. Since we know
+ * the header is never on a page boundary, it is safe to write directly to
+ * this address, as long as the write is never bigger than a page size.
+ */
+extern void *ltt_relay_offset_address(struct ltt_chanbuf_alloc *bufa,
+                                     size_t offset);
+extern void *ltt_relay_read_offset_address(struct ltt_chanbuf_alloc *bufa,
+                                          size_t offset);
+
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+static __inline__
+void ltt_relay_do_copy(void *dest, const void *src, size_t len)
+{
+       switch (len) {
+       case 0:
+               break;
+       case 1:
+               *(u8 *)dest = *(const u8 *)src;
+               break;
+       case 2:
+               *(u16 *)dest = *(const u16 *)src;
+               break;
+       case 4:
+               *(u32 *)dest = *(const u32 *)src;
+               break;
+       case 8:
+               *(u64 *)dest = *(const u64 *)src;
+               break;
+       default:
+               /*
+                * What we really want here is an __inline__ memcpy, but we don't
+                * have constants, so gcc generally uses a function call.
+                */
+               for (; len > 0; len--)
+                       *(u8 *)dest++ = *(const u8 *)src++;
+       }
+}
+#else
+/*
+ * Returns whether the dest and src addresses are aligned on
+ * min(sizeof(void *), len). Call this with statically known len for efficiency.
+ */
+static __inline__
+int addr_aligned(const void *dest, const void *src, size_t len)
+{
+       if (ltt_align((size_t)dest, len))
+               return 0;
+       if (ltt_align((size_t)src, len))
+               return 0;
+       return 1;
+}
+
+static __inline__
+void ltt_relay_do_copy(void *dest, const void *src, size_t len)
+{
+       switch (len) {
+       case 0:
+               break;
+       case 1:
+               *(u8 *)dest = *(const u8 *)src;
+               break;
+       case 2:
+               if (unlikely(!addr_aligned(dest, src, 2)))
+                       goto memcpy_fallback;
+               *(u16 *)dest = *(const u16 *)src;
+               break;
+       case 4:
+               if (unlikely(!addr_aligned(dest, src, 4)))
+                       goto memcpy_fallback;
+               *(u32 *)dest = *(const u32 *)src;
+               break;
+       case 8:
+               if (unlikely(!addr_aligned(dest, src, 8)))
+                       goto memcpy_fallback;
+               *(u64 *)dest = *(const u64 *)src;
+               break;
+       default:
+               goto memcpy_fallback;
+       }
+       return;
+
+memcpy_fallback:
+       /*
+        * What we really want here is an inline memcpy, but we don't
+        * have constants, so gcc generally uses a function call.
+        */
+       for (; len > 0; len--)
+               *(u8 *)dest++ = *(const u8 *)src++;
+}
+#endif
+
+/*
+ * ltt_relay_do_memset - write character into dest.
+ * @dest: destination
+ * @src: source character
+ * @len: length to write
+ */
+static __inline__
+void ltt_relay_do_memset(void *dest, char src, size_t len)
+{
+       /*
+        * What we really want here is an __inline__ memset, but we
+        * don't have constants, so gcc generally uses a function call.
+        */
+       for (; len > 0; len--)
+               *(u8 *)dest++ = src;
+}
+
+/*
+ * ltt_relay_do_strncpy - copy a string up to a certain number of bytes
+ * @dest: destination
+ * @src: source
+ * @len: max. length to copy
+ * @terminated: output string ends with \0 (output)
+ *
+ * returns the number of bytes copied. Does not finalize with \0 if len is
+ * reached.
+ */
+static __inline__
+size_t ltt_relay_do_strncpy(void *dest, const void *src, size_t len,
+                           int *terminated)
+{
+       size_t orig_len = len;
+
+       *terminated = 0;
+       /*
+        * What we really want here is an __inline__ strncpy, but we
+        * don't have constants, so gcc generally uses a function call.
+        */
+       for (; len > 0; len--) {
+               *(u8 *)dest = ACCESS_ONCE(*(const u8 *)src);
+               /* Check with dest, because src may be modified concurrently */
+               if (*(const u8 *)dest == '\0') {
+                       len--;
+                       *terminated = 1;
+                       break;
+               }
+               dest++;
+               src++;
+       }
+       return orig_len - len;
+}
+
+static __inline__
+int ltt_relay_write(struct ltt_chanbuf_alloc *bufa,
+                   struct ltt_chan_alloc *chana, size_t offset,
+                   const void *src, size_t len)
+{
+       size_t sbidx, index;
+       ssize_t pagecpy;
+       struct chanbuf_page *rpages;
+
+       offset &= chana->buf_size - 1;
+       sbidx = offset >> chana->sb_size_order;
+       index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+       pagecpy = min_t(size_t, len, (- offset) & ~PAGE_MASK);
+       rpages = bufa->buf_wsb[sbidx].pages;
+       WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+       ltt_relay_do_copy(rpages[index].virt + (offset & ~PAGE_MASK),
+                         src, pagecpy);
+
+       if (unlikely(len != pagecpy))
+               _ltt_relay_write(bufa, offset, src, len, pagecpy);
+       return len;
+}
+
+static __inline__
+int ltt_relay_strncpy(struct ltt_chanbuf_alloc *bufa,
+                     struct ltt_chan_alloc *chana, size_t offset,
+                     const void *src, size_t len)
+{
+       size_t sbidx, index;
+       ssize_t pagecpy, copied;
+       struct chanbuf_page *rpages;
+       int terminated;
+
+       offset &= chana->buf_size - 1;
+       sbidx = offset >> chana->sb_size_order;
+       index = (offset & (chana->sb_size - 1)) >> PAGE_SHIFT;
+       pagecpy = min_t(size_t, len, (- offset) & ~PAGE_MASK);
+       rpages = bufa->buf_wsb[sbidx].pages;
+       WARN_ON_ONCE(RCHAN_SB_IS_NOREF(rpages));
+       copied = ltt_relay_do_strncpy(rpages[index].virt
+                                     + (offset & ~PAGE_MASK),
+                                     src, pagecpy, &terminated);
+       if (unlikely(copied < pagecpy || ((len == pagecpy) && !terminated)))
+               _ltt_relay_strncpy_fixup(bufa, offset, len, copied,
+                                        terminated);
+       else {
+               if (unlikely(len != pagecpy))
+                       _ltt_relay_strncpy(bufa, offset, src, len, pagecpy);
+       }
+       return len;
+}
+
+/**
+ * ltt_clear_noref_flag - Clear the noref subbuffer flag, for writer.
+ */
+static __inline__
+void ltt_clear_noref_flag(struct ltt_chanbuf_alloc *bufa, long idx)
+{
+       struct chanbuf_page *sb_pages, *new_sb_pages;
+
+       sb_pages = bufa->buf_wsb[idx].pages;
+       for (;;) {
+               if (!RCHAN_SB_IS_NOREF(sb_pages))
+                       return; /* Already writing to this buffer */
+               new_sb_pages = sb_pages;
+               RCHAN_SB_CLEAR_NOREF(new_sb_pages);
+               new_sb_pages = cmpxchg(&bufa->buf_wsb[idx].pages,
+                       sb_pages, new_sb_pages);
+               if (likely(new_sb_pages == sb_pages))
+                       break;
+               sb_pages = new_sb_pages;
+       }
+}
+
+/**
+ * ltt_set_noref_flag - Set the noref subbuffer flag, for writer.
+ */
+static __inline__
+void ltt_set_noref_flag(struct ltt_chanbuf_alloc *bufa, long idx)
+{
+       struct chanbuf_page *sb_pages, *new_sb_pages;
+
+       sb_pages = bufa->buf_wsb[idx].pages;
+       for (;;) {
+               if (RCHAN_SB_IS_NOREF(sb_pages))
+                       return; /* Already set */
+               new_sb_pages = sb_pages;
+               RCHAN_SB_SET_NOREF(new_sb_pages);
+               new_sb_pages = cmpxchg(&bufa->buf_wsb[idx].pages,
+                       sb_pages, new_sb_pages);
+               if (likely(new_sb_pages == sb_pages))
+                       break;
+               sb_pages = new_sb_pages;
+       }
+}
+
+/**
+ * update_read_sb_index - Read-side subbuffer index update.
+ */
+static __inline__
+int update_read_sb_index(struct ltt_chanbuf_alloc *bufa,
+                        struct ltt_chan_alloc *chana,
+                        long consumed_idx)
+{
+       struct chanbuf_page *old_wpage, *new_wpage;
+
+       if (unlikely(chana->extra_reader_sb)) {
+               /*
+                * Exchange the target writer subbuffer with our own unused
+                * subbuffer.
+                */
+               old_wpage = bufa->buf_wsb[consumed_idx].pages;
+               if (unlikely(!RCHAN_SB_IS_NOREF(old_wpage)))
+                       return -EAGAIN;
+               WARN_ON_ONCE(!RCHAN_SB_IS_NOREF(bufa->buf_rsb.pages));
+               new_wpage = cmpxchg(&bufa->buf_wsb[consumed_idx].pages,
+                               old_wpage,
+                               bufa->buf_rsb.pages);
+               if (unlikely(old_wpage != new_wpage))
+                       return -EAGAIN;
+               bufa->buf_rsb.pages = new_wpage;
+               RCHAN_SB_CLEAR_NOREF(bufa->buf_rsb.pages);
+       } else {
+               /* No page exchange, use the writer page directly */
+               bufa->buf_rsb.pages = bufa->buf_wsb[consumed_idx].pages;
+               RCHAN_SB_CLEAR_NOREF(bufa->buf_rsb.pages);
+       }
+       return 0;
+}
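+
+/*
+ * Sketch of a read-side sequence built on the helper above (the actual
+ * consumer code lives in the lockless relay implementation):
+ *
+ *     if (update_read_sb_index(bufa, chana, consumed_idx))
+ *             return -EAGAIN;
+ *
+ * -EAGAIN means the writer still references the target sub-buffer. On
+ * success the reader owns bufa->buf_rsb.pages until the sub-buffer is
+ * handed back.
+ */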
+
+ssize_t ltt_relay_file_splice_read(struct file *in, loff_t *ppos,
+                                  struct pipe_inode_info *pipe, size_t len,
+                                  unsigned int flags);
+loff_t ltt_relay_no_llseek(struct file *file, loff_t offset, int origin);
+
+#endif /* _LINUX_LTT_RELAY_H */
diff --git a/ltt-serialize.c b/ltt-serialize.c
new file mode 100644 (file)
index 0000000..305b3eb
--- /dev/null
@@ -0,0 +1,969 @@
+/*
+ * LTTng serializing code.
+ *
+ * Copyright Mathieu Desnoyers, March 2007.
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ *
+ * See the discussions about the weirdness of passing a va_list on to other
+ * functions (related to array argument passing): va_list is implemented as
+ * an array on x86_64, but not on i386. This is why we pass a va_list * to
+ * ltt_vtrace.
+ */
+
+#include <stdarg.h>
+#include <linux/ctype.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+#include "ltt-tracer.h"
+#include "ltt-relay-lockless.h"
+
+enum ltt_type {
+       LTT_TYPE_SIGNED_INT,
+       LTT_TYPE_UNSIGNED_INT,
+       LTT_TYPE_STRING,
+       LTT_TYPE_NONE,
+};
+
+#define LTT_ATTRIBUTE_NETWORK_BYTE_ORDER (1<<1)
+
+/*
+ * Stack used to keep track of string lengths at size-calculation time, then
+ * passed to the string copy to handle racy updates of the input strings.
+ * Can be used from any context; this is ensured by restoring the stack
+ * position to its original value after use.
+ */
+#define TRACER_STACK_LEN       (PAGE_SIZE / sizeof(unsigned long))
+static DEFINE_PER_CPU(unsigned long [TRACER_STACK_LEN],
+                     tracer_stack);
+static DEFINE_PER_CPU(unsigned int, tracer_stack_pos);
+
+/*
+ * Inspired from vsnprintf
+ *
+ * The serialization format string supports the basic printf format strings.
+ * In addition, it defines new formats that can be used to serialize more
+ * complex/non portable data structures.
+ *
+ * Typical use:
+ *
+ * field_name %ctype
+ * field_name #tracetype %ctype
+ * field_name #tracetype %ctype1 %ctype2 ...
+ *
+ * A conversion is performed between format string types supported by GCC and
+ * the trace type requested. GCC type is used to perform type checking on format
+ * strings. Trace type is used to specify the exact binary representation
+ * in the trace. A mapping is done between one or more GCC types to one trace
+ * type. Sign extension, if required by the conversion, is performed following
+ * the trace type.
+ *
+ * If a gcc format is not declared with a trace format, the gcc format is
+ * also used as binary representation in the trace.
+ *
+ * Strings are supported with %s.
+ * A single tracetype (sequence) can take multiple c types as parameters.
+ *
+ * c types:
+ *
+ * see printf(3).
+ *
+ * Note: to write a uint32_t in a trace, the following expression is
+ * recommended so that it stays portable:
+ *
+ * ("#4u%lu", (unsigned long)var)
+ *
+ * trace types:
+ *
+ * Serialization specific formats :
+ *
+ * Fixed size integers
+ * #1u     writes uint8_t
+ * #2u     writes uint16_t
+ * #4u     writes uint32_t
+ * #8u     writes uint64_t
+ * #1d     writes int8_t
+ * #2d     writes int16_t
+ * #4d     writes int32_t
+ * #8d     writes int64_t
+ * i.e.:
+ * #1u%lu #2u%lu #4d%lu #8d%lu #llu%hu #d%lu
+ *
+ * Attributes:
+ *
+ * n:  (for network byte order)
+ * #ntracetype%ctype
+ *            is written in the trace in network byte order.
+ *
+ * i.e.: #n4u%lu, #n%lu
+ *
+ * TODO (eventually)
+ * Variable length sequence
+ * #a #tracetype1 #tracetype2 %array_ptr %elem_size %num_elems
+ *            In the trace:
+ *            #a specifies that this is a sequence
+ *            #tracetype1 is the type of elements in the sequence
+ *            #tracetype2 is the type of the element count
+ *            GCC input:
+ *            array_ptr is a pointer to an array that contains members of size
+ *            elem_size.
+ *            num_elems is the number of elements in the array.
+ * i.e.: #a #lu #lu %p %lu %u
+ *
+ * Callback
+ * #k         callback (taken from the probe data)
+ *            The following % arguments are expected by the callback.
+ *
+ * i.e.: #a #lu #lu #k %p
+ *
+ * Note: No conversion is done from floats to integers, nor from integers to
+ * floats, between c types and trace types. Float conversion from double to
+ * float or from float to double is not supported either.
+ *
+ * REMOVE
+ * %*b     expects sizeof(data), data
+ *         where sizeof(data) is 1, 2, 4 or 8
+ *
+ * Fixed length struct, union or array.
+ * FIXME: unable to extract those sizes statically.
+ * %*r     expects sizeof(*ptr), ptr
+ * %*.*r   expects sizeof(*ptr), __alignof__(*ptr), ptr
+ * struct and unions removed.
+ * Fixed length array:
+ * [%p]#a[len #tracetype]
+ * i.e.: [%p]#a[12 #lu]
+ *
+ * Variable length sequence
+ * %*.*:*v expects sizeof(*ptr), __alignof__(*ptr), elem_num, ptr
+ *         where elem_num is the number of elements in the sequence
+ */
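+
+/*
+ * For example (a sketch; the channel and event names are illustrative):
+ *
+ *     trace_mark(chan, some_event, "pid #4d%d comm %s",
+ *                current->pid, current->comm);
+ *
+ * records the pid as a fixed-size int32_t in the trace while type-checking
+ * it as a C int, and comm as a \0-terminated string.
+ */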
+static inline
+const char *parse_trace_type(const char *fmt, char *trace_size,
+                            enum ltt_type *trace_type,
+                            unsigned long *attributes)
+{
+       int qualifier;          /* 'h', 'l', or 'L' for integer fields */
+                               /* 'z' support added 23/7/1999 S.H.    */
+                               /* 'z' changed to 'Z' --davidm 1/25/99 */
+                               /* 't' added for ptrdiff_t */
+
+       /* parse attributes. */
+repeat:
+       switch (*fmt) {
+       case 'n':
+               *attributes |= LTT_ATTRIBUTE_NETWORK_BYTE_ORDER;
+               ++fmt;
+               goto repeat;
+       }
+
+       /* get the conversion qualifier */
+       qualifier = -1;
+       if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
+           *fmt == 'Z' || *fmt == 'z' || *fmt == 't' ||
+           *fmt == 'S' || *fmt == '1' || *fmt == '2' ||
+           *fmt == '4' || *fmt == '8') {
+               qualifier = *fmt;
+               ++fmt;
+               if (qualifier == 'l' && *fmt == 'l') {
+                       qualifier = 'L';
+                       ++fmt;
+               }
+       }
+
+       switch (*fmt) {
+       case 'c':
+               *trace_type = LTT_TYPE_UNSIGNED_INT;
+               *trace_size = sizeof(unsigned char);
+               goto parse_end;
+       case 's':
+               *trace_type = LTT_TYPE_STRING;
+               goto parse_end;
+       case 'p':
+               *trace_type = LTT_TYPE_UNSIGNED_INT;
+               *trace_size = sizeof(void *);
+               goto parse_end;
+       case 'd':
+       case 'i':
+               *trace_type = LTT_TYPE_SIGNED_INT;
+               break;
+       case 'o':
+       case 'u':
+       case 'x':
+       case 'X':
+               *trace_type = LTT_TYPE_UNSIGNED_INT;
+               break;
+       default:
+               if (!*fmt)
+                       --fmt;
+               goto parse_end;
+       }
+       switch (qualifier) {
+       case 'L':
+               *trace_size = sizeof(long long);
+               break;
+       case 'l':
+               *trace_size = sizeof(long);
+               break;
+       case 'Z':
+       case 'z':
+               *trace_size = sizeof(size_t);
+               break;
+       case 't':
+               *trace_size = sizeof(ptrdiff_t);
+               break;
+       case 'h':
+               *trace_size = sizeof(short);
+               break;
+       case '1':
+               *trace_size = sizeof(uint8_t);
+               break;
+       case '2':
+               *trace_size = sizeof(uint16_t);
+               break;
+       case '4':
+               *trace_size = sizeof(uint32_t);
+               break;
+       case '8':
+               *trace_size = sizeof(uint64_t);
+               break;
+       default:
+               *trace_size = sizeof(int);
+       }
+
+parse_end:
+       return fmt;
+}
+
+/*
+ * Restrictions:
+ * Field width and precision are *not* supported.
+ * %n not supported.
+ */
+static inline
+const char *parse_c_type(const char *fmt, char *c_size, enum ltt_type *c_type,
+                        char *outfmt)
+{
+       int qualifier;          /* 'h', 'l', or 'L' for integer fields */
+                               /* 'z' support added 23/7/1999 S.H.    */
+                               /* 'z' changed to 'Z' --davidm 1/25/99 */
+                               /* 't' added for ptrdiff_t */
+
+       /* process flags : ignore standard print formats for now. */
+repeat:
+       switch (*fmt) {
+       case '-':
+       case '+':
+       case ' ':
+       case '#':
+       case '0':
+               ++fmt;
+               goto repeat;
+       }
+
+       /* get the conversion qualifier */
+       qualifier = -1;
+       if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
+           *fmt == 'Z' || *fmt == 'z' || *fmt == 't' ||
+           *fmt == 'S') {
+               qualifier = *fmt;
+               ++fmt;
+               if (qualifier == 'l' && *fmt == 'l') {
+                       qualifier = 'L';
+                       ++fmt;
+               }
+       }
+
+       if (outfmt) {
+               if (qualifier != -1)
+                       *outfmt++ = (char)qualifier;
+               *outfmt++ = *fmt;
+               *outfmt = 0;
+       }
+
+       switch (*fmt) {
+       case 'c':
+               *c_type = LTT_TYPE_UNSIGNED_INT;
+               *c_size = sizeof(unsigned char);
+               goto parse_end;
+       case 's':
+               *c_type = LTT_TYPE_STRING;
+               goto parse_end;
+       case 'p':
+               *c_type = LTT_TYPE_UNSIGNED_INT;
+               *c_size = sizeof(void *);
+               goto parse_end;
+       case 'd':
+       case 'i':
+               *c_type = LTT_TYPE_SIGNED_INT;
+               break;
+       case 'o':
+       case 'u':
+       case 'x':
+       case 'X':
+               *c_type = LTT_TYPE_UNSIGNED_INT;
+               break;
+       default:
+               if (!*fmt)
+                       --fmt;
+               goto parse_end;
+       }
+       switch (qualifier) {
+       case 'L':
+               *c_size = sizeof(long long);
+               break;
+       case 'l':
+               *c_size = sizeof(long);
+               break;
+       case 'Z':
+       case 'z':
+               *c_size = sizeof(size_t);
+               break;
+       case 't':
+               *c_size = sizeof(ptrdiff_t);
+               break;
+       case 'h':
+               *c_size = sizeof(short);
+               break;
+       default:
+               *c_size = sizeof(int);
+       }
+
+parse_end:
+       return fmt;
+}
+
+static inline
+size_t serialize_trace_data(struct ltt_chanbuf *buf, size_t buf_offset,
+                           char trace_size, enum ltt_type trace_type,
+                           char c_size, enum ltt_type c_type,
+                           unsigned int *stack_pos_ctx,
+                           int *largest_align,
+                           va_list *args)
+{
+       union {
+               unsigned long v_ulong;
+               uint64_t v_uint64;
+               struct {
+                       const char *s;
+                       size_t len;
+               } v_string;
+       } tmp;
+
+       /*
+        * Be careful about sign extension here.
+        * Sign extension is done with the destination (trace) type.
+        */
+       switch (trace_type) {
+       case LTT_TYPE_SIGNED_INT:
+               switch (c_size) {
+               case 1:
+                       tmp.v_ulong = (long)(int8_t)va_arg(*args, int);
+                       break;
+               case 2:
+                       tmp.v_ulong = (long)(int16_t)va_arg(*args, int);
+                       break;
+               case 4:
+                       tmp.v_ulong = (long)(int32_t)va_arg(*args, int);
+                       break;
+               case 8:
+                       tmp.v_uint64 = va_arg(*args, int64_t);
+                       break;
+               default:
+                       BUG();
+               }
+               break;
+       case LTT_TYPE_UNSIGNED_INT:
+               switch (c_size) {
+               case 1:
+                       tmp.v_ulong = (unsigned long)(uint8_t)va_arg(*args, unsigned int);
+                       break;
+               case 2:
+                       tmp.v_ulong = (unsigned long)(uint16_t)va_arg(*args, unsigned int);
+                       break;
+               case 4:
+                       tmp.v_ulong = (unsigned long)(uint32_t)va_arg(*args, unsigned int);
+                       break;
+               case 8:
+                       tmp.v_uint64 = va_arg(*args, uint64_t);
+                       break;
+               default:
+                       BUG();
+               }
+               break;
+       case LTT_TYPE_STRING:
+               tmp.v_string.s = va_arg(*args, const char *);
+               if ((unsigned long)tmp.v_string.s < PAGE_SIZE)
+                       tmp.v_string.s = "<NULL>";
+               if (!buf) {
+                       /*
+                        * Reserve tracer stack entry.
+                        */
+                       __get_cpu_var(tracer_stack_pos)++;
+                       WARN_ON_ONCE(__get_cpu_var(tracer_stack_pos)
+                                    > TRACER_STACK_LEN);
+                       barrier();
+                       __get_cpu_var(tracer_stack)[*stack_pos_ctx] =
+                                       strlen(tmp.v_string.s) + 1;
+               }
+               tmp.v_string.len = __get_cpu_var(tracer_stack)
+                                       [(*stack_pos_ctx)++];
+               if (buf)
+                       ltt_relay_strncpy(&buf->a, buf->a.chan, buf_offset,
+                                         tmp.v_string.s, tmp.v_string.len);
+               buf_offset += tmp.v_string.len;
+               goto copydone;
+       default:
+               BUG();
+       }
+
+       /*
+        * If trace_size is less than or equal to 4 bytes, there is no sign
+        * extension to do because we are already encoded in a long. Therefore,
+        * we can combine signed and unsigned ops. 4 bytes float also works
+        * with this, because we do a simple copy of 4 bytes into 4 bytes
+        * without manipulation (and we do not support conversion from integers
+        * to floats).
+        * It is also the case if c_size is 8 bytes, which is the largest
+        * possible integer.
+        */
+       if (ltt_get_alignment()) {
+               buf_offset += ltt_align(buf_offset, trace_size);
+               if (largest_align)
+                       *largest_align = max_t(int, *largest_align, trace_size);
+       }
+       if (trace_size <= 4 || c_size == 8) {
+               if (buf) {
+                       switch (trace_size) {
+                       case 1:
+                               if (c_size == 8)
+                                       ltt_relay_write(&buf->a, buf->a.chan,
+                                       buf_offset,
+                                       (uint8_t[]){ (uint8_t)tmp.v_uint64 },
+                                       sizeof(uint8_t));
+                               else
+                                       ltt_relay_write(&buf->a, buf->a.chan,
+                                       buf_offset,
+                                       (uint8_t[]){ (uint8_t)tmp.v_ulong },
+                                       sizeof(uint8_t));
+                               break;
+                       case 2:
+                               if (c_size == 8)
+                                       ltt_relay_write(&buf->a, buf->a.chan,
+                                       buf_offset,
+                                       (uint16_t[]){ (uint16_t)tmp.v_uint64 },
+                                       sizeof(uint16_t));
+                               else
+                                       ltt_relay_write(&buf->a, buf->a.chan,
+                                       buf_offset,
+                                       (uint16_t[]){ (uint16_t)tmp.v_ulong },
+                                       sizeof(uint16_t));
+                               break;
+                       case 4:
+                               if (c_size == 8)
+                                       ltt_relay_write(&buf->a, buf->a.chan,
+                                       buf_offset,
+                                       (uint32_t[]){ (uint32_t)tmp.v_uint64 },
+                                       sizeof(uint32_t));
+                               else
+                                       ltt_relay_write(&buf->a, buf->a.chan,
+                                       buf_offset,
+                                       (uint32_t[]){ (uint32_t)tmp.v_ulong },
+                                       sizeof(uint32_t));
+                               break;
+                       case 8:
+                               /*
+                                * c_size cannot be other than 8 here because
+                                * trace_size > 4.
+                                */
+                               ltt_relay_write(&buf->a, buf->a.chan, buf_offset,
+                               (uint64_t[]){ (uint64_t)tmp.v_uint64 },
+                               sizeof(uint64_t));
+                               break;
+                       default:
+                               BUG();
+                       }
+               }
+               buf_offset += trace_size;
+               goto copydone;
+       } else {
+               /*
+                * Perform sign extension.
+                */
+               if (buf) {
+                       switch (trace_type) {
+                       case LTT_TYPE_SIGNED_INT:
+                               ltt_relay_write(&buf->a, buf->a.chan, buf_offset,
+                                       (int64_t[]){ (int64_t)tmp.v_ulong },
+                                       sizeof(int64_t));
+                               break;
+                       case LTT_TYPE_UNSIGNED_INT:
+                               ltt_relay_write(&buf->a, buf->a.chan, buf_offset,
+                                       (uint64_t[]){ (uint64_t)tmp.v_ulong },
+                                       sizeof(uint64_t));
+                               break;
+                       default:
+                               BUG();
+                       }
+               }
+               buf_offset += trace_size;
+               goto copydone;
+       }
+
+copydone:
+       return buf_offset;
+}
+
+notrace size_t
+ltt_serialize_data(struct ltt_chanbuf *buf, size_t buf_offset,
+                  struct ltt_serialize_closure *closure,
+                  void *serialize_private, unsigned int stack_pos_ctx,
+                  int *largest_align, const char *fmt, va_list *args)
+{
+       char trace_size = 0, c_size = 0;        /*
+                                                * 0 (unset), 1, 2, 4, 8 bytes.
+                                                */
+       enum ltt_type trace_type = LTT_TYPE_NONE, c_type = LTT_TYPE_NONE;
+       unsigned long attributes = 0;
+
+       for (; *fmt ; ++fmt) {
+               switch (*fmt) {
+               case '#':
+                       /* tracetypes (#) */
+                       ++fmt;                  /* skip first '#' */
+                       if (*fmt == '#')        /* Escaped ## */
+                               break;
+                       attributes = 0;
+                       fmt = parse_trace_type(fmt, &trace_size, &trace_type,
+                                              &attributes);
+                       break;
+               case '%':
+                       /* c types (%) */
+                       ++fmt;                  /* skip first '%' */
+                       if (*fmt == '%')        /* Escaped %% */
+                               break;
+                       fmt = parse_c_type(fmt, &c_size, &c_type, NULL);
+                       /*
+                        * Fall back to the c type if no trace type has
+                        * been specified.
+                        */
+                       if (!trace_size)
+                               trace_size = c_size;
+                       if (trace_type == LTT_TYPE_NONE)
+                               trace_type = c_type;
+                       if (c_type == LTT_TYPE_STRING)
+                               trace_type = LTT_TYPE_STRING;
+                       /* perform trace write */
+                       buf_offset = serialize_trace_data(buf, buf_offset,
+                                                         trace_size,
+                                                         trace_type, c_size,
+                                                         c_type,
+                                                         &stack_pos_ctx,
+                                                         largest_align,
+                                                         args);
+                       trace_size = 0;
+                       c_size = 0;
+                       trace_type = LTT_TYPE_NONE;
+                       c_type = LTT_TYPE_NONE;
+                       attributes = 0;
+                       break;
+                       /* default is to skip the text, doing nothing */
+               }
+       }
+       return buf_offset;
+}
+EXPORT_SYMBOL_GPL(ltt_serialize_data);
+
+static inline
+uint64_t unserialize_base_type(struct ltt_chanbuf *buf,
+                              size_t *ppos, char trace_size,
+                              enum ltt_type trace_type)
+{
+       uint64_t tmp;
+
+       *ppos += ltt_align(*ppos, trace_size);
+       ltt_relay_read(&buf->a, *ppos, &tmp, trace_size);
+       *ppos += trace_size;
+
+       switch (trace_type) {
+       case LTT_TYPE_SIGNED_INT:
+               switch (trace_size) {
+               case 1:
+                       return (uint64_t)*(int8_t *)&tmp;
+               case 2:
+                       return (uint64_t)*(int16_t *)&tmp;
+               case 4:
+                       return (uint64_t)*(int32_t *)&tmp;
+               case 8:
+                       return tmp;
+               }
+               break;
+       case LTT_TYPE_UNSIGNED_INT:
+               switch (trace_size) {
+               case 1:
+                       return (uint64_t)*(uint8_t *)&tmp;
+               case 2:
+                       return (uint64_t)*(uint16_t *)&tmp;
+               case 4:
+                       return (uint64_t)*(uint32_t *)&tmp;
+               case 8:
+                       return tmp;
+               }
+               break;
+       default:
+               break;
+       }
+
+       BUG();
+       return 0;
+}
+
+static
+int serialize_printf_data(struct ltt_chanbuf *buf, size_t *ppos,
+                         char trace_size, enum ltt_type trace_type,
+                         char c_size, enum ltt_type c_type, char *output,
+                         ssize_t outlen, const char *outfmt)
+{
+       u64 value;
+
+       outlen = outlen < 0 ? 0 : outlen;
+
+       if (trace_type == LTT_TYPE_STRING) {
+               size_t len = ltt_relay_read_cstr(&buf->a, *ppos, output,
+                                                outlen);
+               *ppos += len + 1;
+               return len;
+       }
+
+       value = unserialize_base_type(buf, ppos, trace_size, trace_type);
+
+       if (c_size == 8)
+               return snprintf(output, outlen, outfmt, value);
+       else
+               return snprintf(output, outlen, outfmt, (unsigned int)value);
+}
+
+/**
+ * ltt_serialize_printf - Format a string and place it in a buffer
+ * @buf: The ltt-relay buffer that store binary data
+ * @buf_offset: binary data's offset in @buf (should be masked to use as offset)
+ * @msg_size: return message's length
+ * @output: The buffer to place the result into
+ * @outlen: The size of the buffer, including the trailing '\0'
+ * @fmt: The format string to use
+ *
+ * The return value is the number of characters which would
+ * be generated for the given input, excluding the trailing
+ * '\0', as per ISO C99. If the return is greater than or equal to @outlen,
+ * the resulting string is truncated.
+ */
+size_t ltt_serialize_printf(struct ltt_chanbuf *buf, unsigned long buf_offset,
+                           size_t *msg_size, char *output, size_t outlen,
+                           const char *fmt)
+{
+       char trace_size = 0, c_size = 0;        /*
+                                                * 0 (unset), 1, 2, 4, 8 bytes.
+                                                */
+       enum ltt_type trace_type = LTT_TYPE_NONE, c_type = LTT_TYPE_NONE;
+       unsigned long attributes = 0;
+       char outfmt[4] = "%";
+       size_t outpos = 0;
+       size_t len;
+       size_t msgpos = buf_offset;
+
+       for (; *fmt ; ++fmt) {
+               switch (*fmt) {
+               case '#':
+                       /* tracetypes (#) */
+                       ++fmt;                  /* skip first '#' */
+                       if (*fmt == '#') {      /* Escaped ## */
+                               if (outpos < outlen)
+                                       output[outpos] = '#';
+                               outpos++;
+                               break;
+                       }
+                       attributes = 0;
+                       fmt = parse_trace_type(fmt, &trace_size, &trace_type,
+                                              &attributes);
+                       break;
+               case '%':
+                       /* c types (%) */
+                       ++fmt;                  /* skip first '%' */
+                       if (*fmt == '%') {      /* Escaped %% */
+                               if (outpos < outlen)
+                                       output[outpos] = '%';
+                               outpos++;
+                               break;
+                       }
+                       fmt = parse_c_type(fmt, &c_size, &c_type, outfmt + 1);
+                       /*
+                        * Fall back to the c type if no trace type has
+                        * been specified.
+                        */
+                       if (!trace_size)
+                               trace_size = c_size;
+                       if (trace_type == LTT_TYPE_NONE)
+                               trace_type = c_type;
+                       if (c_type == LTT_TYPE_STRING)
+                               trace_type = LTT_TYPE_STRING;
+
+                       /* perform trace printf */
+                       len = serialize_printf_data(buf, &msgpos, trace_size,
+                                                   trace_type, c_size, c_type,
+                                                   output + outpos,
+                                                   outlen - outpos, outfmt);
+                       outpos += len;
+                       trace_size = 0;
+                       c_size = 0;
+                       trace_type = LTT_TYPE_NONE;
+                       c_type = LTT_TYPE_NONE;
+                       attributes = 0;
+                       break;
+               default:
+                       if (outpos < outlen)
+                               output[outpos] = *fmt;
+                       outpos++;
+                       break;
+               }
+       }
+       if (msg_size)
+               *msg_size = (size_t)(msgpos - buf_offset);
+       /*
+        * Make sure we end output with terminating \0 when truncated.
+        */
+       if (outpos >= outlen + 1)
+               output[outlen] = '\0';
+       return outpos;
+}
+EXPORT_SYMBOL_GPL(ltt_serialize_printf);
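+
+/*
+ * Sketch of the intended use (names are illustrative): given the offset of
+ * an event payload and the format string the event was recorded with,
+ * format one line of text and step over the payload:
+ *
+ *     char line[256];
+ *     size_t msg_size;
+ *
+ *     ltt_serialize_printf(buf, buf_offset, &msg_size, line,
+ *                          sizeof(line), "pid #4d%d comm %s");
+ *     buf_offset += msg_size;
+ */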
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+
+unsigned int ltt_fmt_largest_align(size_t align_drift, const char *fmt)
+{
+       char trace_size = 0, c_size = 0;
+       enum ltt_type trace_type = LTT_TYPE_NONE, c_type = LTT_TYPE_NONE;
+       unsigned long attributes = 0;
+       int largest_align = 1;
+
+       for (; *fmt ; ++fmt) {
+               switch (*fmt) {
+               case '#':
+                       /* tracetypes (#) */
+                       ++fmt;                  /* skip first '#' */
+                       if (*fmt == '#')        /* Escaped ## */
+                               break;
+                       attributes = 0;
+                       fmt = parse_trace_type(fmt, &trace_size, &trace_type,
+                                              &attributes);
+
+                       largest_align = max_t(int, largest_align, trace_size);
+                       if (largest_align >= ltt_get_alignment())
+                               goto exit;
+                       break;
+               case '%':
+                       /* c types (%) */
+                       ++fmt;                  /* skip first '%' */
+                       if (*fmt == '%')        /* Escaped %% */
+                               break;
+                       fmt = parse_c_type(fmt, &c_size, &c_type, NULL);
+                       /*
+                        * Fall back to the c type if no trace type has
+                        * been specified.
+                        */
+                       if (!trace_size)
+                               trace_size = c_size;
+                       if (trace_type == LTT_TYPE_NONE)
+                               trace_type = c_type;
+                       if (c_type == LTT_TYPE_STRING)
+                               trace_type = LTT_TYPE_STRING;
+
+                       largest_align = max_t(int, largest_align, trace_size);
+                       if (largest_align >= ltt_get_alignment())
+                               goto exit;
+
+                       trace_size = 0;
+                       c_size = 0;
+                       trace_type = LTT_TYPE_NONE;
+                       c_type = LTT_TYPE_NONE;
+                       break;
+               }
+       }
+
+exit:
+       largest_align = min_t(int, largest_align, ltt_get_alignment());
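+       /*
+        * e.g. largest_align == 4 with align_drift == 2: (4 - 2) & 3 == 2
+        * bytes of padding bring the payload back onto a 4-byte boundary.
+        */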
+       return (largest_align - align_drift) & (largest_align - 1);
+}
+EXPORT_SYMBOL_GPL(ltt_fmt_largest_align);
+
+#endif
+
+/*
+ * Calculate data size
+ * Assume that the padding for alignment starts at a sizeof(void *) address.
+ */
+static notrace
+size_t ltt_get_data_size(struct ltt_serialize_closure *closure,
+                        void *serialize_private, unsigned int stack_pos_ctx,
+                        int *largest_align, const char *fmt, va_list *args)
+{
+       ltt_serialize_cb cb = closure->callbacks[0];
+       closure->cb_idx = 0;
+       return (size_t)cb(NULL, 0, closure, serialize_private, stack_pos_ctx,
+                         largest_align, fmt, args);
+}
+
+static notrace
+void ltt_write_event_data(struct ltt_chanbuf *buf, size_t buf_offset,
+                         struct ltt_serialize_closure *closure,
+                         void *serialize_private, unsigned int stack_pos_ctx,
+                         int largest_align, const char *fmt, va_list *args)
+{
+       ltt_serialize_cb cb = closure->callbacks[0];
+       closure->cb_idx = 0;
+       buf_offset += ltt_align(buf_offset, largest_align);
+       cb(buf, buf_offset, closure, serialize_private, stack_pos_ctx, NULL,
+          fmt, args);
+}
+
+notrace
+void ltt_vtrace(const struct marker *mdata, void *probe_data, void *call_data,
+               const char *fmt, va_list *args)
+{
+       int largest_align, ret;
+       struct ltt_active_marker *pdata;
+       uint16_t eID;
+       size_t data_size, slot_size;
+       unsigned int chan_index;
+       struct ltt_chanbuf *buf;
+       struct ltt_chan *chan;
+       struct ltt_trace *trace, *dest_trace = NULL;
+       uint64_t tsc;
+       long buf_offset;
+       va_list args_copy;
+       struct ltt_serialize_closure closure;
+       struct ltt_probe_private_data *private_data = call_data;
+       void *serialize_private = NULL;
+       int cpu;
+       unsigned int rflags;
+       unsigned int stack_pos_ctx;
+
+       /*
+        * This test is useful for quickly exiting static tracing when no trace
+        * is active. We expect to have an active trace when we get here.
+        */
+       if (unlikely(ltt_traces.num_active_traces == 0))
+               return;
+
+       rcu_read_lock_sched_notrace();
+       cpu = smp_processor_id();
+       __get_cpu_var(ltt_nesting)++;
+       stack_pos_ctx = __get_cpu_var(tracer_stack_pos);
+       /*
+        * asm volatile and "memory" clobber prevent the compiler from moving
+        * instructions out of the ltt nesting count. This is required to ensure
+        * that probe side-effects which can cause recursion (e.g. unforeseen
+        * traps, divisions by 0, ...) are triggered within the incremented
+        * nesting count section.
+        */
+       barrier();
+       pdata = (struct ltt_active_marker *)probe_data;
+       eID = mdata->event_id;
+       chan_index = mdata->channel_id;
+       closure.callbacks = pdata->probe->callbacks;
+
+       if (unlikely(private_data)) {
+               dest_trace = private_data->trace;
+               if (private_data->serializer)
+                       closure.callbacks = &private_data->serializer;
+               serialize_private = private_data->serialize_private;
+       }
+
+       va_copy(args_copy, *args);
+       /*
+        * Assumes event payload to start on largest_align alignment.
+        */
+       largest_align = 1;      /* must be non-zero for ltt_align */
+       data_size = ltt_get_data_size(&closure, serialize_private,
+                                     stack_pos_ctx, &largest_align,
+                                     fmt, &args_copy);
+       largest_align = min_t(int, largest_align, sizeof(void *));
+       va_end(args_copy);
+
+       /* Iterate on each trace */
+       list_for_each_entry_rcu(trace, &ltt_traces.head, list) {
+               /*
+                * Expect the filter to filter out events. If we get here,
+                * we went through tracepoint activation as a first step.
+                */
+               if (unlikely(dest_trace && trace != dest_trace))
+                       continue;
+               if (unlikely(!trace->active))
+                       continue;
+               if (unlikely(!ltt_run_filter(trace, eID)))
+                       continue;
+#ifdef CONFIG_LTT_DEBUG_EVENT_SIZE
+               rflags = LTT_RFLAG_ID_SIZE;
+#else
+               if (unlikely(eID >= LTT_FREE_EVENTS))
+                       rflags = LTT_RFLAG_ID;
+               else
+                       rflags = 0;
+#endif
+               /*
+                * Skip channels added after trace creation.
+                */
+               if (unlikely(chan_index >= trace->nr_channels))
+                       continue;
+               chan = &trace->channels[chan_index];
+               if (!chan->active)
+                       continue;
+
+               /* reserve space : header and data */
+               ret = ltt_reserve_slot(chan, trace, data_size, largest_align,
+                                      cpu, &buf, &slot_size, &buf_offset,
+                                      &tsc, &rflags);
+               if (unlikely(ret < 0))
+                       continue; /* buffer full */
+
+               va_copy(args_copy, *args);
+               /* Out-of-order write : header and data */
+               buf_offset = ltt_write_event_header(&buf->a, &chan->a,
+                                                   buf_offset, eID, data_size,
+                                                   tsc, rflags);
+               ltt_write_event_data(buf, buf_offset, &closure,
+                                    serialize_private, stack_pos_ctx,
+                                    largest_align, fmt, &args_copy);
+               va_end(args_copy);
+               /* Out-of-order commit */
+               ltt_commit_slot(buf, chan, buf_offset, data_size, slot_size);
+       }
+       /*
+        * asm volatile and "memory" clobber prevent the compiler from moving
+        * instructions out of the ltt nesting count. This is required to ensure
+        * that probe side-effects which can cause recursion (e.g. unforeseen
+        * traps, divisions by 0, ...) are triggered within the incremented
+        * nesting count section.
+        */
+       barrier();
+       __get_cpu_var(tracer_stack_pos) = stack_pos_ctx;
+       __get_cpu_var(ltt_nesting)--;
+       rcu_read_unlock_sched_notrace();
+}
+EXPORT_SYMBOL_GPL(ltt_vtrace);
+
+notrace
+void ltt_trace(const struct marker *mdata, void *probe_data, void *call_data,
+              const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       ltt_vtrace(mdata, probe_data, call_data, fmt, &args);
+       va_end(args);
+}
+EXPORT_SYMBOL_GPL(ltt_trace);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Serializer");
diff --git a/ltt-statedump.c b/ltt-statedump.c
new file mode 100644 (file)
index 0000000..06ade69
--- /dev/null
@@ -0,0 +1,441 @@
+/*
+ * Linux Trace Toolkit Kernel State Dump
+ *
+ * Copyright 2005 -
+ * Jean-Hugues Deschenes <jean-hugues.deschenes@polymtl.ca>
+ *
+ * Changes:
+ *     Eric Clement:                   Add listing of network IP interface
+ *     2006, 2007 Mathieu Desnoyers    Fix kernel threads
+ *                                     Various updates
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/inet.h>
+#include <linux/ip.h>
+#include <linux/kthread.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/interrupt.h>
+#include <linux/irqnr.h>
+#include <linux/cpu.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/marker.h>
+#include <linux/fdtable.h>
+#include <linux/swap.h>
+#include <linux/wait.h>
+#include <linux/mutex.h>
+
+#include "ltt-tracer.h"
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+#include <linux/irq.h>
+#endif
+
+#define NB_PROC_CHUNK 20
+
+/*
+ * Protected by the trace lock.
+ */
+static struct delayed_work cpu_work[NR_CPUS];
+static DECLARE_WAIT_QUEUE_HEAD(statedump_wq);
+static atomic_t kernel_threads_to_run;
+
+static void empty_cb(void *call_data)
+{
+}
+
+static DEFINE_MUTEX(statedump_cb_mutex);
+static void (*ltt_dump_kprobes_table_cb)(void *call_data) = empty_cb;
+
+enum lttng_thread_type {
+       LTTNG_USER_THREAD = 0,
+       LTTNG_KERNEL_THREAD = 1,
+};
+
+enum lttng_execution_mode {
+       LTTNG_USER_MODE = 0,
+       LTTNG_SYSCALL = 1,
+       LTTNG_TRAP = 2,
+       LTTNG_IRQ = 3,
+       LTTNG_SOFTIRQ = 4,
+       LTTNG_MODE_UNKNOWN = 5,
+};
+
+enum lttng_execution_submode {
+       LTTNG_NONE = 0,
+       LTTNG_UNKNOWN = 1,
+};
+
+enum lttng_process_status {
+       LTTNG_UNNAMED = 0,
+       LTTNG_WAIT_FORK = 1,
+       LTTNG_WAIT_CPU = 2,
+       LTTNG_EXIT = 3,
+       LTTNG_ZOMBIE = 4,
+       LTTNG_WAIT = 5,
+       LTTNG_RUN = 6,
+       LTTNG_DEAD = 7,
+};
+
+#ifdef CONFIG_INET
+static void ltt_enumerate_device(struct ltt_probe_private_data *call_data,
+                                struct net_device *dev)
+{
+       struct in_device *in_dev;
+       struct in_ifaddr *ifa;
+
+       if (dev->flags & IFF_UP) {
+               in_dev = in_dev_get(dev);
+               if (in_dev) {
+                       for (ifa = in_dev->ifa_list; ifa != NULL;
+                            ifa = ifa->ifa_next)
+                               __trace_mark(0, netif_state,
+                                            network_ipv4_interface,
+                                            call_data,
+                                            "name %s address #n4u%lu up %d",
+                                            dev->name,
+                                            (unsigned long)ifa->ifa_address,
+                                            0);
+                       in_dev_put(in_dev);
+               }
+       } else
+               __trace_mark(0, netif_state, network_ip_interface,
+                            call_data, "name %s address #n4u%lu up %d",
+                            dev->name, 0UL, 0);
+}
+
+static inline int
+ltt_enumerate_network_ip_interface(struct ltt_probe_private_data *call_data)
+{
+       struct net_device *dev;
+
+       read_lock(&dev_base_lock);
+       for_each_netdev(&init_net, dev)
+               ltt_enumerate_device(call_data, dev);
+       read_unlock(&dev_base_lock);
+
+       return 0;
+}
+#else /* CONFIG_INET */
+static inline int
+ltt_enumerate_network_ip_interface(struct ltt_probe_private_data *call_data)
+{
+       return 0;
+}
+#endif /* CONFIG_INET */
+
+static inline void
+ltt_enumerate_task_fd(struct ltt_probe_private_data *call_data,
+                     struct task_struct *t, char *tmp)
+{
+       struct fdtable *fdt;
+       struct file *filp;
+       unsigned int i;
+       const unsigned char *path;
+
+       if (!t->files)
+               return;
+
+       spin_lock(&t->files->file_lock);
+       fdt = files_fdtable(t->files);
+       for (i = 0; i < fdt->max_fds; i++) {
+               filp = fcheck_files(t->files, i);
+               if (!filp)
+                       continue;
+               path = d_path(&filp->f_path, tmp, PAGE_SIZE);
+               /* Make sure we give at least some info */
+               __trace_mark(0, fd_state, file_descriptor, call_data,
+                            "filename %s pid %d fd %u",
+                            IS_ERR(path) ? filp->f_dentry->d_name.name : path,
+                            t->pid, i);
+       }
+       spin_unlock(&t->files->file_lock);
+}
+
+static inline int
+ltt_enumerate_file_descriptors(struct ltt_probe_private_data *call_data)
+{
+       struct task_struct *t = &init_task;
+       char *tmp = (char *)__get_free_page(GFP_KERNEL);
+
+       /* Enumerate active file descriptors */
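+       /*
+        * Hold a reference on the current task so it cannot disappear
+        * while tasklist_lock is released between iterations.
+        */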
+       do {
+               read_lock(&tasklist_lock);
+               if (t != &init_task)
+                       atomic_dec(&t->usage);
+               t = next_task(t);
+               atomic_inc(&t->usage);
+               read_unlock(&tasklist_lock);
+               task_lock(t);
+               ltt_enumerate_task_fd(call_data, t, tmp);
+               task_unlock(t);
+       } while (t != &init_task);
+       free_page((unsigned long)tmp);
+       return 0;
+}
+
+static inline void
+ltt_enumerate_task_vm_maps(struct ltt_probe_private_data *call_data,
+               struct task_struct *t)
+{
+       struct mm_struct *mm;
+       struct vm_area_struct *map;
+       unsigned long ino;
+
+       /* get_task_mm does a task_lock... */
+       mm = get_task_mm(t);
+       if (!mm)
+               return;
+
+       map = mm->mmap;
+       if (map) {
+               down_read(&mm->mmap_sem);
+               while (map) {
+                       if (map->vm_file)
+                               ino = map->vm_file->f_dentry->d_inode->i_ino;
+                       else
+                               ino = 0;
+                       __trace_mark(0, vm_state, vm_map, call_data,
+                                    "pid %d start %lu end %lu flags %lu "
+                                    "pgoff %lu inode %lu",
+                                    t->pid, map->vm_start, map->vm_end,
+                                    map->vm_flags, map->vm_pgoff << PAGE_SHIFT,
+                                    ino);
+                       map = map->vm_next;
+               }
+               up_read(&mm->mmap_sem);
+       }
+       mmput(mm);
+}
+
+static inline int
+ltt_enumerate_vm_maps(struct ltt_probe_private_data *call_data)
+{
+       struct task_struct *t = &init_task;
+
+       do {
+               read_lock(&tasklist_lock);
+               if (t != &init_task)
+                       atomic_dec(&t->usage);
+               t = next_task(t);
+               atomic_inc(&t->usage);
+               read_unlock(&tasklist_lock);
+               ltt_enumerate_task_vm_maps(call_data, t);
+       } while (t != &init_task);
+       return 0;
+}
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+static inline void list_interrupts(struct ltt_probe_private_data *call_data)
+{
+       unsigned int irq;
+       unsigned long flags = 0;
+       struct irq_desc *desc;
+
+       /* needs irq_desc */
+       for_each_irq_desc(irq, desc) {
+               struct irqaction *action;
+               const char *irq_chip_name =
+                       desc->chip->name ? : "unnamed_irq_chip";
+
+               local_irq_save(flags);
+               raw_spin_lock(&desc->lock);
+               for (action = desc->action; action; action = action->next)
+                       __trace_mark(0, irq_state, interrupt, call_data,
+                                    "name %s action %s irq_id %u",
+                                    irq_chip_name, action->name, irq);
+               raw_spin_unlock(&desc->lock);
+               local_irq_restore(flags);
+       }
+}
+#else
+static inline void list_interrupts(struct ltt_probe_private_data *call_data)
+{
+}
+#endif
+
+static inline int
+ltt_enumerate_process_states(struct ltt_probe_private_data *call_data)
+{
+       struct task_struct *t = &init_task;
+       struct task_struct *p = t;
+       enum lttng_process_status status;
+       enum lttng_thread_type type;
+       enum lttng_execution_mode mode;
+       enum lttng_execution_submode submode;
+
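+       /*
+        * Iterate over every thread of every process, keeping a reference
+        * on the current task across tasklist_lock releases.
+        */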
+       do {
+               mode = LTTNG_MODE_UNKNOWN;
+               submode = LTTNG_UNKNOWN;
+
+               read_lock(&tasklist_lock);
+               if (t != &init_task) {
+                       atomic_dec(&t->usage);
+                       t = next_thread(t);
+               }
+               if (t == p) {
+                       p = next_task(t);
+                       t = p;
+               }
+               atomic_inc(&t->usage);
+               read_unlock(&tasklist_lock);
+
+               task_lock(t);
+
+               if (t->exit_state == EXIT_ZOMBIE)
+                       status = LTTNG_ZOMBIE;
+               else if (t->exit_state == EXIT_DEAD)
+                       status = LTTNG_DEAD;
+               else if (t->state == TASK_RUNNING) {
+                       /* Is this a forked child that has not run yet? */
+                       if (list_empty(&t->rt.run_list))
+                               status = LTTNG_WAIT_FORK;
+                       else
+                               /*
+                                * All tasks are considered as wait_cpu;
+                                * the viewer will sort out if the task was
+                                * really running at this time.
+                                */
+                               status = LTTNG_WAIT_CPU;
+               } else if (t->state &
+                       (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)) {
+                       /* Task is waiting for something to complete */
+                       status = LTTNG_WAIT;
+               } else
+                       status = LTTNG_UNNAMED;
+               submode = LTTNG_NONE;
+
+               /*
+                * Checking t->mm filters out kernel threads; the viewer
+                * will further determine whether a user-space thread was in
+                * syscall mode or not.
+                */
+               if (t->mm)
+                       type = LTTNG_USER_THREAD;
+               else
+                       type = LTTNG_KERNEL_THREAD;
+
+               __trace_mark(0, task_state, process_state, call_data,
+                            "pid %d parent_pid %d name %s type %d mode %d "
+                            "submode %d status %d tgid %d",
+                            t->pid, t->parent->pid, t->comm,
+                            type, mode, submode, status, t->tgid);
+               task_unlock(t);
+       } while (t != &init_task);
+
+       return 0;
+}
+
+void ltt_statedump_register_kprobes_dump(void (*callback)(void *call_data))
+{
+       mutex_lock(&statedump_cb_mutex);
+       ltt_dump_kprobes_table_cb = callback;
+       mutex_unlock(&statedump_cb_mutex);
+}
+EXPORT_SYMBOL_GPL(ltt_statedump_register_kprobes_dump);
+
+void ltt_statedump_unregister_kprobes_dump(void (*callback)(void *call_data))
+{
+       mutex_lock(&statedump_cb_mutex);
+       ltt_dump_kprobes_table_cb = empty_cb;
+       mutex_unlock(&statedump_cb_mutex);
+}
+EXPORT_SYMBOL_GPL(ltt_statedump_unregister_kprobes_dump);
+
+void ltt_statedump_work_func(struct work_struct *work)
+{
+       if (atomic_dec_and_test(&kernel_threads_to_run))
+               /* If we are the last thread, wake up do_ltt_statedump */
+               wake_up(&statedump_wq);
+}
+
+static int do_ltt_statedump(struct ltt_probe_private_data *call_data)
+{
+       int cpu;
+       struct module *cb_owner;
+
+       printk(KERN_DEBUG "LTT state dump thread start\n");
+       ltt_enumerate_process_states(call_data);
+       ltt_enumerate_file_descriptors(call_data);
+       list_modules(call_data);
+       ltt_enumerate_vm_maps(call_data);
+       list_interrupts(call_data);
+       ltt_enumerate_network_ip_interface(call_data);
+       ltt_dump_swap_files(call_data);
+       ltt_dump_sys_call_table(call_data);
+       ltt_dump_softirq_vec(call_data);
+       ltt_dump_idt_table(call_data);
+
+       mutex_lock(&statedump_cb_mutex);
+
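+       /*
+        * Pin the module providing the kprobes dump callback so it cannot
+        * be unloaded while we call it.
+        */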
+       cb_owner = __module_address((unsigned long)ltt_dump_kprobes_table_cb);
+       __module_get(cb_owner);
+       ltt_dump_kprobes_table_cb(call_data);
+       module_put(cb_owner);
+
+       mutex_unlock(&statedump_cb_mutex);
+
+       /*
+        * Fire off a work queue on each CPU. Their sole purpose in life
+        * is to guarantee that each CPU has been in a state where it was in
+        * syscall mode (i.e. not in a trap, an IRQ or a soft IRQ).
+        */
+       get_online_cpus();
+       atomic_set(&kernel_threads_to_run, num_online_cpus());
+       for_each_online_cpu(cpu) {
+               INIT_DELAYED_WORK(&cpu_work[cpu], ltt_statedump_work_func);
+               schedule_delayed_work_on(cpu, &cpu_work[cpu], 0);
+       }
+       /* Wait for all threads to run */
+       __wait_event(statedump_wq, (atomic_read(&kernel_threads_to_run) == 0));
+       put_online_cpus();
+       /* Our work is done */
+       printk(KERN_DEBUG "LTT state dump end\n");
+       __trace_mark(0, global_state, statedump_end,
+                    call_data, MARK_NOARGS);
+       return 0;
+}
+
+/*
+ * Called with trace lock held.
+ */
+int ltt_statedump_start(struct ltt_trace *trace)
+{
+       struct ltt_probe_private_data call_data;
+       printk(KERN_DEBUG "LTT state dump begin\n");
+
+       call_data.trace = trace;
+       call_data.serializer = NULL;
+       return do_ltt_statedump(&call_data);
+}
+
+static int __init statedump_init(void)
+{
+       int ret;
+       printk(KERN_DEBUG "LTT : State dump init\n");
+       ret = ltt_module_register(LTT_FUNCTION_STATEDUMP,
+                       ltt_statedump_start, THIS_MODULE);
+       return ret;
+}
+
+static void __exit statedump_exit(void)
+{
+       printk(KERN_DEBUG "LTT : State dump exit\n");
+       ltt_module_unregister(LTT_FUNCTION_STATEDUMP);
+}
+
+module_init(statedump_init)
+module_exit(statedump_exit)
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Jean-Hugues Deschenes");
+MODULE_DESCRIPTION("Linux Trace Toolkit Statedump");
diff --git a/ltt-trace-control.c b/ltt-trace-control.c
new file mode 100644 (file)
index 0000000..9d6d239
--- /dev/null
@@ -0,0 +1,1426 @@
+/*
+ * LTT trace control module over debugfs.
+ *
+ * Copyright 2008 - Zhaolei <zhaolei@cn.fujitsu.com>
+ *
+ * Copyright 2009 - Gui Jianfeng <guijianfeng@cn.fujitsu.com>
+ *                  Make mark-control work in debugfs
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+/*
+ * Todo:
+ *   Implement read operations for the control files, to read back attributes
+ *   Create a README file in the ltt control dir to display help info
+ */
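+
+/*
+ * Typical usage from user space (assuming debugfs is mounted on
+ * /sys/kernel/debug; the trace name and the "relay" transport name are
+ * only examples):
+ *
+ *   echo trace1 > /sys/kernel/debug/ltt/setup_trace
+ *   echo relay > /sys/kernel/debug/ltt/control/trace1/trans
+ *   echo 1 > /sys/kernel/debug/ltt/control/trace1/alloc
+ *   echo 1 > /sys/kernel/debug/ltt/control/trace1/enabled
+ *   ...
+ *   echo 0 > /sys/kernel/debug/ltt/control/trace1/enabled
+ *   echo trace1 > /sys/kernel/debug/ltt/destroy_trace
+ */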
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
+#include <linux/notifier.h>
+#include <linux/jiffies.h>
+#include <linux/marker.h>
+
+#include "ltt-tracer.h"
+
+#define LTT_CONTROL_DIR "control"
+#define MARKERS_CONTROL_DIR "markers"
+#define LTT_SETUP_TRACE_FILE "setup_trace"
+#define LTT_DESTROY_TRACE_FILE "destroy_trace"
+
+#define LTT_WRITE_MAXLEN       (128)
+
+struct dentry *ltt_control_dir, *ltt_setup_trace_file, *ltt_destroy_trace_file,
+       *markers_control_dir;
+
+/*
+ * traces_lock nests inside control_lock.
+ * control_lock protects the consistency of the directories presented in
+ * the ltt directory.
+ */
+static DEFINE_MUTEX(control_lock);
+
+/*
+ * Big note about locking for marker control files:
+ * If a marker control file is added/removed manually while racing with module
+ * load/unload, warning messages may appear, but the two operations should be
+ * able to execute concurrently without any lock synchronizing them with
+ * respect to each other.
+ * Taking the marker mutex and the module mutex, and also holding a mutex here
+ * from mkdir/rmdir _and_ from the notifier called on module load/unload, would
+ * make life miserable and just ask for deadlocks.
+ */
+
+/*
+ * Look up a file/dir in a parent dir.
+ * Only designed to work well for debugfs
+ * (although it may be OK for other filesystems).
+ *
+ * Return:
+ *     the file/dir's dentry on success
+ *     NULL on failure
+ */
+static struct dentry *dir_lookup(struct dentry *parent, const char *name)
+{
+       struct qstr q;
+       struct dentry *d;
+
+       q.name = name;
+       q.len = strlen(name);
+       q.hash = full_name_hash(q.name, q.len);
+
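+       /*
+        * Drop the reference right away: debugfs pins its dentries until
+        * they are explicitly removed, so returning the bare pointer is
+        * acceptable here.
+        */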
+       d = d_lookup(parent, &q);
+       if (d)
+               dput(d);
+
+       return d;
+}
+
+
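+/*
+ * Handle a write to control/<trace_name>/alloc: writing 'Y', 'y' or '1'
+ * allocates the trace buffers.
+ */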
+static ssize_t alloc_write(struct file *file, const char __user *user_buf,
+                          size_t count, loff_t *ppos)
+{
+       int err = 0;
+       int buf_size;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+       char *cmd = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", cmd) != 1) {
+               err = -EPERM;
+               goto err_get_cmd;
+       }
+
+       if ((cmd[0] != 'Y' && cmd[0] != 'y' && cmd[0] != '1') || cmd[1]) {
+               err = -EPERM;
+               goto err_bad_cmd;
+       }
+
+       err = ltt_trace_alloc(file->f_dentry->d_parent->d_name.name);
+       if (IS_ERR_VALUE(err)) {
+               printk(KERN_ERR "alloc_write: ltt_trace_alloc failed: %d\n",
+                       err);
+               goto err_alloc_trace;
+       }
+
+       free_page((unsigned long)buf);
+       free_page((unsigned long)cmd);
+       return count;
+
+err_alloc_trace:
+err_bad_cmd:
+err_get_cmd:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       free_page((unsigned long)cmd);
+       return err;
+}
+
+static const struct file_operations ltt_alloc_operations = {
+       .write = alloc_write,
+};
+
+
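+/*
+ * Handle a write to control/<trace_name>/enabled: 'Y'/'y'/'1' starts the
+ * trace, 'N'/'n'/'0' stops it.
+ */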
+static ssize_t enabled_write(struct file *file, const char __user *user_buf,
+                            size_t count, loff_t *ppos)
+{
+       int err = 0;
+       int buf_size;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+       char *cmd = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", cmd) != 1) {
+               err = -EPERM;
+               goto err_get_cmd;
+       }
+
+       if (cmd[1]) {
+               err = -EPERM;
+               goto err_bad_cmd;
+       }
+
+       switch (cmd[0]) {
+       case 'Y':
+       case 'y':
+       case '1':
+               err = ltt_trace_start(file->f_dentry->d_parent->d_name.name);
+               if (IS_ERR_VALUE(err)) {
+                       printk(KERN_ERR
+                              "enabled_write: ltt_trace_start failed: %d\n",
+                              err);
+                       err = -EPERM;
+                       goto err_start_trace;
+               }
+               break;
+       case 'N':
+       case 'n':
+       case '0':
+               err = ltt_trace_stop(file->f_dentry->d_parent->d_name.name);
+               if (IS_ERR_VALUE(err)) {
+                       printk(KERN_ERR
+                              "enabled_write: ltt_trace_stop failed: %d\n",
+                              err);
+                       err = -EPERM;
+                       goto err_stop_trace;
+               }
+               break;
+       default:
+               err = -EPERM;
+               goto err_bad_cmd;
+       }
+
+       free_page((unsigned long)buf);
+       free_page((unsigned long)cmd);
+       return count;
+
+err_stop_trace:
+err_start_trace:
+err_bad_cmd:
+err_get_cmd:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       free_page((unsigned long)cmd);
+       return err;
+}
+
+static const struct file_operations ltt_enabled_operations = {
+       .write = enabled_write,
+};
+
+
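+/*
+ * Handle a write to control/<trace_name>/trans: set the transport used by
+ * the trace to the given registered transport name.
+ */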
+static ssize_t trans_write(struct file *file, const char __user *user_buf,
+                          size_t count, loff_t *ppos)
+{
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+       char *trans_name = (char *)__get_free_page(GFP_KERNEL);
+       int err = 0;
+       int buf_size;
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", trans_name) != 1) {
+               err = -EPERM;
+               goto err_get_transname;
+       }
+
+       err = ltt_trace_set_type(file->f_dentry->d_parent->d_name.name,
+                                trans_name);
+       if (IS_ERR_VALUE(err)) {
+               printk(KERN_ERR "trans_write: ltt_trace_set_type failed: %d\n",
+                      err);
+               goto err_set_trans;
+       }
+
+       free_page((unsigned long)buf);
+       free_page((unsigned long)trans_name);
+       return count;
+
+err_set_trans:
+err_get_transname:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       free_page((unsigned long)trans_name);
+       return err;
+}
+
+static const struct file_operations ltt_trans_operations = {
+       .write = trans_write,
+};
+
+
+static ssize_t channel_subbuf_num_write(struct file *file,
+               const char __user *user_buf, size_t count, loff_t *ppos)
+{
+       int err = 0;
+       int buf_size;
+       unsigned int num;
+       const char *channel_name;
+       const char *trace_name;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%u", &num) != 1) {
+               err = -EPERM;
+               goto err_get_number;
+       }
+
+       channel_name = file->f_dentry->d_parent->d_name.name;
+       trace_name = file->f_dentry->d_parent->d_parent->d_parent->d_name.name;
+
+       err = ltt_trace_set_channel_subbufcount(trace_name, channel_name, num);
+       if (IS_ERR_VALUE(err)) {
+               printk(KERN_ERR "channel_subbuf_num_write: "
+                      "ltt_trace_set_channel_subbufcount failed: %d\n", err);
+               goto err_set_subbufcount;
+       }
+
+       free_page((unsigned long)buf);
+       return count;
+
+err_set_subbufcount:
+err_get_number:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       return err;
+}
+
+static const struct file_operations ltt_channel_subbuf_num_operations = {
+       .write = channel_subbuf_num_write,
+};
+
+
+static
+ssize_t channel_subbuf_size_write(struct file *file,
+                                 const char __user *user_buf,
+                                 size_t count, loff_t *ppos)
+{
+       int err = 0;
+       int buf_size;
+       unsigned int num;
+       const char *channel_name;
+       const char *trace_name;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%u", &num) != 1) {
+               err = -EPERM;
+               goto err_get_number;
+       }
+
+       channel_name = file->f_dentry->d_parent->d_name.name;
+       trace_name = file->f_dentry->d_parent->d_parent->d_parent->d_name.name;
+
+       err = ltt_trace_set_channel_subbufsize(trace_name, channel_name, num);
+       if (IS_ERR_VALUE(err)) {
+               printk(KERN_ERR "channel_subbuf_size_write: "
+                      "ltt_trace_set_channel_subbufsize failed: %d\n", err);
+               goto err_set_subbufsize;
+       }
+
+       free_page((unsigned long)buf);
+       return count;
+
+err_set_subbufsize:
+err_get_number:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       return err;
+}
+
+static const struct file_operations ltt_channel_subbuf_size_operations = {
+       .write = channel_subbuf_size_write,
+};
+
+static
+ssize_t channel_switch_timer_write(struct file *file,
+                                  const char __user *user_buf,
+                                  size_t count, loff_t *ppos)
+{
+       int err = 0;
+       int buf_size;
+       unsigned long num;
+       const char *channel_name;
+       const char *trace_name;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%lu", &num) != 1) {
+               err = -EPERM;
+               goto err_get_number;
+       }
+
+       channel_name = file->f_dentry->d_parent->d_name.name;
+       trace_name = file->f_dentry->d_parent->d_parent->d_parent->d_name.name;
+
+       /* Convert from ms to jiffies */
+       num = msecs_to_jiffies(num);
+
+       err = ltt_trace_set_channel_switch_timer(trace_name, channel_name, num);
+       if (IS_ERR_VALUE(err)) {
+               printk(KERN_ERR "channel_switch_timer_write: "
+                      "ltt_trace_set_channel_switch_timer failed: %d\n", err);
+               goto err_set_switch_timer;
+       }
+
+       free_page((unsigned long)buf);
+       return count;
+
+err_set_switch_timer:
+err_get_number:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       return err;
+}
+
+static const struct file_operations ltt_channel_switch_timer_operations = {
+       .write = channel_switch_timer_write,
+};
+
+static
+ssize_t channel_overwrite_write(struct file *file,
+                               const char __user *user_buf, size_t count,
+                               loff_t *ppos)
+{
+       int err = 0;
+       int buf_size;
+       const char *channel_name;
+       const char *trace_name;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+       char *cmd = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", cmd) != 1) {
+               err = -EPERM;
+               goto err_get_cmd;
+       }
+
+       if (cmd[1]) {
+               err = -EPERM;
+               goto err_bad_cmd;
+       }
+
+       channel_name = file->f_dentry->d_parent->d_name.name;
+       trace_name = file->f_dentry->d_parent->d_parent->d_parent->d_name.name;
+
+       switch (cmd[0]) {
+       case 'Y':
+       case 'y':
+       case '1':
+               err = ltt_trace_set_channel_overwrite(trace_name, channel_name,
+                                                     1);
+               if (IS_ERR_VALUE(err)) {
+                       printk(KERN_ERR "channel_overwrite_write: "
+                              "ltt_trace_set_channel_overwrite failed: %d\n",
+                              err);
+                       goto err_set_subbufsize;
+               }
+               break;
+       case 'N':
+       case 'n':
+       case '0':
+               err = ltt_trace_set_channel_overwrite(trace_name, channel_name,
+                                                     0);
+               if (IS_ERR_VALUE(err)) {
+                       printk(KERN_ERR "channel_overwrite_write: "
+                              "ltt_trace_set_channel_overwrite failed: %d\n",
+                              err);
+                       goto err_set_subbufsize;
+               }
+               break;
+       default:
+               err = -EPERM;
+               goto err_bad_cmd;
+       }
+
+       free_page((unsigned long)buf);
+       free_page((unsigned long)cmd);
+       return count;
+
+err_set_subbufsize:
+err_bad_cmd:
+err_get_cmd:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       free_page((unsigned long)cmd);
+       return err;
+}
+
+static const struct file_operations ltt_channel_overwrite_operations = {
+       .write = channel_overwrite_write,
+};
+
+
+static
+ssize_t channel_enable_write(struct file *file,
+                            const char __user *user_buf, size_t count,
+                            loff_t *ppos)
+{
+       int err = 0;
+       int buf_size;
+       const char *channel_name;
+       const char *trace_name;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+       char *cmd = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", cmd) != 1) {
+               err = -EPERM;
+               goto err_get_cmd;
+       }
+
+       if (cmd[1]) {
+               err = -EPERM;
+               goto err_bad_cmd;
+       }
+
+       channel_name = file->f_dentry->d_parent->d_name.name;
+       trace_name = file->f_dentry->d_parent->d_parent->d_parent->d_name.name;
+
+       switch (cmd[0]) {
+       case 'Y':
+       case 'y':
+       case '1':
+               err = ltt_trace_set_channel_enable(trace_name, channel_name,
+                                                  1);
+               if (IS_ERR_VALUE(err)) {
+                       printk(KERN_ERR "channel_enable_write: "
+                              "ltt_trace_set_channel_enable failed: %d\n",
+                              err);
+                       goto err_set_subbufsize;
+               }
+               break;
+       case 'N':
+       case 'n':
+       case '0':
+               err = ltt_trace_set_channel_enable(trace_name, channel_name,
+                                                  0);
+               if (IS_ERR_VALUE(err)) {
+                       printk(KERN_ERR "channel_enable_write: "
+                              "ltt_trace_set_channel_enable failed: %d\n",
+                              err);
+                       goto err_set_subbufsize;
+               }
+               break;
+       default:
+               err = -EPERM;
+               goto err_bad_cmd;
+       }
+
+       free_page((unsigned long)buf);
+       free_page((unsigned long)cmd);
+       return count;
+
+err_set_subbufsize:
+err_bad_cmd:
+err_get_cmd:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       free_page((unsigned long)cmd);
+       return err;
+}
+
+static const struct file_operations ltt_channel_enable_operations = {
+       .write = channel_enable_write,
+};
+
+
+static int _create_trace_control_dir(const char *trace_name,
+                                    struct ltt_trace *trace)
+{
+       int err;
+       struct dentry *trace_root, *channel_root;
+       struct dentry *tmp_den;
+       int i;
+
+       /* debugfs/control/trace_name */
+       trace_root = debugfs_create_dir(trace_name, ltt_control_dir);
+       if (IS_ERR(trace_root) || !trace_root) {
+               printk(KERN_ERR "_create_trace_control_dir: "
+                      "create control root dir of %s failed\n", trace_name);
+               err = -ENOMEM;
+               goto err_create_trace_root;
+       }
+
+       /* debugfs/control/trace_name/alloc */
+       tmp_den = debugfs_create_file("alloc", S_IWUSR, trace_root, NULL,
+                                     &ltt_alloc_operations);
+       if (IS_ERR(tmp_den) || !tmp_den) {
+               printk(KERN_ERR "_create_trace_control_dir: "
+                      "create file of alloc failed\n");
+               err = -ENOMEM;
+               goto err_create_subdir;
+       }
+
+       /* debugfs/control/trace_name/trans */
+       tmp_den = debugfs_create_file("trans", S_IWUSR, trace_root, NULL,
+                                     &ltt_trans_operations);
+       if (IS_ERR(tmp_den) || !tmp_den) {
+               printk(KERN_ERR "_create_trace_control_dir: "
+                      "create file of trans failed\n");
+               err = -ENOMEM;
+               goto err_create_subdir;
+       }
+
+       /* debugfs/control/trace_name/enabled */
+       tmp_den = debugfs_create_file("enabled", S_IWUSR, trace_root, NULL,
+                                     &ltt_enabled_operations);
+       if (IS_ERR(tmp_den) || !tmp_den) {
+               printk(KERN_ERR "_create_trace_control_dir: "
+                      "create file of enabled failed\n");
+               err = -ENOMEM;
+               goto err_create_subdir;
+       }
+
+       /* debugfs/control/trace_name/channel/ */
+       channel_root = debugfs_create_dir("channel", trace_root);
+       if (IS_ERR(channel_root) || !channel_root) {
+               printk(KERN_ERR "_create_trace_control_dir: "
+                      "create dir of channel failed\n");
+               err = -ENOMEM;
+               goto err_create_subdir;
+       }
+
+       /*
+        * Create the dirs and files in debugfs/ltt/control/trace_name/channel/.
+        * The following entries will be created (names in <> are placeholders):
+        * `-- <control>
+        *     `-- <trace_name>
+        *         `-- <channel>
+        *             |-- <channel_name>
+        *             |   |-- enable
+        *             |   |-- overwrite
+        *             |   |-- subbuf_num
+        *             |   |-- subbuf_size
+        *             |   `-- switch_timer
+        *             `-- ...
+        */
+
+       for (i = 0; i < trace->nr_channels; i++) {
+               struct dentry *channel_den;
+               struct ltt_chan *chan;
+
+               chan = &trace->channels[i];
+               if (!chan->active)
+                       continue;
+               channel_den = debugfs_create_dir(chan->a.filename,
+                                                channel_root);
+               if (IS_ERR(channel_den) || !channel_den) {
+                       printk(KERN_ERR "_create_trace_control_dir: "
+                              "create channel dir of %s failed\n",
+                              chan->a.filename);
+                       err = -ENOMEM;
+                       goto err_create_subdir;
+               }
+
+               tmp_den = debugfs_create_file("subbuf_num", S_IWUSR,
+                                             channel_den, NULL,
+                                             &ltt_channel_subbuf_num_operations);
+               if (IS_ERR(tmp_den) || !tmp_den) {
+                       printk(KERN_ERR "_create_trace_control_dir: "
+                              "create subbuf_num in %s failed\n",
+                              chan->a.filename);
+                       err = -ENOMEM;
+                       goto err_create_subdir;
+               }
+
+               tmp_den = debugfs_create_file("subbuf_size", S_IWUSR,
+                                             channel_den, NULL,
+                                             &ltt_channel_subbuf_size_operations);
+               if (IS_ERR(tmp_den) || !tmp_den) {
+                       printk(KERN_ERR "_create_trace_control_dir: "
+                              "create subbuf_size in %s failed\n",
+                              chan->a.filename);
+                       err = -ENOMEM;
+                       goto err_create_subdir;
+               }
+
+               tmp_den = debugfs_create_file("enable", S_IWUSR, channel_den,
+                                             NULL,
+                                             &ltt_channel_enable_operations);
+               if (IS_ERR(tmp_den) || !tmp_den) {
+                       printk(KERN_ERR "_create_trace_control_dir: "
+                              "create enable in %s failed\n",
+                              chan->a.filename);
+                       err = -ENOMEM;
+                       goto err_create_subdir;
+               }
+
+               tmp_den = debugfs_create_file("overwrite", S_IWUSR, channel_den,
+                                             NULL,
+                                             &ltt_channel_overwrite_operations);
+               if (IS_ERR(tmp_den) || !tmp_den) {
+                       printk(KERN_ERR "_create_trace_control_dir: "
+                              "create overwrite in %s failed\n",
+                              chan->a.filename);
+                       err = -ENOMEM;
+                       goto err_create_subdir;
+               }
+
+               tmp_den = debugfs_create_file("switch_timer", S_IWUSR,
+                                             channel_den, NULL,
+                                             &ltt_channel_switch_timer_operations);
+               if (IS_ERR(tmp_den) || !tmp_den) {
+                       printk(KERN_ERR "_create_trace_control_dir: "
+                              "create switch_timer in %s failed\n",
+                              chan->a.filename);
+                       err = -ENOMEM;
+                       goto err_create_subdir;
+               }
+       }
+
+       return 0;
+
+err_create_subdir:
+       debugfs_remove_recursive(trace_root);
+err_create_trace_root:
+       return err;
+}
+
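+/*
+ * Handle a write to the setup_trace file: create a trace in the setup
+ * state and populate its control directory.
+ */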
+static
+ssize_t setup_trace_write(struct file *file, const char __user *user_buf,
+                         size_t count, loff_t *ppos)
+{
+       int err = 0;
+       int buf_size;
+       struct ltt_trace *trace;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+       char *trace_name = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", trace_name) != 1) {
+               err = -EPERM;
+               goto err_get_tracename;
+       }
+
+       mutex_lock(&control_lock);
+       ltt_lock_traces();
+
+       err = _ltt_trace_setup(trace_name);
+       if (IS_ERR_VALUE(err)) {
+               printk(KERN_ERR
+                      "setup_trace_write: ltt_trace_setup failed: %d\n", err);
+               goto err_setup_trace;
+       }
+       trace = _ltt_trace_find_setup(trace_name);
+       BUG_ON(!trace);
+       err = _create_trace_control_dir(trace_name, trace);
+       if (IS_ERR_VALUE(err)) {
+               printk(KERN_ERR "setup_trace_write: "
+                      "_create_trace_control_dir failed: %d\n", err);
+               goto err_create_trace_control_dir;
+       }
+
+       ltt_unlock_traces();
+       mutex_unlock(&control_lock);
+
+       free_page((unsigned long)buf);
+       free_page((unsigned long)trace_name);
+       return count;
+
+err_create_trace_control_dir:
+       ltt_trace_destroy(trace_name);
+err_setup_trace:
+       ltt_unlock_traces();
+       mutex_unlock(&control_lock);
+err_get_tracename:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       free_page((unsigned long)trace_name);
+       return err;
+}
+
+static const struct file_operations ltt_setup_trace_operations = {
+       .write = setup_trace_write,
+};
+
+static
+ssize_t destroy_trace_write(struct file *file, const char __user *user_buf,
+                           size_t count, loff_t *ppos)
+{
+       struct dentry *trace_den;
+       int buf_size;
+       int err = 0;
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+       char *trace_name = (char *)__get_free_page(GFP_KERNEL);
+
+       buf_size = min_t(size_t, count, PAGE_SIZE - 1);
+       err = copy_from_user(buf, user_buf, buf_size);
+       if (err)
+               goto err_copy_from_user;
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", trace_name) != 1) {
+               err = -EPERM;
+               goto err_get_tracename;
+       }
+
+       mutex_lock(&control_lock);
+
+       err = ltt_trace_destroy(trace_name);
+       if (IS_ERR_VALUE(err)) {
+               printk(KERN_ERR
+                      "destroy_trace_write: ltt_trace_destroy failed: %d\n",
+                      err);
+               err = -EPERM;
+               goto err_destroy_trace;
+       }
+
+       trace_den = dir_lookup(ltt_control_dir, trace_name);
+       if (!trace_den) {
+               printk(KERN_ERR
+                      "destroy_trace_write: lookup for %s's dentry failed\n",
+                      trace_name);
+               err = -ENOENT;
+               goto err_get_dentry;
+       }
+
+       debugfs_remove_recursive(trace_den);
+
+       mutex_unlock(&control_lock);
+
+       free_page((unsigned long)buf);
+       free_page((unsigned long)trace_name);
+       return count;
+
+err_get_dentry:
+err_destroy_trace:
+       mutex_unlock(&control_lock);
+err_get_tracename:
+err_copy_from_user:
+       free_page((unsigned long)buf);
+       free_page((unsigned long)trace_name);
+       return err;
+}
+
+static const struct file_operations ltt_destroy_trace_operations = {
+       .write = destroy_trace_write,
+};
+
+static void init_marker_dir(struct dentry *dentry,
+                           const struct inode_operations *opt)
+{
+       dentry->d_inode->i_op = opt;
+}
+
+static
+ssize_t marker_enable_read(struct file *filp, char __user *ubuf,
+                          size_t cnt, loff_t *ppos)
+{
+       char *buf;
+       const char *channel, *marker;
+       int len, enabled, present;
+
+       marker = filp->f_dentry->d_parent->d_name.name;
+       channel = filp->f_dentry->d_parent->d_parent->d_name.name;
+
+       len = 0;
+       buf = (char *)__get_free_page(GFP_KERNEL);
+
+       /*
+        * Note: we cannot take the marker lock to make these two checks
+        * atomic, because the marker mutex nests inside the module mutex, taken
+        * inside the marker present check.
+        */
+       enabled = is_marker_enabled(channel, marker);
+       present = is_marker_present(channel, marker);
+
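+       /* 0: disabled, 1: enabled and present, 2: enabled but not present */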
+       if (enabled && present)
+               len = snprintf(buf, PAGE_SIZE, "%d\n", 1);
+       else if (enabled && !present)
+               len = snprintf(buf, PAGE_SIZE, "%d\n", 2);
+       else
+               len = snprintf(buf, PAGE_SIZE, "%d\n", 0);
+
+
+       if (len >= PAGE_SIZE) {
+               len = PAGE_SIZE - 1;
+               buf[len] = '\0';
+       }
+       len = simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+       free_page((unsigned long)buf);
+
+       return len;
+}
+
+static
+ssize_t marker_enable_write(struct file *filp, const char __user *ubuf,
+                           size_t cnt, loff_t *ppos)
+{
+       char *buf = (char *)__get_free_page(GFP_KERNEL);
+       int buf_size;
+       ssize_t ret = 0;
+       const char *channel, *marker;
+
+       marker = filp->f_dentry->d_parent->d_name.name;
+       channel = filp->f_dentry->d_parent->d_parent->d_name.name;
+
+       buf_size = min_t(size_t, cnt, PAGE_SIZE - 1);
+       ret = copy_from_user(buf, ubuf, buf_size);
+       if (ret)
+               goto end;
+
+       buf[buf_size] = 0;
+
+       switch (buf[0]) {
+       case 'Y':
+       case 'y':
+       case '1':
+               ret = ltt_marker_connect(channel, marker, "default");
+               if (ret)
+                       goto end;
+               break;
+       case 'N':
+       case 'n':
+       case '0':
+               ret = ltt_marker_disconnect(channel, marker, "default");
+               if (ret)
+                       goto end;
+               break;
+       default:
+               ret = -EPERM;
+               goto end;
+       }
+       ret = cnt;
+end:
+       free_page((unsigned long)buf);
+       return ret;
+}
+
+static const struct file_operations enable_fops = {
+       .read = marker_enable_read,
+       .write = marker_enable_write,
+};
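+
+/*
+ * Example usage (paths assume debugfs mounted on /sys/kernel/debug; the
+ * channel and marker names are placeholders):
+ *
+ *   echo 1 > /sys/kernel/debug/ltt/markers/<channel>/<marker>/enable
+ *   cat /sys/kernel/debug/ltt/markers/<channel>/<marker>/info
+ *
+ * mkdir in the markers directory pre-creates the control files for a
+ * marker that is not loaded yet.
+ */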
+
+/*
+ * In practice, the output size should never be larger than a page
+ * (4096 bytes). If it ever is, the output will simply be truncated.
+ */
+static
+ssize_t marker_info_read(struct file *filp, char __user *ubuf,
+                        size_t cnt, loff_t *ppos)
+{
+       char *buf;
+       const char *channel, *marker;
+       int len;
+       struct marker_iter iter;
+
+       marker = filp->f_dentry->d_parent->d_name.name;
+       channel = filp->f_dentry->d_parent->d_parent->d_name.name;
+
+       len = 0;
+       buf = (char *)__get_free_page(GFP_KERNEL);
+
+       if (is_marker_enabled(channel, marker) &&
+           !is_marker_present(channel, marker)) {
+               len += snprintf(buf + len, PAGE_SIZE - len,
+                               "Marker Pre-enabled\n");
+               goto out;
+       }
+
+       marker_iter_reset(&iter);
+       marker_iter_start(&iter);
+       for (; iter.marker != NULL; marker_iter_next(&iter)) {
+               if (!strcmp(iter.marker->channel, channel) &&
+                   !strcmp(iter.marker->name, marker))
+                       len += snprintf(buf + len, PAGE_SIZE - len,
+                                      "Location: %s\n"
+                                      "format: \"%s\"\nstate: %d\n"
+                                      "event_id: %hu\n"
+                                      "call: 0x%p\n"
+                                      "probe %s : 0x%p\n\n",
+#ifdef CONFIG_MODULES
+                                      iter.module ? iter.module->name :
+#endif
+                                      "Core Kernel",
+                                      iter.marker->format,
+                                      _imv_read(iter.marker->state),
+                                      iter.marker->event_id,
+                                      iter.marker->call,
+                                      iter.marker->ptype ?
+                                      "multi" : "single", iter.marker->ptype ?
+                                      (void *)iter.marker->multi :
+                                      (void *)iter.marker->single.func);
+               if (len >= PAGE_SIZE)
+                       break;
+       }
+       marker_iter_stop(&iter);
+
+out:
+       if (len >= PAGE_SIZE) {
+               len = PAGE_SIZE - 1;
+               buf[len] = '\0';
+       }
+
+       len = simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+       free_page((unsigned long)buf);
+
+       return len;
+}
+
+static const struct file_operations info_fops = {
+       .read = marker_info_read,
+};
+
+static int marker_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+       struct dentry *marker_d, *enable_d, *info_d, *channel_d;
+       int ret;
+
+       ret = 0;
+       channel_d = (struct dentry *)dir->i_private;
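+       /*
+        * Drop i_mutex: debugfs_create_dir() locks the parent directory's
+        * i_mutex itself, which is exactly the mutex held here.
+        */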
+       mutex_unlock(&dir->i_mutex);
+
+       marker_d = debugfs_create_dir(dentry->d_name.name,
+                                     channel_d);
+       if (IS_ERR(marker_d)) {
+               ret = PTR_ERR(marker_d);
+               goto out;
+       }
+
+       enable_d = debugfs_create_file("enable", 0644, marker_d,
+                                      NULL, &enable_fops);
+       if (IS_ERR(enable_d) || !enable_d) {
+               printk(KERN_ERR
+                      "%s: create file of %s failed\n",
+                      __func__, "enable");
+               ret = -ENOMEM;
+               goto remove_marker_dir;
+       }
+
+       info_d = debugfs_create_file("info", 0644, marker_d,
+                                    NULL, &info_fops);
+       if (IS_ERR(info_d) || !info_d) {
+               printk(KERN_ERR
+                      "%s: create file of %s failed\n",
+                      __func__, "info");
+               ret = -ENOMEM;
+               goto remove_enable_dir;
+       }
+
+       goto out;
+
+remove_enable_dir:
+       debugfs_remove(enable_d);
+remove_marker_dir:
+       debugfs_remove(marker_d);
+out:
+       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       return ret;
+}
+
+static int marker_rmdir(struct inode *dir, struct dentry *dentry)
+{
+       struct dentry *marker_d, *channel_d;
+       const char *channel, *name;
+       int ret, enabled, present;
+
+       ret = 0;
+
+       channel_d = (struct dentry *)dir->i_private;
+       channel = channel_d->d_name.name;
+
+       marker_d = dir_lookup(channel_d, dentry->d_name.name);
+
+       if (!marker_d) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       name = marker_d->d_name.name;
+
+       enabled = is_marker_enabled(channel, name);
+       present = is_marker_present(channel, name);
+
+       if (present || enabled) {
+               ret = -EPERM;
+               goto out;
+       }
+
+       mutex_unlock(&dir->i_mutex);
+       mutex_unlock(&dentry->d_inode->i_mutex);
+       debugfs_remove_recursive(marker_d);
+       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       mutex_lock(&dentry->d_inode->i_mutex);
+out:
+       return ret;
+}
+
+const struct inode_operations channel_dir_opt = {
+       .lookup = simple_lookup,
+       .mkdir = marker_mkdir,
+       .rmdir = marker_rmdir,
+};
+
+static int channel_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+       struct dentry *channel_d;
+       int ret;
+
+       ret = 0;
+       mutex_unlock(&dir->i_mutex);
+
+       channel_d = debugfs_create_dir(dentry->d_name.name,
+                                      markers_control_dir);
+       if (IS_ERR(channel_d)) {
+               ret = PTR_ERR(channel_d);
+               goto out;
+       }
+
+       channel_d->d_inode->i_private = (void *)channel_d;
+       init_marker_dir(channel_d, &channel_dir_opt);
+out:
+       mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+       return ret;
+}
+
+static int channel_rmdir(struct inode *dir, struct dentry *dentry)
+{
+       struct dentry *channel_d;
+       int ret;
+
+       ret = 0;
+
+       channel_d = dir_lookup(markers_control_dir, dentry->d_name.name);
+       if (!channel_d) {
+               ret = -ENOENT;
+               goto out;
+       }
+
+       if (list_empty(&channel_d->d_subdirs)) {
+               mutex_unlock(&dir->i_mutex);
+               mutex_unlock(&dentry->d_inode->i_mutex);
+               debugfs_remove(channel_d);
+               mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+               mutex_lock(&dentry->d_inode->i_mutex);
+       } else
+               ret = -EPERM;
+
+out:
+       return ret;
+}
+
+const struct inode_operations root_dir_opt = {
+       .lookup = simple_lookup,
+       .mkdir = channel_mkdir,
+       .rmdir = channel_rmdir
+};
+
+static int build_marker_file(struct marker *marker)
+{
+       struct dentry *channel_d, *marker_d, *enable_d, *info_d;
+       int err;
+
+       channel_d = dir_lookup(markers_control_dir, marker->channel);
+       if (!channel_d) {
+               channel_d = debugfs_create_dir(marker->channel,
+                                              markers_control_dir);
+               if (IS_ERR(channel_d) || !channel_d) {
+                       printk(KERN_ERR
+                              "%s: build channel dir of %s failed\n",
+                              __func__, marker->channel);
+                       err = -ENOMEM;
+                       goto err_build_fail;
+               }
+               channel_d->d_inode->i_private = (void *)channel_d;
+               init_marker_dir(channel_d, &channel_dir_opt);
+       }
+
+       marker_d  = dir_lookup(channel_d, marker->name);
+       if (!marker_d) {
+               marker_d = debugfs_create_dir(marker->name, channel_d);
+               if (IS_ERR(marker_d) || !marker_d) {
+                       printk(KERN_ERR
+                              "%s: marker dir of %s failed\n",
+                              __func__, marker->name);
+                       err = -ENOMEM;
+                       goto err_build_fail;
+               }
+       }
+
+       enable_d = dir_lookup(marker_d, "enable");
+       if (!enable_d) {
+               enable_d = debugfs_create_file("enable", 0644, marker_d,
+                                               NULL, &enable_fops);
+               if (IS_ERR(enable_d) || !enable_d) {
+                       printk(KERN_ERR
+                              "%s: create file of %s failed\n",
+                              __func__, "enable");
+                       err = -ENOMEM;
+                       goto err_build_fail;
+               }
+       }
+
+       info_d = dir_lookup(marker_d, "info");
+       if (!info_d) {
+               info_d = debugfs_create_file("info", 0444, marker_d,
+                                               NULL, &info_fops);
+               if (IS_ERR(info_d) || !info_d) {
+                       printk(KERN_ERR
+                              "%s: create file of %s failed\n",
+                              __func__, "info");
+                       err = -ENOMEM;
+                       goto err_build_fail;
+               }
+       }
+
+       return 0;
+
+err_build_fail:
+       return err;
+}
+
+static int build_marker_control_files(void)
+{
+       struct marker_iter iter;
+       int err;
+
+       err = 0;
+       if (!markers_control_dir)
+               return -ENOENT;
+
+       marker_iter_reset(&iter);
+       marker_iter_start(&iter);
+       for (; iter.marker != NULL; marker_iter_next(&iter)) {
+               err = build_marker_file(iter.marker);
+               if (err)
+                       goto out;
+       }
+       marker_iter_stop(&iter);
+
+out:
+       return err;
+}
+
+#ifdef CONFIG_MODULES
+static int remove_marker_control_dir(struct module *mod, struct marker *marker)
+{
+       struct dentry *channel_d, *marker_d;
+       const char *channel, *name;
+       int count;
+       struct marker_iter iter;
+
+       count = 0;
+
+       channel_d = dir_lookup(markers_control_dir, marker->channel);
+       if (!channel_d)
+               return -ENOENT;
+       channel = channel_d->d_name.name;
+
+       marker_d = dir_lookup(channel_d, marker->name);
+       if (!marker_d)
+               return -ENOENT;
+       name = marker_d->d_name.name;
+
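+       /*
+        * Only remove the control files if no other module still provides
+        * a marker with the same channel/name pair.
+        */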
+       marker_iter_reset(&iter);
+       marker_iter_start(&iter);
+       for (; iter.marker != NULL; marker_iter_next(&iter)) {
+               if (!strcmp(iter.marker->channel, channel) &&
+                   !strcmp(iter.marker->name, name) && mod != iter.module)
+                       count++;
+       }
+
+       if (count > 0)
+               goto end;
+
+       debugfs_remove_recursive(marker_d);
+       if (list_empty(&channel_d->d_subdirs))
+               debugfs_remove(channel_d);
+
+end:
+       marker_iter_stop(&iter);
+       return 0;
+}
+
+static void cleanup_control_dir(struct module *mod, struct marker *begin,
+                               struct marker *end)
+{
+       struct marker *iter;
+
+       if (!markers_control_dir)
+               return;
+
+       for (iter = begin; iter < end; iter++)
+               remove_marker_control_dir(mod, iter);
+
+       return;
+}
+
+static void build_control_dir(struct module *mod, struct marker *begin,
+                             struct marker *end)
+{
+       struct marker *iter;
+       int err;
+
+       err = 0;
+       if (!markers_control_dir)
+               return;
+
+       for (iter = begin; iter < end; iter++) {
+               err = build_marker_file(iter);
+               if (err)
+                       goto err_build_fail;
+       }
+
+       return;
+err_build_fail:
+       cleanup_control_dir(mod, begin, end);
+}
+
+static int module_notify(struct notifier_block *self,
+                 unsigned long val, void *data)
+{
+       struct module *mod = data;
+
+       switch (val) {
+       case MODULE_STATE_COMING:
+               build_control_dir(mod, mod->markers,
+                                 mod->markers + mod->num_markers);
+               break;
+       case MODULE_STATE_GOING:
+               cleanup_control_dir(mod, mod->markers,
+                                   mod->markers + mod->num_markers);
+               break;
+       }
+       return NOTIFY_DONE;
+}
+#else
+static inline int module_notify(struct notifier_block *self,
+               unsigned long val, void *data)
+{
+       return 0;
+}
+#endif
+
+static struct notifier_block module_nb = {
+       .notifier_call = module_notify,
+};
+
+static int __init ltt_trace_control_init(void)
+{
+       int err = 0;
+       struct dentry *ltt_root_dentry;
+
+       ltt_root_dentry = get_ltt_root();
+       if (!ltt_root_dentry) {
+               err = -ENOENT;
+               goto err_no_root;
+       }
+
+       ltt_control_dir = debugfs_create_dir(LTT_CONTROL_DIR, ltt_root_dentry);
+       if (IS_ERR(ltt_control_dir) || !ltt_control_dir) {
+               printk(KERN_ERR
+                      "ltt_channel_control_init: create dir of %s failed\n",
+                      LTT_CONTROL_DIR);
+               err = -ENOMEM;
+               goto err_create_control_dir;
+       }
+
+       ltt_setup_trace_file = debugfs_create_file(LTT_SETUP_TRACE_FILE,
+                                                  S_IWUSR, ltt_root_dentry,
+                                                  NULL,
+                                                  &ltt_setup_trace_operations);
+       if (IS_ERR(ltt_setup_trace_file) || !ltt_setup_trace_file) {
+               printk(KERN_ERR
+                      "ltt_channel_control_init: create file of %s failed\n",
+                      LTT_SETUP_TRACE_FILE);
+               err = -ENOMEM;
+               goto err_create_setup_trace_file;
+       }
+
+       ltt_destroy_trace_file = debugfs_create_file(LTT_DESTROY_TRACE_FILE,
+                                                    S_IWUSR, ltt_root_dentry,
+                                                    NULL,
+                                                    &ltt_destroy_trace_operations);
+       if (IS_ERR(ltt_destroy_trace_file) || !ltt_destroy_trace_file) {
+               printk(KERN_ERR
+                      "ltt_channel_control_init: create file of %s failed\n",
+                      LTT_DESTROY_TRACE_FILE);
+               err = -ENOMEM;
+               goto err_create_destroy_trace_file;
+       }
+
+       markers_control_dir = debugfs_create_dir(MARKERS_CONTROL_DIR,
+                                                ltt_root_dentry);
+       if (IS_ERR(markers_control_dir) || !markers_control_dir) {
+               printk(KERN_ERR
+                      "ltt_channel_control_init: create dir of %s failed\n",
+                      MARKERS_CONTROL_DIR);
+               err = -ENOMEM;
+               goto err_create_marker_control_dir;
+       }
+
+       init_marker_dir(markers_control_dir, &root_dir_opt);
+
+       err = build_marker_control_files();
+       if (err)
+               goto err_build_fail;
+
+       err = register_module_notifier(&module_nb);
+       if (!err)
+               return 0;
+
+err_build_fail:
+       debugfs_remove_recursive(markers_control_dir);
+       markers_control_dir = NULL;
+err_create_marker_control_dir:
+       debugfs_remove(ltt_destroy_trace_file);
+err_create_destroy_trace_file:
+       debugfs_remove(ltt_setup_trace_file);
+err_create_setup_trace_file:
+       debugfs_remove(ltt_control_dir);
+err_create_control_dir:
+err_no_root:
+       return err;
+}
+
+static void __exit ltt_trace_control_exit(void)
+{
+       struct dentry *trace_dir;
+
+       /* destroy all traces */
+       list_for_each_entry(trace_dir, &ltt_control_dir->d_subdirs,
+                           d_u.d_child) {
+               ltt_trace_stop(trace_dir->d_name.name);
+               ltt_trace_destroy(trace_dir->d_name.name);
+       }
+
+       /* clean dirs in debugfs */
+       debugfs_remove(ltt_setup_trace_file);
+       debugfs_remove(ltt_destroy_trace_file);
+       debugfs_remove_recursive(ltt_control_dir);
+       debugfs_remove_recursive(markers_control_dir);
+       unregister_module_notifier(&module_nb);
+       put_ltt_root();
+}
+
+module_init(ltt_trace_control_init);
+module_exit(ltt_trace_control_exit);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Zhao Lei <zhaolei@cn.fujitsu.com>");
+MODULE_DESCRIPTION("Linux Trace Toolkit Trace Controller");
diff --git a/ltt-tracer-core.h b/ltt-tracer-core.h
new file mode 100644 (file)
index 0000000..1ac8c5b
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2005,2006 Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * This contains the core definitions for the Linux Trace Toolkit.
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#ifndef LTT_TRACER_CORE_H
+#define LTT_TRACER_CORE_H
+
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/ltt-core.h>
+
+/* ltt's root dir in debugfs */
+#define LTT_ROOT        "ltt"
+
+/*
+ * All modifications of ltt_traces must be done by ltt-tracer.c, while holding
+ * the semaphore. Only reading of this information can be done elsewhere, with
+ * the RCU mechanism: preemption must be disabled while reading the
+ * list.
+ */
+struct ltt_traces {
+       struct list_head setup_head;    /* Pre-allocated traces list */
+       struct list_head head;          /* Allocated Traces list */
+       unsigned int num_active_traces; /* Number of active traces */
+} ____cacheline_aligned;
+
+extern struct ltt_traces ltt_traces;
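+
+/*
+ * Read-side sketch (illustrative only, mirroring the wakeup timer in
+ * ltt-tracer.c): walk the active traces under the sched-RCU read lock,
+ * with preemption disabled, instead of taking the semaphore.
+ *
+ *     struct ltt_trace *trace;
+ *
+ *     rcu_read_lock_sched();
+ *     list_for_each_entry_rcu(trace, &ltt_traces.head, list)
+ *             inspect_trace(trace);   (hypothetical read-only helper)
+ *     rcu_read_unlock_sched();
+ */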
+
+/*
+ * get dentry of ltt's root dir
+ */
+struct dentry *get_ltt_root(void);
+
+void put_ltt_root(void);
+
+/* Keep track of trap nesting inside LTT */
+DECLARE_PER_CPU(unsigned int, ltt_nesting);
+
+typedef int (*ltt_run_filter_functor)(void *trace, uint16_t eID);
+
+extern ltt_run_filter_functor ltt_run_filter;
+
+extern void ltt_filter_register(ltt_run_filter_functor func);
+extern void ltt_filter_unregister(void);
+
+#endif /* LTT_TRACER_CORE_H */
diff --git a/ltt-tracer.c b/ltt-tracer.c
new file mode 100644 (file)
index 0000000..8eae966
--- /dev/null
+++ b/ltt-tracer.c
@@ -0,0 +1,1293 @@
+/*
+ * ltt/ltt-tracer.c
+ *
+ * (C) Copyright       2005-2008 -
+ *             Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Tracing management internal kernel API. Trace buffer allocation/free, tracing
+ * start/stop.
+ *
+ * Author:
+ *     Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Inspired from LTT :
+ *  Karim Yaghmour (karim@opersys.com)
+ *  Tom Zanussi (zanussi@us.ibm.com)
+ *  Bob Wisniewski (bob@watson.ibm.com)
+ * And from K42 :
+ *  Bob Wisniewski (bob@watson.ibm.com)
+ *
+ * Changelog:
+ *  22/09/06, Move to the marker/probes mechanism.
+ *  19/10/05, Complete lockless mechanism.
+ *  27/05/05, Modular redesign and rewrite.
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/time.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/kref.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <asm/atomic.h>
+
+#include "ltt-tracer.h"
+
+static void synchronize_trace(void)
+{
+       synchronize_sched();
+#ifdef CONFIG_PREEMPT_RT
+       synchronize_rcu();
+#endif
+}
+
+static void async_wakeup(unsigned long data);
+
+static DEFINE_TIMER(ltt_async_wakeup_timer, async_wakeup, 0, 0);
+
+/* Default callbacks for modules */
+notrace
+int ltt_filter_control_default(enum ltt_filter_control_msg msg,
+                              struct ltt_trace *trace)
+{
+       return 0;
+}
+
+int ltt_statedump_default(struct ltt_trace *trace)
+{
+       return 0;
+}
+
+/* Callbacks for registered modules */
+
+int (*ltt_filter_control_functor)
+       (enum ltt_filter_control_msg msg, struct ltt_trace *trace) =
+                                       ltt_filter_control_default;
+struct module *ltt_filter_control_owner;
+
+/* These function pointers are protected by a trace activation check */
+struct module *ltt_run_filter_owner;
+int (*ltt_statedump_functor)(struct ltt_trace *trace) = ltt_statedump_default;
+struct module *ltt_statedump_owner;
+
+struct chan_info_struct {
+       const char *name;
+       unsigned int def_sb_size;
+       unsigned int def_n_sb;
+} chan_infos[] = {
+       [LTT_CHANNEL_METADATA] = {
+               LTT_METADATA_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_FD_STATE] = {
+               LTT_FD_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_GLOBAL_STATE] = {
+               LTT_GLOBAL_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_IRQ_STATE] = {
+               LTT_IRQ_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_MODULE_STATE] = {
+               LTT_MODULE_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_NETIF_STATE] = {
+               LTT_NETIF_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_SOFTIRQ_STATE] = {
+               LTT_SOFTIRQ_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_SWAP_STATE] = {
+               LTT_SWAP_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_SYSCALL_STATE] = {
+               LTT_SYSCALL_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_TASK_STATE] = {
+               LTT_TASK_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_VM_STATE] = {
+               LTT_VM_STATE_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_MED,
+               LTT_DEFAULT_N_SUBBUFS_MED,
+       },
+       [LTT_CHANNEL_FS] = {
+               LTT_FS_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_MED,
+               LTT_DEFAULT_N_SUBBUFS_MED,
+       },
+       [LTT_CHANNEL_INPUT] = {
+               LTT_INPUT_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_IPC] = {
+               LTT_IPC_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_LOW,
+               LTT_DEFAULT_N_SUBBUFS_LOW,
+       },
+       [LTT_CHANNEL_KERNEL] = {
+               LTT_KERNEL_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_HIGH,
+               LTT_DEFAULT_N_SUBBUFS_HIGH,
+       },
+       [LTT_CHANNEL_MM] = {
+               LTT_MM_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_MED,
+               LTT_DEFAULT_N_SUBBUFS_MED,
+       },
+       [LTT_CHANNEL_RCU] = {
+               LTT_RCU_CHANNEL,
+               LTT_DEFAULT_SUBBUF_SIZE_MED,
+               LTT_DEFAULT_N_SUBBUFS_MED,
+       },
+       [LTT_CHANNEL_DEFAULT] = {
+               NULL,
+               LTT_DEFAULT_SUBBUF_SIZE_MED,
+               LTT_DEFAULT_N_SUBBUFS_MED,
+       },
+};
+
+static enum ltt_channels get_channel_type_from_name(const char *name)
+{
+       int i;
+
+       if (!name)
+               return LTT_CHANNEL_DEFAULT;
+
+       for (i = 0; i < ARRAY_SIZE(chan_infos); i++)
+               if (chan_infos[i].name && !strcmp(name, chan_infos[i].name))
+                       return (enum ltt_channels)i;
+
+       return LTT_CHANNEL_DEFAULT;
+}
+
+/**
+ * ltt_module_register - LTT module registration
+ * @name: module type
+ * @function: callback to register
+ * @owner: module which owns the callback
+ *
+ * The module calling this registration function must ensure that no
+ * trap-inducing code will be executed by "function". E.g. vmalloc_sync_all()
+ * must be called between a vmalloc and the moment the memory is made visible to
+ * "function". This registration acts as a vmalloc_sync_all. Therefore, only if
+ * the module allocates virtual memory after its registration must it
+ * synchronize the TLBs.
+ */
+int ltt_module_register(enum ltt_module_function name, void *function,
+                       struct module *owner)
+{
+       int ret = 0;
+
+       /*
+        * Make sure no page fault can be triggered by the module about to be
+        * registered. We deal with this here so we don't have to call
+        * vmalloc_sync_all() in each module's init.
+        */
+       vmalloc_sync_all();
+
+       switch (name) {
+       case LTT_FUNCTION_RUN_FILTER:
+               if (ltt_run_filter_owner != NULL) {
+                       ret = -EEXIST;
+                       goto end;
+               }
+               ltt_filter_register((ltt_run_filter_functor)function);
+               ltt_run_filter_owner = owner;
+               break;
+       case LTT_FUNCTION_FILTER_CONTROL:
+               if (ltt_filter_control_owner != NULL) {
+                       ret = -EEXIST;
+                       goto end;
+               }
+               ltt_filter_control_functor =
+                       (int (*)(enum ltt_filter_control_msg,
+                       struct ltt_trace *))function;
+               ltt_filter_control_owner = owner;
+               break;
+       case LTT_FUNCTION_STATEDUMP:
+               if (ltt_statedump_owner != NULL) {
+                       ret = -EEXIST;
+                       goto end;
+               }
+               ltt_statedump_functor =
+                       (int (*)(struct ltt_trace *))function;
+               ltt_statedump_owner = owner;
+               break;
+       }
+
+end:
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_module_register);
+
+/**
+ * ltt_module_unregister - LTT module unregistration
+ * @name: module type
+ */
+void ltt_module_unregister(enum ltt_module_function name)
+{
+       switch (name) {
+       case LTT_FUNCTION_RUN_FILTER:
+               ltt_filter_unregister();
+               ltt_run_filter_owner = NULL;
+               /* Wait for preempt sections to finish */
+               synchronize_trace();
+               break;
+       case LTT_FUNCTION_FILTER_CONTROL:
+               ltt_filter_control_functor = ltt_filter_control_default;
+               ltt_filter_control_owner = NULL;
+               break;
+       case LTT_FUNCTION_STATEDUMP:
+               ltt_statedump_functor = ltt_statedump_default;
+               ltt_statedump_owner = NULL;
+               break;
+       }
+
+}
+EXPORT_SYMBOL_GPL(ltt_module_unregister);
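+
+/*
+ * Usage sketch (hypothetical client module, not part of this file): a
+ * statedump module registers its callback at init time and unregisters it
+ * at exit time, matching the void * convention of ltt_module_register().
+ *
+ *     static int my_statedump(struct ltt_trace *trace)
+ *     {
+ *             return 0;
+ *     }
+ *
+ *     static int __init my_init(void)
+ *     {
+ *             return ltt_module_register(LTT_FUNCTION_STATEDUMP,
+ *                                        (void *)my_statedump, THIS_MODULE);
+ *     }
+ *
+ *     static void __exit my_exit(void)
+ *     {
+ *             ltt_module_unregister(LTT_FUNCTION_STATEDUMP);
+ *     }
+ */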
+
+static LIST_HEAD(ltt_transport_list);
+
+/**
+ * ltt_transport_register - LTT transport registration
+ * @transport: transport structure
+ *
+ * Registers a transport which can be used as output to extract the data out of
+ * LTTng. The module calling this registration function must ensure that no
+ * trap-inducing code will be executed by the transport functions. E.g.
+ * vmalloc_sync_all() must be called between a vmalloc and the moment the memory
+ * is made visible to the transport function. This registration acts as a
+ * vmalloc_sync_all. Therefore, only if the module allocates virtual memory
+ * after its registration must it synchronize the TLBs.
+ */
+void ltt_transport_register(struct ltt_transport *transport)
+{
+       /*
+        * Make sure no page fault can be triggered by the module about to be
+        * registered. We deal with this here so we don't have to call
+        * vmalloc_sync_all() in each module's init.
+        */
+       vmalloc_sync_all();
+
+       ltt_lock_traces();
+       list_add_tail(&transport->node, &ltt_transport_list);
+       ltt_unlock_traces();
+}
+EXPORT_SYMBOL_GPL(ltt_transport_register);
+
+/**
+ * ltt_transport_unregister - LTT transport unregistration
+ * @transport: transport structure
+ */
+void ltt_transport_unregister(struct ltt_transport *transport)
+{
+       ltt_lock_traces();
+       list_del(&transport->node);
+       ltt_unlock_traces();
+}
+EXPORT_SYMBOL_GPL(ltt_transport_unregister);
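+
+/*
+ * Usage sketch (illustrative; the transport name is an assumption): a
+ * transport module fills in a struct ltt_transport and registers it so
+ * that traces can select it by name through ltt_trace_set_type().
+ *
+ *     static struct ltt_transport my_transport = {
+ *             .name = "relay",
+ *             .owner = THIS_MODULE,
+ *             .ops = { ... },
+ *     };
+ *
+ *     ltt_transport_register(&my_transport);
+ */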
+
+static inline
+int is_channel_overwrite(enum ltt_channels chan, enum trace_mode mode)
+{
+       switch (mode) {
+       case LTT_TRACE_NORMAL:
+               return 0;
+       case LTT_TRACE_FLIGHT:
+               switch (chan) {
+               case LTT_CHANNEL_METADATA:
+                       return 0;
+               default:
+                       return 1;
+               }
+       case LTT_TRACE_HYBRID:
+               switch (chan) {
+               case LTT_CHANNEL_KERNEL:
+               case LTT_CHANNEL_FS:
+               case LTT_CHANNEL_MM:
+               case LTT_CHANNEL_RCU:
+               case LTT_CHANNEL_IPC:
+               case LTT_CHANNEL_INPUT:
+                       return 1;
+               default:
+                       return 0;
+               }
+       default:
+               return 0;
+       }
+}
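+
+/*
+ * Summary of the policy above: NORMAL never overwrites; FLIGHT overwrites
+ * every channel except metadata; HYBRID overwrites only the high-rate
+ * channels (kernel, fs, mm, rcu, ipc, input) while the low-rate state
+ * channels keep normal (non-overwrite) semantics.
+ */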
+
+static void trace_async_wakeup(struct ltt_trace *trace)
+{
+       int i;
+       struct ltt_chan *chan;
+
+       /* Must check each channel for pending read wakeup */
+       for (i = 0; i < trace->nr_channels; i++) {
+               chan = &trace->channels[i];
+               if (chan->active)
+                       trace->ops->wakeup_channel(chan);
+       }
+}
+
+/* Timer to send async wakeups to the readers */
+static void async_wakeup(unsigned long data)
+{
+       struct ltt_trace *trace;
+
+       /*
+        * PREEMPT_RT does not allow spinlocks to be taken within preempt
+        * disable sections (a spinlock is taken in wake_up). However, mainline
+        * won't allow a mutex to be taken in interrupt context. Ugly.
+        * Take a standard RCU read lock on RT kernels, which implies that we
+        * also have to synchronize_rcu() upon updates.
+        */
+#ifndef CONFIG_PREEMPT_RT
+       rcu_read_lock_sched();
+#else
+       rcu_read_lock();
+#endif
+       list_for_each_entry_rcu(trace, &ltt_traces.head, list) {
+               trace_async_wakeup(trace);
+       }
+#ifndef CONFIG_PREEMPT_RT
+       rcu_read_unlock_sched();
+#else
+       rcu_read_unlock();
+#endif
+
+       mod_timer(&ltt_async_wakeup_timer, jiffies + LTT_PERCPU_TIMER_INTERVAL);
+}
+
+/**
+ * _ltt_trace_find - find a trace by given name.
+ * @trace_name: trace name
+ *
+ * Returns a pointer to the trace structure, NULL if not found.
+ */
+static struct ltt_trace *_ltt_trace_find(const char *trace_name)
+{
+       struct ltt_trace *trace;
+
+       list_for_each_entry(trace, &ltt_traces.head, list)
+               if (!strncmp(trace->trace_name, trace_name, NAME_MAX))
+                       return trace;
+
+       return NULL;
+}
+
+/**
+ * _ltt_trace_find_setup - find a trace in the setup list by name.
+ * @trace_name: trace name
+ *
+ * Returns a pointer to the trace structure, NULL if not found.
+ */
+struct ltt_trace *_ltt_trace_find_setup(const char *trace_name)
+{
+       struct ltt_trace *trace;
+
+       list_for_each_entry(trace, &ltt_traces.setup_head, list)
+               if (!strncmp(trace->trace_name, trace_name, NAME_MAX))
+                       return trace;
+
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(_ltt_trace_find_setup);
+
+/**
+ * ltt_release_trace - Release a LTT trace
+ * @kref : reference count on the trace
+ */
+void ltt_release_trace(struct kref *kref)
+{
+       struct ltt_trace *trace = container_of(kref, struct ltt_trace, kref);
+
+       trace->ops->remove_dirs(trace);
+       module_put(trace->transport->owner);
+       ltt_channels_trace_free(trace->channels, trace->nr_channels);
+       kfree(trace);
+}
+EXPORT_SYMBOL_GPL(ltt_release_trace);
+
+static inline void prepare_chan_size_num(unsigned int *subbuf_size,
+                                        unsigned int *n_subbufs)
+{
+       /* Make sure the subbuffer size is at least one page */
+       *subbuf_size = max_t(unsigned int, *subbuf_size, PAGE_SIZE);
+
+       /* round to next power of 2 */
+       *subbuf_size = 1 << get_count_order(*subbuf_size);
+       *n_subbufs = 1 << get_count_order(*n_subbufs);
+
+       /* Subbuf size and number must both be power of two */
+       WARN_ON(hweight32(*subbuf_size) != 1);
+       WARN_ON(hweight32(*n_subbufs) != 1);
+}
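+
+/*
+ * Worked example (assuming 4 kB pages): a request of *subbuf_size = 40000
+ * and *n_subbufs = 3 is first raised to at least PAGE_SIZE, then both
+ * values are rounded up to the next power of two, yielding
+ * *subbuf_size = 65536 and *n_subbufs = 4.
+ */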
+
+int _ltt_trace_setup(const char *trace_name)
+{
+       int err = 0;
+       struct ltt_trace *new_trace = NULL;
+       int metadata_index;
+       unsigned int chan;
+       enum ltt_channels chantype;
+
+       if (_ltt_trace_find_setup(trace_name)) {
+               printk(KERN_ERR "LTT : Trace name %s already used.\n",
+                               trace_name);
+               err = -EEXIST;
+               goto traces_error;
+       }
+
+       if (_ltt_trace_find(trace_name)) {
+               printk(KERN_ERR "LTT : Trace name %s already used.\n",
+                               trace_name);
+               err = -EEXIST;
+               goto traces_error;
+       }
+
+       new_trace = kzalloc(sizeof(struct ltt_trace), GFP_KERNEL);
+       if (!new_trace) {
+               printk(KERN_ERR
+                       "LTT : Unable to allocate memory for trace %s\n",
+                       trace_name);
+               err = -ENOMEM;
+               goto traces_error;
+       }
+       strncpy(new_trace->trace_name, trace_name, NAME_MAX);
+       new_trace->channels = ltt_channels_trace_alloc(&new_trace->nr_channels,
+                                                      0, 1);
+       if (!new_trace->channels) {
+               printk(KERN_ERR
+                       "LTT : Unable to allocate memory for chaninfo  %s\n",
+                       trace_name);
+               err = -ENOMEM;
+               goto trace_free;
+       }
+
+       /*
+        * Force metadata channel to active, no overwrite.
+        */
+       metadata_index = ltt_channels_get_index_from_name("metadata");
+       WARN_ON(metadata_index < 0);
+       new_trace->channels[metadata_index].overwrite = 0;
+       new_trace->channels[metadata_index].active = 1;
+
+       /*
+        * Set hardcoded tracer defaults for some channels
+        */
+       for (chan = 0; chan < new_trace->nr_channels; chan++) {
+               if (!(new_trace->channels[chan].active))
+                       continue;
+
+               chantype = get_channel_type_from_name(
+                       ltt_channels_get_name_from_index(chan));
+               new_trace->channels[chan].a.sb_size =
+                       chan_infos[chantype].def_sb_size;
+               new_trace->channels[chan].a.n_sb =
+                       chan_infos[chantype].def_n_sb;
+       }
+
+       list_add(&new_trace->list, &ltt_traces.setup_head);
+       return 0;
+
+trace_free:
+       kfree(new_trace);
+traces_error:
+       return err;
+}
+EXPORT_SYMBOL_GPL(_ltt_trace_setup);
+
+
+int ltt_trace_setup(const char *trace_name)
+{
+       int ret;
+       ltt_lock_traces();
+       ret = _ltt_trace_setup(trace_name);
+       ltt_unlock_traces();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_setup);
+
+/* must be called from within a traces lock. */
+static void _ltt_trace_free(struct ltt_trace *trace)
+{
+       list_del(&trace->list);
+       kfree(trace);
+}
+
+int ltt_trace_set_type(const char *trace_name, const char *trace_type)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+       struct ltt_transport *tran_iter, *transport = NULL;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find_setup(trace_name);
+       if (!trace) {
+               printk(KERN_ERR "LTT : Trace not found %s\n", trace_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       list_for_each_entry(tran_iter, &ltt_transport_list, node) {
+               if (!strcmp(tran_iter->name, trace_type)) {
+                       transport = tran_iter;
+                       break;
+               }
+       }
+       if (!transport) {
+               printk(KERN_ERR "LTT : Transport %s is not present.\n",
+                       trace_type);
+               err = -EINVAL;
+               goto traces_error;
+       }
+
+       trace->transport = transport;
+
+traces_error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_set_type);
+
+int ltt_trace_set_channel_subbufsize(const char *trace_name,
+                                    const char *channel_name,
+                                    unsigned int size)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+       int index;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find_setup(trace_name);
+       if (!trace) {
+               printk(KERN_ERR "LTT : Trace not found %s\n", trace_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       index = ltt_channels_get_index_from_name(channel_name);
+       if (index < 0) {
+               printk(KERN_ERR "LTT : Channel %s not found\n", channel_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+       trace->channels[index].a.sb_size = size;
+
+traces_error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_set_channel_subbufsize);
+
+int ltt_trace_set_channel_subbufcount(const char *trace_name,
+                                     const char *channel_name,
+                                     unsigned int cnt)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+       int index;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find_setup(trace_name);
+       if (!trace) {
+               printk(KERN_ERR "LTT : Trace not found %s\n", trace_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       index = ltt_channels_get_index_from_name(channel_name);
+       if (index < 0) {
+               printk(KERN_ERR "LTT : Channel %s not found\n", channel_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+       trace->channels[index].a.n_sb = cnt;
+
+traces_error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_set_channel_subbufcount);
+
+int ltt_trace_set_channel_switch_timer(const char *trace_name,
+                                      const char *channel_name,
+                                      unsigned long interval)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+       int index;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find_setup(trace_name);
+       if (!trace) {
+               printk(KERN_ERR "LTT : Trace not found %s\n", trace_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       index = ltt_channels_get_index_from_name(channel_name);
+       if (index < 0) {
+               printk(KERN_ERR "LTT : Channel %s not found\n", channel_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+       ltt_channels_trace_set_timer(&trace->channels[index], interval);
+
+traces_error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_set_channel_switch_timer);
+
+int ltt_trace_set_channel_enable(const char *trace_name,
+                                const char *channel_name, unsigned int enable)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+       int index;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find_setup(trace_name);
+       if (!trace) {
+               printk(KERN_ERR "LTT : Trace not found %s\n", trace_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       /*
+        * The data in the metadata channel (marker info) is required to be
+        * able to read the trace, so this channel is always enabled.
+        */
+       if (!enable && !strcmp(channel_name, "metadata")) {
+               printk(KERN_ERR "LTT : Trying to disable metadata channel\n");
+               err = -EINVAL;
+               goto traces_error;
+       }
+
+       index = ltt_channels_get_index_from_name(channel_name);
+       if (index < 0) {
+               printk(KERN_ERR "LTT : Channel %s not found\n", channel_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       trace->channels[index].active = enable;
+
+traces_error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_set_channel_enable);
+
+int ltt_trace_set_channel_overwrite(const char *trace_name,
+                                   const char *channel_name,
+                                   unsigned int overwrite)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+       int index;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find_setup(trace_name);
+       if (!trace) {
+               printk(KERN_ERR "LTT : Trace not found %s\n", trace_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       /*
+        * Always keep the metadata channel in non-overwrite mode: it is a
+        * very low-traffic channel that cannot afford to have its data
+        * overwritten, since this data (marker info) is necessary to be
+        * able to read the trace.
+        */
+       if (overwrite && !strcmp(channel_name, "metadata")) {
+               printk(KERN_ERR "LTT : Trying to set metadata channel to "
+                               "overwrite mode\n");
+               err = -EINVAL;
+               goto traces_error;
+       }
+
+       index = ltt_channels_get_index_from_name(channel_name);
+       if (index < 0) {
+               printk(KERN_ERR "LTT : Channel %s not found\n", channel_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       trace->channels[index].overwrite = overwrite;
+
+traces_error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_set_channel_overwrite);
+
+int ltt_trace_alloc(const char *trace_name)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+       unsigned int sb_size, n_sb;
+       unsigned long flags;
+       int chan;
+       const char *channel_name;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find_setup(trace_name);
+       if (!trace) {
+               printk(KERN_ERR "LTT : Trace not found %s\n", trace_name);
+               err = -ENOENT;
+               goto traces_error;
+       }
+
+       kref_init(&trace->kref);
+       init_waitqueue_head(&trace->kref_wq);
+       trace->active = 0;
+       get_trace_clock();
+       trace->freq_scale = trace_clock_freq_scale();
+
+       if (!trace->transport) {
+               printk(KERN_ERR "LTT : Transport is not set.\n");
+               err = -EINVAL;
+               goto transport_error;
+       }
+       if (!try_module_get(trace->transport->owner)) {
+               printk(KERN_ERR "LTT : Can't lock transport module.\n");
+               err = -ENODEV;
+               goto transport_error;
+       }
+       trace->ops = &trace->transport->ops;
+
+       err = trace->ops->create_dirs(trace);
+       if (err) {
+               printk(KERN_ERR "LTT : Can't create dir for trace %s.\n",
+                       trace_name);
+               goto dirs_error;
+       }
+
+       local_irq_save(flags);
+       trace->start_freq = trace_clock_frequency();
+       trace->start_tsc = trace_clock_read64();
+       do_gettimeofday(&trace->start_time);
+       local_irq_restore(flags);
+
+       for (chan = 0; chan < trace->nr_channels; chan++) {
+               if (!(trace->channels[chan].active))
+                       continue;
+
+               channel_name = ltt_channels_get_name_from_index(chan);
+               WARN_ON(!channel_name);
+               /*
+                * note: sb_size and n_sb will be overwritten with updated
+                * values by channel creation.
+                */
+               sb_size = trace->channels[chan].a.sb_size;
+               n_sb = trace->channels[chan].a.n_sb;
+               prepare_chan_size_num(&sb_size, &n_sb);
+               err = trace->ops->create_channel(channel_name,
+                                     &trace->channels[chan],
+                                     trace->dentry.trace_root,
+                                     sb_size, n_sb,
+                                     trace->channels[chan].overwrite, trace);
+               if (err != 0) {
+                       printk(KERN_ERR "LTT : Can't create channel %s.\n",
+                               channel_name);
+                       goto create_channel_error;
+               }
+       }
+
+       list_del(&trace->list);
+       if (list_empty(&ltt_traces.head)) {
+               mod_timer(&ltt_async_wakeup_timer,
+                               jiffies + LTT_PERCPU_TIMER_INTERVAL);
+               set_kernel_trace_flag_all_tasks();
+       }
+       list_add_rcu(&trace->list, &ltt_traces.head);
+       synchronize_trace();
+
+       ltt_unlock_traces();
+
+       return 0;
+
+create_channel_error:
+       for (chan--; chan >= 0; chan--) {
+               if (trace->channels[chan].active) {
+                       struct ltt_chan *chanp = &trace->channels[chan];
+                       trace->ops->remove_channel_files(chanp);
+                       kref_put(&chanp->a.kref, trace->ops->remove_channel);
+               }
+       }
+       trace->ops->remove_dirs(trace);
+
+dirs_error:
+       module_put(trace->transport->owner);
+transport_error:
+       put_trace_clock();
+traces_error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_alloc);
+
+/*
+ * This acts as a wrapper for the current version of ltt_control.ko.
+ * A new debugfs-based ltt_control will eventually control each channel's
+ * buffer individually.
+ */
+static
+int ltt_trace_create(const char *trace_name, const char *trace_type,
+                    enum trace_mode mode,
+                    unsigned int subbuf_size_low, unsigned int n_subbufs_low,
+                    unsigned int subbuf_size_med, unsigned int n_subbufs_med,
+                    unsigned int subbuf_size_high, unsigned int n_subbufs_high)
+{
+       int err = 0;
+
+       err = ltt_trace_setup(trace_name);
+       if (IS_ERR_VALUE(err))
+               return err;
+
+       err = ltt_trace_set_type(trace_name, trace_type);
+       if (IS_ERR_VALUE(err))
+               return err;
+
+       err = ltt_trace_alloc(trace_name);
+       if (IS_ERR_VALUE(err))
+               return err;
+
+       return err;
+}
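+
+/*
+ * In-kernel usage sketch (illustrative only; "relay" is an assumed
+ * transport name): the same sequence ltt_trace_create() performs,
+ * followed by start/stop/destroy. Error handling is elided; each call
+ * returns 0 on success or a negative error code.
+ *
+ *     ltt_trace_setup("mytrace");
+ *     ltt_trace_set_type("mytrace", "relay");
+ *     ltt_trace_alloc("mytrace");
+ *     ltt_trace_start("mytrace");
+ *     ...
+ *     ltt_trace_stop("mytrace");
+ *     ltt_trace_destroy("mytrace");
+ */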
+
+/* Must be called while sure that trace is in the list. */
+static int _ltt_trace_destroy(struct ltt_trace *trace)
+{
+       int err = -EPERM;
+
+       if (trace == NULL) {
+               err = -ENOENT;
+               goto traces_error;
+       }
+       if (trace->active) {
+               printk(KERN_ERR
+                       "LTT : Can't destroy trace %s : tracer is active\n",
+                       trace->trace_name);
+               err = -EBUSY;
+               goto active_error;
+       }
+       /* Everything went fine */
+       list_del_rcu(&trace->list);
+       synchronize_trace();
+       if (list_empty(&ltt_traces.head)) {
+               clear_kernel_trace_flag_all_tasks();
+               /*
+                * We stop the asynchronous delivery of reader wakeup, but
+                * we must make one last check for reader wakeups pending
+                * later in __ltt_trace_destroy.
+                */
+               del_timer_sync(&ltt_async_wakeup_timer);
+       }
+       return 0;
+
+       /* error handling */
+active_error:
+traces_error:
+       return err;
+}
+
+/* Sleepable part of the destroy */
+static void __ltt_trace_destroy(struct ltt_trace *trace)
+{
+       int i;
+       struct ltt_chan *chan;
+
+       for (i = 0; i < trace->nr_channels; i++) {
+               chan = &trace->channels[i];
+               if (chan->active)
+                       trace->ops->finish_channel(chan);
+       }
+
+       flush_scheduled_work();
+
+       /*
+        * The currently destroyed trace is not in the trace list anymore,
+        * so it's safe to call the async wakeup ourself. It will deliver
+        * the last subbuffers.
+        */
+       trace_async_wakeup(trace);
+
+       for (i = 0; i < trace->nr_channels; i++) {
+               chan = &trace->channels[i];
+               if (chan->active) {
+                       trace->ops->remove_channel_files(chan);
+                       kref_put(&chan->a.kref,
+                                trace->ops->remove_channel);
+               }
+       }
+
+       /*
+        * Wait for lttd readers to release the files, therefore making sure
+        * the last subbuffers have been read.
+        */
+       if (atomic_read(&trace->kref.refcount) > 1) {
+               int ret = 0;
+               /*
+                * Unlock traces and CPU hotplug while we wait for lttd to
+                * release the files.
+                */
+               ltt_unlock_traces();
+               __wait_event_interruptible(trace->kref_wq,
+                       (atomic_read(&trace->kref.refcount) == 1), ret);
+               ltt_lock_traces();
+       }
+
+       kref_put(&trace->kref, ltt_release_trace);
+}
+
+int ltt_trace_destroy(const char *trace_name)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find(trace_name);
+       if (trace) {
+               err = _ltt_trace_destroy(trace);
+               if (err)
+                       goto error;
+
+               __ltt_trace_destroy(trace);
+               ltt_unlock_traces();
+               put_trace_clock();
+
+               return 0;
+       }
+
+       trace = _ltt_trace_find_setup(trace_name);
+       if (trace) {
+               _ltt_trace_free(trace);
+               ltt_unlock_traces();
+               return 0;
+       }
+
+       err = -ENOENT;
+
+       /* Error handling */
+error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_destroy);
+
+/*
+ * called with trace lock held.
+ */
+static
+void ltt_channels_trace_start_timer(struct ltt_chan *channels,
+                                   unsigned int nr_channels)
+{
+       int i;
+
+       for (i = 0; i < nr_channels; i++) {
+               struct ltt_chan *chan = &channels[i];
+               chan->a.trace->ops->start_switch_timer(chan);
+       }
+}
+
+/*
+ * called with trace lock held.
+ */
+static
+void ltt_channels_trace_stop_timer(struct ltt_chan *channels,
+                                  unsigned int nr_channels)
+{
+       int i;
+
+       for (i = 0; i < nr_channels; i++) {
+               struct ltt_chan *chan = &channels[i];
+               chan->a.trace->ops->stop_switch_timer(chan);
+       }
+}
+
+/* must be called from within a traces lock. */
+static int _ltt_trace_start(struct ltt_trace *trace)
+{
+       int err = 0;
+
+       if (trace == NULL) {
+               err = -ENOENT;
+               goto traces_error;
+       }
+       if (trace->active)
+               printk(KERN_INFO "LTT : Tracing already active for trace %s\n",
+                               trace->trace_name);
+       if (!try_module_get(ltt_run_filter_owner)) {
+               err = -ENODEV;
+               printk(KERN_ERR "LTT : Can't lock filter module.\n");
+               goto get_ltt_run_filter_error;
+       }
+       ltt_channels_trace_start_timer(trace->channels, trace->nr_channels);
+       trace->active = 1;
+       /* Read by trace points without protection : be careful */
+       ltt_traces.num_active_traces++;
+       return err;
+
+       /* error handling */
+get_ltt_run_filter_error:
+traces_error:
+       return err;
+}
+
+int ltt_trace_start(const char *trace_name)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+
+       ltt_lock_traces();
+
+       trace = _ltt_trace_find(trace_name);
+       err = _ltt_trace_start(trace);
+       if (err)
+               goto no_trace;
+
+       ltt_unlock_traces();
+
+       /*
+        * Call the kernel state dump.
+        * Its events will be mixed with real kernel events; that's OK.
+        * Notice that there is no protection on the trace: that's exactly
+        * why we iterate over the list and check for trace equality instead
+        * of directly using this trace handle inside the logging function.
+        */
+
+       ltt_dump_marker_state(trace);
+
+       if (!try_module_get(ltt_statedump_owner)) {
+               err = -ENODEV;
+               printk(KERN_ERR
+                       "LTT : Can't lock state dump module.\n");
+       } else {
+               ltt_statedump_functor(trace);
+               module_put(ltt_statedump_owner);
+       }
+
+       return err;
+
+       /* Error handling */
+no_trace:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_start);
+
+/* must be called from within traces lock */
+static int _ltt_trace_stop(struct ltt_trace *trace)
+{
+       int err = -EPERM;
+
+       if (trace == NULL) {
+               err = -ENOENT;
+               goto traces_error;
+       }
+       if (!trace->active)
+               printk(KERN_INFO "LTT : Tracing not active for trace %s\n",
+                               trace->trace_name);
+       if (trace->active) {
+               ltt_channels_trace_stop_timer(trace->channels,
+                       trace->nr_channels);
+               trace->active = 0;
+               ltt_traces.num_active_traces--;
+               synchronize_trace(); /* Wait for each tracing to be finished */
+       }
+       module_put(ltt_run_filter_owner);
+       /* Everything went fine */
+       return 0;
+
+       /* Error handling */
+traces_error:
+       return err;
+}
+
+int ltt_trace_stop(const char *trace_name)
+{
+       int err = 0;
+       struct ltt_trace *trace;
+
+       ltt_lock_traces();
+       trace = _ltt_trace_find(trace_name);
+       err = _ltt_trace_stop(trace);
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_trace_stop);
+
+/**
+ * ltt_control - Trace control in-kernel API
+ * @msg: Action to perform
+ * @trace_name: Trace on which the action must be done
+ * @trace_type: Type of trace (normal, flight, hybrid)
+ * @args: Arguments specific to the action
+ */
+int ltt_control(enum ltt_control_msg msg, const char *trace_name,
+               const char *trace_type, union ltt_control_args args)
+{
+       int err = -EPERM;
+
+       printk(KERN_ALERT "ltt_control : trace %s\n", trace_name);
+       switch (msg) {
+       case LTT_CONTROL_START:
+               printk(KERN_DEBUG "Start tracing %s\n", trace_name);
+               err = ltt_trace_start(trace_name);
+               break;
+       case LTT_CONTROL_STOP:
+               printk(KERN_DEBUG "Stop tracing %s\n", trace_name);
+               err = ltt_trace_stop(trace_name);
+               break;
+       case LTT_CONTROL_CREATE_TRACE:
+               printk(KERN_DEBUG "Creating trace %s\n", trace_name);
+               err = ltt_trace_create(trace_name, trace_type,
+                       args.new_trace.mode,
+                       args.new_trace.subbuf_size_low,
+                       args.new_trace.n_subbufs_low,
+                       args.new_trace.subbuf_size_med,
+                       args.new_trace.n_subbufs_med,
+                       args.new_trace.subbuf_size_high,
+                       args.new_trace.n_subbufs_high);
+               break;
+       case LTT_CONTROL_DESTROY_TRACE:
+               printk(KERN_DEBUG "Destroying trace %s\n", trace_name);
+               err = ltt_trace_destroy(trace_name);
+               break;
+       }
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_control);
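+
+/*
+ * Usage sketch (illustrative only; "relay" is an assumed transport name):
+ * creating and starting a trace through the single ltt_control() entry
+ * point instead of the individual calls above.
+ *
+ *     union ltt_control_args args;
+ *
+ *     args.new_trace.mode = LTT_TRACE_NORMAL;
+ *     ltt_control(LTT_CONTROL_CREATE_TRACE, "mytrace", "relay", args);
+ *     ltt_control(LTT_CONTROL_START, "mytrace", NULL, args);
+ */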
+
+/**
+ * ltt_filter_control - Trace filter control in-kernel API
+ * @msg: Action to perform on the filter
+ * @trace_name: Trace on which the action must be done
+ */
+int ltt_filter_control(enum ltt_filter_control_msg msg, const char *trace_name)
+{
+       int err;
+       struct ltt_trace *trace;
+
+       printk(KERN_DEBUG "ltt_filter_control : trace %s\n", trace_name);
+       ltt_lock_traces();
+       trace = _ltt_trace_find(trace_name);
+       if (trace == NULL) {
+               printk(KERN_ALERT
+                       "Trace does not exist. Cannot proxy control request\n");
+               err = -ENOENT;
+               goto trace_error;
+       }
+       if (!try_module_get(ltt_filter_control_owner)) {
+               err = -ENODEV;
+               goto get_module_error;
+       }
+       switch (msg) {
+       case LTT_FILTER_DEFAULT_ACCEPT:
+               printk(KERN_DEBUG
+                       "Proxy filter default accept %s\n", trace_name);
+               err = (*ltt_filter_control_functor)(msg, trace);
+               break;
+       case LTT_FILTER_DEFAULT_REJECT:
+               printk(KERN_DEBUG
+                       "Proxy filter default reject %s\n", trace_name);
+               err = (*ltt_filter_control_functor)(msg, trace);
+               break;
+       default:
+               err = -EPERM;
+       }
+       module_put(ltt_filter_control_owner);
+
+get_module_error:
+trace_error:
+       ltt_unlock_traces();
+       return err;
+}
+EXPORT_SYMBOL_GPL(ltt_filter_control);
+
+int __init ltt_init(void)
+{
+       /* Make sure no page fault can be triggered by this module */
+       vmalloc_sync_all();
+       init_timer_deferrable(&ltt_async_wakeup_timer);
+       return 0;
+}
+
+module_init(ltt_init)
+
+static void __exit ltt_exit(void)
+{
+       struct ltt_trace *trace;
+       struct list_head *pos, *n;
+
+       ltt_lock_traces();
+       /* Stop each trace, currently being read by RCU read-side */
+       list_for_each_entry_rcu(trace, &ltt_traces.head, list)
+               _ltt_trace_stop(trace);
+       /* Wait for quiescent state. Readers have preemption disabled. */
+       synchronize_trace();
+       /*
+        * Safe iteration is now permitted. It does not have to be RCU-safe
+        * because no readers are left.
+        */
+       list_for_each_safe(pos, n, &ltt_traces.head) {
+               trace = container_of(pos, struct ltt_trace, list);
+               /* _ltt_trace_destroy does a synchronize_trace() */
+               _ltt_trace_destroy(trace);
+               __ltt_trace_destroy(trace);
+       }
+       /* Free traces still in pre-allocated (setup) state */
+       list_for_each_safe(pos, n, &ltt_traces.setup_head) {
+               trace = container_of(pos, struct ltt_trace, list);
+               _ltt_trace_free(trace);
+       }
+
+       ltt_unlock_traces();
+}
+
+module_exit(ltt_exit)
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Tracer Kernel API");
diff --git a/ltt-tracer.h b/ltt-tracer.h
new file mode 100644 (file)
index 0000000..9564c3f
--- /dev/null
+++ b/ltt-tracer.h
@@ -0,0 +1,663 @@
+/*
+ * Copyright (C) 2005,2006,2008 Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * This contains the definitions for the Linux Trace Toolkit tracer.
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#ifndef _LTT_TRACER_H
+#define _LTT_TRACER_H
+
+#include <stdarg.h>
+#include <linux/types.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/cache.h>
+#include <linux/kernel.h>
+#include <linux/timex.h>
+#include <linux/wait.h>
+#include <linux/marker.h>
+#include <linux/trace-clock.h>
+#include <linux/ltt-channels.h>
+#include <asm/atomic.h>
+#include <asm/local.h>
+
+#include "ltt-tracer-core.h"
+#include "ltt-relay.h"
+
+/* Number of bytes to log with a read/write event */
+#define LTT_LOG_RW_SIZE                        32L
+
+/* Interval (in jiffies) at which the LTT per-CPU timer fires */
+#define LTT_PERCPU_TIMER_INTERVAL      1
+
+#ifndef LTT_ARCH_TYPE
+#define LTT_ARCH_TYPE                  LTT_ARCH_TYPE_UNDEFINED
+#endif
+
+#ifndef LTT_ARCH_VARIANT
+#define LTT_ARCH_VARIANT               LTT_ARCH_VARIANT_NONE
+#endif
+
+struct ltt_active_marker;
+
+/* Maximum number of callbacks per marker */
+#define LTT_NR_CALLBACKS       10
+
+struct ltt_serialize_closure {
+       ltt_serialize_cb *callbacks;
+       long cb_args[LTT_NR_CALLBACKS];
+       unsigned int cb_idx;
+};
+
+size_t ltt_serialize_data(struct ltt_chanbuf *buf, size_t buf_offset,
+                         struct ltt_serialize_closure *closure,
+                         void *serialize_private, unsigned int stack_pos_ctx,
+                         int *largest_align, const char *fmt, va_list *args);
+
+struct ltt_available_probe {
+       const char *name;               /* probe name */
+       const char *format;
+       marker_probe_func *probe_func;
+       ltt_serialize_cb callbacks[LTT_NR_CALLBACKS];
+       struct list_head node;          /* registered probes list */
+};
+
+enum ltt_channels {
+       LTT_CHANNEL_METADATA,
+       LTT_CHANNEL_FD_STATE,
+       LTT_CHANNEL_GLOBAL_STATE,
+       LTT_CHANNEL_IRQ_STATE,
+       LTT_CHANNEL_MODULE_STATE,
+       LTT_CHANNEL_NETIF_STATE,
+       LTT_CHANNEL_SOFTIRQ_STATE,
+       LTT_CHANNEL_SWAP_STATE,
+       LTT_CHANNEL_SYSCALL_STATE,
+       LTT_CHANNEL_TASK_STATE,
+       LTT_CHANNEL_VM_STATE,
+       LTT_CHANNEL_FS,
+       LTT_CHANNEL_INPUT,
+       LTT_CHANNEL_IPC,
+       LTT_CHANNEL_KERNEL,
+       LTT_CHANNEL_MM,
+       LTT_CHANNEL_RCU,
+       LTT_CHANNEL_DEFAULT,
+};
+
+struct ltt_active_marker {
+       struct list_head node;          /* active markers list */
+       const char *channel;
+       const char *name;
+       const char *format;
+       struct ltt_available_probe *probe;
+};
+
+extern void ltt_vtrace(const struct marker *mdata, void *probe_data,
+                      void *call_data, const char *fmt, va_list *args);
+extern void ltt_trace(const struct marker *mdata, void *probe_data,
+                     void *call_data, const char *fmt, ...);
+
+size_t ltt_serialize_printf(struct ltt_chanbuf *buf, unsigned long buf_offset,
+                           size_t *msg_size, char *output, size_t outlen,
+                           const char *fmt);
+
+/*
+ * Unique ID assigned to each registered probe.
+ */
+enum marker_id {
+       MARKER_ID_SET_MARKER_ID = 0,    /* Static IDs available (range 0-7) */
+       MARKER_ID_SET_MARKER_FORMAT,
+       MARKER_ID_COMPACT,              /* Compact IDs (range: 8-127)       */
+       MARKER_ID_DYNAMIC,              /* Dynamic IDs (range: 128-65535)   */
+};
+
+/* static ids 0-1 reserved for internal use. */
+#define MARKER_CORE_IDS                2
+static __inline__ enum marker_id marker_id_type(uint16_t id)
+{
+       if (id < MARKER_CORE_IDS)
+               return (enum marker_id)id;
+       else
+               return MARKER_ID_DYNAMIC;
+}
+
+struct user_dbg_data {
+       unsigned long avail_size;
+       unsigned long write;
+       unsigned long read;
+};
+
+struct ltt_trace_ops {
+       /* First 32 bytes cache-hot cacheline */
+       void (*wakeup_channel) (struct ltt_chan *chan);
+       int (*user_blocking) (struct ltt_trace *trace, unsigned int index,
+                             size_t data_size, struct user_dbg_data *dbg);
+       /* End of first 32 bytes cacheline */
+       int (*create_dirs) (struct ltt_trace *new_trace);
+       void (*remove_dirs) (struct ltt_trace *new_trace);
+       int (*create_channel) (const char *channel_name, struct ltt_chan *chan,
+                              struct dentry *parent, size_t sb_size,
+                              size_t n_sb, int overwrite,
+                              struct ltt_trace *trace);
+       void (*finish_channel) (struct ltt_chan *chan);
+       void (*remove_channel) (struct kref *kref);
+       void (*remove_channel_files) (struct ltt_chan *chan);
+       void (*user_errors) (struct ltt_trace *trace, unsigned int index,
+                            size_t data_size, struct user_dbg_data *dbg,
+                            int cpu);
+       void (*start_switch_timer) (struct ltt_chan *chan);
+       void (*stop_switch_timer) (struct ltt_chan *chan);
+#ifdef CONFIG_HOTPLUG_CPU
+       int (*handle_cpuhp) (struct notifier_block *nb, unsigned long action,
+                            void *hcpu, struct ltt_trace *trace);
+#endif
+};
+
+struct ltt_transport {
+       char *name;
+       struct module *owner;
+       struct list_head node;
+       struct ltt_trace_ops ops;
+};
+
+enum trace_mode { LTT_TRACE_NORMAL, LTT_TRACE_FLIGHT, LTT_TRACE_HYBRID };
+
+#define CHANNEL_FLAG_ENABLE    (1U<<0)
+#define CHANNEL_FLAG_OVERWRITE (1U<<1)
+
+/* Per-trace information - each trace/flight recorder represented by one */
+struct ltt_trace {
+       /* First 32 bytes cache-hot cacheline */
+       struct list_head list;
+       struct ltt_chan *channels;
+       unsigned int nr_channels;
+       int active;
+       /* Second 32 bytes cache-hot cacheline */
+       struct ltt_trace_ops *ops;
+       u32 freq_scale;
+       u64 start_freq;
+       u64 start_tsc;
+       unsigned long long start_monotonic;
+       struct timeval          start_time;
+       struct ltt_channel_setting *settings;
+       struct {
+               struct dentry                   *trace_root;
+               struct dentry                   *ascii_root;
+       } dentry;
+       struct kref kref; /* Each channel has a kref of the trace struct */
+       struct ltt_transport *transport;
+       struct kref ltt_transport_kref;
+       wait_queue_head_t kref_wq; /* Place for ltt_trace_destroy to sleep */
+       char trace_name[NAME_MAX];
+} ____cacheline_aligned;
+
+/* Hardcoded event headers
+ *
+ * Event header for a trace with an active heartbeat: 27-bit timestamps.
+ *
+ * Headers are 32-bit aligned. To ensure such alignment, a dynamic per-trace
+ * alignment value must be computed.
+ *
+ * Remember that the C compiler aligns each member on a boundary equal to its
+ * own size.
+ *
+ * As relay subbuffers are aligned on pages, we are sure that they are 4- and
+ * 8-byte aligned, so the buffer header and trace header are aligned.
+ *
+ * Event headers are aligned depending on the trace alignment option.
+ *
+ * Note: C structure bitfields are avoided because of cross-endianness and
+ * portability concerns.
+ */
+
+#define LTT_RESERVED_EVENTS    3
+#define LTT_EVENT_BITS         5
+#define LTT_FREE_EVENTS                ((1 << LTT_EVENT_BITS) - LTT_RESERVED_EVENTS)
+#define LTT_TSC_BITS           27
+#define LTT_TSC_MASK           ((1 << LTT_TSC_BITS) - 1)
+
+struct ltt_event_header {
+       u32 id_time;            /* 5 bits event id (MSB); 27 bits time (LSB) */
+};
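+
+/*
+ * Worked example of the packing: an event with eID = 2 and tsc = 0x1234567
+ * yields
+ *
+ *     id_time = (2 << LTT_TSC_BITS) | (0x1234567 & LTT_TSC_MASK)
+ *             = 0x10000000 | 0x01234567
+ *             = 0x11234567
+ */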
+
+/* Reservation flags */
+#define        LTT_RFLAG_ID                    (1 << 0)
+#define        LTT_RFLAG_ID_SIZE               (1 << 1)
+#define        LTT_RFLAG_ID_SIZE_TSC           (1 << 2)
+
+#define LTT_MAX_SMALL_SIZE             0xFFFFU
+
+/*
+ * We use the asm/timex.h cpu_khz/HZ variable here: we might have to deal
+ * specifically with CPU frequency scaling someday, so interpolating between
+ * the start- and end-of-buffer values is not flexible enough. Using an
+ * immediate frequency value permits calculating directly the times for parts
+ * of a buffer that precede a frequency change.
+ *
+ * Keep the natural field alignment for _each field_ within this structure if
+ * you ever add/remove a field from this header. The packed attribute is not
+ * used because gcc generates poor code on at least powerpc and mips. Don't
+ * ever let gcc add padding between the structure elements.
+ */
+struct ltt_subbuffer_header {
+       uint64_t cycle_count_begin;     /* Cycle count at subbuffer start */
+       uint64_t cycle_count_end;       /* Cycle count at subbuffer end */
+       uint32_t magic_number;          /*
+                                        * Trace magic number.
+                                        * contains endianness information.
+                                        */
+       uint8_t major_version;
+       uint8_t minor_version;
+       uint8_t arch_size;              /* Architecture pointer size */
+       uint8_t alignment;              /* LTT data alignment */
+       uint64_t start_time_sec;        /* NTP-corrected start time */
+       uint64_t start_time_usec;
+       uint64_t start_freq;            /*
+                                        * Frequency at trace start,
+                                        * used all along the trace.
+                                        */
+       uint32_t freq_scale;            /* Frequency scaling (divisor) */
+       uint32_t data_size;             /* Size of data in subbuffer */
+       uint32_t sb_size;               /* Subbuffer size (include padding) */
+       uint32_t events_lost;           /*
+                                        * Events lost in this subbuffer since
+                                        * the beginning of the trace.
+                                        * (may overflow)
+                                        */
+       uint32_t subbuf_corrupt;        /*
+                                        * Corrupted (lost) subbuffers since
+                                        * the beginning of the trace.
+                                        * (may overflow)
+                                        */
+       uint8_t header_end[0];          /* End of header */
+};
+
+/**
+ * ltt_sb_header_size - called on buffer switch to a new sub-buffer
+ *
+ * Returns the header size without padding after the structure. Don't use a
+ * packed structure, because gcc generates inefficient code on some
+ * architectures (powerpc, mips...).
+ */
+static __inline__ size_t ltt_sb_header_size(void)
+{
+       return offsetof(struct ltt_subbuffer_header, header_end);
+}
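+
+/*
+ * Using offsetof() on the zero-length header_end[] marker, rather than
+ * sizeof(), guarantees the returned size never includes the trailing
+ * padding the compiler may append to the structure.
+ */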
+
+/*
+ * ltt_get_header_size
+ *
+ * Calculate the alignment offset to 32 bits. This is the alignment offset of
+ * the event header.
+ *
+ * Important note:
+ * The event header must be 32 bits. The total offset calculated here is:
+ *
+ * Alignment of header struct on 32 bits (min arch size, header size)
+ * + sizeof(header struct)  (32 bits)
+ * + (opt) u16 (ext. event id)
+ * + (opt) u16 (event_size)
+ *             (if event_size == LTT_MAX_SMALL_SIZE, has ext. event size)
+ * + (opt) u32 (ext. event size)
+ * + (opt) u64 full TSC (aligned on min(64 bits, arch size))
+ *
+ * The payload must itself determine its own alignment from the biggest type
+ * it contains.
+ */
+static __inline__
+unsigned char ltt_get_header_size(struct ltt_chan *chan, size_t offset,
+                                 size_t data_size, size_t *before_hdr_pad,
+                                 unsigned int rflags)
+{
+       size_t orig_offset = offset;
+       size_t padding;
+
+       BUILD_BUG_ON(sizeof(struct ltt_event_header) != sizeof(u32));
+
+       padding = ltt_align(offset, sizeof(struct ltt_event_header));
+       offset += padding;
+       offset += sizeof(struct ltt_event_header);
+
+       if (unlikely(rflags)) {
+               switch (rflags) {
+               case LTT_RFLAG_ID_SIZE_TSC:
+                       offset += sizeof(u16) + sizeof(u16);
+                       if (data_size >= LTT_MAX_SMALL_SIZE)
+                               offset += sizeof(u32);
+                       offset += ltt_align(offset, sizeof(u64));
+                       offset += sizeof(u64);
+                       break;
+               case LTT_RFLAG_ID_SIZE:
+                       offset += sizeof(u16) + sizeof(u16);
+                       if (data_size >= LTT_MAX_SMALL_SIZE)
+                               offset += sizeof(u32);
+                       break;
+               case LTT_RFLAG_ID:
+                       offset += sizeof(u16);
+                       break;
+               }
+       }
+
+       *before_hdr_pad = padding;
+       return offset - orig_offset;
+}
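+
+/*
+ * Worked example (illustrative): starting at offset = 6 with
+ * rflags == LTT_RFLAG_ID_SIZE_TSC and data_size < LTT_MAX_SMALL_SIZE:
+ * 2 bytes of padding (align to 4), the 4-byte core header, 2-byte extended
+ * eID, 2-byte event size, no extended 32-bit size, no alignment gap before
+ * the TSC (offset 16 is already 8-byte aligned), then the 8-byte full TSC.
+ * The function returns 18 and sets *before_hdr_pad = 2.
+ */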
+
+extern
+size_t ltt_write_event_header_slow(struct ltt_chanbuf_alloc *bufa,
+                                  struct ltt_chan_alloc *chana,
+                                  long buf_offset, u16 eID, u32 event_size,
+                                  u64 tsc, unsigned int rflags);
+
+/*
+ * ltt_write_event_header
+ *
+ * Writes the event header at the given offset (already aligned on 32 bits).
+ *
+ * @bufa: buffer to write to.
+ * @chana: pointer to the channel structure.
+ * @buf_offset: buffer offset to write to (aligned on 32 bits).
+ * @eID: event ID
+ * @event_size: size of the event, excluding the event header.
+ * @tsc: time stamp counter.
+ * @rflags: reservation flags.
+ *
+ * Returns the offset where the event data must be written.
+ */
+static __inline__
+size_t ltt_write_event_header(struct ltt_chanbuf_alloc *bufa,
+                             struct ltt_chan_alloc *chana,
+                             long buf_offset, u16 eID, u32 event_size, u64 tsc,
+                             unsigned int rflags)
+{
+       struct ltt_event_header header;
+
+       if (unlikely(rflags))
+               goto slow_path;
+
+       header.id_time = eID << LTT_TSC_BITS;
+       header.id_time |= (u32)tsc & LTT_TSC_MASK;
+       ltt_relay_write(bufa, chana, buf_offset, &header, sizeof(header));
+       buf_offset += sizeof(header);
+
+       return buf_offset;
+
+slow_path:
+       return ltt_write_event_header_slow(bufa, chana, buf_offset,
+                                          eID, event_size, tsc, rflags);
+}
+
+/*
+ * ltt_read_event_header
+ * buf_offset must be aligned on 32 bits.
+ */
+static __inline__
+size_t ltt_read_event_header(struct ltt_chanbuf_alloc *bufa, long buf_offset,
+                            u64 *tsc, u32 *event_size, u16 *eID,
+                            unsigned int *rflags)
+{
+       struct ltt_event_header header;
+       u16 small_size;
+
+       ltt_relay_read(bufa, buf_offset, &header, sizeof(header));
+       buf_offset += sizeof(header);
+
+       *event_size = INT_MAX;
+       *eID = header.id_time >> LTT_TSC_BITS;
+       *tsc = header.id_time & LTT_TSC_MASK;
+
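+       /*
+        * With LTT_EVENT_BITS = 5 and LTT_RESERVED_EVENTS = 3, the three
+        * highest 5-bit IDs (29, 30 and 31) are reserved escape values
+        * announcing the extended headers written by
+        * ltt_write_event_header_slow().
+        */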
+       switch (*eID) {
+       case 29:
+               *rflags = LTT_RFLAG_ID_SIZE_TSC;
+               ltt_relay_read(bufa, buf_offset, eID, sizeof(u16));
+               buf_offset += sizeof(u16);
+               ltt_relay_read(bufa, buf_offset, &small_size, sizeof(u16));
+               buf_offset += sizeof(u16);
+               if (small_size == LTT_MAX_SMALL_SIZE) {
+                       ltt_relay_read(bufa, buf_offset, event_size,
+                                       sizeof(u32));
+                       buf_offset += sizeof(u32);
+               } else
+                       *event_size = small_size;
+               buf_offset += ltt_align(buf_offset, sizeof(u64));
+               ltt_relay_read(bufa, buf_offset, tsc, sizeof(u64));
+               buf_offset += sizeof(u64);
+               break;
+       case 30:
+               *rflags = LTT_RFLAG_ID_SIZE;
+               ltt_relay_read(bufa, buf_offset, eID, sizeof(u16));
+               buf_offset += sizeof(u16);
+               ltt_relay_read(bufa, buf_offset, &small_size, sizeof(u16));
+               buf_offset += sizeof(u16);
+               if (small_size == LTT_MAX_SMALL_SIZE) {
+                       ltt_relay_read(bufa, buf_offset, event_size,
+                                       sizeof(u32));
+                       buf_offset += sizeof(u32);
+               } else
+                       *event_size = small_size;
+               break;
+       case 31:
+               *rflags = LTT_RFLAG_ID;
+               ltt_relay_read(bufa, buf_offset, eID, sizeof(u16));
+               buf_offset += sizeof(u16);
+               break;
+       default:
+               *rflags = 0;
+               break;
+       }
+
+       return buf_offset;
+}
+
+/* Lockless LTTng */
+
+/* Buffer offset macros */
+
+/*
+ * BUFFER_TRUNC zeroes the subbuffer offset and the subbuffer number parts of
+ * the offset, which leaves only the buffer number.
+ */
+#define BUFFER_TRUNC(offset, chan) \
+       ((offset) & (~((chan)->a.buf_size - 1)))
+#define BUFFER_OFFSET(offset, chan) ((offset) & ((chan)->a.buf_size - 1))
+#define SUBBUF_OFFSET(offset, chan) ((offset) & ((chan)->a.sb_size - 1))
+#define SUBBUF_ALIGN(offset, chan) \
+       (((offset) + (chan)->a.sb_size) & (~((chan)->a.sb_size - 1)))
+#define SUBBUF_TRUNC(offset, chan) \
+       ((offset) & (~((chan)->a.sb_size - 1)))
+#define SUBBUF_INDEX(offset, chan) \
+       (BUFFER_OFFSET((offset), chan) >> (chan)->a.sb_size_order)
+
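+/*
+ * Example (hypothetical sizes): with sb_size = 4096 (sb_size_order = 12)
+ * and buf_size = 8192, offset 5000 gives SUBBUF_OFFSET = 904,
+ * SUBBUF_INDEX = 1, SUBBUF_TRUNC = 4096 and SUBBUF_ALIGN = 8192.
+ */
+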
+/*
+ * Control channels :
+ * control/metadata
+ * control/interrupts
+ * control/...
+ *
+ * cpu channel :
+ * cpu
+ */
+#define LTT_RELAY_ROOT                 "ltt"
+#define LTT_RELAY_LOCKED_ROOT          "ltt-locked"
+
+#define LTT_METADATA_CHANNEL           "metadata_state"
+#define LTT_FD_STATE_CHANNEL           "fd_state"
+#define LTT_GLOBAL_STATE_CHANNEL       "global_state"
+#define LTT_IRQ_STATE_CHANNEL          "irq_state"
+#define LTT_MODULE_STATE_CHANNEL       "module_state"
+#define LTT_NETIF_STATE_CHANNEL                "netif_state"
+#define LTT_SOFTIRQ_STATE_CHANNEL      "softirq_state"
+#define LTT_SWAP_STATE_CHANNEL         "swap_state"
+#define LTT_SYSCALL_STATE_CHANNEL      "syscall_state"
+#define LTT_TASK_STATE_CHANNEL         "task_state"
+#define LTT_VM_STATE_CHANNEL           "vm_state"
+#define LTT_FS_CHANNEL                 "fs"
+#define LTT_INPUT_CHANNEL              "input"
+#define LTT_IPC_CHANNEL                        "ipc"
+#define LTT_KERNEL_CHANNEL             "kernel"
+#define LTT_MM_CHANNEL                 "mm"
+#define LTT_RCU_CHANNEL                        "rcu"
+
+#define LTT_FLIGHT_PREFIX              "flight-"
+
+#define LTT_ASCII                      "ascii"
+
+/* Tracer properties */
+#define LTT_DEFAULT_SUBBUF_SIZE_LOW    65536
+#define LTT_DEFAULT_N_SUBBUFS_LOW      2
+#define LTT_DEFAULT_SUBBUF_SIZE_MED    262144
+#define LTT_DEFAULT_N_SUBBUFS_MED      2
+#define LTT_DEFAULT_SUBBUF_SIZE_HIGH   1048576
+#define LTT_DEFAULT_N_SUBBUFS_HIGH     2
+#define LTT_TRACER_MAGIC_NUMBER                0x00D6B7ED
+#define LTT_TRACER_VERSION_MAJOR       2
+#define LTT_TRACER_VERSION_MINOR       6
+
+/**
+ * ltt_write_trace_header - Write trace header
+ * @trace: Trace information
+ * @header: Memory address where the information must be written to
+ */
+static __inline__
+void ltt_write_trace_header(struct ltt_trace *trace,
+                           struct ltt_subbuffer_header *header)
+{
+       header->magic_number = LTT_TRACER_MAGIC_NUMBER;
+       header->major_version = LTT_TRACER_VERSION_MAJOR;
+       header->minor_version = LTT_TRACER_VERSION_MINOR;
+       header->arch_size = sizeof(void *);
+       header->alignment = ltt_get_alignment();
+       header->start_time_sec = trace->start_time.tv_sec;
+       header->start_time_usec = trace->start_time.tv_usec;
+       header->start_freq = trace->start_freq;
+       header->freq_scale = trace->freq_scale;
+}
+
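+/*
+ * A trace reader is expected to validate these fields before decoding; a
+ * minimal sketch (not part of this API):
+ *
+ *     if (header->magic_number != LTT_TRACER_MAGIC_NUMBER
+ *         || header->major_version != LTT_TRACER_VERSION_MAJOR)
+ *             reject the sub-buffer;
+ */
+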
+/*
+ * Size reserved for high priority events (interrupts, NMI, BH) at the end of a
+ * nearly full buffer. User space won't use this last amount of space when in
+ * blocking mode. This space also includes the event header that would be
+ * written by this user space event.
+ */
+#define LTT_RESERVE_CRITICAL           4096
+
+/* Register and unregister function pointers */
+
+enum ltt_module_function {
+       LTT_FUNCTION_RUN_FILTER,
+       LTT_FUNCTION_FILTER_CONTROL,
+       LTT_FUNCTION_STATEDUMP
+};
+
+extern int ltt_module_register(enum ltt_module_function name, void *function,
+                              struct module *owner);
+extern void ltt_module_unregister(enum ltt_module_function name);
+
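+/*
+ * Registration sketch (my_statedump is a hypothetical callback; the expected
+ * signature depends on the function slot being registered, hence the void *):
+ *
+ *     err = ltt_module_register(LTT_FUNCTION_STATEDUMP, my_statedump,
+ *                               THIS_MODULE);
+ *     ...
+ *     ltt_module_unregister(LTT_FUNCTION_STATEDUMP);
+ */
+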
+void ltt_transport_register(struct ltt_transport *transport);
+void ltt_transport_unregister(struct ltt_transport *transport);
+
+/* Exported control function */
+
+enum ltt_control_msg {
+       LTT_CONTROL_START,
+       LTT_CONTROL_STOP,
+       LTT_CONTROL_CREATE_TRACE,
+       LTT_CONTROL_DESTROY_TRACE
+};
+
+union ltt_control_args {
+       struct {
+               enum trace_mode mode;
+               unsigned int subbuf_size_low;
+               unsigned int n_subbufs_low;
+               unsigned int subbuf_size_med;
+               unsigned int n_subbufs_med;
+               unsigned int subbuf_size_high;
+               unsigned int n_subbufs_high;
+       } new_trace;
+};
+
+int _ltt_trace_setup(const char *trace_name);
+int ltt_trace_setup(const char *trace_name);
+struct ltt_trace *_ltt_trace_find_setup(const char *trace_name);
+int ltt_trace_set_type(const char *trace_name, const char *trace_type);
+int ltt_trace_set_channel_subbufsize(const char *trace_name,
+                                    const char *channel_name,
+                                    unsigned int size);
+int ltt_trace_set_channel_subbufcount(const char *trace_name,
+                                     const char *channel_name,
+                                     unsigned int cnt);
+int ltt_trace_set_channel_switch_timer(const char *trace_name,
+                                      const char *channel_name,
+                                      unsigned long interval);
+int ltt_trace_set_channel_enable(const char *trace_name,
+                                const char *channel_name,
+                                unsigned int enable);
+int ltt_trace_set_channel_overwrite(const char *trace_name,
+                                   const char *channel_name,
+                                   unsigned int overwrite);
+int ltt_trace_alloc(const char *trace_name);
+int ltt_trace_destroy(const char *trace_name);
+int ltt_trace_start(const char *trace_name);
+int ltt_trace_stop(const char *trace_name);
+
+extern int ltt_control(enum ltt_control_msg msg, const char *trace_name,
+                      const char *trace_type, union ltt_control_args args);
+
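+/*
+ * Usage sketch (illustrative only; the trace name, transport name and mode
+ * constant are assumptions):
+ *
+ *     union ltt_control_args args;
+ *
+ *     args.new_trace.mode = LTT_TRACE_NORMAL;
+ *     args.new_trace.subbuf_size_low = LTT_DEFAULT_SUBBUF_SIZE_LOW;
+ *     args.new_trace.n_subbufs_low = LTT_DEFAULT_N_SUBBUFS_LOW;
+ *     ...
+ *     ltt_control(LTT_CONTROL_CREATE_TRACE, "mytrace", "relay", args);
+ *     ltt_control(LTT_CONTROL_START, "mytrace", NULL, args);
+ */
+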
+enum ltt_filter_control_msg {
+       LTT_FILTER_DEFAULT_ACCEPT,
+       LTT_FILTER_DEFAULT_REJECT
+};
+
+extern int ltt_filter_control(enum ltt_filter_control_msg msg,
+                             const char *trace_name);
+
+extern struct dentry *get_filter_root(void);
+
+void ltt_core_register(int (*function)(u8, void *));
+
+void ltt_core_unregister(void);
+
+void ltt_release_trace(struct kref *kref);
+void ltt_release_transport(struct kref *kref);
+
+extern int ltt_probe_register(struct ltt_available_probe *pdata);
+extern int ltt_probe_unregister(struct ltt_available_probe *pdata);
+extern int ltt_marker_connect(const char *channel, const char *mname,
+                             const char *pname);
+extern int ltt_marker_disconnect(const char *channel, const char *mname,
+                                const char *pname);
+extern void ltt_dump_marker_state(struct ltt_trace *trace);
+
+void ltt_lock_traces(void);
+void ltt_unlock_traces(void);
+
+extern int ltt_ascii_create_dir(struct ltt_trace *new_trace);
+extern void ltt_ascii_remove_dir(struct ltt_trace *trace);
+extern int ltt_ascii_create(struct ltt_chan *chan);
+extern void ltt_ascii_remove(struct ltt_chan *chan);
+
+extern
+void ltt_statedump_register_kprobes_dump(void (*callback)(void *call_data));
+extern
+void ltt_statedump_unregister_kprobes_dump(void (*callback)(void *call_data));
+
+extern void ltt_dump_softirq_vec(void *call_data);
+
+extern void ltt_dump_sys_call_table(void *call_data);
+extern void ltt_dump_idt_table(void *call_data);
+
+/* Relay IOCTL */
+
+/* Get the next sub-buffer that can be read. */
+#define RELAY_GET_SB                   _IOR(0xF5, 0x00, __u32)
+/* Release the oldest reserved (by "get") sub-buffer. */
+#define RELAY_PUT_SB                   _IOW(0xF5, 0x01, __u32)
+/* Returns the number of sub-buffers in the per-CPU channel. */
+#define RELAY_GET_N_SB                 _IOR(0xF5, 0x02, __u32)
+/* Returns the size of the current sub-buffer. */
+#define RELAY_GET_SB_SIZE              _IOR(0xF5, 0x03, __u32)
+/* Returns the maximum size for sub-buffers. */
+#define RELAY_GET_MAX_SB_SIZE          _IOR(0xF5, 0x04, __u32)
+
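+/*
+ * User-space consumer sketch (illustration; error handling omitted):
+ *
+ *     __u32 consumed;
+ *
+ *     ioctl(fd, RELAY_GET_SB, &consumed);     reserve oldest sub-buffer
+ *     ... read or splice the sub-buffer contents ...
+ *     ioctl(fd, RELAY_PUT_SB, &consumed);     release it to the writer
+ */
+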
+#endif /* _LTT_TRACER_H */
diff --git a/ltt-type-serializer.c b/ltt-type-serializer.c
new file mode 100644 (file)
index 0000000..cb92aee
--- /dev/null
@@ -0,0 +1,113 @@
+/**
+ * ltt-type-serializer.c
+ *
+ * LTTng specialized type serializer.
+ *
+ * Copyright Mathieu Desnoyers, 2008.
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+#include <linux/module.h>
+
+#include "ltt-type-serializer.h"
+#include "ltt-relay-lockless.h"
+
+notrace
+void _ltt_specialized_trace(const struct marker *mdata, void *probe_data,
+               void *serialize_private, unsigned int data_size,
+               unsigned int largest_align)
+{
+       int ret;
+       uint16_t eID;
+       size_t slot_size;
+       unsigned int chan_index;
+       struct ltt_chanbuf *buf;
+       struct ltt_chan *chan;
+       struct ltt_trace *trace;
+       uint64_t tsc;
+       long buf_offset;
+       int cpu;
+       unsigned int rflags;
+
+       /*
+        * If we get here, it's probably because we have useful work to do.
+        */
+       if (unlikely(ltt_traces.num_active_traces == 0))
+               return;
+
+       rcu_read_lock_sched_notrace();
+       cpu = smp_processor_id();
+       __get_cpu_var(ltt_nesting)++;
+       /*
+        * asm volatile and "memory" clobber prevent the compiler from moving
+        * instructions out of the ltt nesting count. This is required to ensure
+        * that probe side-effects which can cause recursion (e.g. unforeseen
+        * traps, divisions by 0, ...) are triggered within the incremented
+        * nesting count section.
+        */
+       barrier();
+       eID = mdata->event_id;
+       chan_index = mdata->channel_id;
+
+       /*
+        * Iterate on each trace. The number of active traces is typically
+        * small, so list iteration with prefetch is usually slower.
+        */
+       __list_for_each_entry_rcu(trace, &ltt_traces.head, list) {
+               if (unlikely(!trace->active))
+                       continue;
+               if (unlikely(!ltt_run_filter(trace, eID)))
+                       continue;
+#ifdef CONFIG_LTT_DEBUG_EVENT_SIZE
+               rflags = LTT_RFLAG_ID_SIZE;
+#else
+               if (unlikely(eID >= LTT_FREE_EVENTS))
+                       rflags = LTT_RFLAG_ID;
+               else
+                       rflags = 0;
+#endif
+               /*
+                * Skip channels added after trace creation.
+                */
+               if (unlikely(chan_index >= trace->nr_channels))
+                       continue;
+               chan = &trace->channels[chan_index];
+               if (!chan->active)
+                       continue;
+
+               /* reserve space : header and data */
+               ret = ltt_reserve_slot(chan, trace, data_size, largest_align,
+                                      cpu, &buf, &slot_size, &buf_offset, &tsc,
+                                      &rflags);
+               if (unlikely(ret < 0))
+                       continue; /* buffer full */
+
+               /* Out-of-order write : header and data */
+               buf_offset = ltt_write_event_header(&buf->a, &chan->a,
+                                                   buf_offset, eID, data_size,
+                                                   tsc, rflags);
+               if (data_size) {
+                       buf_offset += ltt_align(buf_offset, largest_align);
+                       ltt_relay_write(&buf->a, &chan->a, buf_offset,
+                                       serialize_private, data_size);
+                       buf_offset += data_size;
+               }
+               /* Out-of-order commit */
+               ltt_commit_slot(buf, chan, buf_offset, data_size, slot_size);
+       }
+       /*
+        * asm volatile and "memory" clobber prevent the compiler from moving
+        * instructions out of the ltt nesting count. This is required to ensure
+        * that probe side-effects which can cause recursion (e.g. unforeseen
+        * traps, divisions by 0, ...) are triggered within the incremented
+        * nesting count section.
+        */
+       barrier();
+       __get_cpu_var(ltt_nesting)--;
+       rcu_read_unlock_sched_notrace();
+}
+EXPORT_SYMBOL_GPL(_ltt_specialized_trace);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("LTT type serializer");
diff --git a/ltt-type-serializer.h b/ltt-type-serializer.h
new file mode 100644 (file)
index 0000000..3e5cd3c
--- /dev/null
@@ -0,0 +1,187 @@
+#ifndef _LTT_TYPE_SERIALIZER_H
+#define _LTT_TYPE_SERIALIZER_H
+
+#include <linux/if.h>  /* For IFNAMSIZ */
+
+#include "ltt-tracer.h"
+
+/*
+ * largest_align must be non-zero and equal to the minimum of the alignment of
+ * the largest type and sizeof(void *).
+ */
+extern void _ltt_specialized_trace(const struct marker *mdata, void *probe_data,
+               void *serialize_private, unsigned int data_size,
+               unsigned int largest_align);
+
+/*
+ * Clamp largest_align to the range 1 <= largest_align <= sizeof(void *) so the
+ * call is dumb-proof: 0 is turned into 1, and a type wider than a pointer
+ * (e.g. unsigned long long on a 32-bit architecture) is turned into
+ * sizeof(void *). The min_t/max_t pair folds to a constant when the caller
+ * passes a constant largest_align.
+ */
+static inline void ltt_specialized_trace(const struct marker *mdata,
+               void *probe_data,
+               void *serialize_private, unsigned int data_size,
+               unsigned int largest_align)
+{
+       largest_align = min_t(unsigned int, largest_align, sizeof(void *));
+       largest_align = max_t(unsigned int, largest_align, 1);
+       _ltt_specialized_trace(mdata, probe_data, serialize_private, data_size,
+               largest_align);
+}
+
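+/*
+ * Typical call from a probe (sketch; the marker data and field values are
+ * illustrative):
+ *
+ *     struct serialize_long_int data;
+ *
+ *     data.f1 = some_long;
+ *     data.f2 = some_int;
+ *     ltt_specialized_trace(mdata, probe_data, &data,
+ *                           serialize_sizeof(data), sizeof(long));
+ */
+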
+/*
+ * Type serializer definitions.
+ */
+
+/*
+ * Return size of structure without end-of-structure padding.
+ */
+#define serialize_sizeof(type) offsetof(typeof(type), end_field)
+
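+/*
+ * Example: on LP64, serialize_sizeof(struct serialize_long_int) is 12
+ * (8 + 4), whereas sizeof() may round the structure up to 16 for alignment.
+ */
+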
+struct serialize_long_int {
+       unsigned long f1;
+       unsigned int f2;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_int_int_long {
+       unsigned int f1;
+       unsigned int f2;
+       unsigned long f3;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_int_int_short {
+       unsigned int f1;
+       unsigned int f2;
+       unsigned short f3;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_long_long {
+       unsigned long f1;
+       unsigned long f2;
+       unsigned long f3;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_long_int {
+       unsigned long f1;
+       unsigned long f2;
+       unsigned int f3;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_long_short_char {
+       unsigned long f1;
+       unsigned long f2;
+       unsigned short f3;
+       unsigned char f4;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_long_short {
+       unsigned long f1;
+       unsigned long f2;
+       unsigned short f3;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_short_char {
+       unsigned long f1;
+       unsigned short f2;
+       unsigned char f3;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_short {
+       unsigned long f1;
+       unsigned short f2;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_char {
+       unsigned long f1;
+       unsigned char f2;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_ifname {
+       unsigned long f1;
+       unsigned char f2[IFNAMSIZ];
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_sizet_int {
+       size_t f1;
+       unsigned int f2;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_long_sizet_int {
+       unsigned long f1;
+       unsigned long f2;
+       size_t f3;
+       unsigned int f4;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_long_long_sizet_int_int {
+       unsigned long f1;
+       unsigned long f2;
+       size_t f3;
+       unsigned int f4;
+       unsigned int f5;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_l4421224411111 {
+       unsigned long f1;
+       uint32_t f2;
+       uint32_t f3;
+       uint16_t f4;
+       uint8_t f5;
+       uint16_t f6;
+       uint16_t f7;
+       uint32_t f8;
+       uint32_t f9;
+       uint8_t f10;
+       uint8_t f11;
+       uint8_t f12;
+       uint8_t f13;
+       uint8_t f14;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+
+struct serialize_l214421224411111 {
+       unsigned long f1;
+       uint16_t f2;
+       uint8_t f3;
+       uint32_t f4;
+       uint32_t f5;
+       uint16_t f6;
+       uint8_t f7;
+       uint16_t f8;
+       uint16_t f9;
+       uint32_t f10;
+       uint32_t f11;
+       uint8_t f12;
+       uint8_t f13;
+       uint8_t f14;
+       uint8_t f15;
+       uint8_t f16;
+       uint8_t end_field[0];
+} LTT_ALIGN;
+
+struct serialize_l4412228 {
+       unsigned long f1;
+       uint32_t f2;
+       uint32_t f3;
+       uint8_t f4;
+       uint16_t f5;
+       uint16_t f6;
+       uint16_t f7;
+       uint64_t f8;
+       unsigned char end_field[0];
+} LTT_ALIGN;
+#endif /* _LTT_TYPE_SERIALIZER_H */
diff --git a/ltt-userspace-event.c b/ltt-userspace-event.c
new file mode 100644 (file)
index 0000000..c716d72
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2008 Mathieu Desnoyers
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/marker.h>
+#include <linux/uaccess.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+
+#include "ltt-type-serializer.h"
+
+#define LTT_WRITE_EVENT_FILE   "write_event"
+
+DEFINE_MARKER(userspace, event, "string %s");
+static struct dentry *ltt_event_file;
+
+/**
+ * write_event - write a userspace string into the trace system
+ * @file: file pointer
+ * @user_buf: user string
+ * @count: length to copy, including the final NUL ('\0')
+ * @ppos: unused
+ *
+ * Copy a string into a trace event, in channel "userspace", event "event".
+ * Copies until either \n or \0 is reached.
+ * On success, returns the number of bytes copied from the source, including the
+ * \n or \0 character (if there was one in the count range). It cannot return
+ * more than count.
+ * Inspired from tracing_mark_write implementation from Steven Rostedt and
+ * Ingo Molnar.
+ */
+static
+ssize_t write_event(struct file *file, const char __user *user_buf,
+                   size_t count, loff_t *ppos)
+{
+       struct marker *marker;
+       char *buf, *end;
+       long copycount;
+       ssize_t ret;
+
+       buf = kmalloc(count + 1, GFP_KERNEL);
+       if (!buf) {
+               ret = -ENOMEM;
+               goto string_out;
+       }
+       copycount = strncpy_from_user(buf, user_buf, count);
+       if (copycount < 0) {
+               ret = -EFAULT;
+               goto string_err;
+       }
+       /* Cut at the first NUL or newline. */
+       buf[copycount] = '\0';
+       end = strchr(buf, '\n');
+       if (end) {
+               *end = '\0';
+               copycount = end - buf;
+       }
+       /* Add final \0 to copycount */
+       copycount++;
+       marker = &GET_MARKER(userspace, event);
+       ltt_specialized_trace(marker, marker->single.probe_private, buf,
+                             copycount, sizeof(char));
+       /* If there is no \0 or \n within count bytes, do not return a larger value */
+       ret = min_t(size_t, copycount, count);
+string_err:
+       kfree(buf);
+string_out:
+       return ret;
+}
+
+static const struct file_operations ltt_userspace_operations = {
+       .write = write_event,
+};
+
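+/*
+ * Example (assuming debugfs is mounted on /sys/kernel/debug and the LTT
+ * root directory is named "ltt"):
+ *
+ *     echo -n "hello world" > /sys/kernel/debug/ltt/write_event
+ */
+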
+static int __init ltt_userspace_init(void)
+{
+       struct dentry *ltt_root_dentry;
+       int err = 0;
+
+       ltt_root_dentry = get_ltt_root();
+       if (!ltt_root_dentry) {
+               err = -ENOENT;
+               goto err_no_root;
+       }
+
+       ltt_event_file = debugfs_create_file(LTT_WRITE_EVENT_FILE,
+                                            S_IWUGO,
+                                            ltt_root_dentry,
+                                            NULL,
+                                            &ltt_userspace_operations);
+       if (IS_ERR(ltt_event_file) || !ltt_event_file) {
+               printk(KERN_ERR
+                       "ltt_userspace_init: failed to create file %s\n",
+                       LTT_WRITE_EVENT_FILE);
+               err = -EPERM;
+               goto err_no_file;
+       }
+
+       return err;
+err_no_file:
+       put_ltt_root();
+err_no_root:
+       return err;
+}
+
+static void __exit ltt_userspace_exit(void)
+{
+       debugfs_remove(ltt_event_file);
+       put_ltt_root();
+}
+
+module_init(ltt_userspace_init);
+module_exit(ltt_userspace_exit);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>");
+MODULE_DESCRIPTION("Linux Trace Toolkit Userspace Event");
diff --git a/probes/Makefile b/probes/Makefile
new file mode 100644 (file)
index 0000000..d8f1c40
--- /dev/null
@@ -0,0 +1,47 @@
+# LTTng tracing probes
+
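+# When the function tracer is built in, compile the probes without -pg so
+# they are not themselves instrumented (avoids recursing into the tracer).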
+ifdef CONFIG_FTRACE
+CFLAGS_REMOVE_kernel-trace.o = -pg
+CFLAGS_REMOVE_mm-trace.o = -pg
+CFLAGS_REMOVE_fs-trace.o = -pg
+CFLAGS_REMOVE_ipc-trace.o = -pg
+CFLAGS_REMOVE_lockdep-trace.o = -pg
+CFLAGS_REMOVE_rcu-trace.o = -pg
+CFLAGS_REMOVE_syscall-trace.o = -pg
+CFLAGS_REMOVE_trap-trace.o = -pg
+CFLAGS_REMOVE_pm-trace.o = -pg
+endif
+
+obj-m += kernel-trace.o mm-trace.o fs-trace.o ipc-trace.o lockdep-trace.o \
+        rcu-trace.o syscall-trace.o trap-trace.o pm-trace.o
+
+ifeq ($(CONFIG_NET),y)
+ifdef CONFIG_FTRACE
+CFLAGS_REMOVE_net-trace.o = -pg
+CFLAGS_REMOVE_net-extended-trace.o = -pg
+endif
+obj-m += net-trace.o net-extended-trace.o
+endif
+
+ifdef CONFIG_JBD2
+ifdef CONFIG_FTRACE
+CFLAGS_REMOVE_jbd2-trace.o = -pg
+endif
+obj-m += jbd2-trace.o
+endif
+
+#ifdef CONFIG_EXT4_FS
+#ifdef CONFIG_FTRACE
+#CFLAGS_REMOVE_ext4-trace.o = -pg
+#endif
+#obj-$(CONFIG_LTT_TRACEPROBES) += ext4-trace.o
+#endif
+
+ifdef CONFIG_BLOCK
+ifdef CONFIG_FTRACE
+CFLAGS_REMOVE_block-trace.o = -pg
+endif
+obj-m += block-trace.o
+endif
+
diff --git a/probes/block-trace.c b/probes/block-trace.c
new file mode 100644 (file)
index 0000000..51ae2cd
--- /dev/null
@@ -0,0 +1,309 @@
+/*
+ * ltt/probes/block-trace.c
+ *
+ * block layer tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+
+#include <trace/events/block.h>
+
+/*
+ * TODO: add the rq cmd as a sequence; needs a new type (size + binary blob).
+ */
+
+void probe_block_rq_abort(void *data, struct request_queue *q, struct request *rq)
+{
+       int rw = rq->cmd_flags & 0x03;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       if (blk_pc_request(rq)) {
+               trace_mark_tp(block, rq_abort_pc, block_rq_abort,
+                       probe_block_rq_abort,
+                       "data_len %u rw %d errors %d",
+                       blk_rq_bytes(rq), rw, rq->errors);
+       } else {
+               /*
+                * FIXME Using a simple trace_mark for the second event
+                * possibility because tracepoints do not support multiple
+                * connections to the same probe yet. They should have some
+                * refcounting. Need to enable both rq_abort_pc and rq_abort_fs
+                * markers to have the rq_abort_fs marker enabled.
+                */
+               trace_mark(block, rq_abort_fs,
+                       "hard_sector %llu "
+                       "rw %d errors %d", (unsigned long long)blk_rq_pos(rq),
+                       rw, rq->errors);
+       }
+}
+
+void probe_block_rq_insert(void *data, struct request_queue *q, struct request *rq)
+{
+       int rw = rq->cmd_flags & 0x03;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       if (blk_pc_request(rq)) {
+               trace_mark_tp(block, rq_insert_pc, block_rq_insert,
+                       probe_block_rq_insert,
+                       "data_len %u rw %d errors %d",
+                       blk_rq_bytes(rq), rw, rq->errors);
+       } else {
+               /*
+                * FIXME Using a simple trace_mark for the second event
+                * possibility because tracepoints do not support multiple
+                * connections to the same probe yet. They should have some
+                * refcounting. Need to enable both rq_insert_pc and
+                * rq_insert_fs markers to have the rq_insert_fs marker enabled.
+                */
+               trace_mark(block, rq_insert_fs,
+                       "hard_sector %llu "
+                       "rw %d errors %d", (unsigned long long)blk_rq_pos(rq),
+                       rw, rq->errors);
+       }
+}
+
+void probe_block_rq_issue(void *data, struct request_queue *q, struct request *rq)
+{
+       int rw = rq->cmd_flags & 0x03;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       if (blk_pc_request(rq)) {
+               trace_mark_tp(block, rq_issue_pc, block_rq_issue,
+                       probe_block_rq_issue,
+                       "data_len %u rw %d errors %d",
+                       blk_rq_bytes(rq), rw, rq->errors);
+       } else {
+               /*
+                * FIXME Using a simple trace_mark for the second event
+                * possibility because tracepoints do not support multiple
+                * connections to the same probe yet. They should have some
+                * refcounting. Need to enable both rq_issue_pc and rq_issue_fs
+                * markers to have the rq_issue_fs marker enabled.
+                */
+               trace_mark(block, rq_issue_fs,
+                       "hard_sector %llu "
+                       "rw %d errors %d", (unsigned long long)blk_rq_pos(rq),
+                       rw, rq->errors);
+       }
+}
+
+void probe_block_rq_requeue(void *data, struct request_queue *q, struct request *rq)
+{
+       int rw = rq->cmd_flags & 0x03;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       if (blk_pc_request(rq)) {
+               trace_mark_tp(block, rq_requeue_pc, block_rq_requeue,
+                       probe_block_rq_requeue,
+                       "data_len %u rw %d errors %d",
+                       blk_rq_bytes(rq), rw, rq->errors);
+       } else {
+               /*
+                * FIXME Using a simple trace_mark for the second event
+                * possibility because tracepoints do not support multiple
+                * connections to the same probe yet. They should have some
+                * refcounting. Need to enable both rq_requeue_pc and
+                * rq_requeue_fs markers to have the rq_requeue_fs marker
+                * enabled.
+                */
+               trace_mark(block, rq_requeue_fs,
+                       "hard_sector %llu "
+                       "rw %d errors %d", (unsigned long long)blk_rq_pos(rq),
+                       rw, rq->errors);
+       }
+}
+
+void probe_block_rq_complete(void *data, struct request_queue *q, struct request *rq)
+{
+       int rw = rq->cmd_flags & 0x03;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       if (blk_pc_request(rq)) {
+               trace_mark_tp(block, rq_complete_pc, block_rq_complete,
+                       probe_block_rq_complete,
+                       "data_len %u rw %d errors %d",
+                       blk_rq_bytes(rq), rw, rq->errors);
+       } else {
+               /*
+                * FIXME Using a simple trace_mark for the second event
+                * possibility because tracepoints do not support multiple
+                * connections to the same probe yet. They should have some
+                * refcounting. Need to enable both rq_complete_pc and
+                * rq_complete_fs markers to have the rq_complete_fs marker
+                * enabled.
+                */
+               trace_mark(block, rq_complete_fs,
+                       "hard_sector %llu "
+                       "rw %d errors %d", (unsigned long long)blk_rq_pos(rq),
+                       rw, rq->errors);
+       }
+}
+
+void probe_block_bio_bounce(void *data, struct request_queue *q, struct bio *bio)
+{
+       trace_mark_tp(block, bio_bounce, block_bio_bounce,
+               probe_block_bio_bounce,
+               "sector %llu size %u rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+               "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+               "not_uptodate #1u%d",
+               (unsigned long long)bio->bi_sector, bio->bi_size,
+               bio->bi_rw, !bio_flagged(bio, BIO_UPTODATE));
+}
+
+void probe_block_bio_complete(void *data, struct request_queue *q, struct bio *bio)
+{
+       trace_mark_tp(block, bio_complete, block_bio_complete,
+               probe_block_bio_complete,
+               "sector %llu size %u rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+               "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+               "not_uptodate #1u%d",
+               (unsigned long long)bio->bi_sector, bio->bi_size,
+               bio->bi_rw, !bio_flagged(bio, BIO_UPTODATE));
+}
+
+void probe_block_bio_backmerge(void *data, struct request_queue *q, struct bio *bio)
+{
+       trace_mark_tp(block, bio_backmerge, block_bio_backmerge,
+               probe_block_bio_backmerge,
+               "sector %llu size %u rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+               "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+               "not_uptodate #1u%d",
+               (unsigned long long)bio->bi_sector, bio->bi_size,
+               bio->bi_rw, !bio_flagged(bio, BIO_UPTODATE));
+}
+
+void probe_block_bio_frontmerge(void *data, struct request_queue *q, struct bio *bio)
+{
+       trace_mark_tp(block, bio_frontmerge, block_bio_frontmerge,
+               probe_block_bio_frontmerge,
+               "sector %llu size %u rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+               "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+               "not_uptodate #1u%d",
+               (unsigned long long)bio->bi_sector, bio->bi_size,
+               bio->bi_rw, !bio_flagged(bio, BIO_UPTODATE));
+}
+
+void probe_block_bio_queue(void *data, struct request_queue *q, struct bio *bio)
+{
+       trace_mark_tp(block, bio_queue, block_bio_queue,
+               probe_block_bio_queue,
+               "sector %llu size %u rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+               "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+               "not_uptodate #1u%d",
+               (unsigned long long)bio->bi_sector, bio->bi_size,
+               bio->bi_rw, !bio_flagged(bio, BIO_UPTODATE));
+}
+
+void probe_block_getrq(void *data, struct request_queue *q, struct bio *bio, int rw)
+{
+       if (bio) {
+               trace_mark_tp(block, getrq_bio, block_getrq,
+                       probe_block_getrq,
+                       "sector %llu size %u "
+                       "rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+                       "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+                       "not_uptodate #1u%d",
+                       (unsigned long long)bio->bi_sector, bio->bi_size,
+                       bio->bi_rw, !bio_flagged(bio, BIO_UPTODATE));
+       } else {
+               /*
+                * FIXME Using a simple trace_mark for the second event
+                * possibility because tracepoints do not support multiple
+                * connections to the same probe yet. They should have some
+                * refcounting. Need to enable both getrq_bio and getrq markers
+                * to have the getrq marker enabled.
+                */
+               trace_mark(block, getrq, "rw %d", rw);
+       }
+}
+
+void probe_block_sleeprq(void *data, struct request_queue *q, struct bio *bio, int rw)
+{
+       if (bio) {
+               trace_mark_tp(block, sleeprq_bio, block_sleeprq,
+                       probe_block_sleeprq,
+                       "sector %llu size %u "
+                       "rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+                       "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+                       "not_uptodate #1u%d",
+                       (unsigned long long)bio->bi_sector, bio->bi_size,
+                       bio->bi_rw, !bio_flagged(bio, BIO_UPTODATE));
+       } else {
+               /*
+                * FIXME Using a simple trace_mark for the second event
+                * possibility because tracepoints do not support multiple
+                * connections to the same probe yet. They should have some
+                * refcounting. Need to enable both sleeprq_bio and sleeprq
+                * markers to have the sleeprq marker enabled.
+                */
+               trace_mark(block, sleeprq, "rw %d", rw);
+       }
+}
+
+void probe_block_plug(void *data, struct request_queue *q)
+{
+       trace_mark_tp(block, plug, block_plug, probe_block_plug,
+                        MARK_NOARGS);
+}
+
+void probe_block_unplug_io(void *data, struct request_queue *q)
+{
+       unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+
+       trace_mark_tp(block, unplug_io, block_unplug_io, probe_block_unplug_io,
+                       "pdu %u", pdu);
+}
+
+void probe_block_unplug_timer(void *data, struct request_queue *q)
+{
+       unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+
+       trace_mark_tp(block, unplug_timer, block_unplug_timer,
+                       probe_block_unplug_timer,
+                       "pdu %u", pdu);
+}
+
+void probe_block_split(void *data, struct request_queue *q, struct bio *bio,
+                      unsigned int pdu)
+{
+       trace_mark_tp(block, split, block_split,
+               probe_block_split,
+               "sector %llu size %u rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+               "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+               "not_uptodate #1u%d pdu %u",
+               (unsigned long long)bio->bi_sector, bio->bi_size,
+               bio->bi_rw, !bio_flagged(bio, BIO_UPTODATE), pdu);
+}
+
+void probe_block_remap(void *data, struct request_queue *q, struct bio *bio,
+                      dev_t dev, sector_t from)
+{
+       trace_mark_tp(block, remap, block_remap,
+               probe_block_remap,
+               "device_from %lu sector_from %llu device_to %lu "
+               "size %u rw(FAILFAST_DRIVER,FAILFAST_TRANSPORT,"
+               "FAILFAST_DEV,DISCARD,META,SYNC,BARRIER,AHEAD,RW) %lX "
+               "not_uptodate #1u%d",
+               (unsigned long)bio->bi_bdev->bd_dev,
+               (unsigned long long)from,
+               (unsigned long)dev,
+               bio->bi_size, bio->bi_rw,
+               !bio_flagged(bio, BIO_UPTODATE));
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Block Tracepoint Probes");
diff --git a/probes/ext4-trace.c b/probes/ext4-trace.c
new file mode 100644 (file)
index 0000000..83683e7
--- /dev/null
@@ -0,0 +1,611 @@
+/*
+ * ltt/probes/ext4-trace.c
+ *
+ * ext4 tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/debugfs.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <trace/events/ext4.h>
+
+#include "../ltt-tracer.h"
+#include "../../fs/ext4/mballoc.h"
+
+static struct dentry *ext4_filter_dentry, *ext4_filter_dev_dentry,
+       *ext4_filter_inode_dentry;
+static DEFINE_MUTEX(ext4_filter_mutex);
+/* Make sure we don't race between module exit and file write */
+static int module_exits;
+
+struct rcu_dev_filter {
+       struct rcu_head rcu;
+       char devname[NAME_MAX];
+};
+
+static struct rcu_dev_filter *dev_filter;
+/* ~0UL inode_filter enables all inodes */
+static unsigned long inode_filter = ~0UL;
+
+/*
+ * Probes are executed in an rcu_sched read-side critical section.
+ */
+
+static int do_dev_filter(const char *dev)
+{
+       struct rcu_dev_filter *ldev_filter = rcu_dereference(dev_filter);
+
+       if (unlikely(ldev_filter))
+               if (unlikely(strcmp(ldev_filter->devname, dev)))
+                       return 0;
+       return 1;
+}
+
+static int do_inode_filter(unsigned long ino)
+{
+       if (unlikely(inode_filter != ~0UL))
+               if (unlikely(inode_filter != ino))
+                       return 0;
+       return 1;
+}
+
+/*
+ * Logical AND between dev and inode filter.
+ */
+static int do_filter(const char *dev, unsigned long ino)
+{
+       if (unlikely(!do_dev_filter(dev)))
+               return 0;
+       if (unlikely(!do_inode_filter(ino)))
+               return 0;
+       return 1;
+}
+
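+/*
+ * The filters are driven through the debugfs files created by filter_init()
+ * below (the exact path depends on get_filter_root()), e.g.:
+ *
+ *     echo sda1 > .../filter/ext4/dev      trace only that device
+ *     echo 1234 > .../filter/ext4/inode    trace only that inode
+ *     echo '*'  > .../filter/ext4/dev      trace all devices again
+ */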
+
+void probe_ext4_free_inode(void *data, struct inode *inode)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, free_inode, ext4_free_inode,
+               probe_ext4_free_inode,
+               "dev %s ino %lu mode %d uid %lu gid %lu blocks %llu",
+               inode->i_sb->s_id, inode->i_ino, inode->i_mode,
+               (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
+               (unsigned long long) inode->i_blocks);
+}
+
+void probe_ext4_request_inode(void *data, struct inode *dir, int mode)
+{
+       if (unlikely(!do_filter(dir->i_sb->s_id, dir->i_ino)))
+               return;
+       trace_mark_tp(ext4, request_inode, ext4_request_inode,
+               probe_ext4_request_inode,
+               "dev %s dir %lu mode %d",
+               dir->i_sb->s_id, dir->i_ino, mode);
+}
+
+void probe_ext4_allocate_inode(void *data, struct inode *inode, struct inode *dir, int mode)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)
+                    && !do_filter(dir->i_sb->s_id, dir->i_ino)))
+               return;
+       trace_mark_tp(ext4, allocate_inode, ext4_allocate_inode,
+               probe_ext4_allocate_inode,
+               "dev %s ino %lu dir %lu mode %d",
+               dir->i_sb->s_id, inode->i_ino, dir->i_ino, mode);
+}
+
+void probe_ext4_write_begin(void *data, struct inode *inode, loff_t pos, unsigned int len,
+                           unsigned int flags)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, write_begin, ext4_write_begin,
+               probe_ext4_write_begin,
+               "dev %s ino %lu pos %llu len %u flags %u",
+               inode->i_sb->s_id, inode->i_ino,
+               (unsigned long long) pos, len, flags);
+}
+
+void probe_ext4_ordered_write_end(void *data, struct inode *inode, loff_t pos,
+                           unsigned int len, unsigned int copied)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, ordered_write_end, ext4_ordered_write_end,
+               probe_ext4_ordered_write_end,
+               "dev %s ino %lu pos %llu len %u copied %u",
+               inode->i_sb->s_id, inode->i_ino,
+               (unsigned long long) pos, len, copied);
+}
+
+void probe_ext4_writeback_write_end(void *data, struct inode *inode, loff_t pos,
+                           unsigned int len, unsigned int copied)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, writeback_write_end, ext4_writeback_write_end,
+               probe_ext4_writeback_write_end,
+               "dev %s ino %lu pos %llu len %u copied %u",
+               inode->i_sb->s_id, inode->i_ino,
+               (unsigned long long) pos, len, copied);
+}
+
+void probe_ext4_journalled_write_end(void *data, struct inode *inode, loff_t pos,
+                           unsigned int len, unsigned int copied)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, journalled_write_end, ext4_journalled_write_end,
+               probe_ext4_journalled_write_end,
+               "dev %s ino %lu pos %llu len %u copied %u",
+               inode->i_sb->s_id, inode->i_ino,
+               (unsigned long long) pos, len, copied);
+}
+
+/*
+ * Note: wbc_flags will have to be decoded by user space.
+ * #1x uses a single byte in the trace, which limits the flags to 8 bits.
+ */
+void probe_ext4_da_writepages(void *data, struct inode *inode,
+                             struct writeback_control *wbc)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, da_writepages, ext4_da_writepages,
+               probe_ext4_da_writepages,
+               "dev %s ino %lu nr_to_write %ld "
+               "pages_skipped %ld range_start %llu range_end %llu "
+               "wbc_flags(nonblocking,for_kupdate,"
+               "for_reclaim,range_cyclic) #1x%u",
+               inode->i_sb->s_id, inode->i_ino, wbc->nr_to_write,
+               wbc->pages_skipped,
+               (unsigned long long) wbc->range_start,
+               (unsigned long long) wbc->range_end,
+                 (wbc->nonblocking << 3)
+               | (wbc->for_kupdate << 2)
+               | (wbc->for_reclaim << 1)
+               | wbc->range_cyclic);
+}
+
+/*
+ * Note: wbc_flags will have to be decoded by user space.
+ * #1x uses a single byte in the trace, which limits the flags to 8 bits.
+ */
+void probe_ext4_da_writepages_result(void *data, struct inode *inode,
+                                    struct writeback_control *wbc,
+                                    int ret, int pages_written)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, da_writepages_result, ext4_da_writepages_result,
+               probe_ext4_da_writepages_result,
+               "dev %s ino %lu ret %d pages_written %d "
+               "pages_skipped %ld "
+               "wbc_flags(encountered_congestion,"
+               "more_io,no_nrwrite_index_update) #1x%u",
+               inode->i_sb->s_id, inode->i_ino, ret, pages_written,
+               wbc->pages_skipped,
+                 (wbc->encountered_congestion << 2)
+               | (wbc->more_io << 1)
+               | wbc->no_nrwrite_index_update);
+}
+
+void probe_ext4_da_write_begin(void *data, struct inode *inode, loff_t pos,
+                           unsigned int len, unsigned int flags)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, da_write_begin, ext4_da_write_begin,
+               probe_ext4_da_write_begin,
+               "dev %s ino %lu pos %llu len %u flags %u",
+               inode->i_sb->s_id, inode->i_ino,
+               (unsigned long long) pos, len, flags);
+}
+
+void probe_ext4_da_write_end(void *data, struct inode *inode, loff_t pos,
+                           unsigned int len, unsigned int copied)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, da_write_end, ext4_da_write_end,
+               probe_ext4_da_write_end,
+               "dev %s ino %lu pos %llu len %u copied %u",
+               inode->i_sb->s_id, inode->i_ino,
+               (unsigned long long) pos, len, copied);
+}
+
+void probe_ext4_discard_blocks(void *data, struct super_block *sb, unsigned long long blk,
+                              unsigned long long count)
+{
+       if (unlikely(!do_dev_filter(sb->s_id)))
+               return;
+       trace_mark_tp(ext4, discard_blocks, ext4_discard_blocks,
+               probe_ext4_discard_blocks,
+               "dev %s blk %llu count %llu",
+               sb->s_id, blk, count);
+}
+
+void probe_ext4_mb_new_inode_pa(void *data, struct ext4_allocation_context *ac,
+                               struct ext4_prealloc_space *pa)
+{
+       if (unlikely(!do_filter(ac->ac_sb->s_id, ac->ac_inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, mb_new_inode_pa, ext4_mb_new_inode_pa,
+               probe_ext4_mb_new_inode_pa,
+               "dev %s ino %lu pstart %llu len %u lstart %u",
+               ac->ac_sb->s_id, ac->ac_inode->i_ino, pa->pa_pstart,
+               pa->pa_len, pa->pa_lstart);
+}
+
+void probe_ext4_mb_new_group_pa(void *data, struct ext4_allocation_context *ac,
+                               struct ext4_prealloc_space *pa)
+{
+       if (unlikely(!do_dev_filter(ac->ac_sb->s_id)))
+               return;
+       trace_mark_tp(ext4, mb_new_group_pa, ext4_mb_new_group_pa,
+               probe_ext4_mb_new_group_pa,
+               "dev %s pstart %llu len %u lstart %u",
+               ac->ac_sb->s_id, pa->pa_pstart,
+               pa->pa_len, pa->pa_lstart);
+}
+
+void probe_ext4_mb_release_inode_pa(void *data, struct ext4_allocation_context *ac,
+                                   struct ext4_prealloc_space *pa,
+                                   unsigned long long block,
+                                   unsigned int count)
+{
+       if (unlikely(!do_filter(ac->ac_sb->s_id, ac->ac_inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, mb_release_inode_pa, ext4_mb_release_inode_pa,
+               probe_ext4_mb_release_inode_pa,
+               "dev %s ino %lu block %llu count %u",
+               ac->ac_sb->s_id, pa->pa_inode->i_ino, block, count);
+}
+
+void probe_ext4_mb_release_group_pa(void *data, struct ext4_allocation_context *ac,
+                                   struct ext4_prealloc_space *pa)
+{
+       if (unlikely(!do_dev_filter(ac->ac_sb->s_id)))
+               return;
+       trace_mark_tp(ext4, mb_release_group_pa, ext4_mb_release_group_pa,
+               probe_ext4_mb_release_group_pa,
+               "dev %s pstart %llu len %d",
+               ac->ac_sb->s_id, pa->pa_pstart, pa->pa_len);
+}
+
+void probe_ext4_discard_preallocations(void *data, struct inode *inode)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, discard_preallocations,
+               ext4_discard_preallocations,
+               probe_ext4_discard_preallocations,
+               "dev %s ino %lu",
+               inode->i_sb->s_id, inode->i_ino);
+}
+
+void probe_ext4_mb_discard_preallocations(void *data, struct super_block *sb, int needed)
+{
+       if (unlikely(!do_dev_filter(sb->s_id)))
+               return;
+       trace_mark_tp(ext4, mb_discard_preallocations,
+               ext4_mb_discard_preallocations,
+               probe_ext4_mb_discard_preallocations,
+               "dev %s needed %d",
+               sb->s_id, needed);
+}
+
+void probe_ext4_request_blocks(void *data, struct ext4_allocation_request *ar)
+{
+       /*
+        * ar->inode may be NULL; without an inode there is no superblock to
+        * filter on, so let the event through.
+        */
+       if (ar->inode) {
+               if (unlikely(!do_filter(ar->inode->i_sb->s_id,
+                                       ar->inode->i_ino)))
+                       return;
+       }
+       trace_mark_tp(ext4, request_blocks, ext4_request_blocks,
+               probe_ext4_request_blocks,
+               "dev %s flags %u len %u ino %lu "
+               "lblk %llu goal %llu lleft %llu lright %llu "
+               "pleft %llu pright %llu",
+               ar->inode ? ar->inode->i_sb->s_id : "?", ar->flags, ar->len,
+               ar->inode ? ar->inode->i_ino : 0,
+               (unsigned long long) ar->logical,
+               (unsigned long long) ar->goal,
+               (unsigned long long) ar->lleft,
+               (unsigned long long) ar->lright,
+               (unsigned long long) ar->pleft,
+               (unsigned long long) ar->pright);
+}
+
+void probe_ext4_allocate_blocks(void *data, struct ext4_allocation_request *ar,
+                               unsigned long long block)
+{
+       /*
+        * ar->inode may be NULL; without an inode there is no superblock to
+        * filter on, so let the event through.
+        */
+       if (ar->inode) {
+               if (unlikely(!do_filter(ar->inode->i_sb->s_id,
+                                       ar->inode->i_ino)))
+                       return;
+       }
+       trace_mark_tp(ext4, allocate_blocks, ext4_allocate_blocks,
+               probe_ext4_allocate_blocks,
+               "dev %s block %llu flags %u len %u ino %lu "
+               "logical %llu goal %llu lleft %llu lright %llu "
+               "pleft %llu pright %llu",
+               ar->inode ? ar->inode->i_sb->s_id : "?",
+               (unsigned long long) block,
+               ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
+               (unsigned long long) ar->logical,
+               (unsigned long long) ar->goal,
+               (unsigned long long) ar->lleft,
+               (unsigned long long) ar->lright,
+               (unsigned long long) ar->pleft,
+               (unsigned long long) ar->pright);
+}
+
+void probe_ext4_free_blocks(void *data, struct inode *inode, __u64 block,
+                           unsigned long count, int metadata)
+{
+       if (unlikely(!do_filter(inode->i_sb->s_id, inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, free_blocks, ext4_free_blocks,
+               probe_ext4_free_blocks,
+               "dev %s block %llu count %lu metadata %d ino %lu",
+               inode->i_sb->s_id, (unsigned long long)block,
+               count, metadata, inode->i_ino);
+}
+
+void probe_ext4_sync_file(void *data, struct file *file, struct dentry *dentry,
+                         int datasync)
+{
+       if (unlikely(!do_dev_filter(dentry->d_inode->i_sb->s_id)))
+               return;
+       if (unlikely(!do_inode_filter(dentry->d_inode->i_ino)
+                       && !do_inode_filter(dentry->d_parent->d_inode->i_ino)))
+               return;
+       trace_mark_tp(ext4, sync_file, ext4_sync_file,
+               probe_ext4_sync_file,
+               "dev %s datasync %d ino %ld parent %ld",
+               dentry->d_inode->i_sb->s_id, datasync, dentry->d_inode->i_ino,
+               dentry->d_parent->d_inode->i_ino);
+}
+
+void probe_ext4_sync_fs(void *data, struct super_block *sb, int wait)
+{
+       if (unlikely(!do_dev_filter(sb->s_id)))
+               return;
+       trace_mark_tp(ext4, sync_fs, ext4_sync_fs,
+               probe_ext4_sync_fs,
+               "dev %s wait %d",
+               sb->s_id, wait);
+}
+
+static void free_dev_filter(struct rcu_head *head)
+{
+       kfree(container_of(head, struct rcu_dev_filter, rcu));
+}
+
+static ssize_t dev_filter_op_write(struct file *file,
+       const char __user *user_buf, size_t count, loff_t *ppos)
+{
+       int err = 0;
+       char buf[NAME_MAX];
+       int buf_size;
+       char name[NAME_MAX];
+       struct rcu_dev_filter *new, *old;
+
+       mutex_lock(&ext4_filter_mutex);
+       if (module_exits) {
+               err = -EPERM;
+               goto error;
+       }
+       buf_size = min(count, sizeof(buf) - 1);
+       if (copy_from_user(buf, user_buf, buf_size)) {
+               /* copy_from_user returns the number of uncopied bytes */
+               err = -EFAULT;
+               goto error;
+       }
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", name) != 1) {
+               err = -EPERM;
+               goto error;
+       }
+
+       old = dev_filter;
+
+       /* Empty string or * means all active */
+       if (name[0] == '\0' || (name[0] == '*' && name[1] == '\0')) {
+               new = NULL;
+       } else {
+               new = kmalloc(sizeof(*new), GFP_KERNEL);
+               if (!new) {
+                       err = -ENOMEM;
+                       goto error;
+               }
+               strcpy(new->devname, name);
+       }
+
+       rcu_assign_pointer(dev_filter, new);
+       if (old)
+               call_rcu_sched(&old->rcu, free_dev_filter);
+
+       mutex_unlock(&ext4_filter_mutex);
+       return count;
+
+error:
+       mutex_unlock(&ext4_filter_mutex);
+       return err;
+}
+
+static ssize_t dev_filter_op_read(struct file *filp, char __user *buffer,
+       size_t count, loff_t *ppos)
+{
+       ssize_t bcount;
+       const char *devname;
+
+       mutex_lock(&ext4_filter_mutex);
+       if (!dev_filter)
+               devname = "*";
+       else
+               devname = dev_filter->devname;
+       bcount = simple_read_from_buffer(buffer, count, ppos,
+                       devname, strlen(devname));
+       mutex_unlock(&ext4_filter_mutex);
+       return bcount;
+}
+
+static const struct file_operations ext4_dev_file_operations = {
+       .write = dev_filter_op_write,
+       .read = dev_filter_op_read,
+};
+
+static ssize_t inode_filter_op_write(struct file *file,
+       const char __user *user_buf, size_t count, loff_t *ppos)
+{
+       int err = 0;
+       char buf[NAME_MAX];
+       int buf_size;
+       char name[NAME_MAX];
+       unsigned long inode_num;
+
+       mutex_lock(&ext4_filter_mutex);
+       if (module_exits) {
+               err = -EPERM;
+               goto error;
+       }
+       buf_size = min(count, sizeof(buf) - 1);
+       if (copy_from_user(buf, user_buf, buf_size)) {
+               /* copy_from_user returns the number of uncopied bytes */
+               err = -EFAULT;
+               goto error;
+       }
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", name) != 1) {
+               err = -EPERM;
+               goto error;
+       }
+
+       /* Empty string or * means all active */
+       if (name[0] == '\0' || (name[0] == '*' && name[1] == '\0')) {
+               inode_filter = ~0UL;
+       } else {
+               if (sscanf(buf, "%lu", &inode_num) != 1) {
+                       err = -EPERM;
+                       goto error;
+               }
+               inode_filter = inode_num;
+       }
+
+       mutex_unlock(&ext4_filter_mutex);
+       return count;
+
+error:
+       mutex_unlock(&ext4_filter_mutex);
+       return err;
+}
+
+static ssize_t inode_filter_op_read(struct file *filp, char __user *buffer,
+       size_t count, loff_t *ppos)
+{
+       ssize_t bcount;
+       char inode_str[NAME_MAX];
+
+       mutex_lock(&ext4_filter_mutex);
+       if (inode_filter == ~0UL)
+               strcpy(inode_str, "*");
+       else {
+               bcount = snprintf(inode_str, sizeof(inode_str), "%lu",
+                              inode_filter);
+               if (bcount >= sizeof(inode_str))
+                       bcount = -ENOSPC;
+               if (bcount < 0)
+                       goto end;
+       }
+       bcount = simple_read_from_buffer(buffer, count, ppos,
+                       inode_str, strlen(inode_str));
+end:
+       mutex_unlock(&ext4_filter_mutex);
+       return bcount;
+}
+
+static const struct file_operations ext4_inode_file_operations = {
+       .write = inode_filter_op_write,
+       .read = inode_filter_op_read,
+};
+
+static void release_filter_dev(void)
+{
+       struct rcu_dev_filter *old;
+
+       mutex_lock(&ext4_filter_mutex);
+       module_exits = 1;
+       old = dev_filter;
+       rcu_assign_pointer(dev_filter, NULL);
+       if (old)
+               call_rcu_sched(&old->rcu, free_dev_filter);
+       mutex_unlock(&ext4_filter_mutex);
+}
+
+static int __init filter_init(void)
+{
+       struct dentry *filter_root_dentry;
+       int err = 0;
+
+       filter_root_dentry = get_filter_root();
+       if (!filter_root_dentry) {
+               err = -ENOENT;
+               goto end;
+       }
+
+       ext4_filter_dentry = debugfs_create_dir("ext4", filter_root_dentry);
+
+       if (IS_ERR(ext4_filter_dentry) || !ext4_filter_dentry) {
+               printk(KERN_ERR "Failed to create ext4 filter file\n");
+               err = -ENOMEM;
+               goto end;
+       }
+
+       ext4_filter_dev_dentry = debugfs_create_file("dev", S_IWUSR,
+                       ext4_filter_dentry, NULL, &ext4_dev_file_operations);
+       if (IS_ERR(ext4_filter_dev_dentry) || !ext4_filter_dev_dentry) {
+               printk(KERN_ERR "Failed to create ext4 dev filter file\n");
+               err = -ENOMEM;
+               goto release_filter_dentry;
+       }
+
+       ext4_filter_inode_dentry = debugfs_create_file("inode", S_IWUSR,
+                       ext4_filter_dentry, NULL, &ext4_inode_file_operations);
+       if (IS_ERR(ext4_filter_inode_dentry) || !ext4_filter_inode_dentry) {
+               printk(KERN_ERR "Failed to create ext4 inode filter file\n");
+               err = -ENOMEM;
+               goto release_filter_dev_dentry;
+       }
+
+       goto end;
+
+release_filter_dev_dentry:
+       debugfs_remove(ext4_filter_dev_dentry);
+release_filter_dentry:
+       debugfs_remove(ext4_filter_dentry);
+       release_filter_dev();
+end:
+       return err;
+}
+
+static void __exit filter_exit(void)
+{
+       debugfs_remove(ext4_filter_dev_dentry);
+       debugfs_remove(ext4_filter_inode_dentry);
+       debugfs_remove(ext4_filter_dentry);
+       release_filter_dev();
+}
+
+module_init(filter_init);
+module_exit(filter_exit);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("ext4 Tracepoint Probes");
diff --git a/probes/fs-trace.c b/probes/fs-trace.c
new file mode 100644 (file)
index 0000000..bca2827
--- /dev/null
@@ -0,0 +1,158 @@
+/*
+ * ltt/probes/fs-trace.c
+ *
+ * FS tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <trace/fs.h>
+
+#include "../ltt-type-serializer.h"
+
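+/*
+ * trace_mark_tp(channel, marker, tracepoint, probe, fmt, ...) is the
+ * LTTng marker/tracepoint glue: it defines a marker in the given
+ * channel, hooks "probe" onto the named kernel tracepoint, and
+ * serializes the arguments according to fmt when the marker is armed.
+ */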
+void probe_fs_buffer_wait_start(void *_data, struct buffer_head *bh)
+{
+       trace_mark_tp(fs, buffer_wait_start, fs_buffer_wait_start,
+               probe_fs_buffer_wait_start, "bh %p", bh);
+}
+
+void probe_fs_buffer_wait_end(void *_data, struct buffer_head *bh)
+{
+       trace_mark_tp(fs, buffer_wait_end, fs_buffer_wait_end,
+               probe_fs_buffer_wait_end, "bh %p", bh);
+}
+
+void probe_fs_exec(void *_data, char *filename)
+{
+       trace_mark_tp(fs, exec, fs_exec, probe_fs_exec, "filename %s",
+               filename);
+}
+
+void probe_fs_ioctl(void *_data, unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+       trace_mark_tp(fs, ioctl, fs_ioctl, probe_fs_ioctl,
+               "fd %u cmd %u arg %lu", fd, cmd, arg);
+}
+
+void probe_fs_open(void *_data, int fd, char *filename)
+{
+       trace_mark_tp(fs, open, fs_open, probe_fs_open,
+               "fd %d filename %s", fd, filename);
+}
+
+void probe_fs_close(void *_data, unsigned int fd)
+{
+       trace_mark_tp(fs, close, fs_close, probe_fs_close, "fd %u", fd);
+}
+
+void probe_fs_lseek(void *_data, unsigned int fd, long offset, unsigned int origin)
+{
+       trace_mark_tp(fs, lseek, fs_lseek, probe_fs_lseek,
+               "fd %u offset %ld origin %u", fd, offset, origin);
+}
+
+void probe_fs_llseek(void *_data, unsigned int fd, loff_t offset, unsigned int origin)
+{
+       trace_mark_tp(fs, llseek, fs_llseek, probe_fs_llseek,
+               "fd %u offset %lld origin %u", fd,
+               (long long)offset, origin);
+}
+
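+/*
+ * fs_read and fs_write are hot paths, so instead of the generic
+ * trace_mark_tp() vararg serializer used above they are wired as
+ * specialized probes: DEFINE_MARKER_TP() declares the marker and
+ * ltt_specialized_trace() writes a pre-built payload directly.
+ */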
+void probe_fs_read(void *_data, unsigned int fd, char __user *buf, size_t count,
+               ssize_t ret);
+
+DEFINE_MARKER_TP(fs, read, fs_read, probe_fs_read,
+       "count %zu fd %u");
+
+notrace void probe_fs_read(void *_data, unsigned int fd, char __user *buf, size_t count,
+               ssize_t ret)
+{
+       struct marker *marker;
+       struct serialize_sizet_int data;
+
+       data.f1 = count;
+       data.f2 = fd;
+
+       marker = &GET_MARKER(fs, read);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(size_t));
+}
+
+void probe_fs_write(void *_data, unsigned int fd, char __user *buf, size_t count,
+               ssize_t ret);
+
+DEFINE_MARKER_TP(fs, write, fs_write, probe_fs_write,
+       "count %zu fd %u");
+
+notrace void probe_fs_write(void *_data, unsigned int fd, char __user *buf, size_t count,
+               ssize_t ret)
+{
+       struct marker *marker;
+       struct serialize_sizet_int data;
+
+       data.f1 = count;
+       data.f2 = fd;
+
+       marker = &GET_MARKER(fs, write);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(size_t));
+}
+
+void probe_fs_pread64(void *_data, unsigned int fd, char __user *buf, size_t count,
+               loff_t pos, ssize_t ret)
+{
+       trace_mark_tp(fs, pread64, fs_pread64, probe_fs_pread64,
+               "fd %u count %zu pos %llu",
+               fd, count, (unsigned long long)pos);
+}
+
+void probe_fs_pwrite64(void *_data, unsigned int fd, const char __user *buf,
+               size_t count, loff_t pos, ssize_t ret)
+{
+       trace_mark_tp(fs, pwrite64, fs_pwrite64, probe_fs_pwrite64,
+               "fd %u count %zu pos %llu",
+               fd, count, (unsigned long long)pos);
+}
+
+void probe_fs_readv(void *_data, unsigned long fd, const struct iovec __user *vec,
+               unsigned long vlen, ssize_t ret)
+{
+       trace_mark_tp(fs, readv, fs_readv, probe_fs_readv,
+               "fd %lu vlen %lu", fd, vlen);
+}
+
+void probe_fs_writev(void *_data, unsigned long fd, const struct iovec __user *vec,
+               unsigned long vlen, ssize_t ret)
+{
+       trace_mark_tp(fs, writev, fs_writev, probe_fs_writev,
+               "fd %lu vlen %lu", fd, vlen);
+}
+
+void probe_fs_select(void *_data, int fd, struct timespec *end_time)
+{
+       struct timespec tmptime;
+
+       if (end_time) {
+               tmptime = *end_time;
+       } else {
+               tmptime.tv_sec = -1L;
+               tmptime.tv_nsec = -1L;
+       }
+
+       trace_mark_tp(fs, select, fs_select, probe_fs_select,
+               "fd %d end_time_sec %ld end_time_nsec %ld", fd,
+                       tmptime.tv_sec, tmptime.tv_nsec);
+}
+
+void probe_fs_poll(void *_data, int fd)
+{
+       trace_mark_tp(fs, pollfd, fs_poll, probe_fs_poll,
+               "fd %d", fd);
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("FS Tracepoint Probes");
diff --git a/probes/ipc-trace.c b/probes/ipc-trace.c
new file mode 100644 (file)
index 0000000..3a09525
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * ltt/probes/ipc-trace.c
+ *
+ * IPC tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <trace/ipc.h>
+
+void probe_ipc_msg_create(void *data, long id, int flags)
+{
+       trace_mark_tp(ipc, msg_create, ipc_msg_create, probe_ipc_msg_create,
+               "id %ld flags %d", id, flags);
+}
+
+void probe_ipc_sem_create(void *data, long id, int flags)
+{
+       trace_mark_tp(ipc, sem_create, ipc_sem_create, probe_ipc_sem_create,
+               "id %ld flags %d", id, flags);
+}
+
+void probe_ipc_shm_create(void *data, long id, int flags)
+{
+       trace_mark_tp(ipc, shm_create, ipc_shm_create, probe_ipc_shm_create,
+               "id %ld flags %d", id, flags);
+}
+
+void probe_ipc_call(void *data, unsigned int call, unsigned int first)
+{
+       trace_mark_tp(ipc, call, ipc_call, probe_ipc_call,
+               "call %u first %d", call, first);
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("IPC Tracepoint Probes");
diff --git a/probes/jbd2-trace.c b/probes/jbd2-trace.c
new file mode 100644 (file)
index 0000000..3da32cd
--- /dev/null
@@ -0,0 +1,208 @@
+/*
+ * ltt/probes/jbd2-trace.c
+ *
+ * JBD2 tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <trace/events/jbd2.h>
+
+#include "../ltt-tracer.h"
+
+static struct dentry *jbd2_filter_dentry, *jbd2_filter_dev_dentry;
+static DEFINE_MUTEX(jbd2_filter_mutex);
+/* Make sure we don't race between module exit and file write */
+static int module_exits;
+
+struct rcu_dev_filter {
+       struct rcu_head rcu;
+       char devname[NAME_MAX];
+};
+
+static struct rcu_dev_filter *dev_filter;
+
+/*
+ * Probes are executed in rcu_sched read-side critical section.
+ */
+static int do_filter(const char *dev)
+{
+       struct rcu_dev_filter *ldev_filter = rcu_dereference(dev_filter);
+
+       if (unlikely(ldev_filter))
+               if (unlikely(strcmp(ldev_filter->devname, dev)))
+                       return 0;
+       return 1;
+}
+
+void probe_jbd2_checkpoint(void *data, journal_t *journal, int result)
+{
+       if (unlikely(!do_filter(journal->j_devname)))
+               return;
+       trace_mark_tp(jbd2, checkpoint, jbd2_checkpoint,
+               probe_jbd2_checkpoint, "dev %s need_checkpoint %d",
+               journal->j_devname, result);
+}
+
+void probe_jbd2_start_commit(void *data, journal_t *journal,
+                            transaction_t *commit_transaction)
+{
+       if (unlikely(!do_filter(journal->j_devname)))
+               return;
+       trace_mark_tp(jbd2, start_commit, jbd2_start_commit,
+               probe_jbd2_start_commit, "dev %s transaction %d",
+               journal->j_devname, commit_transaction->t_tid);
+}
+
+void probe_jbd2_end_commit(void *data, journal_t *journal,
+                          transaction_t *commit_transaction)
+{
+       if (unlikely(!do_filter(journal->j_devname)))
+               return;
+       trace_mark_tp(jbd2, end_commit, jbd2_end_commit,
+               probe_jbd2_end_commit, "dev %s transaction %d head %d",
+               journal->j_devname, commit_transaction->t_tid,
+               journal->j_tail_sequence);
+}
+
+static void free_dev_filter(struct rcu_head *head)
+{
+       kfree(container_of(head, struct rcu_dev_filter, rcu));
+}
+
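+/*
+ * Filter updates below publish the new filter with rcu_assign_pointer()
+ * under jbd2_filter_mutex and free the old one via call_rcu_sched(),
+ * so do_filter() can run lock-free from the probes.
+ */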
+static ssize_t filter_op_write(struct file *file,
+       const char __user *user_buf, size_t count, loff_t *ppos)
+{
+       int err = 0;
+       char buf[NAME_MAX];
+       int buf_size;
+       char name[NAME_MAX];
+       struct rcu_dev_filter *new, *old;
+
+       mutex_lock(&jbd2_filter_mutex);
+       if (module_exits) {
+               err = -EPERM;
+               goto error;
+       }
+       buf_size = min(count, sizeof(buf) - 1);
+       if (copy_from_user(buf, user_buf, buf_size)) {
+               err = -EFAULT;
+               goto error;
+       }
+       buf[buf_size] = 0;
+
+       if (sscanf(buf, "%s", name) != 1) {
+               err = -EINVAL;
+               goto error;
+       }
+
+       old = dev_filter;
+
+       /* Empty string or * means all active */
+       if (name[0] == '\0' || (name[0] == '*' && name[1] == '\0')) {
+               new = NULL;
+       } else {
+               new = kmalloc(sizeof(*new), GFP_KERNEL);
+               if (!new) {
+                       err = -ENOMEM;
+                       goto error;
+               }
+               strcpy(new->devname, name);
+       }
+
+       rcu_assign_pointer(dev_filter, new);
+       if (old)
+               call_rcu_sched(&old->rcu, free_dev_filter);
+
+       mutex_unlock(&jbd2_filter_mutex);
+       return count;
+
+error:
+       mutex_unlock(&jbd2_filter_mutex);
+       return err;
+}
+
+static ssize_t filter_op_read(struct file *filp, char __user *buffer,
+       size_t count, loff_t *ppos)
+{
+       ssize_t bcount;
+       const char *devname;
+
+       mutex_lock(&jbd2_filter_mutex);
+       if (!dev_filter)
+               devname = "*";
+       else
+               devname = dev_filter->devname;
+       bcount = simple_read_from_buffer(buffer, count, ppos,
+                       devname, strlen(devname));
+       mutex_unlock(&jbd2_filter_mutex);
+       return bcount;
+}
+
+static struct file_operations jbd2_file_operations = {
+       .write = filter_op_write,
+       .read = filter_op_read,
+};
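+
+/*
+ * Usage sketch (the "dev" file sits under <filter root>/jbd2/, with the
+ * same conventions as the ext4 filter; device names must match
+ * journal->j_devname):
+ *
+ *   echo <devname> > .../filter/jbd2/dev   # trace only that journal
+ *   echo '*'       > .../filter/jbd2/dev   # trace all journals again
+ */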
+
+static void release_filter_dev(void)
+{
+       struct rcu_dev_filter *old;
+
+       mutex_lock(&jbd2_filter_mutex);
+       module_exits = 1;
+       old = dev_filter;
+       rcu_assign_pointer(dev_filter, NULL);
+       if (old)
+               call_rcu_sched(&old->rcu, free_dev_filter);
+       mutex_unlock(&jbd2_filter_mutex);
+}
+
+static int __init filter_init(void)
+{
+       struct dentry *filter_root_dentry;
+       int err = 0;
+
+       filter_root_dentry = get_filter_root();
+       if (!filter_root_dentry) {
+               err = -ENOENT;
+               goto end;
+       }
+
+       jbd2_filter_dentry = debugfs_create_dir("jbd2", filter_root_dentry);
+
+       if (IS_ERR(jbd2_filter_dentry) || !jbd2_filter_dentry) {
+               printk(KERN_ERR "Failed to create jbd2 filter file\n");
+               err = -ENOMEM;
+               goto end;
+       }
+
+       jbd2_filter_dev_dentry = debugfs_create_file("dev", S_IWUSR,
+                       jbd2_filter_dentry, NULL, &jbd2_file_operations);
+       if (IS_ERR(jbd2_filter_dev_dentry) || !jbd2_filter_dev_dentry) {
+               printk(KERN_ERR "Failed to create jbd2 dev filter file\n");
+               err = -ENOMEM;
+               goto release_filter_dentry;
+       }
+
+       goto end;
+
+release_filter_dentry:
+       debugfs_remove(jbd2_filter_dentry);
+       release_filter_dev();
+end:
+       return err;
+}
+
+static void __exit filter_exit(void)
+{
+       debugfs_remove(jbd2_filter_dev_dentry);
+       debugfs_remove(jbd2_filter_dentry);
+       release_filter_dev();
+}
+
+module_init(filter_init);
+module_exit(filter_exit);
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("JBD2 Tracepoint Probes");
diff --git a/probes/kernel-trace.c b/probes/kernel-trace.c
new file mode 100644 (file)
index 0000000..cabe60e
--- /dev/null
@@ -0,0 +1,581 @@
+/*
+ * ltt/probes/kernel-trace.c
+ *
+ * kernel tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <trace/events/signal.h>
+#include <trace/irq.h>
+#include <trace/sched.h>
+#include <trace/timer.h>
+#include <trace/kernel.h>
+#include <trace/fault.h>
+#include <trace/events/sched.h>
+
+#include "../ltt-tracer.h"
+#include "../ltt-type-serializer.h"
+
+/*
+ * This should probably be added to s390.
+ */
+#ifdef CONFIG_S390
+static struct pt_regs *get_irq_regs(void)
+{
+       return task_pt_regs(current);
+}
+#endif
+
+/*
+ * FIXME :
+ * currently, the specialized tracepoint probes cannot call into other marker
+ * probes, such as ftrace enable/disable. Given we want them to be as fast as
+ * possible, it might not be so bad to lose this flexibility. But that means
+ * such probes would have to connect to tracepoints on their own.
+ */
+
+/* kernel_irq_entry specialized tracepoint probe */
+
+void probe_irq_entry(void *_data, unsigned int id, struct pt_regs *regs,
+       struct irqaction *action);
+
+DEFINE_MARKER_TP(kernel, irq_entry, irq_entry, probe_irq_entry,
+       "ip %lu handler %p irq_id #2u%u kernel_mode #1u%u");
+
+notrace void probe_irq_entry(void *_data, unsigned int id, struct pt_regs *regs,
+       struct irqaction *action)
+{
+       struct marker *marker;
+       struct serialize_long_long_short_char data;
+
+       if (unlikely(!regs))
+               regs = get_irq_regs();
+       if (likely(regs)) {
+               data.f1 = instruction_pointer(regs);
+               data.f4 = !user_mode(regs);
+       } else {
+               data.f1 = 0UL;
+               data.f4 = 1;
+       }
+       data.f2 = (unsigned long) (action ? action->handler : NULL);
+       data.f3 = id;
+
+       marker = &GET_MARKER(kernel, irq_entry);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+void probe_irq_next_handler(void *_data, unsigned int id, struct irqaction *action,
+               irqreturn_t prev_ret);
+
+DEFINE_MARKER_TP(kernel, irq_next_handler, irq_next_handler,
+       probe_irq_next_handler,
+       "handler %p prev_ret #1u%u");
+
+notrace void probe_irq_next_handler(void *_data, unsigned int id, struct irqaction *action,
+               irqreturn_t prev_ret)
+{
+       struct marker *marker;
+       struct serialize_long_char data;
+
+       data.f1 = (unsigned long) (action ? action->handler : NULL);
+       data.f2 = prev_ret;
+
+       marker = &GET_MARKER(kernel, irq_next_handler);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+/* kernel_irq_exit specialized tracepoint probe */
+
+void probe_irq_exit(void *_data, irqreturn_t retval);
+
+DEFINE_MARKER_TP(kernel, irq_exit, irq_exit, probe_irq_exit,
+       "handled #1u%u");
+
+notrace void probe_irq_exit(void *_data, irqreturn_t retval)
+{
+       struct marker *marker;
+       unsigned char data;
+
+       data = IRQ_RETVAL(retval);
+
+       marker = &GET_MARKER(kernel, irq_exit);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, sizeof(data), sizeof(data));
+}
+
+/* kernel_softirq_entry specialized tracepoint probe */
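+/*
+ * The softirq id is recovered by pointer arithmetic: h points into the
+ * softirq_vec[] array, so (h - softirq_vec) is its index, e.g.
+ * TIMER_SOFTIRQ or NET_RX_SOFTIRQ.
+ */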
+
+void probe_softirq_entry(void *_data, struct softirq_action *h,
+       struct softirq_action *softirq_vec);
+
+DEFINE_MARKER_TP(kernel, softirq_entry, softirq_entry,
+       probe_softirq_entry, "softirq_id #1u%lu");
+
+notrace void probe_softirq_entry(void *_data, struct softirq_action *h,
+       struct softirq_action *softirq_vec)
+{
+       struct marker *marker;
+       unsigned char data;
+
+       data = ((unsigned long)h - (unsigned long)softirq_vec) / sizeof(*h);
+
+       marker = &GET_MARKER(kernel, softirq_entry);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, sizeof(data), sizeof(data));
+}
+
+/* kernel_softirq_exit specialized tracepoint probe */
+
+void probe_softirq_exit(void *_data, struct softirq_action *h,
+       struct softirq_action *softirq_vec);
+
+DEFINE_MARKER_TP(kernel, softirq_exit, softirq_exit,
+       probe_softirq_exit, "softirq_id #1u%lu");
+
+notrace void probe_softirq_exit(void *_data, struct softirq_action *h,
+       struct softirq_action *softirq_vec)
+{
+       struct marker *marker;
+       unsigned char data;
+
+       data = ((unsigned long)h - (unsigned long)softirq_vec) / sizeof(*h);
+
+       marker = &GET_MARKER(kernel, softirq_exit);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, sizeof(data), sizeof(data));
+}
+
+/* kernel_softirq_raise specialized tracepoint probe */
+
+void probe_softirq_raise(void *_data, unsigned int nr);
+
+DEFINE_MARKER_TP(kernel, softirq_raise, softirq_raise,
+       probe_softirq_raise, "softirq_id #1u%u");
+
+notrace void probe_softirq_raise(void *_data, unsigned int nr)
+{
+       struct marker *marker;
+       unsigned char data;
+
+       data = nr;
+
+       marker = &GET_MARKER(kernel, softirq_raise);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, sizeof(data), sizeof(data));
+}
+
+/* Standard probes */
+void probe_irq_tasklet_low_entry(void *_data, struct tasklet_struct *t)
+{
+       trace_mark_tp(kernel, tasklet_low_entry, irq_tasklet_low_entry,
+               probe_irq_tasklet_low_entry, "func %p data %lu",
+               t->func, t->data);
+}
+
+void probe_irq_tasklet_low_exit(void *_data, struct tasklet_struct *t)
+{
+       trace_mark_tp(kernel, tasklet_low_exit, irq_tasklet_low_exit,
+               probe_irq_tasklet_low_exit, "func %p data %lu",
+               t->func, t->data);
+}
+
+void probe_irq_tasklet_high_entry(void *_data, struct tasklet_struct *t)
+{
+       trace_mark_tp(kernel, tasklet_high_entry, irq_tasklet_high_entry,
+               probe_irq_tasklet_high_entry, "func %p data %lu",
+               t->func, t->data);
+}
+
+void probe_irq_tasklet_high_exit(void *_data, struct tasklet_struct *t)
+{
+       trace_mark_tp(kernel, tasklet_high_exit, irq_tasklet_high_exit,
+               probe_irq_tasklet_high_exit, "func %p data %lu",
+               t->func, t->data);
+}
+
+void probe_sched_kthread_stop(void *_data, struct task_struct *t)
+{
+       trace_mark_tp(kernel, kthread_stop, sched_kthread_stop,
+               probe_sched_kthread_stop, "pid %d", t->pid);
+}
+
+void probe_sched_kthread_stop_ret(void *_data, int ret)
+{
+       trace_mark_tp(kernel, kthread_stop_ret, sched_kthread_stop_ret,
+               probe_sched_kthread_stop_ret, "ret %d", ret);
+}
+
+void probe_sched_wait_task(void *_data, struct task_struct *p)
+{
+       trace_mark_tp(kernel, sched_wait_task, sched_wait_task,
+               probe_sched_wait_task, "pid %d state #2d%ld",
+               p->pid, p->state);
+}
+
+/* kernel_sched_try_wakeup specialized tracepoint probe */
+
+void probe_sched_wakeup(void *_data, struct task_struct *p, int success);
+
+DEFINE_MARKER_TP(kernel, sched_try_wakeup, sched_wakeup,
+       probe_sched_wakeup, "pid %d cpu_id %u state #2d%ld");
+
+notrace void probe_sched_wakeup(void *_data, struct task_struct *p, int success)
+{
+       struct marker *marker;
+       struct serialize_int_int_short data;
+
+       data.f1 = p->pid;
+       data.f2 = task_cpu(p);
+       data.f3 = p->state;
+
+       marker = &GET_MARKER(kernel, sched_try_wakeup);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(int));
+}
+
+void probe_sched_wakeup_new(void *_data, struct task_struct *p, int success)
+{
+       trace_mark_tp(kernel, sched_wakeup_new_task, sched_wakeup_new,
+               probe_sched_wakeup_new, "pid %d state #2d%ld cpu_id %u",
+               p->pid, p->state, task_cpu(p));
+}
+
+/* kernel_sched_schedule specialized tracepoint probe */
+
+void probe_sched_switch(void *_data, struct task_struct *prev,
+               struct task_struct *next);
+
+DEFINE_MARKER_TP(kernel, sched_schedule, sched_switch, probe_sched_switch,
+       "prev_pid %d next_pid %d prev_state #2d%ld");
+
+notrace void probe_sched_switch(void *_data, struct task_struct *prev,
+               struct task_struct *next)
+{
+       struct marker *marker;
+       struct serialize_int_int_short data;
+
+       data.f1 = prev->pid;
+       data.f2 = next->pid;
+       data.f3 = prev->state;
+
+       marker = &GET_MARKER(kernel, sched_schedule);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(int));
+}
+
+void probe_sched_migrate_task(void *_data, struct task_struct *p, int dest_cpu)
+{
+       trace_mark_tp(kernel, sched_migrate_task, sched_migrate_task,
+               probe_sched_migrate_task, "pid %d state #2d%ld dest_cpu %d",
+               p->pid, p->state, dest_cpu);
+}
+
+void probe_sched_signal_send(void *_data, int sig, struct siginfo *info, struct task_struct *t)
+{
+       trace_mark_tp(kernel, send_signal, signal_generate,
+               probe_sched_signal_send, "pid %d signal %d", t->pid, sig);
+}
+
+void probe_sched_process_free(void *_data, struct task_struct *p)
+{
+       trace_mark_tp(kernel, process_free, sched_process_free,
+               probe_sched_process_free, "pid %d", p->pid);
+}
+
+void probe_sched_process_exit(void *_data, struct task_struct *p)
+{
+       trace_mark_tp(kernel, process_exit, sched_process_exit,
+               probe_sched_process_exit, "pid %d", p->pid);
+}
+
+void probe_sched_process_wait(void *_data, struct pid *pid)
+{
+       trace_mark_tp(kernel, process_wait, sched_process_wait,
+               probe_sched_process_wait, "pid %d", pid_nr(pid));
+}
+
+void probe_sched_process_fork(void *_data, struct task_struct *parent,
+               struct task_struct *child)
+{
+       trace_mark_tp(kernel, process_fork, sched_process_fork,
+               probe_sched_process_fork,
+               "parent_pid %d child_pid %d child_tgid %d",
+               parent->pid, child->pid, child->tgid);
+}
+
+void probe_sched_kthread_create(void *_data, void *fn, int pid)
+{
+       trace_mark_tp(kernel, kthread_create, sched_kthread_create,
+               probe_sched_kthread_create,
+               "fn %p pid %d", fn, pid);
+}
+
+void probe_timer_itimer_expired(void *_data, struct signal_struct *sig)
+{
+       trace_mark_tp(kernel, timer_itimer_expired, timer_itimer_expired,
+               probe_timer_itimer_expired, "pid %d",
+               pid_nr(sig->leader_pid));
+}
+
+void probe_timer_itimer_set(void *_data, int which, struct itimerval *value)
+{
+       trace_mark_tp(kernel, timer_itimer_set,
+               timer_itimer_set, probe_timer_itimer_set,
+               "which %d interval_sec %ld interval_usec %ld "
+               "value_sec %ld value_usec %ld",
+               which,
+               value->it_interval.tv_sec,
+               value->it_interval.tv_usec,
+               value->it_value.tv_sec,
+               value->it_value.tv_usec);
+}
+
+/* kernel_timer_set specialized tracepoint probe */
+
+void probe_timer_set(void *_data, struct timer_list *timer);
+
+DEFINE_MARKER_TP(kernel, timer_set, timer_set, probe_timer_set,
+       "expires %lu function %p data %lu");
+
+notrace void probe_timer_set(void *_data, struct timer_list *timer)
+{
+       struct marker *marker;
+       struct serialize_long_long_long data;
+
+       data.f1 = timer->expires;
+       data.f2 = (unsigned long)timer->function;
+       data.f3 = timer->data;
+
+       marker = &GET_MARKER(kernel, timer_set);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+void probe_timer_update_time(void *_data, struct timespec *_xtime,
+               struct timespec *_wall_to_monotonic)
+{
+       trace_mark_tp(kernel, timer_update_time, timer_update_time,
+               probe_timer_update_time,
+               "jiffies #8u%llu xtime_sec %ld xtime_nsec %ld "
+               "walltomonotonic_sec %ld walltomonotonic_nsec %ld",
+               (unsigned long long)jiffies_64, _xtime->tv_sec, _xtime->tv_nsec,
+               _wall_to_monotonic->tv_sec, _wall_to_monotonic->tv_nsec);
+}
+
+void probe_timer_timeout(void *_data, struct task_struct *p)
+{
+       trace_mark_tp(kernel, timer_timeout, timer_timeout,
+               probe_timer_timeout, "pid %d", p->pid);
+}
+
+void probe_kernel_printk(void *_data, unsigned long retaddr)
+{
+       trace_mark_tp(kernel, printk, kernel_printk,
+               probe_kernel_printk, "ip 0x%lX", retaddr);
+}
+
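+/*
+ * printk buffers may carry a "<N>" loglevel prefix (e.g. "<4>oops\n").
+ * The probe below strips that prefix and any trailing newline before
+ * recording the message, then restores the buffer.
+ */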
+void probe_kernel_vprintk(void *_data, unsigned long retaddr, char *buf, int len)
+{
+       if (len > 0) {
+               unsigned int loglevel;
+               int mark_len;
+               char *mark_buf;
+               char saved_char;
+
+               if (buf[0] == '<' && buf[1] >= '0' &&
+                  buf[1] <= '7' && buf[2] == '>') {
+                       loglevel = buf[1] - '0';
+                       mark_buf = &buf[3];
+                       mark_len = len - 3;
+               } else {
+                       loglevel = default_message_loglevel;
+                       mark_buf = buf;
+                       mark_len = len;
+               }
+               if (mark_buf[mark_len - 1] == '\n')
+                       mark_len--;
+               saved_char = mark_buf[mark_len];
+               mark_buf[mark_len] = '\0';
+               trace_mark_tp(kernel, vprintk, kernel_vprintk,
+                       probe_kernel_vprintk,
+                       "loglevel #1u%u string %s ip 0x%lX",
+                       loglevel, mark_buf, retaddr);
+               mark_buf[mark_len] = saved_char;
+       }
+}
+
+#ifdef CONFIG_MODULES
+void probe_kernel_module_free(void *_data, struct module *mod)
+{
+       trace_mark_tp(kernel, module_free, kernel_module_free,
+               probe_kernel_module_free, "name %s", mod->name);
+}
+
+void probe_kernel_module_load(void *_data, struct module *mod)
+{
+       trace_mark_tp(kernel, module_load, kernel_module_load,
+               probe_kernel_module_load, "name %s", mod->name);
+}
+#endif
+
+void probe_kernel_panic(void *_data, const char *fmt, va_list args)
+{
+       char info[64];
+       vsnprintf(info, sizeof(info), fmt, args);
+       trace_mark_tp(kernel, panic, kernel_panic, probe_kernel_panic,
+               "info %s", info);
+}
+
+void probe_kernel_kernel_kexec(void *_data, struct kimage *image)
+{
+       trace_mark_tp(kernel, kernel_kexec, kernel_kernel_kexec,
+               probe_kernel_kernel_kexec, "image %p", image);
+}
+
+void probe_kernel_crash_kexec(void *_data, struct kimage *image, struct pt_regs *regs)
+{
+       trace_mark_tp(kernel, crash_kexec, kernel_crash_kexec,
+               probe_kernel_crash_kexec, "image %p ip %p", image,
+               regs ? (void *)instruction_pointer(regs) : NULL);
+}
+
+/* kernel_page_fault_entry specialized tracepoint probe */
+
+void probe_kernel_page_fault_entry(void *_data, struct pt_regs *regs, int trapnr,
+       struct mm_struct *mm, struct vm_area_struct *vma,
+       unsigned long address, int write_access);
+
+DEFINE_MARKER_TP(kernel, page_fault_entry, page_fault_entry,
+       probe_kernel_page_fault_entry,
+       "ip #p%lu address #p%lu trap_id #2u%u write_access #1u%u");
+
+notrace void probe_kernel_page_fault_entry(void *_data, struct pt_regs *regs, int trapnr,
+       struct mm_struct *mm, struct vm_area_struct *vma,
+       unsigned long address, int write_access)
+{
+       struct marker *marker;
+       struct serialize_long_long_short_char data;
+
+       if (likely(regs))
+               data.f1 = instruction_pointer(regs);
+       else
+               data.f1 = 0UL;
+       data.f2 = address;
+       data.f3 = (unsigned short)trapnr;
+       data.f4 = (unsigned char)!!write_access;
+
+       marker = &GET_MARKER(kernel, page_fault_entry);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+/* kernel_page_fault_exit specialized tracepoint probe */
+
+void probe_kernel_page_fault_exit(void *_data, int res);
+
+DEFINE_MARKER_TP(kernel, page_fault_exit, page_fault_exit,
+       probe_kernel_page_fault_exit,
+       "res %d");
+
+notrace void probe_kernel_page_fault_exit(void *_data, int res)
+{
+       struct marker *marker;
+
+       marker = &GET_MARKER(kernel, page_fault_exit);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &res, sizeof(res), sizeof(res));
+}
+
+/* kernel_page_fault_nosem_entry specialized tracepoint probe */
+
+void probe_kernel_page_fault_nosem_entry(void *_data, struct pt_regs *regs,
+       int trapnr, unsigned long address);
+
+DEFINE_MARKER_TP(kernel, page_fault_nosem_entry, page_fault_nosem_entry,
+       probe_kernel_page_fault_nosem_entry,
+       "ip #p%lu address #p%lu trap_id #2u%u");
+
+notrace void probe_kernel_page_fault_nosem_entry(void *_data, struct pt_regs *regs,
+       int trapnr, unsigned long address)
+{
+       struct marker *marker;
+       struct serialize_long_long_short data;
+
+       if (likely(regs))
+               data.f1 = instruction_pointer(regs);
+       else
+               data.f1 = 0UL;
+       data.f2 = address;
+       data.f3 = (unsigned short)trapnr;
+
+       marker = &GET_MARKER(kernel, page_fault_nosem_entry);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+/* kernel_page_fault_nosem_exit specialized tracepoint probe */
+
+void probe_kernel_page_fault_nosem_exit(void *_data, int res);
+
+DEFINE_MARKER_TP(kernel, page_fault_nosem_exit, page_fault_nosem_exit,
+       probe_kernel_page_fault_nosem_exit,
+       MARK_NOARGS);
+
+notrace void probe_kernel_page_fault_nosem_exit(void *_data, int res)
+{
+       struct marker *marker;
+
+       marker = &GET_MARKER(kernel, page_fault_nosem_exit);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               NULL, 0, 0);
+}
+
+/* kernel_page_fault_get_user_entry specialized tracepoint probe */
+
+void probe_kernel_page_fault_get_user_entry(void *_data, struct mm_struct *mm,
+       struct vm_area_struct *vma, unsigned long address, int write_access);
+
+DEFINE_MARKER_TP(kernel, page_fault_get_user_entry, page_fault_get_user_entry,
+       probe_kernel_page_fault_get_user_entry,
+       "address #p%lu write_access #1u%u");
+
+notrace void probe_kernel_page_fault_get_user_entry(void *_data, struct mm_struct *mm,
+       struct vm_area_struct *vma, unsigned long address, int write_access)
+{
+       struct marker *marker;
+       struct serialize_long_char data;
+
+       data.f1 = address;
+       data.f2 = (unsigned char)!!write_access;
+
+       marker = &GET_MARKER(kernel, page_fault_get_user_entry);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+/* kernel_page_fault_get_user_exit specialized tracepoint probe */
+
+void probe_kernel_page_fault_get_user_exit(void *_data, int res);
+
+DEFINE_MARKER_TP(kernel, page_fault_get_user_exit, page_fault_get_user_exit,
+       probe_kernel_page_fault_get_user_exit,
+       "res %d");
+
+notrace void probe_kernel_page_fault_get_user_exit(void *_data, int res)
+{
+       struct marker *marker;
+
+       marker = &GET_MARKER(kernel, page_fault_get_user_exit);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &res, sizeof(res), sizeof(res));
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("kernel Tracepoint Probes");
diff --git a/probes/lockdep-trace.c b/probes/lockdep-trace.c
new file mode 100644 (file)
index 0000000..a9a7734
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * ltt/probes/lockdep-trace.c
+ *
+ * lockdep tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/lockdep.h>
+#include <trace/lockdep.h>
+
+void probe_lockdep_hardirqs_on(void *data, unsigned long retaddr)
+{
+       trace_mark_tp(lockdep, hardirqs_on, lockdep_hardirqs_on,
+               probe_lockdep_hardirqs_on, "retaddr 0x%lX", retaddr);
+}
+
+void probe_lockdep_hardirqs_off(void *data, unsigned long retaddr)
+{
+       trace_mark_tp(lockdep, hardirqs_off, lockdep_hardirqs_off,
+               probe_lockdep_hardirqs_off, "retaddr 0x%lX", retaddr);
+}
+
+void probe_lockdep_softirqs_on(void *data, unsigned long retaddr)
+{
+       trace_mark_tp(lockdep, softirqs_on, lockdep_softirqs_on,
+               probe_lockdep_softirqs_on, "retaddr 0x%lX", retaddr);
+}
+
+void probe_lockdep_softirqs_off(void *data, unsigned long retaddr)
+{
+       trace_mark_tp(lockdep, softirqs_off, lockdep_softirqs_off,
+               probe_lockdep_softirqs_off, "retaddr 0x%lX", retaddr);
+}
+
+void probe_lockdep_lock_acquire(void *data, unsigned long retaddr,
+               unsigned int subclass, struct lockdep_map *lock, int trylock,
+               int read, int hardirqs_off)
+{
+       trace_mark_tp(lockdep, lock_acquire, lockdep_lock_acquire,
+               probe_lockdep_lock_acquire,
+               "retaddr 0x%lX subclass %u lock %p trylock %d read %d "
+               "hardirqs_off %d",
+               retaddr, subclass, lock, trylock, read, hardirqs_off);
+}
+
+void probe_lockdep_lock_release(void *data, unsigned long retaddr,
+               struct lockdep_map *lock, int nested)
+{
+       trace_mark_tp(lockdep, lock_release, lockdep_lock_release,
+               probe_lockdep_lock_release,
+               "retaddr 0x%lX lock %p nested %d",
+               retaddr, lock, nested);
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("lockdep Tracepoint Probes");
diff --git a/probes/mm-trace.c b/probes/mm-trace.c
new file mode 100644 (file)
index 0000000..935e366
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * ltt/probes/mm-trace.c
+ *
+ * MM tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <linux/swapops.h>
+#include <trace/page_alloc.h>
+#include <trace/filemap.h>
+#include <trace/swap.h>
+#include <trace/hugetlb.h>
+
+#include "../ltt-type-serializer.h"
+
+void probe_wait_on_page_start(void *_data, struct page *page, int bit_nr)
+{
+       trace_mark_tp(mm, wait_on_page_start, wait_on_page_start,
+               probe_wait_on_page_start, "pfn %lu bit_nr %d",
+               page_to_pfn(page), bit_nr);
+}
+
+void probe_wait_on_page_end(void *_data, struct page *page, int bit_nr)
+{
+       trace_mark_tp(mm, wait_on_page_end, wait_on_page_end,
+               probe_wait_on_page_end, "pfn %lu bit_nr %d",
+               page_to_pfn(page), bit_nr);
+}
+
+void probe_hugetlb_page_free(void *_data, struct page *page)
+{
+       trace_mark_tp(mm, huge_page_free, hugetlb_page_free,
+               probe_hugetlb_page_free, "pfn %lu", page_to_pfn(page));
+}
+
+void probe_hugetlb_page_alloc(void *_data, struct page *page)
+{
+       if (page)
+               trace_mark_tp(mm, huge_page_alloc, hugetlb_page_alloc,
+                       probe_hugetlb_page_alloc, "pfn %lu", page_to_pfn(page));
+}
+
+/* mm_page_free specialized tracepoint probe */
+
+void probe_page_free(void *_data, struct page *page, unsigned int order);
+
+DEFINE_MARKER_TP(mm, page_free, page_free, probe_page_free,
+       "pfn %lu order %u");
+
+notrace void probe_page_free(void *_data, struct page *page, unsigned int order)
+{
+       struct marker *marker;
+       struct serialize_long_int data;
+
+       data.f1 = page_to_pfn(page);
+       data.f2 = order;
+
+       marker = &GET_MARKER(mm, page_free);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+/* mm_page_alloc specialized tracepoint probe */
+
+void probe_page_alloc(void *_data, struct page *page, unsigned int order);
+
+DEFINE_MARKER_TP(mm, page_alloc, page_alloc, probe_page_alloc,
+       "pfn %lu order %u");
+
+notrace void probe_page_alloc(void *_data, struct page *page, unsigned int order)
+{
+       struct marker *marker;
+       struct serialize_long_int data;
+
+       if (unlikely(!page))
+               return;
+
+       data.f1 = page_to_pfn(page);
+       data.f2 = order;
+
+       marker = &GET_MARKER(mm, page_alloc);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+#ifdef CONFIG_SWAP
+void probe_swap_in(void *_data, struct page *page, swp_entry_t entry)
+{
+       trace_mark_tp(mm, swap_in, swap_in, probe_swap_in,
+               "pfn %lu filp %p offset %lu",
+               page_to_pfn(page),
+               get_swap_info_struct(swp_type(entry))->swap_file,
+               swp_offset(entry));
+}
+
+void probe_swap_out(void *_data, struct page *page)
+{
+       trace_mark_tp(mm, swap_out, swap_out, probe_swap_out,
+               "pfn %lu filp %p offset %lu",
+               page_to_pfn(page),
+               get_swap_info_struct(swp_type(
+                       page_swp_entry(page)))->swap_file,
+               swp_offset(page_swp_entry(page)));
+}
+
+void probe_swap_file_close(void *_data, struct file *file)
+{
+       trace_mark_tp(mm, swap_file_close, swap_file_close,
+               probe_swap_file_close, "filp %p", file);
+}
+
+void probe_swap_file_open(void *_data, struct file *file, char *filename)
+{
+       trace_mark_tp(mm, swap_file_open, swap_file_open,
+               probe_swap_file_open, "filp %p filename %s",
+               file, filename);
+}
+#endif
+
+void probe_add_to_page_cache(void *_data, struct address_space *mapping, pgoff_t offset)
+{
+       trace_mark_tp(mm, add_to_page_cache, add_to_page_cache,
+               probe_add_to_page_cache,
+               "inode %lu sdev %u",
+               mapping->host->i_ino, mapping->host->i_sb->s_dev);
+}
+
+void probe_remove_from_page_cache(void *_data, struct address_space *mapping)
+{
+       trace_mark_tp(mm, remove_from_page_cache, remove_from_page_cache,
+               probe_remove_from_page_cache,
+               "inode %lu sdev %u",
+               mapping->host->i_ino, mapping->host->i_sb->s_dev);
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("MM Tracepoint Probes");
diff --git a/probes/net-extended-trace.c b/probes/net-extended-trace.c
new file mode 100644 (file)
index 0000000..15fc810
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * ltt/probes/net-extended-trace.c
+ *
+ * Net tracepoint extended probes.
+ *
+ * These probes record many header fields from TCP and UDP messages. Here are
+ * the consequences of this:
+ * 1) it allows analyzing network traffic to provide some pcap-like
+ *    functionality within LTTng
+ * 2) it allows offline synchronization of a group of concurrent traces
+ *    recorded on different nodes
+ * 3) it increases tracing overhead
+ *
+ * You can leave out these probes or not activate them if you are not
+ * especially interested in the details of network traffic and do not wish to
+ * synchronize distributed traces.
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/in_route.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <net/route.h>
+#include <trace/net.h>
+
+#include "../ltt-type-serializer.h"
+
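+/*
+ * Note on the marker format strings below (LTT serializer conventions):
+ * "#1u"/"#2u"/"#4u" record 1-, 2- and 4-byte unsigned fields, a leading
+ * "n" (as in "#n4u") keeps the field in network byte order, and the
+ * trailing %-token documents the C type passed in.
+ */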
+void probe_net_dev_xmit_extended(void *_data, struct sk_buff *skb);
+
+DEFINE_MARKER_TP(net, dev_xmit_extended, net_dev_xmit,
+       probe_net_dev_xmit_extended, "skb 0x%lX network_protocol #n2u%hu "
+       "transport_protocol #1u%u saddr #n4u%lu daddr #n4u%lu "
+       "tot_len #n2u%hu ihl #1u%u source #n2u%hu dest #n2u%hu seq #n4u%lu "
+       "ack_seq #n4u%lu doff #1u%u ack #1u%u rst #1u%u syn #1u%u fin #1u%u");
+
+notrace void probe_net_dev_xmit_extended(void *_data, struct sk_buff *skb)
+{
+       struct marker *marker;
+       struct serialize_l214421224411111 data;
+       struct iphdr *iph = ip_hdr(skb);
+       struct tcphdr *th = tcp_hdr(skb);
+
+       data.f1 = (unsigned long)skb;
+       data.f2 = skb->protocol;
+
+       if (ntohs(skb->protocol) == ETH_P_IP) {
+               data.f3 = ip_hdr(skb)->protocol;
+               data.f4 = iph->saddr;
+               data.f5 = iph->daddr;
+               data.f6 = iph->tot_len;
+               data.f7 = iph->ihl;
+
+               if (data.f3 == IPPROTO_TCP) {
+                       data.f8 = th->source;
+                       data.f9 = th->dest;
+                       data.f10 = th->seq;
+                       data.f11 = th->ack_seq;
+                       data.f12 = th->doff;
+                       data.f13 = th->ack;
+                       data.f14 = th->rst;
+                       data.f15 = th->syn;
+                       data.f16 = th->fin;
+               }
+       }
+
+       marker = &GET_MARKER(net, dev_xmit_extended);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+void probe_tcpv4_rcv_extended(void *_data, struct sk_buff *skb);
+
+DEFINE_MARKER_TP(net, tcpv4_rcv_extended, net_tcpv4_rcv,
+       probe_tcpv4_rcv_extended, "skb 0x%lX saddr #n4u%lu daddr #n4u%lu "
+       "tot_len #n2u%hu ihl #1u%u source #n2u%hu dest #n2u%hu seq #n4u%lu "
+       "ack_seq #n4u%lu doff #1u%u ack #1u%u rst #1u%u syn #1u%u fin #1u%u");
+
+notrace void probe_tcpv4_rcv_extended(void *_data, struct sk_buff *skb)
+{
+       struct marker *marker;
+       struct serialize_l4421224411111 data;
+       struct iphdr *iph = ip_hdr(skb);
+       struct tcphdr *th = tcp_hdr(skb);
+
+       data.f1 = (unsigned long)skb;
+       data.f2 = iph->saddr;
+       data.f3 = iph->daddr;
+       data.f4 = iph->tot_len;
+       data.f5 = iph->ihl;
+       data.f6 = th->source;
+       data.f7 = th->dest;
+       data.f8 = th->seq;
+       data.f9 = th->ack_seq;
+       data.f10 = th->doff;
+       data.f11 = th->ack;
+       data.f12 = th->rst;
+       data.f13 = th->syn;
+       data.f14 = th->fin;
+
+       marker = &GET_MARKER(net, tcpv4_rcv_extended);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+void probe_udpv4_rcv_extended(void *_data, struct sk_buff *skb);
+
+DEFINE_MARKER_TP(net, udpv4_rcv_extended, net_udpv4_rcv,
+       probe_udpv4_rcv_extended, "skb 0x%lX saddr #n4u%lu daddr #n4u%lu "
+       "unicast #1u%u ulen #n2u%hu source #n2u%hu dest #n2u%hu "
+       "data_start #8u%lx");
+
+notrace void probe_udpv4_rcv_extended(void *_data, struct sk_buff *skb)
+{
+       struct marker *marker;
+       struct serialize_l4412228 data;
+       struct iphdr *iph = ip_hdr(skb);
+       struct rtable *rt = skb_rtable(skb);
+       struct udphdr *uh = udp_hdr(skb);
+
+       data.f1 = (unsigned long)skb;
+       data.f2 = iph->saddr;
+       data.f3 = iph->daddr;
+       data.f4 = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST) ? 0 : 1;
+       data.f5 = uh->len;
+       data.f6 = uh->source;
+       data.f7 = uh->dest;
+       /*
+        * The UDP header has not been pulled from skb->data; read the
+        * first 8 bytes of UDP data if they are not in a fragment.
+        */
+       data.f8 = 0;
+       if (skb_headlen(skb) >= sizeof(struct udphdr) + 8)
+               data.f8 = *(unsigned long long *)(skb->data + sizeof(*uh));
+       else if (skb_headlen(skb) >= sizeof(struct udphdr))
+               memcpy(&data.f8, skb->data + sizeof(struct udphdr),
+                       skb_headlen(skb) - sizeof(struct udphdr));
+
+       marker = &GET_MARKER(net, udpv4_rcv_extended);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(unsigned long long));
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Benjamin Poirier");
+MODULE_DESCRIPTION("Net Tracepoint Extended Probes");
diff --git a/probes/net-trace.c b/probes/net-trace.c
new file mode 100644 (file)
index 0000000..3124125
--- /dev/null
@@ -0,0 +1,406 @@
+/*
+ * ltt/probes/net-trace.c
+ *
+ * Net tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <trace/net.h>
+#include <trace/ipv4.h>
+#include <trace/ipv6.h>
+#include <trace/socket.h>
+
+#include "../ltt-type-serializer.h"
+
+void probe_net_dev_xmit(void *_data, struct sk_buff *skb);
+
+DEFINE_MARKER_TP(net, dev_xmit, net_dev_xmit, probe_net_dev_xmit,
+       "skb %p protocol #n2u%hu");
+
+notrace void probe_net_dev_xmit(void *_data, struct sk_buff *skb)
+{
+       struct marker *marker;
+       struct serialize_long_short data;
+
+       data.f1 = (unsigned long)skb;
+       data.f2 = skb->protocol;
+
+       marker = &GET_MARKER(net, dev_xmit);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+void probe_net_dev_receive(void *_data, struct sk_buff *skb);
+
+DEFINE_MARKER_TP(net, dev_receive, net_dev_receive, probe_net_dev_receive,
+       "skb %p protocol #n2u%hu");
+
+notrace void probe_net_dev_receive(void *_data, struct sk_buff *skb)
+{
+       struct marker *marker;
+       struct serialize_long_short data;
+
+       data.f1 = (unsigned long)skb;
+       data.f2 = skb->protocol;
+
+       marker = &GET_MARKER(net, dev_receive);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+void probe_ipv4_addr_add(void *_data, struct in_ifaddr *ifa)
+{
+       trace_mark_tp(netif_state, insert_ifa_ipv4, ipv4_addr_add,
+               probe_ipv4_addr_add, "label %s address #4u%u",
+               ifa->ifa_label, (unsigned int)ifa->ifa_address);
+}
+
+void probe_ipv4_addr_del(void *_data, struct in_ifaddr *ifa)
+{
+       trace_mark_tp(netif_state, del_ifa_ipv4, ipv4_addr_del,
+               probe_ipv4_addr_del, "label %s address #4u%u",
+               ifa->ifa_label, (unsigned int)ifa->ifa_address);
+}
+
+void probe_ipv6_addr_add(void *_data, struct inet6_ifaddr *ifa)
+{
+       __u8 *addr = ifa->addr.s6_addr;
+
+       trace_mark_tp(netif_state, insert_ifa_ipv6, ipv6_addr_add,
+               probe_ipv6_addr_add,
+               "label %s "
+               "a15 #1x%c a14 #1x%c a13 #1x%c a12 #1x%c "
+               "a11 #1x%c a10 #1x%c a9 #1x%c a8 #1x%c "
+               "a7 #1x%c a6 #1x%c a5 #1x%c a4 #1x%c "
+               "a3 #1x%c a2 #1x%c a1 #1x%c a0 #1x%c",
+               ifa->idev->dev->name,
+               addr[15], addr[14], addr[13], addr[12],
+               addr[11], addr[10], addr[9], addr[8],
+               addr[7], addr[6], addr[5], addr[4],
+               addr[3], addr[2], addr[1], addr[0]);
+}
+
+void probe_ipv6_addr_del(void *_data, struct inet6_ifaddr *ifa)
+{
+       __u8 *addr = ifa->addr.s6_addr;
+
+       trace_mark_tp(netif_state, del_ifa_ipv6, ipv6_addr_del,
+               probe_ipv6_addr_del,
+               "label %s "
+               "a15 #1x%c a14 #1x%c a13 #1x%c a12 #1x%c "
+               "a11 #1x%c a10 #1x%c a9 #1x%c a8 #1x%c "
+               "a7 #1x%c a6 #1x%c a5 #1x%c a4 #1x%c "
+               "a3 #1x%c a2 #1x%c a1 #1x%c a0 #1x%c",
+               ifa->idev->dev->name,
+               addr[15], addr[14], addr[13], addr[12],
+               addr[11], addr[10], addr[9], addr[8],
+               addr[7], addr[6], addr[5], addr[4],
+               addr[3], addr[2], addr[1], addr[0]);
+}
+
+void probe_socket_create(void *_data, int family, int type, int protocol,
+       struct socket *sock, int ret)
+{
+       trace_mark_tp(net, socket_create, socket_create, probe_socket_create,
+               "family %d type %d protocol %d sock %p ret %d",
+               family, type, protocol, sock, ret);
+}
+
+void probe_socket_bind(void *_data, int fd, struct sockaddr __user *umyaddr, int addrlen,
+       int ret)
+{
+       trace_mark_tp(net, socket_bind, socket_bind, probe_socket_bind,
+               "fd %d umyaddr %p addrlen %d ret %d",
+               fd, umyaddr, addrlen, ret);
+}
+
+void probe_socket_connect(void *_data, int fd, struct sockaddr __user *uservaddr,
+       int addrlen, int ret)
+{
+       trace_mark_tp(net, socket_connect, socket_connect, probe_socket_connect,
+               "fd %d uservaddr %p addrlen %d ret %d",
+               fd, uservaddr, addrlen, ret);
+}
+
+void probe_socket_listen(void *_data, int fd, int backlog, int ret)
+{
+       trace_mark_tp(net, socket_listen, socket_listen, probe_socket_listen,
+               "fd %d backlog %d ret %d",
+               fd, backlog, ret);
+}
+
+void probe_socket_accept(void *_data, int fd, struct sockaddr __user *upeer_sockaddr,
+       int __user *upeer_addrlen, int flags, int ret)
+{
+       trace_mark_tp(net, socket_accept, socket_accept, probe_socket_accept,
+               "fd %d upeer_sockaddr %p upeer_addrlen %p flags %d ret %d",
+               fd, upeer_sockaddr, upeer_addrlen, flags, ret);
+}
+
+void probe_socket_getsockname(void *_data, int fd, struct sockaddr __user *usockaddr,
+       int __user *usockaddr_len, int ret)
+{
+       trace_mark_tp(net, socket_getsockname, socket_getsockname,
+               probe_socket_getsockname,
+               "fd %d usockaddr %p usockaddr_len %p ret %d",
+               fd, usockaddr, usockaddr_len, ret);
+}
+
+void probe_socket_getpeername(void *_data, int fd, struct sockaddr __user *usockaddr,
+       int __user *usockaddr_len, int ret)
+{
+       trace_mark_tp(net, socket_getpeername, socket_getpeername,
+               probe_socket_getpeername,
+               "fd %d usockaddr %p usockaddr_len %p ret %d",
+               fd, usockaddr, usockaddr_len, ret);
+}
+
+void probe_socket_socketpair(void *_data, int family, int type, int protocol,
+       int __user *usockvec, int ret)
+{
+       trace_mark_tp(net, socket_socketpair, socket_socketpair,
+               probe_socket_socketpair,
+               "family %d type %d protocol %d usockvec %p ret %d",
+               family, type, protocol, usockvec, ret);
+}
+
+void probe_socket_sendmsg(void *_data, struct socket *sock, struct msghdr *msg, size_t size,
+       int ret);
+
+DEFINE_MARKER_TP(net, socket_sendmsg, net_socket_sendmsg,
+       probe_socket_sendmsg,
+       "sock %p msg %p size %zu ret %d");
+
+notrace void probe_socket_sendmsg(void *_data, struct socket *sock, struct msghdr *msg,
+       size_t size, int ret)
+{
+       struct marker *marker;
+       struct serialize_long_long_sizet_int data;
+
+       data.f1 = (unsigned long)sock;
+       data.f2 = (unsigned long)msg;
+       data.f3 = size;
+       data.f4 = ret;
+
+       marker = &GET_MARKER(net, socket_sendmsg);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(size_t));
+}
+
+void probe_socket_recvmsg(void *_data, struct socket *sock, struct msghdr *msg, size_t size,
+       int flags, int ret);
+
+DEFINE_MARKER_TP(net, socket_recvmsg, net_socket_recvmsg,
+       probe_socket_recvmsg,
+       "sock %p msg %p size %zu flags %d ret %d");
+
+notrace void probe_socket_recvmsg(void *_data, struct socket *sock, struct msghdr *msg,
+       size_t size, int flags, int ret)
+{
+       struct marker *marker;
+       struct serialize_long_long_sizet_int_int data;
+
+       data.f1 = (unsigned long)sock;
+       data.f2 = (unsigned long)msg;
+       data.f3 = size;
+       data.f4 = flags;
+       data.f5 = ret;
+
+       marker = &GET_MARKER(net, socket_recvmsg);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(size_t));
+}
+
+void probe_socket_setsockopt(void *_data, int fd, int level, int optname,
+       char __user *optval, int optlen, int ret)
+{
+       trace_mark_tp(net, socket_setsockopt, socket_setsockopt,
+               probe_socket_setsockopt,
+               "fd %d level %d optname %d optval %p optlen %d ret %d",
+               fd, level, optname, optval, optlen, ret);
+}
+
+void probe_socket_getsockopt(void *_data, int fd, int level, int optname,
+       char __user *optval, int __user *optlen, int ret)
+{
+       trace_mark_tp(net, socket_getsockopt, socket_getsockopt,
+               probe_socket_getsockopt,
+               "fd %d level %d optname %d optval %p optlen %p ret %d",
+               fd, level, optname, optval, optlen, ret);
+}
+
+void probe_socket_shutdown(void *_data, int fd, int how, int ret)
+{
+       trace_mark_tp(net, socket_shutdown, socket_shutdown,
+               probe_socket_shutdown,
+               "fd %d how %d ret %d",
+               fd, how, ret);
+}
+
+void probe_socket_call(void *_data, int call, unsigned long a0)
+{
+       trace_mark_tp(net, socket_call, socket_call, probe_socket_call,
+               "call %d a0 %lu", call, a0);
+}
+
+void probe_tcpv4_rcv(void *_data, struct sk_buff *skb);
+
+DEFINE_MARKER_TP(net, tcpv4_rcv, net_tcpv4_rcv, probe_tcpv4_rcv,
+       "skb %p");
+
+notrace void probe_tcpv4_rcv(void *_data, struct sk_buff *skb)
+{
+       struct marker *marker;
+
+       marker = &GET_MARKER(net, tcpv4_rcv);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &skb, sizeof(skb), sizeof(skb));
+}
+
+void probe_udpv4_rcv(void *_data, struct sk_buff *skb);
+
+DEFINE_MARKER_TP(net, udpv4_rcv, net_udpv4_rcv, probe_udpv4_rcv,
+       "skb %p");
+
+notrace void probe_udpv4_rcv(void *_data, struct sk_buff *skb)
+{
+       struct marker *marker;
+
+       marker = &GET_MARKER(net, udpv4_rcv);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &skb, sizeof(skb), sizeof(skb));
+}
+
+#ifdef CONFIG_NETPOLL
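+/*
+ * The NAPI probes serialize a variable-length payload: only the
+ * napi_struct pointer plus the bytes of the device name actually used
+ * (strlen + 1) are written, rather than the full
+ * serialize_long_ifname structure.
+ */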
+void probe_net_napi_schedule(void *_data, struct napi_struct *n);
+
+DEFINE_MARKER_TP(net, napi_schedule, net_napi_schedule,
+       probe_net_napi_schedule,
+       "napi_struct %p name %s");
+
+notrace void probe_net_napi_schedule(void *_data, struct napi_struct *n)
+{
+       struct marker *marker;
+       struct serialize_long_ifname data;
+       size_t data_len = 0;
+
+       data.f1 = (unsigned long)n;
+       data_len += sizeof(data.f1);
+       /* No need to align for strings */
+       strcpy(data.f2, n->dev ? n->dev->name : "<unk>");
+       data_len += strlen(data.f2) + 1;
+
+       marker = &GET_MARKER(net, napi_schedule);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, data_len, sizeof(long));
+}
+
+void probe_net_napi_poll(void *_data, struct napi_struct *n);
+
+DEFINE_MARKER_TP(net, napi_poll, net_napi_poll,
+       probe_net_napi_poll,
+       "napi_struct %p name %s");
+
+notrace void probe_net_napi_poll(void *_data, struct napi_struct *n)
+{
+       struct marker *marker;
+       struct serialize_long_ifname data;
+       size_t data_len = 0;
+
+       data.f1 = (unsigned long)n;
+       data_len += sizeof(data.f1);
+       /* No need to align for strings */
+       strcpy(data.f2, n->dev ? n->dev->name : "<unk>");
+       data_len += strlen(data.f2) + 1;
+
+       marker = &GET_MARKER(net, napi_poll);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, data_len, sizeof(long));
+}
+
+void probe_net_napi_complete(void *_data, struct napi_struct *n);
+
+DEFINE_MARKER_TP(net, napi_complete, net_napi_complete,
+       probe_net_napi_complete,
+       "napi_struct %p name %s");
+
+notrace void probe_net_napi_complete(void *_data, struct napi_struct *n)
+{
+       struct marker *marker;
+       struct serialize_long_ifname data;
+       size_t data_len = 0;
+
+       data.f1 = (unsigned long)n;
+       data_len += sizeof(data.f1);
+       /* No need to align for strings */
+       strcpy(data.f2, n->dev ? n->dev->name : "<unk>");
+       data_len += strlen(data.f2) + 1;
+
+       marker = &GET_MARKER(net, napi_complete);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, data_len, sizeof(long));
+}
+#else /* !CONFIG_NETPOLL */
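+/*
+ * Without CONFIG_NETPOLL the device name is not recorded; the payload
+ * is a single unsigned long holding the napi_struct pointer, so no
+ * run-time length computation is needed.
+ */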
+void probe_net_napi_schedule(void *_data, struct napi_struct *n);
+
+DEFINE_MARKER_TP(net, napi_schedule, net_napi_schedule,
+       probe_net_napi_schedule,
+       "napi_struct %p");
+
+notrace void probe_net_napi_schedule(void *_data, struct napi_struct *n)
+{
+       struct marker *marker;
+       unsigned long data;
+
+       data = (unsigned long)n;
+
+       marker = &GET_MARKER(net, napi_schedule);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, sizeof(data), sizeof(data));
+}
+
+void probe_net_napi_poll(void *_data, struct napi_struct *n);
+
+DEFINE_MARKER_TP(net, napi_poll, net_napi_poll,
+       probe_net_napi_poll,
+       "napi_struct %p");
+
+notrace void probe_net_napi_poll(void *_data, struct napi_struct *n)
+{
+       struct marker *marker;
+       unsigned long data;
+
+       data = (unsigned long)n;
+
+       marker = &GET_MARKER(net, napi_poll);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, sizeof(data), sizeof(data));
+}
+
+void probe_net_napi_complete(void *_data, struct napi_struct *n);
+
+DEFINE_MARKER_TP(net, napi_complete, net_napi_complete,
+       probe_net_napi_complete,
+       "napi_struct %p");
+
+notrace void probe_net_napi_complete(void *_data, struct napi_struct *n)
+{
+       struct marker *marker;
+       unsigned long data;
+
+       data = (unsigned long)n;
+
+       marker = &GET_MARKER(net, napi_complete);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, sizeof(data), sizeof(data));
+}
+#endif
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Net Tracepoint Probes");
diff --git a/probes/pm-trace.c b/probes/pm-trace.c
new file mode 100644 (file)
index 0000000..7abe8e3
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * ltt/probes/pm-trace.c
+ *
+ * Power Management tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <trace/pm.h>
+
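+/*
+ * All four pm probes have the same shape: trace_mark_tp() records a
+ * single irqstate field, and the "#1" size prefix in the format string
+ * stores the irqs_disabled() result as one byte in the trace.
+ */
+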
+void probe_pm_idle_entry(void *_data)
+{
+       trace_mark_tp(pm, idle_entry, pm_idle_entry,
+               probe_pm_idle_entry, "irqstate #1%d",
+               irqs_disabled());
+}
+
+void probe_pm_idle_exit(void *_data)
+{
+       trace_mark_tp(pm, idle_exit, pm_idle_exit,
+               probe_pm_idle_exit, "irqstate #1%d",
+               irqs_disabled());
+}
+
+void probe_pm_suspend_entry(void *_data)
+{
+       trace_mark_tp(pm, suspend_entry, pm_suspend_entry,
+               probe_pm_suspend_entry, "irqstate #1%d",
+               irqs_disabled());
+}
+
+void probe_pm_suspend_exit(void *_data)
+{
+       trace_mark_tp(pm, suspend_exit, pm_suspend_exit,
+               probe_pm_suspend_exit, "irqstate #1%d",
+               irqs_disabled());
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Power Management Tracepoint Probes");
diff --git a/probes/rcu-trace.c b/probes/rcu-trace.c
new file mode 100644 (file)
index 0000000..cc16454
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * ltt/probes/rcu-trace.c
+ *
+ * RCU tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <trace/rcu.h>
+
+#ifdef CONFIG_TREE_RCU
+void probe_rcu_tree_callback(void *data, struct rcu_head *head)
+{
+       trace_mark_tp(rcu, tree_callback, rcu_tree_callback,
+               probe_rcu_tree_callback, "func %p", head->func);
+}
+
+void probe_rcu_tree_call_rcu(void *data, struct rcu_head *head, unsigned long ip)
+{
+       trace_mark_tp(rcu, tree_call_rcu, rcu_tree_call_rcu,
+               probe_rcu_tree_call_rcu, "func %p ip 0x%lX", head->func, ip);
+}
+
+void probe_rcu_tree_call_rcu_bh(void *data, struct rcu_head *head, unsigned long ip)
+{
+       trace_mark_tp(rcu, tree_call_rcu_bh, rcu_tree_call_rcu_bh,
+               probe_rcu_tree_call_rcu_bh, "func %p ip 0x%lX",
+               head->func, ip);
+}
+#endif
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("RCU Tracepoint Probes");
diff --git a/probes/syscall-trace.c b/probes/syscall-trace.c
new file mode 100644 (file)
index 0000000..9ae419f
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * ltt/probes/syscall-trace.c
+ *
+ * System call tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <trace/syscall.h>
+
+#include "../ltt-type-serializer.h"
+
+
+/* kernel_syscall_entry specialized tracepoint probe */
+
+void probe_syscall_entry(void *_data, struct pt_regs *regs, long id);
+
+DEFINE_MARKER_TP(kernel, syscall_entry, syscall_entry,
+       probe_syscall_entry, "ip #p%ld syscall_id #2u%u");
+
+notrace void probe_syscall_entry(void *_data, struct pt_regs *regs, long id)
+{
+       struct marker *marker;
+       struct serialize_long_short data;
+
+       data.f1 = instruction_pointer(regs);
+       data.f2 = (unsigned short)id;
+
+       marker = &GET_MARKER(kernel, syscall_entry);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
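+/*
+ * The format "ip #p%ld syscall_id #2u%u" matches struct
+ * serialize_long_short field for field: the instruction pointer is
+ * stored at pointer size (#p) and the syscall id is truncated to the
+ * two bytes of an unsigned short (#2u).
+ */
+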
+/* kernel_syscall_exit specialized tracepoint probe */
+
+void probe_syscall_exit(void *_data, long ret);
+
+DEFINE_MARKER_TP(kernel, syscall_exit, syscall_exit,
+       probe_syscall_exit, "ret %ld");
+
+notrace void probe_syscall_exit(void *_data, long ret)
+{
+       struct marker *marker;
+
+       marker = &GET_MARKER(kernel, syscall_exit);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &ret, sizeof(ret), sizeof(ret));
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("syscall Tracepoint Probes");
diff --git a/probes/trap-trace.c b/probes/trap-trace.c
new file mode 100644 (file)
index 0000000..397254c
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * ltt/probes/trap-trace.c
+ *
+ * Trap tracepoint probes.
+ *
+ * (C) Copyright 2009 - Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <trace/trap.h>
+
+#include "../ltt-type-serializer.h"
+
+/* kernel_trap_entry specialized tracepoint probe */
+
+void probe_trap_entry(void *_data, struct pt_regs *regs, long id);
+
+DEFINE_MARKER_TP(kernel, trap_entry, trap_entry,
+       probe_trap_entry, "ip #p%ld trap_id #2u%u");
+
+notrace void probe_trap_entry(void *_data, struct pt_regs *regs, long id)
+{
+       struct marker *marker;
+       struct serialize_long_short data;
+
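+       /* regs may be NULL on some trap paths; record ip 0 in that case. */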
+       if (likely(regs))
+               data.f1 = instruction_pointer(regs);
+       else
+               data.f1 = 0UL;
+       data.f2 = (unsigned short)id;
+
+       marker = &GET_MARKER(kernel, trap_entry);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               &data, serialize_sizeof(data), sizeof(long));
+}
+
+/* kernel_trap_exit specialized tracepoint probe */
+
+void probe_trap_exit(void *_data);
+
+DEFINE_MARKER_TP(kernel, trap_exit, trap_exit,
+       probe_trap_exit, MARK_NOARGS);
+
+notrace void probe_trap_exit(void *_data)
+{
+       struct marker *marker;
+
+       marker = &GET_MARKER(kernel, trap_exit);
+       ltt_specialized_trace(marker, marker->single.probe_private,
+               NULL, 0, 0);
+}
+
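+/*
+ * trap_exit carries no payload (MARK_NOARGS): a NULL buffer with zero
+ * size and alignment records only the event header.
+ */
+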
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Trap Tracepoint Probes");