From b9c0c958f48982d12913ac1175c6c5fabce30769 Mon Sep 17 00:00:00 2001
From: compudj <compudj@04897980-b3bd-0310-b5e0-8ef037075253>
Date: Fri, 29 Sep 2006 19:59:03 +0000
Subject: [PATCH] update bench

git-svn-id: http://ltt.polymtl.ca/svn@2143 04897980-b3bd-0310-b5e0-8ef037075253
---
 tests/markers/markers-microbench-0.2.txt | 189 +++++++++++++++++++----
 1 file changed, 158 insertions(+), 31 deletions(-)

diff --git a/tests/markers/markers-microbench-0.2.txt b/tests/markers/markers-microbench-0.2.txt
index 6237f34f..b97b059d 100644
--- a/tests/markers/markers-microbench-0.2.txt
+++ b/tests/markers/markers-microbench-0.2.txt
@@ -3,91 +3,129 @@
 * Microbenchmarks
 
 Use timestamp counter to calculate the time spent, with interrupts disabled.
-Machine : Pentium 4 3GHz
+Machine : Pentium 4 3GHz, 1GB ram
 Fully preemptible kernel
-marker : MARK(subsys_mark1, "%d %p", 1, NULL);
 Linux Kernel Markers 0.19
+Kernel : Linux 2.6.17
+
+marker : MARK(subsys_mark1, "%d %p", 1, NULL);
+
+This marker, with two elements (integer and pointer) have been chosen because it
+is representative of high volume events. For instance, a trap entry event logs a
+trap_id (long) and an address (pointer). The same applies to system calls, where
+a system call entry event logs both the ID of the system call and the address of
+the caller.
+
 
 * Execute an empty loop
+
+- Without marker
 NR_LOOPS : 10000000
 time delta (cycles): 15026497
 cycles per loop : 1.50
+
 - i386 "optimized" : immediate value, test and predicted branch
   (non connected marker)
 NR_LOOPS : 10000000
 time delta (cycles): 40031640
 cycles per loop : 4.00
-cycles per loop for marker : 2.50
+cycles per loop for marker : 4.00-1.50=2.50
+
 - i386 "generic" : load, test and predicted branch
   (non connected marker)
 NR_LOOPS : 10000000
 time delta (cycles): 26697878
 cycles per loop : 2.67
-cycles per loop for marker : 1.17
+cycles per loop for marker : 2.67-1.50=1.17
+
 
 * Execute a loop of memcpy 4096 bytes
+
+This test has been done to show the impact of markers on a system where the
+memory is already used, which is more representative of a running kernel.
+
 - Without marker
 NR_LOOPS : 10000
 time delta (cycles): 12981555
 cycles per loop : 1298.16
+
 - i386 "optimized" : immediate value, test and predicted branch
   (non connected marker)
 NR_LOOPS : 10000
 time delta (cycles): 12982290
 cycles per loop : 1298.23
-cycles per loop for marker : 0.074
+cycles per loop for marker : 1298.23-1298.16=0.074
+
 - i386 "generic" : load, test and predicted branch
   (non connected marker)
 NR_LOOPS : 10000
 time delta (cycles): 13002788
 cycles per loop : 1300.28
-cycles per loop for marker : 2.123
+cycles per loop for marker : 1300.28-1298.16=2.123
 
 
 The following tests are done with the "optimized" markers only
 
-Execute a loop with marker enabled, with i386 "fastcall" register argument
-	setup, probe empty.
+- Execute a loop with marker enabled, with i386 "fastcall" register argument
+  setup, probe empty. With preemption disabling.
 NR_LOOPS : 100000
 time delta (cycles): 4407608
 cycles per loop : 44.08
-cycles per loop to setup arguments in registers : 44.08-4.00=40.08
+cycles per loop to disable preemption and setup arguments in registers :
+44.08-4.00=40.08
 
-Execute a loop with a marker enabled, with an empty probe. Var args argument
-	setup, probe empty.
+- Execute a loop with a marker enabled, with an empty probe. Var args argument
+  setup, probe empty. With preemption disabling.
 NR_LOOPS : 100000
 time delta (cycles): 5210587
 cycles per loop : 52.11
 additional cycles per loop to setup var args : 52.11-44.08=8.03
 
-Execute a loop with marker enabled, with i386 "asmlinkage" arguments expected.
-	Data is copied by the probe.
+- Execute a loop with a marker enabled, with an empty probe. Var args argument
+  setup, probe empty. No preemption disabling.
+NR_LOOPS : 100000
+time delta (cycles): 3363450
+cycles per loop : 33.63
+cycles per loop to disable preemption : 44.08-33.63=10.45
+
+- Execute a loop with marker enabled, with i386 "asmlinkage" arguments expected.
+  Data is copied by the probe. With preemption disabling.
 NR_LOOPS : 100000
 time delta (cycles): 5299837
 cycles per loop : 53.00
 additional cycles per loop to get arguments in probe (from stack) on x86 :
-  53.00-52.11=0.89
+53.00-52.11=0.89
 
-Execute a loop with marker enabled, with var args probe expecting arguments.
-	Data is copied by the probe.
+- Execute a loop with marker enabled, with var args probe expecting arguments.
+  Data is copied by the probe. With preemption disabling.
 NR_LOOPS : 100000
 time delta (cycles): 5574300
 cycles per loop : 55.74
 additional cycles per loop to get expected variable arguments on x86 : 
   55.74-53.00=2.74
 
-Execute a loop with marker enabled, with var args probe, format string
-	Data is copied by the probe.
+- Execute a loop with marker enabled, with var args probe, format string
+  Data is copied by the probe. This is a 6 bytes string to decode.
 processing.
 NR_LOOPS : 100000
 time delta (cycles): 9622117
 cycles per loop : 96.22
-additional cycles per loop to dynamically parse arguments with format string :
-  96.22-55.74=40.48
+additional cycles per loop to dynamically parse arguments with a 6 bytes format
+string :
+96.22-55.74=40.48
 
 
 * Assembly code
 
+The disassembly of the following marked function will be shown :
+
+static int my_open(struct inode *inode, struct file *file)
+{
+        MARK(subsys_mark1, "%d %p", 1, NULL);
+
+        return -EPERM;
+}
+
 
 - Optimized
 
@@ -97,7 +135,7 @@ static int my_open(struct inode *inode, struct file *file)
    1:   89 e5                   mov    %esp,%ebp
    3:   83 ec 0c                sub    $0xc,%esp
         MARK(subsys_mark1, "%d %p", 1, NULL);
-   6:   b0 00                   mov    $0x0,%al
+   6:   b0 00                   mov    $0x0,%al <-- immediate load 0 in al
    8:   84 c0                   test   %al,%al
    a:   75 07                   jne    13 <my_open+0x13>
 
@@ -107,15 +145,15 @@ static int my_open(struct inode *inode, struct file *file)
   11:   c9                      leave  
   12:   c3                      ret    
   13:   b8 01 00 00 00          mov    $0x1,%eax
-  18:   e8 fc ff ff ff          call   19 <my_open+0x19>
+  18:   e8 fc ff ff ff          call   19 <my_open+0x19> <-- preempt_disable
   1d:   c7 44 24 08 00 00 00    movl   $0x0,0x8(%esp)
   24:   00 
   25:   c7 44 24 04 01 00 00    movl   $0x1,0x4(%esp)
   2c:   00 
   2d:   c7 04 24 0d 00 00 00    movl   $0xd,(%esp)
-  34:   ff 15 74 10 00 00       call   *0x1074
+  34:   ff 15 74 10 00 00       call   *0x1074 <-- function pointer
   3a:   b8 01 00 00 00          mov    $0x1,%eax
-  3f:   e8 fc ff ff ff          call   40 <my_open+0x40>
+  3f:   e8 fc ff ff ff          call   40 <my_open+0x40> <-- preempt_enable
   44:   eb c6                   jmp    c <my_open+0xc>
 
 
@@ -127,7 +165,7 @@ static int my_open(struct inode *inode, struct file *file)
    1:   89 e5                   mov    %esp,%ebp
    3:   83 ec 0c                sub    $0xc,%esp
         MARK(subsys_mark1, "%d %p", 1, NULL);
-   6:   0f b6 05 20 10 00 00    movzbl 0x1020,%eax
+   6:   0f b6 05 20 10 00 00    movzbl 0x1020,%eax <-- memory load byte
    d:   84 c0                   test   %al,%al
    f:   75 07                   jne    18 <my_open+0x18>
 
@@ -137,28 +175,101 @@ static int my_open(struct inode *inode, struct file *file)
   16:   c9                      leave  
   17:   c3                      ret    
   18:   b8 01 00 00 00          mov    $0x1,%eax
-  1d:   e8 fc ff ff ff          call   1e <my_open+0x1e>
+  1d:   e8 fc ff ff ff          call   1e <my_open+0x1e> <-- preempt_disable
   22:   c7 44 24 08 00 00 00    movl   $0x0,0x8(%esp)
   29:   00 
   2a:   c7 44 24 04 01 00 00    movl   $0x1,0x4(%esp)
   31:   00 
   32:   c7 04 24 0d 00 00 00    movl   $0xd,(%esp)
-  39:   ff 15 74 10 00 00       call   *0x1074
+  39:   ff 15 74 10 00 00       call   *0x1074 <-- function pointer
   3f:   b8 01 00 00 00          mov    $0x1,%eax
-  44:   e8 fc ff ff ff          call   45 <my_open+0x45>
+  44:   e8 fc ff ff ff          call   45 <my_open+0x45> <-- preempt_enable
   49:   eb c6                   jmp    11 <my_open+0x11>
 
+
+
+Here is the typical var arg probe that has been used in those tests. It saves
+the values expectes as parameters in global variables. The DO_MARK1_FORMAT
+define is used for probe registration to make sure that it will be connected
+with a marker that has a matching format string. Note that this checking is
+optional : the probe can register with a NULL format and afterward check itself
+the format string received in parameter dynamically.
+
+int value;
+void *ptr;
+
+#define DO_MARK1_FORMAT "%d %p"
+void do_mark1(const char *format, ...)
+{
+        va_list ap;
+
+        va_start(ap, format);
+        value = va_arg(ap, int);
+        ptr = va_arg(ap, void*);
+
+        va_end(ap);
+}
+
+
+Here is the disassembly of the probe :
+
+#define DO_MARK1_FORMAT "%d %p"
+void do_mark1(const char *format, ...)
+{
+   0:   55                      push   %ebp
+   1:   89 e5                   mov    %esp,%ebp
+   3:   83 ec 04                sub    $0x4,%esp
+        va_list ap;
+
+        va_start(ap, format);
+        value = va_arg(ap, int);
+   6:   8b 45 0c                mov    0xc(%ebp),%eax
+   9:   a3 00 00 00 00          mov    %eax,0x0
+        ptr = va_arg(ap, void*);
+   e:   8b 45 10                mov    0x10(%ebp),%eax
+  11:   a3 00 00 00 00          mov    %eax,0x0
+        
+        va_end(ap);
+}
+  16:   c9                      leave  
+  17:   c3                      ret    
+
+
 * Size (x86)
 
+This is the size added by each marker to the memory image :
+
 - Optimized
 
+.text section : instructions
 Adds 6 bytes in the "likely" path.
 Adds 32 bytes in the "unlikely" path.
-
-- Generic 
-
+.data section : r/w data
+0 byte
+.rodata.str1 : strings
+Length of the marker name
+.debug_str : strings (if loaded..)
+Length of the marker name + 7 bytes (__mark_)
+.markers
+8 bytes (2 pointers)
+.markers.c
+12 bytes (3 pointers)
+
+- Generic
+
+.text section : instructions
 Adds 11 bytes in the "likely" path.
 Adds 32 bytes in the "unlikely" path.
+.data section : r/w data
+1 byte (the activation flag)
+.rodata.str1 : strings
+Length of the marker name
+.debug_str : strings (if loaded..)
+Length of the marker name + 7 bytes (__mark_)
+.markers
+8 bytes (2 pointers)
+.markers.c
+12 bytes (3 pointers)
 
 
 * Macrobenchmarks
@@ -212,5 +323,21 @@ There is almost no difference, on x86, between passing the arguments directly on
 the stack and using a variable argument list when its layout is known
 statically (0.89 cycles vs 2.74 cycles).
 
+The int3 approach for adding instrumentation dynamically saves the 0.074 cycle
+(typcal use, high memory usage) used by the optimized marker by adding the
+ability to insert a breakpoint at any location without any impact on the code
+when inactive. This breakpoint based approach is very useful to instrument core
+kernel code that has not been previously marked without need to recompile and
+reboot. We can therefore compare the case "without markers" to the null impact
+of the int3 breakpoint based approach when inactive.
+
+
+
+
+
+
+
+
+
 
 
-- 
2.34.1