From b9c0c958f48982d12913ac1175c6c5fabce30769 Mon Sep 17 00:00:00 2001 From: compudj Date: Fri, 29 Sep 2006 19:59:03 +0000 Subject: [PATCH] update bench git-svn-id: http://ltt.polymtl.ca/svn@2143 04897980-b3bd-0310-b5e0-8ef037075253 --- tests/markers/markers-microbench-0.2.txt | 189 +++++++++++++++++++---- 1 file changed, 158 insertions(+), 31 deletions(-) diff --git a/tests/markers/markers-microbench-0.2.txt b/tests/markers/markers-microbench-0.2.txt index 6237f34f..b97b059d 100644 --- a/tests/markers/markers-microbench-0.2.txt +++ b/tests/markers/markers-microbench-0.2.txt @@ -3,91 +3,129 @@ * Microbenchmarks Use timestamp counter to calculate the time spent, with interrupts disabled. -Machine : Pentium 4 3GHz +Machine : Pentium 4 3GHz, 1GB ram Fully preemptible kernel -marker : MARK(subsys_mark1, "%d %p", 1, NULL); Linux Kernel Markers 0.19 +Kernel : Linux 2.6.17 + +marker : MARK(subsys_mark1, "%d %p", 1, NULL); + +This marker, with two elements (integer and pointer) have been chosen because it +is representative of high volume events. For instance, a trap entry event logs a +trap_id (long) and an address (pointer). The same applies to system calls, where +a system call entry event logs both the ID of the system call and the address of +the caller. + * Execute an empty loop + +- Without marker NR_LOOPS : 10000000 time delta (cycles): 15026497 cycles per loop : 1.50 + - i386 "optimized" : immediate value, test and predicted branch (non connected marker) NR_LOOPS : 10000000 time delta (cycles): 40031640 cycles per loop : 4.00 -cycles per loop for marker : 2.50 +cycles per loop for marker : 4.00-1.50=2.50 + - i386 "generic" : load, test and predicted branch (non connected marker) NR_LOOPS : 10000000 time delta (cycles): 26697878 cycles per loop : 2.67 -cycles per loop for marker : 1.17 +cycles per loop for marker : 2.67-1.50=1.17 + * Execute a loop of memcpy 4096 bytes + +This test has been done to show the impact of markers on a system where the +memory is already used, which is more representative of a running kernel. + - Without marker NR_LOOPS : 10000 time delta (cycles): 12981555 cycles per loop : 1298.16 + - i386 "optimized" : immediate value, test and predicted branch (non connected marker) NR_LOOPS : 10000 time delta (cycles): 12982290 cycles per loop : 1298.23 -cycles per loop for marker : 0.074 +cycles per loop for marker : 1298.23-1298.16=0.074 + - i386 "generic" : load, test and predicted branch (non connected marker) NR_LOOPS : 10000 time delta (cycles): 13002788 cycles per loop : 1300.28 -cycles per loop for marker : 2.123 +cycles per loop for marker : 1300.28-1298.16=2.123 The following tests are done with the "optimized" markers only -Execute a loop with marker enabled, with i386 "fastcall" register argument - setup, probe empty. +- Execute a loop with marker enabled, with i386 "fastcall" register argument + setup, probe empty. With preemption disabling. NR_LOOPS : 100000 time delta (cycles): 4407608 cycles per loop : 44.08 -cycles per loop to setup arguments in registers : 44.08-4.00=40.08 +cycles per loop to disable preemption and setup arguments in registers : +44.08-4.00=40.08 -Execute a loop with a marker enabled, with an empty probe. Var args argument - setup, probe empty. +- Execute a loop with a marker enabled, with an empty probe. Var args argument + setup, probe empty. With preemption disabling. NR_LOOPS : 100000 time delta (cycles): 5210587 cycles per loop : 52.11 additional cycles per loop to setup var args : 52.11-44.08=8.03 -Execute a loop with marker enabled, with i386 "asmlinkage" arguments expected. - Data is copied by the probe. +- Execute a loop with a marker enabled, with an empty probe. Var args argument + setup, probe empty. No preemption disabling. +NR_LOOPS : 100000 +time delta (cycles): 3363450 +cycles per loop : 33.63 +cycles per loop to disable preemption : 44.08-33.63=10.45 + +- Execute a loop with marker enabled, with i386 "asmlinkage" arguments expected. + Data is copied by the probe. With preemption disabling. NR_LOOPS : 100000 time delta (cycles): 5299837 cycles per loop : 53.00 additional cycles per loop to get arguments in probe (from stack) on x86 : - 53.00-52.11=0.89 +53.00-52.11=0.89 -Execute a loop with marker enabled, with var args probe expecting arguments. - Data is copied by the probe. +- Execute a loop with marker enabled, with var args probe expecting arguments. + Data is copied by the probe. With preemption disabling. NR_LOOPS : 100000 time delta (cycles): 5574300 cycles per loop : 55.74 additional cycles per loop to get expected variable arguments on x86 : 55.74-53.00=2.74 -Execute a loop with marker enabled, with var args probe, format string - Data is copied by the probe. +- Execute a loop with marker enabled, with var args probe, format string + Data is copied by the probe. This is a 6 bytes string to decode. processing. NR_LOOPS : 100000 time delta (cycles): 9622117 cycles per loop : 96.22 -additional cycles per loop to dynamically parse arguments with format string : - 96.22-55.74=40.48 +additional cycles per loop to dynamically parse arguments with a 6 bytes format +string : +96.22-55.74=40.48 * Assembly code +The disassembly of the following marked function will be shown : + +static int my_open(struct inode *inode, struct file *file) +{ + MARK(subsys_mark1, "%d %p", 1, NULL); + + return -EPERM; +} + - Optimized @@ -97,7 +135,7 @@ static int my_open(struct inode *inode, struct file *file) 1: 89 e5 mov %esp,%ebp 3: 83 ec 0c sub $0xc,%esp MARK(subsys_mark1, "%d %p", 1, NULL); - 6: b0 00 mov $0x0,%al + 6: b0 00 mov $0x0,%al <-- immediate load 0 in al 8: 84 c0 test %al,%al a: 75 07 jne 13 @@ -107,15 +145,15 @@ static int my_open(struct inode *inode, struct file *file) 11: c9 leave 12: c3 ret 13: b8 01 00 00 00 mov $0x1,%eax - 18: e8 fc ff ff ff call 19 + 18: e8 fc ff ff ff call 19 <-- preempt_disable 1d: c7 44 24 08 00 00 00 movl $0x0,0x8(%esp) 24: 00 25: c7 44 24 04 01 00 00 movl $0x1,0x4(%esp) 2c: 00 2d: c7 04 24 0d 00 00 00 movl $0xd,(%esp) - 34: ff 15 74 10 00 00 call *0x1074 + 34: ff 15 74 10 00 00 call *0x1074 <-- function pointer 3a: b8 01 00 00 00 mov $0x1,%eax - 3f: e8 fc ff ff ff call 40 + 3f: e8 fc ff ff ff call 40 <-- preempt_enable 44: eb c6 jmp c @@ -127,7 +165,7 @@ static int my_open(struct inode *inode, struct file *file) 1: 89 e5 mov %esp,%ebp 3: 83 ec 0c sub $0xc,%esp MARK(subsys_mark1, "%d %p", 1, NULL); - 6: 0f b6 05 20 10 00 00 movzbl 0x1020,%eax + 6: 0f b6 05 20 10 00 00 movzbl 0x1020,%eax <-- memory load byte d: 84 c0 test %al,%al f: 75 07 jne 18 @@ -137,28 +175,101 @@ static int my_open(struct inode *inode, struct file *file) 16: c9 leave 17: c3 ret 18: b8 01 00 00 00 mov $0x1,%eax - 1d: e8 fc ff ff ff call 1e + 1d: e8 fc ff ff ff call 1e <-- preempt_disable 22: c7 44 24 08 00 00 00 movl $0x0,0x8(%esp) 29: 00 2a: c7 44 24 04 01 00 00 movl $0x1,0x4(%esp) 31: 00 32: c7 04 24 0d 00 00 00 movl $0xd,(%esp) - 39: ff 15 74 10 00 00 call *0x1074 + 39: ff 15 74 10 00 00 call *0x1074 <-- function pointer 3f: b8 01 00 00 00 mov $0x1,%eax - 44: e8 fc ff ff ff call 45 + 44: e8 fc ff ff ff call 45 <-- preempt_enable 49: eb c6 jmp 11 + + +Here is the typical var arg probe that has been used in those tests. It saves +the values expectes as parameters in global variables. The DO_MARK1_FORMAT +define is used for probe registration to make sure that it will be connected +with a marker that has a matching format string. Note that this checking is +optional : the probe can register with a NULL format and afterward check itself +the format string received in parameter dynamically. + +int value; +void *ptr; + +#define DO_MARK1_FORMAT "%d %p" +void do_mark1(const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + value = va_arg(ap, int); + ptr = va_arg(ap, void*); + + va_end(ap); +} + + +Here is the disassembly of the probe : + +#define DO_MARK1_FORMAT "%d %p" +void do_mark1(const char *format, ...) +{ + 0: 55 push %ebp + 1: 89 e5 mov %esp,%ebp + 3: 83 ec 04 sub $0x4,%esp + va_list ap; + + va_start(ap, format); + value = va_arg(ap, int); + 6: 8b 45 0c mov 0xc(%ebp),%eax + 9: a3 00 00 00 00 mov %eax,0x0 + ptr = va_arg(ap, void*); + e: 8b 45 10 mov 0x10(%ebp),%eax + 11: a3 00 00 00 00 mov %eax,0x0 + + va_end(ap); +} + 16: c9 leave + 17: c3 ret + + * Size (x86) +This is the size added by each marker to the memory image : + - Optimized +.text section : instructions Adds 6 bytes in the "likely" path. Adds 32 bytes in the "unlikely" path. - -- Generic - +.data section : r/w data +0 byte +.rodata.str1 : strings +Length of the marker name +.debug_str : strings (if loaded..) +Length of the marker name + 7 bytes (__mark_) +.markers +8 bytes (2 pointers) +.markers.c +12 bytes (3 pointers) + +- Generic + +.text section : instructions Adds 11 bytes in the "likely" path. Adds 32 bytes in the "unlikely" path. +.data section : r/w data +1 byte (the activation flag) +.rodata.str1 : strings +Length of the marker name +.debug_str : strings (if loaded..) +Length of the marker name + 7 bytes (__mark_) +.markers +8 bytes (2 pointers) +.markers.c +12 bytes (3 pointers) * Macrobenchmarks @@ -212,5 +323,21 @@ There is almost no difference, on x86, between passing the arguments directly on the stack and using a variable argument list when its layout is known statically (0.89 cycles vs 2.74 cycles). +The int3 approach for adding instrumentation dynamically saves the 0.074 cycle +(typcal use, high memory usage) used by the optimized marker by adding the +ability to insert a breakpoint at any location without any impact on the code +when inactive. This breakpoint based approach is very useful to instrument core +kernel code that has not been previously marked without need to recompile and +reboot. We can therefore compare the case "without markers" to the null impact +of the int3 breakpoint based approach when inactive. + + + + + + + + + -- 2.34.1