d8ce0927 |
1 | |
2 | Monotonic accurate time |
3 | |
4 | The goal of this design is to provide a monotonic time with these properties:
5 | |
6 | Readable from userspace without a system call |
7 | Readable from NMI handler |
8 | Readable without disabling interrupts |
9 | Readable without disabling preemption |
10 | Only one clock source (the most precise available: the TSC)
11 | Support architectures with variable TSC frequency. |
12 | |
13 | The main difference from the wall time currently implemented in the Linux
14 | kernel is that the time update is done atomically instead of using a write
15 | seqlock. This permits reading the time from an NMI handler and from userspace.
16 | |
17 | struct time_info { |
18 | u64 tsc; |
19 | u64 freq; |
20 | u64 walltime; |
21 | } |
22 | |
23 | static struct time_struct { |
24 | struct time_info time_sel[2]; |
25 | long update_count; |
26 | } |
27 | |
28 | DECLARE_PERCPU(struct time_struct, cpu_time); |
29 | |
30 | /* On frequency change event */ |
31 | /* In irq context */ |
32 | void freq_change_cb(unsigned int new_freq) |
33 | { |
34 | struct time_struct this_cpu_time = |
35 | per_cpu(cpu_time, smp_processor_id()); |
36 | struct time_info *write_time, *current_time; |
37 | write_time = |
38 | this_cpu_time->time_sel[(this_cpu_time->update_count+1)&1]; |
39 | current_time = |
40 | this_cpu_time->time_sel[(this_cpu_time->update_count)&1]; |
41 | write_time->tsc = get_cycles(); |
42 | write_time->freq = new_freq; |
43 | /* We cumulate the division imprecision. This is the downside of using |
44 | * the TSC with variable frequency as a time base. */ |
45 | write_time->walltime = |
46 | current_time->walltime + |
47 | (write_time->tsc - current_time->tsc) / |
48 | current_time->freq; |
49 | wmb(); |
50 | this_cpu_time->update_count++; |
51 | } |
52 | |
53 | |
54 | /* Init cpu freq */ |
55 | init_cpu_freq() |
56 | { |
57 | struct time_struct this_cpu_time = |
58 | per_cpu(cpu_time, smp_processor_id()); |
59 | struct time_info *current_time; |
60 | memset(this_cpu_time, 0, sizeof(this_cpu_time)); |
61 | current_time = this_cpu_time->time_sel[this_cpu_time->update_count&1]; |
62 | /* Init current time */ |
63 | /* Get frequency */ |
64 | /* Reset cpus to 0 ns, 0 tsc, start their tsc. */ |
65 | } |
66 | |
67 | |
/* After a CPU comes back from hlt */
/* The trick is to sync all the other CPUs on the first CPU up when they come
 * up. If all CPUs are down, then there is no need to increment the walltime :
 * let's simply define the useful walltime on a machine as the time elapsed
 * while there is a CPU running. If we want, when no cpu is active, we can use
 * a lower resolution clock to somehow keep track of walltime. */
/* Fix: give the stub an explicit void return type (implicit int is
 * invalid in modern C). */
void wake_from_hlt(void)
{
	/* TODO */
}
79 | |
80 | |
81 | |
82 | /* Read time from anywhere in the kernel. Return time in walltime. (ns) */ |
83 | /* If the update_count changes while we read the context, it may be invalid. |
84 | * This would happen if we are scheduled out for a period of time long enough to |
85 | * permit 2 frequency changes. We simply start the loop again if it happens. |
86 | * We detect it by comparing the update_count running counter. */ |
87 | u64 read_time(void) |
88 | { |
89 | u64 walltime; |
90 | long update_count; |
91 | struct time_struct this_cpu_time = |
92 | per_cpu(cpu_time, smp_processor_id()); |
93 | struct time_info *current_time; |
94 | do { |
95 | update_count = this_cpu_time->update_count; |
96 | current_time = this_cpu_time->time_sel[update_count&1]; |
97 | walltime = current_time->walltime + |
98 | (get_cycles() - current_time->tsc) / |
99 | current_time->freq; |
100 | } while(this_cpu_time->update_count != update_count); |
101 | return walltime; |
102 | } |
103 | |
104 | /* Userspace */ |
105 | /* Export all this data to user space through the vsyscall page. Use a function |
106 | * like read_time to read the walltime. This function can be implemented as-is |
107 | * because it doesn't need to disable preemption. */ |
108 | |
109 | |
110 | |