Monotonic accurate time

The goal of this design is to provide a monotonic time with the following
properties:

Readable from userspace without a system call
Readable from an NMI handler
Readable without disabling interrupts
Readable without disabling preemption
Only one clock source (the most precise available: the TSC)
Support for architectures with variable TSC frequency

The main difference from the wall time currently implemented in the Linux
kernel is that the time update is done atomically instead of under a write
seqlock. This permits reading the time from an NMI handler and from
userspace.
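
For illustration, here is a minimal user-level sketch of the same two-slot
publication scheme, written with C11 atomics instead of kernel barriers (all
names here are illustrative, not part of the design; plain struct copies
stand in for the kernel's barrier-ordered accesses):

#include <stdatomic.h>
#include <stdint.h>

struct sample { uint64_t tsc, freq, walltime; };

static struct sample slot[2];		/* two versions of the time base */
static atomic_long count;		/* parity selects the live slot */

static void publish(struct sample s)	/* single writer */
{
	long c = atomic_load_explicit(&count, memory_order_relaxed);

	slot[(c + 1) & 1] = s;		/* fill the inactive slot */
	atomic_store_explicit(&count, c + 1, memory_order_release);
}

static struct sample snapshot(void)	/* any number of readers */
{
	struct sample s;
	long c;

	do {
		c = atomic_load_explicit(&count, memory_order_acquire);
		s = slot[c & 1];	/* read the live slot */
		atomic_thread_fence(memory_order_acquire);
	} while (atomic_load_explicit(&count,
				      memory_order_relaxed) != c);
	return s;
}

The kernel structures used by this design follow.
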
struct time_info {
	u64 tsc;
	u64 freq;
	u64 walltime;
};

struct time_struct {
	struct time_info time_sel[2];
	long update_count;
};

DEFINE_PER_CPU(struct time_struct, cpu_time);

/* On frequency change event */
/* Runs in interrupt context */
void freq_change_cb(unsigned int new_freq)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *write_time, *current_time;

	write_time =
		&this_cpu_time->time_sel[(this_cpu_time->update_count + 1) & 1];
	current_time =
		&this_cpu_time->time_sel[this_cpu_time->update_count & 1];
	write_time->tsc = get_cycles();
	write_time->freq = new_freq;
	/* We accumulate the division imprecision. This is the downside of
	 * using a TSC with variable frequency as a time base. */
	write_time->walltime =
		current_time->walltime +
		(write_time->tsc - current_time->tsc) /
		current_time->freq;
	wmb();
	this_cpu_time->update_count++;
}
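
One way to wire freq_change_cb() into the kernel is the cpufreq transition
notifier chain. A sketch follows; note that cpufreq reports frequencies in
kHz, so a conversion to whatever unit the divisions above assume is still
needed, and the cpu check assumes the notifier may fire for another CPU's
transition:

#include <linux/cpufreq.h>

static int time_freq_notifier(struct notifier_block *nb,
			      unsigned long event, void *data)
{
	struct cpufreq_freqs *freqs = data;

	/* Update our time base once the new frequency is in effect. */
	if (event == CPUFREQ_POSTCHANGE &&
	    freqs->cpu == smp_processor_id())
		freq_change_cb(freqs->new);	/* kHz */
	return NOTIFY_OK;
}

static struct notifier_block time_freq_nb = {
	.notifier_call = time_freq_notifier,
};

/* At init:
 * cpufreq_register_notifier(&time_freq_nb, CPUFREQ_TRANSITION_NOTIFIER);
 */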


/* Init cpu freq */
void init_cpu_freq(void)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *current_time;

	memset(this_cpu_time, 0, sizeof(*this_cpu_time));
	current_time =
		&this_cpu_time->time_sel[this_cpu_time->update_count & 1];
	/* Init current time */
	/* Get frequency */
	/* Reset cpus to 0 ns, 0 tsc, start their tsc. */
}
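
A possible fleshing-out of the stub above, as a sketch: it assumes
cpufreq_quick_get() (which reports the current frequency in kHz) and converts
to the cycles-per-ns unit that the divisions in this design imply. A real
implementation would use scaled fixed-point arithmetic to avoid losing
sub-GHz precision; the name init_cpu_time is hypothetical.

#include <linux/cpufreq.h>

static void init_cpu_time(void)
{
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *current_time;

	memset(this_cpu_time, 0, sizeof(*this_cpu_time));
	current_time =
		&this_cpu_time->time_sel[this_cpu_time->update_count & 1];
	current_time->tsc = get_cycles();	/* "now" becomes time 0 */
	/* kHz -> cycles per ns (integer division: sketch only). */
	current_time->freq =
		cpufreq_quick_get(smp_processor_id()) / 1000000;
	current_time->walltime = 0;		/* walltime starts at 0 ns */
}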


/* After a CPU comes back from hlt */
/* The trick is to sync all the other CPUs on the first CPU that came up when
 * they wake. If all CPUs are down, then there is no need to increment the
 * walltime: let's simply define the useful walltime on a machine as the time
 * elapsed while at least one CPU is running. If we want, when no CPU is
 * active, a lower-resolution clock can somehow keep track of walltime.
 * A sketch of this resynchronization follows the stub. */

void wake_from_hlt(void)
{
	/* TODO */
}
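
A sketch of the resynchronization described above. Assumptions (not from the
original design): it runs on the CPU that just woke, ref_cpu is a CPU that
stayed up, both share a frequency domain, and the interval since ref_cpu's
last published update is dropped; a real implementation would recover that
delta, for example via an IPI to ref_cpu.

static void resync_from(unsigned int ref_cpu)
{
	struct time_struct *ref = &per_cpu(cpu_time, ref_cpu);
	struct time_struct *this_cpu_time =
		&per_cpu(cpu_time, smp_processor_id());
	struct time_info *ref_time, *write_time;

	/* Base ourselves on the reference CPU's last published slot. */
	ref_time = &ref->time_sel[ref->update_count & 1];
	write_time = &this_cpu_time->time_sel[
			(this_cpu_time->update_count + 1) & 1];
	write_time->tsc = get_cycles();		/* local TSC, not ref's */
	write_time->freq = ref_time->freq;
	write_time->walltime = ref_time->walltime;
	wmb();
	this_cpu_time->update_count++;
}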


/* Read the time from anywhere in the kernel; returns walltime in ns. */
/* If update_count changes while we read the time context, it may be invalid.
 * This would happen if we are scheduled out for a period of time long enough
 * to permit 2 frequency changes. We simply start the loop again if it
 * happens. We detect it by comparing the update_count running counter. */
/* FIXME: if the thread is migrated to another CPU, get_cycles() is bad */
/* Problem: the sequence get cpu id / migrate / get_cycles() / migrate back /
 * get cpu id and check would not detect the migration. */
u64 read_time(void)
{
	u64 walltime;
	long update_count;
	struct time_struct *this_cpu_time;
	struct time_info *current_time;
	unsigned int cpu;

	do {
		cpu = _smp_processor_id();
		this_cpu_time = &per_cpu(cpu_time, cpu);
		update_count = this_cpu_time->update_count;
		current_time = &this_cpu_time->time_sel[update_count & 1];
		walltime = current_time->walltime +
			(get_cycles() - current_time->tsc) /
			current_time->freq;
	} while (this_cpu_time->update_count != update_count
		 || cpu != _smp_processor_id());
	return walltime;
}

/* Userspace */
/* Export all this data to userspace through the vsyscall page. Use a function
 * like read_time() to read the walltime. This function can be implemented
 * as-is because it doesn't need to disable preemption. */
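
A userspace sketch of such a reader. Hypothetical pieces (not specified by
this design): vsyscall_cpu_time[] as the exported per-cpu array in the
vsyscall page, and vgetcpu() as a kernel-provided way to learn the current
CPU without a system call. The retry loop tolerates migration the same way
the kernel read_time() does.

#include <stdint.h>

struct time_info { uint64_t tsc, freq, walltime; };
struct time_struct { struct time_info time_sel[2]; long update_count; };

extern const struct time_struct vsyscall_cpu_time[];	/* hypothetical */
extern unsigned int vgetcpu(void);			/* hypothetical */

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

uint64_t user_read_time(void)
{
	const struct time_struct *t;
	const struct time_info *cur;
	uint64_t walltime;
	long count;
	unsigned int cpu;

	do {
		cpu = vgetcpu();
		t = &vsyscall_cpu_time[cpu];
		count = t->update_count;
		cur = &t->time_sel[count & 1];
		walltime = cur->walltime +
			(rdtsc() - cur->tsc) / cur->freq;
	} while (t->update_count != count || cpu != vgetcpu());
	return walltime;
}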