2 * Xen time implementation.
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
10 #include <linux/kernel.h>
11 #include <linux/interrupt.h>
12 #include <linux/clocksource.h>
13 #include <linux/clockchips.h>
14 #include <linux/kernel_stat.h>
15 #include <linux/math64.h>
16 #include <linux/gfp.h>
17 #include <linux/pvclock_gtod.h>
19 #include <asm/pvclock.h>
20 #include <asm/xen/hypervisor.h>
21 #include <asm/xen/hypercall.h>
23 #include <xen/events.h>
24 #include <xen/features.h>
25 #include <xen/interface/xen.h>
26 #include <xen/interface/vcpu.h>
30 /* Xen may fire a timer up to this many ns early */
31 #define TIMER_SLOP 100000
/* Nanoseconds per scheduler tick, derived from the kernel's HZ. */
32 #define NS_PER_TICK (1000000000LL / HZ)
34 /* runstate info updated by Xen */
35 static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
37 /* snapshots of runstate info */
38 static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
40 /* unused ns of stolen and blocked time */
/* Sub-tick remainders carried over between do_stolen_accounting() calls. */
41 static DEFINE_PER_CPU(u64, xen_residual_stolen);
42 static DEFINE_PER_CPU(u64, xen_residual_blocked);
44 /* return a consistent snapshot of 64-bit time/counter value */
/*
 * On 64-bit builds a single load of *p is atomic.  On 32-bit builds the
 * value is read as two 32-bit halves and the read is retried until the
 * high half is observed unchanged.
 * NOTE(review): several interior lines (local declarations, the loop
 * opening, the 64-bit fast path and the return) are elided in this
 * extract of the file.
 */
45 static u64 get64(const u64 *p)
49 if (BITS_PER_LONG < 64) {
54 * Read high then low, and then make sure high is
55 * still the same; this will only loop if low wraps
56 * and carries into high.
57 * XXX some clean way to make this endian-proof?
64 } while (p32[1] != h);
66 ret = (((u64)h) << 32) | l;
/*
 * Copy this CPU's hypervisor-maintained runstate info into *res,
 * retrying until a consistent snapshot is obtained (state_entry_time
 * is re-read and must match across the copy).  Must run with
 * preemption disabled — enforced by the BUG_ON below.
 * NOTE(review): the loop opening and the actual copy into *res are
 * elided in this extract.
 */
76 static void get_runstate_snapshot(struct vcpu_runstate_info *res)
79 struct vcpu_runstate_info *state;
81 BUG_ON(preemptible());
83 state = &__get_cpu_var(xen_runstate);
86 * The runstate info is always updated by the hypervisor on
87 * the current CPU, so there's no need to use anything
88 * stronger than a compiler barrier when fetching it.
91 state_time = get64(&state->state_entry_time);
95 } while (get64(&state->state_entry_time) != state_time);
98 /* return true when a vcpu could run but has no real cpu to run on */
/* Reads the Xen-updated per-vcpu runstate; RUNSTATE_runnable means
 * the vcpu is ready but not scheduled on a physical CPU. */
99 bool xen_vcpu_stolen(int vcpu)
101 return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
/*
 * Register this CPU's xen_runstate area with the hypervisor so that
 * Xen keeps it updated with per-vcpu running/runnable/blocked/offline
 * times.
 * NOTE(review): the body of the failure branch of the hypercall check
 * is elided in this extract.
 */
104 void xen_setup_runstate_info(int cpu)
106 struct vcpu_register_runstate_memory_area area;
108 area.addr.v = &per_cpu(xen_runstate, cpu);
110 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
/*
 * Account stolen (runnable + offline) and blocked time since the last
 * snapshot to the scheduler, in whole ticks.  Sub-tick remainders are
 * stashed in xen_residual_stolen / xen_residual_blocked and carried
 * into the next call.  Runs from the timer interrupt path, so per-CPU
 * accesses are safe without extra locking.
 * NOTE(review): local 'ticks' declaration, the snapshot update, and
 * some clamping lines appear elided in this extract.
 */
115 static void do_stolen_accounting(void)
117 struct vcpu_runstate_info state;
118 struct vcpu_runstate_info *snap;
119 s64 blocked, runnable, offline, stolen;
122 get_runstate_snapshot(&state);
124 WARN_ON(state.state != RUNSTATE_running);
126 snap = &__get_cpu_var(xen_runstate_snapshot);
128 /* work out how much time the VCPU has not been runn*ing* */
129 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
130 runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
131 offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
135 /* Add the appropriate number of ticks of stolen time,
136 including any left-overs from last time. */
137 stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
142 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
143 __this_cpu_write(xen_residual_stolen, stolen);
144 account_steal_ticks(ticks);
146 /* Add the appropriate number of ticks of blocked time,
147 including any left-overs from last time. */
148 blocked += __this_cpu_read(xen_residual_blocked);
153 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
154 __this_cpu_write(xen_residual_blocked, blocked);
155 account_idle_ticks(ticks);
158 /* Get the TSC speed from Xen */
/* Reads vcpu 0's pvclock parameters from the shared info page and
 * converts them to a kHz value via pvclock_tsc_khz(). */
159 static unsigned long xen_tsc_khz(void)
161 struct pvclock_vcpu_time_info *info =
162 &HYPERVISOR_shared_info->vcpu_info[0].time;
164 return pvclock_tsc_khz(info);
/*
 * Read the current time, in nanoseconds, from this CPU's pvclock
 * area.  Preemption is disabled with the notrace variants because this
 * is also used as sched_clock (see xen_time_ops below) and may be
 * called from tracing paths.
 * NOTE(review): the local 'ret' declaration and the return statement
 * are elided in this extract.
 */
167 cycle_t xen_clocksource_read(void)
169 struct pvclock_vcpu_time_info *src;
172 preempt_disable_notrace();
173 src = &__get_cpu_var(xen_vcpu)->time;
174 ret = pvclock_clocksource_read(src);
175 preempt_enable_notrace();
/* clocksource .read callback — adapts xen_clocksource_read() to the
 * struct clocksource interface (the cs argument is unused). */
179 static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
181 return xen_clocksource_read();
/*
 * Fill *ts with the wall-clock time published by Xen in the shared
 * info page, adjusted by this CPU's pvclock time.  get_cpu_var/
 * put_cpu_var pin the CPU across the pvclock read.
 */
184 static void xen_read_wallclock(struct timespec *ts)
186 struct shared_info *s = HYPERVISOR_shared_info;
187 struct pvclock_wall_clock *wall_clock = &(s->wc);
188 struct pvclock_vcpu_time_info *vcpu_time;
190 vcpu_time = &get_cpu_var(xen_vcpu)->time;
191 pvclock_read_wallclock(wall_clock, vcpu_time, ts);
192 put_cpu_var(xen_vcpu);
/* x86_platform.get_wallclock hook — thin wrapper over xen_read_wallclock(). */
195 static void xen_get_wallclock(struct timespec *now)
197 xen_read_wallclock(now);
/*
 * x86_platform.set_wallclock hook for non-dom0 guests (see
 * xen_init_time_ops).  Body elided in this extract — presumably a
 * stub returning an error, since a domU cannot set the host RTC;
 * TODO(review): confirm against the full file.
 */
200 static int xen_set_wallclock(const struct timespec *now)
/*
 * pvclock_gtod notifier: pushes the kernel's wall-clock time to Xen
 * (dom0 path — registered only for the initial domain) either when the
 * clock was explicitly set or when the 11-minute RTC-style sync period
 * has elapsed.  next_sync is static and, per the comment below,
 * serialized by the calling core code.
 * NOTE(review): the 'now' declaration, the early-return body and the
 * final return are elided in this extract.
 */
205 static int xen_pvclock_gtod_notify(struct notifier_block *nb,
206 unsigned long was_set, void *priv)
208 /* Protected by the calling core code serialization */
209 static struct timespec next_sync;
211 struct xen_platform_op op;
214 now = __current_kernel_time();
217 * We only take the expensive HV call when the clock was set
218 * or when the 11 minutes RTC synchronization time elapsed.
220 if (!was_set && timespec_compare(&now, &next_sync) < 0)
223 op.cmd = XENPF_settime;
224 op.u.settime.secs = now.tv_sec;
225 op.u.settime.nsecs = now.tv_nsec;
226 op.u.settime.system_time = xen_clocksource_read();
/* Result deliberately ignored: best-effort drift compensation. */
228 (void)HYPERVISOR_dom0_op(&op);
231 * Move the next drift compensation time 11 minutes
232 * ahead. That's emulating the sync_cmos_clock() update for
236 next_sync.tv_sec += 11 * 60;
/* Notifier registered in xen_time_init() for the initial domain. */
241 static struct notifier_block xen_pvclock_gtod_notifier = {
242 .notifier_call = xen_pvclock_gtod_notify,
/* The Xen pvclock-backed clocksource; registered at NSEC_PER_SEC so
 * its readings are already in nanoseconds.
 * NOTE(review): .name/.rating/.mask initializers elided in this extract. */
245 static struct clocksource xen_clocksource __read_mostly = {
248 .read = xen_clocksource_get_cycles,
250 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
254 Xen clockevent implementation
256 Xen has two clockevent implementations:
258 The old timer_op one works with all released versions of Xen prior
259 to version 3.0.4. This version of the hypervisor provides a
260 single-shot timer with nanosecond resolution. However, sharing the
261 same event channel is a 100Hz tick which is delivered while the
262 vcpu is running. We don't care about or use this tick, but it will
263 cause the core time code to think the timer fired too soon, and
264 will end up resetting it each time. It could be filtered, but
265 doing so has complications when the ktime clocksource is not yet
266 the xen clocksource (ie, at boot time).
268 The new vcpu_op-based timer interface allows the tick timer period
269 to be changed or turned off. The tick timer is not useful as a
270 periodic timer because events are only delivered to running vcpus.
271 The one-shot timer can report when a timeout is in the past, so
272 set_next_event is capable of returning -ETIME when appropriate.
273 This interface is used when available.
278 Get a hypervisor absolute time. In theory we could maintain an
279 offset between the kernel's time and the hypervisor's time, and
280 apply that to a kernel's absolute timeout. Unfortunately the
281 hypervisor and kernel times can drift even if the kernel is using
282 the Xen clocksource, because ntp can warp the kernel's clocksource.
/* Convert a relative delta (ns) into an absolute Xen-clock deadline. */
284 static s64 get_abs_timeout(unsigned long delta)
286 return xen_clocksource_read() + delta;
/*
 * clockevent set_mode for the legacy timer_op interface.  Only
 * shutdown needs action (cancel any pending timeout); periodic mode is
 * unsupported on this interface.
 * NOTE(review): the switch opening and the PERIODIC-case body (likely
 * a WARN/BUG) are elided in this extract.
 */
289 static void xen_timerop_set_mode(enum clock_event_mode mode,
290 struct clock_event_device *evt)
293 case CLOCK_EVT_MODE_PERIODIC:
298 case CLOCK_EVT_MODE_ONESHOT:
299 case CLOCK_EVT_MODE_RESUME:
302 case CLOCK_EVT_MODE_UNUSED:
303 case CLOCK_EVT_MODE_SHUTDOWN:
304 HYPERVISOR_set_timer_op(0); /* cancel timeout */
/*
 * Program a one-shot timeout via the legacy set_timer_op hypercall.
 * This interface cannot report an already-expired deadline; per the
 * comment below, a past deadline simply fires an immediate interrupt.
 * NOTE(review): the error-branch body and final return are elided in
 * this extract.
 */
309 static int xen_timerop_set_next_event(unsigned long delta,
310 struct clock_event_device *evt)
312 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
314 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
317 /* We may have missed the deadline, but there's no real way of
318 knowing for sure. If the event was in the past, then we'll
319 get an immediate interrupt. */
/* Clockevent device for the legacy timer_op interface (pre-Xen 3.0.4).
 * One-shot only; min delta is TIMER_SLOP because Xen may fire early.
 * NOTE(review): .name/.rating/.mult/.shift initializers elided in this
 * extract. */
324 static const struct clock_event_device xen_timerop_clockevent = {
326 .features = CLOCK_EVT_FEAT_ONESHOT,
328 .max_delta_ns = 0xffffffff,
329 .min_delta_ns = TIMER_SLOP,
335 .set_mode = xen_timerop_set_mode,
336 .set_next_event = xen_timerop_set_next_event,
/*
 * clockevent set_mode for the vcpu_op interface.  One-shot mode stops
 * the hypervisor's periodic 100Hz tick; shutdown stops both the
 * single-shot and periodic timers.  Periodic mode is unsupported.
 * NOTE(review): the switch opening, the BUG() bodies of the failed
 * hypercall branches, and break statements are elided in this extract.
 */
341 static void xen_vcpuop_set_mode(enum clock_event_mode mode,
342 struct clock_event_device *evt)
344 int cpu = smp_processor_id();
347 case CLOCK_EVT_MODE_PERIODIC:
348 WARN_ON(1); /* unsupported */
351 case CLOCK_EVT_MODE_ONESHOT:
352 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
356 case CLOCK_EVT_MODE_UNUSED:
357 case CLOCK_EVT_MODE_SHUTDOWN:
358 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
359 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
362 case CLOCK_EVT_MODE_RESUME:
/*
 * Program a one-shot timeout via VCPUOP_set_singleshot_timer.
 * VCPU_SSHOTTMR_future makes the hypercall return -ETIME for a
 * deadline already in the past, which this interface can propagate to
 * the clockevents core (unlike the timer_op variant).
 * NOTE(review): the 'ret' declaration and the return statement are
 * elided in this extract.
 */
367 static int xen_vcpuop_set_next_event(unsigned long delta,
368 struct clock_event_device *evt)
370 int cpu = smp_processor_id();
371 struct vcpu_set_singleshot_timer single;
374 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
376 single.timeout_abs_ns = get_abs_timeout(delta);
377 single.flags = VCPU_SSHOTTMR_future;
379 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
/* Only success or "deadline in the past" are acceptable outcomes. */
381 BUG_ON(ret != 0 && ret != -ETIME);
/* Clockevent device for the newer vcpu_op interface; preferred when
 * available (see xen_time_init).
 * NOTE(review): .name/.rating/.mult/.shift initializers elided in this
 * extract. */
386 static const struct clock_event_device xen_vcpuop_clockevent = {
388 .features = CLOCK_EVT_FEAT_ONESHOT,
390 .max_delta_ns = 0xffffffff,
391 .min_delta_ns = TIMER_SLOP,
397 .set_mode = xen_vcpuop_set_mode,
398 .set_next_event = xen_vcpuop_set_next_event,
/* Active clockevent template: defaults to the legacy timer_op device,
 * switched to the vcpu_op device in xen_time_init() when supported. */
401 static const struct clock_event_device *xen_clockevent =
402 &xen_timerop_clockevent;
/* Per-CPU clockevent instance; irq = -1 marks "not yet bound". */
403 static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events) = { .irq = -1 };
/*
 * VIRQ_TIMER handler: dispatch to the clockevent event_handler (if
 * registered) and account stolen/blocked time on every tick.
 * NOTE(review): the 'ret' handling and return statement are elided in
 * this extract.
 */
405 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
407 struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
411 if (evt->event_handler) {
412 evt->event_handler(evt);
416 do_stolen_accounting();
/*
 * Bind VIRQ_TIMER for @cpu to xen_timer_interrupt and initialize that
 * CPU's clockevent device from the active template.  Warns if the CPU
 * already has an IRQ bound (evt->irq >= 0).  The kasprintf'd name is
 * intentionally kept allocated for the lifetime of the IRQ.
 * NOTE(review): declarations of 'name'/'irq', the kasprintf failure
 * check, and the evt->irq assignment are elided in this extract.
 */
421 void xen_setup_timer(int cpu)
424 struct clock_event_device *evt;
427 evt = &per_cpu(xen_clock_events, cpu);
428 WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
430 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
432 name = kasprintf(GFP_KERNEL, "timer%d", cpu);
/* Fallback name if the allocation above failed. */
434 name = "<timer kasprintf failed>";
436 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
437 IRQF_DISABLED|IRQF_PERCPU|
438 IRQF_NOBALANCING|IRQF_TIMER|
442 memcpy(evt, xen_clockevent, sizeof(*evt));
444 evt->cpumask = cpumask_of(cpu);
/*
 * Unbind @cpu's timer IRQ (hotplug/teardown path).
 * NOTE(review): the BUG_ON guard and evt->irq reset that normally
 * accompany this are elided in this extract.
 */
449 void xen_teardown_timer(int cpu)
450 struct clock_event_device *evt;
452 evt = &per_cpu(xen_clock_events, cpu);
453 unbind_from_irqhandler(evt->irq, NULL);
/* Register the current CPU's clockevent device with the clockevents
 * core.  Must run with preemption disabled (per-CPU access). */
457 void xen_setup_cpu_clockevents(void)
459 BUG_ON(preemptible());
461 clockevents_register_device(&__get_cpu_var(xen_clock_events));
/*
 * After resume, re-stop the hypervisor's periodic tick on every online
 * CPU — but only when the vcpu_op clockevent is in use (the legacy
 * interface has no periodic timer to stop).
 * NOTE(review): the 'cpu' declaration, early return, and the BUG()
 * body of the failed hypercall branch are elided in this extract.
 */
464 void xen_timer_resume(void)
470 if (xen_clockevent != &xen_vcpuop_clockevent)
473 for_each_online_cpu(cpu) {
474 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
/* Paravirt time ops: sched_clock is served directly from the Xen
 * pvclock (xen_clocksource_read returns nanoseconds). */
479 static const struct pv_time_ops xen_time_ops __initconst = {
480 .sched_clock = xen_clocksource_read,
/*
 * Boot-time initialization: register the Xen clocksource, probe for
 * the vcpu_op timer interface (by attempting to stop the 100Hz
 * periodic tick), set the initial system time from the Xen wallclock,
 * and set up this CPU's runstate area, timer IRQ and clockevent.
 * For dom0, also register the notifier that pushes kernel time back
 * to Xen.
 * NOTE(review): the 'tp' declaration is elided in this extract.
 */
483 static void __init xen_time_init(void)
485 int cpu = smp_processor_id();
488 clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
490 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
491 /* Successfully turned off 100Hz tick, so we have the
492 vcpuop-based timer interface */
493 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
494 xen_clockevent = &xen_vcpuop_clockevent;
497 /* Set initial system time with full resolution */
498 xen_read_wallclock(&tp);
499 do_settimeofday(&tp);
/* The TSC is usable under Xen via pvclock scaling. */
501 setup_force_cpu_cap(X86_FEATURE_TSC);
503 xen_setup_runstate_info(cpu);
504 xen_setup_timer(cpu);
505 xen_setup_cpu_clockevents();
507 if (xen_initial_domain())
508 pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
/*
 * Hook Xen's time implementation into the paravirt/x86 init tables
 * for PV guests: timer init, per-CPU clockevent setup (no-op'd here —
 * done by xen_time_init itself), TSC calibration and wallclock access.
 */
511 void __init xen_init_time_ops(void)
513 pv_time_ops = xen_time_ops;
515 x86_init.timers.timer_init = xen_time_init;
516 x86_init.timers.setup_percpu_clockev = x86_init_noop;
517 x86_cpuinit.setup_percpu_clockev = x86_init_noop;
519 x86_platform.calibrate_tsc = xen_tsc_khz;
520 x86_platform.get_wallclock = xen_get_wallclock;
521 /* Dom0 uses the native method to set the hardware RTC. */
522 if (!xen_initial_domain())
523 x86_platform.set_wallclock = xen_set_wallclock;
526 #ifdef CONFIG_XEN_PVHVM
/*
 * Per-CPU clockevent setup for PVHVM guests: registers the runstate
 * area and the clockevent device.  As the comment below notes,
 * xen_setup_timer() is deliberately NOT called here because it
 * allocates (kasprintf) and this path runs in atomic context.
 */
527 static void xen_hvm_setup_cpu_clockevents(void)
529 int cpu = smp_processor_id();
530 xen_setup_runstate_info(cpu);
532 * xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
533 * doing it xen_hvm_cpu_notify (which gets called by smp_init during
534 * early bootup and also during CPU hotplug events).
536 xen_setup_cpu_clockevents();
/*
 * Hook Xen time ops for PVHVM guests.  Bails out when the vector
 * callback mechanism is missing (no per-CPU interrupts possible) or
 * when the hypervisor lacks XENFEAT_hvm_safe_pvclock.  Unlike the PV
 * path, per-CPU clockevent setup goes through the HVM-specific helper.
 * NOTE(review): the early-return bodies of both guard checks are
 * elided in this extract.
 */
539 void __init xen_hvm_init_time_ops(void)
541 /* vector callback is needed otherwise we cannot receive interrupts
542 * on cpu > 0 and at this point we don't know how many cpus are
544 if (!xen_have_vector_callback)
546 if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
547 printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
548 "disable pv timer\n");
552 pv_time_ops = xen_time_ops;
553 x86_init.timers.setup_percpu_clockev = xen_time_init;
554 x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
556 x86_platform.calibrate_tsc = xen_tsc_khz;
557 x86_platform.get_wallclock = xen_get_wallclock;
558 x86_platform.set_wallclock = xen_set_wallclock;