kernel/hrtimer.c - linux/kernel/git/gregkh/char-misc - Git at Google

 /*
  *  linux/kernel/hrtimer.c
  *
  *  Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
  *
  *  High-resolution kernel timers
  *
  *  In contrast to the low-resolution timeout API implemented in
  *  kernel/timer.c, hrtimers provide finer resolution and accuracy
  *  depending on system configuration and capabilities.
  *
  *  These timers are currently used for:
  *   - itimers
  *   - POSIX timers
  *   - nanosleep
  *   - precise in-kernel timing
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
  *
  *  Credits:
  *	based on kernel/timer.c
  *
  *	Help, testing, suggestions, bugfixes, improvements were
  *	provided by:
  *
  *	George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
  *	et al.
  *
  *  For licencing details see kernel-base/COPYING
  */

 #include <linux/cpu.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/hrtimer.h>
 #include <linux/notifier.h>
 #include <linux/syscalls.h>
 #include <linux/interrupt.h>

 #include <asm/uaccess.h>

 /**
  * ktime_get - get the monotonic time in ktime_t format
  *
  * returns the time in ktime_t format
  */
 static ktime_t ktime_get(void)
 {
 	struct timespec now;

 	ktime_get_ts(&now);

 	return timespec_to_ktime(now);
 }

 /**
  * ktime_get_real - get the real (wall-) time in ktime_t format
  *
  * Reads the wall clock with getnstimeofday() and converts the result
  * with timespec_to_ktime().
  *
  * The function has external linkage because it is exported to GPL
  * modules right below; a symbol cannot usefully be both file-local
  * (static) and exported.
  *
  * returns the time in ktime_t format
  */
 ktime_t ktime_get_real(void)
 {
 	struct timespec now;

 	getnstimeofday(&now);

 	return timespec_to_ktime(now);
 }

 EXPORT_SYMBOL_GPL(ktime_get_real);

 /*
  * The timer bases:
  *
  * Note: If we want to add new timer bases, we have to skip the two
  * clock ids captured by the cpu-timers. We do this by holding empty
  * entries rather than doing math adjustment of the clock ids.
  * This ensures that we capture erroneous accesses to these clock ids
  * rather than moving them into the range of valid clock id's.
  */

 /* Number of entries in the per-CPU hrtimer_bases array below. */
 #define MAX_HRTIMER_BASES 2

 /*
  * Per-CPU array of timer bases. Each entry records its clock id, the
  * function used to read that clock, and the clock's resolution.
  *
  * hrtimer_init() and hrtimer_get_res() index this array directly with
  * the clock id, so the entry order presumably mirrors the numeric
  * values of CLOCK_REALTIME and CLOCK_MONOTONIC -- confirm against the
  * clock id definitions.
  */
 static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
 {
 	{
 		.index = CLOCK_REALTIME,
 		.get_time = &ktime_get_real,
 		.resolution = KTIME_REALTIME_RES,
 	},
 	{
 		.index = CLOCK_MONOTONIC,
 		.get_time = &ktime_get,
 		.resolution = KTIME_MONOTONIC_RES,
 	},
 };

 /**
  * ktime_get_ts - get the monotonic clock in timespec format
  *
  * @ts:		pointer to timespec variable
  *
  * The function calculates the monotonic clock from the realtime
  * clock and the wall_to_monotonic offset and stores the result
  * in normalized timespec format in the variable pointed to by ts.
  */
 void ktime_get_ts(struct timespec *ts)
 {
 	struct timespec tomono;
 	unsigned long seq;

 	do {
 		seq = read_seqbegin(&xtime_lock);
 		getnstimeofday(ts);
 		tomono = wall_to_monotonic;

 	} while (read_seqretry(&xtime_lock, seq));

 	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
 				ts->tv_nsec + tomono.tv_nsec);
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);

 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
  */
 #ifdef CONFIG_SMP

 /* SMP variant: store timer @t in the curr_timer field of base @b */
 #define set_curr_timer(b, t)		do { (b)->curr_timer = (t); } while (0)

 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
  * locked, and the base itself is locked too.
  *
  * So __run_timers/migrate_timers can safely modify all timers which could
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
  * possible to set timer->base = NULL and drop the lock: the timer remains
  * locked.
  *
  * lock_hrtimer_base() takes the lock of the base @timer currently
  * belongs to, with interrupts disabled and the previous interrupt
  * state saved in *@flags. It returns the locked base. The function
  * spins until timer->base is non-NULL and unchanged after the lock
  * has been taken.
  */
 static struct hrtimer_base *lock_hrtimer_base(const struct hrtimer *timer,
 					      unsigned long *flags)
 {
 	struct hrtimer_base *base;

 	for (;;) {
 		/* Unlocked snapshot; it is re-checked once the lock is held */
 		base = timer->base;
 		if (likely(base != NULL)) {
 			spin_lock_irqsave(&base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
 			/* The timer has migrated to another CPU: */
 			spin_unlock_irqrestore(&base->lock, *flags);
 		}
 		cpu_relax();
 	}
 }

 /*
  * Switch the timer base to the current CPU when possible.
  *
  * Called with base->lock held. Returns the base the timer is tied to
  * afterwards; that base's lock is held on return. When the timer is
  * moved, the old base's lock is dropped and the new base's lock is
  * taken.
  */
 static inline struct hrtimer_base *
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	struct hrtimer_base *new_base;

 	/* The base with the same clock index on the executing CPU */
 	new_base = &__get_cpu_var(hrtimer_bases[base->index]);

 	if (base != new_base) {
 		/*
 		 * We are trying to schedule the timer on the local CPU.
 		 * However we can't change timer's base while it is running,
 		 * so we keep it on the same CPU. No hassle vs. reprogramming
 		 * the event source in the high resolution case. The softirq
 		 * code will take care of this when the timer function has
 		 * completed. There is no conflict as we hold the lock until
 		 * the timer is enqueued.
 		 */
 		if (unlikely(base->curr_timer == timer))
 			return base;

 		/* See the comment above lock_hrtimer_base() */
 		timer->base = NULL;
 		spin_unlock(&base->lock);
 		spin_lock(&new_base->lock);
 		timer->base = new_base;
 	}
 	return new_base;
 }

 #else /* CONFIG_SMP */

 /* UP variant: nothing is recorded, the macro expands to an empty statement */
 #define set_curr_timer(b, t)		do { } while (0)

 /*
  * UP variant: lock the base @timer is tied to, with interrupts
  * disabled and the previous interrupt state saved in *@flags.
  * Returns the locked base. No retry loop is used here, unlike the
  * SMP variant.
  */
 static inline struct hrtimer_base *
 lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 {
 	struct hrtimer_base *base = timer->base;

 	spin_lock_irqsave(&base->lock, *flags);

 	return base;
 }

 /* UP variant: the timer always stays on the base it was passed with */
 #define switch_hrtimer_base(t, b)	(b)

 #endif	/* !CONFIG_SMP */

 /*
  * Functions for the union type storage format of ktime_t which are
  * too large for inlining:
  */
 #if BITS_PER_LONG < 64
 # ifndef CONFIG_KTIME_SCALAR
 /**
  * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
  *
  * @kt:		addend
  * @nsec:	the scalar nsec value to add
  *
  * Values of a second or more are split into seconds and remaining
  * nanoseconds before the addition; smaller values are used directly.
  *
  * Returns the sum of kt and nsec in ktime_t format
  */
 ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
 {
 	ktime_t addend;

 	if (unlikely(nsec >= NSEC_PER_SEC)) {
 		/* do_div() leaves the quotient in nsec and returns the rest */
 		unsigned long nsec_left = do_div(nsec, NSEC_PER_SEC);

 		addend = ktime_set((long)nsec, nsec_left);
 	} else {
 		addend.tv64 = nsec;
 	}

 	return ktime_add(kt, addend);
 }

 #else /* CONFIG_KTIME_SCALAR */

 # endif /* !CONFIG_KTIME_SCALAR */

 /*
  * Divide a ktime value by a nanosecond value
  *
  * The divisor handed to do_div() must be below 2^32. A larger @div
  * is shifted right until it fits and the dividend is shifted by the
  * same amount, so the quotient is approximate whenever a shift was
  * needed. The result is truncated to unsigned long.
  */
 static unsigned long ktime_divns(const ktime_t kt, nsec_t div)
 {
 	u64 dclc;
 	int sft = 0;

 	dclc = ktime_to_ns(kt);
 	/* Make sure the divisor is less than 2^32: */
 	while (div >> 32) {
 		sft++;
 		div >>= 1;
 	}
 	dclc >>= sft;
 	do_div(dclc, (unsigned long) div);

 	return (unsigned long) dclc;
 }

 #else /* BITS_PER_LONG < 64 */
 /* Used when BITS_PER_LONG is not below 64: divide the tv64 value directly */
 # define ktime_divns(kt, div)		(unsigned long)((kt).tv64 / (div))
 #endif /* BITS_PER_LONG >= 64 */

 /*
  * Counterpart to lock_hrtimer_base above: release the lock of the base
  * @timer is currently tied to and restore the interrupt state saved
  * in *@flags.
  */
 static inline
 void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 {
 	spin_unlock_irqrestore(&timer->base->lock, *flags);
 }

 /**
  * hrtimer_forward - forward the timer expiry
  *
  * @timer:	hrtimer to forward
  * @interval:	the interval to forward
  *
  * Forward the timer expiry so it will expire in the future.
  * An @interval smaller than the base's resolution is raised to the
  * resolution first.
  *
  * Returns the number of overruns; 0 when the timer has not expired
  * yet, in which case the expiry time is left untouched.
  */
 unsigned long
 hrtimer_forward(struct hrtimer *timer, ktime_t interval)
 {
 	unsigned long orun = 1;
 	ktime_t delta, now;

 	now = timer->base->get_time();

 	/* How far the current time is past the programmed expiry */
 	delta = ktime_sub(now, timer->expires);

 	if (delta.tv64 < 0)
 		return 0;

 	if (interval.tv64 < timer->base->resolution.tv64)
 		interval.tv64 = timer->base->resolution.tv64;

 	/* More than one interval was missed: skip the whole periods at once */
 	if (unlikely(delta.tv64 >= interval.tv64)) {
 		nsec_t incr = ktime_to_ns(interval);

 		orun = ktime_divns(delta, incr);
 		timer->expires = ktime_add_ns(timer->expires, incr * orun);
 		if (timer->expires.tv64 > now.tv64)
 			return orun;
 		/*
 		 * This (and the ktime_add() below) is the
 		 * correction for exact:
 		 */
 		orun++;
 	}
 	timer->expires = ktime_add(timer->expires, interval);

 	return orun;
 }

 /*
  * enqueue_hrtimer - internal function to (re)start a timer
  *
  * Links the timer into the base's red black tree ordered by expiry
  * time, marks it pending and updates the cached first-to-expire
  * node. Insertion is O(log(n)). Must hold the base lock.
  */
 static void enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	struct rb_node **slot = &base->active.rb_node;
 	struct rb_node *above = NULL;
 	struct rb_node *leftmost;

 	/*
 	 * Descend to the empty slot for the timer. Equal expiry times
 	 * go to the right, so such timers stay next to each other.
 	 */
 	while (*slot) {
 		struct hrtimer *cur;

 		above = *slot;
 		cur = rb_entry(above, struct hrtimer, node);
 		slot = (timer->expires.tv64 < cur->expires.tv64) ?
 			&above->rb_left : &above->rb_right;
 	}

 	/* Hook the node in and let the tree rebalance */
 	rb_link_node(&timer->node, above, slot);
 	rb_insert_color(&timer->node, &base->active);

 	timer->state = HRTIMER_PENDING;

 	/* Does the new timer expire before the one cached as first? */
 	leftmost = base->first;
 	if (leftmost == NULL ||
 	    timer->expires.tv64 <
 	    rb_entry(leftmost, struct hrtimer, node)->expires.tv64)
 		base->first = &timer->node;
 }

 /*
  * __remove_hrtimer - internal function to remove a timer
  *
  * Unlinks the timer from the base's red black tree. The timer's state
  * field is not touched here; callers set it themselves.
  *
  * Caller must hold the base lock.
  */
 static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	/*
 	 * Remove the timer from the rbtree and replace the
 	 * first entry pointer if necessary. The successor is looked
 	 * up before the node is erased from the tree.
 	 */
 	if (base->first == &timer->node)
 		base->first = rb_next(&timer->node);
 	rb_erase(&timer->node, &base->active);
 }

 /*
  * Dequeue the timer if it is active and mark it inactive.
  * Returns 1 when the timer was active, 0 otherwise.
  * Called with the base lock held.
  */
 static inline int
 remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
 {
 	if (!hrtimer_active(timer))
 		return 0;

 	__remove_hrtimer(timer, base);
 	timer->state = HRTIMER_INACTIVE;

 	return 1;
 }

 /**
  * hrtimer_start - (re)start a timer on the current CPU
  *
  * @timer:	the timer to be added
  * @tim:	expiry time
  * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
  *
  * An already queued timer is removed first. For HRTIMER_REL the
  * current time of the timer's clock is added to @tim.
  *
  * Returns:
  *  0 on success
  *  1 when the timer was active
  */
 int
 hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 {
 	struct hrtimer_base *base, *new_base;
 	unsigned long flags;
 	int ret;

 	base = lock_hrtimer_base(timer, &flags);

 	/* Remove an active timer from the queue: */
 	ret = remove_hrtimer(timer, base);

 	/* Switch the timer base, if necessary: */
 	new_base = switch_hrtimer_base(timer, base);

 	if (mode == HRTIMER_REL) {
 		tim = ktime_add(tim, new_base->get_time());
 		/*
 		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
 		 * to signal that they simply return xtime in
 		 * do_gettimeoffset(). In this case we want to round up by
 		 * resolution when starting a relative timer, to avoid short
 		 * timeouts. This will go away with the GTOD framework.
 		 */
 #ifdef CONFIG_TIME_LOW_RES
 		tim = ktime_add(tim, base->resolution);
 #endif
 	}
 	timer->expires = tim;

 	enqueue_hrtimer(timer, new_base);

 	/* timer->base is new_base at this point, so this unlocks new_base */
 	unlock_hrtimer_base(timer, &flags);

 	return ret;
 }

 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
  *
  * @timer:	hrtimer to stop
  *
  * Returns:
  *  0 when the timer was not active
  *  1 when the timer was active
  * -1 when the timer is currently executing the callback function and
  *    can not be stopped
  */
 int hrtimer_try_to_cancel(struct hrtimer *timer)
 {
 	struct hrtimer_base *base;
 	unsigned long flags;
 	int ret = -1;

 	base = lock_hrtimer_base(timer, &flags);

 	/* Leave ret at -1 while the base reports this timer as running */
 	if (base->curr_timer != timer)
 		ret = remove_hrtimer(timer, base);

 	unlock_hrtimer_base(timer, &flags);

 	return ret;

 }

 /**
  * hrtimer_cancel - cancel a timer and wait for the handler to finish.
  *
  * @timer:	the timer to be cancelled
  *
  * Retries hrtimer_try_to_cancel() for as long as it reports that the
  * timer's callback is currently executing.
  *
  * Returns:
  *  0 when the timer was not active
  *  1 when the timer was active
  */
 int hrtimer_cancel(struct hrtimer *timer)
 {
 	for (;;) {
 		int ret = hrtimer_try_to_cancel(timer);

 		if (ret >= 0)
 			return ret;
 		/* Callback still running: be polite while busy-waiting */
 		cpu_relax();
 	}
 }

 /**
  * hrtimer_get_remaining - get remaining time for the timer
  *
  * @timer:	the timer to read
  */
 ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 {
 	struct hrtimer_base *base;
 	unsigned long flags;
 	ktime_t rem;

 	base = lock_hrtimer_base(timer, &flags);
 	rem = ktime_sub(timer->expires, timer->base->get_time());
 	unlock_hrtimer_base(timer, &flags);

 	return rem;
 }

 /**
  * hrtimer_init - initialize a timer to the given clock
  *
  * @timer:	the timer to be initialized
  * @clock_id:	the clock to be used
  * @mode:	timer mode abs/rel
  *
  * Clears the whole timer and ties it to the executing CPU's base for
  * @clock_id. A CLOCK_REALTIME timer that is not absolute is placed on
  * the CLOCK_MONOTONIC base instead.
  */
 void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 		  enum hrtimer_mode mode)
 {
 	struct hrtimer_base *cpu_bases;

 	if (mode != HRTIMER_ABS && clock_id == CLOCK_REALTIME)
 		clock_id = CLOCK_MONOTONIC;

 	memset(timer, 0, sizeof(*timer));

 	cpu_bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
 	timer->base = cpu_bases + clock_id;
 }

 /**
  * hrtimer_get_res - get the timer resolution for a clock
  *
  * @which_clock: which clock to query
  * @tp:		 pointer to timespec variable to store the resolution
  *
  * Looks up the base for @which_clock on the executing CPU and writes
  * its resolution, converted to a timespec, to @tp. Always returns 0.
  */
 int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 {
 	struct hrtimer_base *cpu_bases;
 	struct hrtimer_base *clock_base;

 	cpu_bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
 	clock_base = cpu_bases + which_clock;
 	*tp = ktime_to_timespec(clock_base->resolution);

 	return 0;
 }

 /*
  * Expire the per base hrtimer-queue:
  *
  * Runs every queued timer whose expiry time lies before the time
  * sampled on entry. The base lock is dropped around each callback.
  */
 static inline void run_hrtimer_queue(struct hrtimer_base *base)
 {
 	/* Sampled once, before the lock is taken; not refreshed in the loop */
 	ktime_t now = base->get_time();
 	struct rb_node *node;

 	spin_lock_irq(&base->lock);

 	while ((node = base->first)) {
 		struct hrtimer *timer;
 		int (*fn)(void *);
 		int restart;
 		void *data;

 		timer = rb_entry(node, struct hrtimer, node);
 		/* The earliest timer is not due yet, so none of the others is */
 		if (now.tv64 <= timer->expires.tv64)
 			break;

 		/* Take copies and dequeue the timer before the lock is dropped */
 		fn = timer->function;
 		data = timer->data;
 		set_curr_timer(base, timer);
 		timer->state = HRTIMER_RUNNING;
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);

 		/*
 		 * fn == NULL is special case for the simplest timer
 		 * variant - wake up process and do not restart:
 		 */
 		if (!fn) {
 			wake_up_process(data);
 			restart = HRTIMER_NORESTART;
 		} else
 			restart = fn(data);

 		spin_lock_irq(&base->lock);

 		/* Another CPU has added back the timer */
 		if (timer->state != HRTIMER_RUNNING)
 			continue;

 		if (restart == HRTIMER_RESTART)
 			enqueue_hrtimer(timer, base);
 		else
 			timer->state = HRTIMER_EXPIRED;
 	}
 	set_curr_timer(base, NULL);
 	spin_unlock_irq(&base->lock);
 }

 /*
  * Called from timer softirq every jiffy, expire hrtimers:
  */
 void hrtimer_run_queues(void)
 {
 	struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
 	int i;

 	for (i = 0; i < MAX_HRTIMER_BASES; i++)
 		run_hrtimer_queue(&base[i]);
 }

 /*
  * Sleep related functions:
  */

 /**
  * schedule_hrtimer - sleep until timeout
  *
  * @timer:	hrtimer variable initialized with the correct clock base;
  *		timer->expires holds the timeout value
  * @mode:	timeout value is abs/rel
  *
  * Make the current task sleep until the timeout stored in
  * timer->expires is elapsed. The task state is not changed here
  * before schedule() is called, so the caller sets it beforehand.
  *
  * You can set the task state as follows -
  *
  * %TASK_UNINTERRUPTIBLE - at least the timeout is guaranteed to
  * pass before the routine returns. The routine will return 0
  *
  * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
  * delivered to the current task. In this case the remaining time
  * will be returned
  *
  * The current task state is guaranteed to be TASK_RUNNING when this
  * routine returns.
  */
 static ktime_t __sched
 schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
 {
 	/* fn stays NULL, meaning single-shot wakeup: */
 	timer->data = current;

 	hrtimer_start(timer, timer->expires, mode);

 	schedule();
 	/* Dequeue the timer in case the wakeup had another cause */
 	hrtimer_cancel(timer);

 	/* Return the remaining time: */
 	if (timer->state != HRTIMER_EXPIRED)
 		return ktime_sub(timer->expires, timer->base->get_time());
 	else
 		return (ktime_t) {.tv64 = 0 };
 }

 /*
  * Put the current task into TASK_INTERRUPTIBLE state and sleep on
  * @timer via schedule_hrtimer(). Returns what schedule_hrtimer()
  * returns.
  */
 static inline ktime_t __sched
 schedule_hrtimer_interruptible(struct hrtimer *timer,
 			       const enum hrtimer_mode mode)
 {
 	set_current_state(TASK_INTERRUPTIBLE);

 	return schedule_hrtimer(timer, mode);
 }

 /*
  * Restart handler installed by hrtimer_nanosleep(). The restart block
  * carries the absolute expiry time split over arg0 (low 32 bits) and
  * arg1 (high 32 bits), the user pointer for the remaining time in arg2
  * and the clock id in arg3.
  *
  * Returns 0 when the sleep completed, -EFAULT when the remaining time
  * could not be copied to user space, and -ERESTART_RESTARTBLOCK when
  * the sleep ended early again.
  */
 static long __sched nanosleep_restart(struct restart_block *restart)
 {
 	struct timespec __user *rmtp;
 	struct timespec tu;
 	void *rfn_save = restart->fn;
 	struct hrtimer timer;
 	ktime_t rem;

 	/* Default to no further restart; undone below if one is needed */
 	restart->fn = do_no_restart_syscall;

 	hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS);

 	timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;

 	rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);

 	if (rem.tv64 <= 0)
 		return 0;

 	rmtp = (struct timespec __user *) restart->arg2;
 	tu = ktime_to_timespec(rem);
 	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
 		return -EFAULT;

 	restart->fn = rfn_save;

 	/* The other values in restart are already filled in */
 	return -ERESTART_RESTARTBLOCK;
 }

 /*
  * hrtimer_nanosleep - sleep interruptibly on an hrtimer
  *
  * @rqtp:	requested expiry time (kernel pointer)
  * @rmtp:	user pointer that receives the remaining time, may be NULL
  * @mode:	HRTIMER_ABS or HRTIMER_REL
  * @clockid:	clock the timer runs on
  *
  * Returns 0 when the sleep completed. When it ended early, absolute
  * sleeps return -ERESTARTNOHAND; other sleeps store the remaining time
  * in @rmtp (-EFAULT on copy failure), fill in the restart block and
  * return -ERESTART_RESTARTBLOCK.
  */
 long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
 	struct restart_block *restart;
 	struct hrtimer timer;
 	struct timespec tu;
 	ktime_t rem;

 	hrtimer_init(&timer, clockid, mode);

 	timer.expires = timespec_to_ktime(*rqtp);

 	rem = schedule_hrtimer_interruptible(&timer, mode);
 	if (rem.tv64 <= 0)
 		return 0;

 	/* Absolute timers do not update the rmtp value and restart: */
 	if (mode == HRTIMER_ABS)
 		return -ERESTARTNOHAND;

 	tu = ktime_to_timespec(rem);

 	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
 		return -EFAULT;

 	/* Save what nanosleep_restart() needs; expiry is split in two halves */
 	restart = &current_thread_info()->restart_block;
 	restart->fn = nanosleep_restart;
 	restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
 	restart->arg1 = timer.expires.tv64 >> 32;
 	restart->arg2 = (unsigned long) rmtp;
 	restart->arg3 = (unsigned long) timer.base->index;

 	return -ERESTART_RESTARTBLOCK;
 }

 /*
  * nanosleep system call: copy the request from user space, validate
  * it and sleep for that relative time on CLOCK_MONOTONIC.
  * Returns -EFAULT on a bad @rqtp pointer and -EINVAL on an invalid
  * timespec; otherwise the result of hrtimer_nanosleep().
  */
 asmlinkage long
 sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
 {
 	struct timespec request;

 	if (copy_from_user(&request, rqtp, sizeof(request)) != 0)
 		return -EFAULT;
 	if (!timespec_valid(&request))
 		return -EINVAL;

 	return hrtimer_nanosleep(&request, rmtp, HRTIMER_REL, CLOCK_MONOTONIC);
 }

 /*
  * Functions related to boot-time initialization:
  */
 /* Initialize the lock of every timer base that belongs to @cpu. */
 static void __devinit init_hrtimers_cpu(int cpu)
 {
 	struct hrtimer_base *cpu_bases = per_cpu(hrtimer_bases, cpu);
 	int idx;

 	for (idx = 0; idx < MAX_HRTIMER_BASES; idx++)
 		spin_lock_init(&cpu_bases[idx].lock);
 }

 #ifdef CONFIG_HOTPLUG_CPU

 /*
  * Move every timer queued on @old_base over to @new_base. No locks
  * are taken here; migrate_hrtimers() calls this with both base locks
  * held.
  */
 static void migrate_hrtimer_list(struct hrtimer_base *old_base,
 				struct hrtimer_base *new_base)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;

 	/* Take the leftmost node again and again until the tree is empty */
 	while ((node = rb_first(&old_base->active))) {
 		timer = rb_entry(node, struct hrtimer, node);
 		__remove_hrtimer(timer, old_base);
 		timer->base = new_base;
 		enqueue_hrtimer(timer, new_base);
 	}
 }

 /*
  * Move all timers from the bases of @cpu to the matching bases of the
  * executing CPU. @cpu must not be online (enforced with BUG_ON).
  */
 static void migrate_hrtimers(int cpu)
 {
 	struct hrtimer_base *old_base, *new_base;
 	int i;

 	BUG_ON(cpu_online(cpu));
 	old_base = per_cpu(hrtimer_bases, cpu);
 	new_base = get_cpu_var(hrtimer_bases);

 	local_irq_disable();

 	for (i = 0; i < MAX_HRTIMER_BASES; i++) {

 		/* Lock order: the local (new) base first, then the old one */
 		spin_lock(&new_base->lock);
 		spin_lock(&old_base->lock);

 		/* No callback may be marked as running on the old base */
 		BUG_ON(old_base->curr_timer);

 		migrate_hrtimer_list(old_base, new_base);

 		spin_unlock(&old_base->lock);
 		spin_unlock(&new_base->lock);
 		old_base++;
 		new_base++;
 	}

 	local_irq_enable();
 	put_cpu_var(hrtimer_bases);
 }
 #endif /* CONFIG_HOTPLUG_CPU */

 /*
  * CPU notifier callback: set up the timer bases of a CPU that is about
  * to come up and, with CONFIG_HOTPLUG_CPU, pull the timers off a CPU
  * that is dead. @hcpu carries the CPU number. All other actions are
  * ignored. Always returns NOTIFY_OK.
  */
 static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;

 	switch (action) {

 	case CPU_UP_PREPARE:
 		init_hrtimers_cpu(cpu);
 		break;

 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DEAD:
 		migrate_hrtimers(cpu);
 		break;
 #endif

 	default:
 		break;
 	}

 	return NOTIFY_OK;
 }

 /* Notifier block that routes CPU events to hrtimer_cpu_notify() */
 static struct notifier_block __devinitdata hrtimers_nb = {
 	.notifier_call = hrtimer_cpu_notify,
 };

 /*
  * Boot-time setup: initialize the timer bases of the executing CPU by
  * calling the notifier callback directly, then register the notifier.
  */
 void __init hrtimers_init(void)
 {
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
 }
	/*
	* linux/kernel/hrtimer.c
	*
	* Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
	* Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
	*
	* High-resolution kernel timers
	*
	* In contrast to the low-resolution timeout API implemented in
	* kernel/timer.c, hrtimers provide finer resolution and accuracy
	* depending on system configuration and capabilities.
	*
	* These timers are currently used for:
	* - itimers
	* - POSIX timers
	* - nanosleep
	* - precise in-kernel timing
	*
	* Started by: Thomas Gleixner and Ingo Molnar
	*
	* Credits:
	* based on kernel/timer.c
	*
	* Help, testing, suggestions, bugfixes, improvements were
	* provided by:
	*
	* George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
	* et al.
	*
	* For licencing details see kernel-base/COPYING
	*/

	#include <linux/cpu.h>
	#include <linux/module.h>
	#include <linux/percpu.h>
	#include <linux/hrtimer.h>
	#include <linux/notifier.h>
	#include <linux/syscalls.h>
	#include <linux/interrupt.h>

	#include <asm/uaccess.h>

	/**
	* ktime_get - get the monotonic time in ktime_t format
	*
	* returns the time in ktime_t format
	*/
	/* Read the monotonic time via ktime_get_ts() and convert it to ktime_t. */
	static ktime_t ktime_get(void)
	{
	struct timespec now;

	ktime_get_ts(&now);

	return timespec_to_ktime(now);
	}

	/**
	* ktime_get_real - get the real (wall-) time in ktime_t format
	*
	* returns the time in ktime_t format
	*/
	/*
	 * Read the wall clock with getnstimeofday() and convert it to ktime_t.
	 * Not static: the symbol is exported to GPL modules right below.
	 */
	ktime_t ktime_get_real(void)
	{
		struct timespec now;

		getnstimeofday(&now);

		return timespec_to_ktime(now);
	}

	EXPORT_SYMBOL_GPL(ktime_get_real);

	/*
	* The timer bases:
	*
	* Note: If we want to add new timer bases, we have to skip the two
	* clock ids captured by the cpu-timers. We do this by holding empty
	* entries rather than doing math adjustment of the clock ids.
	* This ensures that we capture erroneous accesses to these clock ids
	* rather than moving them into the range of valid clock id's.
	*/

	/* Number of entries in the per-CPU hrtimer_bases array below. */
	#define MAX_HRTIMER_BASES 2

	/*
	 * Per-CPU array of timer bases; each entry records its clock id, the
	 * function that reads the clock, and the clock's resolution.
	 */
	static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
	{
	{
	.index = CLOCK_REALTIME,
	.get_time = &ktime_get_real,
	.resolution = KTIME_REALTIME_RES,
	},
	{
	.index = CLOCK_MONOTONIC,
	.get_time = &ktime_get,
	.resolution = KTIME_MONOTONIC_RES,
	},
	};

	/**
	* ktime_get_ts - get the monotonic clock in timespec format
	*
	* @ts: pointer to timespec variable
	*
	* The function calculates the monotonic clock from the realtime
	* clock and the wall_to_monotonic offset and stores the result
	* in normalized timespec format in the variable pointed to by ts.
	*/
	/*
	 * Store the monotonic time in @ts: wall time plus the
	 * wall_to_monotonic offset, normalized.
	 */
	void ktime_get_ts(struct timespec *ts)
	{
	struct timespec tomono;
	unsigned long seq;

	/* Retry until both values were read without a concurrent update */
	do {
	seq = read_seqbegin(&xtime_lock);
	getnstimeofday(ts);
	tomono = wall_to_monotonic;

	} while (read_seqretry(&xtime_lock, seq));

	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
	ts->tv_nsec + tomono.tv_nsec);
	}
	EXPORT_SYMBOL_GPL(ktime_get_ts);

	/*
	* Functions and macros which are different for UP/SMP systems are kept in a
	* single place
	*/
	#ifdef CONFIG_SMP

	/* SMP variant: store timer @t in the curr_timer field of base @b */
	#define set_curr_timer(b, t) do { (b)->curr_timer = (t); } while (0)

	/*
	* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
	* means that all timers which are tied to this base via timer->base are
	* locked, and the base itself is locked too.
	*
	* So __run_timers/migrate_timers can safely modify all timers which could
	* be found on the lists/queues.
	*
	* When the timer's base is locked, and the timer removed from list, it is
	* possible to set timer->base = NULL and drop the lock: the timer remains
	* locked.
	*/
	static struct hrtimer_base lock_hrtimer_base(const struct hrtimer timer,
	unsigned long *flags)
	{
	struct hrtimer_base *base;

	for (;;) {
	base = timer->base;
	if (likely(base != NULL)) {
	spin_lock_irqsave(&base->lock, *flags);
	if (likely(base == timer->base))
	return base;
	/* The timer has migrated to another CPU: */
	spin_unlock_irqrestore(&base->lock, *flags);
	}
	cpu_relax();
	}
	}

	/*
	* Switch the timer base to the current CPU when possible.
	*/
	static inline struct hrtimer_base *
	switch_hrtimer_base(struct hrtimer timer, struct hrtimer_base base)
	{
	struct hrtimer_base *new_base;

	new_base = &__get_cpu_var(hrtimer_bases[base->index]);

	if (base != new_base) {
	/*
	* We are trying to schedule the timer on the local CPU.
	* However we can't change timer's base while it is running,
	* so we keep it on the same CPU. No hassle vs. reprogramming
	* the event source in the high resolution case. The softirq
	* code will take care of this when the timer function has
	* completed. There is no conflict as we hold the lock until
	* the timer is enqueued.
	*/
	if (unlikely(base->curr_timer == timer))
	return base;

	/* See the comment in lock_timer_base() */
	timer->base = NULL;
	spin_unlock(&base->lock);
	spin_lock(&new_base->lock);
	timer->base = new_base;
	}
	return new_base;
	}

	#else /* CONFIG_SMP */

	/* UP variant: nothing is recorded, the macro expands to an empty statement */
	#define set_curr_timer(b, t) do { } while (0)

	static inline struct hrtimer_base *
	lock_hrtimer_base(const struct hrtimer timer, unsigned long flags)
	{
	struct hrtimer_base *base = timer->base;

	spin_lock_irqsave(&base->lock, *flags);

	return base;
	}

	/* UP variant: the timer always stays on the base it was passed with */
	#define switch_hrtimer_base(t, b) (b)

	#endif /* !CONFIG_SMP */

	/*
	* Functions for the union type storage format of ktime_t which are
	* too large for inlining:
	*/
	#if BITS_PER_LONG < 64
	# ifndef CONFIG_KTIME_SCALAR
	/**
	* ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
	*
	* @kt: addend
	* @nsec: the scalar nsec value to add
	*
	* Returns the sum of kt and nsec in ktime_t format
	*/
	/* Return @kt plus @nsec nanoseconds in ktime_t format. */
	ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
	{
	ktime_t tmp;

	if (likely(nsec < NSEC_PER_SEC)) {
	tmp.tv64 = nsec;
	} else {
	/* do_div() leaves the quotient in nsec and returns the remainder */
	unsigned long rem = do_div(nsec, NSEC_PER_SEC);

	tmp = ktime_set((long)nsec, rem);
	}

	return ktime_add(kt, tmp);
	}

	#else /* CONFIG_KTIME_SCALAR */

	# endif /* !CONFIG_KTIME_SCALAR */

	/*
	* Divide a ktime value by a nanosecond value
	*/
	/*
	 * Divide @kt (in nanoseconds) by @div. A divisor of 2^32 or more is
	 * shifted right until it fits and the dividend is shifted by the same
	 * amount, so the quotient is approximate in that case.
	 */
	static unsigned long ktime_divns(const ktime_t kt, nsec_t div)
	{
		u64 dclc;
		int sft = 0;

		dclc = ktime_to_ns(kt);
		/* Make sure the divisor is less than 2^32: */
		while (div >> 32) {
			sft++;
			div >>= 1;
		}
		dclc >>= sft;
		do_div(dclc, (unsigned long) div);

		return (unsigned long) dclc;
	}

	#else /* BITS_PER_LONG < 64 */
	/* Used when BITS_PER_LONG is not below 64: divide the tv64 value directly */
	# define ktime_divns(kt, div) (unsigned long)((kt).tv64 / (div))
	#endif /* BITS_PER_LONG >= 64 */

	/*
	* Counterpart to lock_timer_base above:
	*/
	static inline
	void unlock_hrtimer_base(const struct hrtimer timer, unsigned long flags)
	{
	spin_unlock_irqrestore(&timer->base->lock, *flags);
	}

	/**
	* hrtimer_forward - forward the timer expiry
	*
	* @timer: hrtimer to forward
	* @interval: the interval to forward
	*
	* Forward the timer expiry so it will expire in the future.
	* Returns the number of overruns.
	*/
	/*
	 * Push the expiry of @timer forward by multiples of @interval.
	 * Returns the number of overruns, 0 if the timer has not expired yet.
	 */
	unsigned long
	hrtimer_forward(struct hrtimer *timer, ktime_t interval)
	{
	unsigned long orun = 1;
	ktime_t delta, now;

	now = timer->base->get_time();

	/* How far the current time is past the programmed expiry */
	delta = ktime_sub(now, timer->expires);

	if (delta.tv64 < 0)
	return 0;

	/* The interval is never smaller than the base's resolution */
	if (interval.tv64 < timer->base->resolution.tv64)
	interval.tv64 = timer->base->resolution.tv64;

	if (unlikely(delta.tv64 >= interval.tv64)) {
	nsec_t incr = ktime_to_ns(interval);

	orun = ktime_divns(delta, incr);
	timer->expires = ktime_add_ns(timer->expires, incr * orun);
	if (timer->expires.tv64 > now.tv64)
	return orun;
	/*
	* This (and the ktime_add() below) is the
	* correction for exact:
	*/
	orun++;
	}
	timer->expires = ktime_add(timer->expires, interval);

	return orun;
	}

	/*
	* enqueue_hrtimer - internal function to (re)start a timer
	*
	* The timer is inserted in expiry order. Insertion into the
	* red black tree is O(log(n)). Must hold the base lock.
	*/
	static void enqueue_hrtimer(struct hrtimer timer, struct hrtimer_base base)
	{
	struct rb_node **link = &base->active.rb_node;
	struct rb_node *parent = NULL;
	struct hrtimer *entry;

	/*
	* Find the right place in the rbtree:
	*/
	while (*link) {
	parent = *link;
	entry = rb_entry(parent, struct hrtimer, node);
	/*
	* We dont care about collisions. Nodes with
	* the same expiry time stay together.
	*/
	if (timer->expires.tv64 < entry->expires.tv64)
	link = &(*link)->rb_left;
	else
	link = &(*link)->rb_right;
	}

	/*
	* Insert the timer to the rbtree and check whether it
	* replaces the first pending timer
	*/
	rb_link_node(&timer->node, parent, link);
	rb_insert_color(&timer->node, &base->active);

	timer->state = HRTIMER_PENDING;

	if (!base->first \|\| timer->expires.tv64 <
	rb_entry(base->first, struct hrtimer, node)->expires.tv64)
	base->first = &timer->node;
	}

	/*
	* __remove_hrtimer - internal function to remove a timer
	*
	* Caller must hold the base lock.
	*/
	static void __remove_hrtimer(struct hrtimer timer, struct hrtimer_base base)
	{
	/*
	* Remove the timer from the rbtree and replace the
	* first entry pointer if necessary.
	*/
	if (base->first == &timer->node)
	base->first = rb_next(&timer->node);
	rb_erase(&timer->node, &base->active);
	}

	/*
	* remove hrtimer, called with base lock held
	*/
	static inline int
	remove_hrtimer(struct hrtimer timer, struct hrtimer_base base)
	{
	if (hrtimer_active(timer)) {
	__remove_hrtimer(timer, base);
	timer->state = HRTIMER_INACTIVE;
	return 1;
	}
	return 0;
	}

	/**
	* hrtimer_start - (re)start an relative timer on the current CPU
	*
	* @timer: the timer to be added
	* @tim: expiry time
	* @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
	*
	* Returns:
	* 0 on success
	* 1 when the timer was active
	*/
	/*
	 * (Re)start @timer on the current CPU with expiry @tim, which is
	 * absolute (HRTIMER_ABS) or relative (HRTIMER_REL) according to @mode.
	 * Returns 1 when the timer was already active, 0 otherwise.
	 */
	int
	hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
	{
		struct hrtimer_base *base, *new_base;
		unsigned long flags;
		int ret;

		base = lock_hrtimer_base(timer, &flags);

		/* Remove an active timer from the queue: */
		ret = remove_hrtimer(timer, base);

		/* Switch the timer base, if necessary: */
		new_base = switch_hrtimer_base(timer, base);

		if (mode == HRTIMER_REL) {
			tim = ktime_add(tim, new_base->get_time());
			/*
			 * CONFIG_TIME_LOW_RES is a temporary way for architectures
			 * to signal that they simply return xtime in
			 * do_gettimeoffset(). In this case we want to round up by
			 * resolution when starting a relative timer, to avoid short
			 * timeouts. This will go away with the GTOD framework.
			 */
	#ifdef CONFIG_TIME_LOW_RES
			tim = ktime_add(tim, base->resolution);
	#endif
		}
		timer->expires = tim;

		enqueue_hrtimer(timer, new_base);

		unlock_hrtimer_base(timer, &flags);

		return ret;
	}

	/**
	* hrtimer_try_to_cancel - try to deactivate a timer
	*
	* @timer: hrtimer to stop
	*
	* Returns:
	* 0 when the timer was not active
	* 1 when the timer was active
	* -1 when the timer is currently excuting the callback function and
	* can not be stopped
	*/
	/*
	 * Try to dequeue @timer. Returns 1 if it was active, 0 if it was not,
	 * and -1 if the base reports it as the currently running timer.
	 */
	int hrtimer_try_to_cancel(struct hrtimer *timer)
	{
	struct hrtimer_base *base;
	unsigned long flags;
	int ret = -1;

	base = lock_hrtimer_base(timer, &flags);

	/* Leave ret at -1 while the base reports this timer as running */
	if (base->curr_timer != timer)
	ret = remove_hrtimer(timer, base);

	unlock_hrtimer_base(timer, &flags);

	return ret;

	}

	/**
	* hrtimer_cancel - cancel a timer and wait for the handler to finish.
	*
	* @timer: the timer to be cancelled
	*
	* Returns:
	* 0 when the timer was not active
	* 1 when the timer was active
	*/
	/*
	 * Cancel @timer, retrying for as long as its callback is executing.
	 * Returns 1 when the timer was active, 0 when it was not.
	 */
	int hrtimer_cancel(struct hrtimer *timer)
	{
		for (;;) {
			int ret = hrtimer_try_to_cancel(timer);

			if (ret >= 0)
				return ret;
			/* Callback still running: be polite while busy-waiting */
			cpu_relax();
		}
	}

	/**
	* hrtimer_get_remaining - get remaining time for the timer
	*
	* @timer: the timer to read
	*/
	/*
	 * Return the expiry time of @timer minus the current time of its
	 * clock, sampled while the timer's base is locked.
	 */
	ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
	{
	struct hrtimer_base *base;
	unsigned long flags;
	ktime_t rem;

	base = lock_hrtimer_base(timer, &flags);
	rem = ktime_sub(timer->expires, timer->base->get_time());
	unlock_hrtimer_base(timer, &flags);

	return rem;
	}

	/**
	* hrtimer_init - initialize a timer to the given clock
	*
	* @timer: the timer to be initialized
	* @clock_id: the clock to be used
	* @mode: timer mode abs/rel
	*/
	/*
	 * Clear @timer and tie it to the executing CPU's base for @clock_id.
	 */
	void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
	enum hrtimer_mode mode)
	{
	struct hrtimer_base *bases;

	memset(timer, 0, sizeof(struct hrtimer));

	bases = per_cpu(hrtimer_bases, raw_smp_processor_id());

	/* Non-absolute CLOCK_REALTIME timers are placed on CLOCK_MONOTONIC */
	if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
	clock_id = CLOCK_MONOTONIC;

	timer->base = &bases[clock_id];
	}

	/**
	 * hrtimer_get_res - get the timer resolution for a clock
	 *
	 * @which_clock: which clock to query
	 * @tp:		 pointer to timespec variable to store the resolution
	 *
	 * Converts the resolution of the base selected by @which_clock (taken
	 * from the bases of the CPU this runs on) to a timespec and stores it
	 * in *@tp. @which_clock is used as an array index without a range
	 * check here. Always returns 0.
	 */
	int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
	{
		struct hrtimer_base *cpu_bases =
			per_cpu(hrtimer_bases, raw_smp_processor_id());
		struct hrtimer_base *clock_base = &cpu_bases[which_clock];

		*tp = ktime_to_timespec(clock_base->resolution);

		return 0;
	}

	/*
	 * Expire the per base hrtimer-queue:
	 *
	 * Starting at base->first, run every timer whose expiry lies strictly
	 * before the time sampled on entry. base->lock is held while the queue
	 * is inspected and modified and is dropped around the callback (or the
	 * plain wakeup) itself.
	 */
	static inline void run_hrtimer_queue(struct hrtimer_base *base)
	{
		ktime_t now = base->get_time();
		struct rb_node *node;

		spin_lock_irq(&base->lock);

		while ((node = base->first)) {
			struct hrtimer *timer;
			/* pointer to the timer callback; NULL selects the wakeup path */
			int (*fn)(void *);
			int restart;
			void *data;

			timer = rb_entry(node, struct hrtimer, node);
			/* First queued timer is not due yet: stop here. */
			if (now.tv64 <= timer->expires.tv64)
				break;

			/* Snapshot callback and argument before the lock is dropped. */
			fn = timer->function;
			data = timer->data;
			set_curr_timer(base, timer);
			timer->state = HRTIMER_RUNNING;
			__remove_hrtimer(timer, base);
			spin_unlock_irq(&base->lock);

			/*
			 * fn == NULL is special case for the simplest timer
			 * variant - wake up process and do not restart:
			 */
			if (!fn) {
				wake_up_process(data);
				restart = HRTIMER_NORESTART;
			} else
				restart = fn(data);

			spin_lock_irq(&base->lock);

			/* Another CPU has added back the timer */
			if (timer->state != HRTIMER_RUNNING)
				continue;

			if (restart == HRTIMER_RESTART)
				enqueue_hrtimer(timer, base);
			else
				timer->state = HRTIMER_EXPIRED;
		}
		set_curr_timer(base, NULL);
		spin_unlock_irq(&base->lock);
	}

	/*
	 * Called from timer softirq every jiffy, expire hrtimers:
	 * each of the MAX_HRTIMER_BASES bases of this CPU is processed in
	 * index order.
	 */
	void hrtimer_run_queues(void)
	{
		struct hrtimer_base *cpu_bases = __get_cpu_var(hrtimer_bases);
		int idx = 0;

		while (idx < MAX_HRTIMER_BASES) {
			run_hrtimer_queue(&cpu_bases[idx]);
			idx++;
		}
	}

	/*
	* Sleep related functions:
	*/

	/**
	 * schedule_hrtimer - sleep until timeout
	 *
	 * @timer:	hrtimer variable initialized with the correct clock base;
	 *		timer->expires holds the timeout value
	 * @mode:	timeout value is abs/rel
	 *
	 * Make the current task sleep until the timeout stored in @timer has
	 * elapsed. The timer is started with the current task as its wakeup
	 * target and is cancelled again once schedule() returns.
	 *
	 * The caller sets the task state before calling, as follows -
	 *
	 * %TASK_UNINTERRUPTIBLE - at least the timeout is guaranteed to
	 * pass before the routine returns. The routine will return 0
	 *
	 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
	 * delivered to the current task. In this case the remaining time
	 * will be returned
	 *
	 * The current task state is guaranteed to be TASK_RUNNING when this
	 * routine returns.
	 */
	static ktime_t __sched
	schedule_hrtimer(struct hrtimer *timer, const enum hrtimer_mode mode)
	{
		/* fn stays NULL, meaning single-shot wakeup: */
		timer->data = current;

		hrtimer_start(timer, timer->expires, mode);

		schedule();
		hrtimer_cancel(timer);

		/* The timer ran to completion: no time is left. */
		if (timer->state == HRTIMER_EXPIRED)
			return (ktime_t) {.tv64 = 0 };

		/* Return the remaining time: */
		return ktime_sub(timer->expires, timer->base->get_time());
	}

	/*
	 * Put the current task into TASK_INTERRUPTIBLE and sleep on @timer via
	 * schedule_hrtimer(); its return value is passed through unchanged.
	 */
	static inline ktime_t __sched
	schedule_hrtimer_interruptible(struct hrtimer *timer,
				       const enum hrtimer_mode mode)
	{
		ktime_t remaining;

		set_current_state(TASK_INTERRUPTIBLE);
		remaining = schedule_hrtimer(timer, mode);

		return remaining;
	}

	/*
	 * Restart handler for a nanosleep that returned with time remaining.
	 *
	 * The restart block carries the state saved by hrtimer_nanosleep():
	 *   arg0 - low 32 bits of the absolute expiry time
	 *   arg1 - high 32 bits of the absolute expiry time
	 *   arg2 - user pointer receiving the remaining time (may be NULL)
	 *   arg3 - index of the clock base to sleep on
	 *
	 * Returns 0 once no time remains, -EFAULT if the remaining time cannot
	 * be copied to user space, and -ERESTART_RESTARTBLOCK (with restart->fn
	 * re-armed) when time is still left.
	 */
	static long __sched nanosleep_restart(struct restart_block *restart)
	{
		struct timespec __user *rmtp;
		struct timespec tu;
		void *rfn_save = restart->fn;
		struct hrtimer timer;
		ktime_t rem;

		/* Until re-armed below, a further restart is refused. */
		restart->fn = do_no_restart_syscall;

		hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS);

		/* Reassemble the 64-bit expiry from its two saved halves. */
		timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;

		rem = schedule_hrtimer_interruptible(&timer, HRTIMER_ABS);

		if (rem.tv64 <= 0)
			return 0;

		rmtp = (struct timespec __user *) restart->arg2;
		tu = ktime_to_timespec(rem);
		if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
			return -EFAULT;

		restart->fn = rfn_save;

		/* The other values in restart are already filled in */
		return -ERESTART_RESTARTBLOCK;
	}

	long hrtimer_nanosleep(struct timespec rqtp, struct timespec __user rmtp,
	const enum hrtimer_mode mode, const clockid_t clockid)
	{
	struct restart_block *restart;
	struct hrtimer timer;
	struct timespec tu;
	ktime_t rem;

	hrtimer_init(&timer, clockid, mode);

	timer.expires = timespec_to_ktime(*rqtp);

	rem = schedule_hrtimer_interruptible(&timer, mode);
	if (rem.tv64 <= 0)
	return 0;

	/* Absolute timers do not update the rmtp value and restart: */
	if (mode == HRTIMER_ABS)
	return -ERESTARTNOHAND;

	tu = ktime_to_timespec(rem);

	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
	return -EFAULT;

	restart = &current_thread_info()->restart_block;
	restart->fn = nanosleep_restart;
	restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
	restart->arg1 = timer.expires.tv64 >> 32;
	restart->arg2 = (unsigned long) rmtp;
	restart->arg3 = (unsigned long) timer.base->index;

	return -ERESTART_RESTARTBLOCK;
	}

	asmlinkage long
	sys_nanosleep(struct timespec __user rqtp, struct timespec __user rmtp)
	{
	struct timespec tu;

	if (copy_from_user(&tu, rqtp, sizeof(tu)))
	return -EFAULT;

	if (!timespec_valid(&tu))
	return -EINVAL;

	return hrtimer_nanosleep(&tu, rmtp, HRTIMER_REL, CLOCK_MONOTONIC);
	}

	/*
	* Functions related to boot-time initialization:
	*/
	/*
	 * Initialize the lock of every hrtimer base belonging to @cpu.
	 */
	static void __devinit init_hrtimers_cpu(int cpu)
	{
		struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
		struct hrtimer_base *end = base + MAX_HRTIMER_BASES;

		for (; base != end; base++)
			spin_lock_init(&base->lock);
	}

	#ifdef CONFIG_HOTPLUG_CPU

	/*
	 * Move every timer queued on @old_base over to @new_base: each timer
	 * is removed from the old base, re-pointed at the new base and
	 * enqueued there, until the old base's active tree is empty.
	 */
	static void migrate_hrtimer_list(struct hrtimer_base *old_base,
					 struct hrtimer_base *new_base)
	{
		struct rb_node *first;

		for (first = rb_first(&old_base->active); first;
		     first = rb_first(&old_base->active)) {
			struct hrtimer *timer = rb_entry(first, struct hrtimer, node);

			__remove_hrtimer(timer, old_base);
			timer->base = new_base;
			enqueue_hrtimer(timer, new_base);
		}
	}

	static void migrate_hrtimers(int cpu)
	{
	struct hrtimer_base old_base, new_base;
	int i;

	BUG_ON(cpu_online(cpu));
	old_base = per_cpu(hrtimer_bases, cpu);
	new_base = get_cpu_var(hrtimer_bases);

	local_irq_disable();

	for (i = 0; i < MAX_HRTIMER_BASES; i++) {

	spin_lock(&new_base->lock);
	spin_lock(&old_base->lock);

	BUG_ON(old_base->curr_timer);

	migrate_hrtimer_list(old_base, new_base);

	spin_unlock(&old_base->lock);
	spin_unlock(&new_base->lock);
	old_base++;
	new_base++;
	}

	local_irq_enable();
	put_cpu_var(hrtimer_bases);
	}
	#endif /* CONFIG_HOTPLUG_CPU */

	/*
	 * CPU notifier callback for hrtimers.
	 *
	 * CPU_UP_PREPARE initializes the bases of the CPU encoded in @hcpu.
	 * With CONFIG_HOTPLUG_CPU, CPU_DEAD migrates that CPU's timers away.
	 * Every other action is ignored. Always returns NOTIFY_OK.
	 */
	static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
						unsigned long action, void *hcpu)
	{
		long cpu = (long)hcpu;

		if (action == CPU_UP_PREPARE) {
			init_hrtimers_cpu(cpu);
			return NOTIFY_OK;
		}

	#ifdef CONFIG_HOTPLUG_CPU
		if (action == CPU_DEAD)
			migrate_hrtimers(cpu);
	#endif

		return NOTIFY_OK;
	}

	/*
	 * Notifier block whose callback is hrtimer_cpu_notify(); it is
	 * registered with register_cpu_notifier() in hrtimers_init().
	 */
	static struct notifier_block __devinitdata hrtimers_nb = {
	.notifier_call = hrtimer_cpu_notify,
	};

	/*
	 * Boot-time setup: run the CPU_UP_PREPARE handling directly for the
	 * CPU executing this code, then register the CPU notifier.
	 */
	void __init hrtimers_init(void)
	{
		void *this_cpu = (void *)(long)smp_processor_id();

		hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
				   this_cpu);
		register_cpu_notifier(&hrtimers_nb);
	}