blob: ff74fca39ed21693428e2f5276839581808c693b [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/core.c
*
* Core kernel scheduler code and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
#undef CREATE_TRACE_POINTS
#include "sched.h"
#include <linux/nospec.h>
#include <linux/kcov.h>
#include <linux/scs.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
#include "../workqueue_internal.h"
#include "../../fs/io-wq.h"
#include "../smpboot.h"
#include "pelt.h"
#include "smp.h"
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
* associated with them) to allow external modules to probe them.
*/
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#ifdef CONFIG_SCHED_DEBUG
/*
* Debugging: various feature bits
*
* If SCHED_DEBUG is disabled, each compilation unit has its own copy of
* sysctl_sched_features, defined in sched.h, to allow constants propagation
* at compile time and compiler optimization based on features default.
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
#endif
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
const_debug unsigned int sysctl_sched_nr_migrate = 32;
/*
* period over which we measure -rt task CPU usage in us.
* default: 1s
*/
unsigned int sysctl_sched_rt_period = 1000000;
__read_mostly int scheduler_running;
/*
* part of the period that we allow rt tasks to run in us.
* default: 0.95s
*/
int sysctl_sched_rt_runtime = 950000;
/*
* Serialization rules:
*
* Lock order:
*
* p->pi_lock
* rq->lock
* hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
*
* rq1->lock
* rq2->lock where: rq1 < rq2
*
* Regular state:
*
* Normal scheduling state is serialized by rq->lock. __schedule() takes the
* local CPU's rq->lock, it optionally removes the task from the runqueue and
* always looks at the local rq data structures to find the most eligible task
* to run next.
*
* Task enqueue is also under rq->lock, possibly taken from another CPU.
* Wakeups from another LLC domain might use an IPI to transfer the enqueue to
* the local CPU to avoid bouncing the runqueue state around [ see
* ttwu_queue_wakelist() ]
*
* Task wakeup, specifically wakeups that involve migration, are horribly
* complicated to avoid having to take two rq->locks.
*
* Special state:
*
* System-calls and anything external will use task_rq_lock() which acquires
* both p->pi_lock and rq->lock. As a consequence the state they change is
* stable while holding either lock:
*
* - sched_setaffinity()/
* set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
* - set_user_nice(): p->se.load, p->*prio
* - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
* p->se.load, p->rt_priority,
* p->dl.dl_{runtime, deadline, period, flags, bw, density}
* - sched_setnuma(): p->numa_preferred_nid
* - sched_move_task()/
* cpu_cgroup_fork(): p->sched_task_group
* - uclamp_update_active() p->uclamp*
*
* p->state <- TASK_*:
*
* is changed locklessly using set_current_state(), __set_current_state() or
* set_special_state(), see their respective comments, or by
* try_to_wake_up(). This latter uses p->pi_lock to serialize against
* concurrent self.
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
* is set by activate_task() and cleared by deactivate_task(), under
* rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
* p->on_cpu <- { 0, 1 }:
*
* is set by prepare_task() and cleared by finish_task() such that it will be
* set before p is scheduled-in and cleared after p is scheduled-out, both
* under rq->lock. Non-zero indicates the task is running on its CPU.
*
* [ The astute reader will observe that it is possible for two tasks on one
* CPU to have ->on_cpu = 1 at the same time. ]
*
* task_cpu(p): is changed by set_task_cpu(), the rules are:
*
* - Don't call set_task_cpu() on a blocked task:
*
* We don't care what CPU we're not running on, this simplifies hotplug,
* the CPU assignment of blocked tasks isn't required to be valid.
*
* - for try_to_wake_up(), called under p->pi_lock:
*
* This allows try_to_wake_up() to only take one rq->lock, see its comment.
*
* - for migration called under rq->lock:
* [ see task_on_rq_migrating() in task_rq_lock() ]
*
* o move_queued_task()
* o detach_task()
*
* - for migration called under double_rq_lock():
*
* o __migrate_swap_task()
* o push_rt_task() / pull_rt_task()
* o push_dl_task() / pull_dl_task()
* o dl_task_offline_migration()
*
*/
/*
* __task_rq_lock - lock the rq @p resides on.
*/
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock)
{
struct rq *rq;
lockdep_assert_held(&p->pi_lock);
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
raw_spin_unlock(&rq->lock);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
}
}
/*
* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
*/
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
/*
* move_queued_task() task_rq_lock()
*
* ACQUIRE (rq->lock)
* [S] ->on_rq = MIGRATING [L] rq = task_rq()
* WMB (__set_task_cpu()) ACQUIRE (rq->lock);
* [S] ->cpu = new_cpu [L] task_rq()
* [L] ->on_rq
* RELEASE (rq->lock)
*
* If we observe the old CPU in task_rq_lock(), the acquire of
* the old rq->lock will fully serialize against the stores.
*
* If we observe the new CPU in task_rq_lock(), the address
* dependency headed by '[L] rq = task_rq()' and the acquire
* will pair with the WMB to ensure we then also see migrating.
*/
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
}
}
/*
* RQ-clock updating methods:
*/
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
* In theory, the compile should just see 0 here, and optimize out the call
* to sched_rt_avg_update. But I don't trust it...
*/
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
* this case when a previous update_rq_clock() happened inside a
* {soft,}irq region.
*
* When this happens, we stop ->clock_task and only update the
* prev_irq_time stamp to account for the part that fit, so that a next
* update will consume the rest. This ensures ->clock_task is
* monotonic.
*
* It does however cause some slight miss-attribution of {soft,}irq
* time, a more accurate solution would be to update the irq_time using
* the current rq->clock timestamp, except that would require using
* atomic ops.
*/
if (irq_delta > delta)
irq_delta = delta;
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
rq->prev_steal_time_rq += steal;
delta -= steal;
}
#endif
rq->clock_task += delta;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
update_rq_clock_pelt(rq, delta);
}
void update_rq_clock(struct rq *rq)
{
s64 delta;
lockdep_assert_held(&rq->lock);
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
#ifdef CONFIG_SCHED_DEBUG
if (sched_feat(WARN_DOUBLE_CLOCK))
SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
*/
static void hrtick_clear(struct rq *rq)
{
if (hrtimer_active(&rq->hrtick_timer))
hrtimer_cancel(&rq->hrtick_timer);
}
/*
* High-resolution timer tick.
* Runs from hardirq context with interrupts disabled.
*/
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
struct rq_flags rf;
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
rq_lock(rq, &rf);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
}
#ifdef CONFIG_SMP
static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
* called from hardirq (IPI) context
*/
static void __hrtick_start(void *arg)
{
struct rq *rq = arg;
struct rq_flags rf;
rq_lock(rq, &rf);
__hrtick_restart(rq);
rq_unlock(rq, &rf);
}
/*
* Called to set the hrtick timer state.
*
* called with rq->lock held and irqs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time;
s64 delta;
/*
* Don't schedule slices shorter than 10000ns, that just
* doesn't make sense and can cause timer DoS.
*/
delta = max_t(s64, delay, 10000LL);
time = ktime_add_ns(timer->base->get_time(), delta);
hrtimer_set_expires(timer, time);
if (rq == this_rq())
__hrtick_restart(rq);
else
smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}
#else
/*
* Called to set the hrtick timer state.
*
* called with rq->lock held and irqs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
/*
* Don't schedule slices shorter than 10000ns, that just
* doesn't make sense. Rely on vruntime for fairness.
*/
delay = max_t(u64, delay, 10000LL);
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
HRTIMER_MODE_REL_PINNED_HARD);
}
#endif /* CONFIG_SMP */
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}
static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif /* CONFIG_SCHED_HRTICK */
/*
* cmpxchg based fetch_or, macro so it works for different integer types
*/
#define fetch_or(ptr, mask) \
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
typeof(*_ptr) _old, _val = *_ptr; \
\
for (;;) { \
_old = cmpxchg(_ptr, _val, _val | _mask); \
if (_old == _val) \
break; \
_val = _old; \
} \
_old; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
static bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
/*
* Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
*
* If this returns true, then the idle task promises to call
* sched_ttwu_pending() and reschedule soon.
*/
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
typeof(ti->flags) old, val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
if (old == val)
break;
val = old;
}
return true;
}
#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
#endif
#endif
static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
struct wake_q_node *node = &task->wake_q;
/*
* Atomically grab the task, if ->wake_q is !nil already it means
* it's already queued (either by us or someone else) and will get the
* wakeup due to that.
*
* In order to ensure that a pending wakeup will observe our pending
* state, even in the failed case, an explicit smp_mb() must be used.
*/
smp_mb__before_atomic();
if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
return false;
/*
* The head is context local, there can be no concurrency.
*/
*head->lastp = node;
head->lastp = &node->next;
return true;
}
/**
* wake_q_add() - queue a wakeup for 'later' waking.
* @head: the wake_q_head to add @task to
* @task: the task to queue for 'later' wakeup
*
* Queue a task for later wakeup, most likely by the wake_up_q() call in the
* same context, _HOWEVER_ this is not guaranteed, the wakeup can come
* instantly.
*
* This function must be used as-if it were wake_up_process(); IOW the task
* must be ready to be woken at this location.
*/
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
if (__wake_q_add(head, task))
get_task_struct(task);
}
/**
* wake_q_add_safe() - safely queue a wakeup for 'later' waking.
* @head: the wake_q_head to add @task to
* @task: the task to queue for 'later' wakeup
*
* Queue a task for later wakeup, most likely by the wake_up_q() call in the
* same context, _HOWEVER_ this is not guaranteed, the wakeup can come
* instantly.
*
* This function must be used as-if it were wake_up_process(); IOW the task
* must be ready to be woken at this location.
*
* This function is essentially a task-safe equivalent to wake_q_add(). Callers
* that already hold reference to @task can call the 'safe' version and trust
* wake_q to do the right thing depending whether or not the @task is already
* queued for wakeup.
*/
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
if (!__wake_q_add(head, task))
put_task_struct(task);
}
void wake_up_q(struct wake_q_head *head)
{
struct wake_q_node *node = head->first;
while (node != WAKE_Q_TAIL) {
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
BUG_ON(!task);
/* Task can safely be re-inserted now: */
node = node->next;
task->wake_q.next = NULL;
/*
* wake_up_process() executes a full barrier, which pairs with
* the queueing in wake_q_add() so as not to miss wakeups.
*/
wake_up_process(task);
put_task_struct(task);
}
}
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
void resched_curr(struct rq *rq)
{
struct task_struct *curr = rq->curr;
int cpu;
lockdep_assert_held(&rq->lock);
if (test_tsk_need_resched(curr))
return;
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
set_tsk_need_resched(curr);
set_preempt_need_resched();
return;
}
if (set_nr_and_not_polling(curr))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
raw_spin_lock_irqsave(&rq->lock, flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy CPU for migrating timers
* from an idle CPU. This is good for power-savings.
*
* We don't do similar optimization for completely idle system, as
* selecting an idle CPU will add more delays to the timers than intended
* (as that CPU's timer base may not be uptodate wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu_and(i, sched_domain_span(sd),
housekeeping_cpumask(HK_FLAG_TIMER)) {
if (cpu == i)
continue;
if (!idle_cpu(i)) {
cpu = i;
goto unlock;
}
}
}
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
cpu = default_cpu;
unlock:
rcu_read_unlock();
return cpu;
}
/*
* When add_timer_on() enqueues a timer into the timer wheel of an
* idle CPU then this timer might expire before the next timer event
* which is scheduled to wake up that CPU. In case of a completely
* idle system the next event might even be infinite time into the
* future. wake_up_idle_cpu() ensures that the CPU is woken up and
* leaves the inner idle loop so the newly added timer is taken into
* account when the CPU goes back to idle and evaluates the timer
* wheel for the next timer event.
*/
static void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (cpu == smp_processor_id())
return;
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
static bool wake_up_full_nohz_cpu(int cpu)
{
/*
* We just need the target to call irq_exit() and re-evaluate
* the next tick. The nohz full kick at least implies that.
* If needed we can still optimize that later with an
* empty IRQ.
*/
if (cpu_is_offline(cpu))
return true; /* Don't try to wake offline CPUs. */
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
tick_nohz_full_kick_cpu(cpu);
return true;
}
return false;
}
/*
* Wake up the specified CPU. If the CPU is going offline, it is the
* caller's responsibility to deal with the lost wakeup, for example,
* by hooking into the CPU_DEAD notifier like timers and hrtimers do.
*/
void wake_up_nohz_cpu(int cpu)
{
if (!wake_up_full_nohz_cpu(cpu))
wake_up_idle_cpu(cpu);
}
static void nohz_csd_func(void *info)
{
struct rq *rq = info;
int cpu = cpu_of(rq);
unsigned int flags;
/*
* Release the rq::nohz_csd.
*/
flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
rq->idle_balance = idle_cpu(cpu);
if (rq->idle_balance && !need_resched()) {
rq->nohz_idle_balance = flags;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
}
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
/* Deadline tasks, even if single, need the tick */
if (rq->dl.dl_nr_running)
return false;
/*
* If there are more than one RR tasks, we need the tick to affect the
* actual RR behaviour.
*/
if (rq->rt.rr_nr_running) {
if (rq->rt.rr_nr_running == 1)
return true;
else
return false;
}
/*
* If there's no RR tasks, but FIFO tasks, we can skip the tick, no
* forced preemption between FIFO tasks.
*/
fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
if (fifo_nr_running)
return true;
/*
* If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
* if there's more than one we need the tick for involuntary
* preemption.
*/
if (rq->nr_running > 1)
return false;
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
#endif /* CONFIG_SMP */
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
* Iterate task_group tree rooted at *from, calling @down when first entering a
* node and @up when leaving it for the final time.
*
* Caller must hold rcu_lock or sufficient equivalent.
*/
int walk_tg_tree_from(struct task_group *from,
tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
int ret;
parent = from;
down:
ret = (*down)(parent, data);
if (ret)
goto out;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
up:
continue;
}
ret = (*up)(parent, data);
if (ret || parent == from)
goto out;
child = parent;
parent = parent->parent;
if (parent)
goto up;
out:
return ret;
}
int tg_nop(struct task_group *tg, void *data)
{
return 0;
}
#endif
static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
/*
* SCHED_IDLE tasks get minimal weight:
*/
if (task_has_idle_policy(p)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
return;
}
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
if (update_load && p->sched_class == &fair_sched_class) {
reweight_task(p, prio);
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
}
}
#ifdef CONFIG_UCLAMP_TASK
/*
* Serializes updates of utilization clamp values
*
* The (slow-path) user-space triggers utilization clamp value updates which
* can require updates on (fast-path) scheduler's data structures used to
* support enqueue/dequeue operations.
* While the per-CPU rq lock protects fast-path update operations, user-space
* requests are serialized using a mutex to reduce the risk of conflicting
* updates or API abuses.
*/
static DEFINE_MUTEX(uclamp_mutex);
/* Max allowed minimum utilization */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
/*
* By default RT tasks run at the maximum performance point/capacity of the
* system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
* SCHED_CAPACITY_SCALE.
*
* This knob allows admins to change the default behavior when uclamp is being
* used. In battery powered devices, particularly, running at the maximum
* capacity and frequency will increase energy consumption and shorten the
* battery life.
*
* This knob only affects RT tasks that their uclamp_se->user_defined == false.
*
* This knob will not override the system default sched_util_clamp_min defined
* above.
*/
unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
/*
* This static key is used to reduce the uclamp overhead in the fast path. It
* primarily disables the call to uclamp_rq_{inc, dec}() in
* enqueue/dequeue_task().
*
* This allows users to continue to enable uclamp in their kernel config with
* minimum uclamp overhead in the fast path.
*
* As soon as userspace modifies any of the uclamp knobs, the static key is
* enabled, since we have an actual users that make use of uclamp
* functionality.
*
* The knobs that would enable this static key are:
*
* * A task modifying its uclamp value with sched_setattr().
* * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
* * An admin modifying the cgroup cpu.uclamp.{min, max}
*/
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
#define for_each_clamp_id(clamp_id) \
for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
return clamp_value / UCLAMP_BUCKET_DELTA;
}
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
return SCHED_CAPACITY_SCALE;
}
static inline void uclamp_se_set(struct uclamp_se *uc_se,
unsigned int value, bool user_defined)
{
uc_se->value = value;
uc_se->bucket_id = uclamp_bucket_id(value);
uc_se->user_defined = user_defined;
}
static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/*
* Avoid blocked utilization pushing up the frequency when we go
* idle (which drops the max-clamp) by retaining the last known
* max-clamp.
*/
if (clamp_id == UCLAMP_MAX) {
rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
return clamp_value;
}
return uclamp_none(UCLAMP_MIN);
}
static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/* Reset max-clamp retention only on idle exit */
if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
}
static inline
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
int bucket_id = UCLAMP_BUCKETS - 1;
/*
* Since both min and max clamps are max aggregated, find the
* top most bucket with tasks in.
*/
for ( ; bucket_id >= 0; bucket_id--) {
if (!bucket[bucket_id].tasks)
continue;
return bucket[bucket_id].value;
}
/* No tasks -- default clamp values */
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
unsigned int default_util_min;
struct uclamp_se *uc_se;
lockdep_assert_held(&p->pi_lock);
uc_se = &p->uclamp_req[UCLAMP_MIN];
/* Only sync if user didn't override the default */
if (uc_se->user_defined)
return;
default_util_min = sysctl_sched_uclamp_util_min_rt_default;
uclamp_se_set(uc_se, default_util_min, false);
}
static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
if (!rt_task(p))
return;
/* Protect updates to p->uclamp_* */
rq = task_rq_lock(p, &rf);
__uclamp_update_util_min_rt_default(p);
task_rq_unlock(rq, p, &rf);
}
static void uclamp_sync_util_min_rt_default(void)
{
struct task_struct *g, *p;
/*
* copy_process() sysctl_uclamp
* uclamp_min_rt = X;
* write_lock(&tasklist_lock) read_lock(&tasklist_lock)
* // link thread smp_mb__after_spinlock()
* write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
* sched_post_fork() for_each_process_thread()
* __uclamp_sync_rt() __uclamp_sync_rt()
*
* Ensures that either sched_post_fork() will observe the new
* uclamp_min_rt or for_each_process_thread() will observe the new
* task.
*/
read_lock(&tasklist_lock);
smp_mb__after_spinlock();
read_unlock(&tasklist_lock);
rcu_read_lock();
for_each_process_thread(g, p)
uclamp_update_util_min_rt_default(p);
rcu_read_unlock();
}
static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
struct uclamp_se uc_max;
/*
* Tasks in autogroups or root task group will be
* restricted by system defaults.
*/
if (task_group_is_autogroup(task_group(p)))
return uc_req;
if (task_group(p) == &root_task_group)
return uc_req;
uc_max = task_group(p)->uclamp[clamp_id];
if (uc_req.value > uc_max.value || !uc_req.user_defined)
return uc_max;
#endif
return uc_req;
}
/*
* The effective clamp bucket index of a task depends on, by increasing
* priority:
* - the task specific clamp value, when explicitly requested from userspace
* - the task group effective clamp value, for tasks not either in the root
* group or in an autogroup
* - the system default clamp value, defined by the sysadmin
*/
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
struct uclamp_se uc_max = uclamp_default[clamp_id];
/* System default restrictions always apply */
if (unlikely(uc_req.value > uc_max.value))
return uc_max;
return uc_req;
}
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
/* Task currently refcounted: use back-annotated (effective) value */
if (p->uclamp[clamp_id].active)
return (unsigned long)p->uclamp[clamp_id].value;
uc_eff = uclamp_eff_get(p, clamp_id);
return (unsigned long)uc_eff.value;
}
/*
* When a task is enqueued on a rq, the clamp bucket currently defined by the
* task's uclamp::bucket_id is refcounted on that rq. This also immediately
* updates the rq's clamp value if required.
*
* Tasks can have a task-specific value requested from user-space, track
* within each bucket the maximum value for tasks refcounted in it.
* This "local max aggregation" allows to track the exact "requested" value
* for each bucket when all its RUNNABLE tasks require the same clamp.
*/
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
lockdep_assert_held(&rq->lock);
/* Update task effective clamp */
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
bucket = &uc_rq->bucket[uc_se->bucket_id];
bucket->tasks++;
uc_se->active = true;
uclamp_idle_reset(rq, clamp_id, uc_se->value);
/*
* Local max aggregation: rq buckets always track the max
* "requested" clamp value of its RUNNABLE tasks.
*/
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
if (uc_se->value > READ_ONCE(uc_rq->value))
WRITE_ONCE(uc_rq->value, uc_se->value);
}
/*
* When a task is dequeued from a rq, the clamp bucket refcounted by the task
* is released. If this is the last task reference counting the rq's max
* active clamp value, then the rq's clamp value is updated.
*
* Both refcounted tasks and rq's cached clamp values are expected to be
* always valid. If it's detected they are not, as defensive programming,
* enforce the expected state and warn.
*/
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
unsigned int bkt_clamp;
unsigned int rq_clamp;
lockdep_assert_held(&rq->lock);
/*
* If sched_uclamp_used was enabled after task @p was enqueued,
* we could end up with unbalanced call to uclamp_rq_dec_id().
*
* In this case the uc_se->active flag should be false since no uclamp
* accounting was performed at enqueue time and we can just return
* here.
*
* Need to be careful of the following enqueue/dequeue ordering
* problem too
*
* enqueue(taskA)
* // sched_uclamp_used gets enabled
* enqueue(taskB)
* dequeue(taskA)
* // Must not decrement bucket->tasks here
* dequeue(taskB)
*
* where we could end up with stale data in uc_se and
* bucket[uc_se->bucket_id].
*
* The following check here eliminates the possibility of such race.
*/
if (unlikely(!uc_se->active))
return;
bucket = &uc_rq->bucket[uc_se->bucket_id];
SCHED_WARN_ON(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
uc_se->active = false;
/*
* Keep "local max aggregation" simple and accept to (possibly)
* overboost some RUNNABLE tasks in the same bucket.
* The rq clamp bucket value is reset to its base value whenever
* there are no more RUNNABLE tasks refcounting it.
*/
if (likely(bucket->tasks))
return;
rq_clamp = READ_ONCE(uc_rq->value);
/*
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fixup the expected value.
*/
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
WRITE_ONCE(uc_rq->value, bkt_clamp);
}
}
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
/*
* Avoid any overhead until uclamp is actually used by the userspace.
*
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
if (!static_branch_unlikely(&sched_uclamp_used))
return;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
for_each_clamp_id(clamp_id)
uclamp_rq_inc_id(rq, p, clamp_id);
/* Reset clamp idle holding when there is one RUNNABLE task */
if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
enum uclamp_id clamp_id;
/*
* Avoid any overhead until uclamp is actually used by the userspace.
*
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
if (!static_branch_unlikely(&sched_uclamp_used))
return;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
for_each_clamp_id(clamp_id)
uclamp_rq_dec_id(rq, p, clamp_id);
}
static inline void
uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
{
struct rq_flags rf;
struct rq *rq;
/*
* Lock the task and the rq where the task is (or was) queued.
*
* We might lock the (previous) rq of a !RUNNABLE task, but that's the
* price to pay to safely serialize util_{min,max} updates with
* enqueues, dequeues and migration operations.
* This is the same locking schema used by __set_cpus_allowed_ptr().
*/
rq = task_rq_lock(p, &rf);
/*
* Setting the clamp bucket is serialized by task_rq_lock().
* If the task is not yet RUNNABLE and its task_struct is not
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
if (p->uclamp[clamp_id].active) {
uclamp_rq_dec_id(rq, p, clamp_id);
uclamp_rq_inc_id(rq, p, clamp_id);
}
task_rq_unlock(rq, p, &rf);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
static inline void
uclamp_update_active_tasks(struct cgroup_subsys_state *css,
unsigned int clamps)
{
enum uclamp_id clamp_id;
struct css_task_iter it;
struct task_struct *p;
css_task_iter_start(css, 0, &it);
while ((p = css_task_iter_next(&it))) {
for_each_clamp_id(clamp_id) {
if ((0x1 << clamp_id) & clamps)
uclamp_update_active(p, clamp_id);
}
}
css_task_iter_end(&it);
}
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
static void uclamp_update_root_tg(void)
{
struct task_group *tg = &root_task_group;
uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
rcu_read_lock();
cpu_util_update_eff(&root_task_group.css);
rcu_read_unlock();
}
#else
static void uclamp_update_root_tg(void) { }
#endif
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
bool update_root_tg = false;
int old_min, old_max, old_min_rt;
int result;
mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
result = proc_dointvec(table, write, buffer, lenp, ppos);
if (result)
goto undo;
if (!write)
goto done;
if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
result = -EINVAL;
goto undo;
}
if (old_min != sysctl_sched_uclamp_util_min) {
uclamp_se_set(&uclamp_default[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
update_root_tg = true;
}
if (old_max != sysctl_sched_uclamp_util_max) {
uclamp_se_set(&uclamp_default[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
update_root_tg = true;
}
if (update_root_tg) {
static_branch_enable(&sched_uclamp_used);
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
static_branch_enable(&sched_uclamp_used);
uclamp_sync_util_min_rt_default();
}
/*
* We update all RUNNABLE tasks only when task groups are in use.
* Otherwise, keep it simple and do just a lazy update at each next
* task enqueue time.
*/
goto done;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
done:
mutex_unlock(&uclamp_mutex);
return result;
}
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
int util_min = p->uclamp_req[UCLAMP_MIN].value;
int util_max = p->uclamp_req[UCLAMP_MAX].value;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
util_min = attr->sched_util_min;
if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
return -EINVAL;
}
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
util_max = attr->sched_util_max;
if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
return -EINVAL;
}
if (util_min != -1 && util_max != -1 && util_min > util_max)
return -EINVAL;
/*
* We have valid uclamp attributes; make sure uclamp is enabled.
*
* We need to do that here, because enabling static branches is a
* blocking operation which obviously cannot be done while holding
* scheduler locks.
*/
static_branch_enable(&sched_uclamp_used);
return 0;
}
static bool uclamp_reset(const struct sched_attr *attr,
enum uclamp_id clamp_id,
struct uclamp_se *uc_se)
{
/* Reset on sched class change for a non user-defined clamp value. */
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
!uc_se->user_defined)
return true;
/* Reset on sched_util_{min,max} == -1. */
if (clamp_id == UCLAMP_MIN &&
attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
attr->sched_util_min == -1) {
return true;
}
if (clamp_id == UCLAMP_MAX &&
attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
attr->sched_util_max == -1) {
return true;
}
return false;
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
unsigned int value;
if (!uclamp_reset(attr, clamp_id, uc_se))
continue;
/*
* RT by default have a 100% boost value that could be modified
* at runtime.
*/
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
value = sysctl_sched_uclamp_util_min_rt_default;
else
value = uclamp_none(clamp_id);
uclamp_se_set(uc_se, value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
return;
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
attr->sched_util_min != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
attr->sched_util_min, true);
}
if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
attr->sched_util_max != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
attr->sched_util_max, true);
}
}
static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
/*
* We don't need to hold task_rq_lock() when updating p->uclamp_* here
* as the task is still at its early fork stages.
*/
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
if (likely(!p->sched_reset_on_fork))
return;
for_each_clamp_id(clamp_id) {
uclamp_se_set(&p->uclamp_req[clamp_id],
uclamp_none(clamp_id), false);
}
}
static void uclamp_post_fork(struct task_struct *p)
{
uclamp_update_util_min_rt_default(p);
}
static void __init init_uclamp_rq(struct rq *rq)
{
enum uclamp_id clamp_id;
struct uclamp_rq *uc_rq = rq->uclamp;
for_each_clamp_id(clamp_id) {
uc_rq[clamp_id] = (struct uclamp_rq) {
.value = uclamp_none(clamp_id)
};
}
rq->uclamp_flags = 0;
}
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
enum uclamp_id clamp_id;
int cpu;
for_each_possible_cpu(cpu)
init_uclamp_rq(cpu_rq(cpu));
for_each_clamp_id(clamp_id) {
uclamp_se_set(&init_task.uclamp_req[clamp_id],
uclamp_none(clamp_id), false);
}
/* System defaults allow max clamp values for both indexes */
uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
for_each_clamp_id(clamp_id) {
uclamp_default[clamp_id] = uc_max;
#ifdef CONFIG_UCLAMP_TASK_GROUP
root_task_group.uclamp_req[clamp_id] = uc_max;
root_task_group.uclamp[clamp_id] = uc_max;
#endif
}
}
#else /* CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
#endif /* CONFIG_UCLAMP_TASK */
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & ENQUEUE_RESTORE)) {
sched_info_queued(rq, p);
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}
uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
if (!(flags & DEQUEUE_SAVE)) {
sched_info_dequeued(rq, p);
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
uclamp_rq_dec(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
enqueue_task(rq, p, flags);
p->on_rq = TASK_ON_RQ_QUEUED;
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
dequeue_task(rq, p, flags);
}
/*
* __normal_prio - return the priority that is based on the static prio
*/
static inline int __normal_prio(struct task_struct *p)
{
return p->static_prio;
}
/*
* Calculate the expected normal priority: i.e. priority
* without taking RT-inheritance into account. Might be
* boosted by interactivity modifiers. Changes upon fork,
* setprio syscalls, and whenever the interactivity
* estimator recalculates.
*/
static inline int normal_prio(struct task_struct *p)
{
int prio;
if (task_has_dl_policy(p))
prio = MAX_DL_PRIO-1;
else if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
return prio;
}
/*
* Calculate the current priority, i.e. the priority
* taken into account by the scheduler. This value might
* be boosted by RT tasks, or might be boosted by
* interactivity modifiers. Will be RT if the task got
* RT-boosted. If not then it returns p->normal_prio.
*/
static int effective_prio(struct task_struct *p)
{
p->normal_prio = normal_prio(p);
/*
* If we are RT tasks or we were boosted to RT priority,
* keep the priority unchanged. Otherwise, update priority
* to the normal priority:
*/
if (!rt_prio(p->prio))
return p->normal_prio;
return p->prio;
}
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
*
* Return: 1 if the task is currently executing. 0 otherwise.
*/
inline int task_curr(const struct task_struct *p)
{
return cpu_curr(task_cpu(p)) == p;
}
/*
* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
* use the balance_callback list if you want balancing.
*
* this means any call to check_class_changed() must be followed by a call to
* balance_callback().
*/
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
}
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
else if (p->sched_class > rq->curr->sched_class)
resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
rq_clock_skip_update(rq);
}
#ifdef CONFIG_SMP
static void
__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask,
u32 flags);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
if (likely(!p->migration_disabled))
return;
if (p->cpus_ptr != &p->cpus_mask)
return;
/*
* Violates locking rules! see comment in __do_set_cpus_allowed().
*/
__do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
}
void migrate_disable(void)
{
struct task_struct *p = current;
if (p->migration_disabled) {
p->migration_disabled++;
return;
}
preempt_disable();
this_rq()->nr_pinned++;
p->migration_disabled = 1;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_disable);
void migrate_enable(void)
{
struct task_struct *p = current;
if (p->migration_disabled > 1) {
p->migration_disabled--;
return;
}
/*
* Ensure stop_task runs either before or after this, and that
* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
*/
preempt_disable();
if (p->cpus_ptr != &p->cpus_mask)
__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
/*
* Mustn't clear migration_disabled() until cpus_ptr points back at the
* regular cpus_mask, otherwise things that race (eg.
* select_fallback_rq) get confused.
*/
barrier();
p->migration_disabled = 0;
this_rq()->nr_pinned--;
preempt_enable();
}
EXPORT_SYMBOL_GPL(migrate_enable);
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return rq->nr_pinned;
}
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
/* When not in the task's cpumask, no point in looking further. */
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
/* migrate_disabled() must be allowed to finish. */
if (is_migration_disabled(p))
return cpu_online(cpu);
/* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD))
return cpu_active(cpu);
/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
return cpu_online(cpu);
/* Regular kernel threads don't get to stay during offline. */
if (cpu_rq(cpu)->balance_push)
return false;
/* But are allowed during online. */
return cpu_online(cpu);
}
/*
* This is how migration works:
*
* 1) we invoke migration_cpu_stop() on the target CPU using
* stop_one_cpu().
* 2) stopper starts to run (implicitly forcing the migrated thread
* off the CPU)
* 3) it checks whether the migrated task is still in the wrong runqueue.
* 4) if it's in the wrong runqueue then the migration thread removes
* it and puts it into the right queue.
* 5) stopper completes and stop_one_cpu() returns and the migration
* is done.
*/
/*
* move_queued_task - move a queued task to new rq.
*
* Returns (locked) new rq. Old rq's lock is released.
*/
static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int new_cpu)
{
lockdep_assert_held(&rq->lock);
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
rq_unlock(rq, rf);
rq = cpu_rq(new_cpu);
rq_lock(rq, rf);
BUG_ON(task_cpu(p) != new_cpu);
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
return rq;
}
struct migration_arg {
struct task_struct *task;
int dest_cpu;
struct set_affinity_pending *pending;
};
struct set_affinity_pending {
refcount_t refs;
struct completion done;
struct cpu_stop_work stop_work;
struct migration_arg arg;
};
/*
* Move (not current) task off this CPU, onto the destination CPU. We're doing
* this because either it can't run here any more (set_cpus_allowed()
* away from this CPU, or CPU going down), or because we're
* attempting to rebalance this task on exec (sched_exec).
*
* So we race with normal scheduler movements, but that's OK, as long
* as the task is no longer on this CPU.
*/
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
/* Affinity changed (again). */
if (!is_cpu_allowed(p, dest_cpu))
return rq;
update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
}
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
* 'pushing' onto another runqueue.
*/
static int migration_cpu_stop(void *data)
{
struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
bool complete = false;
struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
flush_smp_call_function_from_idle();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
if (is_migration_disabled(p))
goto out;
if (pending) {
p->migration_pending = NULL;
complete = true;
}
/* migrate_enable() -- we must not race against SCA */
if (dest_cpu < 0) {
/*
* When this was migrate_enable() but we no longer
* have a @pending, a concurrent SCA 'fixed' things
* and we should be valid again. Nothing to do.
*/
if (!pending) {
WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
goto out;
}
dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
if (task_on_rq_queued(p))
rq = __migrate_task(rq, &rf, p, dest_cpu);
else
p->wake_cpu = dest_cpu;
} else if (dest_cpu < 0 || pending) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
* point we're a regular task again and not current anymore.
*
* A !PREEMPT kernel has a giant hole here, which makes it far
* more likely.
*/
/*
* The task moved before the stopper got to run. We're holding
* ->pi_lock, so the allowed mask is stable - if it got
* somewhere allowed, we're done.
*/
if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
p->migration_pending = NULL;
complete = true;
goto out;
}
/*
* When this was migrate_enable() but we no longer have an
* @pending, a concurrent SCA 'fixed' things and we should be
* valid again. Nothing to do.
*/
if (!pending) {
WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
goto out;
}
/*
* When migrate_enable() hits a rq mis-match we can't reliably
* determine is_migration_disabled() and so have to chase after
* it.
*/
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
out:
task_rq_unlock(rq, p, &rf);
if (complete)
complete_all(&pending->done);
/* For pending->{arg,stop_work} */
pending = arg->pending;
if (pending && refcount_dec_and_test(&pending->refs))
wake_up_var(&pending->refs);
return 0;
}
int push_cpu_stop(void *arg)
{
struct rq *lowest_rq = NULL, *rq = this_rq();
struct task_struct *p = arg;
raw_spin_lock_irq(&p->pi_lock);
raw_spin_lock(&rq->lock);
if (task_rq(p) != rq)
goto out_unlock;
if (is_migration_disabled(p)) {
p->migration_flags |= MDF_PUSH;
goto out_unlock;
}
p->migration_flags &= ~MDF_PUSH;
if (p->sched_class->find_lock_rq)
lowest_rq = p->sched_class->find_lock_rq(p, rq);
if (!lowest_rq)
goto out_unlock;
// XXX validate p is still the highest prio task
if (task_rq(p) == rq) {
deactivate_task(rq, p, 0);
set_task_cpu(p, lowest_rq->cpu);
activate_task(lowest_rq, p, 0);
resched_curr(lowest_rq);
}
double_unlock_balance(rq, lowest_rq);
out_unlock:
rq->push_busy = false;
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
return 0;
}
/*
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
p->cpus_ptr = new_mask;
return;
}
cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
static void
__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
struct rq *rq = task_rq(p);
bool queued, running;
/*
* This here violates the locking rules for affinity, since we're only
* supposed to change these variables while holding both rq->lock and
* p->pi_lock.
*
* HOWEVER, it magically works, because ttwu() is the only code that
* accesses these variables under p->pi_lock and only does so after
* smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
* before finish_task().
*
* XXX do further audits, this smells like something putrid.
*/
if (flags & SCA_MIGRATE_DISABLE)
SCHED_WARN_ON(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued) {
/*
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
lockdep_assert_held(&rq->lock);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, new_mask, flags);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_next_task(rq, p);
}
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
__do_set_cpus_allowed(p, new_mask, 0);
}
/*
* This function is wildly self concurrent; here be dragons.
*
*
* When given a valid mask, __set_cpus_allowed_ptr() must block until the
* designated task is enqueued on an allowed CPU. If that task is currently
* running, we have to kick it out using the CPU stopper.
*
* Migrate-Disable comes along and tramples all over our nice sandcastle.
* Consider:
*
* Initial conditions: P0->cpus_mask = [0, 1]
*
* P0@CPU0 P1
*
* migrate_disable();
* <preempted>
* set_cpus_allowed_ptr(P0, [1]);
*
* P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
* its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
* This means we need the following scheme:
*
* P0@CPU0 P1
*
* migrate_disable();
* <preempted>
* set_cpus_allowed_ptr(P0, [1]);
* <blocks>
* <resumes>
* migrate_enable();
* __set_cpus_allowed_ptr();
* <wakes local stopper>
* `--> <woken on migration completion>
*
* Now the fun stuff: there may be several P1-like tasks, i.e. multiple
* concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
* task p are serialized by p->pi_lock, which we can leverage: the one that
* should come into effect at the end of the Migrate-Disable region is the last
* one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
* but we still need to properly signal those waiting tasks at the appropriate
* moment.
*
* This is implemented using struct set_affinity_pending. The first
* __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
* setup an instance of that struct and install it on the targeted task_struct.
* Any and all further callers will reuse that instance. Those then wait for
* a completion signaled at the tail of the CPU stopper callback (1), triggered
* on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
*
*
* (1) In the cases covered above. There is one more where the completion is
* signaled within affine_move_task() itself: when a subsequent affinity request
* cancels the need for an active migration. Consider:
*
* Initial conditions: P0->cpus_mask = [0, 1]
*
* P0@CPU0 P1 P2
*
* migrate_disable();
* <preempted>
* set_cpus_allowed_ptr(P0, [1]);
* <blocks>
* set_cpus_allowed_ptr(P0, [0, 1]);
* <signal completion>
* <awakes>
*
* Note that the above is safe vs a concurrent migrate_enable(), as any
* pending affinity completion is preceded by an uninstallation of
* p->migration_pending done with p->pi_lock held.
*/
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
int dest_cpu, unsigned int flags)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
struct migration_arg arg = {
.task = p,
.dest_cpu = dest_cpu,
};
bool complete = false;
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
struct task_struct *push_task = NULL;
if ((flags & SCA_MIGRATE_ENABLE) &&
(p->migration_flags & MDF_PUSH) && !rq->push_busy) {
rq->push_busy = true;
push_task = get_task_struct(p);
}
pending = p->migration_pending;
if (pending) {
refcount_inc(&pending->refs);
p->migration_pending = NULL;
complete = true;
}
task_rq_unlock(rq, p, rf);
if (push_task) {
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
p, &rq->push_work);
}
if (complete)
goto do_complete;
return 0;
}
if (!(flags & SCA_MIGRATE_ENABLE)) {
/* serialized by p->pi_lock */
if (!p->migration_pending) {
/* Install the request */
refcount_set(&my_pending.refs, 1);
init_completion(&my_pending.done);
p->migration_pending = &my_pending;
} else {
pending = p->migration_pending;
refcount_inc(&pending->refs);
}
}
pending = p->migration_pending;
/*
* - !MIGRATE_ENABLE:
* we'll have installed a pending if there wasn't one already.
*
* - MIGRATE_ENABLE:
* we're here because the current CPU isn't matching anymore,
* the only way that can happen is because of a concurrent
* set_cpus_allowed_ptr() call, which should then still be
* pending completion.
*
* Either way, we really should have a @pending here.
*/
if (WARN_ON_ONCE(!pending)) {
task_rq_unlock(rq, p, rf);
return -EINVAL;
}
if (flags & SCA_MIGRATE_ENABLE) {
refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
p->migration_flags &= ~MDF_PUSH;
task_rq_unlock(rq, p, rf);
pending->arg = (struct migration_arg) {
.task = p,
.dest_cpu = -1,
.pending = pending,
};
stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
if (task_running(rq, p) || p->state == TASK_WAKING) {
/*
* Lessen races (and headaches) by delegating
* is_migration_disabled(p) checks to the stopper, which will
* run on the same CPU as said p.
*/
task_rq_unlock(rq, p, rf);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
} else {
if (!is_migration_disabled(p)) {
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);
p->migration_pending = NULL;
complete = true;
}
task_rq_unlock(rq, p, rf);
do_complete:
if (complete)
complete_all(&pending->done);
}
wait_for_completion(&pending->done);
if (refcount_dec_and_test(&pending->refs))
wake_up_var(&pending->refs);
/*
* Block the original owner of &pending until all subsequent callers
* have seen the completion and decremented the refcount
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
return 0;
}
/*
* Change a given task's CPU affinity. Migrate the thread to a
* proper CPU and schedule it away if the CPU it's executing on
* is removed from the allowed bitmask.
*
* NOTE: the caller must have a valid reference to the task, the
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask,
u32 flags)
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
unsigned int dest_cpu;
struct rq_flags rf;
struct rq *rq;
int ret = 0;
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs,
* however, during cpu-hot-unplug, even these might get pushed
* away if not KTHREAD_IS_PER_CPU.
*
* Specifically, migration_disabled() tasks must not fail the
* cpumask_any_and_distribute() pick below, esp. so on
* SCA_MIGRATE_ENABLE, otherwise we'll not call
* set_cpus_allowed_common() and actually reset p->cpus_ptr.
*/
cpu_valid_mask = cpu_online_mask;
}
/*
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
*/
if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out;
}
if (!(flags & SCA_MIGRATE_ENABLE)) {
if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
if (WARN_ON_ONCE(p == current &&
is_migration_disabled(p) &&
!cpumask_test_cpu(task_cpu(p), new_mask))) {
ret = -EBUSY;
goto out;
}
}
/*
* Picking a ~random cpu helps in cases where we are changing affinity
* for groups of tasks (ie. cpuset), so that load balancing is not
* immediately required to distribute the tasks within their new mask.
*/
dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
if (dest_cpu >= nr_cpu_ids) {
ret = -EINVAL;
goto out;
}
__do_set_cpus_allowed(p, new_mask, flags);
return affine_move_task(rq, p, &rf, dest_cpu, flags);
out:
task_rq_unlock(rq, p, &rf);
return ret;
}
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
/*
* We should never call set_task_cpu() on a blocked task,
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!p->on_rq);
/*
* Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
* because schedstat_wait_{start,end} rebase migrating task's wait_start
* time relying on p->on_rq.
*/
WARN_ON_ONCE(p->state == TASK_RUNNING &&
p->sched_class == &fair_sched_class &&
(p->on_rq && !task_on_rq_migrating(p)));
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold either p->pi_lock or rq->lock, when changing
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
* see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
/*
* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
*/
WARN_ON_ONCE(!cpu_online(new_cpu));
WARN_ON_ONCE(is_migration_disabled(p));
#endif
trace_sched_migrate_task(p, new_cpu);
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
perf_event_task_migrate(p);
}
__set_task_cpu(p, new_cpu);
}
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
if (task_on_rq_queued(p)) {
struct rq *src_rq, *dst_rq;
struct rq_flags srf, drf;
src_rq = task_rq(p);
dst_rq = cpu_rq(cpu);
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
deactivate_task(src_rq, p, 0);
set_task_cpu(p, cpu);
activate_task(dst_rq, p, 0);
check_preempt_curr(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
rq_unpin_lock(src_rq, &srf);
} else {
/*
* Task isn't running anymore; make it appear like we migrated
* it before it went to sleep. This means on wakeup we make the
* previous CPU our target instead of where it really is.
*/
p->wake_cpu = cpu;
}
}
struct migration_swap_arg {
struct task_struct *src_task, *dst_task;
int src_cpu, dst_cpu;
};
static int migrate_swap_stop(void *data)
{
struct migration_swap_arg *arg = data;
struct rq *src_rq, *dst_rq;
int ret = -EAGAIN;
if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
return -EAGAIN;
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
double_raw_lock(&arg->src_task->pi_lock,
&arg->dst_task->pi_lock);
double_rq_lock(src_rq, dst_rq);
if (task_cpu(arg->dst_task) != arg->dst_cpu)
goto unlock;
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
goto unlock;
if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
__migrate_swap_task(arg->dst_task, arg->src_cpu);
ret = 0;
unlock:
double_rq_unlock(src_rq, dst_rq);
raw_spin_unlock(&arg->dst_task->pi_lock);
raw_spin_unlock(&arg->src_task->pi_lock);
return ret;
}
/*
* Cross migrate two tasks
*/
int migrate_swap(struct task_struct *cur, struct task_struct *p,
int target_cpu, int curr_cpu)
{
struct migration_swap_arg arg;
int ret = -EINVAL;
arg = (struct migration_swap_arg){
.src_task = cur,
.src_cpu = curr_cpu,
.dst_task = p,
.dst_cpu = target_cpu,
};
if (arg.src_cpu == arg.dst_cpu)
goto out;
/*
* These three tests are all lockless; this is OK since all of them
* will be re-checked with proper locks held further down the line.
*/
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
out:
return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* wait_task_inactive - wait for a thread to unschedule.
*
* If @match_state is nonzero, it's the @p->state value just checked and
* not expected to change. If it changes, i.e. @p might have woken up,
* then return zero. When we succeed in waiting for @p to be off its CPU,
* we return a positive number (its total switch count). If a second call
* a short while later returns the same number, the caller can be sure that
* @p has remained unscheduled the whole time.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
int running, queued;
struct rq_flags rf;
unsigned long ncsw;
struct rq *rq;
for (;;) {
/*
* We do the initial early heuristics without holding
* any task-queue locks at all. We'll only try to get
* the runqueue lock when things look like they will
* work out!
*/
rq = task_rq(p);
/*
* If the task is actively running on another CPU
* still, just relax and busy-wait without holding
* any locks.
*
* NOTE! Since we don't hold any locks, it's not
* even sure that "rq" stays as the right runqueue!
* But we don't care, since "task_running()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
while (task_running(rq, p)) {
if (match_state && unlikely(p->state != match_state))
return 0;
cpu_relax();
}
/*
* Ok, time to look more closely! We need the rq
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
rq = task_rq_lock(p, &rf);
trace_sched_wait_task(p);
running = task_running(rq, p);
queued = task_on_rq_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
task_rq_unlock(rq, p, &rf);
/*
* If it changed from the expected state, bail out now.
*/
if (unlikely(!ncsw))
break;
/*
* Was it really running after all now that we
* checked with the proper locks actually held?
*
* Oops. Go back and try again..
*/
if (unlikely(running)) {
cpu_relax();
continue;
}
/*
* It's not enough that it's not actively running,
* it must be off the runqueue _entirely_, and not
* preempted!
*
* So if it was still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
if (unlikely(queued)) {
ktime_t to = NSEC_PER_SEC / HZ;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_hrtimeout(&to, HRTIMER_MODE_REL);
continue;
}
/*
* Ahh, all good. It wasn't running, and it wasn't
* runnable, which means that it will never become
* running in the future either. We're all done!
*/
break;
}
return ncsw;
}
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
*
* Cause a process which is running on another CPU to enter
* kernel-mode, without any delay. (to get signals handled.)
*
* NOTE: this function doesn't have to take the runqueue lock,
* because all it wants to ensure is that the remote task enters
* the kernel. If the IPI races and the task has been migrated
* to another CPU then no harm is done and the purpose has been
* achieved as well.
*/
void kick_process(struct task_struct *p)
{
int cpu;
preempt_disable();
cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
/*
* ->cpus_ptr is protected by both rq->lock and p->pi_lock
*
* A few notes on cpu_active vs cpu_online:
*
* - cpu_active must be a subset of cpu_online
*
* - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
* see __set_cpus_allowed_ptr(). At this point the newly online
* CPU isn't yet part of the sched domains, and balancing will not
* see it.
*
* - on CPU-down we clear cpu_active() to mask the sched domains and
* avoid the load balancer to place new tasks on the to be removed
* CPU. Existing tasks will remain running there and will be taken
* off.
*
* This means that fallback selection must not select !active CPUs.
* And can assume that any active CPU must be online. Conversely
* select_task_rq() below may allow selection of !active CPUs in order
* to satisfy the above rules.
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
int nid = cpu_to_node(cpu);
const struct cpumask *nodemask = NULL;
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
/*
* If the node that the CPU is on has been offlined, cpu_to_node()
* will return -1. There is no CPU on the node, and we should
* select the CPU on the other node.
*/
if (nid != -1) {
nodemask = cpumask_of_node(nid);
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
goto out;
}
/* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
if (IS_ENABLED(CONFIG_CPUSETS)) {
cpuset_cpus_allowed_fallback(p);
state = possible;
break;
}
fallthrough;
case possible:
/*
* XXX When called from select_task_rq() we only
* hold p->pi_lock and again violate locking order.
*
* More yuck to audit.
*/
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
break;
case fail:
BUG();
break;
}
}
out:
if (state != cpuset) {
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
* leave kernel.
*/
if (p->mm && printk_ratelimit()) {
printk_deferred("process %d (%s) no longer affine to cpu%d\n",
task_pid_nr(p), p->comm, cpu);
}
}
return dest_cpu;
}
/*
* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
{
lockdep_assert_held(&p->pi_lock);
if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
else
cpu = cpumask_any(p->cpus_ptr);
/*
* In order not to call set_task_cpu() on a blocking task we need
* to rely on ttwu() to place the task on a valid ->cpus_ptr
* CPU.
*
* Since this is common to all placement strategies, this lives here.
*
* [ this allows ->select_task() to simply return task_cpu(p) and
* not worry about this generic constraint ]
*/
if (unlikely(!is_cpu_allowed(p, cpu)))
cpu = select_fallback_rq(task_cpu(p), p);
return cpu;
}
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
static struct lock_class_key stop_pi_lock;
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
struct task_struct *old_stop = cpu_rq(cpu)->stop;
if (stop) {
/*
* Make it appear like a SCHED_FIFO task, its something
* userspace knows about and won't get confused about.
*
* Also, it will make PI more or less work without too
* much confusion -- but then, stop work should not
* rely on PI working anyway.
*/
sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
stop->sched_class = &stop_sched_class;
/*
* The PI code calls rt_mutex_setprio() with ->pi_lock held to
* adjust the effective priority of a task. As a result,
* rt_mutex_setprio() can trigger (RT) balancing operations,
* which can then trigger wakeups of the stop thread to push
* around the current task.
*
* The stop task itself will never be part of the PI-chain, it
* never blocks, therefore that ->pi_lock recursion is safe.
* Tell lockdep about this by placing the stop->pi_lock in its
* own class.
*/
lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
}
cpu_rq(cpu)->stop = stop;
if (old_stop) {
/*
* Reset it back to a normal scheduling class so that
* it can die in pieces.
*/
old_stop->sched_class = &rt_sched_class;
}
}
#else /* CONFIG_SMP */
static inline int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask,
u32 flags)
{
return set_cpus_allowed_ptr(p, new_mask);
}
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
return false;
}
#endif /* !CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq;
if (!schedstat_enabled())
return;
rq = this_rq();
#ifdef CONFIG_SMP
if (cpu == rq->cpu) {
__schedstat_inc(rq->ttwu_local);
__schedstat_inc(p->se.statistics.nr_wakeups_local);
} else {
struct sched_domain *sd;
__schedstat_inc(p->se.statistics.nr_wakeups_remote);
rcu_read_lock();
for_each_domain(rq->cpu, sd) {
if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
__schedstat_inc(sd->ttwu_wake_remote);
break;
}
}
rcu_read_unlock();
}
if (wake_flags & WF_MIGRATED)
__schedstat_inc(p->se.statistics.nr_wakeups_migrate);
#endif /* CONFIG_SMP */
__schedstat_inc(rq->ttwu_count);
__schedstat_inc(p->se.statistics.nr_wakeups);
if (wake_flags & WF_SYNC)
__schedstat_inc(p->se.statistics.nr_wakeups_sync);
}
/*
* Mark the task runnable and perform wakeup-preemption.
*/
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Our task @p is fully woken up and running; so it's safe to
* drop the rq->lock, hereafter rq is only used for statistics.
*/
rq_unpin_lock(rq, rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, rf);
}
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
u64 max = 2*rq->max_idle_balance_cost;
update_avg(&rq->avg_idle, delta);
if (rq->avg_idle > max)
rq->avg_idle = max;
rq->idle_stamp = 0;
}
#endif
}
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
struct rq_flags *rf)
{
int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
lockdep_assert_held(&rq->lock);
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
#ifdef CONFIG_SMP
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
activate_task(rq, p, en_flags);
ttwu_do_wakeup(rq, p, wake_flags, rf);
}
/*
* Consider @p being inside a wait loop:
*
* for (;;) {
* set_current_state(TASK_UNINTERRUPTIBLE);
*
* if (CONDITION)
* break;
*
* schedule();
* }
* __set_current_state(TASK_RUNNING);
*
* between set_current_state() and schedule(). In this case @p is still
* runnable, so all that needs doing is change p->state back to TASK_RUNNING in
* an atomic manner.
*
* By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
* then schedule() must still happen and p->state can be changed to
* TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
* need to do a full wakeup with enqueue.
*
* Returns: %true when the wakeup is done,
* %false otherwise.
*/
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
struct rq_flags rf;
struct rq *rq;
int ret = 0;
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
/* check_preempt_curr() may use rq clock */
update_rq_clock(rq);
ttwu_do_wakeup(rq, p, wake_flags, &rf);
ret = 1;
}
__task_rq_unlock(rq, &rf);
return ret;
}
#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
struct llist_node *llist = arg;
struct rq *rq = this_rq();
struct task_struct *p, *t;
struct rq_flags rf;
if (!llist)
return;
/*
* rq::ttwu_pending racy indication of out-standing wakeups.
* Races such that false-negatives are possible, since they
* are shorter lived that false-positives would be.
*/
WRITE_ONCE(rq->ttwu_pending, 0);
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
if (WARN_ON_ONCE(p->on_cpu))
smp_cond_load_acquire(&p->on_cpu, !VAL);
if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
set_task_cpu(p, cpu_of(rq));
ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
}
rq_unlock_irqrestore(rq, &rf);
}
void send_call_function_single_ipi(int cpu)
{
struct rq *rq = cpu_rq(cpu);
if (!set_nr_if_polling(rq->idle))
arch_send_call_function_single_ipi(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
}
/*
* Queue a task on the target CPUs wake_list and wake the CPU via IPI if
* necessary. The wakee CPU on receipt of the IPI will queue the task
* via sched_ttwu_wakeup() for activation so the wakee incurs the cost
* of the wakeup instead of the waker.
*/
static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
WRITE_ONCE(rq->ttwu_pending, 1);
__smp_call_single_queue(cpu, &p->wake_entry.llist);
}
void wake_up_if_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
rcu_read_lock();
if (!is_idle_task(rcu_dereference(rq->curr)))
goto out;
if (set_nr_if_polling(rq->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
} else {
rq_lock_irqsave(rq, &rf);
if (is_idle_task(rq->curr))
smp_send_reschedule(cpu);
/* Else CPU is not idle, do nothing here: */
rq_unlock_irqrestore(rq, &rf);
}
out:
rcu_read_unlock();
}
bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
static inline bool ttwu_queue_cond(int cpu, int wake_flags)
{
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
*/
if (!cpu_active(cpu))
return false;
/*
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
*/
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
/*
* If the task is descheduling and the only running task on the
* CPU then use the wakelist to offload the task activation to
* the soon-to-be-idle CPU as the current CPU is likely busy.
* nr_running is checked to avoid unnecessary task stacking.
*/
if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
return true;
return false;
}
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
if (WARN_ON_ONCE(cpu == smp_processor_id()))
return false;
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
}
return false;
}
#else /* !CONFIG_SMP */
static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
return false;
}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
if (ttwu_queue_wakelist(p, cpu, wake_flags))
return;
rq_lock(rq, &rf);
update_rq_clock(rq);
ttwu_do_activate(rq, p, wake_flags, &rf);
rq_unlock(rq, &rf);
}
/*
* Notes on Program-Order guarantees on SMP systems.
*
* MIGRATION
*
* The basic program-order guarantee on SMP systems is that when a task [t]
* migrates, all its activity on its old CPU [c0] happens-before any subsequent
* execution on its new CPU [c1].
*
* For migration (of runnable tasks) this is provided by the following means:
*
* A) UNLOCK of the rq(c0)->lock scheduling out task t
* B) migration for t is required to synchronize *both* rq(c0)->lock and
* rq(c1)->lock (if not at the same time, then in that order).
* C) LOCK of the rq(c1)->lock scheduling in task
*
* Release/acquire chaining guarantees that B happens after A and C after B.
* Note: the CPU doing B need not be c0 or c1
*
* Example:
*
* CPU0 CPU1 CPU2
*
* LOCK rq(0)->lock
* sched-out X
* sched-in Y
* UNLOCK rq(0)->lock
*
* LOCK rq(0)->lock // orders against CPU0
* dequeue X
* UNLOCK rq(0)->lock
*
* LOCK rq(1)->lock
* enqueue X
* UNLOCK rq(1)->lock
*
* LOCK rq(1)->lock // orders against CPU2
* sched-out Z
* sched-in X
* UNLOCK rq(1)->lock
*
*
* BLOCKING -- aka. SLEEP + WAKEUP
*
* For blocking we (obviously) need to provide the same guarantee as for
* migration. However the means are completely different as there is no lock
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0) -- finish_task()
* 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
*
* Example:
*
* CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
*
* LOCK rq(0)->lock LOCK X->pi_lock
* dequeue X
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
* smp_cond_load_acquire(&X->on_cpu, !VAL);
* X->state = WAKING
* set_task_cpu(X,2)
*
* LOCK rq(2)->lock
* enqueue X
* X->state = RUNNING
* UNLOCK rq(2)->lock
*
* LOCK rq(2)->lock // orders against CPU1
* sched-out Z
* sched-in X
* UNLOCK rq(2)->lock
*
* UNLOCK X->pi_lock
* UNLOCK rq(0)->lock
*
*
* However, for wakeups there is a second guarantee we must provide, namely we
* must ensure that CONDITION=1 done by the caller can not be reordered with
* accesses to the task state; see try_to_wake_up() and set_current_state().
*/
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
* Conceptually does:
*
* If (@state & @p->state) @p->state = TASK_RUNNING.
*
* If the task was not queued/runnable, also place it back on a runqueue.
*
* This function is atomic against schedule() which would dequeue the task.
*
* It issues a full memory barrier before accessing @p->state, see the comment
* with set_current_state().
*
* Uses p->pi_lock to serialize against concurrent wake-ups.
*
* Relies on p->pi_lock stabilizing:
* - p->sched_class
* - p->cpus_ptr
* - p->sched_task_group
* in order to do migration, see its use of select_task_rq()/set_task_cpu().
*
* Tries really hard to only take one task_rq(p)->lock for performance.
* Takes rq->lock in:
* - ttwu_runnable() -- old rq, unavoidable, see comment there;
* - ttwu_queue() -- new rq, for enqueue of the task;
* - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
*
* As a consequence we race really badly with just about everything. See the
* many memory barriers and their comments for details.
*
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
* case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
if (!(p->state & state))
goto out;
success = 1;
trace_sched_waking(p);
p->state = TASK_RUNNING;
trace_sched_wakeup(p);
goto out;
}
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
* reordered with p->state check below. This pairs with smp_store_mb()
* in set_current_state() that the waiting thread does.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
if (!(p->state & state))
goto unlock;
trace_sched_waking(p);
/* We're going to change ->state: */
success = 1;
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
* be possible to, falsely, observe p->on_rq == 0 and get stuck
* in smp_cond_load_acquire() below.
*
* sched_ttwu_pending() try_to_wake_up()
* STORE p->on_rq = 1 LOAD p->state
* UNLOCK rq->lock
*
* __schedule() (switch to task 'p')
* LOCK rq->lock smp_rmb();
* smp_mb__after_spinlock();
* UNLOCK rq->lock
*
* [task p]
* STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* A similar smb_rmb() lives in try_invoke_on_locked_down_task().
*/
smp_rmb();
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
goto unlock;
#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
*
* One must be running (->on_cpu == 1) in order to remove oneself
* from the runqueue.
*
* __schedule() (switch to task 'p') try_to_wake_up()
* STORE p->on_cpu = 1 LOAD p->on_rq
* UNLOCK rq->lock
*
* __schedule() (put 'p' to sleep)
* LOCK rq->lock smp_rmb();
* smp_mb__after_spinlock();
* STORE p->on_rq = 0 LOAD p->on_cpu
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
* schedule()'s deactivate_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
/*
* We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
* == 0), which means we need to do an enqueue, change p->state to
* TASK_WAKING such that we can unlock p->pi_lock before doing the
* enqueue, such as ttwu_queue_wakelist().
*/
p->state = TASK_WAKING;
/*
* If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, considering queueing p on the remote CPUs wake_list
* which potentially sends an IPI instead of spinning on p->on_cpu to
* let the waker make forward progress. This is safe because IRQs are
* disabled and the IPI will deliver after on_cpu is cleared.
*
* Ensure we load task_cpu(p) after p->on_cpu:
*
* set_task_cpu(p, cpu);
* STORE p->cpu = @cpu
* __schedule() (switch to task 'p')
* LOCK rq->lock
* smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
* STORE p->on_cpu = 1 LOAD p->cpu
*
* to ensure we observe the correct CPU on which the task is currently
* scheduling.
*/
if (smp_load_acquire(&p->on_cpu) &&
ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
goto unlock;
/*
* If the owning (remote) CPU is still in the middle of schedule() with</