Merge branch 'sched/core' into core/mm

Pull the migrate disable mechanics which is a prerequisite for preemptible
kmap_local().
Thomas Gleixner 2020-11-24 11:26:11 +01:00
commit 13c8da5db4
35 changed files with 1676 additions and 464 deletions


@@ -65,21 +65,17 @@ of the SMP domain will span the entire machine, with each group having the
 cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
 might have just one domain covering its one NUMA level.
 
-The implementor should read comments in include/linux/sched.h:
-struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
-the specifics and what to tune.
+The implementor should read comments in include/linux/sched/sd_flags.h:
+SD_* to get an idea of the specifics and what to tune for the SD flags
+of a sched_domain.
 
-Architectures may retain the regular override the default SD_*_INIT flags
-while using the generic domain builder in kernel/sched/core.c if they wish to
-retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
-can be done by #define'ing ARCH_HASH_SCHED_TUNE.
-
-Alternatively, the architecture may completely override the generic domain
-builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
-arch_init_sched_domains function. This function will attach domains to all
-CPUs using cpu_attach_domain.
+Architectures may override the generic domain builder and the default SD flags
+for a given topology level by creating a sched_domain_topology_level array and
+calling set_sched_topology() with this array as the parameter.
 
 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
-which should catch most possible errors (described above). It also prints out
-the domain structure in a visual format.
+CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
+tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
+knob. This enables an error checking parse of the sched domains which should
+catch most possible errors (described above). It also prints out the domain
+structure in a visual format.
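For illustration only (not part of this merge): a topology table as described above might look like the sketch below. The mask/flag helpers and SD_INIT_NAME() mirror the default table in kernel/sched/topology.c; the "example_" names are assumptions.

static struct sched_domain_topology_level example_topology[] = {
#ifdef CONFIG_SCHED_SMT
        /* Hyperthread siblings sharing a core */
        { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
        /* Cores sharing a last-level cache */
        { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
        /* All CPUs of the package/node */
        { cpu_cpu_mask, SD_INIT_NAME(DIE) },
        { NULL, },
};

/* Called from early architecture setup code (name is hypothetical). */
static void __init example_set_topology(void)
{
        set_sched_topology(example_topology);
}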


@@ -213,6 +213,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
 static int __init init_amu_fie(void)
 {
+        bool invariance_status = topology_scale_freq_invariant();
         cpumask_var_t valid_cpus;
         bool have_policy = false;
         int ret = 0;
@@ -255,6 +256,15 @@ static int __init init_amu_fie(void)
         if (!topology_scale_freq_invariant())
                 static_branch_disable(&amu_fie_key);
 
+        /*
+         * Task scheduler behavior depends on frequency invariance support,
+         * either cpufreq or counter driven. If the support status changes as
+         * a result of counter initialisation and use, retrigger the build of
+         * scheduling domains to ensure the information is propagated properly.
+         */
+        if (invariance_status != topology_scale_freq_invariant())
+                rebuild_sched_domains_energy();
+
 free_valid_mask:
         free_cpumask_var(valid_cpus);


@@ -382,9 +382,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
 static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
 {
         seq_printf(m, "Cpus_allowed:\t%*pb\n",
-                   cpumask_pr_args(task->cpus_ptr));
+                   cpumask_pr_args(&task->cpus_mask));
         seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
-                   cpumask_pr_args(task->cpus_ptr));
+                   cpumask_pr_args(&task->cpus_mask));
 }
 
 static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)


@@ -152,6 +152,7 @@ enum cpuhp_state {
         CPUHP_AP_ONLINE,
         CPUHP_TEARDOWN_CPU,
         CPUHP_AP_ONLINE_IDLE,
+        CPUHP_AP_SCHED_WAIT_EMPTY,
         CPUHP_AP_SMPBOOT_THREADS,
         CPUHP_AP_X86_VDSO_VMA_ONLINE,
         CPUHP_AP_IRQ_AFFINITY_ONLINE,


@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
         return cpumask_next_and(-1, src1p, src2p);
 }
 
+static inline int cpumask_any_distribute(const struct cpumask *srcp)
+{
+        return cpumask_first(srcp);
+}
+
 #define for_each_cpu(cpu, mask)                 \
         for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)             \
@@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
                                const struct cpumask *src2p);
+int cpumask_any_distribute(const struct cpumask *srcp);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
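The stub above is the UP (!CONFIG_SMP) variant. As a rough sketch of the idea behind the SMP counterpart this series adds, under the assumption that it mirrors cpumask_any_and_distribute(): a per-CPU cursor is advanced on each call, so repeated picks are spread across the mask instead of always returning the first set bit. The cursor and function names here are illustrative, not the in-tree ones.

static DEFINE_PER_CPU(int, example_distribute_prev);

int example_any_distribute(const struct cpumask *srcp)
{
        int prev, next;

        /* Start searching after the CPU picked last time on this CPU. */
        prev = __this_cpu_read(example_distribute_prev);
        next = cpumask_next(prev, srcp);
        if (next >= nr_cpu_ids)
                next = cpumask_first(srcp);     /* wrap around */

        if (next < nr_cpu_ids)
                __this_cpu_write(example_distribute_prev, next);

        return next;
}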


@@ -204,6 +204,7 @@ extern int _cond_resched(void);
 extern void ___might_sleep(const char *file, int line, int preempt_offset);
 extern void __might_sleep(const char *file, int line, int preempt_offset);
 extern void __cant_sleep(const char *file, int line, int preempt_offset);
+extern void __cant_migrate(const char *file, int line);
 
 /**
  * might_sleep - annotation for functions that can sleep
@@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
 # define cant_sleep() \
         do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
 # define sched_annotate_sleep() (current->task_state_change = 0)
+
+/**
+ * cant_migrate - annotation for functions that cannot migrate
+ *
+ * Will print a stack trace if executed in code which is migratable
+ */
+# define cant_migrate()                                                 \
+        do {                                                            \
+                if (IS_ENABLED(CONFIG_SMP))                             \
+                        __cant_migrate(__FILE__, __LINE__);             \
+        } while (0)
+
 /**
  * non_block_start - annotate the start of section where sleeping is prohibited
  *
@@ -251,6 +264,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
                                    int preempt_offset) { }
 # define might_sleep() do { might_resched(); } while (0)
 # define cant_sleep() do { } while (0)
+# define cant_migrate() do { } while (0)
 # define sched_annotate_sleep() do { } while (0)
 # define non_block_start() do { } while (0)
 # define non_block_end() do { } while (0)
@@ -258,13 +272,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
 
 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
 
-#ifndef CONFIG_PREEMPT_RT
-# define cant_migrate()         cant_sleep()
-#else
-  /* Placeholder for now */
-# define cant_migrate()         do { } while (0)
-#endif
-
 /**
  * abs - return absolute value of an argument
  * @x: the value.  If it is unsigned type, it is converted to signed type first.


@@ -322,34 +322,71 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 #endif
 
-/**
- * migrate_disable - Prevent migration of the current task
- *
- * Maps to preempt_disable() which also disables preemption. Use
- * migrate_disable() to annotate that the intent is to prevent migration,
- * but not necessarily preemption.
- *
- * Can be invoked nested like preempt_disable() and needs the corresponding
- * number of migrate_enable() invocations.
- */
-static __always_inline void migrate_disable(void)
-{
-        preempt_disable();
-}
-
-/**
- * migrate_enable - Allow migration of the current task
- *
- * Counterpart to migrate_disable().
- *
- * As migrate_disable() can be invoked nested, only the outermost invocation
- * reenables migration.
- *
- * Currently mapped to preempt_enable().
- */
-static __always_inline void migrate_enable(void)
-{
-        preempt_enable();
-}
+#ifdef CONFIG_SMP
+
+/*
+ * Migrate-Disable and why it is undesired.
+ *
+ * When a preempted task becomes eligible to run under the ideal model (IOW it
+ * becomes one of the M highest priority tasks), it might still have to wait
+ * for the preemptee's migrate_disable() section to complete. Thereby suffering
+ * a reduction in bandwidth in the exact duration of the migrate_disable()
+ * section.
+ *
+ * Per this argument, the change from preempt_disable() to migrate_disable()
+ * gets us:
+ *
+ * - a higher priority task gains reduced wake-up latency; with preempt_disable()
+ *   it would have had to wait for the lower priority task.
+ *
+ * - a lower priority task, which under preempt_disable() could've instantly
+ *   migrated away when another CPU becomes available, is now constrained
+ *   by the ability to push the higher priority task away, which might itself be
+ *   in a migrate_disable() section, reducing its available bandwidth.
+ *
+ * IOW it trades latency / moves the interference term, but it stays in the
+ * system, and as long as it remains unbounded, the system is not fully
+ * deterministic.
+ *
+ *
+ * The reason we have it anyway.
+ *
+ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
+ * number of primitives into becoming preemptible, they would also allow
+ * migration. This turns out to break a bunch of per-cpu usage. To this end,
+ * all these primitives employ migrate_disable() to restore this implicit
+ * assumption.
+ *
+ * This is a 'temporary' work-around at best. The correct solution is getting
+ * rid of the above assumptions and reworking the code to employ explicit
+ * per-cpu locking or short preempt-disable regions.
+ *
+ * The end goal must be to get rid of migrate_disable(), alternatively we need
+ * a schedulability theory that does not depend on arbitrary migration.
+ *
+ *
+ * Notes on the implementation.
+ *
+ * The implementation is particularly tricky since existing code patterns
+ * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
+ * This means that it cannot use cpus_read_lock() to serialize against hotplug,
+ * nor can it easily migrate itself into a pending affinity mask change on
+ * migrate_enable().
+ *
+ *
+ * Note: even non-work-conserving schedulers like semi-partitioned depend on
+ *       migration, so migrate_disable() is not only a problem for
+ *       work-conserving schedulers.
+ */
+extern void migrate_disable(void);
+extern void migrate_enable(void);
+
+#else
+
+static inline void migrate_disable(void) { }
+static inline void migrate_enable(void) { }
+
+#endif /* CONFIG_SMP */
 
 #endif /* __LINUX_PREEMPT_H */
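As a usage sketch (not taken from this diff): the primitives are meant for code that only needs the task to stay on its CPU, not to stay non-preemptible, e.g. manipulating per-CPU state; cant_migrate() from the kernel.h hunk above is the matching assertion. The helper and variable names below are hypothetical.

static DEFINE_PER_CPU(u64, example_events);

static void example_account_event(void)
{
        cant_migrate();                 /* caller must have migration disabled */
        __this_cpu_inc(example_events);
}

static void example_event(void)
{
        migrate_disable();              /* nests like preempt_disable() */
        example_account_event();
        migrate_enable();
}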


@@ -714,6 +714,11 @@ struct task_struct {
         int                             nr_cpus_allowed;
         const cpumask_t                 *cpus_ptr;
         cpumask_t                       cpus_mask;
+        void                            *migration_pending;
+#ifdef CONFIG_SMP
+        unsigned short                  migration_disabled;
+#endif
+        unsigned short                  migration_flags;
 
 #ifdef CONFIG_PREEMPT_RCU
         int                             rcu_read_lock_nesting;


@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
 extern int sched_cpu_deactivate(unsigned int cpu);
 
 #ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_wait_empty(unsigned int cpu);
 extern int sched_cpu_dying(unsigned int cpu);
 #else
+# define sched_cpu_wait_empty   NULL
 # define sched_cpu_dying        NULL
 #endif


@@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 extern void membarrier_exec_mmap(struct mm_struct *mm);
 
+extern void membarrier_update_current_mm(struct mm_struct *next_mm);
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
 }
+static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */


@@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 
 #endif  /* !CONFIG_SMP */
 
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+extern void rebuild_sched_domains_energy(void);
+#else
+static inline void rebuild_sched_domains_energy(void)
+{
+}
+#endif
+
 #ifndef arch_scale_cpu_capacity
 /**
  * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.


@@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
 struct cpu_stop_work {
         struct list_head        list;           /* cpu_stopper->works */
         cpu_stop_fn_t           fn;
+        unsigned long           caller;
         void                    *arg;
         struct cpu_stop_done    *done;
 };
@@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
 void stop_machine_unpark(int cpu);
 void stop_machine_yield(const struct cpumask *cpumask);
 
+extern void print_stop_info(const char *log_lvl, struct task_struct *task);
+
 #else   /* CONFIG_SMP */
 
 #include <linux/workqueue.h>
@@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
         return false;
 }
 
+static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
+
 #endif  /* CONFIG_SMP */
 
 /*
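For context (a sketch, not part of this change): the scheduler hunks later in this merge queue stopper work through stop_one_cpu_nowait() with a pre-allocated cpu_stop_work, which is why the structure grows a caller field for print_stop_info() to report. A minimal caller of that API looks roughly like this; the "example_" names are hypothetical.

static DEFINE_PER_CPU(struct cpu_stop_work, example_stop_work);

static int example_stop_fn(void *arg)
{
        /* Runs in the stopper thread of the target CPU. */
        return 0;
}

static void example_kick_cpu(unsigned int cpu)
{
        /* Non-blocking: the work buffer must outlive the request. */
        stop_one_cpu_nowait(cpu, example_stop_fn, NULL,
                            &per_cpu(example_stop_work, cpu));
}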


@@ -96,6 +96,8 @@ struct sched_param {
  * on a CPU with a capacity big enough to fit the specified value.
  * A task with a max utilization value smaller than 1024 is more likely
  * scheduled on a CPU with no more capacity than the specified value.
+ *
+ * A task utilization boundary can be reset by setting the attribute to -1.
  */
 struct sched_attr {
         __u32 size;
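A hedged userspace sketch of the new reset semantics (header names and the raw syscall wrapper are assumptions, since glibc provides no sched_setattr() wrapper): writing -1 into a clamp field together with the matching SCHED_FLAG_UTIL_CLAMP_* flag drops the per-task boundary back to the system default.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/sched.h>        /* SCHED_FLAG_* (assumed header) */
#include <linux/sched/types.h>  /* struct sched_attr (assumed header) */

static int reset_util_min(pid_t pid)
{
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.sched_flags = SCHED_FLAG_KEEP_ALL | SCHED_FLAG_UTIL_CLAMP_MIN;
        attr.sched_util_min = -1;       /* reset: fall back to system default */

        return syscall(SYS_sched_setattr, pid, &attr, 0);
}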


@@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
  */
 static void rebuild_sched_domains_locked(void)
 {
+        struct cgroup_subsys_state *pos_css;
         struct sched_domain_attr *attr;
         cpumask_var_t *doms;
+        struct cpuset *cs;
         int ndoms;
 
         lockdep_assert_cpus_held();
         percpu_rwsem_assert_held(&cpuset_rwsem);
 
         /*
-         * We have raced with CPU hotplug. Don't do anything to avoid
+         * If we have raced with CPU hotplug, return early to avoid
          * passing doms with offlined cpu to partition_sched_domains().
-         * Anyways, hotplug work item will rebuild sched domains.
+         * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+         *
+         * With no CPUs in any subpartitions, top_cpuset's effective CPUs
+         * should be the same as the active CPUs, so checking only top_cpuset
+         * is enough to detect racing CPU offlines.
          */
         if (!top_cpuset.nr_subparts_cpus &&
             !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
                 return;
 
-        if (top_cpuset.nr_subparts_cpus &&
-            !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
-                return;
+        /*
+         * With subpartition CPUs, however, the effective CPUs of a partition
+         * root should be only a subset of the active CPUs. Since a CPU in any
+         * partition root could be offlined, all must be checked.
+         */
+        if (top_cpuset.nr_subparts_cpus) {
+                rcu_read_lock();
+                cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+                        if (!is_partition_root(cs)) {
+                                pos_css = css_rightmost_descendant(pos_css);
+                                continue;
+                        }
+
+                        if (!cpumask_subset(cs->effective_cpus,
+                                            cpu_active_mask)) {
+                                rcu_read_unlock();
+                                return;
+                        }
+                }
+                rcu_read_unlock();
+        }
 
         /* Generate domain masks and attrs */
         ndoms = generate_sched_domains(&doms, &attr);


@@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
                 .name                   = "ap:online",
         },
         /*
-         * Handled on controll processor until the plugged processor manages
+         * Handled on control processor until the plugged processor manages
          * this itself.
          */
         [CPUHP_TEARDOWN_CPU] = {
@@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
                 .teardown.single        = takedown_cpu,
                 .cant_stop              = true,
         },
+
+        [CPUHP_AP_SCHED_WAIT_EMPTY] = {
+                .name                   = "sched:waitempty",
+                .startup.single         = NULL,
+                .teardown.single        = sched_cpu_wait_empty,
+        },
+
         /* Handle smpboot threads park/unpark */
         [CPUHP_AP_SMPBOOT_THREADS] = {
                 .name                   = "smpboot/threads:online",


@@ -475,10 +475,24 @@ static void exit_mm(void)
         BUG_ON(mm != current->active_mm);
         /* more a memory barrier than a real lock */
         task_lock(current);
+        /*
+         * When a thread stops operating on an address space, the loop
+         * in membarrier_private_expedited() may not observe that
+         * tsk->mm, and the loop in membarrier_global_expedited() may
+         * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
+         * rq->membarrier_state, so those would not issue an IPI.
+         * Membarrier requires a memory barrier after accessing
+         * user-space memory, before clearing tsk->mm or the
+         * rq->membarrier_state.
+         */
+        smp_mb__after_spinlock();
+        local_irq_disable();
         current->mm = NULL;
-        mmap_read_unlock(mm);
+        membarrier_update_current_mm(NULL);
         enter_lazy_tlb(mm, current);
+        local_irq_enable();
         task_unlock(current);
+        mmap_read_unlock(mm);
         mm_update_next_owner(mm);
         mmput(mm);
         if (test_thread_flag(TIF_MEMDIE))


@@ -1248,6 +1248,7 @@ void kthread_use_mm(struct mm_struct *mm)
                 tsk->active_mm = mm;
         }
         tsk->mm = mm;
+        membarrier_update_current_mm(mm);
         switch_mm_irqs_off(active_mm, mm, tsk);
         local_irq_enable();
         task_unlock(tsk);
@@ -1255,8 +1256,19 @@ void kthread_use_mm(struct mm_struct *mm)
         finish_arch_post_lock_switch();
 #endif
 
+        /*
+         * When a kthread starts operating on an address space, the loop
+         * in membarrier_{private,global}_expedited() may not observe
+         * that tsk->mm, and not issue an IPI. Membarrier requires a
+         * memory barrier after storing to tsk->mm, before accessing
+         * user-space memory. A full memory barrier for membarrier
+         * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
+         * mmdrop(), or explicitly with smp_mb().
+         */
         if (active_mm != mm)
                 mmdrop(active_mm);
+        else
+                smp_mb();
 
         to_kthread(tsk)->oldfs = force_uaccess_begin();
 }
@@ -1276,9 +1288,18 @@ void kthread_unuse_mm(struct mm_struct *mm)
         force_uaccess_end(to_kthread(tsk)->oldfs);
 
         task_lock(tsk);
+        /*
+         * When a kthread stops operating on an address space, the loop
+         * in membarrier_{private,global}_expedited() may not observe
+         * that tsk->mm, and not issue an IPI. Membarrier requires a
+         * memory barrier after accessing user-space memory, before
+         * clearing tsk->mm.
+         */
+        smp_mb__after_spinlock();
         sync_mm_rss(mm);
         local_irq_disable();
         tsk->mm = NULL;
+        membarrier_update_current_mm(NULL);
         /* active_mm is still 'mm' */
         enter_lazy_tlb(mm, tsk);
         local_irq_enable();

File diff suppressed because it is too large.


@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
         const struct sched_dl_entity *dl_se = &p->dl;
 
         if (later_mask &&
-            cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
+            cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
                 unsigned long cap, max_cap = 0;
                 int cpu, max_cpu = -1;
 
@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
                 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
 
-                if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
+                if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
                     dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
                         if (later_mask)
                                 cpumask_set_cpu(best_cpu, later_mask);


@@ -901,16 +901,9 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 cpufreq_governor_init(schedutil_gov);
 
 #ifdef CONFIG_ENERGY_MODEL
-extern bool sched_energy_update;
-extern struct mutex sched_energy_mutex;
-
 static void rebuild_sd_workfn(struct work_struct *work)
 {
-        mutex_lock(&sched_energy_mutex);
-        sched_energy_update = true;
-        rebuild_sched_domains();
-        sched_energy_update = false;
-        mutex_unlock(&sched_energy_mutex);
+        rebuild_sched_domains_energy();
 }
 static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);


@@ -11,7 +11,7 @@
  * This code tracks the priority of each CPU so that global migration
  * decisions are easy to calculate. Each CPU can be in a state as follows:
  *
- *                 (INVALID), IDLE, NORMAL, RT1, ... RT99
+ *                 (INVALID), NORMAL, RT1, ... RT99, HIGHER
  *
  * going from the lowest priority to the highest. CPUs in the INVALID state
  * are not eligible for routing. The system maintains this state with
@@ -19,24 +19,48 @@
  * in that class). Therefore a typical application without affinity
  * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
  * searches). For tasks with affinity restrictions, the algorithm has a
- * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that
  * yields the worst case search is fairly contrived.
  */
 #include "sched.h"
 
-/* Convert between a 140 based task->prio, and our 102 based cpupri */
+/*
+ *  p->rt_priority   p->prio   newpri   cpupri
+ *
+ *                                -1       -1 (CPUPRI_INVALID)
+ *
+ *                      99                 0 (CPUPRI_NORMAL)
+ *
+ *              1        98       98        1
+ *            ...
+ *             49        50       50       49
+ *             50        49       49       50
+ *            ...
+ *             99         0        0       99
+ *
+ *                               100      100 (CPUPRI_HIGHER)
+ */
 static int convert_prio(int prio)
 {
         int cpupri;
 
-        if (prio == CPUPRI_INVALID)
-                cpupri = CPUPRI_INVALID;
-        else if (prio == MAX_PRIO)
-                cpupri = CPUPRI_IDLE;
-        else if (prio >= MAX_RT_PRIO)
-                cpupri = CPUPRI_NORMAL;
-        else
-                cpupri = MAX_RT_PRIO - prio + 1;
+        switch (prio) {
+        case CPUPRI_INVALID:
+                cpupri = CPUPRI_INVALID;        /* -1 */
+                break;
+
+        case 0 ... 98:
+                cpupri = MAX_RT_PRIO-1 - prio;  /* 1 ... 99 */
+                break;
+
+        case MAX_RT_PRIO-1:
+                cpupri = CPUPRI_NORMAL;         /*  0 */
+                break;
+
+        case MAX_RT_PRIO:
+                cpupri = CPUPRI_HIGHER;         /* 100 */
+                break;
+        }
 
         return cpupri;
 }
@@ -73,11 +97,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
         if (skip)
                 return 0;
 
-        if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
+        if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
                 return 0;
 
         if (lowest_mask) {
-                cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
+                cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
 
                 /*
                  * We have to ensure that we have at least one bit
@@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
  * cpupri_set - update the CPU priority setting
  * @cp: The cpupri context
  * @cpu: The target CPU
- * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
  *
  * Note: Assumes cpu_rq(cpu)->lock is locked
 *


@@ -1,11 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
-#define CPUPRI_NR_PRIORITIES    (MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRIORITIES    (MAX_RT_PRIO+1)
 
 #define CPUPRI_INVALID          -1
-#define CPUPRI_IDLE             0
-#define CPUPRI_NORMAL           1
-/* values 2-101 are RT priorities 0-99 */
+#define CPUPRI_NORMAL           0
+/* values 1-99 are for RT1-RT99 priorities */
+#define CPUPRI_HIGHER           100
 
 struct cpupri_vec {
         atomic_t                count;


@@ -97,6 +97,17 @@ static inline unsigned long dl_bw_capacity(int i)
                 return __dl_bw_capacity(i);
         }
 }
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+        struct root_domain *rd = cpu_rq(cpu)->rd;
+
+        if (rd->visit_gen == gen)
+                return true;
+
+        rd->visit_gen = gen;
+        return false;
+}
 #else
 static inline struct dl_bw *dl_bw_of(int i)
 {
@@ -112,6 +123,11 @@ static inline unsigned long dl_bw_capacity(int i)
 {
         return SCHED_CAPACITY_SCALE;
 }
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+        return false;
+}
 #endif
 
 static inline
@@ -543,7 +559,7 @@ static int push_dl_task(struct rq *rq);
 static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
 {
-        return dl_task(prev);
+        return rq->online && dl_task(prev);
 }
 
 static DEFINE_PER_CPU(struct callback_head, dl_push_head);
@@ -1378,6 +1394,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
         if (dl_rq->earliest_dl.curr == 0 ||
             dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
+                if (dl_rq->earliest_dl.curr == 0)
+                        cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
                 dl_rq->earliest_dl.curr = deadline;
                 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
         }
@@ -1395,6 +1413,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
                 dl_rq->earliest_dl.curr = 0;
                 dl_rq->earliest_dl.next = 0;
                 cpudl_clear(&rq->rd->cpudl, rq->cpu);
+                cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
         } else {
                 struct rb_node *leftmost = dl_rq->root.rb_leftmost;
                 struct sched_dl_entity *entry;
@@ -1664,13 +1683,13 @@ static void yield_task_dl(struct rq *rq)
 static int find_later_rq(struct task_struct *task);
 
 static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_dl(struct task_struct *p, int cpu, int flags)
 {
         struct task_struct *curr;
         bool select_rq;
         struct rq *rq;
 
-        if (sd_flag != SD_BALANCE_WAKE)
+        if (!(flags & WF_TTWU))
                 goto out;
 
         rq = cpu_rq(cpu);
@@ -1912,7 +1931,7 @@ static void task_fork_dl(struct task_struct *p)
 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
 {
         if (!task_running(rq, p) &&
-            cpumask_test_cpu(cpu, p->cpus_ptr))
+            cpumask_test_cpu(cpu, &p->cpus_mask))
                 return 1;
         return 0;
 }
@@ -2002,8 +2021,8 @@ static int find_later_rq(struct task_struct *task)
                                 return this_cpu;
                         }
 
-                        best_cpu = cpumask_first_and(later_mask,
-                                                     sched_domain_span(sd));
+                        best_cpu = cpumask_any_and_distribute(later_mask,
+                                                              sched_domain_span(sd));
                         /*
                          * Last chance: if a CPU being in both later_mask
                          * and current sd span is valid, that becomes our
@@ -2025,7 +2044,7 @@ static int find_later_rq(struct task_struct *task)
         if (this_cpu != -1)
                 return this_cpu;
 
-        cpu = cpumask_any(later_mask);
+        cpu = cpumask_any_distribute(later_mask);
         if (cpu < nr_cpu_ids)
                 return cpu;
@@ -2062,7 +2081,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
                 /* Retry if something changed. */
                 if (double_lock_balance(rq, later_rq)) {
                         if (unlikely(task_rq(task) != rq ||
-                                     !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
+                                     !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
                                      task_running(rq, task) ||
                                      !dl_task(task) ||
                                      !task_on_rq_queued(task))) {
@@ -2129,6 +2148,9 @@ static int push_dl_task(struct rq *rq)
                 return 0;
 
 retry:
+        if (is_migration_disabled(next_task))
+                return 0;
+
         if (WARN_ON(next_task == rq->curr))
                 return 0;
@@ -2206,7 +2228,7 @@ static void push_dl_tasks(struct rq *rq)
 static void pull_dl_task(struct rq *this_rq)
 {
         int this_cpu = this_rq->cpu, cpu;
-        struct task_struct *p;
+        struct task_struct *p, *push_task;
         bool resched = false;
         struct rq *src_rq;
         u64 dmin = LONG_MAX;
@@ -2236,6 +2258,7 @@ static void pull_dl_task(struct rq *this_rq)
                         continue;
 
                 /* Might drop this_rq->lock */
+                push_task = NULL;
                 double_lock_balance(this_rq, src_rq);
 
                 /*
@@ -2267,17 +2290,27 @@ static void pull_dl_task(struct rq *this_rq)
                                            src_rq->curr->dl.deadline))
                         goto skip;
 
-                resched = true;
-
-                deactivate_task(src_rq, p, 0);
-                set_task_cpu(p, this_cpu);
-                activate_task(this_rq, p, 0);
-                dmin = p->dl.deadline;
+                if (is_migration_disabled(p)) {
+                        push_task = get_push_task(src_rq);
+                } else {
+                        deactivate_task(src_rq, p, 0);
+                        set_task_cpu(p, this_cpu);
+                        activate_task(this_rq, p, 0);
+                        dmin = p->dl.deadline;
+                        resched = true;
+                }
 
                 /* Is there any other task even earlier? */
                 }
 skip:
                 double_unlock_balance(this_rq, src_rq);
+
+                if (push_task) {
+                        raw_spin_unlock(&this_rq->lock);
+                        stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
+                                            push_task, &src_rq->push_work);
+                        raw_spin_lock(&this_rq->lock);
+                }
         }
 
         if (resched)
@@ -2301,7 +2334,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
 }
 
 static void set_cpus_allowed_dl(struct task_struct *p,
-                                const struct cpumask *new_mask)
+                                const struct cpumask *new_mask,
+                                u32 flags)
 {
         struct root_domain *src_rd;
         struct rq *rq;
@@ -2330,7 +2364,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
                 raw_spin_unlock(&src_dl_b->lock);
         }
 
-        set_cpus_allowed_common(p, new_mask);
+        set_cpus_allowed_common(p, new_mask, flags);
 }
 
 /* Assumes rq->lock is held */
@@ -2503,8 +2537,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
         }
 }
 
-const struct sched_class dl_sched_class
-        __section("__dl_sched_class") = {
+DEFINE_SCHED_CLASS(dl) = {
+
         .enqueue_task           = enqueue_task_dl,
         .dequeue_task           = dequeue_task_dl,
         .yield_task             = yield_task_dl,
@@ -2523,6 +2557,7 @@ const struct sched_class dl_sched_class
         .rq_online              = rq_online_dl,
         .rq_offline             = rq_offline_dl,
         .task_woken             = task_woken_dl,
+        .find_lock_rq           = find_lock_later_rq,
 #endif
 
         .task_tick              = task_tick_dl,
@@ -2535,33 +2570,39 @@ const struct sched_class dl_sched_class
         .update_curr            = update_curr_dl,
 };
 
+/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
+static u64 dl_generation;
+
 int sched_dl_global_validate(void)
 {
         u64 runtime = global_rt_runtime();
         u64 period = global_rt_period();
         u64 new_bw = to_ratio(period, runtime);
+        u64 gen = ++dl_generation;
         struct dl_bw *dl_b;
-        int cpu, ret = 0;
+        int cpu, cpus, ret = 0;
         unsigned long flags;
 
         /*
          * Here we want to check the bandwidth not being set to some
          * value smaller than the currently allocated bandwidth in
          * any of the root_domains.
-         *
-         * FIXME: Cycling on all the CPUs is overdoing, but simpler than
-         * cycling on root_domains... Discussion on different/better
-         * solutions is welcome!
          */
         for_each_possible_cpu(cpu) {
                 rcu_read_lock_sched();
+
+                if (dl_bw_visited(cpu, gen))
+                        goto next;
+
                 dl_b = dl_bw_of(cpu);
+                cpus = dl_bw_cpus(cpu);
+
                 raw_spin_lock_irqsave(&dl_b->lock, flags);
-                if (new_bw < dl_b->total_bw)
+                if (new_bw * cpus < dl_b->total_bw)
                         ret = -EBUSY;
                 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 
+next:
                 rcu_read_unlock_sched();
 
                 if (ret)
@@ -2587,6 +2628,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
 void sched_dl_do_global(void)
 {
         u64 new_bw = -1;
+        u64 gen = ++dl_generation;
         struct dl_bw *dl_b;
         int cpu;
         unsigned long flags;
@@ -2597,11 +2639,14 @@ void sched_dl_do_global(void)
         if (global_rt_runtime() != RUNTIME_INF)
                 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
 
-        /*
-         * FIXME: As above...
-         */
         for_each_possible_cpu(cpu) {
                 rcu_read_lock_sched();
+
+                if (dl_bw_visited(cpu, gen)) {
+                        rcu_read_unlock_sched();
+                        continue;
+                }
+
                 dl_b = dl_bw_of(cpu);
 
                 raw_spin_lock_irqsave(&dl_b->lock, flags);


@@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
         if (!schedstat_enabled())
                 return;
 
+        /*
+         * When sched_schedstat changes from 0 to 1, some sched entities
+         * may already be in the runqueue with se->statistics.wait_start == 0,
+         * which would make the computed delta wrong. We need to avoid this
+         * scenario.
+         */
+        if (unlikely(!schedstat_val(se->statistics.wait_start)))
+                return;
+
         delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
 
         if (entity_is_task(se)) {
@@ -4779,25 +4788,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                 /* throttled entity or throttle-on-deactivate */
                 if (!se->on_rq)
-                        break;
+                        goto done;
 
-                if (dequeue) {
-                        dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
-                } else {
-                        update_load_avg(qcfs_rq, se, 0);
-                        se_update_runnable(se);
-                }
+                dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
 
                 qcfs_rq->h_nr_running -= task_delta;
                 qcfs_rq->idle_h_nr_running -= idle_task_delta;
 
-                if (qcfs_rq->load.weight)
-                        dequeue = 0;
+                if (qcfs_rq->load.weight) {
+                        /* Avoid re-evaluating load for this entity: */
+                        se = parent_entity(se);
+                        break;
+                }
         }
 
-        if (!se)
-                sub_nr_running(rq, task_delta);
+        for_each_sched_entity(se) {
+                struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+                /* throttled entity or throttle-on-deactivate */
+                if (!se->on_rq)
+                        goto done;
 
+                update_load_avg(qcfs_rq, se, 0);
+                se_update_runnable(se);
+
+                qcfs_rq->h_nr_running -= task_delta;
+                qcfs_rq->idle_h_nr_running -= idle_task_delta;
+        }
+
+        /* At this point se is NULL and we are at root level */
+        sub_nr_running(rq, task_delta);
+
+done:
         /*
          * Note: distribution will already see us throttled via the
          * throttled-list. rq->lock protects completion.
@@ -5105,9 +5126,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                 return;
 
         distribute_cfs_runtime(cfs_b);
-
-        raw_spin_lock_irqsave(&cfs_b->lock, flags);
-        raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 }
 
 /*
@@ -5804,6 +5822,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
         if (sync && cpu_rq(this_cpu)->nr_running == 1)
                 return this_cpu;
 
+        if (available_idle_cpu(prev_cpu))
+                return prev_cpu;
+
         return nr_cpumask_bits;
 }
@@ -6663,7 +6684,7 @@ fail:
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
- * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
  *
  * Balances load by selecting the idlest CPU in the idlest group, or under
@@ -6674,15 +6695,17 @@ fail:
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 {
+        int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
         struct sched_domain *tmp, *sd = NULL;
         int cpu = smp_processor_id();
         int new_cpu = prev_cpu;
         int want_affine = 0;
-        int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+        /* SD_flags and WF_flags share the first nibble */
+        int sd_flag = wake_flags & 0xF;
 
-        if (sd_flag & SD_BALANCE_WAKE) {
+        if (wake_flags & WF_TTWU) {
                 record_wakee(p);
 
                 if (sched_energy_enabled()) {
@@ -6719,9 +6742,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
         if (unlikely(sd)) {
                 /* Slow path */
                 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
-        } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+        } else if (wake_flags & WF_TTWU) { /* XXX always ? */
                 /* Fast path */
-
                 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
                 if (want_affine)
@@ -10047,6 +10069,10 @@ static inline int find_new_ilb(void)
         for_each_cpu_and(ilb, nohz.idle_cpus_mask,
                               housekeeping_cpumask(HK_FLAG_MISC)) {
+
+                if (ilb == smp_processor_id())
+                        continue;
+
                 if (idle_cpu(ilb))
                         return ilb;
         }
@@ -11158,8 +11184,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 /*
  * All the scheduling class methods:
  */
-const struct sched_class fair_sched_class
-        __section("__fair_sched_class") = {
+DEFINE_SCHED_CLASS(fair) = {
+
         .enqueue_task           = enqueue_task_fair,
         .dequeue_task           = dequeue_task_fair,
         .yield_task             = yield_task_fair,
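The "share the first nibble" comment above relies on the wake flags being redefined elsewhere in this series so that they alias the SD_BALANCE_* bits; those definitions are not shown in this excerpt, but the intended correspondence is roughly the following (an assumption, included only for orientation):

/*
 *      WF_EXEC  == SD_BALANCE_EXEC
 *      WF_FORK  == SD_BALANCE_FORK
 *      WF_TTWU  == SD_BALANCE_WAKE
 *
 * so "wake_flags & 0xF" recovers the sd_flag that used to be passed in
 * as a separate argument.
 */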


@@ -338,6 +338,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
         WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
         WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
         WARN_ON_ONCE(!duration_ns);
+        WARN_ON_ONCE(current->mm);
 
         rcu_sleep_check();
         preempt_disable();
@@ -375,7 +376,7 @@ void cpu_startup_entry(enum cpuhp_state state)
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int flags)
 {
         return task_cpu(p); /* IDLE tasks as never migrated */
 }
@@ -457,8 +458,8 @@ static void update_curr_idle(struct rq *rq)
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
-const struct sched_class idle_sched_class
-        __section("__idle_sched_class") = {
+DEFINE_SCHED_CLASS(idle) = {
+
         /* no enqueue/yield_task for idle tasks */
 
         /* dequeue is not valid, we print a debug message there: */


@@ -6,6 +6,134 @@
  */
 #include "sched.h"
 
+/*
+ * For documentation purposes, here are some membarrier ordering
+ * scenarios to keep in mind:
+ *
+ * A) Userspace thread execution after IPI vs membarrier's memory
+ *    barrier before sending the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the start of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU0 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU1 after the IPI-induced memory barrier:
+ *
+ *        CPU0                          CPU1
+ *
+ *        x = 1
+ *        membarrier():
+ *          a: smp_mb()
+ *          b: send IPI                 IPI-induced mb
+ *          c: smp_mb()
+ *        r2 = y
+ *                                      y = 1
+ *                                      barrier()
+ *                                      r1 = x
+ *
+ *        BUG_ON(r1 == 0 && r2 == 0)
+ *
+ * The write to y and load from x by CPU1 are unordered by the hardware,
+ * so it's possible to have "r1 = x" reordered before "y = 1" at any
+ * point after (b). If the memory barrier at (a) is omitted, then "x = 1"
+ * can be reordered after (a) (although not after (c)), so we get r1 == 0
+ * and r2 == 0. This violates the guarantee that membarrier() is
+ * supposed to provide.
+ *
+ * The timing of the memory barrier at (a) has to ensure that it executes
+ * before the IPI-induced memory barrier on CPU1.
+ *
+ * B) Userspace thread execution before IPI vs membarrier's memory
+ *    barrier after completing the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the end of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU1 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU0 after the membarrier():
+ *
+ *        CPU0                          CPU1
+ *
+ *                                      x = 1
+ *                                      barrier()
+ *                                      y = 1
+ *        r2 = y
+ *        membarrier():
+ *          a: smp_mb()
+ *          b: send IPI                 IPI-induced mb
+ *          c: smp_mb()
+ *        r1 = x
+ *
+ *        BUG_ON(r1 == 0 && r2 == 1)
+ *
+ * The writes to x and y are unordered by the hardware, so it's possible to
+ * have "r2 = 1" even though the write to x doesn't execute until (b). If
+ * the memory barrier at (c) is omitted then "r1 = x" can be reordered
+ * before (b) (although not before (a)), so we get "r1 = 0". This violates
+ * the guarantee that membarrier() is supposed to provide.
+ *
+ * The timing of the memory barrier at (c) has to ensure that it executes
+ * after the IPI-induced memory barrier on CPU1.
+ *
+ * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
+ *
+ *        CPU0                          CPU1
+ *
+ *        membarrier():
+ *          a: smp_mb()
+ *                                      d: switch to kthread (includes mb)
+ *          b: read rq->curr->mm == NULL
+ *                                      e: switch to user (includes mb)
+ *          c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (e). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ *
+ * D) exit_mm vs membarrier
+ *
+ * Two thread groups are created, A and B. Thread group B is created by
+ * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
+ * Let's assume we have a single thread within each thread group (Thread A
+ * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
+ *
+ *        CPU0                          CPU1
+ *
+ *        membarrier():
+ *          a: smp_mb()
+ *                                      exit_mm():
+ *                                        d: smp_mb()
+ *                                        e: current->mm = NULL
+ *          b: read rq->curr->mm == NULL
+ *          c: smp_mb()
+ *
+ * Using scenario (B), we can show that (c) needs to be paired with (d).
+ *
+ * E) kthread_{use,unuse}_mm vs membarrier
+ *
+ *        CPU0                          CPU1
+ *
+ *        membarrier():
+ *          a: smp_mb()
+ *                                      kthread_unuse_mm()
+ *                                        d: smp_mb()
+ *                                        e: current->mm = NULL
+ *          b: read rq->curr->mm == NULL
+ *                                      kthread_use_mm()
+ *                                        f: current->mm = mm
+ *                                        g: smp_mb()
+ *          c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (g). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ */
+
 /*
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
@@ -76,6 +204,18 @@ void membarrier_exec_mmap(struct mm_struct *mm)
         this_cpu_write(runqueues.membarrier_state, 0);
 }
 
+void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+        struct rq *rq = this_rq();
+        int membarrier_state = 0;
+
+        if (next_mm)
+                membarrier_state = atomic_read(&next_mm->membarrier_state);
+        if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+                return;
+        WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+
 static int membarrier_global_expedited(void)
 {
         int cpu;
@@ -114,12 +254,11 @@ static int membarrier_global_expedited(void)
                         continue;
 
                 /*
-                 * Skip the CPU if it runs a kernel thread. The scheduler
-                 * leaves the prior task mm in place as an optimization when
-                 * scheduling a kthread.
+                 * Skip the CPU if it runs a kernel thread which is not using
+                 * a task mm.
                  */
                 p = rcu_dereference(cpu_rq(cpu)->curr);
-                if (p->flags & PF_KTHREAD)
+                if (!p->mm)
                         continue;
 
                 __cpumask_set_cpu(cpu, tmpmask);
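For orientation (a sketch, not part of this diff): the "CPU0" side in the scenarios above is a thread issuing the membarrier() system call, e.g. the private expedited command, which must first be registered. The thin wrapper below is an assumption; there is no glibc wrapper for this syscall, and the "example_" names are hypothetical.

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
        return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int example_setup(void)
{
        /* Once per process, before relying on the expedited command. */
        return membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);
}

int example_issue_barrier(void)
{
        /* Acts as the a:/c: smp_mb() pair plus IPIs to CPUs running this mm. */
        return membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
}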


@ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq)
__set_bit(MAX_RT_PRIO, array->bitmap); __set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP #if defined CONFIG_SMP
rt_rq->highest_prio.curr = MAX_RT_PRIO; rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->highest_prio.next = MAX_RT_PRIO; rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->rt_nr_migratory = 0; rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0; rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks); plist_head_init(&rt_rq->pushable_tasks);
@ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
rt_rq->highest_prio.curr = MAX_RT_PRIO; rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->rt_nr_boosted = 0; rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq; rt_rq->rq = rq;
rt_rq->tg = tg; rt_rq->tg = tg;
@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{ {
/* Try to pull RT tasks here if we lower this rq's prio */ /* Try to pull RT tasks here if we lower this rq's prio */
return rq->rt.highest_prio.curr > prev->prio; return rq->online && rq->rt.highest_prio.curr > prev->prio;
} }
static inline int rt_overloaded(struct rq *rq) static inline int rt_overloaded(struct rq *rq)
@ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
p = plist_first_entry(&rq->rt.pushable_tasks, p = plist_first_entry(&rq->rt.pushable_tasks,
struct task_struct, pushable_tasks); struct task_struct, pushable_tasks);
rq->rt.highest_prio.next = p->prio; rq->rt.highest_prio.next = p->prio;
} else } else {
rq->rt.highest_prio.next = MAX_RT_PRIO; rq->rt.highest_prio.next = MAX_RT_PRIO-1;
}
} }
#else #else
@ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
sched_find_first_bit(array->bitmap); sched_find_first_bit(array->bitmap);
} }
} else } else {
rt_rq->highest_prio.curr = MAX_RT_PRIO; rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
}
dec_rt_prio_smp(rt_rq, prio, prev_prio); dec_rt_prio_smp(rt_rq, prio, prev_prio);
} }
@ -1428,14 +1430,14 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task); static int find_lowest_rq(struct task_struct *task);
static int static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{ {
struct task_struct *curr; struct task_struct *curr;
struct rq *rq; struct rq *rq;
bool test; bool test;
/* For anything but wake ups, just return the task_cpu */ /* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) if (!(flags & (WF_TTWU | WF_FORK)))
goto out; goto out;
rq = cpu_rq(cpu); rq = cpu_rq(cpu);
@ -1658,7 +1660,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{ {
if (!task_running(rq, p) && if (!task_running(rq, p) &&
cpumask_test_cpu(cpu, p->cpus_ptr)) cpumask_test_cpu(cpu, &p->cpus_mask))
return 1; return 1;
return 0; return 0;
@ -1752,8 +1754,8 @@ static int find_lowest_rq(struct task_struct *task)
return this_cpu; return this_cpu;
} }
best_cpu = cpumask_first_and(lowest_mask, best_cpu = cpumask_any_and_distribute(lowest_mask,
sched_domain_span(sd)); sched_domain_span(sd));
if (best_cpu < nr_cpu_ids) { if (best_cpu < nr_cpu_ids) {
rcu_read_unlock(); rcu_read_unlock();
return best_cpu; return best_cpu;
@ -1770,7 +1772,7 @@ static int find_lowest_rq(struct task_struct *task)
if (this_cpu != -1) if (this_cpu != -1)
return this_cpu; return this_cpu;
cpu = cpumask_any(lowest_mask); cpu = cpumask_any_distribute(lowest_mask);
if (cpu < nr_cpu_ids) if (cpu < nr_cpu_ids)
return cpu; return cpu;
@ -1811,7 +1813,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq. * Also make sure that it wasn't scheduled on its rq.
*/ */
if (unlikely(task_rq(task) != rq || if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
task_running(rq, task) || task_running(rq, task) ||
!rt_task(task) || !rt_task(task) ||
!task_on_rq_queued(task))) { !task_on_rq_queued(task))) {
@ -1859,7 +1861,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
* running task can migrate over to a CPU that is running a task * running task can migrate over to a CPU that is running a task
* of lesser priority. * of lesser priority.
*/ */
static int push_rt_task(struct rq *rq) static int push_rt_task(struct rq *rq, bool pull)
{ {
struct task_struct *next_task; struct task_struct *next_task;
struct rq *lowest_rq; struct rq *lowest_rq;
@ -1873,6 +1875,34 @@ static int push_rt_task(struct rq *rq)
return 0; return 0;
retry: retry:
if (is_migration_disabled(next_task)) {
struct task_struct *push_task = NULL;
int cpu;
if (!pull || rq->push_busy)
return 0;
cpu = find_lowest_rq(rq->curr);
if (cpu == -1 || cpu == rq->cpu)
return 0;
/*
* Given we found a CPU with lower priority than @next_task,
* therefore it should be running. However we cannot migrate it
* to this other CPU, instead attempt to push the current
* running task on this CPU away.
*/
push_task = get_push_task(rq);
if (push_task) {
raw_spin_unlock(&rq->lock);
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
push_task, &rq->push_work);
raw_spin_lock(&rq->lock);
}
return 0;
}
if (WARN_ON(next_task == rq->curr)) if (WARN_ON(next_task == rq->curr))
return 0; return 0;
@ -1927,12 +1957,10 @@ retry:
deactivate_task(rq, next_task, 0); deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu); set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0); activate_task(lowest_rq, next_task, 0);
resched_curr(lowest_rq);
ret = 1; ret = 1;
resched_curr(lowest_rq);
double_unlock_balance(rq, lowest_rq); double_unlock_balance(rq, lowest_rq);
out: out:
put_task_struct(next_task); put_task_struct(next_task);
@ -1942,7 +1970,7 @@ out:
static void push_rt_tasks(struct rq *rq) static void push_rt_tasks(struct rq *rq)
{ {
/* push_rt_task will return true if it moved an RT */ /* push_rt_task will return true if it moved an RT */
while (push_rt_task(rq)) while (push_rt_task(rq, false))
; ;
} }
@ -2095,7 +2123,8 @@ void rto_push_irq_work_func(struct irq_work *work)
*/ */
if (has_pushable_tasks(rq)) { if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock); raw_spin_lock(&rq->lock);
push_rt_tasks(rq); while (push_rt_task(rq, true))
;
raw_spin_unlock(&rq->lock); raw_spin_unlock(&rq->lock);
} }
@ -2120,7 +2149,7 @@ static void pull_rt_task(struct rq *this_rq)
{ {
int this_cpu = this_rq->cpu, cpu; int this_cpu = this_rq->cpu, cpu;
bool resched = false; bool resched = false;
struct task_struct *p; struct task_struct *p, *push_task;
struct rq *src_rq; struct rq *src_rq;
int rt_overload_count = rt_overloaded(this_rq); int rt_overload_count = rt_overloaded(this_rq);
@ -2167,6 +2196,7 @@ static void pull_rt_task(struct rq *this_rq)
* double_lock_balance, and another CPU could * double_lock_balance, and another CPU could
* alter this_rq * alter this_rq
*/ */
push_task = NULL;
double_lock_balance(this_rq, src_rq); double_lock_balance(this_rq, src_rq);
/* /*
@ -2194,11 +2224,14 @@ static void pull_rt_task(struct rq *this_rq)
if (p->prio < src_rq->curr->prio) if (p->prio < src_rq->curr->prio)
goto skip; goto skip;
resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);

if (is_migration_disabled(p)) {
	push_task = get_push_task(src_rq);
} else {
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);
	resched = true;
}
/* /*
* We continue with the search, just in * We continue with the search, just in
* case there's an even higher prio task * case there's an even higher prio task
@ -2208,6 +2241,13 @@ static void pull_rt_task(struct rq *this_rq)
} }
skip: skip:
double_unlock_balance(this_rq, src_rq); double_unlock_balance(this_rq, src_rq);
if (push_task) {
raw_spin_unlock(&this_rq->lock);
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
push_task, &src_rq->push_work);
raw_spin_lock(&this_rq->lock);
}
} }
if (resched) if (resched)
@ -2429,8 +2469,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0; return 0;
} }
const struct sched_class rt_sched_class
	__section("__rt_sched_class") = {
DEFINE_SCHED_CLASS(rt) = {
.enqueue_task = enqueue_task_rt, .enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt, .dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt, .yield_task = yield_task_rt,
@ -2449,6 +2489,7 @@ const struct sched_class rt_sched_class
.rq_offline = rq_offline_rt, .rq_offline = rq_offline_rt,
.task_woken = task_woken_rt, .task_woken = task_woken_rt,
.switched_from = switched_from_rt, .switched_from = switched_from_rt,
.find_lock_rq = find_lock_lowest_rq,
#endif #endif
.task_tick = task_tick_rt, .task_tick = task_tick_rt,

View file

@ -67,7 +67,6 @@
#include <linux/tsacct_kern.h> #include <linux/tsacct_kern.h>
#include <asm/tlb.h> #include <asm/tlb.h>
#include <asm-generic/vmlinux.lds.h>
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h> # include <asm/paravirt.h>
@ -257,30 +256,6 @@ struct rt_bandwidth {
void __dl_clear_params(struct task_struct *p); void __dl_clear_params(struct task_struct *p);
/*
* To keep the bandwidth of -deadline tasks and groups under control
* we need some place where:
* - store the maximum -deadline bandwidth of the system (the group);
* - cache the fraction of that bandwidth that is currently allocated.
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
* that, since here we are only interested in admission control, we
* do not decrease any runtime while the group "executes", neither we
* need a timer to replenish it.
*
* With respect to SMP, the bandwidth is given on a per-CPU basis,
* meaning that:
* - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
* - dl_total_bw array contains, in the i-th element, the currently
* allocated bandwidth on the i-th CPU.
* Moreover, groups consume bandwidth on each CPU, while tasks only
* consume bandwidth on the CPU they're running on.
* Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
* that will be shown the next time the proc or cgroup controls are
* read. In turn, it can be changed by writing to its own control.
*/
struct dl_bandwidth { struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock; raw_spinlock_t dl_runtime_lock;
u64 dl_runtime; u64 dl_runtime;
@ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0; return sysctl_sched_rt_runtime >= 0;
} }
/*
* To keep the bandwidth of -deadline tasks under control
* we need some place where we can:
* - store the maximum -deadline bandwidth of each CPU;
* - cache the fraction of bandwidth that is currently allocated in
* each root domain;
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
* that, since here we are only interested in admission control, we
* do not decrease any runtime while the group "executes", nor do we
* need a timer to replenish it.
*
* With respect to SMP, bandwidth is given on a per root domain basis,
* meaning that:
* - bw (< 100%) is the deadline bandwidth of each CPU;
* - total_bw is the currently allocated bandwidth in each root domain;
*/
struct dl_bw { struct dl_bw {
raw_spinlock_t lock; raw_spinlock_t lock;
u64 bw; u64 bw;
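The dl_bw comment above describes per-root-domain admission control only. As a rough sketch of the kind of test it implies, assuming the dl_bw fields from this header and a bandwidth already converted to the kernel's fixed-point ratio; the helper name is made up, and the real check additionally handles CPU capacity scaling and bandwidth being moved between tasks.

/*
 * @new_bw is runtime/period as a fixed-point fraction (see to_ratio()).
 * Admission succeeds while the total allocated bandwidth of the root
 * domain stays below the per-CPU cap times the number of its CPUs.
 */
static bool example_dl_admission_ok(struct dl_bw *dl_b, u64 new_bw, int cpus)
{
	/* bw == -1 means deadline admission control is disabled. */
	if (dl_b->bw == (u64)-1)
		return true;

	return dl_b->bw * cpus >= dl_b->total_bw + new_bw;
}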
@ -801,6 +794,15 @@ struct root_domain {
struct dl_bw dl_bw; struct dl_bw dl_bw;
struct cpudl cpudl; struct cpudl cpudl;
/*
* Indicates whether a root_domain's dl_bw has been checked or
* updated. It is a monotonically increasing value.
*
* Corner cases such as wrap-around are dangerous in principle, but
* u64 is big enough that this shouldn't be a concern in practice.
*/
u64 visit_gen;
#ifdef HAVE_RT_PUSH_IPI #ifdef HAVE_RT_PUSH_IPI
/* /*
* For IPI pull requests, loop across the rto_mask. * For IPI pull requests, loop across the rto_mask.
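A sketch of how the visit_gen field added above can be used to touch each root domain only once while walking all CPUs. The function names are illustrative, the usual kernel/sched/sched.h helpers are assumed, and serialization of the traversal itself is assumed to be provided by the caller.

static u64 example_dl_generation;

/*
 * Returns true if @cpu's root domain was already handled in this pass,
 * otherwise marks it as handled. Caller must hold rcu_read_lock_sched()
 * so that rq->rd stays stable.
 */
static bool example_rd_visited(int cpu, u64 gen)
{
	struct root_domain *rd = cpu_rq(cpu)->rd;

	if (rd->visit_gen == gen)
		return true;

	rd->visit_gen = gen;
	return false;
}

static void example_walk_root_domains(void)
{
	u64 gen = ++example_dl_generation;
	int cpu;

	for_each_possible_cpu(cpu) {
		rcu_read_lock_sched();

		if (!example_rd_visited(cpu, gen)) {
			/* check/update this root domain's dl_bw once */
		}

		rcu_read_unlock_sched();
	}
}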
@ -973,6 +975,7 @@ struct rq {
unsigned long cpu_capacity_orig; unsigned long cpu_capacity_orig;
struct callback_head *balance_callback; struct callback_head *balance_callback;
unsigned char balance_flags;
unsigned char nohz_idle_balance; unsigned char nohz_idle_balance;
unsigned char idle_balance; unsigned char idle_balance;
@ -1003,6 +1006,10 @@ struct rq {
/* This is used to determine avg_idle's max value */ /* This is used to determine avg_idle's max value */
u64 max_idle_balance_cost; u64 max_idle_balance_cost;
#ifdef CONFIG_HOTPLUG_CPU
struct rcuwait hotplug_wait;
#endif
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@ -1048,6 +1055,12 @@ struct rq {
/* Must be inspected within a rcu lock section */ /* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state; struct cpuidle_state *idle_state;
#endif #endif
#ifdef CONFIG_SMP
unsigned int nr_pinned;
#endif
unsigned int push_busy;
struct cpu_stop_work push_work;
}; };
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
@ -1075,6 +1088,16 @@ static inline int cpu_of(struct rq *rq)
#endif #endif
} }
#define MDF_PUSH 0x01
static inline bool is_migration_disabled(struct task_struct *p)
{
#ifdef CONFIG_SMP
return p->migration_disabled;
#else
return false;
#endif
}
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq); extern void __update_idle_core(struct rq *rq);
@ -1221,6 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0; rf->clock_update_flags = 0;
#endif #endif
#ifdef CONFIG_SMP
SCHED_WARN_ON(rq->balance_callback);
#endif
} }
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
@ -1382,6 +1408,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define BALANCE_WORK 0x01
#define BALANCE_PUSH 0x02
static inline void static inline void
queue_balance_callback(struct rq *rq, queue_balance_callback(struct rq *rq,
struct callback_head *head, struct callback_head *head,
@ -1389,12 +1418,13 @@ queue_balance_callback(struct rq *rq,
{ {
lockdep_assert_held(&rq->lock); lockdep_assert_held(&rq->lock);
if (unlikely(head->next)) if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
return; return;
head->func = (void (*)(struct callback_head *))func; head->func = (void (*)(struct callback_head *))func;
head->next = rq->balance_callback; head->next = rq->balance_callback;
rq->balance_callback = head; rq->balance_callback = head;
rq->balance_flags |= BALANCE_WORK;
} }
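queue_balance_callback() only queues work; for completeness, a simplified sketch of how the queued callbacks are typically drained while still holding the rq lock. The function name is invented and the real kernel splits this across a couple of helpers.

static inline void example_run_balance_callbacks(struct rq *rq)
{
	struct callback_head *head = rq->balance_callback;
	void (*func)(struct rq *rq);
	struct callback_head *next;

	lockdep_assert_held(&rq->lock);

	/* Detach the list up front; a callback may queue new work. */
	rq->balance_callback = NULL;
	rq->balance_flags &= ~BALANCE_WORK;

	while (head) {
		func = (void (*)(struct rq *))head->func;
		next = head->next;
		head->next = NULL;
		head = next;

		func(rq);	/* e.g. push_rt_tasks() */
	}
}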
#define rcu_dereference_check_sched_domain(p) \ #define rcu_dereference_check_sched_domain(p) \
@ -1714,13 +1744,20 @@ static inline int task_on_rq_migrating(struct task_struct *p)
return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
} }
/*
 * wake flags
 */
#define WF_SYNC     0x01 /* Waker goes to sleep after wakeup */
#define WF_FORK     0x02 /* Child wakeup after fork */
#define WF_MIGRATED 0x04 /* Internal use, task got migrated */
#define WF_ON_CPU   0x08 /* Wakee is on_cpu */

/* Wake flags. The first three directly map to some SD flag value */
#define WF_EXEC     0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
#define WF_FORK     0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
#define WF_TTWU     0x08 /* Wakeup; maps to SD_BALANCE_WAKE */

#define WF_SYNC     0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
#define WF_ON_CPU   0x40 /* Wakee is on_cpu */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
static_assert(WF_FORK == SD_BALANCE_FORK);
static_assert(WF_TTWU == SD_BALANCE_WAKE);
#endif
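The asserts above pin the low nibble of the wake flags to the corresponding SD_BALANCE_* values, so a scheduling class can derive the domain flag without a separate parameter; roughly, as in this illustrative helper (not from this commit):

/*
 * WF_EXEC, WF_FORK and WF_TTWU share their bit positions with
 * SD_BALANCE_EXEC, SD_BALANCE_FORK and SD_BALANCE_WAKE, so the SD flag
 * is simply the low nibble of the wake flags.
 */
static inline int example_wake_flags_to_sd_flag(int wake_flags)
{
	return wake_flags & 0xF;
}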
/* /*
* To aid in avoiding the subversion of "niceness" due to uneven distribution * To aid in avoiding the subversion of "niceness" due to uneven distribution
@ -1796,16 +1833,19 @@ struct sched_class {
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
void (*task_woken)(struct rq *this_rq, struct task_struct *task); void (*task_woken)(struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p, void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask); const struct cpumask *newmask,
u32 flags);
void (*rq_online)(struct rq *rq); void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq); void (*rq_offline)(struct rq *rq);
struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif #endif
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@ -1833,7 +1873,7 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type); void (*task_change_group)(struct task_struct *p, int type);
#endif #endif
} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ };
static inline void put_prev_task(struct rq *rq, struct task_struct *prev) static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{ {
@ -1847,6 +1887,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
next->sched_class->set_next_task(rq, next, false); next->sched_class->set_next_task(rq, next, false);
} }
/*
* Helper to define a sched_class instance; each one is placed in a separate
* section which is ordered by the linker script:
*
* include/asm-generic/vmlinux.lds.h
*
* Also enforce alignment on the instance, not the type, to guarantee layout.
*/
#define DEFINE_SCHED_CLASS(name) \
const struct sched_class name##_sched_class \
__aligned(__alignof__(struct sched_class)) \
__section("__" #name "_sched_class")
/* Defined in include/asm-generic/vmlinux.lds.h */ /* Defined in include/asm-generic/vmlinux.lds.h */
extern struct sched_class __begin_sched_classes[]; extern struct sched_class __begin_sched_classes[];
extern struct sched_class __end_sched_classes[]; extern struct sched_class __end_sched_classes[];
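Because every DEFINE_SCHED_CLASS() instance lands in its own section and the linker script orders those sections contiguously, the class instances form a priority-ordered array between the two markers above. A sketch of walking them from the highest priority class (placed last in this scheme) downwards, using an invented macro name:

#define example_highest_sched_class	(__end_sched_classes - 1)

#define example_for_each_class(class) \
	for (class = example_highest_sched_class; \
	     class >= __begin_sched_classes; \
	     class--)

A caller would declare a const struct sched_class *class and, for instance, stop at the first class whose pick callback returns a task.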
@ -1889,13 +1943,35 @@ static inline bool sched_fair_runnable(struct rq *rq)
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq); extern struct task_struct *pick_next_task_idle(struct rq *rq);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
#define SCA_MIGRATE_ENABLE 0x04
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq); extern void trigger_load_balance(struct rq *rq);
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
static inline struct task_struct *get_push_task(struct rq *rq)
{
struct task_struct *p = rq->curr;
lockdep_assert_held(&rq->lock);
if (rq->push_busy)
return NULL;
if (p->nr_cpus_allowed == 1)
return NULL;
rq->push_busy = true;
return get_task_struct(p);
}
extern int push_cpu_stop(void *arg);
#endif #endif

View file

@ -11,7 +11,7 @@
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
static int static int
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{ {
return task_cpu(p); /* stop tasks as never migrate */ return task_cpu(p); /* stop tasks as never migrate */
} }
@ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq)
/* /*
* Simple, special scheduling class for the per-CPU stop tasks: * Simple, special scheduling class for the per-CPU stop tasks:
*/ */
const struct sched_class stop_sched_class
	__section("__stop_sched_class") = {
DEFINE_SCHED_CLASS(stop) = {
.enqueue_task = enqueue_task_stop, .enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop, .dequeue_task = dequeue_task_stop,

View file

@ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex); DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update; bool sched_energy_update;
void rebuild_sched_domains_energy(void)
{
mutex_lock(&sched_energy_mutex);
sched_energy_update = true;
rebuild_sched_domains();
sched_energy_update = false;
mutex_unlock(&sched_energy_mutex);
}
#ifdef CONFIG_PROC_SYSCTL #ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write, int sched_energy_aware_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos) void *buffer, size_t *lenp, loff_t *ppos)
@ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) { if (!ret && write) {
state = static_branch_unlikely(&sched_energy_present); state = static_branch_unlikely(&sched_energy_present);
if (state != sysctl_sched_energy_aware) {
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = 1;
	rebuild_sched_domains();
	sched_energy_update = 0;
	mutex_unlock(&sched_energy_mutex);
}

if (state != sysctl_sched_energy_aware)
	rebuild_sched_domains_energy();
} }
return ret; return ret;
@ -324,6 +328,7 @@ static void sched_energy_set(bool has_eas)
* 3. no SMT is detected. * 3. no SMT is detected.
* 4. the EM complexity is low enough to keep scheduling overheads low; * 4. the EM complexity is low enough to keep scheduling overheads low;
* 5. schedutil is driving the frequency of all CPUs of the rd; * 5. schedutil is driving the frequency of all CPUs of the rd;
* 6. frequency invariance support is present;
* *
* The complexity of the Energy Model is defined as: * The complexity of the Energy Model is defined as:
* *
@ -372,6 +377,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
goto free; goto free;
} }
if (!arch_scale_freq_invariant()) {
if (sched_debug()) {
pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
cpumask_pr_args(cpu_map));
}
goto free;
}
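For reference on requirement 6 above: frequency invariance means PELT deltas are scaled by the current-to-maximum frequency ratio before being accumulated, roughly as in this simplified sketch (the helper name and parameter are illustrative):

/*
 * @freq_factor is an arch_scale_freq_capacity()-style ratio in
 * [0, SCHED_CAPACITY_SCALE]; without it, utilization depends on the
 * CPU frequency and EM-based energy estimates become meaningless.
 */
static inline u64 example_scale_delta(u64 delta, unsigned long freq_factor)
{
	return (delta * freq_factor) >> SCHED_CAPACITY_SHIFT;
}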
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
/* Skip already covered CPUs. */ /* Skip already covered CPUs. */
if (find_pd(pd, i)) if (find_pd(pd, i))
@ -516,6 +529,7 @@ static int init_rootdomain(struct root_domain *rd)
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif #endif
rd->visit_gen = 0;
init_dl_bw(&rd->dl_bw); init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0) if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask; goto free_rto_mask;
@ -674,6 +688,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp; struct sched_domain *tmp;
int numa_distance = 0;
/* Remove the sched domains which do not contribute to scheduling. */ /* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) { for (tmp = sd; tmp; ) {
@ -705,6 +720,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sd->child = NULL; sd->child = NULL;
} }
for (tmp = sd; tmp; tmp = tmp->parent)
numa_distance += !!(tmp->flags & SD_NUMA);
/*
* FIXME: Diameter >=3 is misrepresented.
*
* Smallest diameter=3 topology is:
*
* node 0 1 2 3
* 0: 10 20 30 40
* 1: 20 10 20 30
* 2: 30 20 10 20
* 3: 40 30 20 10
*
* 0 --- 1 --- 2 --- 3
*
* NUMA-3 0-3 N/A N/A 0-3
* groups: {0-2},{1-3} {1-3},{0-2}
*
* NUMA-2 0-2 0-3 0-3 1-3
* groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2}
*
* NUMA-1 0-1 0-2 1-3 2-3
* groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2}
*
* NUMA-0 0 1 2 3
*
* The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
* group span isn't a subset of the domain span.
*/
WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");
sched_domain_debug(sd, cpu); sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd); rq_attach_root(rq, rd);

View file

@ -42,11 +42,27 @@ struct cpu_stopper {
struct list_head works; /* list of pending works */ struct list_head works; /* list of pending works */
struct cpu_stop_work stop_work; /* for stop_cpus */ struct cpu_stop_work stop_work; /* for stop_cpus */
unsigned long caller;
cpu_stop_fn_t fn;
}; };
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false; static bool stop_machine_initialized = false;
void print_stop_info(const char *log_lvl, struct task_struct *task)
{
/*
* If @task is a stopper task, it cannot migrate and task_cpu() is
* stable.
*/
struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task));
if (task != stopper->thread)
return;
printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
}
/* static data for stop_cpus */ /* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex); static DEFINE_MUTEX(stop_cpus_mutex);
static bool stop_cpus_in_progress; static bool stop_cpus_in_progress;
@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{ {
struct cpu_stop_done done; struct cpu_stop_done done;
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };
cpu_stop_init_done(&done, 1); cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work)) if (!cpu_stop_queue_work(cpu, &work))
@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
work1 = work2 = (struct cpu_stop_work){ work1 = work2 = (struct cpu_stop_work){
.fn = multi_cpu_stop, .fn = multi_cpu_stop,
.arg = &msdata, .arg = &msdata,
.done = &done .done = &done,
.caller = _RET_IP_,
}; };
cpu_stop_init_done(&done, 2); cpu_stop_init_done(&done, 2);
@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf) struct cpu_stop_work *work_buf)
{ {
*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
return cpu_stop_queue_work(cpu, work_buf); return cpu_stop_queue_work(cpu, work_buf);
} }
@ -487,6 +504,8 @@ repeat:
int ret; int ret;
/* cpu stop callbacks must not sleep, make in_atomic() == T */ /* cpu stop callbacks must not sleep, make in_atomic() == T */
stopper->caller = work->caller;
stopper->fn = fn;
preempt_count_inc(); preempt_count_inc();
ret = fn(arg); ret = fn(arg);
if (done) { if (done) {
@ -495,6 +514,8 @@ repeat:
cpu_stop_signal_done(done); cpu_stop_signal_done(done);
} }
preempt_count_dec(); preempt_count_dec();
stopper->fn = NULL;
stopper->caller = 0;
WARN_ONCE(preempt_count(), WARN_ONCE(preempt_count(),
"cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
goto repeat; goto repeat;

View file

@ -4908,6 +4908,10 @@ static void unbind_workers(int cpu)
pool->flags |= POOL_DISASSOCIATED; pool->flags |= POOL_DISASSOCIATED;
raw_spin_unlock_irq(&pool->lock); raw_spin_unlock_irq(&pool->lock);
for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
mutex_unlock(&wq_pool_attach_mutex); mutex_unlock(&wq_pool_attach_mutex);
/* /*

View file

@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p,
return next; return next;
} }
EXPORT_SYMBOL(cpumask_any_and_distribute); EXPORT_SYMBOL(cpumask_any_and_distribute);
int cpumask_any_distribute(const struct cpumask *srcp)
{
int next, prev;
/* NOTE: our first selection will skip 0. */
prev = __this_cpu_read(distribute_cpu_mask_prev);
next = cpumask_next(prev, srcp);
if (next >= nr_cpu_ids)
next = cpumask_first(srcp);
if (next < nr_cpu_ids)
__this_cpu_write(distribute_cpu_mask_prev, next);
return next;
}
EXPORT_SYMBOL(cpumask_any_distribute);
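A small usage sketch for the new helper; the caller and its purpose are hypothetical. It spreads choices over the set bits of a mask rather than always returning the first one:

#include <linux/cpumask.h>

/*
 * Pick a destination CPU from @mask, rotating the choice so repeated
 * calls do not all converge on the lowest numbered CPU.
 */
static int example_pick_dest_cpu(const struct cpumask *mask)
{
	int cpu = cpumask_any_distribute(mask);

	return cpu < nr_cpu_ids ? cpu : -1;
}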

View file

@ -12,6 +12,7 @@
#include <linux/atomic.h> #include <linux/atomic.h>
#include <linux/kexec.h> #include <linux/kexec.h>
#include <linux/utsname.h> #include <linux/utsname.h>
#include <linux/stop_machine.h>
static char dump_stack_arch_desc_str[128]; static char dump_stack_arch_desc_str[128];
@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl)
log_lvl, dump_stack_arch_desc_str); log_lvl, dump_stack_arch_desc_str);
print_worker_info(log_lvl, current); print_worker_info(log_lvl, current);
print_stop_info(log_lvl, current);
} }
/** /**

View file

@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
if (current->nr_cpus_allowed == 1) if (current->nr_cpus_allowed == 1)
goto out; goto out;
#ifdef CONFIG_SMP
if (current->migration_disabled)
goto out;
#endif
/* /*
* It is valid to assume CPU-locality during early bootup: * It is valid to assume CPU-locality during early bootup:
*/ */