mirror of https://github.com/Fishwaldo/linux-bl808.git
synced 2025-06-17 20:25:19 +00:00
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The biggest change is a performance improvement on SMP systems:

  | 4 socket 40 core + SMT Westmere box, single 30 sec tbench
  | runs, higher is better:
  |
  | clients     1       2       4        8       16       32       64      128
  |..........................................................................
  | pre        30      41     118      645     3769     6214    12233    14312
  | post      299     603    1211     2418     4697     6847    11606    14557
  |
  | A nice increase in performance.

  which speedup is particularly noticeable on heavily interacting
  few-tasks workloads, so the changes should help desktop-style Xorg
  workloads and interactivity as well, on multi-core CPUs.

  There are also cpuset suspend behavior fixes/restructuring and
  various smaller tweaks."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix race in task_group()
  sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task
  sched: Reset loop counters if all tasks are pinned and we need to redo load balance
  sched: Reorder 'struct lb_env' members to reduce its size
  sched: Improve scalability via 'CPU buddies', which withstand random perturbations
  cpusets: Remove/update outdated comments
  cpusets, hotplug: Restructure functions that are invoked during hotplug
  cpusets, hotplug: Implement cpuset tree traversal in a helper function
  CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
  sched/x86: Remove broken power estimation
commit 79071638ce
9 changed files with 291 additions and 146 deletions
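The "CPU buddies" change in this pull (see the kernel/sched/core.c and kernel/sched/fair.c hunks below) replaces the per-wakeup scan of sibling groups with a buddy CPU that is precomputed once per domain and merely probed at wakeup time. A minimal user-space sketch of that idea follows; the names (cpu_buddy, pick_wake_cpu, NR_CPUS_SKETCH) are illustrative only, not kernel identifiers.

/*
 * Sketch, not kernel code: precompute one buddy per CPU inside a shared
 * cache domain, then do O(1) wakeup placement by probing only that buddy.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_SKETCH 8                /* one cache domain of 8 CPUs */

static int  cpu_buddy[NR_CPUS_SKETCH];  /* precomputed once, like sd->idle_buddy */
static bool cpu_idle[NR_CPUS_SKETCH];   /* stand-in for idle_cpu() */

/* Pair neighbouring CPUs once at "boot" instead of searching at every wakeup. */
static void assign_buddies(void)
{
        for (int cpu = 0; cpu < NR_CPUS_SKETCH; cpu++)
                cpu_buddy[cpu] = cpu ^ 1;       /* 0<->1, 2<->3, ... */
}

/* O(1) wakeup placement: use the buddy if it is idle, else stay on target. */
static int pick_wake_cpu(int target)
{
        int buddy = cpu_buddy[target];

        if (cpu_idle[buddy])
                return buddy;
        return target;  /* the old code would have scanned every sibling group */
}

int main(void)
{
        assign_buddies();
        cpu_idle[3] = true;

        printf("wake near CPU 2 -> CPU %d\n", pick_wake_cpu(2)); /* buddy 3 is idle */
        printf("wake near CPU 5 -> CPU %d\n", pick_wake_cpu(5)); /* buddy 4 busy, keep 5 */
        return 0;
}

In the actual series the buddy is chosen per sched_domain level in the update_top_cache_domain() hunk below, and select_idle_sibling() only tests sd->idle_buddy instead of walking every group under the LLC domain.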
arch/x86/kernel/cpu/Makefile

@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
 
 obj-y                   := intel_cacheinfo.o scattered.o topology.o
 obj-y                   += proc.o capflags.o powerflags.o common.o
-obj-y                   += vmware.o hypervisor.o sched.o mshyperv.o
+obj-y                   += vmware.o hypervisor.o mshyperv.o
 obj-y                   += rdrand.o
 obj-y                   += match.o
 
arch/x86/kernel/cpu/sched.c (deleted)

@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
-        struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
-        unsigned long ratio, flags;
-
-        local_irq_save(flags);
-        get_aperfmperf(&val);
-        local_irq_restore(flags);
-
-        ratio = calc_aperfmperf_ratio(old, &val);
-        *old = val;
-
-        return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-        /*
-         * do aperf/mperf on the cpu level because it includes things
-         * like turbo mode, which are relevant to full cores.
-         */
-        if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-                return scale_aperfmperf();
-
-        /*
-         * maybe have something cpufreq here
-         */
-
-        return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-        /*
-         * aperf/mperf already includes the smt gain
-         */
-        if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-                return SCHED_LOAD_SCALE;
-
-        return default_scale_smt_power(sd, cpu);
-}
-
-#endif
include/linux/cpuset.h

@@ -20,7 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(void);
+extern void cpuset_update_active_cpus(bool cpu_online);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);

@@ -124,7 +124,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_update_active_cpus(void)
+static inline void cpuset_update_active_cpus(bool cpu_online)
 {
         partition_sched_domains(1, NULL, NULL);
 }
include/linux/init_task.h

@@ -123,8 +123,17 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
+extern struct task_group root_task_group;
+
+#ifdef CONFIG_CGROUP_SCHED
+# define INIT_CGROUP_SCHED(tsk)                                 \
+        .sched_task_group = &root_task_group,
+#else
+# define INIT_CGROUP_SCHED(tsk)
+#endif
+
 #ifdef CONFIG_PERF_EVENTS
 # define INIT_PERF_EVENTS(tsk)                                  \
         .perf_event_mutex =                                     \
                 __MUTEX_INITIALIZER(tsk.perf_event_mutex),      \
         .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),

@@ -161,6 +170,7 @@ extern struct cred init_cred;
         },                                                      \
         .tasks          = LIST_HEAD_INIT(tsk.tasks),            \
         INIT_PUSHABLE_TASKS(tsk)                                \
+        INIT_CGROUP_SCHED(tsk)                                  \
         .ptraced        = LIST_HEAD_INIT(tsk.ptraced),          \
         .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),     \
         .real_parent    = &tsk,                                 \
include/linux/sched.h

@@ -949,6 +949,7 @@ struct sched_domain {
         unsigned int smt_gain;
         int flags;                      /* See SD_* */
         int level;
+        int idle_buddy;                 /* cpu assigned to select_idle_sibling() */
 
         /* Runtime fields. */
         unsigned long last_balance;     /* init to jiffies. units in jiffies */

@@ -1244,6 +1245,9 @@ struct task_struct {
         const struct sched_class *sched_class;
         struct sched_entity se;
         struct sched_rt_entity rt;
+#ifdef CONFIG_CGROUP_SCHED
+        struct task_group *sched_task_group;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
         /* list of struct preempt_notifier: */

@@ -2721,7 +2725,7 @@ extern int sched_group_set_rt_period(struct task_group *tg,
 extern long sched_group_rt_period(struct task_group *tg);
 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
-#endif
+#endif /* CONFIG_CGROUP_SCHED */
 
 extern int task_can_switch_user(struct user_struct *up,
                                 struct task_struct *tsk);
kernel/cpuset.c  (130 changed lines)

@@ -147,6 +147,12 @@ typedef enum {
         CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+/* the type of hotplug event */
+enum hotplug_event {
+        CPUSET_CPU_OFFLINE,
+        CPUSET_MEM_OFFLINE,
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {

@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+        struct cpuset *cp;
+        struct cpuset *child;   /* scans child cpusets of cp */
+        struct cgroup *cont;
+
+        if (list_empty(queue))
+                return NULL;
+
+        cp = list_first_entry(queue, struct cpuset, stack_list);
+        list_del(queue->next);
+        list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                child = cgroup_cs(cont);
+                list_add_tail(&child->stack_list, queue);
+        }
+
+        return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
  *
  * Called with cgroup_mutex held. We take callback_mutex to modify
  * cpus_allowed and mems_allowed.

@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * before dropping down to the next. It always processes a node before
  * any of its children.
  *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
  */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
         LIST_HEAD(queue);
         struct cpuset *cp;      /* scans cpusets being updated */
-        struct cpuset *child;   /* scans child cpusets of cp */
-        struct cgroup *cont;
         static nodemask_t oldmems;      /* protected by cgroup_mutex */
 
         list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-        while (!list_empty(&queue)) {
-                cp = list_first_entry(&queue, struct cpuset, stack_list);
-                list_del(queue.next);
-                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-                        child = cgroup_cs(cont);
-                        list_add_tail(&child->stack_list, &queue);
-                }
-
-                /* Continue past cpusets with all cpus, mems online */
-                if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
-                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-                        continue;
-
-                oldmems = cp->mems_allowed;
-
-                /* Remove offline cpus and mems from this cpuset. */
-                mutex_lock(&callback_mutex);
-                cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-                            cpu_active_mask);
-                nodes_and(cp->mems_allowed, cp->mems_allowed,
-                                                node_states[N_HIGH_MEMORY]);
-                mutex_unlock(&callback_mutex);
-
-                /* Move tasks from the empty cpuset to a parent */
-                if (cpumask_empty(cp->cpus_allowed) ||
-                     nodes_empty(cp->mems_allowed))
-                        remove_tasks_in_empty_cpuset(cp);
-                else {
-                        update_tasks_cpumask(cp, NULL);
-                        update_tasks_nodemask(cp, &oldmems, NULL);
-                }
+        switch (event) {
+        case CPUSET_CPU_OFFLINE:
+                while ((cp = cpuset_next(&queue)) != NULL) {
+
+                        /* Continue past cpusets with all cpus online */
+                        if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+                                continue;
+
+                        /* Remove offline cpus from this cpuset. */
+                        mutex_lock(&callback_mutex);
+                        cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+                                                        cpu_active_mask);
+                        mutex_unlock(&callback_mutex);
+
+                        /* Move tasks from the empty cpuset to a parent */
+                        if (cpumask_empty(cp->cpus_allowed))
+                                remove_tasks_in_empty_cpuset(cp);
+                        else
+                                update_tasks_cpumask(cp, NULL);
+                }
+                break;
+
+        case CPUSET_MEM_OFFLINE:
+                while ((cp = cpuset_next(&queue)) != NULL) {
+
+                        /* Continue past cpusets with all mems online */
+                        if (nodes_subset(cp->mems_allowed,
+                                        node_states[N_HIGH_MEMORY]))
+                                continue;
+
+                        oldmems = cp->mems_allowed;
+
+                        /* Remove offline mems from this cpuset. */
+                        mutex_lock(&callback_mutex);
+                        nodes_and(cp->mems_allowed, cp->mems_allowed,
+                                                node_states[N_HIGH_MEMORY]);
+                        mutex_unlock(&callback_mutex);
+
+                        /* Move tasks from the empty cpuset to a parent */
+                        if (nodes_empty(cp->mems_allowed))
+                                remove_tasks_in_empty_cpuset(cp);
+                        else
+                                update_tasks_nodemask(cp, &oldmems, NULL);
+                }
+                break;
         }
 }

@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus(). Needs to call cgroup_lock()
  * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
  */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
         struct sched_domain_attr *attr;
         cpumask_var_t *doms;

@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
         mutex_lock(&callback_mutex);
         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
         mutex_unlock(&callback_mutex);
-        scan_for_empty_cpusets(&top_cpuset);
+
+        if (!cpu_online)
+                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
         ndoms = generate_sched_domains(&doms, &attr);
         cgroup_unlock();

@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
  * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
  */
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                 unsigned long action, void *arg)

@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
         case MEM_OFFLINE:
                 /*
                  * needn't update top_cpuset.mems_allowed explicitly because
-                 * scan_for_empty_cpusets() will update it.
+                 * scan_cpusets_upon_hotplug() will update it.
                  */
-                scan_for_empty_cpusets(&top_cpuset);
+                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
                 break;
         default:
                 break;
kernel/sched/core.c

@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
          * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
          *
          * sched_move_task() holds both and thus holding either pins the cgroup,
-         * see set_task_rq().
+         * see task_group().
          *
          * Furthermore, all task_rq users should acquire both locks, see
          * task_rq_lock().

@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().

@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
         int id = cpu;
 
         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-        if (sd)
+        if (sd) {
+                struct sched_domain *tmp = sd;
+                struct sched_group *sg, *prev;
+                bool right;
+
+                /*
+                 * Traverse to first CPU in group, and count hops
+                 * to cpu from there, switching direction on each
+                 * hop, never ever pointing the last CPU rightward.
+                 */
+                do {
+                        id = cpumask_first(sched_domain_span(tmp));
+                        prev = sg = tmp->groups;
+                        right = 1;
+
+                        while (cpumask_first(sched_group_cpus(sg)) != id)
+                                sg = sg->next;
+
+                        while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+                                prev = sg;
+                                sg = sg->next;
+                                right = !right;
+                        }
+
+                        /* A CPU went down, never point back to domain start. */
+                        if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+                                right = false;
+
+                        sg = right ? sg->next : prev;
+                        tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+                } while ((tmp = tmp->child));
+
                 id = cpumask_first(sched_domain_span(sd));
+        }
 
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_id, cpu) = id;

@@ -7097,34 +7134,66 @@ match2:
         mutex_unlock(&sched_domains_mutex);
 }
 
+static int num_cpus_frozen;     /* used to mark begin/end of suspend/resume */
+
 /*
  * Update cpusets according to cpu_active mask. If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
  */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                              void *hcpu)
 {
-        switch (action & ~CPU_TASKS_FROZEN) {
+        switch (action) {
+        case CPU_ONLINE_FROZEN:
+        case CPU_DOWN_FAILED_FROZEN:
+
+                /*
+                 * num_cpus_frozen tracks how many CPUs are involved in suspend
+                 * resume sequence. As long as this is not the last online
+                 * operation in the resume sequence, just build a single sched
+                 * domain, ignoring cpusets.
+                 */
+                num_cpus_frozen--;
+                if (likely(num_cpus_frozen)) {
+                        partition_sched_domains(1, NULL, NULL);
+                        break;
+                }
+
+                /*
+                 * This is the last CPU online operation. So fall through and
+                 * restore the original sched domains by considering the
+                 * cpuset configurations.
+                 */
+
         case CPU_ONLINE:
         case CPU_DOWN_FAILED:
-                cpuset_update_active_cpus();
-                return NOTIFY_OK;
+                cpuset_update_active_cpus(true);
+                break;
         default:
                 return NOTIFY_DONE;
         }
+        return NOTIFY_OK;
 }
 
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                                void *hcpu)
 {
-        switch (action & ~CPU_TASKS_FROZEN) {
+        switch (action) {
         case CPU_DOWN_PREPARE:
-                cpuset_update_active_cpus();
-                return NOTIFY_OK;
+                cpuset_update_active_cpus(false);
+                break;
+        case CPU_DOWN_PREPARE_FROZEN:
+                num_cpus_frozen++;
+                partition_sched_domains(1, NULL, NULL);
+                break;
         default:
                 return NOTIFY_DONE;
         }
+        return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)

@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
  */
 void sched_move_task(struct task_struct *tsk)
 {
+        struct task_group *tg;
         int on_rq, running;
         unsigned long flags;
         struct rq *rq;

@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
 
+        tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+                                lockdep_is_held(&tsk->sighand->siglock)),
+                          struct task_group, css);
+        tg = autogroup_task_group(tsk, tg);
+        tsk->sched_task_group = tg;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
         if (tsk->sched_class->task_move_group)
                 tsk->sched_class->task_move_group(tsk, on_rq);
kernel/sched/fair.c

@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
         int cpu = smp_processor_id();
         int prev_cpu = task_cpu(p);
         struct sched_domain *sd;
-        struct sched_group *sg;
-        int i;
 
         /*
          * If the task is going to be woken-up on this cpu and if it is

@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
                 return prev_cpu;
 
         /*
-         * Otherwise, iterate the domains and find an elegible idle cpu.
+         * Otherwise, check assigned siblings to find an elegible idle cpu.
          */
         sd = rcu_dereference(per_cpu(sd_llc, target));
+
         for_each_lower_domain(sd) {
-                sg = sd->groups;
-                do {
-                        if (!cpumask_intersects(sched_group_cpus(sg),
-                                                tsk_cpus_allowed(p)))
-                                goto next;
-
-                        for_each_cpu(i, sched_group_cpus(sg)) {
-                                if (!idle_cpu(i))
-                                        goto next;
-                        }
-
-                        target = cpumask_first_and(sched_group_cpus(sg),
-                                        tsk_cpus_allowed(p));
-                        goto done;
-next:
-                        sg = sg->next;
-                } while (sg != sd->groups);
+                if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+                        continue;
+                if (idle_cpu(sd->idle_buddy))
+                        return sd->idle_buddy;
         }
-done:
+
         return target;
 }

@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED  0x01
 #define LBF_NEED_BREAK  0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
         struct sched_domain     *sd;
 
-        int                     src_cpu;
         struct rq               *src_rq;
+        int                     src_cpu;
 
         int                     dst_cpu;
         struct rq               *dst_rq;
 
+        struct cpumask          *dst_grpmask;
+        int                     new_dst_cpu;
         enum cpu_idle_type      idle;
         long                    imbalance;
         unsigned int            flags;

@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
          * 3) are cache-hot on their current CPU.
          */
         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+                int new_dst_cpu;
+
                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+                /*
+                 * Remember if this task can be migrated to any other cpu in
+                 * our sched_group. We may want to revisit it if we couldn't
+                 * meet load balance goals by pulling other tasks on src_cpu.
+                 *
+                 * Also avoid computing new_dst_cpu if we have already computed
+                 * one in current iteration.
+                 */
+                if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                        return 0;
+
+                new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+                                                tsk_cpus_allowed(p));
+                if (new_dst_cpu < nr_cpu_ids) {
+                        env->flags |= LBF_SOME_PINNED;
+                        env->new_dst_cpu = new_dst_cpu;
+                }
                 return 0;
         }
+
+        /* Record that we found atleast one task that could run on dst_cpu */
         env->flags &= ~LBF_ALL_PINNED;
 
         if (task_running(env->src_rq, p)) {

@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *balance)
 {
-        int ld_moved, active_balance = 0;
+        int ld_moved, cur_ld_moved, active_balance = 0;
+        int lb_iterations, max_lb_iterations;
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;

@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .sd             = sd,
                 .dst_cpu        = this_cpu,
                 .dst_rq         = this_rq,
+                .dst_grpmask    = sched_group_cpus(sd->groups),
                 .idle           = idle,
                 .loop_break     = sched_nr_migrate_break,
         };
 
         cpumask_copy(cpus, cpu_active_mask);
+        max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
         schedstat_inc(sd, lb_count[idle]);

@@ -4267,6 +4281,7 @@ redo:
         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
         ld_moved = 0;
+        lb_iterations = 1;
         if (busiest->nr_running > 1) {
                 /*
                  * Attempt to move tasks. If find_busiest_group has found

@@ -4284,7 +4299,13 @@ more_balance:
                 double_rq_lock(this_rq, busiest);
                 if (!env.loop)
                         update_h_load(env.src_cpu);
-                ld_moved += move_tasks(&env);
+
+                /*
+                 * cur_ld_moved - load moved in current iteration
+                 * ld_moved    - cumulative load moved across iterations
+                 */
+                cur_ld_moved = move_tasks(&env);
+                ld_moved += cur_ld_moved;
                 double_rq_unlock(this_rq, busiest);
                 local_irq_restore(flags);

@@ -4296,14 +4317,52 @@ more_balance:
                 /*
                  * some other cpu did the load balance for us.
                  */
-                if (ld_moved && this_cpu != smp_processor_id())
-                        resched_cpu(this_cpu);
+                if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+                        resched_cpu(env.dst_cpu);
+
+                /*
+                 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+                 * us and move them to an alternate dst_cpu in our sched_group
+                 * where they can run. The upper limit on how many times we
+                 * iterate on same src_cpu is dependent on number of cpus in our
+                 * sched_group.
+                 *
+                 * This changes load balance semantics a bit on who can move
+                 * load to a given_cpu. In addition to the given_cpu itself
+                 * (or a ilb_cpu acting on its behalf where given_cpu is
+                 * nohz-idle), we now have balance_cpu in a position to move
+                 * load to given_cpu. In rare situations, this may cause
+                 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+                 * _independently_ and at _same_ time to move some load to
+                 * given_cpu) causing exceess load to be moved to given_cpu.
+                 * This however should not happen so much in practice and
+                 * moreover subsequent load balance cycles should correct the
+                 * excess load moved.
+                 */
+                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+                                lb_iterations++ < max_lb_iterations) {
+
+                        this_rq          = cpu_rq(env.new_dst_cpu);
+                        env.dst_rq       = this_rq;
+                        env.dst_cpu      = env.new_dst_cpu;
+                        env.flags       &= ~LBF_SOME_PINNED;
+                        env.loop         = 0;
+                        env.loop_break   = sched_nr_migrate_break;
+                        /*
+                         * Go back to "more_balance" rather than "redo" since we
+                         * need to continue with same src_cpu.
+                         */
+                        goto more_balance;
+                }
 
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
                         cpumask_clear_cpu(cpu_of(busiest), cpus);
-                        if (!cpumask_empty(cpus))
+                        if (!cpumask_empty(cpus)) {
+                                env.loop = 0;
+                                env.loop_break = sched_nr_migrate_break;
                                 goto redo;
+                        }
                         goto out_balanced;
                 }
         }
kernel/sched/sched.h

@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
 /*
  * Return the group to which this tasks belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
-        struct task_group *tg;
-        struct cgroup_subsys_state *css;
-
-        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                        lockdep_is_held(&p->pi_lock) ||
-                        lockdep_is_held(&task_rq(p)->lock));
-        tg = container_of(css, struct task_group, css);
-
-        return autogroup_task_group(p, tg);
+        return p->sched_task_group;
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
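The task_group() change above fixes the race by caching the task's group pointer in p->sched_task_group: sched_move_task() computes it once while holding the task's locks, and readers simply return the cached copy instead of re-deriving it from cgroup state that may change before attach() completes. A minimal user-space sketch of that pattern follows; the names (struct task, move_task, the single pi_lock mutex) are assumptions for illustration, not the kernel's.

/*
 * Sketch, not kernel code: update a cached group pointer only at the single,
 * locked "move" point; readers consume the copy without re-deriving it.
 */
#include <pthread.h>
#include <stdio.h>

struct group { const char *name; };

static struct group groups[2] = { { "root" }, { "child" } };

struct task {
        pthread_mutex_t pi_lock;        /* stands in for pi_lock + rq->lock */
        struct group *sched_task_group; /* the cached copy */
};

/* Analogue of sched_move_task(): update the copy while holding the lock. */
static void move_task(struct task *p, struct group *tg)
{
        pthread_mutex_lock(&p->pi_lock);
        p->sched_task_group = tg;       /* single, ordered update point */
        pthread_mutex_unlock(&p->pi_lock);
}

/* Analogue of task_group(): no cgroup lookup, just the cached pointer. */
static struct group *task_group(struct task *p)
{
        return p->sched_task_group;
}

int main(void)
{
        struct task t = { PTHREAD_MUTEX_INITIALIZER, &groups[0] };

        printf("before move: %s\n", task_group(&t)->name);
        move_task(&t, &groups[1]);
        printf("after move:  %s\n", task_group(&t)->name);
        return 0;
}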