mirror of https://github.com/Fishwaldo/linux-bl808.git
synced 2025-06-17 20:25:19 +00:00
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The biggest change is a performance improvement on SMP systems:

  | 4 socket 40 core + SMT Westmere box, single 30 sec tbench
  | runs, higher is better:
  |
  | clients     1       2       4        8       16       32       64      128
  |..........................................................................
  | pre        30      41     118      645     3769     6214    12233    14312
  | post      299     603    1211     2418     4697     6847    11606    14557
  |
  | A nice increase in performance.

  which speedup is particularly noticeable on heavily interacting
  few-tasks workloads, so the changes should help desktop-style Xorg
  workloads and interactivity as well, on multi-core CPUs.

  There are also cpuset suspend behavior fixes/restructuring and
  various smaller tweaks."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix race in task_group()
  sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task
  sched: Reset loop counters if all tasks are pinned and we need to redo load balance
  sched: Reorder 'struct lb_env' members to reduce its size
  sched: Improve scalability via 'CPU buddies', which withstand random perturbations
  cpusets: Remove/update outdated comments
  cpusets, hotplug: Restructure functions that are invoked during hotplug
  cpusets, hotplug: Implement cpuset tree traversal in a helper function
  CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
  sched/x86: Remove broken power estimation
commit 79071638ce
9 changed files with 291 additions and 146 deletions
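The "CPU buddies" change in this pull (see the kernel/sched/core.c and kernel/sched/fair.c hunks below) replaces the per-wakeup scan of sibling groups with a buddy CPU that is precomputed once per domain and merely probed at wakeup time. A minimal user-space sketch of that idea follows; the names (cpu_buddy, pick_wake_cpu, NR_CPUS_SKETCH) are illustrative only, not kernel identifiers.

/*
 * Sketch, not kernel code: precompute one buddy per CPU inside a shared
 * cache domain, then do O(1) wakeup placement by probing only that buddy.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_SKETCH 8                /* one cache domain of 8 CPUs */

static int  cpu_buddy[NR_CPUS_SKETCH];  /* precomputed once, like sd->idle_buddy */
static bool cpu_idle[NR_CPUS_SKETCH];   /* stand-in for idle_cpu() */

/* Pair neighbouring CPUs once at "boot" instead of searching at every wakeup. */
static void assign_buddies(void)
{
        for (int cpu = 0; cpu < NR_CPUS_SKETCH; cpu++)
                cpu_buddy[cpu] = cpu ^ 1;       /* 0<->1, 2<->3, ... */
}

/* O(1) wakeup placement: use the buddy if it is idle, else stay on target. */
static int pick_wake_cpu(int target)
{
        int buddy = cpu_buddy[target];

        if (cpu_idle[buddy])
                return buddy;
        return target;  /* the old code would have scanned every sibling group */
}

int main(void)
{
        assign_buddies();
        cpu_idle[3] = true;

        printf("wake near CPU 2 -> CPU %d\n", pick_wake_cpu(2)); /* buddy 3 is idle */
        printf("wake near CPU 5 -> CPU %d\n", pick_wake_cpu(5)); /* buddy 4 busy, keep 5 */
        return 0;
}

In the actual series the buddy is chosen per sched_domain level in the update_top_cache_domain() hunk below, and select_idle_sibling() only tests sd->idle_buddy instead of walking every group under the LLC domain.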
arch/x86/kernel/cpu/Makefile

@@ -14,7 +14,7 @@ CFLAGS_common.o := $(nostackp)
 
 obj-y                   := intel_cacheinfo.o scattered.o topology.o
 obj-y                   += proc.o capflags.o powerflags.o common.o
-obj-y                   += vmware.o hypervisor.o sched.o mshyperv.o
+obj-y                   += vmware.o hypervisor.o mshyperv.o
 obj-y                   += rdrand.o
 obj-y                   += match.o
 
arch/x86/kernel/cpu/sched.c (deleted)

@@ -1,55 +0,0 @@
-#include <linux/sched.h>
-#include <linux/math64.h>
-#include <linux/percpu.h>
-#include <linux/irqflags.h>
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#ifdef CONFIG_SMP
-
-static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
-
-static unsigned long scale_aperfmperf(void)
-{
-        struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
-        unsigned long ratio, flags;
-
-        local_irq_save(flags);
-        get_aperfmperf(&val);
-        local_irq_restore(flags);
-
-        ratio = calc_aperfmperf_ratio(old, &val);
-        *old = val;
-
-        return ratio;
-}
-
-unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
-{
-        /*
-         * do aperf/mperf on the cpu level because it includes things
-         * like turbo mode, which are relevant to full cores.
-         */
-        if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-                return scale_aperfmperf();
-
-        /*
-         * maybe have something cpufreq here
-         */
-
-        return default_scale_freq_power(sd, cpu);
-}
-
-unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
-{
-        /*
-         * aperf/mperf already includes the smt gain
-         */
-        if (boot_cpu_has(X86_FEATURE_APERFMPERF))
-                return SCHED_LOAD_SCALE;
-
-        return default_scale_smt_power(sd, cpu);
-}
-
-#endif
include/linux/cpuset.h

@@ -20,7 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(void);
+extern void cpuset_update_active_cpus(bool cpu_online);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);

@@ -124,7 +124,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_update_active_cpus(void)
+static inline void cpuset_update_active_cpus(bool cpu_online)
 {
         partition_sched_domains(1, NULL, NULL);
 }
include/linux/init_task.h

@@ -123,8 +123,17 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
+extern struct task_group root_task_group;
+
+#ifdef CONFIG_CGROUP_SCHED
+# define INIT_CGROUP_SCHED(tsk)                                 \
+        .sched_task_group = &root_task_group,
+#else
+# define INIT_CGROUP_SCHED(tsk)
+#endif
+
 #ifdef CONFIG_PERF_EVENTS
 # define INIT_PERF_EVENTS(tsk)                                  \
         .perf_event_mutex =                                     \
                 __MUTEX_INITIALIZER(tsk.perf_event_mutex),      \
         .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list),

@@ -161,6 +170,7 @@ extern struct cred init_cred;
         },                                                      \
         .tasks          = LIST_HEAD_INIT(tsk.tasks),            \
         INIT_PUSHABLE_TASKS(tsk)                                \
+        INIT_CGROUP_SCHED(tsk)                                  \
         .ptraced        = LIST_HEAD_INIT(tsk.ptraced),          \
         .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),     \
         .real_parent    = &tsk,                                 \
include/linux/sched.h

@@ -949,6 +949,7 @@ struct sched_domain {
         unsigned int smt_gain;
         int flags;                      /* See SD_* */
         int level;
+        int idle_buddy;                 /* cpu assigned to select_idle_sibling() */
 
         /* Runtime fields. */
         unsigned long last_balance;     /* init to jiffies. units in jiffies */

@@ -1244,6 +1245,9 @@ struct task_struct {
         const struct sched_class *sched_class;
         struct sched_entity se;
         struct sched_rt_entity rt;
+#ifdef CONFIG_CGROUP_SCHED
+        struct task_group *sched_task_group;
+#endif
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
         /* list of struct preempt_notifier: */

@@ -2721,7 +2725,7 @@ extern int sched_group_set_rt_period(struct task_group *tg,
 extern long sched_group_rt_period(struct task_group *tg);
 extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
-#endif
+#endif /* CONFIG_CGROUP_SCHED */
 
 extern int task_can_switch_user(struct user_struct *up,
                                 struct task_struct *tsk);
kernel/cpuset.c  (130 changed lines)

@@ -147,6 +147,12 @@ typedef enum {
         CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
+/* the type of hotplug event */
+enum hotplug_event {
+        CPUSET_CPU_OFFLINE,
+        CPUSET_MEM_OFFLINE,
+};
+
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {

@@ -1990,8 +1996,36 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /*
- * Walk the specified cpuset subtree and look for empty cpusets.
- * The tasks of such cpuset must be moved to a parent cpuset.
+ * Helper function to traverse cpusets.
+ * It can be used to walk the cpuset tree from top to bottom, completing
+ * one layer before dropping down to the next (thus always processing a
+ * node before any of its children).
+ */
+static struct cpuset *cpuset_next(struct list_head *queue)
+{
+        struct cpuset *cp;
+        struct cpuset *child;   /* scans child cpusets of cp */
+        struct cgroup *cont;
+
+        if (list_empty(queue))
+                return NULL;
+
+        cp = list_first_entry(queue, struct cpuset, stack_list);
+        list_del(queue->next);
+        list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+                child = cgroup_cs(cont);
+                list_add_tail(&child->stack_list, queue);
+        }
+
+        return cp;
+}
+
+
+/*
+ * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
+ * online/offline) and update the cpusets accordingly.
+ * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
+ * cpuset must be moved to a parent cpuset.
  *
  * Called with cgroup_mutex held. We take callback_mutex to modify
  * cpus_allowed and mems_allowed.

@@ -2000,50 +2034,61 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * before dropping down to the next. It always processes a node before
  * any of its children.
  *
- * For now, since we lack memory hot unplug, we'll never see a cpuset
- * that has tasks along with an empty 'mems'. But if we did see such
- * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
+ * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * if all present pages from a node are offlined.
  */
-static void scan_for_empty_cpusets(struct cpuset *root)
+static void
+scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 {
         LIST_HEAD(queue);
         struct cpuset *cp;      /* scans cpusets being updated */
-        struct cpuset *child;   /* scans child cpusets of cp */
-        struct cgroup *cont;
         static nodemask_t oldmems;      /* protected by cgroup_mutex */
 
         list_add_tail((struct list_head *)&root->stack_list, &queue);
 
-        while (!list_empty(&queue)) {
-                cp = list_first_entry(&queue, struct cpuset, stack_list);
-                list_del(queue.next);
-                list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
-                        child = cgroup_cs(cont);
-                        list_add_tail(&child->stack_list, &queue);
-                }
-
-                /* Continue past cpusets with all cpus, mems online */
-                if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
-                    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
-                        continue;
-
-                oldmems = cp->mems_allowed;
-
-                /* Remove offline cpus and mems from this cpuset. */
-                mutex_lock(&callback_mutex);
-                cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-                            cpu_active_mask);
-                nodes_and(cp->mems_allowed, cp->mems_allowed,
-                                                node_states[N_HIGH_MEMORY]);
-                mutex_unlock(&callback_mutex);
-
-                /* Move tasks from the empty cpuset to a parent */
-                if (cpumask_empty(cp->cpus_allowed) ||
-                     nodes_empty(cp->mems_allowed))
-                        remove_tasks_in_empty_cpuset(cp);
-                else {
-                        update_tasks_cpumask(cp, NULL);
-                        update_tasks_nodemask(cp, &oldmems, NULL);
-                }
+        switch (event) {
+        case CPUSET_CPU_OFFLINE:
+                while ((cp = cpuset_next(&queue)) != NULL) {
+
+                        /* Continue past cpusets with all cpus online */
+                        if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
+                                continue;
+
+                        /* Remove offline cpus from this cpuset. */
+                        mutex_lock(&callback_mutex);
+                        cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
+                                                        cpu_active_mask);
+                        mutex_unlock(&callback_mutex);
+
+                        /* Move tasks from the empty cpuset to a parent */
+                        if (cpumask_empty(cp->cpus_allowed))
+                                remove_tasks_in_empty_cpuset(cp);
+                        else
+                                update_tasks_cpumask(cp, NULL);
+                }
+                break;
+
+        case CPUSET_MEM_OFFLINE:
+                while ((cp = cpuset_next(&queue)) != NULL) {
+
+                        /* Continue past cpusets with all mems online */
+                        if (nodes_subset(cp->mems_allowed,
+                                        node_states[N_HIGH_MEMORY]))
+                                continue;
+
+                        oldmems = cp->mems_allowed;
+
+                        /* Remove offline mems from this cpuset. */
+                        mutex_lock(&callback_mutex);
+                        nodes_and(cp->mems_allowed, cp->mems_allowed,
+                                                node_states[N_HIGH_MEMORY]);
+                        mutex_unlock(&callback_mutex);
+
+                        /* Move tasks from the empty cpuset to a parent */
+                        if (nodes_empty(cp->mems_allowed))
+                                remove_tasks_in_empty_cpuset(cp);
+                        else
+                                update_tasks_nodemask(cp, &oldmems, NULL);
+                }
+                break;
         }
 }

@@ -2054,13 +2099,19 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * (of no affect) on systems that are actively using CPU hotplug
  * but making no active use of cpusets.
  *
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus(). Needs to call cgroup_lock()
  * before calling generate_sched_domains().
+ *
+ * @cpu_online: Indicates whether this is a CPU online event (true) or
+ * a CPU offline event (false).
  */
-void cpuset_update_active_cpus(void)
+void cpuset_update_active_cpus(bool cpu_online)
 {
         struct sched_domain_attr *attr;
         cpumask_var_t *doms;

@@ -2070,7 +2121,10 @@ void cpuset_update_active_cpus(void)
         mutex_lock(&callback_mutex);
         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
         mutex_unlock(&callback_mutex);
-        scan_for_empty_cpusets(&top_cpuset);
+
+        if (!cpu_online)
+                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+
         ndoms = generate_sched_domains(&doms, &attr);
         cgroup_unlock();

@@ -2082,7 +2136,7 @@ void cpuset_update_active_cpus(void)
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
  * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * See also the previous routine cpuset_track_online_cpus().
+ * See cpuset_update_active_cpus() for CPU hotplug handling.
  */
 static int cpuset_track_online_nodes(struct notifier_block *self,
                                 unsigned long action, void *arg)

@@ -2101,9 +2155,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
         case MEM_OFFLINE:
                 /*
                  * needn't update top_cpuset.mems_allowed explicitly because
-                 * scan_for_empty_cpusets() will update it.
+                 * scan_cpusets_upon_hotplug() will update it.
                  */
-                scan_for_empty_cpusets(&top_cpuset);
+                scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
                 break;
         default:
                 break;
kernel/sched/core.c

@@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
          * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
          *
          * sched_move_task() holds both and thus holding either pins the cgroup,
-         * see set_task_rq().
+         * see task_group().
          *
          * Furthermore, all task_rq users should acquire both locks, see
          * task_rq_lock().

@@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
  * allows us to avoid some pointer chasing select_idle_sibling().
  *
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
  * Also keep a unique ID per domain (we use the first cpu number in
  * the cpumask of the domain), this allows us to quickly tell if
  * two cpus are in the same cache domain, see cpus_share_cache().

@@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu)
         int id = cpu;
 
         sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
-        if (sd)
+        if (sd) {
+                struct sched_domain *tmp = sd;
+                struct sched_group *sg, *prev;
+                bool right;
+
+                /*
+                 * Traverse to first CPU in group, and count hops
+                 * to cpu from there, switching direction on each
+                 * hop, never ever pointing the last CPU rightward.
+                 */
+                do {
+                        id = cpumask_first(sched_domain_span(tmp));
+                        prev = sg = tmp->groups;
+                        right = 1;
+
+                        while (cpumask_first(sched_group_cpus(sg)) != id)
+                                sg = sg->next;
+
+                        while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+                                prev = sg;
+                                sg = sg->next;
+                                right = !right;
+                        }
+
+                        /* A CPU went down, never point back to domain start. */
+                        if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+                                right = false;
+
+                        sg = right ? sg->next : prev;
+                        tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+                } while ((tmp = tmp->child));
+
                 id = cpumask_first(sched_domain_span(sd));
+        }
 
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_id, cpu) = id;

@@ -7097,34 +7134,66 @@ match2:
         mutex_unlock(&sched_domains_mutex);
 }
 
+static int num_cpus_frozen;     /* used to mark begin/end of suspend/resume */
+
 /*
  * Update cpusets according to cpu_active mask. If cpusets are
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
  */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                              void *hcpu)
 {
-        switch (action & ~CPU_TASKS_FROZEN) {
+        switch (action) {
+        case CPU_ONLINE_FROZEN:
+        case CPU_DOWN_FAILED_FROZEN:
+
+                /*
+                 * num_cpus_frozen tracks how many CPUs are involved in suspend
+                 * resume sequence. As long as this is not the last online
+                 * operation in the resume sequence, just build a single sched
+                 * domain, ignoring cpusets.
+                 */
+                num_cpus_frozen--;
+                if (likely(num_cpus_frozen)) {
+                        partition_sched_domains(1, NULL, NULL);
+                        break;
+                }
+
+                /*
+                 * This is the last CPU online operation. So fall through and
+                 * restore the original sched domains by considering the
+                 * cpuset configurations.
+                 */
+
         case CPU_ONLINE:
         case CPU_DOWN_FAILED:
-                cpuset_update_active_cpus();
-                return NOTIFY_OK;
+                cpuset_update_active_cpus(true);
+                break;
         default:
                 return NOTIFY_DONE;
         }
+        return NOTIFY_OK;
 }
 
 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                                void *hcpu)
 {
-        switch (action & ~CPU_TASKS_FROZEN) {
+        switch (action) {
         case CPU_DOWN_PREPARE:
-                cpuset_update_active_cpus();
-                return NOTIFY_OK;
+                cpuset_update_active_cpus(false);
+                break;
+        case CPU_DOWN_PREPARE_FROZEN:
+                num_cpus_frozen++;
+                partition_sched_domains(1, NULL, NULL);
+                break;
         default:
                 return NOTIFY_DONE;
         }
+        return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)

@@ -7589,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg)
  */
 void sched_move_task(struct task_struct *tsk)
 {
+        struct task_group *tg;
         int on_rq, running;
         unsigned long flags;
         struct rq *rq;

@@ -7603,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->put_prev_task(rq, tsk);
 
+        tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+                                lockdep_is_held(&tsk->sighand->siglock)),
+                          struct task_group, css);
+        tg = autogroup_task_group(tsk, tg);
+        tsk->sched_task_group = tg;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
         if (tsk->sched_class->task_move_group)
                 tsk->sched_class->task_move_group(tsk, on_rq);
kernel/sched/fair.c

@@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
         int cpu = smp_processor_id();
         int prev_cpu = task_cpu(p);
         struct sched_domain *sd;
-        struct sched_group *sg;
-        int i;
 
         /*
          * If the task is going to be woken-up on this cpu and if it is

@@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
                 return prev_cpu;
 
         /*
-         * Otherwise, iterate the domains and find an elegible idle cpu.
+         * Otherwise, check assigned siblings to find an elegible idle cpu.
          */
         sd = rcu_dereference(per_cpu(sd_llc, target));
+
         for_each_lower_domain(sd) {
-                sg = sd->groups;
-                do {
-                        if (!cpumask_intersects(sched_group_cpus(sg),
-                                                tsk_cpus_allowed(p)))
-                                goto next;
-
-                        for_each_cpu(i, sched_group_cpus(sg)) {
-                                if (!idle_cpu(i))
-                                        goto next;
-                        }
-
-                        target = cpumask_first_and(sched_group_cpus(sg),
-                                        tsk_cpus_allowed(p));
-                        goto done;
-next:
-                        sg = sg->next;
-                } while (sg != sd->groups);
+                if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+                        continue;
+                if (idle_cpu(sd->idle_buddy))
+                        return sd->idle_buddy;
         }
-done:
+
         return target;
 }

@@ -3068,16 +3054,19 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED  0x01
 #define LBF_NEED_BREAK  0x02
+#define LBF_SOME_PINNED 0x04
 
 struct lb_env {
         struct sched_domain     *sd;
 
-        int                     src_cpu;
         struct rq               *src_rq;
+        int                     src_cpu;
 
         int                     dst_cpu;
         struct rq               *dst_rq;
 
+        struct cpumask          *dst_grpmask;
+        int                     new_dst_cpu;
         enum cpu_idle_type      idle;
         long                    imbalance;
         unsigned int            flags;

@@ -3145,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
          * 3) are cache-hot on their current CPU.
          */
         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+                int new_dst_cpu;
+
                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+
+                /*
+                 * Remember if this task can be migrated to any other cpu in
+                 * our sched_group. We may want to revisit it if we couldn't
+                 * meet load balance goals by pulling other tasks on src_cpu.
+                 *
+                 * Also avoid computing new_dst_cpu if we have already computed
+                 * one in current iteration.
+                 */
+                if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                        return 0;
+
+                new_dst_cpu = cpumask_first_and(env->dst_grpmask,
+                                                tsk_cpus_allowed(p));
+                if (new_dst_cpu < nr_cpu_ids) {
+                        env->flags |= LBF_SOME_PINNED;
+                        env->new_dst_cpu = new_dst_cpu;
+                }
                 return 0;
         }
+
+        /* Record that we found atleast one task that could run on dst_cpu */
         env->flags &= ~LBF_ALL_PINNED;
 
         if (task_running(env->src_rq, p)) {

@@ -4227,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
                         int *balance)
 {
-        int ld_moved, active_balance = 0;
+        int ld_moved, cur_ld_moved, active_balance = 0;
+        int lb_iterations, max_lb_iterations;
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;

@@ -4237,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .sd             = sd,
                 .dst_cpu        = this_cpu,
                 .dst_rq         = this_rq,
+                .dst_grpmask    = sched_group_cpus(sd->groups),
                 .idle           = idle,
                 .loop_break     = sched_nr_migrate_break,
         };
 
         cpumask_copy(cpus, cpu_active_mask);
+        max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
         schedstat_inc(sd, lb_count[idle]);

@@ -4267,6 +4281,7 @@ redo:
         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
         ld_moved = 0;
+        lb_iterations = 1;
         if (busiest->nr_running > 1) {
                 /*
                  * Attempt to move tasks. If find_busiest_group has found

@@ -4284,7 +4299,13 @@ more_balance:
                 double_rq_lock(this_rq, busiest);
                 if (!env.loop)
                         update_h_load(env.src_cpu);
-                ld_moved += move_tasks(&env);
+
+                /*
+                 * cur_ld_moved - load moved in current iteration
+                 * ld_moved    - cumulative load moved across iterations
+                 */
+                cur_ld_moved = move_tasks(&env);
+                ld_moved += cur_ld_moved;
                 double_rq_unlock(this_rq, busiest);
                 local_irq_restore(flags);

@@ -4296,14 +4317,52 @@ more_balance:
                 /*
                  * some other cpu did the load balance for us.
                  */
-                if (ld_moved && this_cpu != smp_processor_id())
-                        resched_cpu(this_cpu);
+                if (cur_ld_moved && env.dst_cpu != smp_processor_id())
+                        resched_cpu(env.dst_cpu);
+
+                /*
+                 * Revisit (affine) tasks on src_cpu that couldn't be moved to
+                 * us and move them to an alternate dst_cpu in our sched_group
+                 * where they can run. The upper limit on how many times we
+                 * iterate on same src_cpu is dependent on number of cpus in our
+                 * sched_group.
+                 *
+                 * This changes load balance semantics a bit on who can move
+                 * load to a given_cpu. In addition to the given_cpu itself
+                 * (or a ilb_cpu acting on its behalf where given_cpu is
+                 * nohz-idle), we now have balance_cpu in a position to move
+                 * load to given_cpu. In rare situations, this may cause
+                 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
+                 * _independently_ and at _same_ time to move some load to
+                 * given_cpu) causing exceess load to be moved to given_cpu.
+                 * This however should not happen so much in practice and
+                 * moreover subsequent load balance cycles should correct the
+                 * excess load moved.
+                 */
+                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
+                                lb_iterations++ < max_lb_iterations) {
+
+                        this_rq          = cpu_rq(env.new_dst_cpu);
+                        env.dst_rq       = this_rq;
+                        env.dst_cpu      = env.new_dst_cpu;
+                        env.flags       &= ~LBF_SOME_PINNED;
+                        env.loop         = 0;
+                        env.loop_break   = sched_nr_migrate_break;
+                        /*
+                         * Go back to "more_balance" rather than "redo" since we
+                         * need to continue with same src_cpu.
+                         */
+                        goto more_balance;
+                }
 
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
                         cpumask_clear_cpu(cpu_of(busiest), cpus);
-                        if (!cpumask_empty(cpus))
+                        if (!cpumask_empty(cpus)) {
+                                env.loop = 0;
+                                env.loop_break = sched_nr_migrate_break;
                                 goto redo;
+                        }
                         goto out_balanced;
                 }
         }
kernel/sched/sched.h

@@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg);
 /*
  * Return the group to which this tasks belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
-        struct task_group *tg;
-        struct cgroup_subsys_state *css;
-
-        css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-                        lockdep_is_held(&p->pi_lock) ||
-                        lockdep_is_held(&task_rq(p)->lock));
-        tg = container_of(css, struct task_group, css);
-
-        return autogroup_task_group(p, tg);
+        return p->sched_task_group;
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
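The task_group() change above fixes the race by caching the task's group pointer in p->sched_task_group: sched_move_task() computes it once while holding the task's locks, and readers simply return the cached copy instead of re-deriving it from cgroup state that may change before attach() completes. A minimal user-space sketch of that pattern follows; the names (struct task, move_task, the single pi_lock mutex) are assumptions for illustration, not the kernel's.

/*
 * Sketch, not kernel code: update a cached group pointer only at the single,
 * locked "move" point; readers consume the copy without re-deriving it.
 */
#include <pthread.h>
#include <stdio.h>

struct group { const char *name; };

static struct group groups[2] = { { "root" }, { "child" } };

struct task {
        pthread_mutex_t pi_lock;        /* stands in for pi_lock + rq->lock */
        struct group *sched_task_group; /* the cached copy */
};

/* Analogue of sched_move_task(): update the copy while holding the lock. */
static void move_task(struct task *p, struct group *tg)
{
        pthread_mutex_lock(&p->pi_lock);
        p->sched_task_group = tg;       /* single, ordered update point */
        pthread_mutex_unlock(&p->pi_lock);
}

/* Analogue of task_group(): no cgroup lookup, just the cached pointer. */
static struct group *task_group(struct task *p)
{
        return p->sched_task_group;
}

int main(void)
{
        struct task t = { PTHREAD_MUTEX_INITIALIZER, &groups[0] };

        printf("before move: %s\n", task_group(&t)->name);
        move_task(&t, &groups[1]);
        printf("after move:  %s\n", task_group(&t)->name);
        return 0;
}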