Star64_linux/include/linux/sched/task.h
Zhang Qiao 4ef0c5c6b5 kernel/sched: Fix sched_fork() access an invalid sched_task_group
There is a small race between copy_process() and sched_fork()
where child->sched_task_group point to an already freed pointer.

	parent doing fork()      | someone moving the parent
				 | to another cgroup
  -------------------------------+-------------------------------
  copy_process()
      + dup_task_struct()<1>
				  parent move to another cgroup,
				  and free the old cgroup. <2>
      + sched_fork()
	+ __set_task_cpu()<3>
	+ task_fork_fair()
	  + sched_slice()<4>

In the worst case, this bug can lead to "use-after-free" and
cause panic as shown above:

  (1) parent copy its sched_task_group to child at <1>;

  (2) someone move the parent to another cgroup and free the old
      cgroup at <2>;

  (3) the sched_task_group and cfs_rq that belong to the old cgroup
      will be accessed at <3> and <4>, which cause a panic:

  [] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
  [] PGD 8000001fa0a86067 P4D 8000001fa0a86067 PUD 2029955067 PMD 0
  [] Oops: 0000 [#1] SMP PTI
  [] CPU: 7 PID: 648398 Comm: ebizzy Kdump: loaded Tainted: G           OE    --------- -  - 4.18.0.x86_64+ #1
  [] RIP: 0010:sched_slice+0x84/0xc0

  [] Call Trace:
  []  task_fork_fair+0x81/0x120
  []  sched_fork+0x132/0x240
  []  copy_process.part.5+0x675/0x20e0
  []  ? __handle_mm_fault+0x63f/0x690
  []  _do_fork+0xcd/0x3b0
  []  do_syscall_64+0x5d/0x1d0
  []  entry_SYSCALL_64_after_hwframe+0x65/0xca
  [] RIP: 0033:0x7f04418cd7e1

Between cgroup_can_fork() and cgroup_post_fork(), the cgroup
membership and thus sched_task_group can't change. So update child's
sched_task_group at sched_post_fork() and move task_fork() and
__set_task_cpu() (where accees the sched_task_group) from sched_fork()
to sched_post_fork().

Fixes: 8323f26ce3 ("sched: Fix race in task_group")
Signed-off-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://lkml.kernel.org/r/20210915064030.2231-1-zhangqiao22@huawei.com
2021-10-14 13:09:58 +02:00

177 lines
4.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_H
#define _LINUX_SCHED_TASK_H
/*
* Interface between the scheduler and various task lifetime (fork()/exit())
* functionality:
*/
#include <linux/sched.h>
#include <linux/uaccess.h>
struct task_struct;
struct rusage;
union thread_union;
struct css_set;
/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL
struct kernel_clone_args {
u64 flags;
int __user *pidfd;
int __user *child_tid;
int __user *parent_tid;
int exit_signal;
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
pid_t *set_tid;
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
int io_thread;
struct cgroup *cgrp;
struct css_set *cset;
};
/*
* This serializes "schedule()" and also protects
* the run-queue from deletions/modifications (but
* _adding_ to the beginning of the run-queue has
* a separate lock).
*/
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;
extern union thread_union init_thread_union;
extern struct task_struct init_task;
extern int lockdep_tasklist_lock_is_held(void);
extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
extern void sched_dead(struct task_struct *p);
void __noreturn do_task_dead(void);
extern void proc_caches_init(void);
extern void fork_init(void);
extern void release_task(struct task_struct * p);
extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *, unsigned long);
extern void flush_thread(void);
#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
extern void do_group_exit(int);
extern void exit_files(struct task_struct *);
extern void exit_itimers(struct signal_struct *);
extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);
extern void free_task(struct task_struct *tsk);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
#define sched_exec() {}
#endif
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
refcount_inc(&t->usage);
return t;
}
extern void __put_task_struct(struct task_struct *t);
static inline void put_task_struct(struct task_struct *t)
{
if (refcount_dec_and_test(&t->usage))
__put_task_struct(t);
}
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
if (refcount_sub_and_test(nr, &t->usage))
__put_task_struct(t);
}
void put_task_struct_rcu_user(struct task_struct *task);
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif
#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
/*
* If an architecture has not declared a thread_struct whitelist we
* must assume something there may need to be copied to userspace.
*/
static inline void arch_thread_struct_whitelist(unsigned long *offset,
unsigned long *size)
{
*offset = 0;
/* Handle dynamically sized thread_struct. */
*size = arch_task_struct_size - offsetof(struct task_struct, thread);
}
#endif
#ifdef CONFIG_VMAP_STACK
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
return t->stack_vm_area;
}
#else
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
return NULL;
}
#endif
/*
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
* pins the final release of task.io_context. Also protects ->cpuset and
* ->cgroup.subsys[]. And ->vfork_done.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),
* neither inside nor outside.
*/
static inline void task_lock(struct task_struct *p)
{
spin_lock(&p->alloc_lock);
}
static inline void task_unlock(struct task_struct *p)
{
spin_unlock(&p->alloc_lock);
}
#endif /* _LINUX_SCHED_TASK_H */