mirror of
https://github.com/Fishwaldo/Star64_linux.git
synced 2025-06-24 23:52:40 +00:00
fork: extend clone3() to support setting a PID
The main motivation to add set_tid to clone3() is CRIU.

To restore a process with the same PID/TID, CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls.

Extending clone3() to support *set_tid makes it possible to restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available).

This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as are currently in place for ns_last_pid.

The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details are also in the corresponding selftest.

To create a process with the following PIDs:

    PID NS level    Requested PID
    0 (host)        31496
    1               42
    2               1

For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be:

    set_tid[0] = 1;
    set_tid[1] = 42;
    set_tid[2] = 31496;
    set_tid_size = 3;

If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this:

    set_tid[0] = 1;
    set_tid[1] = 42;
    set_tid_size = 2;

The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1.

The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespace up to set_tid_size PID namespaces.

set_tid_size cannot be larger than the number of currently active nested PID namespaces, i.e. the current PID namespace level + 1.
Signed-off-by: Adrian Reber <areber@redhat.com> Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com> Reviewed-by: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin <avagin@gmail.com> Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
This commit is contained in:
parent
17a810699c
commit
49cb2fc42c
7 changed files with 121 additions and 36 deletions
70
kernel/pid.c
70
kernel/pid.c
|
@ -157,7 +157,8 @@ void free_pid(struct pid *pid)
|
|||
call_rcu(&pid->rcu, delayed_put_pid);
|
||||
}
|
||||
|
||||
struct pid *alloc_pid(struct pid_namespace *ns)
|
||||
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
||||
size_t set_tid_size)
|
||||
{
|
||||
struct pid *pid;
|
||||
enum pid_type type;
|
||||
|
@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns)
|
|||
struct upid *upid;
|
||||
int retval = -ENOMEM;
|
||||
|
||||
/*
|
||||
* set_tid_size contains the size of the set_tid array. Starting at
|
||||
* the most nested currently active PID namespace it tells alloc_pid()
|
||||
* which PID to set for a process in that most nested PID namespace
|
||||
* up to set_tid_size PID namespaces. It does not have to set the PID
|
||||
* for a process in all nested PID namespaces but set_tid_size must
|
||||
* never be greater than the current ns->level + 1.
|
||||
*/
|
||||
if (set_tid_size > ns->level + 1)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
|
||||
if (!pid)
|
||||
return ERR_PTR(retval);
|
||||
|
@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns)
|
|||
pid->level = ns->level;
|
||||
|
||||
for (i = ns->level; i >= 0; i--) {
|
||||
int pid_min = 1;
|
||||
int tid = 0;
|
||||
|
||||
if (set_tid_size) {
|
||||
tid = set_tid[ns->level - i];
|
||||
|
||||
retval = -EINVAL;
|
||||
if (tid < 1 || tid >= pid_max)
|
||||
goto out_free;
|
||||
/*
|
||||
* Also fail if a PID != 1 is requested and
|
||||
* no PID 1 exists.
|
||||
*/
|
||||
if (tid != 1 && !tmp->child_reaper)
|
||||
goto out_free;
|
||||
retval = -EPERM;
|
||||
if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
|
||||
goto out_free;
|
||||
set_tid_size--;
|
||||
}
|
||||
|
||||
idr_preload(GFP_KERNEL);
|
||||
spin_lock_irq(&pidmap_lock);
|
||||
|
||||
/*
|
||||
* init really needs pid 1, but after reaching the maximum
|
||||
* wrap back to RESERVED_PIDS
|
||||
*/
|
||||
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
|
||||
pid_min = RESERVED_PIDS;
|
||||
if (tid) {
|
||||
nr = idr_alloc(&tmp->idr, NULL, tid,
|
||||
tid + 1, GFP_ATOMIC);
|
||||
/*
|
||||
* If ENOSPC is returned it means that the PID is
|
||||
* alreay in use. Return EEXIST in that case.
|
||||
*/
|
||||
if (nr == -ENOSPC)
|
||||
nr = -EEXIST;
|
||||
} else {
|
||||
int pid_min = 1;
|
||||
/*
|
||||
* init really needs pid 1, but after reaching the
|
||||
* maximum wrap back to RESERVED_PIDS
|
||||
*/
|
||||
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
|
||||
pid_min = RESERVED_PIDS;
|
||||
|
||||
/*
|
||||
* Store a null pointer so find_pid_ns does not find
|
||||
* a partially initialized PID (see below).
|
||||
*/
|
||||
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
|
||||
pid_max, GFP_ATOMIC);
|
||||
/*
|
||||
* Store a null pointer so find_pid_ns does not find
|
||||
* a partially initialized PID (see below).
|
||||
*/
|
||||
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
|
||||
pid_max, GFP_ATOMIC);
|
||||
}
|
||||
spin_unlock_irq(&pidmap_lock);
|
||||
idr_preload_end();
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue