clone3: allow spawning processes into cgroups

This adds support for creating a process in a different cgroup than its
parent. Callers can limit and account processes and threads right from
the moment they are spawned:
- A service manager can directly spawn new services into dedicated
  cgroups.
- A process can be directly created in a frozen cgroup and will be
  frozen as well.
- The initial accounting jitter experienced by process supervisors and
  daemons is eliminated with this.
- Threaded applications or even thread implementations can choose to
  create a specific cgroup layout where each thread is spawned
  directly into a dedicated cgroup.

This feature is limited to the unified hierarchy. Callers need to pass
a directory file descriptor for the target cgroup. The caller can
choose to pass an O_PATH file descriptor. All usual migration
restrictions apply, i.e. there can be no processes in inner nodes. In
general, creating a process directly in a target cgroup adheres to all
migration restrictions.

One of the biggest advantages of this feature is that CLONE_INTO_CGROUP does
not need to grab the write side of the cgroup_threadgroup_rwsem.
This global lock makes moving tasks/threads around super expensive. With
clone3() this lock is avoided.

Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: cgroups@vger.kernel.org
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
/* v1 */
Link: https://lore.kernel.org/r/20191218173516.7875-3-christian.brauner@ubuntu.com

/* v2 */
Link: https://lore.kernel.org/r/20191223061504.28716-3-christian.brauner@ubuntu.com
- Oleg Nesterov <oleg@redhat.com>:
  - prevent deadlock from wrong locking order
- Christian Brauner <christian.brauner@ubuntu.com>:
  - Rework locking. In the previous patch version we acquired
    cgroup_threadgroup_rwsem before grabbing the cgroup mutex that we
    need to hold when CLONE_INTO_CGROUP is specified. This meant we
    could deadlock with other codepaths that all take these locks in
    the opposite order. Fix this by first grabbing the cgroup mutex when
    CLONE_INTO_CGROUP is specified and then grabbing
    cgroup_threadgroup_rwsem unconditionally after. This way we don't
    require the cgroup mutex to be held in codepaths that don't need it.
  - Switch from mutex_lock() to mutex_lock_killable().

/* v3 */
Link: https://lore.kernel.org/r/20200117002143.15559-5-christian.brauner@ubuntu.com
- Tejun Heo <tj@kernel.org>:
  - s/mutex_lock_killable()/mutex_lock()/ because it should only ever
    be held for a short time:
    diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
    index a9fedcfeae4b..d68d3fb6af1d 100644
    --- a/kernel/cgroup/cgroup.c
    +++ b/kernel/cgroup/cgroup.c
    @@ -5927,11 +5927,8 @@ static int cgroup_css_set_fork(struct task_struct *parent,
            struct super_block *sb;
            struct file *f;

    -       if (kargs->flags & CLONE_INTO_CGROUP) {
    -               ret = mutex_lock_killable(&cgroup_mutex);
    -               if (ret)
    -                       return ret;
    -       }
    +       if (kargs->flags & CLONE_INTO_CGROUP)
    +               mutex_lock(&cgroup_mutex);

            cgroup_threadgroup_change_begin(parent);
  - s/task_cgroup_from_root/cset->dfl_cgrp/:
    diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
    index d68d3fb6af1d..3ceef006d144 100644
    --- a/kernel/cgroup/cgroup.c
    +++ b/kernel/cgroup/cgroup.c
    @@ -5922,7 +5922,7 @@ static int cgroup_css_set_fork(struct task_struct *parent,
            __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
     {
            int ret;
    -       struct cgroup *dst_cgrp = NULL, *src_cgrp;
    +       struct cgroup *dst_cgrp = NULL;
            struct css_set *cset;
            struct super_block *sb;
            struct file *f;
    @@ -5956,11 +5956,7 @@ static int cgroup_css_set_fork(struct task_struct *parent,
                    goto err;
            }

    -       spin_lock_irq(&css_set_lock);
    -       src_cgrp = task_cgroup_from_root(parent, &cgrp_dfl_cgrp);
    -       spin_unlock_irq(&css_set_lock);
    -
    -       ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, sb,
    +       ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
                                            !!(kargs->flags & CLONE_THREAD));
            if (ret)
                    goto err;
  - pass struct css_set instead of struct kernel_clone_args into cgroup
    fork subsystem callbacks:
    diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
    index cd848c6bac4a..058bb16d073f 100644
    --- a/include/linux/cgroup-defs.h
    +++ b/include/linux/cgroup-defs.h
    @@ -630,9 +630,8 @@ struct cgroup_subsys {
     	void (*attach)(struct cgroup_taskset *tset);
     	void (*post_attach)(void);
     	int (*can_fork)(struct task_struct *parent, struct task_struct *child,
    -			struct kernel_clone_args *kargs);
    -	void (*cancel_fork)(struct task_struct *child,
    -			    struct kernel_clone_args *kargs);
    +			struct css_set *cset);
    +	void (*cancel_fork)(struct task_struct *child, struct css_set *cset);
     	void (*fork)(struct task_struct *task);
     	void (*exit)(struct task_struct *task);
     	void (*release)(struct task_struct *task);
    diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
    index 3ceef006d144..2ac1c37a3fcb 100644
    --- a/kernel/cgroup/cgroup.c
    +++ b/kernel/cgroup/cgroup.c
    @@ -6044,7 +6044,7 @@ int cgroup_can_fork(struct task_struct *parent, struct task_struct *child,
     		return ret;

     	do_each_subsys_mask(ss, i, have_canfork_callback) {
    -		ret = ss->can_fork(parent, child, kargs);
    +		ret = ss->can_fork(parent, child, kargs->cset);
     		if (ret)
     			goto out_revert;
     	} while_each_subsys_mask();
    @@ -6056,7 +6056,7 @@ int cgroup_can_fork(struct task_struct *parent, struct task_struct *child,
     		if (j >= i)
     			break;
     		if (ss->cancel_fork)
    -			ss->cancel_fork(child, kargs);
    +			ss->cancel_fork(child, kargs->cset);
     	}

     	cgroup_css_set_put_fork(parent, kargs);
    @@ -6082,7 +6082,7 @@ void cgroup_cancel_fork(struct task_struct *parent, struct task_struct *child,

     	for_each_subsys(ss, i)
     		if (ss->cancel_fork)
    -			ss->cancel_fork(child, kargs);
    +			ss->cancel_fork(child, kargs->cset);

     	cgroup_css_set_put_fork(parent, kargs);
     }
    diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
    index e5955bc1fb00..4e7c8819c8df 100644
    --- a/kernel/cgroup/pids.c
    +++ b/kernel/cgroup/pids.c
    @@ -216,20 +216,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
      * on cgroup_threadgroup_change_begin() held by the copy_process().
      */
     static int pids_can_fork(struct task_struct *parent, struct task_struct *child,
    -			 struct kernel_clone_args *args)
    +			 struct css_set *cset)
     {
    -	struct css_set *new_cset = NULL;
     	struct cgroup_subsys_state *css;
     	struct pids_cgroup *pids;
     	int err;

    -	if (args)
    -		new_cset = args->cset;
    -
    -	if (!new_cset)
    -		css = task_css_check(current, pids_cgrp_id, true);
    +	if (cset)
    +		css = cset->subsys[pids_cgrp_id];
     	else
    -		css = new_cset->subsys[pids_cgrp_id];
    +		css = task_css_check(current, pids_cgrp_id, true);
     	pids = css_pids(css);
     	err = pids_try_charge(pids, 1);
     	if (err) {
    @@ -244,20 +240,15 @@ static int pids_can_fork(struct task_struct *parent, struct task_struct *child,
     	return err;
     }

    -static void pids_cancel_fork(struct task_struct *task,
    -			     struct kernel_clone_args *args)
    +static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
     {
    -	struct css_set *new_cset = NULL;
     	struct cgroup_subsys_state *css;
     	struct pids_cgroup *pids;

    -	if (args)
    -		new_cset = args->cset;
    -
    -	if (!new_cset)
    -		css = task_css_check(current, pids_cgrp_id, true);
    +	if (cset)
    +		css = cset->subsys[pids_cgrp_id];
     	else
    -		css = new_cset->subsys[pids_cgrp_id];
    +		css = task_css_check(current, pids_cgrp_id, true);
     	pids = css_pids(css);
     	pids_uncharge(pids, 1);
     }
- Michal Koutný <mkoutny@suse.com>:
  - update comment for cgroup_fork()
  - if both CLONE_NEWCGROUP and CLONE_INTO_CGROUP are requested, set the
    root_cset of the new cgroup namespace to the child's cset

/* v4 */
Link: https://lore.kernel.org/r/20200117181219.14542-6-christian.brauner@ubuntu.com
- Tejun Heo <tj@kernel.org>:
  - verify that we can write to the target cgroup since we're not going through
    the vfs layer which would do it for us
    diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
    index 61d1a6cd0059..6b38b2545667 100644
    --- a/kernel/cgroup/cgroup.c
    +++ b/kernel/cgroup/cgroup.c
    @@ -5966,6 +5966,15 @@ static int cgroup_css_set_fork(struct task_struct *parent,
                    goto err;
            }

    +       /*
    +        * Verify that we can the target cgroup is writable for us. This is
    +        * usally done by the vfs layer but since we're not going through the
    +        * vfs layer here we need to do it.
    +        */
    +       ret = cgroup_may_write(dst_cgrp, sb);
    +       if (ret)
    +               goto err;
    +
            ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
                                            !!(kargs->flags & CLONE_THREAD));
            if (ret)
/* v5 */
Link: https://lore.kernel.org/r/20200121154844.411-6-christian.brauner@ubuntu.com
- Oleg Nesterov <oleg@redhat.com>:
  - remove struct task_struct *parent argument from clone helpers in favor of
    using current directly
  - remove the cgroup_same_domain() helper
    diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
    index f4379401327a..4d36255ef25f 100644
    --- a/kernel/cgroup/cgroup.c
    +++ b/kernel/cgroup/cgroup.c
    @@ -4696,12 +4696,6 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
            return 0;
     }

    -static inline bool cgroup_same_domain(const struct cgroup *src_cgrp,
    -                                     const struct cgroup *dst_cgrp)
    -{
    -       return src_cgrp->dom_cgrp == dst_cgrp->dom_cgrp;
    -}
    -
     static int cgroup_attach_permissions(struct cgroup *src_cgrp,
                                         struct cgroup *dst_cgrp,
                                         struct super_block *sb, bool thread)
    @@ -4716,8 +4710,7 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
            if (ret)
                    return ret;

    -       if (thread &&
    -           !cgroup_same_domain(src_cgrp->dom_cgrp, dst_cgrp->dom_cgrp))
    +       if (thread && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
                    ret = -EOPNOTSUPP;

            return ret;
  - put kargs->cset on failure
    diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
    index 4d36255ef25f..482055d1e64a 100644
    --- a/kernel/cgroup/cgroup.c
    +++ b/kernel/cgroup/cgroup.c
    @@ -5994,6 +5994,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
            if (dst_cgrp)
                    cgroup_put(dst_cgrp);
            put_css_set(cset);
    +       if (kargs->cset)
    +               put_css_set(kargs->cset);
            return ret;
     }

/* v6 */
- Michal Koutný <mkoutny@suse.com>:
  - Move check whether cgroup is still alive right after getting it from the
    passed-in file descriptor:
    diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
    index 30a24ab3d74f..99bd4c1cea52 100644
    --- a/kernel/cgroup/cgroup.c
    +++ b/kernel/cgroup/cgroup.c
    @@ -5956,6 +5956,11 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
     		goto err;
     	}

    +	if (cgroup_is_dead(dst_cgrp)) {
    +		ret = -ENODEV;
    +		goto err;
    +	}
    +
     	/*
     	 * Verify that we the target cgroup is writable for us. This is
     	 * usually done by the vfs layer but since we're not going through
    @@ -5976,11 +5981,6 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
     		goto err;
     	}

    -	if (cgroup_is_dead(dst_cgrp)) {
    -		ret = -ENODEV;
    -		goto err;
    -	}
    -
     	put_css_set(cset);
     	fput(f);
     	kargs->cgrp = dst_cgrp;
    - put old root cset and get new root cset:
      diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
      index 99bd4c1cea52..2cb93b11ebf5 100644
      --- a/kernel/cgroup/cgroup.c
      +++ b/kernel/cgroup/cgroup.c
      @@ -6153,8 +6153,14 @@ void cgroup_post_fork(struct task_struct *child,
              }

              /* Make the new cset the root_cset of the new cgroup namespace. */
      -       if (kargs->flags & CLONE_NEWCGROUP)
      +
      +       if (kargs->flags & CLONE_NEWCGROUP) {
      +               struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
      +
      +               get_css_set(cset);
                      child->nsproxy->cgroup_ns->root_cset = cset;
      +               put_css_set(rcset);
      +       }
       }
    - use cgroup_css_set_put_fork() in cgroup_post_fork() too:
      diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
      index 2cb93b11ebf5..9b11e7f44686 100644
      --- a/kernel/cgroup/cgroup.c
      +++ b/kernel/cgroup/cgroup.c
      @@ -6104,9 +6104,12 @@ void cgroup_post_fork(struct task_struct *child,
       	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
       {
       	struct cgroup_subsys *ss;
      -	struct css_set *cset = kargs->cset;
      +	struct css_set *cset;
       	int i;

      +	cset = kargs->cset;
      +	kargs->cset = NULL;
      +
       	spin_lock_irq(&css_set_lock);

       	WARN_ON_ONCE(!list_empty(&child->cg_list));
      @@ -6143,15 +6146,6 @@ void cgroup_post_fork(struct task_struct *child,
       		ss->fork(child);
       	} while_each_subsys_mask();

      -	cgroup_threadgroup_change_end(current);
      -
      -	if (kargs->flags & CLONE_INTO_CGROUP) {
      -		mutex_unlock(&cgroup_mutex);
      -
      -		cgroup_put(kargs->cgrp);
      -		kargs->cgrp = NULL;
      -	}
      -
       	/* Make the new cset the root_cset of the new cgroup namespace. */

       	if (kargs->flags & CLONE_NEWCGROUP) {
      @@ -6161,6 +6155,8 @@ void cgroup_post_fork(struct task_struct *child,
       		child->nsproxy->cgroup_ns->root_cset = cset;
       		put_css_set(rcset);
       	}
      +
      +	cgroup_css_set_put_fork(kargs);
       }
- Christian Brauner <christian.brauner@ubuntu.com>:
  - remove struct kernel_clone_args forward declaration from cgroup-defs.h
    header. This was needed in a previous version of the series but isn't
    needed anymore.
    diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
    index 89d627abcbd6..68c391f451d1 100644
    --- a/include/linux/cgroup-defs.h
    +++ b/include/linux/cgroup-defs.h
    @@ -33,7 +33,6 @@ struct kernfs_ops;
     struct kernfs_open_file;
     struct seq_file;
     struct poll_table_struct;
    -struct kernel_clone_args;

     #define MAX_CGROUP_TYPE_NAMELEN 32
     #define MAX_CGROUP_ROOT_NAMELEN 64
7 files changed