Skip to content

Commit

Permalink
cgroup: introduce cgroup namespaces
Browse files Browse the repository at this point in the history
Introduce the ability to create a new cgroup namespace. The newly created
cgroup namespace remembers the cgroup of the process at the point
of creation of the cgroup namespace (referred to as cgroupns-root).
The main purpose of a cgroup namespace is to virtualize the contents
of the /proc/self/cgroup file. Processes inside a cgroup namespace
are only able to see paths relative to their namespace root
(unless they are moved outside of their cgroupns-root, at which point
 they will see a relative path from their cgroupns-root).
For a correctly set up container this enables container tools
(like libcontainer, lxc, lmctfy, etc.) to create completely virtualized
containers without leaking the system-level cgroup hierarchy to the task.
This patch only implements the 'unshare' part of the cgroupns.

Signed-off-by: Aditya Kali <adityakali@google.com>
Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
  • Loading branch information
adityakali authored and htejun committed Feb 16, 2016
1 parent 5e2bec7 commit a79a908
Show file tree
Hide file tree
Showing 8 changed files with 250 additions and 10 deletions.
3 changes: 3 additions & 0 deletions fs/proc/namespaces.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = {
&userns_operations,
#endif
&mntns_operations,
#ifdef CONFIG_CGROUPS
&cgroupns_operations,
#endif
};

static const char *proc_ns_get_link(struct dentry *dentry,
Expand Down
49 changes: 49 additions & 0 deletions include/linux/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/nsproxy.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>

#include <linux/cgroup-defs.h>

Expand Down Expand Up @@ -611,4 +616,48 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}

#endif /* CONFIG_CGROUP_DATA */

/*
 * A cgroup namespace.
 *
 * Records the css_set of the task that created the namespace so that
 * cgroup paths can be reported relative to that task's cgroups.
 */
struct cgroup_namespace {
/* Reference count; see get_cgroup_ns() and put_cgroup_ns(). */
atomic_t count;
/* Common namespace data (operations table and inode number). */
struct ns_common ns;
/* Owning user namespace; the reference is dropped in free_cgroup_ns(). */
struct user_namespace *user_ns;
/* css_set captured at creation; the reference is dropped in free_cgroup_ns(). */
struct css_set *root_cset;
};

extern struct cgroup_namespace init_cgroup_ns;

#ifdef CONFIG_CGROUPS

void free_cgroup_ns(struct cgroup_namespace *ns);

struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns);

char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);

#else /* !CONFIG_CGROUPS */

/* No-op stub used when CONFIG_CGROUPS is not set. */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
/*
 * Stub used when CONFIG_CGROUPS is not set: @flags and @user_ns are
 * ignored and @old_ns is handed back unchanged, without taking a
 * reference.
 */
static inline struct cgroup_namespace *
copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
return old_ns;
}

#endif /* !CONFIG_CGROUPS */

/*
 * Take a reference on @ns by incrementing its count.
 * A NULL @ns is silently ignored.
 */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
	if (!ns)
		return;

	atomic_inc(&ns->count);
}

/*
 * Drop a reference on @ns.  When the count reaches zero the namespace
 * is released through free_cgroup_ns().  A NULL @ns is silently ignored.
 */
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
	if (!ns)
		return;

	if (atomic_dec_and_test(&ns->count))
		free_cgroup_ns(ns);
}

#endif /* _LINUX_CGROUP_H */
2 changes: 2 additions & 0 deletions include/linux/nsproxy.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ struct mnt_namespace;
struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
struct cgroup_namespace;
struct fs_struct;

/*
Expand All @@ -33,6 +34,7 @@ struct nsproxy {
struct mnt_namespace *mnt_ns;
struct pid_namespace *pid_ns_for_children;
struct net *net_ns;
struct cgroup_namespace *cgroup_ns;
};
extern struct nsproxy init_nsproxy;

Expand Down
4 changes: 4 additions & 0 deletions include/linux/proc_ns.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
struct pid_namespace;
struct nsproxy;
struct path;
struct task_struct;
struct inode;

struct proc_ns_operations {
const char *name;
Expand All @@ -24,6 +26,7 @@ extern const struct proc_ns_operations ipcns_operations;
extern const struct proc_ns_operations pidns_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
extern const struct proc_ns_operations cgroupns_operations;

/*
* We always define these enumerators
Expand All @@ -34,6 +37,7 @@ enum {
PROC_UTS_INIT_INO = 0xEFFFFFFEU,
PROC_USER_INIT_INO = 0xEFFFFFFDU,
PROC_PID_INIT_INO = 0xEFFFFFFCU,
PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
};

#ifdef CONFIG_PROC_FS
Expand Down
173 changes: 170 additions & 3 deletions kernel/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/cpuset.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
#include <net/sock.h>

/*
Expand Down Expand Up @@ -212,6 +215,15 @@ static unsigned long have_fork_callback __read_mostly;
static unsigned long have_exit_callback __read_mostly;
static unsigned long have_free_callback __read_mostly;

/*
 * The initial cgroup namespace, used by the init task.
 *
 * It is rooted at init_css_set and owned by init_user_ns.
 * NOTE(review): the count starts at 2 rather than 1, presumably so this
 * statically allocated namespace is never released -- confirm.
 */
struct cgroup_namespace init_cgroup_ns = {
	.count		= { .counter = 2, },
	.ns		= {
		.ops	= &cgroupns_operations,
		.inum	= PROC_CGROUP_INIT_INO,
	},
	.user_ns	= &init_user_ns,
	.root_cset	= &init_css_set,
};

/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;

Expand Down Expand Up @@ -2177,6 +2189,35 @@ static struct file_system_type cgroup2_fs_type = {
.kill_sb = cgroup_kill_sb,
};

/*
 * Build the path of @cgrp as seen from cgroup namespace @ns.
 *
 * The namespace's root cgroup on @cgrp's hierarchy is looked up from
 * @ns->root_cset, and the path from that root to @cgrp is written into
 * @buf (at most @buflen bytes).
 *
 * The callers in this file invoke this with css_set_lock held;
 * cgroup_path_ns() is the wrapper that takes the locks itself.
 *
 * Returns @buf on success, or NULL if the path could not be generated
 * or does not fit in @buflen bytes.
 */
static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
				   struct cgroup_namespace *ns)
{
	struct cgroup *ns_root;
	int len;

	ns_root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
	len = kernfs_path_from_node(cgrp->kn, ns_root->kn, buf, buflen);

	return (len < 0 || len >= buflen) ? NULL : buf;
}

/*
 * Locked wrapper around cgroup_path_ns_locked(): takes cgroup_mutex and
 * then css_set_lock, builds the path of @cgrp relative to namespace @ns
 * into @buf, and releases the locks in reverse order.
 *
 * Returns @buf on success or NULL on failure.
 */
char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		     struct cgroup_namespace *ns)
{
	char *path;

	mutex_lock(&cgroup_mutex);
	spin_lock_bh(&css_set_lock);
	path = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
	spin_unlock_bh(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return path;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);

/**
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
Expand Down Expand Up @@ -2204,7 +2245,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)

if (root) {
cgrp = task_cgroup_from_root(task, root);
path = cgroup_path(cgrp, buf, buflen);
path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
if (strlcpy(buf, "/", buflen) < buflen)
Expand Down Expand Up @@ -5297,6 +5338,8 @@ int __init cgroup_init(void)
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));

get_user_ns(init_cgroup_ns.user_ns);

mutex_lock(&cgroup_mutex);

/* Add init_css_set to the hash table */
Expand Down Expand Up @@ -5438,7 +5481,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
* " (deleted)" is appended to the cgroup path.
*/
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
path = cgroup_path(cgrp, buf, PATH_MAX);
path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
if (!path) {
retval = -ENAMETOOLONG;
goto out_unlock;
Expand Down Expand Up @@ -5720,7 +5764,9 @@ static void cgroup_release_agent(struct work_struct *work)
if (!pathbuf || !agentbuf)
goto out;

path = cgroup_path(cgrp, pathbuf, PATH_MAX);
spin_lock_bh(&css_set_lock);
path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
spin_unlock_bh(&css_set_lock);
if (!path)
goto out;

Expand Down Expand Up @@ -5931,6 +5977,127 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)

#endif /* CONFIG_SOCK_CGROUP_DATA */

/* cgroup namespaces */

/*
 * Allocate a zeroed cgroup namespace, give it a namespace inode number,
 * and initialise its reference count to 1 and its operations table.
 * user_ns and root_cset are left for the caller to fill in.
 *
 * Returns the new namespace, or an ERR_PTR() on failure -- never NULL.
 */
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
	struct cgroup_namespace *cgns;
	int err;

	cgns = kzalloc(sizeof(*cgns), GFP_KERNEL);
	if (!cgns)
		return ERR_PTR(-ENOMEM);

	err = ns_alloc_inum(&cgns->ns);
	if (err) {
		kfree(cgns);
		return ERR_PTR(err);
	}

	cgns->ns.ops = &cgroupns_operations;
	atomic_set(&cgns->count, 1);

	return cgns;
}

/*
 * Release everything held by @ns and free it: the root css_set
 * reference, the user namespace reference, and the namespace inode
 * number.  put_cgroup_ns() calls this once the reference count has
 * dropped to zero.
 */
void free_cgroup_ns(struct cgroup_namespace *ns)
{
put_css_set(ns->root_cset);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
}
EXPORT_SYMBOL(free_cgroup_ns);

/*
 * copy_cgroup_ns - share or create a cgroup namespace
 * @flags: clone/unshare flags; only CLONE_NEWCGROUP is examined
 * @user_ns: user namespace that will own a newly created namespace
 * @old_ns: the caller's current cgroup namespace (must not be NULL)
 *
 * Without CLONE_NEWCGROUP, a reference is taken on @old_ns and it is
 * returned.  With CLONE_NEWCGROUP, a new namespace is created whose
 * root is the css_set of the current task; this requires CAP_SYS_ADMIN
 * in @user_ns.
 *
 * Returns the namespace to use, or an ERR_PTR() on failure (-EPERM if
 * the capability check fails, or the error from alloc_cgroup_ns()).
 */
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
					struct user_namespace *user_ns,
					struct cgroup_namespace *old_ns)
{
	struct cgroup_namespace *new_ns;
	struct css_set *cset;

	BUG_ON(!old_ns);

	if (!(flags & CLONE_NEWCGROUP)) {
		get_cgroup_ns(old_ns);
		return old_ns;
	}

	/* Allow only sysadmin to create cgroup namespace. */
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	/* Pin the current task's css_set; it becomes the namespace root. */
	mutex_lock(&cgroup_mutex);
	spin_lock_bh(&css_set_lock);

	cset = task_css_set(current);
	get_css_set(cset);

	spin_unlock_bh(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	/*
	 * alloc_cgroup_ns() reports failure with ERR_PTR(), never NULL, so
	 * it must be tested with IS_ERR().  On failure drop the css_set
	 * reference taken above and propagate the error as-is.
	 */
	new_ns = alloc_cgroup_ns();
	if (IS_ERR(new_ns)) {
		put_css_set(cset);
		return new_ns;
	}

	new_ns->user_ns = get_user_ns(user_ns);
	new_ns->root_cset = cset;

	return new_ns;
}

/* Convert an embedded ns_common pointer back to its cgroup_namespace. */
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
return container_of(ns, struct cgroup_namespace, ns);
}

/*
 * ->install() handler for cgroup namespaces.
 *
 * Installing a cgroup namespace is not implemented: both arguments are
 * ignored, a message is logged and -EINVAL is returned.
 */
static int cgroupns_install(struct nsproxy *nsproxy, void *ns)
{
	/* Terminate the message so it is not merged with later log output. */
	pr_info("setns not supported for cgroup namespace\n");
	return -EINVAL;
}

/*
 * ->get() handler: return @task's cgroup namespace with a reference
 * held, as a pointer to its embedded ns_common.
 *
 * The nsproxy is read under task_lock().  Returns NULL if @task has no
 * nsproxy or no cgroup namespace.
 */
static struct ns_common *cgroupns_get(struct task_struct *task)
{
	struct cgroup_namespace *cgns = NULL;
	struct nsproxy *proxy;

	task_lock(task);
	proxy = task->nsproxy;
	if (proxy) {
		cgns = proxy->cgroup_ns;
		get_cgroup_ns(cgns);
	}
	task_unlock(task);

	if (!cgns)
		return NULL;

	return &cgns->ns;
}

/* ->put() handler: drop the reference on the namespace embedding @ns. */
static void cgroupns_put(struct ns_common *ns)
{
	struct cgroup_namespace *cgns = to_cg_ns(ns);

	put_cgroup_ns(cgns);
}

/*
 * proc namespace operations for the "cgroup" namespace type.
 * .install is wired to cgroupns_install(), which always fails with
 * -EINVAL.
 */
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
.type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
};

/*
 * Initcall for cgroup namespaces, registered with subsys_initcall().
 * It currently performs no work and always returns 0.
 */
static __init int cgroup_namespaces_init(void)
{
return 0;
}
subsys_initcall(cgroup_namespaces_init);

#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
Expand Down
8 changes: 4 additions & 4 deletions kernel/cpuset.c
Original file line number Diff line number Diff line change
Expand Up @@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
goto out;

retval = -ENAMETOOLONG;
rcu_read_lock();
css = task_css(tsk, cpuset_cgrp_id);
p = cgroup_path(css->cgroup, buf, PATH_MAX);
rcu_read_unlock();
css = task_get_css(tsk, cpuset_cgrp_id);
p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
current->nsproxy->cgroup_ns);
css_put(css);
if (!p)
goto out_free;
seq_puts(m, p);
Expand Down
2 changes: 1 addition & 1 deletion kernel/fork.c
Original file line number Diff line number Diff line change
Expand Up @@ -1884,7 +1884,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
CLONE_NEWUSER|CLONE_NEWPID))
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
Expand Down
Loading

0 comments on commit a79a908

Please sign in to comment.