Skip to content

Commit

Permalink
cgroup: don't hold css_set_rwsem across css task iteration
Browse files Browse the repository at this point in the history
css_sets are synchronized through css_set_rwsem but the locking scheme
is kinda bizarre.  The hot paths - fork and exit - have to write lock
the rwsem making the rw part pointless; furthermore, many readers
already hold cgroup_mutex.

One of the readers is css task iteration.  It read locks the rwsem
over the entire duration of iteration.  This leads to silly locking
behavior.  When cpuset tries to migrate processes of a cgroup to a
different NUMA node, css_set_rwsem is held across the entire migration
attempt which can take a long time locking out forking, exiting and
other cgroup operations.

This patch updates css task iteration so that it locks css_set_rwsem
only while the iterator is being advanced.  css task iteration
involves two levels - css_set and task iteration.  As css_sets in use
are practically immutable, simply pinning the current one is enough
for resuming iteration afterwards.  Task iteration is tricky as tasks
may leave their css_set while iteration is in progress.  This is
solved by keeping track of active iterators and advancing them if
their next task leaves its css_set.

v2: put_task_struct() in css_task_iter_next() moved outside
    css_set_rwsem.  A later patch will add cgroup operations to
    task_struct free path which may grab the same lock and this avoids
    deadlock possibilities.

    css_set_move_task() updated to use list_for_each_entry_safe() when
    walking task_iters and advancing them.  This is necessary as
    advancing an iter may remove it from the list.

Signed-off-by: Tejun Heo <tj@kernel.org>
  • Loading branch information
htejun committed Oct 15, 2015
1 parent ecb9d53 commit ed27b9f
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 14 deletions.
3 changes: 3 additions & 0 deletions include/linux/cgroup-defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,9 @@ struct css_set {
*/
struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];

/* all css_task_iters currently walking this cset */
struct list_head task_iters;

/* For RCU-protected deletion */
struct rcu_head rcu_head;
};
Expand Down
4 changes: 4 additions & 0 deletions include/linux/cgroup.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ struct css_task_iter {
struct list_head *task_pos;
struct list_head *tasks_head;
struct list_head *mg_tasks_head;

struct css_set *cur_cset;
struct task_struct *cur_task;
struct list_head iters_node; /* css_set->task_iters */
};

extern struct cgroup_root cgrp_dfl_root;
Expand Down
87 changes: 73 additions & 14 deletions kernel/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ static struct cftype cgroup_legacy_base_files[];

static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned long ss_mask);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
bool visible);
Expand Down Expand Up @@ -593,6 +594,7 @@ struct css_set init_css_set = {
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
};

static int css_set_count = 1; /* 1 for init_css_set */
Expand Down Expand Up @@ -675,8 +677,9 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
* This function automatically handles populated_cnt updates but the caller
* is responsible for managing @from_cset and @to_cset's reference counts.
* This function automatically handles populated_cnt updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
static void css_set_move_task(struct task_struct *task,
struct css_set *from_cset, struct css_set *to_cset,
Expand All @@ -685,7 +688,22 @@ static void css_set_move_task(struct task_struct *task,
lockdep_assert_held(&css_set_rwsem);

if (from_cset) {
struct css_task_iter *it, *pos;

WARN_ON_ONCE(list_empty(&task->cg_list));

/*
* @task is leaving, advance task iterators which are
* pointing to it so that they can resume at the next
* position. Advancing an iterator might remove it from
* the list, use safe walk. See css_task_iter_advance*()
* for details.
*/
list_for_each_entry_safe(it, pos, &from_cset->task_iters,
iters_node)
if (it->task_pos == &task->cg_list)
css_task_iter_advance(it);

list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
Expand Down Expand Up @@ -1019,6 +1037,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->mg_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
INIT_LIST_HEAD(&cset->task_iters);
INIT_HLIST_NODE(&cset->hlist);

/* Copy the set of subsystem state objects generated in
Expand Down Expand Up @@ -3804,6 +3823,8 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
struct cgrp_cset_link *link;
struct css_set *cset;

lockdep_assert_held(&css_set_rwsem);

/* Advance to the next non-empty css_set */
do {
l = l->next;
Expand Down Expand Up @@ -3831,12 +3852,36 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)

it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;

/*
* We don't keep css_sets locked across iteration steps and thus
* need to take steps to ensure that iteration can be resumed after
* the lock is re-acquired. Iteration is performed at two levels -
* css_sets and tasks in them.
*
* Once created, a css_set never leaves its cgroup lists, so a
* pinned css_set is guaranteed to stay put and we can resume
* iteration afterwards.
*
* Tasks may leave @cset across iteration steps. This is resolved
* by registering each iterator with the css_set currently being
* walked and making css_set_move_task() advance iterators whose
* next task is leaving.
*/
if (it->cur_cset) {
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
}
get_css_set(cset);
it->cur_cset = cset;
list_add(&it->iters_node, &cset->task_iters);
}

static void css_task_iter_advance(struct css_task_iter *it)
{
struct list_head *l = it->task_pos;

lockdep_assert_held(&css_set_rwsem);
WARN_ON_ONCE(!l);

/*
Expand Down Expand Up @@ -3864,19 +3909,16 @@ static void css_task_iter_advance(struct css_task_iter *it)
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*
* Note that this function acquires a lock which is released when the
* iteration finishes. The caller can't sleep while iteration is in
* progress.
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
__acquires(css_set_rwsem)
{
/* no one should try to iterate before mounting cgroups */
WARN_ON_ONCE(!use_task_css_set_links);

down_read(&css_set_rwsem);
memset(it, 0, sizeof(*it));

down_write(&css_set_rwsem);

it->ss = css->ss;

Expand All @@ -3888,6 +3930,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
it->cset_head = it->cset_pos;

css_task_iter_advance_css_set(it);

up_write(&css_set_rwsem);
}

/**
Expand All @@ -3900,14 +3944,22 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;

if (!it->cset_pos)
return NULL;

res = list_entry(it->task_pos, struct task_struct, cg_list);
if (it->cur_task)
put_task_struct(it->cur_task);

down_write(&css_set_rwsem);

it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list);
get_task_struct(it->cur_task);

css_task_iter_advance(it);
return res;

up_write(&css_set_rwsem);

return it->cur_task;
}

/**
Expand All @@ -3917,9 +3969,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
* Finish task iteration started by css_task_iter_start().
*/
void css_task_iter_end(struct css_task_iter *it)
__releases(css_set_rwsem)
{
up_read(&css_set_rwsem);
if (it->cur_cset) {
down_write(&css_set_rwsem);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
up_write(&css_set_rwsem);
}

if (it->cur_task)
put_task_struct(it->cur_task);
}

/**
Expand Down

0 comments on commit ed27b9f

Please sign in to comment.