Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  477
1 files changed, 271 insertions, 206 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09255ec8159c..53b8201b31eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
"pgmajfault",
};
+static const char * const mem_cgroup_lru_names[] = {
+ "inactive_anon",
+ "active_anon",
+ "inactive_file",
+ "active_file",
+ "unevictable",
+};
+
/*
* Per memcg event counter is incremented at every pagein/pageout. With THP,
* it will be incremated by the number of pages. This counter is used for
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
};
struct mem_cgroup_lru_info {
- struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+ struct mem_cgroup_per_node *nodeinfo[0];
};
/*
@@ -276,17 +284,6 @@ struct mem_cgroup {
*/
struct res_counter kmem;
/*
- * Per cgroup active and inactive list, similar to the
- * per zone LRU lists.
- */
- struct mem_cgroup_lru_info info;
- int last_scanned_node;
-#if MAX_NUMNODES > 1
- nodemask_t scan_nodes;
- atomic_t numainfo_events;
- atomic_t numainfo_updating;
-#endif
- /*
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
@@ -349,8 +346,29 @@ struct mem_cgroup {
/* Index in the kmem_cache->memcg_params->memcg_caches array */
int kmemcg_id;
#endif
+
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
+#endif
+ /*
+ * Per cgroup active and inactive list, similar to the
+ * per zone LRU lists.
+ *
+ * WARNING: This has to be the last element of the struct. Don't
+ * add new fields after this point.
+ */
+ struct mem_cgroup_lru_info info;
};
+static size_t memcg_size(void)
+{
+ return sizeof(struct mem_cgroup) +
+ nr_node_ids * sizeof(struct mem_cgroup_per_node);
+}
+
/* internal only representation about the status of kmem accounting. */
enum {
KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
/* Stuffs for move charges at task migration. */
/*
- * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
- * left-shifted bitmap of these types.
+ * Types of charges to be moved. "move_charge_at_immigrate" and
+ * "immigrate_flags" are treated as a left-shifted bitmap of these types.
*/
enum move_type {
MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
@@ -412,6 +430,7 @@ static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
+ unsigned long immigrate_flags;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
@@ -424,14 +443,12 @@ static struct move_charge_struct {
static bool move_anon(void)
{
- return test_bit(MOVE_CHARGE_TYPE_ANON,
- &mc.to->move_charge_at_immigrate);
+ return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
}
static bool move_file(void)
{
- return test_bit(MOVE_CHARGE_TYPE_FILE,
- &mc.to->move_charge_at_immigrate);
+ return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
}
/*
@@ -471,6 +488,13 @@ enum res_type {
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+/*
+ * The memcg_create_mutex will be held whenever a new cgroup is created.
+ * As a consequence, any change that needs to protect against new child cgroups
+ * appearing has to hold it as well.
+ */
+static DEFINE_MUTEX(memcg_create_mutex);
+
static void mem_cgroup_get(struct mem_cgroup *memcg);
static void mem_cgroup_put(struct mem_cgroup *memcg);
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
}
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
-int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
-{
- unsigned long active;
- unsigned long inactive;
-
- inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
-
- return (active > inactive);
-}
-
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
spin_unlock_irqrestore(&memcg->move_lock, *flags);
}
+#define K(x) ((x) << (PAGE_SHIFT-10))
/**
- * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
+ * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
*/
static char memcg_name[PATH_MAX];
int ret;
+ struct mem_cgroup *iter;
+ unsigned int i;
- if (!memcg || !p)
+ if (!p)
return;
rcu_read_lock();
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
}
rcu_read_unlock();
- printk(KERN_INFO "Task in %s killed", memcg_name);
+ pr_info("Task in %s killed", memcg_name);
rcu_read_lock();
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
/*
* Continues from above, so we don't need an KERN_ level
*/
- printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
+ pr_cont(" as a result of limit of %s\n", memcg_name);
done:
- printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
+ pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->res, RES_FAILCNT));
- printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
- "failcnt %llu\n",
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
- printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ pr_info("Memory cgroup stats");
+
+ rcu_read_lock();
+ ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
+ if (!ret)
+ pr_cont(" for %s", memcg_name);
+ rcu_read_unlock();
+ pr_cont(":");
+
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ continue;
+ pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+ K(mem_cgroup_read_stat(iter, i)));
+ }
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
+ K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
+
+ pr_cont("\n");
+ }
}
/*
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy)
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
+static void __init memcg_stock_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct memcg_stock_pcp *stock =
+ &per_cpu(memcg_stock, cpu);
+ INIT_WORK(&stock->work, drain_local_stock);
+ }
+}
+
/*
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
@@ -3030,7 +3081,9 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
if (memcg) {
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
- }
+ } else
+ s->memcg_params->is_root_cache = true;
+
return 0;
}
@@ -4389,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page)
pc = lookup_page_cgroup_used(page);
if (pc) {
- printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
- pc, pc->flags, pc->mem_cgroup);
+ pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
+ pc, pc->flags, pc->mem_cgroup);
}
}
#endif
@@ -4717,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
}
/*
+ * This mainly exists for tests during the setting of use_hierarchy. Since
+ * this is the very setting we are changing, the current hierarchy value
+ * is meaningless.
+ */
+static inline bool __memcg_has_children(struct mem_cgroup *memcg)
+{
+ struct cgroup *pos;
+
+ /* bounce at first found */
+ cgroup_for_each_child(pos, memcg->css.cgroup)
+ return true;
+ return false;
+}
+
+/*
+ * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
+ * to be already dead (as in mem_cgroup_force_empty, for instance). This differs
+ * from mem_cgroup_count_children(), in the sense that we don't really care how
+ * many children we have; we only need to know if we have any. It also counts
+ * any memcg without hierarchy as infertile.
+ */
+static inline bool memcg_has_children(struct mem_cgroup *memcg)
+{
+ return memcg->use_hierarchy && __memcg_has_children(memcg);
+}
+
+/*
* Reclaims as many pages from the given memcg as possible and moves
* the rest to the parent.
*
@@ -4786,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
if (parent)
parent_memcg = mem_cgroup_from_cont(parent);
- cgroup_lock();
+ mutex_lock(&memcg_create_mutex);
if (memcg->use_hierarchy == val)
goto out;
@@ -4801,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
*/
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
- if (list_empty(&cont->children))
+ if (!__memcg_has_children(memcg))
memcg->use_hierarchy = val;
else
retval = -EBUSY;
@@ -4809,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
retval = -EINVAL;
out:
- cgroup_unlock();
+ mutex_unlock(&memcg_create_mutex);
return retval;
}
@@ -4894,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
{
int ret = -EINVAL;
#ifdef CONFIG_MEMCG_KMEM
- bool must_inc_static_branch = false;
-
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
/*
* For simplicity, we won't allow this to be disabled. It also can't
@@ -4908,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
*
* After it first became limited, changes in the value of the limit are
* of course permitted.
- *
- * Taking the cgroup_lock is really offensive, but it is so far the only
- * way to guarantee that no children will appear. There are plenty of
- * other offenders, and they should all go away. Fine grained locking
- * is probably the way to go here. When we are fully hierarchical, we
- * can also get rid of the use_hierarchy check.
*/
- cgroup_lock();
+ mutex_lock(&memcg_create_mutex);
mutex_lock(&set_limit_mutex);
if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
- if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
- !list_empty(&cont->children))) {
+ if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
ret = -EBUSY;
goto out;
}
@@ -4931,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
goto out;
}
- must_inc_static_branch = true;
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * setting the active bit after the inc will guarantee no one
+ * starts accounting before all call sites are patched
+ */
+ memcg_kmem_set_active(memcg);
+
/*
* kmem charges can outlive the cgroup. In the case of slab
* pages, for instance, a page contain objects from various
@@ -4943,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
ret = res_counter_set_limit(&memcg->kmem, val);
out:
mutex_unlock(&set_limit_mutex);
- cgroup_unlock();
-
- /*
- * We are by now familiar with the fact that we can't inc the static
- * branch inside cgroup_lock. See disarm functions for details. A
- * worker here is overkill, but also wrong: After the limit is set, we
- * must start accounting right away. Since this operation can't fail,
- * we can safely defer it to here - no rollback will be needed.
- *
- * The boolean used to control this is also safe, because
- * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
- * able to set it to true;
- */
- if (must_inc_static_branch) {
- static_key_slow_inc(&memcg_kmem_enabled_key);
- /*
- * setting the active bit after the inc will guarantee no one
- * starts accounting before all call sites are patched
- */
- memcg_kmem_set_active(memcg);
- }
-
+ mutex_unlock(&memcg_create_mutex);
#endif
return ret;
}
+#ifdef CONFIG_MEMCG_KMEM
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
int ret = 0;
@@ -4977,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
goto out;
memcg->kmem_account_flags = parent->kmem_account_flags;
-#ifdef CONFIG_MEMCG_KMEM
/*
* When that happen, we need to disable the static branch only on those
* memcgs that enabled it. To achieve this, we would be forced to
@@ -5003,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
mutex_lock(&set_limit_mutex);
ret = memcg_update_cache_sizes(memcg);
mutex_unlock(&set_limit_mutex);
-#endif
out:
return ret;
}
+#endif /* CONFIG_MEMCG_KMEM */
/*
* The user of this function is...
@@ -5146,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
if (val >= (1 << NR_MOVE_TYPE))
return -EINVAL;
+
/*
- * We check this value several times in both in can_attach() and
- * attach(), so we need cgroup lock to prevent this value from being
- * inconsistent.
+ * No kind of locking is needed in here, because ->can_attach() will
+ * check this value once in the beginning of the process, and then carry
+ * on with stale data. This means that changes to this value will only
+ * affect task migrations starting after the change.
*/
- cgroup_lock();
memcg->move_charge_at_immigrate = val;
- cgroup_unlock();
-
return 0;
}
#else
@@ -5212,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
}
#endif /* CONFIG_NUMA */
-static const char * const mem_cgroup_lru_names[] = {
- "inactive_anon",
- "active_anon",
- "inactive_file",
- "active_file",
- "unevictable",
-};
-
static inline void mem_cgroup_lru_names_not_uptodate(void)
{
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -5333,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
parent = mem_cgroup_from_cont(cgrp->parent);
- cgroup_lock();
+ mutex_lock(&memcg_create_mutex);
/* If under hierarchy, only empty-root can set this value */
- if ((parent->use_hierarchy) ||
- (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
- cgroup_unlock();
+ if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+ mutex_unlock(&memcg_create_mutex);
return -EINVAL;
}
memcg->swappiness = val;
- cgroup_unlock();
+ mutex_unlock(&memcg_create_mutex);
return 0;
}
@@ -5670,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
parent = mem_cgroup_from_cont(cgrp->parent);
- cgroup_lock();
+ mutex_lock(&memcg_create_mutex);
/* oom-kill-disable is a flag for subhierarchy. */
- if ((parent->use_hierarchy) ||
- (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
- cgroup_unlock();
+ if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+ mutex_unlock(&memcg_create_mutex);
return -EINVAL;
}
memcg->oom_kill_disable = val;
if (!val)
memcg_oom_recover(memcg);
- cgroup_unlock();
+ mutex_unlock(&memcg_create_mutex);
return 0;
}
@@ -5795,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = {
.read_seq_string = memcg_numa_stat_show,
},
#endif
-#ifdef CONFIG_MEMCG_SWAP
- {
- .name = "memsw.usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read = mem_cgroup_read,
- .register_event = mem_cgroup_usage_register_event,
- .unregister_event = mem_cgroup_usage_unregister_event,
- },
- {
- .name = "memsw.max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
- .trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
- },
- {
- .name = "memsw.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
- .write_string = mem_cgroup_write,
- .read = mem_cgroup_read,
- },
- {
- .name = "memsw.failcnt",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
- .trigger = mem_cgroup_reset,
- .read = mem_cgroup_read,
- },
-#endif
#ifdef CONFIG_MEMCG_KMEM
{
.name = "kmem.limit_in_bytes",
@@ -5856,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = {
{ }, /* terminate */
};
+#ifdef CONFIG_MEMCG_SWAP
+static struct cftype memsw_cgroup_files[] = {
+ {
+ .name = "memsw.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+ .read = mem_cgroup_read,
+ .register_event = mem_cgroup_usage_register_event,
+ .unregister_event = mem_cgroup_usage_unregister_event,
+ },
+ {
+ .name = "memsw.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "memsw.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "memsw.failcnt",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+ { }, /* terminate */
+};
+#endif
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -5894,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- int size = sizeof(struct mem_cgroup);
+ size_t size = memcg_size();
- /* Can be very big if MAX_NUMNODES is very big */
+ /* Can be very big if nr_node_ids is very big */
if (size < PAGE_SIZE)
memcg = kzalloc(size, GFP_KERNEL);
else
@@ -5933,7 +5981,7 @@ out_free:
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- int size = sizeof(struct mem_cgroup);
+ size_t size = memcg_size();
mem_cgroup_remove_from_trees(memcg);
free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6015,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
-#ifdef CONFIG_MEMCG_SWAP
-static void __init enable_swap_cgroup(void)
-{
- if (!mem_cgroup_disabled() && really_do_swap_account)
- do_swap_account = 1;
-}
-#else
-static void __init enable_swap_cgroup(void)
-{
-}
-#endif
-
-static int mem_cgroup_soft_limit_tree_init(void)
+static void __init mem_cgroup_soft_limit_tree_init(void)
{
struct mem_cgroup_tree_per_node *rtpn;
struct mem_cgroup_tree_per_zone *rtpz;
@@ -6038,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
- if (!rtpn)
- goto err_cleanup;
+ BUG_ON(!rtpn);
soft_limit_tree.rb_tree_per_node[node] = rtpn;
@@ -6049,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
spin_lock_init(&rtpz->lock);
}
}
- return 0;
-
-err_cleanup:
- for_each_node(node) {
- if (!soft_limit_tree.rb_tree_per_node[node])
- break;
- kfree(soft_limit_tree.rb_tree_per_node[node]);
- soft_limit_tree.rb_tree_per_node[node] = NULL;
- }
- return 1;
-
}
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup *cont)
{
- struct mem_cgroup *memcg, *parent;
+ struct mem_cgroup *memcg;
long error = -ENOMEM;
int node;
@@ -6079,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont)
/* root ? */
if (cont->parent == NULL) {
- int cpu;
- enable_swap_cgroup();
- parent = NULL;
- if (mem_cgroup_soft_limit_tree_init())
- goto free_out;
root_mem_cgroup = memcg;
- for_each_possible_cpu(cpu) {
- struct memcg_stock_pcp *stock =
- &per_cpu(memcg_stock, cpu);
- INIT_WORK(&stock->work, drain_local_stock);
- }
- } else {
- parent = mem_cgroup_from_cont(cont->parent);
- memcg->use_hierarchy = parent->use_hierarchy;
- memcg->oom_kill_disable = parent->oom_kill_disable;
+ res_counter_init(&memcg->res, NULL);
+ res_counter_init(&memcg->memsw, NULL);
+ res_counter_init(&memcg->kmem, NULL);
}
- if (parent && parent->use_hierarchy) {
+ memcg->last_scanned_node = MAX_NUMNODES;
+ INIT_LIST_HEAD(&memcg->oom_notify);
+ atomic_set(&memcg->refcnt, 1);
+ memcg->move_charge_at_immigrate = 0;
+ mutex_init(&memcg->thresholds_lock);
+ spin_lock_init(&memcg->move_lock);
+
+ return &memcg->css;
+
+free_out:
+ __mem_cgroup_free(memcg);
+ return ERR_PTR(error);
+}
+
+static int
+mem_cgroup_css_online(struct cgroup *cont)
+{
+ struct mem_cgroup *memcg, *parent;
+ int error = 0;
+
+ if (!cont->parent)
+ return 0;
+
+ mutex_lock(&memcg_create_mutex);
+ memcg = mem_cgroup_from_cont(cont);
+ parent = mem_cgroup_from_cont(cont->parent);
+
+ memcg->use_hierarchy = parent->use_hierarchy;
+ memcg->oom_kill_disable = parent->oom_kill_disable;
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+
+ if (parent->use_hierarchy) {
res_counter_init(&memcg->res, &parent->res);
res_counter_init(&memcg->memsw, &parent->memsw);
res_counter_init(&memcg->kmem, &parent->kmem);
@@ -6117,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
* much sense so let cgroup subsystem know about this
* unfortunate state in our controller.
*/
- if (parent && parent != root_mem_cgroup)
+ if (parent != root_mem_cgroup)
mem_cgroup_subsys.broken_hierarchy = true;
}
- memcg->last_scanned_node = MAX_NUMNODES;
- INIT_LIST_HEAD(&memcg->oom_notify);
-
- if (parent)
- memcg->swappiness = mem_cgroup_swappiness(parent);
- atomic_set(&memcg->refcnt, 1);
- memcg->move_charge_at_immigrate = 0;
- mutex_init(&memcg->thresholds_lock);
- spin_lock_init(&memcg->move_lock);
error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
+ mutex_unlock(&memcg_create_mutex);
if (error) {
/*
* We call put now because our (and parent's) refcnts
@@ -6138,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
* call __mem_cgroup_free, so return directly
*/
mem_cgroup_put(memcg);
- return ERR_PTR(error);
+ if (parent->use_hierarchy)
+ mem_cgroup_put(parent);
}
- return &memcg->css;
-free_out:
- __mem_cgroup_free(memcg);
- return ERR_PTR(error);
+ return error;
}
static void mem_cgroup_css_offline(struct cgroup *cont)
@@ -6279,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
- page = find_get_page(&swapper_space, ent.val);
+ page = find_get_page(swap_address_space(ent), ent.val);
if (do_swap_account)
entry->val = ent.val;
@@ -6320,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
swp_entry_t swap = radix_to_swp_entry(page);
if (do_swap_account)
*entry = swap;
- page = find_get_page(&swapper_space, swap.val);
+ page = find_get_page(swap_address_space(swap), swap.val);
}
#endif
return page;
@@ -6530,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
struct task_struct *p = cgroup_taskset_first(tset);
int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
+ unsigned long move_charge_at_immigrate;
- if (memcg->move_charge_at_immigrate) {
+ /*
+ * We are now committed to this value, whatever it is. Changes in this
+ * tunable will only affect upcoming migrations, not the current one.
+ * So we need to save it, and keep it going.
+ */
+ move_charge_at_immigrate = memcg->move_charge_at_immigrate;
+ if (move_charge_at_immigrate) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -6551,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
spin_lock(&mc.lock);
mc.from = from;
mc.to = memcg;
+ mc.immigrate_flags = move_charge_at_immigrate;
spin_unlock(&mc.lock);
/* We set mc.moving_task later */
@@ -6745,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,
.css_alloc = mem_cgroup_css_alloc,
+ .css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
.css_free = mem_cgroup_css_free,
.can_attach = mem_cgroup_can_attach,
@@ -6755,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
.use_id = 1,
};
-/*
- * The rest of init is performed during ->css_alloc() for root css which
- * happens before initcalls. hotcpu_notifier() can't be done together as
- * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
- * dependency. Do it from a subsys_initcall().
- */
-static int __init mem_cgroup_init(void)
-{
- hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
- return 0;
-}
-subsys_initcall(mem_cgroup_init);
-
#ifdef CONFIG_MEMCG_SWAP
static int __init enable_swap_account(char *s)
{
@@ -6780,4 +6810,39 @@ static int __init enable_swap_account(char *s)
}
__setup("swapaccount=", enable_swap_account);
+static void __init memsw_file_init(void)
+{
+ WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+}
+
+static void __init enable_swap_cgroup(void)
+{
+ if (!mem_cgroup_disabled() && really_do_swap_account) {
+ do_swap_account = 1;
+ memsw_file_init();
+ }
+}
+
+#else
+static void __init enable_swap_cgroup(void)
+{
+}
#endif
+
+/*
+ * subsys_initcall() for memory controller.
+ *
+ * Some parts like hotcpu_notifier() have to be initialized from this context
+ * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
+ * everything that doesn't depend on a specific mem_cgroup structure should
+ * be initialized from here.
+ */
+static int __init mem_cgroup_init(void)
+{
+ hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
+ enable_swap_cgroup();
+ mem_cgroup_soft_limit_tree_init();
+ memcg_stock_init();
+ return 0;
+}
+subsys_initcall(mem_cgroup_init);
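
Note: the struct reorganisation in this patch relies on struct mem_cgroup ending in a
zero-length nodeinfo[] array, with memcg_size() computing the allocation size from the
runtime nr_node_ids instead of the compile-time MAX_NUMNODES. Below is a minimal
userspace sketch of that trailing-array sizing pattern; fake_memcg, per_node_info,
fake_memcg_size() and the hard-coded node count are hypothetical stand-ins for
illustration only, not the kernel structures or the exact memcg_size() arithmetic.

	#include <stdio.h>
	#include <stdlib.h>

	/* Hypothetical stand-ins for the kernel structures. */
	struct per_node_info {
		unsigned long nr_pages;
	};

	struct fake_memcg {
		unsigned long flags;
		/* Must stay last: the per-node slots live right behind the struct. */
		struct per_node_info *nodeinfo[0];
	};

	static int nr_node_ids = 4;	/* would come from the NUMA topology */

	/* Same idea as memcg_size(): header plus one slot per possible node. */
	static size_t fake_memcg_size(void)
	{
		return sizeof(struct fake_memcg) +
			nr_node_ids * sizeof(struct per_node_info *);
	}

	int main(void)
	{
		struct fake_memcg *memcg = calloc(1, fake_memcg_size());
		int nid;

		if (!memcg)
			return 1;
		for (nid = 0; nid < nr_node_ids; nid++)
			memcg->nodeinfo[nid] = calloc(1, sizeof(struct per_node_info));

		printf("allocated %zu bytes for %d nodes\n",
		       fake_memcg_size(), nr_node_ids);

		for (nid = 0; nid < nr_node_ids; nid++)
			free(memcg->nodeinfo[nid]);
		free(memcg);
		return 0;
	}

Allocating only nr_node_ids slots (and guarding lookups with the new VM_BUG_ON in
mem_cgroup_zoneinfo) is what lets the patch drop the MAX_NUMNODES-sized array from
struct mem_cgroup while keeping per-node lookups a simple index into the tail array.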