Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig  37
-rw-r--r--  mm/Makefile  5
-rw-r--r--  mm/backing-dev.c  50
-rw-r--r--  mm/balloon_compaction.c  302
-rw-r--r--  mm/bootmem.c  81
-rw-r--r--  mm/compaction.c  602
-rw-r--r--  mm/dmapool.c  55
-rw-r--r--  mm/fadvise.c  34
-rw-r--r--  mm/filemap.c  6
-rw-r--r--  mm/filemap_xip.c  10
-rw-r--r--  mm/fremap.c  19
-rw-r--r--  mm/frontswap.c  34
-rw-r--r--  mm/highmem.c  32
-rw-r--r--  mm/huge_memory.c  1056
-rw-r--r--  mm/hugetlb.c  97
-rw-r--r--  mm/hugetlb_cgroup.c  42
-rw-r--r--  mm/internal.h  64
-rw-r--r--  mm/interval_tree.c  112
-rw-r--r--  mm/kmemleak.c  109
-rw-r--r--  mm/ksm.c  77
-rw-r--r--  mm/madvise.c  8
-rw-r--r--  mm/memblock.c  29
-rw-r--r--  mm/memcontrol.c  1563
-rw-r--r--  mm/memory-failure.c  51
-rw-r--r--  mm/memory.c  351
-rw-r--r--  mm/memory_hotplug.c  500
-rw-r--r--  mm/mempolicy.c  493
-rw-r--r--  mm/migrate.c  438
-rw-r--r--  mm/mlock.c  27
-rw-r--r--  mm/mmap.c  781
-rw-r--r--  mm/mmu_notifier.c  89
-rw-r--r--  mm/mmzone.c  6
-rw-r--r--  mm/mprotect.c  151
-rw-r--r--  mm/mremap.c  75
-rw-r--r--  mm/nobootmem.c  24
-rw-r--r--  mm/nommu.c  54
-rw-r--r--  mm/oom_kill.c  140
-rw-r--r--  mm/page-writeback.c  25
-rw-r--r--  mm/page_alloc.c  609
-rw-r--r--  mm/page_cgroup.c  5
-rw-r--r--  mm/page_isolation.c  70
-rw-r--r--  mm/pagewalk.c  2
-rw-r--r--  mm/percpu.c  7
-rw-r--r--  mm/pgtable-generic.c  59
-rw-r--r--  mm/prio_tree.c  208
-rw-r--r--  mm/readahead.c  14
-rw-r--r--  mm/rmap.c  305
-rw-r--r--  mm/shmem.c  316
-rw-r--r--  mm/slab.c  631
-rw-r--r--  mm/slab.h  203
-rw-r--r--  mm/slab_common.c  438
-rw-r--r--  mm/slob.c  123
-rw-r--r--  mm/slub.c  601
-rw-r--r--  mm/sparse.c  35
-rw-r--r--  mm/swap.c  13
-rw-r--r--  mm/swapfile.c  46
-rw-r--r--  mm/truncate.c  3
-rw-r--r--  mm/util.c  37
-rw-r--r--  mm/vmalloc.c  9
-rw-r--r--  mm/vmscan.c  225
-rw-r--r--  mm/vmstat.c  44
61 files changed, 7994 insertions, 3638 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..278e3ab1f169 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,25 @@ config NO_BOOTMEM
config MEMORY_ISOLATION
boolean
+config MOVABLE_NODE
+ boolean "Enable to assign a node which has only movable memory"
+ depends on HAVE_MEMBLOCK
+ depends on NO_BOOTMEM
+ depends on X86_64
+ depends on NUMA
+ default n
+ help
+ Allow a node to have only movable memory. Pages used by the kernel,
+ such as direct mapping pages, cannot be migrated. So the corresponding
+ memory device cannot be hotplugged. This option allows users to
+ online all the memory of a node as movable memory so that the whole
+ node can be hotplugged. Users who don't use the memory hotplug
+ feature are fine with this option on since they don't online memory
+ as movable.
+
+ Say Y here if you want to hotplug a whole node.
+ Say N here if you want the kernel to use memory on all nodes evenly.
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
@@ -188,9 +207,25 @@ config SPLIT_PTLOCK_CPUS
default "4"
#
+# support for memory balloon compaction
+config BALLOON_COMPACTION
+ bool "Allow for balloon memory compaction/migration"
+ def_bool y
+ depends on COMPACTION && VIRTIO_BALLOON
+ help
+ Memory fragmentation introduced by ballooning might significantly
+ reduce the number of 2MB contiguous memory blocks that can be used
+ within a guest, thus imposing performance penalties associated with
+ the reduced number of transparent huge pages that could be used by
+ the guest workload. Allowing compaction and migration of memory
+ pages enlisted as part of memory balloon devices avoids this
+ scenario and helps improve memory defragmentation.
+
+#
# support for memory compaction
config COMPACTION
bool "Allow for memory compaction"
+ def_bool y
select MIGRATION
depends on MMU
help
@@ -318,7 +353,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on X86 && MMU
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
select COMPACTION
help
Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,10 @@ endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
- prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+ util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o $(mmu-y)
+ compaction.o balloon_compaction.o \
+ interval_tree.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b41823cc05e6..d3ca2b3ee176 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -158,16 +158,16 @@ static ssize_t read_ahead_kb_store(struct device *dev,
const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned long read_ahead_kb;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
- read_ahead_kb = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
- ret = count;
- }
- return ret;
+ ret = kstrtoul(buf, 10, &read_ahead_kb);
+ if (ret < 0)
+ return ret;
+
+ bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+
+ return count;
}
#define K(pages) ((pages) << (PAGE_SHIFT - 10))
@@ -187,16 +187,17 @@ static ssize_t min_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned int ratio;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
- ratio = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- ret = bdi_set_min_ratio(bdi, ratio);
- if (!ret)
- ret = count;
- }
return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)
@@ -205,16 +206,17 @@ static ssize_t max_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned int ratio;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
- ratio = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- ret = bdi_set_max_ratio(bdi, ratio);
- if (!ret)
- ret = count;
- }
return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)
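
The three sysfs store hooks above drop the open-coded simple_strtoul() end-pointer checks in favour of kstrtoul()/kstrtouint(), which reject trailing garbage on their own (an optional trailing newline is accepted). A minimal sketch of the pattern they now rely on; example_store() and its arguments are hypothetical, not part of the patch:

    #include <linux/kernel.h>

    /* "42" and "42\n" parse cleanly; "42x" or "" return -EINVAL and
     * out-of-range input returns -ERANGE, so no manual end-pointer
     * validation is needed. */
    static ssize_t example_store(const char *buf, size_t count,
                                 unsigned long *val)
    {
            int ret = kstrtoul(buf, 10, val);

            return ret ? ret : count;
    }
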
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
+/*
+ * mm/balloon_compaction.c
+ *
+ * Common interface for making balloon pages movable by compaction.
+ *
+ * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/balloon_compaction.h>
+
+/*
+ * balloon_devinfo_alloc - allocates a balloon device information descriptor.
+ * @balloon_dev_descriptor: pointer to reference the balloon device which
+ * this struct balloon_dev_info will be servicing.
+ *
+ * Driver must call it to properly allocate and initialize an instance of
+ * struct balloon_dev_info which will be used to reference a balloon device
+ * as well as to keep track of the balloon device page list.
+ */
+struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
+{
+ struct balloon_dev_info *b_dev_info;
+ b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
+ if (!b_dev_info)
+ return ERR_PTR(-ENOMEM);
+
+ b_dev_info->balloon_device = balloon_dev_descriptor;
+ b_dev_info->mapping = NULL;
+ b_dev_info->isolated_pages = 0;
+ spin_lock_init(&b_dev_info->pages_lock);
+ INIT_LIST_HEAD(&b_dev_info->pages);
+
+ return b_dev_info;
+}
+EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
+
+/*
+ * balloon_page_enqueue - allocates a new page and inserts it into the balloon
+ * page list.
+ * @b_dev_info: balloon device descriptor into which we will insert a new page
+ *
+ * Driver must call it to properly allocate a new enlisted balloon page
+ * before definitively removing it from the guest system.
+ * This function returns the page address for the recently enqueued page or
+ * NULL in the case we fail to allocate a new page this turn.
+ */
+struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
+{
+ unsigned long flags;
+ struct page *page = alloc_page(balloon_mapping_gfp_mask() |
+ __GFP_NOMEMALLOC | __GFP_NORETRY);
+ if (!page)
+ return NULL;
+
+ /*
+ * Block others from accessing the 'page' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'page' at this point.
+ */
+ BUG_ON(!trylock_page(page));
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ unlock_page(page);
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_enqueue);
+
+/*
+ * balloon_page_dequeue - removes a page from balloon's page list and returns
+ * its address to allow the driver to release the page.
+ * @b_dev_info: balloon device descriptor from which we will grab a page.
+ *
+ * Driver must call it to properly de-allocate a previously enlisted balloon page
+ * before definitively releasing it back to the guest system.
+ * This function returns the page address for the recently dequeued page or
+ * NULL in case we find the balloon's page list temporarily empty due to
+ * compaction-isolated pages.
+ */
+struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
+{
+ struct page *page, *tmp;
+ unsigned long flags;
+ bool dequeued_page;
+
+ dequeued_page = false;
+ list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+ /*
+ * Block others from accessing the 'page' while we get around to
+ * establishing additional references and preparing the 'page'
+ * to be released by the balloon driver.
+ */
+ if (trylock_page(page)) {
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ /*
+ * Raise the page refcount here to prevent any wrong
+ * attempt to isolate this page, in case of colliding
+ * with balloon_page_isolate() just after we release
+ * the page lock.
+ *
+ * balloon_page_free() will take care of dropping
+ * this extra refcount later.
+ */
+ get_page(page);
+ balloon_page_delete(page);
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ unlock_page(page);
+ dequeued_page = true;
+ break;
+ }
+ }
+
+ if (!dequeued_page) {
+ /*
+ * If we are unable to dequeue a balloon page because the page
+ * list is empty and there are no isolated pages, then something
+ * has gone wrong and some balloon pages are lost.
+ * BUG() here, otherwise the balloon driver may get stuck in
+ * an infinite loop while attempting to release all its pages.
+ */
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ if (unlikely(list_empty(&b_dev_info->pages) &&
+ !b_dev_info->isolated_pages))
+ BUG();
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ page = NULL;
+ }
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_dequeue);
+
+#ifdef CONFIG_BALLOON_COMPACTION
+/*
+ * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
+ * @b_dev_info: holds the balloon device information descriptor.
+ * @a_ops: balloon_mapping address_space_operations descriptor.
+ *
+ * Driver must call it to properly allocate and initialize an instance of
+ * struct address_space which will be used as the special page->mapping for
+ * balloon device enlisted page instances.
+ */
+struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
+ const struct address_space_operations *a_ops)
+{
+ struct address_space *mapping;
+
+ mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
+ if (!mapping)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Give a clean 'zeroed' status to all elements of this special
+ * balloon page->mapping struct address_space instance.
+ */
+ address_space_init_once(mapping);
+
+ /*
+ * Set mapping->flags appropriately, to allow balloon pages
+ * ->mapping identification.
+ */
+ mapping_set_balloon(mapping);
+ mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
+
+ /* balloon's page->mapping->a_ops callback descriptor */
+ mapping->a_ops = a_ops;
+
+ /*
+ * Establish a pointer reference back to the balloon device descriptor
+ * this particular page->mapping will be servicing.
+ * This is used by compaction / migration procedures to identify and
+ * access the balloon device pageset while isolating / migrating pages.
+ *
+ * As some balloon drivers can register multiple balloon devices
+ * for a single guest, this also helps compaction / migration to
+ * properly deal with multiple balloon pagesets, when required.
+ */
+ mapping->private_data = b_dev_info;
+ b_dev_info->mapping = mapping;
+
+ return mapping;
+}
+EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
+
+static inline void __isolate_balloon_page(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ unsigned long flags;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_del(&page->lru);
+ b_dev_info->isolated_pages++;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline void __putback_balloon_page(struct page *page)
+{
+ struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ unsigned long flags;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_add(&page->lru, &b_dev_info->pages);
+ b_dev_info->isolated_pages--;
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline int __migrate_balloon_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
+}
+
+/* __isolate_lru_page() counterpart for a ballooned page */
+bool balloon_page_isolate(struct page *page)
+{
+ /*
+ * Avoid burning cycles with pages that are still under __free_pages(),
+ * or just got freed under us.
+ *
+ * In case we 'win' a race for a balloon page being freed under us and
+ * raise its refcount, preventing __free_pages() from doing its job,
+ * the put_page() at the end of this block will take care of
+ * releasing this page, thus avoiding a nasty leak.
+ */
+ if (likely(get_page_unless_zero(page))) {
+ /*
+ * As balloon pages are not isolated from LRU lists, concurrent
+ * compaction threads can race against page migration functions
+ * as well as race against the balloon driver releasing a page.
+ *
+ * In order to avoid having an already isolated balloon page
+ * being (wrongly) re-isolated while it is under migration,
+ * or to avoid attempting to isolate pages being released by
+ * the balloon driver, let's be sure we have the page lock
+ * before proceeding with the balloon page isolation steps.
+ */
+ if (likely(trylock_page(page))) {
+ /*
+ * A ballooned page, by default, has just one refcount.
+ * Prevent concurrent compaction threads from isolating
+ * an already isolated balloon page by refcount check.
+ */
+ if (__is_movable_balloon_page(page) &&
+ page_count(page) == 2) {
+ __isolate_balloon_page(page);
+ unlock_page(page);
+ return true;
+ }
+ unlock_page(page);
+ }
+ put_page(page);
+ }
+ return false;
+}
+
+/* putback_lru_page() counterpart for a ballooned page */
+void balloon_page_putback(struct page *page)
+{
+ /*
+ * 'lock_page()' stabilizes the page and prevents races against
+ * concurrent isolation threads attempting to re-isolate it.
+ */
+ lock_page(page);
+
+ if (__is_movable_balloon_page(page)) {
+ __putback_balloon_page(page);
+ /* drop the extra ref count taken for page isolation */
+ put_page(page);
+ } else {
+ WARN_ON(1);
+ dump_page(page);
+ }
+ unlock_page(page);
+}
+
+/* move_to_new_page() counterpart for a ballooned page */
+int balloon_page_migrate(struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct address_space *mapping;
+ int rc = -EAGAIN;
+
+ /*
+ * Block others from accessing the 'newpage' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'newpage' at this point.
+ */
+ BUG_ON(!trylock_page(newpage));
+
+ if (WARN_ON(!__is_movable_balloon_page(page))) {
+ dump_page(page);
+ unlock_page(newpage);
+ return rc;
+ }
+
+ mapping = page->mapping;
+ if (mapping)
+ rc = __migrate_balloon_page(mapping, newpage, page, mode);
+
+ unlock_page(newpage);
+ return rc;
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
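
For context, a minimal driver-side sketch of how this new interface is meant to be consumed (the my_* names are invented for illustration and the success code may differ between kernel versions; the in-tree consumer of this series is the virtio_balloon driver):

    #include <linux/err.h>
    #include <linux/slab.h>
    #include <linux/migrate.h>
    #include <linux/balloon_compaction.h>

    static int my_balloon_migratepage(struct address_space *mapping,
                                      struct page *newpage, struct page *page,
                                      enum migrate_mode mode)
    {
            /* driver-specific work: tell the host about newpage, move any
             * per-page state across, then report success to migrate_pages() */
            return MIGRATEPAGE_BALLOON_SUCCESS;  /* assumed constant from this series */
    }

    static const struct address_space_operations my_balloon_aops = {
            .migratepage = my_balloon_migratepage,
    };

    static int my_balloon_setup(void *my_device)
    {
            struct balloon_dev_info *info;
            struct address_space *mapping;

            info = balloon_devinfo_alloc(my_device);
            if (IS_ERR(info))
                    return PTR_ERR(info);

            mapping = balloon_mapping_alloc(info, &my_balloon_aops);
            if (IS_ERR(mapping)) {
                    kfree(info);
                    return PTR_ERR(mapping);
            }
            return 0;
    }

    static void my_balloon_inflate_one(struct balloon_dev_info *info)
    {
            /* allocate and enlist a page; NULL means the allocation failed */
            struct page *page = balloon_page_enqueue(info);

            if (page) {
                    /* report page_to_pfn(page) to the host here */
            }
    }

    static void my_balloon_deflate_one(struct balloon_dev_info *info)
    {
            /* NULL only means the list is transiently empty (pages isolated
             * by compaction); do not treat it as "balloon is empty" */
            struct page *page = balloon_page_dequeue(info);

            if (page)
                    balloon_page_free(page);  /* drops the extra refcount */
    }
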
diff --git a/mm/bootmem.c b/mm/bootmem.c
index bcb63ac48cc5..1324cd74faec 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
/*
* free_bootmem_late - free bootmem pages directly to page allocator
- * @addr: starting address of the range
+ @physaddr: starting physical address of the range
* @size: size of the range in bytes
*
* This is only useful when the bootmem allocator has already been torn
* down, but we are still initializing the system. Pages are given directly
* to the page allocator, no bootmem metadata is updated because it is gone.
*/
-void __init free_bootmem_late(unsigned long addr, unsigned long size)
+void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
{
unsigned long cursor, end;
- kmemleak_free_part(__va(addr), size);
+ kmemleak_free_part(__va(physaddr), size);
- cursor = PFN_UP(addr);
- end = PFN_DOWN(addr + size);
+ cursor = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
for (; cursor < end; cursor++) {
__free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
+static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ /*
+ * In free_area_init_core(), highmem zone's managed_pages is set to
+ * present_pages, and bootmem allocator doesn't allocate from highmem
+ * zones. So there's no need to recalculate managed_pages because all
+ * highmem pages will be managed by the buddy system. Here highmem
+ * zone also includes highmem movable zone.
+ */
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ if (!is_highmem(z))
+ z->managed_pages = 0;
+}
+
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
* @pgdat: node to be released
@@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
+ reset_node_lowmem_managed_pages(pgdat);
return free_all_bootmem_core(pgdat->bdata);
}
@@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void)
{
unsigned long total_pages = 0;
bootmem_data_t *bdata;
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_lowmem_managed_pages(pgdat);
list_for_each_entry(bdata, &bdata_list, list)
total_pages += free_all_bootmem_core(bdata);
@@ -377,21 +398,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
/**
* free_bootmem - mark a page range as usable
- * @addr: starting address of the range
+ @physaddr: starting physical address of the range
* @size: size of the range in bytes
*
* Partial pages will be considered reserved and left as they are.
*
* The range must be contiguous but may span node boundaries.
*/
-void __init free_bootmem(unsigned long addr, unsigned long size)
+void __init free_bootmem(unsigned long physaddr, unsigned long size)
{
unsigned long start, end;
- kmemleak_free_part(__va(addr), size);
+ kmemleak_free_part(__va(physaddr), size);
- start = PFN_UP(addr);
- end = PFN_DOWN(addr + size);
+ start = PFN_UP(physaddr);
+ end = PFN_DOWN(physaddr + size);
mark_bootmem(start, end, 0, 0);
}
@@ -419,7 +440,7 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
}
/**
- * reserve_bootmem - mark a page range as usable
+ * reserve_bootmem - mark a page range as reserved
* @addr: starting address of the range
* @size: size of the range in bytes
* @flags: reservation flags (see linux/bootmem.h)
@@ -439,12 +460,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
return mark_bootmem(start, end, 1, flags);
}
-int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
- int flags)
-{
- return reserve_bootmem(phys, len, flags);
-}
-
static unsigned long __init align_idx(struct bootmem_data *bdata,
unsigned long idx, unsigned long step)
{
@@ -575,27 +590,6 @@ find_block:
return NULL;
}
-static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
- unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
-{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
-
-#ifdef CONFIG_HAVE_ARCH_BOOTMEM
- {
- bootmem_data_t *p_bdata;
-
- p_bdata = bootmem_arch_preferred_node(bdata, size, align,
- goal, limit);
- if (p_bdata)
- return alloc_bootmem_bdata(p_bdata, size, align,
- goal, limit);
- }
-#endif
- return NULL;
-}
-
static void * __init alloc_bootmem_core(unsigned long size,
unsigned long align,
unsigned long goal,
@@ -604,9 +598,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
bootmem_data_t *bdata;
void *region;
- region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
- if (region)
- return region;
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
list_for_each_entry(bdata, &bdata_list, list) {
if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +697,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
{
void *ptr;
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
again:
- ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
- align, goal, limit);
- if (ptr)
- return ptr;
/* do not panic in alloc_bootmem_bdata() */
if (limit && goal + size > limit)
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..5ad7f4f4d6f7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,6 +14,7 @@
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
+#include <linux/balloon_compaction.h>
#include "internal.h"
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -50,6 +51,111 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
+#ifdef CONFIG_COMPACTION
+/* Returns true if the pageblock should be scanned for pages to isolate. */
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ if (cc->ignore_skip_hint)
+ return true;
+
+ return !get_pageblock_skip(page);
+}
+
+/*
+ * This function is called to clear all cached information on pageblocks that
+ * should be skipped for page isolation when the migrate and free page scanner
+ * meet.
+ */
+static void __reset_isolation_suitable(struct zone *zone)
+{
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ unsigned long pfn;
+
+ zone->compact_cached_migrate_pfn = start_pfn;
+ zone->compact_cached_free_pfn = end_pfn;
+ zone->compact_blockskip_flush = false;
+
+ /* Walk the zone and mark every pageblock as suitable for isolation */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ struct page *page;
+
+ cond_resched();
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (zone != page_zone(page))
+ continue;
+
+ clear_pageblock_skip(page);
+ }
+}
+
+void reset_isolation_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ /* Only flush if a full compaction finished recently */
+ if (zone->compact_blockskip_flush)
+ __reset_isolation_suitable(zone);
+ }
+}
+
+/*
+ * If no pages were isolated then mark this pageblock to be skipped in the
+ * future. The information is later cleared by __reset_isolation_suitable().
+ */
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+ bool migrate_scanner)
+{
+ struct zone *zone = cc->zone;
+ if (!page)
+ return;
+
+ if (!nr_isolated) {
+ unsigned long pfn = page_to_pfn(page);
+ set_pageblock_skip(page);
+
+ /* Update where compaction should restart */
+ if (migrate_scanner) {
+ if (!cc->finished_update_migrate &&
+ pfn > zone->compact_cached_migrate_pfn)
+ zone->compact_cached_migrate_pfn = pfn;
+ } else {
+ if (!cc->finished_update_free &&
+ pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
+ }
+ }
+}
+#else
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ return true;
+}
+
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+ bool migrate_scanner)
+{
+}
+#endif /* CONFIG_COMPACTION */
+
+static inline bool should_release_lock(spinlock_t *lock)
+{
+ return need_resched() || spin_is_contended(lock);
+}
+
/*
* Compaction requires the taking of some coarse locks that are potentially
* very heavily contended. Check if the process needs to be scheduled or
@@ -62,7 +168,7 @@ static inline bool migrate_async_suitable(int migratetype)
static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
bool locked, struct compact_control *cc)
{
- if (need_resched() || spin_is_contended(lock)) {
+ if (should_release_lock(lock)) {
if (locked) {
spin_unlock_irqrestore(lock, *flags);
locked = false;
@@ -70,14 +176,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
/* async aborts if taking too long or contended */
if (!cc->sync) {
- if (cc->contended)
- *cc->contended = true;
+ cc->contended = true;
return false;
}
cond_resched();
- if (fatal_signal_pending(current))
- return false;
}
if (!locked)
@@ -91,44 +194,85 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
return compact_checklock_irqsave(lock, flags, false, cc);
}
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+ int migratetype = get_pageblock_migratetype(page);
+
+ /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+ return false;
+
+ /* If the page is a large free page, then allow migration */
+ if (PageBuddy(page) && page_order(page) >= pageblock_order)
+ return true;
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+ if (migrate_async_suitable(migratetype))
+ return true;
+
+ /* Otherwise skip the block */
+ return false;
+}
+
/*
* Isolate free pages onto a private freelist. Caller must hold zone->lock.
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
* pages inside of the pageblock (even though it may still end up isolating
* some pages).
*/
-static unsigned long isolate_freepages_block(unsigned long blockpfn,
+static unsigned long isolate_freepages_block(struct compact_control *cc,
+ unsigned long blockpfn,
unsigned long end_pfn,
struct list_head *freelist,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor;
+ struct page *cursor, *valid_page = NULL;
+ unsigned long nr_strict_required = end_pfn - blockpfn;
+ unsigned long flags;
+ bool locked = false;
cursor = pfn_to_page(blockpfn);
- /* Isolate free pages. This assumes the block is valid */
+ /* Isolate free pages. */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
int isolated, i;
struct page *page = cursor;
- if (!pfn_valid_within(blockpfn)) {
- if (strict)
- return 0;
- continue;
- }
nr_scanned++;
+ if (!pfn_valid_within(blockpfn))
+ continue;
+ if (!valid_page)
+ valid_page = page;
+ if (!PageBuddy(page))
+ continue;
- if (!PageBuddy(page)) {
- if (strict)
- return 0;
+ /*
+ * The zone lock must be held to isolate freepages.
+ * Unfortunately this is a very coarse lock and can be
+ * heavily contended if there are parallel allocations
+ * or parallel compactions. For async compaction, do not
+ * spin on the lock; we acquire the lock as late as
+ * possible.
+ */
+ locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
+ locked, cc);
+ if (!locked)
+ break;
+
+ /* Recheck this is a suitable migration target under lock */
+ if (!strict && !suitable_migration_target(page))
+ break;
+
+ /* Recheck this is a buddy page under lock */
+ if (!PageBuddy(page))
continue;
- }
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
if (!isolated && strict)
- return 0;
+ break;
total_isolated += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
@@ -143,6 +287,26 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
}
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
+
+ /*
+ * If strict isolation is requested by CMA then check that all the
+ * pages requested were isolated. If there were any failures, 0 is
+ * returned and CMA will fail.
+ */
+ if (strict && nr_strict_required > total_isolated)
+ total_isolated = 0;
+
+ if (locked)
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /* Update the pageblock-skip if the whole pageblock was scanned */
+ if (blockpfn == end_pfn)
+ update_pageblock_skip(cc, valid_page, total_isolated, false);
+
+ count_vm_events(COMPACTFREE_SCANNED, nr_scanned);
+ if (total_isolated)
+ count_vm_events(COMPACTISOLATED, total_isolated);
+
return total_isolated;
}
@@ -160,17 +324,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
* a free page).
*/
unsigned long
-isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long isolated, pfn, block_end_pfn, flags;
- struct zone *zone = NULL;
+ unsigned long isolated, pfn, block_end_pfn;
LIST_HEAD(freelist);
- if (pfn_valid(start_pfn))
- zone = page_zone(pfn_to_page(start_pfn));
-
for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
- if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
+ if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
break;
/*
@@ -180,10 +341,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- spin_lock_irqsave(&zone->lock, flags);
- isolated = isolate_freepages_block(pfn, block_end_pfn,
+ isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
&freelist, true);
- spin_unlock_irqrestore(&zone->lock, flags);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -253,6 +412,7 @@ static bool too_many_isolated(struct zone *zone)
* @cc: Compaction control structure.
* @low_pfn: The first PFN of the range.
* @end_pfn: The one-past-the-last PFN of the range.
+ * @unevictable: true if unevictable pages may be isolated
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). Returns zero if there is a fatal signal
@@ -268,7 +428,7 @@ static bool too_many_isolated(struct zone *zone)
*/
unsigned long
isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
- unsigned long low_pfn, unsigned long end_pfn)
+ unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
{
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -276,7 +436,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
isolate_mode_t mode = 0;
struct lruvec *lruvec;
unsigned long flags;
- bool locked;
+ bool locked = false;
+ struct page *page = NULL, *valid_page = NULL;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -296,23 +457,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* Time to isolate some pages for migration */
cond_resched();
- spin_lock_irqsave(&zone->lru_lock, flags);
- locked = true;
for (; low_pfn < end_pfn; low_pfn++) {
- struct page *page;
-
/* give a chance to irqs before checking need_resched() */
- if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- locked = false;
+ if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ if (should_release_lock(&zone->lru_lock)) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+ }
}
- /* Check if it is ok to still hold the lock */
- locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
- locked, cc);
- if (!locked)
- break;
-
/*
* migrate_pfn does not necessarily start aligned to a
* pageblock. Ensure that pfn_valid is called when moving
@@ -340,6 +493,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (page_zone(page) != zone)
continue;
+ if (!valid_page)
+ valid_page = page;
+
+ /* If isolation recently failed, do not retry */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
+
/* Skip if free */
if (PageBuddy(page))
continue;
@@ -349,24 +510,58 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
* migration is optimistic to see if the minimum amount of work
* satisfies the allocation
*/
- pageblock_nr = low_pfn >> pageblock_order;
if (!cc->sync && last_pageblock_nr != pageblock_nr &&
!migrate_async_suitable(get_pageblock_migratetype(page))) {
- low_pfn += pageblock_nr_pages;
- low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
- last_pageblock_nr = pageblock_nr;
- continue;
+ cc->finished_update_migrate = true;
+ goto next_pageblock;
}
- if (!PageLRU(page))
+ /*
+ * The check may be lockless but that's ok as we recheck later.
+ * It's possible to migrate LRU pages and balloon pages;
+ * skip any other type of page.
+ */
+ if (!PageLRU(page)) {
+ if (unlikely(balloon_page_movable(page))) {
+ if (locked && balloon_page_isolate(page)) {
+ /* Successfully isolated */
+ cc->finished_update_migrate = true;
+ list_add(&page->lru, migratelist);
+ cc->nr_migratepages++;
+ nr_isolated++;
+ goto check_compact_cluster;
+ }
+ }
continue;
+ }
/*
- * PageLRU is set, and lru_lock excludes isolation,
- * splitting and collapsing (collapsing has already
- * happened if PageLRU is set).
+ * PageLRU is set. lru_lock normally excludes isolation,
+ * splitting and collapsing (collapsing has already happened
+ * if PageLRU is set) but the lock is not necessarily taken
+ * here and it is wasteful to take it just to check transhuge.
+ * Check TransHuge without lock and skip the whole pageblock if
+ * it's either a transhuge or hugetlbfs page, as calling
+ * compound_order() without preventing THP from splitting the
+ * page underneath us may return surprising results.
*/
if (PageTransHuge(page)) {
+ if (!locked)
+ goto next_pageblock;
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
+ /* Check if it is ok to still hold the lock */
+ locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+ locked, cc);
+ if (!locked || fatal_signal_pending(current))
+ break;
+
+ /* Recheck PageLRU and PageTransHuge under lock */
+ if (!PageLRU(page))
+ continue;
+ if (PageTransHuge(page)) {
low_pfn += (1 << compound_order(page)) - 1;
continue;
}
@@ -374,6 +569,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (!cc->sync)
mode |= ISOLATE_ASYNC_MIGRATE;
+ if (unevictable)
+ mode |= ISOLATE_UNEVICTABLE;
+
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
@@ -383,16 +581,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
VM_BUG_ON(PageTransCompound(page));
/* Successfully isolated */
+ cc->finished_update_migrate = true;
del_page_from_lru_list(page, lruvec, page_lru(page));
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
+check_compact_cluster:
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}
+
+ continue;
+
+next_pageblock:
+ low_pfn += pageblock_nr_pages;
+ low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
}
acct_isolated(zone, locked, cc);
@@ -400,50 +607,21 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
+ /* Update the pageblock-skip if the whole pageblock was scanned */
+ if (low_pfn == end_pfn)
+ update_pageblock_skip(cc, valid_page, nr_isolated, true);
+
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+ count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+ if (nr_isolated)
+ count_vm_events(COMPACTISOLATED, nr_isolated);
+
return low_pfn;
}
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
-
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
-
- int migratetype = get_pageblock_migratetype(page);
-
- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
- if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
- return false;
-
- /* If the page is a large free page, then allow migration */
- if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return true;
-
- /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(migratetype))
- return true;
-
- /* Otherwise skip the block */
- return false;
-}
-
-/*
- * Returns the start pfn of the last page block in a zone. This is the starting
- * point for full compaction of a zone. Compaction searches for free pages from
- * the end of each zone, while isolate_freepages_block scans forward inside each
- * page block.
- */
-static unsigned long start_free_pfn(struct zone *zone)
-{
- unsigned long free_pfn;
- free_pfn = zone->zone_start_pfn + zone->spanned_pages;
- free_pfn &= ~(pageblock_nr_pages-1);
- return free_pfn;
-}
-
/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
@@ -453,7 +631,6 @@ static void isolate_freepages(struct zone *zone,
{
struct page *page;
unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
- unsigned long flags;
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
@@ -501,30 +678,24 @@ static void isolate_freepages(struct zone *zone,
if (!suitable_migration_target(page))
continue;
- /*
- * Found a block suitable for isolating free pages from. Now
- * we disabled interrupts, double check things are ok and
- * isolate the pages. This is to minimise the time IRQs
- * are disabled
- */
+ /* If isolation recently failed, do not retry */
+ if (!isolation_suitable(cc, page))
+ continue;
+
+ /* Found a block suitable for isolating free pages from */
isolated = 0;
/*
- * The zone lock must be held to isolate freepages. This
- * unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock
+ * As pfn may not start aligned, pfn+pageblock_nr_pages
+ * may cross a MAX_ORDER_NR_PAGES boundary and miss
+ * a pfn_valid check. Ensure isolate_freepages_block()
+ * only scans within a pageblock
*/
- if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
- break;
- if (suitable_migration_target(page)) {
- end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
- isolated = isolate_freepages_block(pfn, end_pfn,
- freelist, false);
- nr_freepages += isolated;
- }
- spin_unlock_irqrestore(&zone->lock, flags);
+ end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ end_pfn = min(end_pfn, zone_end_pfn);
+ isolated = isolate_freepages_block(cc, pfn, end_pfn,
+ freelist, false);
+ nr_freepages += isolated;
/*
* Record the highest PFN we isolated pages from. When next
@@ -532,17 +703,8 @@ static void isolate_freepages(struct zone *zone,
* page migration may have returned some pages to the allocator
*/
if (isolated) {
+ cc->finished_update_free = true;
high_pfn = max(high_pfn, pfn);
-
- /*
- * If the free scanner has wrapped, update
- * compact_cached_free_pfn to point to the highest
- * pageblock with free pages. This reduces excessive
- * scanning of full pageblocks near the end of the
- * zone
- */
- if (cc->order > 0 && cc->wrapped)
- zone->compact_cached_free_pfn = high_pfn;
}
}
@@ -551,11 +713,6 @@ static void isolate_freepages(struct zone *zone,
cc->free_pfn = high_pfn;
cc->nr_freepages = nr_freepages;
-
- /* If compact_cached_free_pfn is reset then set it now */
- if (cc->order > 0 && !cc->wrapped &&
- zone->compact_cached_free_pfn == start_free_pfn(zone))
- zone->compact_cached_free_pfn = high_pfn;
}
/*
@@ -633,8 +790,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
/* Perform the isolation */
- low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
- if (!low_pfn)
+ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
+ if (!low_pfn || cc->contended)
return ISOLATE_ABORT;
cc->migrate_pfn = low_pfn;
@@ -645,33 +802,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
- unsigned int order;
unsigned long watermark;
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
- /*
- * A full (order == -1) compaction run starts at the beginning and
- * end of a zone; it completes when the migrate and free scanner meet.
- * A partial (order > 0) compaction can start with the free scanner
- * at a random point in the zone, and may have to restart.
- */
+ /* Compaction run completes if the migrate and free scanner meet */
if (cc->free_pfn <= cc->migrate_pfn) {
- if (cc->order > 0 && !cc->wrapped) {
- /* We started partway through; restart at the end. */
- unsigned long free_pfn = start_free_pfn(zone);
- zone->compact_cached_free_pfn = free_pfn;
- cc->free_pfn = free_pfn;
- cc->wrapped = 1;
- return COMPACT_CONTINUE;
- }
- return COMPACT_COMPLETE;
- }
+ /*
+ * Mark that the PG_migrate_skip information should be cleared
+ * by kswapd when it goes to sleep. kswapd does not set the
+ * flag itself as the decision to clear it should be based
+ * directly on an allocation request.
+ */
+ if (!current_is_kswapd())
+ zone->compact_blockskip_flush = true;
- /* We wrapped around and ended up where we started. */
- if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
return COMPACT_COMPLETE;
+ }
/*
* order == -1 is expected when compacting via
@@ -688,14 +836,22 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
- for (order = cc->order; order < MAX_ORDER; order++) {
- /* Job done if page is free of the right migratetype */
- if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
- return COMPACT_PARTIAL;
-
- /* Job done if allocation would set block type */
- if (order >= pageblock_order && zone->free_area[order].nr_free)
+ if (cc->page) {
+ /* Was a suitable page captured? */
+ if (*cc->page)
return COMPACT_PARTIAL;
+ } else {
+ unsigned int order;
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[order];
+ /* Job done if page is free of the right migratetype */
+ if (!list_empty(&area->free_list[cc->migratetype]))
+ return COMPACT_PARTIAL;
+
+ /* Job done if allocation would set block type */
+ if (order >= pageblock_order && area->nr_free)
+ return COMPACT_PARTIAL;
+ }
}
return COMPACT_CONTINUE;
@@ -751,9 +907,65 @@ unsigned long compaction_suitable(struct zone *zone, int order)
return COMPACT_CONTINUE;
}
+static void compact_capture_page(struct compact_control *cc)
+{
+ unsigned long flags;
+ int mtype, mtype_low, mtype_high;
+
+ if (!cc->page || *cc->page)
+ return;
+
+ /*
+ * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
+ * regardless of the migratetype of the freelist it is captured from.
+ * This is fine because the order for a high-order MIGRATE_MOVABLE
+ * allocation is typically at least a pageblock size and overall
+ * fragmentation is not impaired. Other allocation types must
+ * capture pages from their own migratelist because otherwise they
+ * could pollute other pageblocks like MIGRATE_MOVABLE with
+ * difficult-to-move pages, making fragmentation worse overall.
+ */
+ if (cc->migratetype == MIGRATE_MOVABLE) {
+ mtype_low = 0;
+ mtype_high = MIGRATE_PCPTYPES;
+ } else {
+ mtype_low = cc->migratetype;
+ mtype_high = cc->migratetype + 1;
+ }
+
+ /* Speculatively examine the free lists without zone lock */
+ for (mtype = mtype_low; mtype < mtype_high; mtype++) {
+ int order;
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct page *page;
+ struct free_area *area;
+ area = &(cc->zone->free_area[order]);
+ if (list_empty(&area->free_list[mtype]))
+ continue;
+
+ /* Take the lock and attempt capture of the page */
+ if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
+ return;
+ if (!list_empty(&area->free_list[mtype])) {
+ page = list_entry(area->free_list[mtype].next,
+ struct page, lru);
+ if (capture_free_page(page, cc->order, mtype)) {
+ spin_unlock_irqrestore(&cc->zone->lock,
+ flags);
+ *cc->page = page;
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+ }
+}
+
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
ret = compaction_suitable(zone, cc->order);
switch (ret) {
@@ -766,18 +978,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
;
}
- /* Setup to move all movable pages to the end of the zone */
- cc->migrate_pfn = zone->zone_start_pfn;
-
- if (cc->order > 0) {
- /* Incremental compaction. Start where the last one stopped. */
- cc->free_pfn = zone->compact_cached_free_pfn;
- cc->start_free_pfn = cc->free_pfn;
- } else {
- /* Order == -1 starts at the end of the zone. */
- cc->free_pfn = start_free_pfn(zone);
+ /*
+ * Set up to move all movable pages to the end of the zone. Use cached
+ * information on where the scanners should start but check that it
+ * is initialised by ensuring the values are within zone boundaries.
+ */
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+ cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ zone->compact_cached_free_pfn = cc->free_pfn;
+ }
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ cc->migrate_pfn = start_pfn;
+ zone->compact_cached_migrate_pfn = cc->migrate_pfn;
}
+ /*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
+
migrate_prep_local();
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
@@ -787,6 +1011,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_PARTIAL;
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
continue;
@@ -797,26 +1023,26 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+ MR_COMPACTION);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
- count_vm_event(COMPACTBLOCKS);
- count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
- if (nr_remaining)
- count_vm_events(COMPACTPAGEFAILED, nr_remaining);
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
nr_remaining);
- /* Release LRU pages not migrated */
+ /* Release isolated pages not migrated */
if (err) {
- putback_lru_pages(&cc->migratepages);
+ putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
if (err == -ENOMEM) {
ret = COMPACT_PARTIAL;
goto out;
}
}
+
+ /* Capture a page now if it is a suitable size */
+ compact_capture_page(cc);
}
out:
@@ -829,8 +1055,10 @@ out:
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- bool sync, bool *contended)
+ bool sync, bool *contended,
+ struct page **page)
{
+ unsigned long ret;
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
@@ -838,12 +1066,18 @@ static unsigned long compact_zone_order(struct zone *zone,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
- .contended = contended,
+ .page = page,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- return compact_zone(zone, &cc);
+ ret = compact_zone(zone, &cc);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+
+ *contended = cc.contended;
+ return ret;
}
int sysctl_extfrag_threshold = 500;
@@ -855,12 +1089,14 @@ int sysctl_extfrag_threshold = 500;
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
* @sync: Whether migration is synchronous or not
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ * @page: Optionally capture a free page of the requested order during compaction
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- bool sync, bool *contended)
+ bool sync, bool *contended, struct page **page)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -868,28 +1104,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
+ int alloc_flags = 0;
- /*
- * Check whether it is worth even starting compaction. The order check is
- * made because an assumption is made that the page allocator can satisfy
- * the "cheaper" orders without taking special steps
- */
+ /* Check if the GFP flags allow compaction */
if (!order || !may_enter_fs || !may_perform_io)
return rc;
count_vm_event(COMPACTSTALL);
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
int status;
status = compact_zone_order(zone, order, gfp_mask, sync,
- contended);
+ contended, page);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
+ alloc_flags))
break;
}
@@ -940,6 +1178,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
struct compact_control cc = {
.order = order,
.sync = false,
+ .page = NULL,
};
return __compact_pgdat(pgdat, &cc);
@@ -950,6 +1189,7 @@ static int compact_node(int nid)
struct compact_control cc = {
.order = -1,
.sync = true,
+ .page = NULL,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
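
Taken together, the compaction changes above replace the old free-scanner wrapping logic with per-zone cached scanner positions plus a per-pageblock "skip" hint that is set when a block yields nothing and flushed once the scanners meet (or before retrying deferred compaction). A compressed, standalone sketch of that bookkeeping, simplified and using invented names rather than the real pageblock flags:

    #include <string.h>

    #define NR_BLOCKS    1024   /* pageblocks in a toy zone */
    #define BLOCK_PAGES   512   /* pages per pageblock */

    struct zone_hints {
            unsigned long cached_migrate_pfn;  /* migrate scanner resumes here */
            unsigned long cached_free_pfn;     /* free scanner resumes here */
            unsigned char skip[NR_BLOCKS];     /* "recently yielded nothing" */
    };

    /* After scanning a whole pageblock: only blocks that yielded no isolated
     * pages are marked to be skipped, and the cached resume positions are
     * pushed past them so the next attempt starts where this one gave up. */
    static void update_block_skip(struct zone_hints *h, unsigned long pfn,
                                  unsigned long nr_isolated, int migrate_scanner)
    {
            if (nr_isolated)
                    return;
            h->skip[pfn / BLOCK_PAGES] = 1;
            if (migrate_scanner && pfn > h->cached_migrate_pfn)
                    h->cached_migrate_pfn = pfn;
            else if (!migrate_scanner && pfn < h->cached_free_pfn)
                    h->cached_free_pfn = pfn;
    }

    /* When the scanners meet, or before retrying deferred compaction:
     * forget the hints and restart the scanners at the zone boundaries. */
    static void reset_hints(struct zone_hints *h, unsigned long zone_start_pfn,
                            unsigned long zone_end_pfn)
    {
            memset(h->skip, 0, sizeof(h->skip));
            h->cached_migrate_pfn = zone_start_pfn;
            h->cached_free_pfn = zone_end_pfn;
    }
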
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c5ab33bca0a8..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
size_t allocation;
size_t boundary;
char name[32];
- wait_queue_head_t waitq;
struct list_head pools;
};
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
unsigned int offset;
};
-#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
-
static DEFINE_MUTEX(pools_lock);
static ssize_t
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
- init_waitqueue_head(&retval->waitq);
if (dev) {
int ret;
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
#endif
pool_initialise_page(pool, page);
- list_add(&page->page_list, &pool->page_list);
page->in_use = 0;
page->offset = 0;
} else {
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
might_sleep_if(mem_flags & __GFP_WAIT);
spin_lock_irqsave(&pool->lock, flags);
- restart:
list_for_each_entry(page, &pool->page_list, page_list) {
if (page->offset < pool->allocation)
goto ready;
}
- page = pool_alloc_page(pool, GFP_ATOMIC);
- if (!page) {
- if (mem_flags & __GFP_WAIT) {
- DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_UNINTERRUPTIBLE);
- __add_wait_queue(&pool->waitq, &wait);
- spin_unlock_irqrestore(&pool->lock, flags);
+ /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+ spin_unlock_irqrestore(&pool->lock, flags);
- schedule_timeout(POOL_TIMEOUT_JIFFIES);
+ page = pool_alloc_page(pool, mem_flags);
+ if (!page)
+ return NULL;
- spin_lock_irqsave(&pool->lock, flags);
- __remove_wait_queue(&pool->waitq, &wait);
- goto restart;
- }
- retval = NULL;
- goto done;
- }
+ spin_lock_irqsave(&pool->lock, flags);
+ list_add(&page->page_list, &pool->page_list);
ready:
page->in_use++;
offset = page->offset;
@@ -346,9 +332,32 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
retval = offset + page->vaddr;
*handle = offset + page->dma;
#ifdef DMAPOOL_DEBUG
+ {
+ int i;
+ u8 *data = retval;
+ /* page->offset is stored in first 4 bytes */
+ for (i = sizeof(page->offset); i < pool->size; i++) {
+ if (data[i] == POOL_POISON_FREED)
+ continue;
+ if (pool->dev)
+ dev_err(pool->dev,
+ "dma_pool_alloc %s, %p (corruped)\n",
+ pool->name, retval);
+ else
+ pr_err("dma_pool_alloc %s, %p (corruped)\n",
+ pool->name, retval);
+
+ /*
+ * Dump the first 4 bytes even if they are not
+ * POOL_POISON_FREED
+ */
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+ data, pool->size, 1);
+ break;
+ }
+ }
memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
- done:
spin_unlock_irqrestore(&pool->lock, flags);
return retval;
}
@@ -435,8 +444,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
- if (waitqueue_active(&pool->waitq))
- wake_up_locked(&pool->waitq);
/*
* Resist a temptation to do
* if (!is_page_busy(page)) pool_free_page(pool, page);
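
The dma_pool_alloc() hunk above also replaces the old wait-queue/timeout retry with the standard drop-the-lock-and-allocate pattern. A generic sketch of that pattern with invented type and helper names (this is not the dmapool code itself):

    #include <linux/gfp.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct obj {
            struct list_head node;
            /* ... payload ... */
    };

    static DEFINE_SPINLOCK(pool_lock);
    static LIST_HEAD(pool_list);

    static struct obj *find_free_object_locked(void);    /* called under pool_lock */
    static struct obj *allocate_object(gfp_t mem_flags); /* may sleep */

    static struct obj *pool_get(gfp_t mem_flags)
    {
            struct obj *obj;
            unsigned long flags;

            spin_lock_irqsave(&pool_lock, flags);
            obj = find_free_object_locked();
            if (!obj) {
                    /* a sleeping allocation is illegal under a spinlock: drop it */
                    spin_unlock_irqrestore(&pool_lock, flags);
                    obj = allocate_object(mem_flags);
                    if (!obj)
                            return NULL;
                    /* retake the lock before publishing the new object */
                    spin_lock_irqsave(&pool_lock, flags);
                    list_add(&obj->node, &pool_list);
            }
            spin_unlock_irqrestore(&pool_lock, flags);
            return obj;
    }
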
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 9b75a045dbf4..a47f0f50c89f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,7 +26,7 @@
*/
SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
{
- struct file *file = fget(fd);
+ struct fd f = fdget(fd);
struct address_space *mapping;
struct backing_dev_info *bdi;
loff_t endbyte; /* inclusive */
@@ -35,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
unsigned long nrpages;
int ret = 0;
- if (!file)
+ if (!f.file)
return -EBADF;
- if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
+ if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
- mapping = file->f_mapping;
+ mapping = f.file->f_mapping;
if (!mapping || len < 0) {
ret = -EINVAL;
goto out;
@@ -76,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
switch (advice) {
case POSIX_FADV_NORMAL:
- file->f_ra.ra_pages = bdi->ra_pages;
- spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
- spin_unlock(&file->f_lock);
+ f.file->f_ra.ra_pages = bdi->ra_pages;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_RANDOM:
- spin_lock(&file->f_lock);
- file->f_mode |= FMODE_RANDOM;
- spin_unlock(&file->f_lock);
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode |= FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_SEQUENTIAL:
- file->f_ra.ra_pages = bdi->ra_pages * 2;
- spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
- spin_unlock(&file->f_lock);
+ f.file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_WILLNEED:
/* First and last PARTIAL page! */
@@ -106,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
* Ignore return value because fadvise() shall return
* success even if filesystem can't retrieve a hint,
*/
- force_page_cache_readahead(mapping, file, start_index,
+ force_page_cache_readahead(mapping, f.file, start_index,
nrpages);
break;
case POSIX_FADV_NOREUSE:
@@ -128,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
ret = -EINVAL;
}
out:
- fput(file);
+ fdput(f);
return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
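The fadvise change above is part of the tree-wide fget()/fput() to fdget()/fdput() conversion: fdget() returns a struct fd and can skip the reference-count bump when the file table is not shared, and fdput() only drops what was actually taken. A hedged sketch of the pattern in isolation; example_check_fd() is a made-up function, not kernel code.

#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>

static int example_check_fd(int fd)
{
        struct fd f = fdget(fd);
        int ret = 0;

        if (!f.file)
                return -EBADF;

        if (!f.file->f_mapping)
                ret = -EINVAL;

        /* drops the file reference only if fdget() actually took one */
        fdput(f);
        return ret;
}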
diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c37..83efee76a5c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
- if (likely(page)) {
+ if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
/*
* We found the page, so try async readahead before
* waiting for the lock.
*/
do_async_mmap_readahead(vma, ra, file, page, offset);
- } else {
+ } else if (!page) {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = filemap_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
/* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &generic_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270c..a912da6ddfd4 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
{
struct vm_area_struct *vma;
struct mm_struct *mm;
- struct prio_tree_iter iter;
unsigned long address;
pte_t *pte;
pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
retry:
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -193,11 +192,13 @@ retry:
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
page_remove_rmap(page);
dec_mm_counter(mm, MM_FILEPAGES);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
+ /* must invalidate_page _before_ freeing the page */
+ mmu_notifier_invalidate_page(mm, address);
page_cache_release(page);
}
}
@@ -305,6 +306,7 @@ out:
static const struct vm_operations_struct xip_file_vm_ops = {
.fault = xip_file_fault,
.page_mkwrite = filemap_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
file_accessed(file);
vma->vm_ops = &xip_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP;
return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 9ed4fd432467..a0aaf0e56800 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,6 +5,7 @@
*
* started by Ingo Molnar, Copyright (C) 2002, 2003
*/
+#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
return err;
}
-static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, unsigned long size, pgoff_t pgoff)
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, pgoff_t pgoff)
{
+ struct mm_struct *mm = vma->vm_mm;
int err;
do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
pgoff++;
} while (size);
- return 0;
-
+ return 0;
}
+EXPORT_SYMBOL(generic_file_remap_pages);
/**
* sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
goto out;
- if (!(vma->vm_flags & VM_CAN_NONLINEAR))
+ if (!vma->vm_ops || !vma->vm_ops->remap_pages)
goto out;
if (start < vma->vm_start || start + size > vma->vm_end)
@@ -195,10 +197,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
*/
if (mapping_cap_account_dirty(mapping)) {
unsigned long addr;
- struct file *file = vma->vm_file;
+ struct file *file = get_file(vma->vm_file);
flags &= MAP_NONBLOCK;
- get_file(file);
addr = mmap_region(file, start, size,
flags, vma->vm_flags, pgoff);
fput(file);
@@ -213,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
@@ -229,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
mmu_notifier_invalidate_range_start(mm, start, start + size);
- err = populate_range(mm, vma, start, size, pgoff);
+ err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
mmu_notifier_invalidate_range_end(mm, start, start + size);
if (!err && !(flags & MAP_NONBLOCK)) {
if (vma->vm_flags & VM_LOCKED) {
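With VM_CAN_NONLINEAR gone, support for remap_file_pages() is now advertised through the new ->remap_pages method, which sys_remap_file_pages() above calls in place of the old populate_range() helper. A sketch of how a 3.7-era ->mmap() implementation would wire this up; example_fault(), example_vm_ops and example_mmap() are placeholders, not kernel symbols.

#include <linux/fs.h>
#include <linux/mm.h>

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return VM_FAULT_SIGBUS;         /* stub, for illustration only */
}

static const struct vm_operations_struct example_vm_ops = {
        .fault          = example_fault,
        .remap_pages    = generic_file_remap_pages,
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
        file_accessed(file);
        vma->vm_ops = &example_vm_ops;  /* no VM_CAN_NONLINEAR flag needed */
        return 0;
}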
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 6b3e71a2cd48..2890e67d6026 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled);
*/
static bool frontswap_writethrough_enabled __read_mostly;
+/*
+ * If enabled, the underlying tmem implementation is capable of doing
+ * exclusive gets, so frontswap_load(), on a successful tmem_get, must
+ * mark the page as no longer in frontswap AND mark it dirty.
+ */
+static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
+
#ifdef CONFIG_DEBUG_FS
/*
* Counters available via /sys/kernel/debug/frontswap (if debugfs is
@@ -97,6 +104,15 @@ void frontswap_writethrough(bool enable)
EXPORT_SYMBOL(frontswap_writethrough);
/*
+ * Enable/disable frontswap exclusive gets (see above).
+ */
+void frontswap_tmem_exclusive_gets(bool enable)
+{
+ frontswap_tmem_exclusive_gets_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
+
+/*
* Called when a swap device is swapon'd.
*/
void __frontswap_init(unsigned type)
@@ -174,8 +190,13 @@ int __frontswap_load(struct page *page)
BUG_ON(sis == NULL);
if (frontswap_test(sis, offset))
ret = frontswap_ops.load(type, offset, page);
- if (ret == 0)
+ if (ret == 0) {
inc_frontswap_loads();
+ if (frontswap_tmem_exclusive_gets_enabled) {
+ SetPageDirty(page);
+ frontswap_clear(sis, offset);
+ }
+ }
return ret;
}
EXPORT_SYMBOL(__frontswap_load);
@@ -263,6 +284,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
return ret;
}
+/*
+ * Used to check whether it is necessary and feasible to unuse pages.
+ * Return 1 when there is nothing to do, 0 when pages need to be shrunk,
+ * error code when there is an error.
+ */
static int __frontswap_shrink(unsigned long target_pages,
unsigned long *pages_to_unuse,
int *type)
@@ -275,7 +301,7 @@ static int __frontswap_shrink(unsigned long target_pages,
if (total_pages <= target_pages) {
/* Nothing to do */
*pages_to_unuse = 0;
- return 0;
+ return 1;
}
total_pages_to_unuse = total_pages - target_pages;
return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
@@ -292,7 +318,7 @@ static int __frontswap_shrink(unsigned long target_pages,
void frontswap_shrink(unsigned long target_pages)
{
unsigned long pages_to_unuse = 0;
- int type, ret;
+ int uninitialized_var(type), ret;
/*
* we don't want to hold swap_lock while doing a very
@@ -302,7 +328,7 @@ void frontswap_shrink(unsigned long target_pages)
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
spin_unlock(&swap_lock);
- if (ret == 0 && pages_to_unuse)
+ if (ret == 0)
try_to_unuse(type, true, pages_to_unuse);
return;
}
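Exclusive gets change the contract of a successful load: the backend has dropped its copy, so __frontswap_load() above must mark the page dirty (so it is written back to swap if reclaimed again) and clear the frontswap map bit. A hedged sketch of how a tmem backend would opt in at init time; example_backend_init() is illustrative only, and the declaration of frontswap_tmem_exclusive_gets() is assumed to live in linux/frontswap.h.

#include <linux/frontswap.h>
#include <linux/init.h>

static int __init example_backend_init(void)
{
        /*
         * Our get is destructive: after a successful load the backend no
         * longer holds the page, so frontswap must mark it dirty and clear
         * its map bit -- which is what the __frontswap_load() hunk above
         * does once this flag is set.
         */
        frontswap_tmem_exclusive_gets(true);
        return 0;
}
device_initcall(example_backend_init);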
diff --git a/mm/highmem.c b/mm/highmem.c
index d517cd16a6eb..b32b70cdaed6 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -98,13 +98,14 @@ struct page *kmap_to_page(void *vaddr)
{
unsigned long addr = (unsigned long)vaddr;
- if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
- int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
+ if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
+ int i = PKMAP_NR(addr);
return pte_page(pkmap_page_table[i]);
}
return virt_to_page(addr);
}
+EXPORT_SYMBOL(kmap_to_page);
static void flush_all_zero_pkmaps(void)
{
@@ -137,8 +138,7 @@ static void flush_all_zero_pkmaps(void)
* So no dangers, even with speculative execution.
*/
page = pte_page(pkmap_page_table[i]);
- pte_clear(&init_mm, (unsigned long)page_address(page),
- &pkmap_page_table[i]);
+ pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
set_page_address(page, NULL);
need_flush = 1;
@@ -324,11 +324,7 @@ struct page_address_map {
struct list_head list;
};
-/*
- * page_address_map freelist, allocated from page_address_maps.
- */
-static struct list_head page_address_pool; /* freelist */
-static spinlock_t pool_lock; /* protects page_address_pool */
+static struct page_address_map page_address_maps[LAST_PKMAP];
/*
* Hash table bucket
@@ -393,14 +389,7 @@ void set_page_address(struct page *page, void *virtual)
pas = page_slot(page);
if (virtual) { /* Add */
- BUG_ON(list_empty(&page_address_pool));
-
- spin_lock_irqsave(&pool_lock, flags);
- pam = list_entry(page_address_pool.next,
- struct page_address_map, list);
- list_del(&pam->list);
- spin_unlock_irqrestore(&pool_lock, flags);
-
+ pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
pam->page = page;
pam->virtual = virtual;
@@ -413,9 +402,6 @@ void set_page_address(struct page *page, void *virtual)
if (pam->page == page) {
list_del(&pam->list);
spin_unlock_irqrestore(&pas->lock, flags);
- spin_lock_irqsave(&pool_lock, flags);
- list_add_tail(&pam->list, &page_address_pool);
- spin_unlock_irqrestore(&pool_lock, flags);
goto done;
}
}
@@ -425,20 +411,14 @@ done:
return;
}
-static struct page_address_map page_address_maps[LAST_PKMAP];
-
void __init page_address_init(void)
{
int i;
- INIT_LIST_HEAD(&page_address_pool);
- for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
- list_add(&page_address_maps[i].list, &page_address_pool);
for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
INIT_LIST_HEAD(&page_address_htable[i].lh);
spin_lock_init(&page_address_htable[i].lock);
}
- spin_lock_init(&pool_lock);
}
#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
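Two things change in the highmem hunks: kmap_to_page() now uses PKMAP_NR() and an exclusive upper bound (PKMAP_ADDR(LAST_PKMAP) is one past the last slot, so the old "<=" accepted an out-of-range address), and set_page_address() indexes the static page_address_maps[] array directly by PKMAP_NR(virtual) instead of maintaining a freelist under pool_lock. A small user-space model of the address/slot round trip; the PKMAP_BASE, PAGE_SHIFT and LAST_PKMAP values are illustrative only, the real ones are per-arch.

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define LAST_PKMAP      1024
#define PKMAP_BASE      0xff800000UL
#define PKMAP_NR(virt)  (((virt) - PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))

int main(void)
{
        /* an address somewhere inside the last pkmap slot */
        unsigned long addr = PKMAP_ADDR(LAST_PKMAP - 1) + 123;

        /* the fixed bound check: strictly below PKMAP_ADDR(LAST_PKMAP) */
        assert(addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP));

        /* PKMAP_NR() recovers the slot index kmap_to_page() now uses */
        assert(PKMAP_NR(addr) == LAST_PKMAP - 1);
        printf("slot %lu\n", PKMAP_NR(addr));
        return 0;
}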
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097..32754eece63e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,11 +12,15 @@
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
+#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
+#include <linux/pagemap.h>
+#include <linux/migrate.h>
+
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -36,7 +40,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
@@ -102,10 +107,7 @@ static int set_recommended_min_free_kbytes(void)
unsigned long recommended_min;
extern int min_free_kbytes;
- if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
- &transparent_hugepage_flags) &&
- !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
- &transparent_hugepage_flags))
+ if (!khugepaged_enabled())
return 0;
for_each_populated_zone(zone)
@@ -139,12 +141,6 @@ static int start_khugepaged(void)
{
int err = 0;
if (khugepaged_enabled()) {
- int wakeup;
- if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
- err = -ENOMEM;
- goto out;
- }
- mutex_lock(&khugepaged_mutex);
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -154,19 +150,90 @@ static int start_khugepaged(void)
err = PTR_ERR(khugepaged_thread);
khugepaged_thread = NULL;
}
- wakeup = !list_empty(&khugepaged_scan.mm_head);
- mutex_unlock(&khugepaged_mutex);
- if (wakeup)
+
+ if (!list_empty(&khugepaged_scan.mm_head))
wake_up_interruptible(&khugepaged_wait);
set_recommended_min_free_kbytes();
- } else
- /* wakeup to exit */
- wake_up_interruptible(&khugepaged_wait);
-out:
+ } else if (khugepaged_thread) {
+ kthread_stop(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+
return err;
}
+static atomic_t huge_zero_refcount;
+static unsigned long huge_zero_pfn __read_mostly;
+
+static inline bool is_huge_zero_pfn(unsigned long pfn)
+{
+ unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn);
+ return zero_pfn && pfn == zero_pfn;
+}
+
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_pfn(pmd_pfn(pmd));
+}
+
+static unsigned long get_huge_zero_page(void)
+{
+ struct page *zero_page;
+retry:
+ if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
+ return ACCESS_ONCE(huge_zero_pfn);
+
+ zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ HPAGE_PMD_ORDER);
+ if (!zero_page) {
+ count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
+ return 0;
+ }
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
+ preempt_disable();
+ if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) {
+ preempt_enable();
+ __free_page(zero_page);
+ goto retry;
+ }
+
+ /* We take an additional reference here; it is put back by the shrinker */
+ atomic_set(&huge_zero_refcount, 2);
+ preempt_enable();
+ return ACCESS_ONCE(huge_zero_pfn);
+}
+
+static void put_huge_zero_page(void)
+{
+ /*
+ * The counter should never drop to zero here; only the shrinker can
+ * put the last reference.
+ */
+ BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
+}
+
+static int shrink_huge_zero_page(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ if (!sc->nr_to_scan)
+ /* we can free zero page only if last reference remains */
+ return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+
+ if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
+ unsigned long zero_pfn = xchg(&huge_zero_pfn, 0);
+ BUG_ON(zero_pfn == 0);
+ __free_page(__pfn_to_page(zero_pfn));
+ }
+
+ return 0;
+}
+
+static struct shrinker huge_zero_page_shrinker = {
+ .shrink = shrink_huge_zero_page,
+ .seeks = DEFAULT_SEEKS,
+};
+
#ifdef CONFIG_SYSFS
static ssize_t double_flag_show(struct kobject *kobj,
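The huge zero page introduced in the hunk above is a lazily allocated singleton: readers take a reference with atomic_inc_not_zero(), the first allocator installs the pfn with cmpxchg() and seeds the count at 2 (one for the caller, one held back), and the shrinker may free the page only once the count has dropped back to that single held-back reference. A user-space C11 model of the same protocol; the names and the calloc()'d buffer standing in for the huge page are invented for illustration.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_int refcount;
static void *_Atomic singleton;

/* user-space stand-in for atomic_inc_not_zero() */
static int inc_not_zero(atomic_int *v)
{
        int old = atomic_load(v);

        while (old != 0)
                if (atomic_compare_exchange_weak(v, &old, old + 1))
                        return 1;
        return 0;
}

static void *get_singleton(void)
{
retry:
        /* fast path: the object exists, just take a reference */
        if (inc_not_zero(&refcount))
                return atomic_load(&singleton);

        /* slow path: allocate and try to install our copy */
        void *obj = calloc(1, 4096);    /* stands in for the zeroed huge page */
        void *expected = NULL;

        if (!obj)
                return NULL;
        if (!atomic_compare_exchange_strong(&singleton, &expected, obj)) {
                free(obj);              /* lost the install race */
                goto retry;
        }
        /* one reference for the caller, one held back for the "shrinker" */
        atomic_store(&refcount, 2);
        return obj;
}

static void put_singleton(void)
{
        /* callers never drop the last reference; only the shrinker does */
        if (atomic_fetch_sub(&refcount, 1) == 1)
                abort();
}

static void shrink(void)
{
        int expected = 1;

        /* free only when nothing but the held-back reference remains */
        if (atomic_compare_exchange_strong(&refcount, &expected, 0))
                free(atomic_exchange(&singleton, NULL));
}

int main(void)
{
        void *p = get_singleton();

        printf("zero object at %p\n", p);
        put_singleton();        /* back to 1: only the shrinker's reference left */
        shrink();               /* now the object really goes away */
        return 0;
}

Losing the install race simply frees the speculative allocation and retries, which is exactly how the kernel hunk resolves concurrent first allocations.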
@@ -224,18 +291,16 @@ static ssize_t enabled_store(struct kobject *kobj,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
if (ret > 0) {
- int err = start_khugepaged();
+ int err;
+
+ mutex_lock(&khugepaged_mutex);
+ err = start_khugepaged();
+ mutex_unlock(&khugepaged_mutex);
+
if (err)
ret = err;
}
- if (ret > 0 &&
- (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
- &transparent_hugepage_flags) ||
- test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
- &transparent_hugepage_flags)))
- set_recommended_min_free_kbytes();
-
return ret;
}
static struct kobj_attribute enabled_attr =
@@ -294,6 +359,20 @@ static ssize_t defrag_store(struct kobject *kobj,
static struct kobj_attribute defrag_attr =
__ATTR(defrag, 0644, defrag_show, defrag_store);
+static ssize_t use_zero_page_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static ssize_t use_zero_page_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+}
+static struct kobj_attribute use_zero_page_attr =
+ __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
#ifdef CONFIG_DEBUG_VM
static ssize_t debug_cow_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -315,6 +394,7 @@ static struct kobj_attribute debug_cow_attr =
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
+ &use_zero_page_attr.attr,
#ifdef CONFIG_DEBUG_VM
&debug_cow_attr.attr,
#endif
@@ -560,6 +640,8 @@ static int __init hugepage_init(void)
goto out;
}
+ register_shrinker(&huge_zero_page_shrinker);
+
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
@@ -570,8 +652,6 @@ static int __init hugepage_init(void)
start_khugepaged();
- set_recommended_min_free_kbytes();
-
return 0;
out:
hugepage_exit_sysfs(hugepage_kobj);
@@ -611,26 +691,22 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static void prepare_pmd_huge_pte(pgtable_t pgtable,
- struct mm_struct *mm)
-{
- assert_spin_locked(&mm->page_table_lock);
-
- /* FIFO */
- if (!mm->pmd_huge_pte)
- INIT_LIST_HEAD(&pgtable->lru);
- else
- list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
- mm->pmd_huge_pte = pgtable;
-}
-
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
pmd = pmd_mkwrite(pmd);
return pmd;
}
+static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+{
+ pmd_t entry;
+ entry = mk_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ return entry;
+}
+
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
@@ -654,9 +730,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pte_free(mm, pgtable);
} else {
pmd_t entry;
- entry = mk_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- entry = pmd_mkhuge(entry);
+ entry = mk_huge_pmd(page, vma);
/*
* The spinlocking to take the lru_lock inside
* page_add_new_anon_rmap() acts as a full memory
@@ -665,7 +739,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
page_add_new_anon_rmap(page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
- prepare_pmd_huge_pte(pgtable, mm);
+ pgtable_trans_huge_deposit(mm, pgtable);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm->nr_ptes++;
spin_unlock(&mm->page_table_lock);
@@ -696,6 +770,22 @@ static inline struct page *alloc_hugepage(int defrag)
}
#endif
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
+ unsigned long zero_pfn)
+{
+ pmd_t entry;
+ if (!pmd_none(*pmd))
+ return false;
+ entry = pfn_pmd(zero_pfn, vma->vm_page_prot);
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ pgtable_trans_huge_deposit(mm, pgtable);
+ mm->nr_ptes++;
+ return true;
+}
+
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
@@ -709,6 +799,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma)))
return VM_FAULT_OOM;
+ if (!(flags & FAULT_FLAG_WRITE) &&
+ transparent_hugepage_use_zero_page()) {
+ pgtable_t pgtable;
+ unsigned long zero_pfn;
+ bool set;
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable))
+ return VM_FAULT_OOM;
+ zero_pfn = get_huge_zero_page();
+ if (unlikely(!zero_pfn)) {
+ pte_free(mm, pgtable);
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+ spin_lock(&mm->page_table_lock);
+ set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+ zero_pfn);
+ spin_unlock(&mm->page_table_lock);
+ if (!set) {
+ pte_free(mm, pgtable);
+ put_huge_zero_page();
+ }
+ return 0;
+ }
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
vma, haddr, numa_node_id(), 0);
if (unlikely(!page)) {
@@ -735,7 +849,8 @@ out:
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -773,6 +888,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_free(dst_mm, pgtable);
goto out_unlock;
}
+ /*
+ * mm->page_table_lock is enough to be sure that the huge zero pmd is
+ * not under splitting, since we never split the zero page itself, only
+ * the pmd into a page table.
+ */
+ if (is_huge_zero_pmd(pmd)) {
+ unsigned long zero_pfn;
+ bool set;
+ /*
+ * get_huge_zero_page() will never allocate a new page here,
+ * since we already have a zero page to copy. It just takes a
+ * reference.
+ */
+ zero_pfn = get_huge_zero_page();
+ set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
+ zero_pfn);
+ BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */
+ ret = 0;
+ goto out_unlock;
+ }
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
spin_unlock(&src_mm->page_table_lock);
@@ -791,7 +926,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- prepare_pmd_huge_pte(pgtable, dst_mm);
+ pgtable_trans_huge_deposit(dst_mm, pgtable);
dst_mm->nr_ptes++;
ret = 0;
@@ -802,23 +937,100 @@ out:
return ret;
}
-/* no "address" argument so destroys page coloring of some arch */
-pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
+void huge_pmd_set_accessed(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ int dirty)
+{
+ pmd_t entry;
+ unsigned long haddr;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto unlock;
+
+ entry = pmd_mkyoung(orig_pmd);
+ haddr = address & HPAGE_PMD_MASK;
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
+ update_mmu_cache_pmd(vma, address, pmd);
+
+unlock:
+ spin_unlock(&mm->page_table_lock);
+}
+
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
{
pgtable_t pgtable;
+ pmd_t _pmd;
+ struct page *page;
+ int i, ret = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
- assert_spin_locked(&mm->page_table_lock);
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!page) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
- /* FIFO */
- pgtable = mm->pmd_huge_pte;
- if (list_empty(&pgtable->lru))
- mm->pmd_huge_pte = NULL;
- else {
- mm->pmd_huge_pte = list_entry(pgtable->lru.next,
- struct page, lru);
- list_del(&pgtable->lru);
+ if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
}
- return pgtable;
+
+ clear_user_highpage(page, address);
+ __SetPageUptodate(page);
+
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_page;
+
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ if (haddr == (address & PAGE_MASK)) {
+ entry = mk_pte(page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(page, vma, haddr);
+ } else {
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ }
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ spin_unlock(&mm->page_table_lock);
+ put_huge_zero_page();
+ inc_mm_counter(mm, MM_ANONPAGES);
+
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ ret |= VM_FAULT_WRITE;
+out:
+ return ret;
+out_free_page:
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ goto out;
}
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
@@ -832,6 +1044,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
pmd_t _pmd;
int ret = 0, i;
struct page **pages;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
GFP_KERNEL);
@@ -868,15 +1082,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
cond_resched();
}
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = get_pmd_huge_pte(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +1114,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
page_remove_rmap(page);
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
ret |= VM_FAULT_WRITE;
put_page(page);
@@ -904,6 +1124,7 @@ out:
out_free_pages:
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
mem_cgroup_uncharge_page(pages[i]);
@@ -918,29 +1139,33 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
int ret = 0;
- struct page *page, *new_page;
+ struct page *page = NULL, *new_page;
unsigned long haddr;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
VM_BUG_ON(!vma->anon_vma);
+ haddr = address & HPAGE_PMD_MASK;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto alloc;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
VM_BUG_ON(!PageCompound(page) || !PageHead(page));
- haddr = address & HPAGE_PMD_MASK;
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache_pmd(vma, address, pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_page(page);
spin_unlock(&mm->page_table_lock);
-
+alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -950,58 +1175,81 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!new_page)) {
count_vm_event(THP_FAULT_FALLBACK);
- ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
- pmd, orig_pmd, page, haddr);
- if (ret & VM_FAULT_OOM)
- split_huge_page(page);
- put_page(page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
+ address, pmd, orig_pmd, haddr);
+ } else {
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM)
+ split_huge_page(page);
+ put_page(page);
+ }
goto out;
}
count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
- split_huge_page(page);
- put_page(page);
+ if (page) {
+ split_huge_page(page);
+ put_page(page);
+ }
ret |= VM_FAULT_OOM;
goto out;
}
- copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ if (is_huge_zero_pmd(orig_pmd))
+ clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
+ else
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
spin_lock(&mm->page_table_lock);
- put_page(page);
+ if (page)
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
- goto out;
+ goto out_mn;
} else {
pmd_t entry;
- VM_BUG_ON(!PageHead(page));
- entry = mk_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- entry = pmd_mkhuge(entry);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ entry = mk_huge_pmd(new_page, vma);
+ pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
- update_mmu_cache(vma, address, entry);
- page_remove_rmap(page);
- put_page(page);
+ update_mmu_cache_pmd(vma, address, pmd);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ put_huge_zero_page();
+ } else {
+ VM_BUG_ON(!PageHead(page));
+ page_remove_rmap(page);
+ put_page(page);
+ }
ret |= VM_FAULT_WRITE;
}
-out_unlock:
spin_unlock(&mm->page_table_lock);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
}
-struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags)
{
+ struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +1272,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
}
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain();
+ if (page->mapping)
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON(!PageCompound(page));
if (flags & FOLL_GET)
@@ -1033,6 +1289,81 @@ out:
return page;
}
+/* NUMA hinting page fault entry point for trans huge pmds */
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int target_nid;
+ int current_nid = -1;
+ bool migrated;
+ bool page_locked = false;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ page = pmd_page(pmd);
+ get_page(page);
+ current_nid = page_to_nid(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == -1) {
+ put_page(page);
+ goto clear_pmdnuma;
+ }
+
+ /* Acquire the page lock to serialise THP migrations */
+ spin_unlock(&mm->page_table_lock);
+ lock_page(page);
+ page_locked = true;
+
+ /* Confirm the PMD did not change while the page lock was taken */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* Migrate the THP to the requested node */
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr,
+ page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+ else {
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ goto out_unlock;
+ }
+ goto clear_pmdnuma;
+ }
+
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+
+clear_pmdnuma:
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
+ VM_BUG_ON(pmd_numa(*pmdp));
+ update_mmu_cache_pmd(vma, addr, pmdp);
+ if (page_locked)
+ unlock_page(page);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ if (current_nid != -1)
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1041,17 +1372,24 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
struct page *page;
pgtable_t pgtable;
- pgtable = get_pmd_huge_pte(tlb->mm);
- page = pmd_page(*pmd);
- pmd_clear(pmd);
+ pmd_t orig_pmd;
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- page_remove_rmap(page);
- VM_BUG_ON(page_mapcount(page) < 0);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON(!PageHead(page));
- tlb->mm->nr_ptes--;
- spin_unlock(&tlb->mm->page_table_lock);
- tlb_remove_page(tlb, page);
+ if (is_huge_zero_pmd(orig_pmd)) {
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ put_huge_zero_page();
+ } else {
+ page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ }
pte_free(tlb->mm, pgtable);
ret = 1;
}
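zap_huge_pmd() and the other hunks in this file now call pgtable_trans_huge_withdraw()/pgtable_trans_huge_deposit() instead of the file-local get_pmd_huge_pte()/prepare_pmd_huge_pte() pair removed above. The generic helpers keep the same per-mm FIFO of deposited page tables; the sketch below mirrors the removed code rather than the exact mm/pgtable-generic.c implementation, and assumes the x86-style pgtable_t that is a struct page pointer.

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/spinlock.h>

static void example_deposit(struct mm_struct *mm, pgtable_t pgtable)
{
        assert_spin_locked(&mm->page_table_lock);

        /* FIFO: hang further page tables off the first deposited one */
        if (!mm->pmd_huge_pte)
                INIT_LIST_HEAD(&pgtable->lru);
        else
                list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
        mm->pmd_huge_pte = pgtable;
}

static pgtable_t example_withdraw(struct mm_struct *mm)
{
        pgtable_t pgtable = mm->pmd_huge_pte;

        assert_spin_locked(&mm->page_table_lock);

        if (list_empty(&pgtable->lru))
                mm->pmd_huge_pte = NULL;
        else {
                mm->pmd_huge_pte = list_entry(pgtable->lru.next,
                                              struct page, lru);
                list_del(&pgtable->lru);
        }
        return pgtable;
}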
@@ -1114,7 +1452,7 @@ out:
}
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot)
+ unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
@@ -1122,7 +1460,18 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
+ if (!prot_numa) {
+ entry = pmd_modify(entry, newprot);
+ BUG_ON(pmd_write(entry));
+ } else {
+ struct page *page = pmd_page(*pmd);
+
+ /* only check non-shared pages */
+ if (page_mapcount(page) == 1 &&
+ !pmd_numa(*pmd)) {
+ entry = pmd_mknuma(entry);
+ }
+ }
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
ret = 1;
@@ -1161,22 +1510,14 @@ pmd_t *page_check_address_pmd(struct page *page,
unsigned long address,
enum page_check_address_pmd_flag flag)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd, *ret = NULL;
if (address & ~HPAGE_PMD_MASK)
goto out;
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- goto out;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
goto out;
-
- pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
goto out;
if (pmd_page(*pmd) != page)
@@ -1207,7 +1548,11 @@ static int __split_huge_page_splitting(struct page *page,
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
int ret = 0;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = address;
+ const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1216,13 +1561,14 @@ static int __split_huge_page_splitting(struct page *page,
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->mutex to
+ * and it won't wait on the anon_vma->root->rwsem to
* serialize against split_huge_page*.
*/
- pmdp_splitting_flush_notify(vma, address, pmd);
+ pmdp_splitting_flush(vma, address, pmd);
ret = 1;
}
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return ret;
}
@@ -1306,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
+ page_xchg_last_nid(page_tail, page_last_nid(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -1358,11 +1705,11 @@ static int __split_huge_page_map(struct page *page,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
if (pmd) {
- pgtable = get_pmd_huge_pte(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
- for (i = 0, haddr = address; i < HPAGE_PMD_NR;
- i++, haddr += PAGE_SIZE) {
+ haddr = address;
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1373,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
BUG_ON(page_mapcount(page) != 1);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
+ if (pmd_numa(*pmd))
+ entry = pte_mknuma(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -1406,8 +1755,7 @@ static int __split_huge_page_map(struct page *page,
* SMP TLB and finally we write the non-huge version
* of the pmd entry with pmd_populate.
*/
- set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
}
@@ -1416,23 +1764,22 @@ static int __split_huge_page_map(struct page *page,
return ret;
}
-/* must be called with anon_vma->root->mutex hold */
+/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
int mapcount, mapcount2;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma_chain *avc;
BUG_ON(!PageHead(page));
BUG_ON(PageTail(page));
mapcount = 0;
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long addr = vma_address(page, vma);
BUG_ON(is_vma_temporary_stack(vma));
- if (addr == -EFAULT)
- continue;
mapcount += __split_huge_page_splitting(page, vma, addr);
}
/*
@@ -1453,12 +1800,10 @@ static void __split_huge_page(struct page *page,
__split_huge_page_refcount(page);
mapcount2 = 0;
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long addr = vma_address(page, vma);
BUG_ON(is_vma_temporary_stack(vma));
- if (addr == -EFAULT)
- continue;
mapcount2 += __split_huge_page_map(page, vma, addr);
}
if (mapcount != mapcount2)
@@ -1472,8 +1817,9 @@ int split_huge_page(struct page *page)
struct anon_vma *anon_vma;
int ret = 1;
+ BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
goto out;
ret = 0;
@@ -1486,17 +1832,18 @@ int split_huge_page(struct page *page)
BUG_ON(PageCompound(page));
out_unlock:
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
out:
return ret;
}
-#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
- VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
+ struct mm_struct *mm = vma->vm_mm;
+
switch (advice) {
case MADV_HUGEPAGE:
/*
@@ -1504,6 +1851,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
*/
if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
return -EINVAL;
+ if (mm->def_flags & VM_NOHUGEPAGE)
+ return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
@@ -1655,11 +2004,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- /*
- * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
- * true too, verify it here.
- */
- VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -1716,64 +2061,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
}
}
-static void release_all_pte_pages(pte_t *pte)
-{
- release_pte_pages(pte, pte + HPAGE_PMD_NR);
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
struct page *page;
pte_t *_pte;
- int referenced = 0, isolated = 0, none = 0;
+ int referenced = 0, none = 0;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval)) {
if (++none <= khugepaged_max_ptes_none)
continue;
- else {
- release_pte_pages(pte, _pte);
+ else
goto out;
- }
}
- if (!pte_present(pteval) || !pte_write(pteval)) {
- release_pte_pages(pte, _pte);
+ if (!pte_present(pteval) || !pte_write(pteval))
goto out;
- }
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page)) {
- release_pte_pages(pte, _pte);
+ if (unlikely(!page))
goto out;
- }
+
VM_BUG_ON(PageCompound(page));
BUG_ON(!PageAnon(page));
VM_BUG_ON(!PageSwapBacked(page));
/* cannot use mapcount: can't collapse if there's a gup pin */
- if (page_count(page) != 1) {
- release_pte_pages(pte, _pte);
+ if (page_count(page) != 1)
goto out;
- }
/*
* We can do it before isolate_lru_page because the
* page can't be freed from under us. NOTE: PG_lock
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page)) {
- release_pte_pages(pte, _pte);
+ if (!trylock_page(page))
goto out;
- }
/*
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
*/
if (isolate_lru_page(page)) {
unlock_page(page);
- release_pte_pages(pte, _pte);
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
@@ -1786,12 +2116,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
- if (unlikely(!referenced))
- release_all_pte_pages(pte);
- else
- isolated = 1;
+ if (likely(referenced))
+ return 1;
out:
- return isolated;
+ release_pte_pages(pte, _pte);
+ return 0;
}
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1833,28 +2162,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
}
}
-static void collapse_huge_page(struct mm_struct *mm,
- unsigned long address,
- struct page **hpage,
- struct vm_area_struct *vma,
- int node)
+static void khugepaged_alloc_sleep(void)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd, _pmd;
- pte_t *pte;
- pgtable_t pgtable;
- struct page *new_page;
- spinlock_t *ptl;
- int isolated;
- unsigned long hstart, hend;
+ wait_event_freezable_timeout(khugepaged_wait, false,
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-#ifndef CONFIG_NUMA
- up_read(&mm->mmap_sem);
- VM_BUG_ON(!*hpage);
- new_page = *hpage;
-#else
+#ifdef CONFIG_NUMA
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (IS_ERR(*hpage)) {
+ if (!*wait)
+ return false;
+
+ *wait = false;
+ *hpage = NULL;
+ khugepaged_alloc_sleep();
+ } else if (*hpage) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
+ return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ int node)
+{
VM_BUG_ON(*hpage);
/*
* Allocate the page while the vma is still valid and under
@@ -1866,7 +2202,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
node, __GFP_OTHER_NODE);
/*
@@ -1874,20 +2210,97 @@ static void collapse_huge_page(struct mm_struct *mm,
* preparation for taking it in write mode.
*/
up_read(&mm->mmap_sem);
- if (unlikely(!new_page)) {
+ if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
- return;
+ return NULL;
}
-#endif
count_vm_event(THP_COLLAPSE_ALLOC);
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
-#ifdef CONFIG_NUMA
- put_page(new_page);
+ return *hpage;
+}
+#else
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_hugepage(khugepaged_defrag());
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (!*wait)
+ return NULL;
+
+ *wait = false;
+ khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+ return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (!*hpage)
+ *hpage = khugepaged_alloc_hugepage(wait);
+
+ if (unlikely(!*hpage))
+ return false;
+
+ return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ int node)
+{
+ up_read(&mm->mmap_sem);
+ VM_BUG_ON(!*hpage);
+ return *hpage;
+}
#endif
+
+static bool hugepage_vma_check(struct vm_area_struct *vma)
+{
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
+ return false;
+
+ if (!vma->anon_vma || vma->vm_ops)
+ return false;
+ if (is_vma_temporary_stack(vma))
+ return false;
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ return true;
+}
+
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ struct vm_area_struct *vma,
+ int node)
+{
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *ptl;
+ int isolated;
+ unsigned long hstart, hend;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ /* release the mmap_sem read lock. */
+ new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+ if (!new_page)
+ return;
+
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
return;
- }
/*
* Prevent all access to pagetables with the exception of
@@ -1903,39 +2316,22 @@ static void collapse_huge_page(struct mm_struct *mm,
hend = vma->vm_end & HPAGE_PMD_MASK;
if (address < hstart || address + HPAGE_PMD_SIZE > hend)
goto out;
-
- if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE))
- goto out;
-
- if (!vma->anon_vma || vma->vm_ops)
- goto out;
- if (is_vma_temporary_stack(vma))
- goto out;
- /*
- * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
- * true too, verify it here.
- */
- VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
+ if (!hugepage_vma_check(vma))
goto out;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
goto out;
-
- pmd = pmd_offset(pud, address);
- /* pmd can't go away or become huge under us */
- if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ if (pmd_trans_huge(*pmd))
goto out;
- anon_vma_lock(vma->anon_vma);
+ anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
ptl = pte_lockptr(mm, pmd);
+ mmun_start = address;
+ mmun_end = address + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
@@ -1943,8 +2339,9 @@ static void collapse_huge_page(struct mm_struct *mm,
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+ _pmd = pmdp_clear_flush(vma, address, pmd);
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
spin_lock(ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,12 +2367,8 @@ static void collapse_huge_page(struct mm_struct *mm,
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
- VM_BUG_ON(page_count(pgtable) != 1);
- VM_BUG_ON(page_mapcount(pgtable) != 0);
- _pmd = mk_pmd(new_page, vma->vm_page_prot);
- _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
- _pmd = pmd_mkhuge(_pmd);
+ _pmd = mk_huge_pmd(new_page, vma);
/*
* spin_lock() below is not the equivalent of smp_wmb(), so
@@ -1988,13 +2381,12 @@ static void collapse_huge_page(struct mm_struct *mm,
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache(vma, address, _pmd);
- prepare_pmd_huge_pte(pgtable, mm);
+ update_mmu_cache_pmd(vma, address, pmd);
+ pgtable_trans_huge_deposit(mm, pgtable);
spin_unlock(&mm->page_table_lock);
-#ifndef CONFIG_NUMA
*hpage = NULL;
-#endif
+
khugepaged_pages_collapsed++;
out_up_write:
up_write(&mm->mmap_sem);
@@ -2002,9 +2394,6 @@ out_up_write:
out:
mem_cgroup_uncharge_page(new_page);
-#ifdef CONFIG_NUMA
- put_page(new_page);
-#endif
goto out_up_write;
}
@@ -2013,8 +2402,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
unsigned long address,
struct page **hpage)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte, *_pte;
int ret = 0, referenced = 0, none = 0;
@@ -2025,16 +2412,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
goto out;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- goto out;
-
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ if (pmd_trans_huge(*pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2142,25 +2523,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
break;
}
-
- if ((!(vma->vm_flags & VM_HUGEPAGE) &&
- !khugepaged_always()) ||
- (vma->vm_flags & VM_NOHUGEPAGE)) {
- skip:
+ if (!hugepage_vma_check(vma)) {
+skip:
progress++;
continue;
}
- if (!vma->anon_vma || vma->vm_ops)
- goto skip;
- if (is_vma_temporary_stack(vma))
- goto skip;
- /*
- * If is_pfn_mapping() is true is_learn_pfn_mapping()
- * must be true too, verify it here.
- */
- VM_BUG_ON(is_linear_pfn_mapping(vma) ||
- vma->vm_flags & VM_NO_THP);
-
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart >= hend)
@@ -2234,32 +2601,23 @@ static int khugepaged_has_work(void)
static int khugepaged_wait_event(void)
{
return !list_empty(&khugepaged_scan.mm_head) ||
- !khugepaged_enabled();
+ kthread_should_stop();
}
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_do_scan(void)
{
+ struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
unsigned int pages = khugepaged_pages_to_scan;
+ bool wait = true;
barrier(); /* write khugepaged_pages_to_scan to local stack */
while (progress < pages) {
- cond_resched();
-
-#ifndef CONFIG_NUMA
- if (!*hpage) {
- *hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- break;
- }
- count_vm_event(THP_COLLAPSE_ALLOC);
- }
-#else
- if (IS_ERR(*hpage))
+ if (!khugepaged_prealloc_page(&hpage, &wait))
break;
-#endif
+
+ cond_resched();
if (unlikely(kthread_should_stop() || freezing(current)))
break;
@@ -2270,73 +2628,32 @@ static void khugepaged_do_scan(struct page **hpage)
if (khugepaged_has_work() &&
pass_through_head < 2)
progress += khugepaged_scan_mm_slot(pages - progress,
- hpage);
+ &hpage);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
}
-}
-static void khugepaged_alloc_sleep(void)
-{
- wait_event_freezable_timeout(khugepaged_wait, false,
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
-}
-
-#ifndef CONFIG_NUMA
-static struct page *khugepaged_alloc_hugepage(void)
-{
- struct page *hpage;
-
- do {
- hpage = alloc_hugepage(khugepaged_defrag());
- if (!hpage) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- khugepaged_alloc_sleep();
- } else
- count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) &&
- likely(khugepaged_enabled()));
- return hpage;
+ if (!IS_ERR_OR_NULL(hpage))
+ put_page(hpage);
}
-#endif
-static void khugepaged_loop(void)
+static void khugepaged_wait_work(void)
{
- struct page *hpage;
+ try_to_freeze();
-#ifdef CONFIG_NUMA
- hpage = NULL;
-#endif
- while (likely(khugepaged_enabled())) {
-#ifndef CONFIG_NUMA
- hpage = khugepaged_alloc_hugepage();
- if (unlikely(!hpage))
- break;
-#else
- if (IS_ERR(hpage)) {
- khugepaged_alloc_sleep();
- hpage = NULL;
- }
-#endif
+ if (khugepaged_has_work()) {
+ if (!khugepaged_scan_sleep_millisecs)
+ return;
- khugepaged_do_scan(&hpage);
-#ifndef CONFIG_NUMA
- if (hpage)
- put_page(hpage);
-#endif
- try_to_freeze();
- if (unlikely(kthread_should_stop()))
- break;
- if (khugepaged_has_work()) {
- if (!khugepaged_scan_sleep_millisecs)
- continue;
- wait_event_freezable_timeout(khugepaged_wait, false,
- msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
- } else if (khugepaged_enabled())
- wait_event_freezable(khugepaged_wait,
- khugepaged_wait_event());
+ wait_event_freezable_timeout(khugepaged_wait,
+ kthread_should_stop(),
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+ return;
}
+
+ if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
static int khugepaged(void *none)
@@ -2346,20 +2663,9 @@ static int khugepaged(void *none)
set_freezable();
set_user_nice(current, 19);
- /* serialize with start_khugepaged() */
- mutex_lock(&khugepaged_mutex);
-
- for (;;) {
- mutex_unlock(&khugepaged_mutex);
- VM_BUG_ON(khugepaged_thread != current);
- khugepaged_loop();
- VM_BUG_ON(khugepaged_thread != current);
-
- mutex_lock(&khugepaged_mutex);
- if (!khugepaged_enabled())
- break;
- if (unlikely(kthread_should_stop()))
- break;
+ while (!kthread_should_stop()) {
+ khugepaged_do_scan();
+ khugepaged_wait_work();
}
spin_lock(&khugepaged_mm_lock);
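After this rework khugepaged is an ordinary freezable kthread: it exits only through kthread_stop() (start_khugepaged() now stops it when THP is disabled), and all idling happens inside khugepaged_wait_work(). A hedged sketch of that kthread shape; the example_* names are placeholders, not kernel symbols.

#include <linux/err.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static bool example_pending;
static struct task_struct *example_thread;

static void example_do_work(void)
{
        /* scan one batch of work here */
}

static int example_worker(void *unused)
{
        set_freezable();

        while (!kthread_should_stop()) {
                if (example_pending)
                        example_do_work();
                /* sleep freezably; kthread_stop() wakes us to exit cleanly */
                wait_event_freezable(example_wq,
                                     example_pending || kthread_should_stop());
        }
        return 0;
}

/* started and stopped from a sysfs toggle, as start_khugepaged() now does */
static int example_start(void)
{
        example_thread = kthread_run(example_worker, NULL, "example_worker");
        if (IS_ERR(example_thread)) {
                int err = PTR_ERR(example_thread);

                example_thread = NULL;
                return err;
        }
        return 0;
}

static void example_stop(void)
{
        if (example_thread) {
                kthread_stop(example_thread);
                example_thread = NULL;
        }
}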
@@ -2368,26 +2674,68 @@ static int khugepaged(void *none)
if (mm_slot)
collect_mm_slot(mm_slot);
spin_unlock(&khugepaged_mm_lock);
+ return 0;
+}
- khugepaged_thread = NULL;
- mutex_unlock(&khugepaged_mutex);
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int i;
- return 0;
+ pmdp_clear_flush(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = pgtable_trans_huge_withdraw(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+ entry = pte_mkspecial(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ put_huge_zero_page();
}
-void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd)
{
struct page *page;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ return;
+ }
+ if (is_huge_zero_pmd(*pmd)) {
+ __split_huge_zero_page_pmd(vma, haddr, pmd);
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
split_huge_page(page);
@@ -2395,31 +2743,31 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
BUG_ON(pmd_trans_huge(*pmd));
}
+void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd)
+{
+ struct vm_area_struct *vma;
+
+ vma = find_vma(mm, address);
+ BUG_ON(vma == NULL);
+ split_huge_page_pmd(vma, address, pmd);
+}
+
static void split_huge_page_address(struct mm_struct *mm,
unsigned long address)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
-
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd_mm(mm, address, pmd);
}
void __vma_adjust_trans_huge(struct vm_area_struct *vma,
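
As an aside, a minimal userspace sketch of the address arithmetic used above: the new __split_huge_page_pmd() derives the huge-page-aligned range handed to mmu_notifier_invalidate_range_start/end from the faulting address. The 2 MiB PMD size below is an assumption (the x86-64 default), not taken from the patch.

/* Illustrative sketch only: how haddr / mmun_start / mmun_end relate to an
 * arbitrary address, assuming 2 MiB PMD huge pages as on x86-64. */
#include <stdio.h>

#define HPAGE_PMD_SHIFT 21                        /* assumed: 2 MiB huge pages */
#define HPAGE_PMD_SIZE  (1UL << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK  (~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long address = 0x7f12345678abUL;    /* arbitrary user address */
	unsigned long haddr = address & HPAGE_PMD_MASK;

	/* the range bracketed by mmu_notifier_invalidate_range_start/end */
	printf("haddr      = %#lx\n", haddr);
	printf("mmun_start = %#lx\n", haddr);
	printf("mmun_end   = %#lx\n", haddr + HPAGE_PMD_SIZE);
	return 0;
}
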
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc727122dd44..4f3ea0b1e57c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,6 +1,6 @@
/*
* Generic hugetlb support.
- * (C) William Irwin, April 2004
+ * (C) Nadia Yvette Chambers, April 2004
*/
#include <linux/list.h>
#include <linux/init.h>
@@ -30,7 +30,6 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
-#include <linux/hugetlb_cgroup.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page)
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
} else {
+ arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
}
}
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for normal or
+ * transparent huge pages. See the PageTransHuge() documentation for more
+ * details.
+ */
int PageHuge(struct page *page)
{
compound_page_dtor *dtor;
@@ -1052,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* on-line nodes with memory and will handle the hstate accounting.
*/
while (nr_pages--) {
- if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
+ if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
break;
}
}
@@ -1175,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
int __weak alloc_bootmem_huge_page(struct hstate *h)
{
struct huge_bootmem_page *m;
- int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+ int nr_nodes = nodes_weight(node_states[N_MEMORY]);
while (nr_nodes) {
void *addr;
addr = __alloc_bootmem_node_nopanic(
NODE_DATA(hstate_next_node_to_alloc(h,
- &node_states[N_HIGH_MEMORY])),
+ &node_states[N_MEMORY])),
huge_page_size(h), huge_page_size(h), 0);
if (addr) {
@@ -1254,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
- &node_states[N_HIGH_MEMORY]))
+ &node_states[N_MEMORY]))
break;
}
h->max_huge_pages = i;
@@ -1522,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
}
} else if (nodes_allowed) {
/*
@@ -1532,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
init_nodemask_of_node(nodes_allowed, nid);
} else
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
- if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+ if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
return len;
@@ -1795,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
* remove hstate attributes from any nodes that have them.
*/
for (nid = 0; nid < nr_node_ids; nid++)
- hugetlb_unregister_node(&node_devices[nid]);
+ hugetlb_unregister_node(node_devices[nid]);
}
/*
@@ -1839,8 +1844,8 @@ static void hugetlb_register_all_nodes(void)
{
int nid;
- for_each_node_state(nid, N_HIGH_MEMORY) {
- struct node *node = &node_devices[nid];
+ for_each_node_state(nid, N_MEMORY) {
+ struct node *node = node_devices[nid];
if (node->dev.id == nid)
hugetlb_register_node(node);
}
@@ -1901,14 +1906,12 @@ static int __init hugetlb_init(void)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
hugetlb_init_hstates();
-
gather_bootmem_prealloc();
-
report_hugepages();
hugetlb_sysfs_init();
-
hugetlb_register_all_nodes();
+ hugetlb_cgroup_file_init();
return 0;
}
@@ -1934,17 +1937,10 @@ void __init hugetlb_add_hstate(unsigned order)
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
- h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
- h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
+ h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
+ h->next_nid_to_free = first_node(node_states[N_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
- /*
- * Add cgroup control files only if the huge page consists
- * of more than two normal pages. This is because we use
- * page[2].lru.next for storing cgoup details.
- */
- if (order >= HUGETLB_CGROUP_MIN_ORDER)
- hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
parsed_hstate = h;
}
@@ -2030,11 +2026,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (!(obey_mempolicy &&
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_HIGH_MEMORY];
+ nodes_allowed = &node_states[N_MEMORY];
}
h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
- if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+ if (nodes_allowed != &node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
}
out:
@@ -2355,13 +2351,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ const unsigned long mmun_start = start; /* For mmu_notifiers */
+ const unsigned long mmun_end = end; /* For mmu_notifiers */
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
tlb_start_vma(tlb, vma);
- mmu_notifier_invalidate_range_start(mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
again:
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
@@ -2379,8 +2377,10 @@ again:
/*
* HWPoisoned hugepage is already unmapped and dropped reference
*/
- if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ pte_clear(mm, address, ptep);
continue;
+ }
page = pte_page(pte);
/*
@@ -2425,7 +2425,7 @@ again:
if (address < end && !ref_page)
goto again;
}
- mmu_notifier_invalidate_range_end(mm, start, end);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
tlb_end_vma(tlb, vma);
}
@@ -2473,7 +2473,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
struct address_space *mapping;
- struct prio_tree_iter iter;
pgoff_t pgoff;
/*
@@ -2481,7 +2480,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
- pgoff = vma_hugecache_offset(h, vma, address);
+ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
@@ -2490,7 +2490,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* __unmap_hugepage_range() is called as the lock is already held
*/
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
continue;
@@ -2525,6 +2525,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *old_page, *new_page;
int avoidcopy;
int outside_reserve = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
old_page = pte_page(pte);
@@ -2611,6 +2613,9 @@ retry_avoidcopy:
pages_per_huge_page(h));
__SetPageUptodate(new_page);
+ mmun_start = address & huge_page_mask(h);
+ mmun_end = mmun_start + huge_page_size(h);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
* Retake the page_table_lock to check for racing updates
* before the page tables are altered
@@ -2619,9 +2624,6 @@ retry_avoidcopy:
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
- mmu_notifier_invalidate_range_start(mm,
- address & huge_page_mask(h),
- (address & huge_page_mask(h)) + huge_page_size(h));
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2631,11 @@ retry_avoidcopy:
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
- mmu_notifier_invalidate_range_end(mm,
- address & huge_page_mask(h),
- (address & huge_page_mask(h)) + huge_page_size(h));
}
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
page_cache_release(new_page);
page_cache_release(old_page);
return 0;
@@ -3004,7 +3007,7 @@ same_page:
return i ? i : -EFAULT;
}
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
@@ -3012,6 +3015,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
@@ -3022,12 +3026,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- if (huge_pmd_unshare(mm, &address, ptep))
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ pages++;
continue;
+ }
if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
set_huge_pte_at(mm, address, ptep, pte);
+ pages++;
}
}
spin_unlock(&mm->page_table_lock);
@@ -3039,6 +3046,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+
+ return pages << h->order;
}
int hugetlb_reserve_pages(struct inode *inode,
@@ -3160,7 +3169,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
spin_lock(&hugetlb_lock);
if (is_hugepage_on_freelist(hpage)) {
- list_del(&hpage->lru);
+ /*
+ * Hwpoisoned hugepage isn't linked to activelist or freelist,
+ * but dangling hpage->lru can trigger list-debug warnings
+ * (this happens when we call unpoison_memory() on it),
+ * so let it point to itself with list_del_init().
+ */
+ list_del_init(&hpage->lru);
set_page_refcounted(hpage);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
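
A small aside on the new hugetlb_change_protection() return value: it reports the number of base pages whose protection changed, computed as pages << h->order. The sketch below models just that conversion; order 9 (512 base pages per 2 MiB huge page) is an assumed example value.

/* Illustrative only: converting "huge PTEs touched" into base pages, as
 * hugetlb_change_protection() now does with pages << h->order. */
#include <stdio.h>

int main(void)
{
	unsigned long huge_ptes_changed = 3;  /* hypothetical count ("pages") */
	unsigned int order = 9;               /* assumed hstate order: 2 MiB pages */

	unsigned long base_pages = huge_ptes_changed << order;
	printf("%lu huge mappings -> %lu base pages\n",
	       huge_ptes_changed, base_pages);
	return 0;
}
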
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..9cea7de22ffb 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -77,7 +77,7 @@ static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
return false;
}
-static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
+static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup)
{
int idx;
struct cgroup *parent_cgroup;
@@ -101,7 +101,7 @@ static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
return &h_cgroup->css;
}
-static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
+static void hugetlb_cgroup_css_free(struct cgroup *cgroup)
{
struct hugetlb_cgroup *h_cgroup;
@@ -155,18 +155,13 @@ out:
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
* the parent cgroup.
*/
-static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
+static void hugetlb_cgroup_css_offline(struct cgroup *cgroup)
{
struct hstate *h;
struct page *page;
- int ret = 0, idx = 0;
+ int idx = 0;
do {
- if (cgroup_task_count(cgroup) ||
- !list_empty(&cgroup->children)) {
- ret = -EBUSY;
- goto out;
- }
for_each_hstate(h) {
spin_lock(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
}
cond_resched();
} while (hugetlb_cgroup_have_usage(cgroup));
-out:
- return ret;
}
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
@@ -340,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
return buf;
}
-int __init hugetlb_cgroup_file_init(int idx)
+static void __init __hugetlb_cgroup_file_init(int idx)
{
char buf[32];
struct cftype *cft;
@@ -382,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx)
WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
- return 0;
+ return;
+}
+
+void __init hugetlb_cgroup_file_init(void)
+{
+ struct hstate *h;
+
+ for_each_hstate(h) {
+ /*
+ * Add cgroup control files only if the huge page consists
+ * of more than two normal pages. This is because we use
+ * page[2].lru.next for storing cgroup details.
+ */
+ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
+ __hugetlb_cgroup_file_init(hstate_index(h));
+ }
}
/*
@@ -411,8 +419,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
struct cgroup_subsys hugetlb_subsys = {
.name = "hugetlb",
- .create = hugetlb_cgroup_create,
- .pre_destroy = hugetlb_cgroup_pre_destroy,
- .destroy = hugetlb_cgroup_destroy,
- .subsys_id = hugetlb_subsys_id,
+ .css_alloc = hugetlb_cgroup_css_alloc,
+ .css_offline = hugetlb_cgroup_css_offline,
+ .css_free = hugetlb_cgroup_css_free,
+ .subsys_id = hugetlb_subsys_id,
};
diff --git a/mm/internal.h b/mm/internal.h
index b8c91b342e24..d597f94cc205 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
/*
+ * in mm/rmap.c:
+ */
+extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+
+/*
* in mm/page_alloc.c
*/
extern void __free_pages_bootmem(struct page *page, unsigned int order);
@@ -118,26 +123,27 @@ struct compact_control {
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long start_free_pfn; /* where we started the search */
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
- bool wrapped; /* Order > 0 compactions are
- incremental, once free_pfn
- and migrate_pfn meet, we restart
- from the top of the zone;
- remember we wrapped around. */
+ bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool finished_update_free; /* True when the zone cached pfns are
+ * no longer being updated
+ */
+ bool finished_update_migrate;
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
- bool *contended; /* True if a lock was contended */
+ bool contended; /* True if a lock was contended */
+ struct page **page; /* Page captured of requested size */
};
unsigned long
-isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn);
unsigned long
isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
- unsigned long low_pfn, unsigned long end_pfn);
+ unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
#endif
@@ -167,9 +173,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
}
/*
- * Called only in fault path via page_evictable() for a new page
- * to determine if it's being mapped into a LOCKED vma.
- * If so, mark page as mlocked.
+ * Called only in fault path, to determine if a new page is being
+ * mapped into a LOCKED vma. If it is, mark page as mlocked.
*/
static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
struct page *page)
@@ -180,7 +185,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
return 0;
if (!TestSetPageMlocked(page)) {
- inc_zone_page_state(page, NR_MLOCK);
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
count_vm_event(UNEVICTABLE_PGMLOCKED);
}
return 1;
@@ -201,12 +207,7 @@ extern void munlock_vma_page(struct page *page);
* If called for a page that is still mapped by mlocked vmas, all we do
* is revert to lazy LRU behaviour -- semantics are not broken.
*/
-extern void __clear_page_mlock(struct page *page);
-static inline void clear_page_mlock(struct page *page)
-{
- if (unlikely(TestClearPageMlocked(page)))
- __clear_page_mlock(page);
-}
+extern void clear_page_mlock(struct page *page);
/*
* mlock_migrate_page - called only from migrate_page_copy() to
@@ -216,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
unsigned long flags;
+ int nr_pages = hpage_nr_pages(page);
local_irq_save(flags);
- __dec_zone_page_state(page, NR_MLOCK);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
SetPageMlocked(newpage);
- __inc_zone_page_state(newpage, NR_MLOCK);
+ __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
local_irq_restore(flags);
}
}
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
@@ -340,7 +344,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0
#define ZONE_RECLAIM_SUCCESS 1
-#endif
extern int hwpoison_filter(struct page *p);
@@ -356,3 +359,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ struct list_head *page_list);
+/* The ALLOC_WMARK bits are used as an index to zone->watermark */
+#define ALLOC_WMARK_MIN WMARK_MIN
+#define ALLOC_WMARK_LOW WMARK_LOW
+#define ALLOC_WMARK_HIGH WMARK_HIGH
+#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
+
+/* Mask to get the watermark bits */
+#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
+
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+
+#endif /* __MM_INTERNAL_H */
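
As an aside on the ALLOC_* flags moved into internal.h above: the low bits of alloc_flags double as an index into zone->watermark, and ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS - 1 = 0x03) extracts that index while the higher bits stay behaviour flags. The sketch below redefines the constants locally; the WMARK_MIN/LOW/HIGH values of 0/1/2 are an assumption about the usual zone_watermarks enum.

/* Illustrative only: packing a watermark index into the low bits of
 * alloc_flags and masking it back out. */
#include <stdio.h>

#define WMARK_MIN  0                      /* assumed enum zone_watermarks values */
#define WMARK_LOW  1
#define WMARK_HIGH 2

#define ALLOC_WMARK_LOW     WMARK_LOW
#define ALLOC_NO_WATERMARKS 0x04
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER        0x10
#define ALLOC_CPUSET        0x40

int main(void)
{
	int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HARDER | ALLOC_CPUSET;

	/* index into zone->watermark[]; the high bits are behaviour flags */
	printf("watermark index = %d\n", alloc_flags & ALLOC_WMARK_MASK);
	printf("skip watermark checks? %s\n",
	       (alloc_flags & ALLOC_NO_WATERMARKS) ? "yes" : "no");
	return 0;
}
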
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 000000000000..4a5822a586e6
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
+/*
+ * mm/interval_tree.c - interval tree for mapping->i_mmap
+ *
+ * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
+ *
+ * This file is released under the GPL v2.
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rmap.h>
+#include <linux/interval_tree_generic.h>
+
+static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff;
+}
+
+static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
+ unsigned long, shared.linear.rb_subtree_last,
+ vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+
+/* Insert node immediately after prev in the interval tree */
+void vma_interval_tree_insert_after(struct vm_area_struct *node,
+ struct vm_area_struct *prev,
+ struct rb_root *root)
+{
+ struct rb_node **link;
+ struct vm_area_struct *parent;
+ unsigned long last = vma_last_pgoff(node);
+
+ VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
+
+ if (!prev->shared.linear.rb.rb_right) {
+ parent = prev;
+ link = &prev->shared.linear.rb.rb_right;
+ } else {
+ parent = rb_entry(prev->shared.linear.rb.rb_right,
+ struct vm_area_struct, shared.linear.rb);
+ if (parent->shared.linear.rb_subtree_last < last)
+ parent->shared.linear.rb_subtree_last = last;
+ while (parent->shared.linear.rb.rb_left) {
+ parent = rb_entry(parent->shared.linear.rb.rb_left,
+ struct vm_area_struct, shared.linear.rb);
+ if (parent->shared.linear.rb_subtree_last < last)
+ parent->shared.linear.rb_subtree_last = last;
+ }
+ link = &parent->shared.linear.rb.rb_left;
+ }
+
+ node->shared.linear.rb_subtree_last = last;
+ rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
+ rb_insert_augmented(&node->shared.linear.rb, root,
+ &vma_interval_tree_augment);
+}
+
+static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_start_pgoff(avc->vma);
+}
+
+static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_last_pgoff(avc->vma);
+}
+
+INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
+ avc_start_pgoff, avc_last_pgoff,
+ static inline, __anon_vma_interval_tree)
+
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+ struct rb_root *root)
+{
+#ifdef CONFIG_DEBUG_VM_RB
+ node->cached_vma_start = avc_start_pgoff(node);
+ node->cached_vma_last = avc_last_pgoff(node);
+#endif
+ __anon_vma_interval_tree_insert(node, root);
+}
+
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+ struct rb_root *root)
+{
+ __anon_vma_interval_tree_remove(node, root);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_first(struct rb_root *root,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_first(root, first, last);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_next(node, first, last);
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
+{
+ WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
+ WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
+}
+#endif
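
A brief aside on the interval convention used by the new file: vma_start_pgoff/vma_last_pgoff describe a closed interval, with last = start + pages - 1, so two ranges overlap exactly when start <= query_last && last >= query_first. The standalone sketch below uses a stand-in structure (not the kernel's vm_area_struct) and assumes 4 KiB pages.

/* Illustrative only: the closed-interval page-offset convention behind
 * the vma interval tree helpers. */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SHIFT 12                          /* assumed 4 KiB pages */

struct fake_vma {
	unsigned long vm_start, vm_end;        /* byte addresses, end exclusive */
	unsigned long vm_pgoff;                /* file offset of vm_start, in pages */
};

static unsigned long start_pgoff(const struct fake_vma *v)
{
	return v->vm_pgoff;
}

static unsigned long last_pgoff(const struct fake_vma *v)
{
	/* last covered page index: closed interval, hence the -1 */
	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

static bool overlaps(const struct fake_vma *v,
		     unsigned long first, unsigned long last)
{
	return start_pgoff(v) <= last && last_pgoff(v) >= first;
}

int main(void)
{
	struct fake_vma v = { 0x400000, 0x404000, 10 };  /* 4 pages at pgoff 10 */

	printf("vma covers pgoff [%lu, %lu]\n", start_pgoff(&v), last_pgoff(&v));
	printf("overlaps [13, 20]? %d\n", overlaps(&v, 13, 20));  /* 1: page 13 */
	printf("overlaps [14, 20]? %d\n", overlaps(&v, 14, 20));  /* 0 */
	return 0;
}
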
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 45eb6217bf38..752a705c77c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
* - kmemleak_lock (rwlock): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
- * blocks. The object_tree_root is a priority search tree used to look-up
+ * blocks. The object_tree_root is a red black tree used to look-up
* metadata based on a pointer to the corresponding memory block. The
* kmemleak_object structures are added to the object_list and
* object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/kthread.h>
-#include <linux/prio_tree.h>
+#include <linux/rbtree.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
* Structure holding the metadata for each allocated memory block.
* Modifications to such objects should be made while holding the
* object->lock. Insertions or deletions from object_list, gray_list or
- * tree_node are already protected by the corresponding locks or mutex (see
+ * rb_node are already protected by the corresponding locks or mutex (see
* the notes on locking above). These objects are reference-counted
* (use_count) and freed using the RCU mechanism.
*/
@@ -141,7 +141,7 @@ struct kmemleak_object {
unsigned long flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
- struct prio_tree_node tree_node;
+ struct rb_node rb_node;
struct rcu_head rcu; /* object_list lockless traversal */
/* object usage count; object freed when use_count == 0 */
atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
static LIST_HEAD(object_list);
/* the list of gray-colored objects (see color_gray comment below) */
static LIST_HEAD(gray_list);
-/* prio search tree for object boundaries */
-static struct prio_tree_root object_tree_root;
-/* rw_lock protecting the access to object_list and prio_tree_root */
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* rw_lock protecting the access to object_list and object_tree_root */
static DEFINE_RWLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
trace.entries = object->trace;
pr_notice("Object 0x%08lx (size %zu):\n",
- object->tree_node.start, object->size);
+ object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
}
/*
- * Look-up a memory block metadata (kmemleak_object) in the priority search
+ * Look-up a memory block metadata (kmemleak_object) in the object search
* tree based on a pointer value. If alias is 0, only values pointing to the
* beginning of the memory block are allowed. The kmemleak_lock must be held
* when calling this function.
*/
static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
{
- struct prio_tree_node *node;
- struct prio_tree_iter iter;
- struct kmemleak_object *object;
-
- prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
- node = prio_tree_next(&iter);
- if (node) {
- object = prio_tree_entry(node, struct kmemleak_object,
- tree_node);
- if (!alias && object->pointer != ptr) {
+ struct rb_node *rb = object_tree_root.rb_node;
+
+ while (rb) {
+ struct kmemleak_object *object =
+ rb_entry(rb, struct kmemleak_object, rb_node);
+ if (ptr < object->pointer)
+ rb = object->rb_node.rb_left;
+ else if (object->pointer + object->size <= ptr)
+ rb = object->rb_node.rb_right;
+ else if (object->pointer == ptr || alias)
+ return object;
+ else {
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
- object = NULL;
+ break;
}
- } else
- object = NULL;
-
- return object;
+ }
+ return NULL;
}
/*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
}
/*
- * Look up an object in the prio search tree and increase its use_count.
+ * Look up an object in the object search tree and increase its use_count.
*/
static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
{
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
int min_count, gfp_t gfp)
{
unsigned long flags;
- struct kmemleak_object *object;
- struct prio_tree_node *node;
+ struct kmemleak_object *object, *parent;
+ struct rb_node **link, *rb_parent;
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
- INIT_PRIO_TREE_NODE(&object->tree_node);
- object->tree_node.start = ptr;
- object->tree_node.last = ptr + size - 1;
-
write_lock_irqsave(&kmemleak_lock, flags);
min_addr = min(min_addr, ptr);
max_addr = max(max_addr, ptr + size);
- node = prio_tree_insert(&object_tree_root, &object->tree_node);
- /*
- * The code calling the kernel does not yet have the pointer to the
- * memory block to be able to free it. However, we still hold the
- * kmemleak_lock here in case parts of the kernel started freeing
- * random memory blocks.
- */
- if (node != &object->tree_node) {
- kmemleak_stop("Cannot insert 0x%lx into the object search tree "
- "(already existing)\n", ptr);
- object = lookup_object(ptr, 1);
- spin_lock(&object->lock);
- dump_object_info(object);
- spin_unlock(&object->lock);
-
- goto out;
+ link = &object_tree_root.rb_node;
+ rb_parent = NULL;
+ while (*link) {
+ rb_parent = *link;
+ parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
+ if (ptr + size <= parent->pointer)
+ link = &parent->rb_node.rb_left;
+ else if (parent->pointer + parent->size <= ptr)
+ link = &parent->rb_node.rb_right;
+ else {
+ kmemleak_stop("Cannot insert 0x%lx into the object "
+ "search tree (overlaps existing)\n",
+ ptr);
+ kmem_cache_free(object_cache, object);
+ object = parent;
+ spin_lock(&object->lock);
+ dump_object_info(object);
+ spin_unlock(&object->lock);
+ goto out;
+ }
}
+ rb_link_node(&object->rb_node, rb_parent, link);
+ rb_insert_color(&object->rb_node, &object_tree_root);
+
list_add_tail_rcu(&object->object_list, &object_list);
out:
write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
unsigned long flags;
write_lock_irqsave(&kmemleak_lock, flags);
- prio_tree_remove(&object_tree_root, &object->tree_node);
+ rb_erase(&object->rb_node, &object_tree_root);
list_del_rcu(&object->object_list);
write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -1483,13 +1486,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct kmemleak_object *prev_obj = v;
struct kmemleak_object *next_obj = NULL;
- struct list_head *n = &prev_obj->object_list;
+ struct kmemleak_object *obj = prev_obj;
++(*pos);
- list_for_each_continue_rcu(n, &object_list) {
- struct kmemleak_object *obj =
- list_entry(n, struct kmemleak_object, object_list);
+ list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
if (get_object(obj)) {
next_obj = obj;
break;
@@ -1555,7 +1556,8 @@ static int dump_str_object_info(const char *str)
struct kmemleak_object *object;
unsigned long addr;
- addr= simple_strtoul(str, NULL, 0);
+ if (kstrtoul(str, 0, &addr))
+ return -EINVAL;
object = find_and_get_object(addr, 0);
if (!object) {
pr_info("Unknown object at 0x%08lx\n", addr);
@@ -1768,7 +1770,6 @@ void __init kmemleak_init(void)
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
- INIT_PRIO_TREE_ROOT(&object_tree_root);
if (crt_early_log >= ARRAY_SIZE(early_log))
pr_warning("Early log buffer exceeded (%d), please increase "
diff --git a/mm/ksm.c b/mm/ksm.c
index 47c885368890..51573858938d 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
spinlock_t *ptl;
int swapped;
int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
BUG_ON(PageTransCompound(page));
+
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
- goto out;
+ goto out_mn;
if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
out_unlock:
pte_unmap_unlock(ptep, ptl);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
@@ -769,35 +778,31 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
goto out;
-
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
- goto out;
-
- pmd = pmd_offset(pud, addr);
BUG_ON(pmd_trans_huge(*pmd));
- if (!pmd_present(*pmd))
- goto out;
+
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
pte_unmap_unlock(ptep, ptl);
- goto out;
+ goto out_mn;
}
get_page(kpage);
@@ -814,6 +819,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pte_unmap_unlock(ptep, ptl);
err = 0;
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
@@ -1469,10 +1476,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
*/
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
+ VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
return 0; /* just ignore the advice */
+#ifdef VM_SAO
+ if (*vm_flags & VM_SAO)
+ return 0;
+#endif
+
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
err = __ksm_enter(mm);
if (err)
@@ -1582,7 +1593,7 @@ struct page *ksm_does_need_to_copy(struct page *page,
SetPageSwapBacked(new_page);
__set_page_locked(new_page);
- if (page_evictable(new_page, vma))
+ if (!mlocked_vma_newpage(vma, new_page))
lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
else
add_page_to_unevictable_list(new_page);
@@ -1613,8 +1624,9 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
@@ -1636,7 +1648,7 @@ again:
if (!search_new_forks || !mapcount)
break;
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
if (!mapcount)
goto out;
}
@@ -1666,8 +1678,9 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
@@ -1684,11 +1697,11 @@ again:
ret = try_to_unmap_one(page, vma,
rmap_item->address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page)) {
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
goto out;
}
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1718,8 +1731,9 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
@@ -1735,11 +1749,11 @@ again:
ret = rmap_one(page, vma, rmap_item->address, arg);
if (ret != SWAP_AGAIN) {
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
goto out;
}
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
@@ -1905,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
if (ksm_run != flags) {
ksm_run = flags;
if (flags & KSM_RUN_UNMERGE) {
- int oom_score_adj;
-
- oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
+ set_current_oom_origin();
err = unmerge_and_remove_all_rmap_items();
- compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
- oom_score_adj);
+ clear_current_oom_origin();
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;

diff --git a/mm/madvise.c b/mm/madvise.c
index 14d260fa0d17..03dfa5c7adb3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
new_flags &= ~VM_DONTCOPY;
break;
case MADV_DONTDUMP:
- new_flags |= VM_NODUMP;
+ new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- new_flags &= ~VM_NODUMP;
+ if (new_flags & VM_SPECIAL) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/memblock.c b/mm/memblock.c
index 82aa349d2f7a..625905523c2a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
/* inline so we don't get a warning when pr_debug is compiled out */
-static inline const char *memblock_type_name(struct memblock_type *type)
+static __init_memblock const char *
+memblock_type_name(struct memblock_type *type)
{
if (type == &memblock.memory)
return "memory";
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return ret;
for (i = start_rgn; i < end_rgn; i++)
- type->regions[i].nid = nid;
+ memblock_set_region_node(&type->regions[i], nid);
memblock_merge_regions(type);
return 0;
@@ -929,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
}
+void __init_memblock memblock_trim_memory(phys_addr_t align)
+{
+ int i;
+ phys_addr_t start, end, orig_start, orig_end;
+ struct memblock_type *mem = &memblock.memory;
+
+ for (i = 0; i < mem->cnt; i++) {
+ orig_start = mem->regions[i].base;
+ orig_end = mem->regions[i].base + mem->regions[i].size;
+ start = round_up(orig_start, align);
+ end = round_down(orig_end, align);
+
+ if (start == orig_start && end == orig_end)
+ continue;
+
+ if (start < end) {
+ mem->regions[i].base = start;
+ mem->regions[i].size = end - start;
+ } else {
+ memblock_remove_region(mem, i);
+ i--;
+ }
+ }
+}
void __init_memblock memblock_set_current_limit(phys_addr_t limit)
{
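
A short aside on the new memblock_trim_memory(): each region's start is rounded up and its end rounded down to the requested alignment, and a region that collapses (start >= end) is removed. The sketch below reproduces that arithmetic with the conventional power-of-two round_up/round_down helpers; the 2 MiB alignment and the region values are made up for illustration.

/* Illustrative only: the trimming arithmetic in memblock_trim_memory(). */
#include <stdio.h>

typedef unsigned long long phys_addr_t;

#define round_down(x, a)  ((x) & ~((phys_addr_t)(a) - 1))
#define round_up(x, a)    round_down((x) + (a) - 1, (a))

int main(void)
{
	phys_addr_t align = 1 << 21;                  /* 2 MiB, hypothetical */
	phys_addr_t base = 0x00180000, size = 0x00700000;
	phys_addr_t orig_end = base + size;

	phys_addr_t start = round_up(base, align);
	phys_addr_t end   = round_down(orig_end, align);

	if (start < end)
		printf("trimmed region: [%#llx, %#llx)\n", start, end);
	else
		printf("region smaller than alignment: removed\n");
	return 0;
}
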
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 795e525afaba..f3009b4bae51 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
+ * Kernel Memory Controller
+ * Copyright (C) 2012 Parallels Inc. and Google Inc.
+ * Authors: Glauber Costa and Suleiman Souhlal
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -51,6 +55,7 @@
#include <linux/oom.h>
#include "internal.h"
#include <net/sock.h>
+#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include <asm/uaccess.h>
@@ -58,6 +63,8 @@
#include <trace/events/vmscan.h>
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
+EXPORT_SYMBOL(mem_cgroup_subsys);
+
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -265,6 +272,10 @@ struct mem_cgroup {
};
/*
+ * the counter to account for kernel memory usage.
+ */
+ struct res_counter kmem;
+ /*
* Per cgroup active and inactive list, similar to the
* per zone LRU lists.
*/
@@ -279,6 +290,7 @@ struct mem_cgroup {
* Should the accounting and control be hierarchical, per subtree?
*/
bool use_hierarchy;
+ unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
bool oom_lock;
atomic_t under_oom;
@@ -326,11 +338,64 @@ struct mem_cgroup {
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
-#ifdef CONFIG_INET
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
#endif
+#if defined(CONFIG_MEMCG_KMEM)
+ /* analogous to slab_common's slab_caches list. per-memcg */
+ struct list_head memcg_slab_caches;
+ /* Not a spinlock, we can take a lot of time walking the list */
+ struct mutex slab_caches_mutex;
+ /* Index in the kmem_cache->memcg_params->memcg_caches array */
+ int kmemcg_id;
+#endif
};
+/* internal only representation about the status of kmem accounting. */
+enum {
+ KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
+ KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+ KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
+};
+
+/* We account when limit is on, but only after call sites are patched */
+#define KMEM_ACCOUNTED_MASK \
+ ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
+{
+ set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+}
+
+static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+ return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+}
+
+static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
+{
+ set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
+static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
+{
+ clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
+}
+
+static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
+{
+ if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
+ set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
+}
+
+static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
+{
+ return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
+ &memcg->kmem_account_flags);
+}
+#endif
+
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
@@ -385,9 +450,13 @@ enum charge_type {
};
/* for encoding cft->private value on file */
-#define _MEM (0)
-#define _MEMSWAP (1)
-#define _OOM_TYPE (2)
+enum res_type {
+ _MEM,
+ _MEMSWAP,
+ _OOM_TYPE,
+ _KMEM,
+};
+
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
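
An aside on the MEMFILE_* encoding used with the new res_type enum: cft->private packs the resource type into the upper 16 bits and the attribute into the lower 16, and the two macros below it unpack them again. The sketch reuses the macros shown here; the RES_LIMIT value is a stand-in, not the kernel's definition.

/* Illustrative only: packing and unpacking a cft->private value. */
#include <stdio.h>

enum res_type { _MEM, _MEMSWAP, _OOM_TYPE, _KMEM };

#define RES_LIMIT 2                        /* hypothetical attribute value */

#define MEMFILE_PRIVATE(x, val)  ((x) << 16 | (val))
#define MEMFILE_TYPE(val)        ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)        ((val) & 0xffff)

int main(void)
{
	unsigned int priv = MEMFILE_PRIVATE(_KMEM, RES_LIMIT);

	printf("packed = %#x\n", priv);               /* 0x30002 */
	printf("type   = %d (_KMEM == %d)\n", MEMFILE_TYPE(priv), _KMEM);
	printf("attr   = %d\n", MEMFILE_ATTR(priv));
	return 0;
}
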
@@ -411,12 +480,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
return container_of(s, struct mem_cgroup, css);
}
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+{
+ return (memcg == root_mem_cgroup);
+}
+
/* Writing them here to avoid exposing memcg's inner layout */
-#ifdef CONFIG_MEMCG_KMEM
-#include <net/sock.h>
-#include <net/ip.h>
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
-static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
void sock_update_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled) {
@@ -461,7 +532,6 @@ void sock_release_memcg(struct sock *sk)
}
}
-#ifdef CONFIG_INET
struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
if (!memcg || mem_cgroup_is_root(memcg))
@@ -470,10 +540,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
return &memcg->tcp_mem.cg_proto;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
-#endif /* CONFIG_INET */
-#endif /* CONFIG_MEMCG_KMEM */
-#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -486,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
}
#endif
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * There are two main reasons for not using the css_id for this:
+ * 1) this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
+ *
+ * 2) In order not to violate the cgroup API, we would like to do all memory
+ * allocation in ->create(). At that point, we haven't yet allocated the
+ * css_id. Having a separate index prevents us from messing with the cgroup
+ * core for this
+ *
+ * The current size of the caches array is stored in
+ * memcg_limited_groups_array_size. It will double each time we have to
+ * increase it.
+ */
+static DEFINE_IDA(kmem_limited_groups);
+int memcg_limited_groups_array_size;
+
+/*
+ * MIN_SIZE is different than 1, because we would like to avoid going through
+ * the alloc/free process all the time. In a small machine, 4 kmem-limited
+ * cgroups is a reasonable guess. In the future, it could be a parameter or
+ * tunable, but that is strictly not necessary.
+ *
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * this constant directly from cgroup, but it is understandable that this is
+ * better kept as an internal representation in cgroup.c. In any case, the
+ * css_id space is not getting any smaller, and we don't have to necessarily
+ * increase ours as well if it increases.
+ */
+#define MEMCG_CACHES_MIN_SIZE 4
+#define MEMCG_CACHES_MAX_SIZE 65535
+
+/*
+ * A lot of the calls to the cache allocation functions are expected to be
+ * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * conditional to this static branch, we'll have to allow modules that do
+ * kmem_cache_alloc and such to see this symbol as well

+ */
+struct static_key memcg_kmem_enabled_key;
+EXPORT_SYMBOL(memcg_kmem_enabled_key);
+
+static void disarm_kmem_keys(struct mem_cgroup *memcg)
+{
+ if (memcg_kmem_is_active(memcg)) {
+ static_key_slow_dec(&memcg_kmem_enabled_key);
+ ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+ }
+ /*
+ * This check can't live in kmem destruction function,
+ * since the charges will outlive the cgroup
+ */
+ WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
+}
+#else
+static void disarm_kmem_keys(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+static void disarm_static_keys(struct mem_cgroup *memcg)
+{
+ disarm_sock_keys(memcg);
+ disarm_kmem_keys(memcg);
+}
+
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
@@ -801,7 +937,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
int nid;
u64 total = 0;
- for_each_node_state(nid, N_HIGH_MEMORY)
+ for_each_node_state(nid, N_MEMORY)
total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
return total;
}
@@ -1016,18 +1152,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
-{
- return (memcg == root_mem_cgroup);
-}
-
-void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
+void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
struct mem_cgroup *memcg;
- if (!mm)
- return;
-
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
@@ -1046,7 +1174,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
out:
rcu_read_unlock();
}
-EXPORT_SYMBOL(mem_cgroup_count_vm_event);
+EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
@@ -1061,12 +1189,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
struct mem_cgroup *memcg)
{
struct mem_cgroup_per_zone *mz;
+ struct lruvec *lruvec;
- if (mem_cgroup_disabled())
- return &zone->lruvec;
+ if (mem_cgroup_disabled()) {
+ lruvec = &zone->lruvec;
+ goto out;
+ }
mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
- return &mz->lruvec;
+ lruvec = &mz->lruvec;
+out:
+ /*
+ * Since a node can be onlined after the mem_cgroup was created,
+ * we have to be prepared to initialize lruvec->zone here;
+ * and if offlined then reonlined, we need to reinitialize it.
+ */
+ if (unlikely(lruvec->zone != zone))
+ lruvec->zone = zone;
+ return lruvec;
}
/*
@@ -1093,9 +1233,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
struct page_cgroup *pc;
+ struct lruvec *lruvec;
- if (mem_cgroup_disabled())
- return &zone->lruvec;
+ if (mem_cgroup_disabled()) {
+ lruvec = &zone->lruvec;
+ goto out;
+ }
pc = lookup_page_cgroup(page);
memcg = pc->mem_cgroup;
@@ -1113,7 +1256,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
pc->mem_cgroup = memcg = root_mem_cgroup;
mz = page_cgroup_zoneinfo(memcg, page);
- return &mz->lruvec;
+ lruvec = &mz->lruvec;
+out:
+ /*
+ * Since a node can be onlined after the mem_cgroup was created,
+ * we have to be prepared to initialize lruvec->zone here;
+ * and if offlined then reonlined, we need to reinitialize it.
+ */
+ if (unlikely(lruvec->zone != zone))
+ lruvec->zone = zone;
+ return lruvec;
}
/**
@@ -1436,6 +1588,10 @@ done:
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
+ printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
+ res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
+ res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
+ res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
}
/*
@@ -1458,21 +1614,30 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
- u64 memsw;
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
- limit += total_swap_pages << PAGE_SHIFT;
- memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
- * If memsw is finite and limits the amount of swap space available
- * to this memcg, return that limit.
+ * Do not consider swap space if we cannot swap due to swappiness
*/
- return min(limit, memsw);
+ if (mem_cgroup_swappiness(memcg)) {
+ u64 memsw;
+
+ limit += total_swap_pages << PAGE_SHIFT;
+ memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+
+ /*
+ * If memsw is finite and limits the amount of swap space
+ * available to this memcg, return that limit.
+ */
+ limit = min(limit, memsw);
+ }
+
+ return limit;
}
-void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
- int order)
+static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ int order)
{
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
@@ -1617,9 +1782,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
return;
/* make a nodemask where this memcg uses memory from */
- memcg->scan_nodes = node_states[N_HIGH_MEMORY];
+ memcg->scan_nodes = node_states[N_MEMORY];
- for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+ for_each_node_mask(nid, node_states[N_MEMORY]) {
if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
node_clear(nid, memcg->scan_nodes);
@@ -1690,7 +1855,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
/*
* Check rest of nodes.
*/
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
if (node_isset(nid, memcg->scan_nodes))
continue;
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
@@ -2034,20 +2199,28 @@ struct memcg_stock_pcp {
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
-/*
- * Try to consume stocked charge on this cpu. If success, one page is consumed
- * from local stock and true is returned. If the stock is 0 or charges from a
- * cgroup which is not current target, returns false. This stock will be
- * refilled.
+/**
+ * consume_stock: Try to consume stocked charge on this cpu.
+ * @memcg: memcg to consume from.
+ * @nr_pages: how many pages to charge.
+ *
+ * The charges will only happen if @memcg matches the current cpu's memcg
+ * stock, and at least @nr_pages are available in that stock. Failure to
+ * service an allocation will refill the stock.
+ *
+ * returns true if successful, false otherwise.
*/
-static bool consume_stock(struct mem_cgroup *memcg)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
bool ret = true;
+ if (nr_pages > CHARGE_BATCH)
+ return false;
+
stock = &get_cpu_var(memcg_stock);
- if (memcg == stock->cached && stock->nr_pages)
- stock->nr_pages--;
+ if (memcg == stock->cached && stock->nr_pages >= nr_pages)
+ stock->nr_pages -= nr_pages;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
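
As an aside, the reworked consume_stock() only serves a charge from the per-CPU stock when the cached memcg matches and the stock holds at least nr_pages, and it refuses outright anything larger than the batch size. The sketch below models that decision in userspace; the CHARGE_BATCH value of 32 and the struct layout are assumptions, not the kernel's definitions.

/* Illustrative only: a userspace model of the new consume_stock() rules. */
#include <stdio.h>
#include <stdbool.h>

#define CHARGE_BATCH 32U                   /* assumed batch size */

struct stock { int cached_memcg; unsigned int nr_pages; };

static bool consume_stock(struct stock *s, int memcg, unsigned int nr_pages)
{
	if (nr_pages > CHARGE_BATCH)           /* too big for the stock path */
		return false;
	if (memcg == s->cached_memcg && s->nr_pages >= nr_pages) {
		s->nr_pages -= nr_pages;       /* serve from the local stock */
		return true;
	}
	return false;                          /* fall back to res_counter */
}

int main(void)
{
	struct stock s = { .cached_memcg = 1, .nr_pages = 20 };
	bool ok;

	ok = consume_stock(&s, 1, 8);
	printf("charge 8, memcg 1 : %s, %u left\n", ok ? "stock" : "slow path", s.nr_pages);
	ok = consume_stock(&s, 1, 64);
	printf("charge 64, memcg 1: %s, %u left\n", ok ? "stock" : "slow path", s.nr_pages);
	ok = consume_stock(&s, 2, 16);
	printf("charge 16, memcg 2: %s, %u left\n", ok ? "stock" : "slow path", s.nr_pages);
	return 0;
}
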
@@ -2224,7 +2397,8 @@ enum {
};
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages, bool oom_check)
+ unsigned int nr_pages, unsigned int min_pages,
+ bool oom_check)
{
unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
@@ -2247,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
/*
- * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
- * of regular pages (CHARGE_BATCH), or a single regular page (1).
- *
* Never reclaim on behalf of optional batching, retry with a
* single page instead.
*/
- if (nr_pages == CHARGE_BATCH)
+ if (nr_pages > min_pages)
return CHARGE_RETRY;
if (!(gfp_mask & __GFP_WAIT))
return CHARGE_WOULDBLOCK;
+ if (gfp_mask & __GFP_NORETRY)
+ return CHARGE_NOMEM;
+
ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
return CHARGE_RETRY;
@@ -2271,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
* unlikely to succeed so close to the limit, and we fall back
* to regular pages anyway in case of failure.
*/
- if (nr_pages == 1 && ret)
+ if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
return CHARGE_RETRY;
/*
@@ -2343,10 +2517,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
again:
if (*ptr) { /* css should be a valid one */
memcg = *ptr;
- VM_BUG_ON(css_is_removed(&memcg->css));
if (mem_cgroup_is_root(memcg))
goto done;
- if (nr_pages == 1 && consume_stock(memcg))
+ if (consume_stock(memcg, nr_pages))
goto done;
css_get(&memcg->css);
} else {
@@ -2371,7 +2544,7 @@ again:
rcu_read_unlock();
goto done;
}
- if (nr_pages == 1 && consume_stock(memcg)) {
+ if (consume_stock(memcg, nr_pages)) {
/*
* It seems dangerous to access memcg without css_get().
* But considering how consume_stock works, it's not
@@ -2406,7 +2579,8 @@ again:
nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
+ ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
+ oom_check);
switch (ret) {
case CHARGE_OK:
break;
@@ -2483,9 +2657,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
/*
* A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller must check css_is_removed() or some if
- * it's concern. (dropping refcnt from swap can be called against removed
- * memcg.)
+ * rcu_read_lock(). The caller is responsible for calling css_tryget if
+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
+ * called against removed memcg.)
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
@@ -2599,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
memcg_check_events(memcg, page);
}
+static DEFINE_MUTEX(set_limit_mutex);
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
+{
+ return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
+ (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
+}
+
+/*
+ * This is a bit cumbersome, but it is rarely used and avoids a backpointer
+ * in the memcg_cache_params struct.
+ */
+static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
+{
+ struct kmem_cache *cachep;
+
+ VM_BUG_ON(p->is_root_cache);
+ cachep = p->root_cache;
+ return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+}
+
+#ifdef CONFIG_SLABINFO
+static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ struct memcg_cache_params *params;
+
+ if (!memcg_can_account_kmem(memcg))
+ return -EIO;
+
+ print_slabinfo_header(m);
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list)
+ cache_show(memcg_params_to_cache(params), m);
+ mutex_unlock(&memcg->slab_caches_mutex);
+
+ return 0;
+}
+#endif
+
+static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
+{
+ struct res_counter *fail_res;
+ struct mem_cgroup *_memcg;
+ int ret = 0;
+ bool may_oom;
+
+ ret = res_counter_charge(&memcg->kmem, size, &fail_res);
+ if (ret)
+ return ret;
+
+ /*
+ * Conditions under which we can wait for the oom_killer. Those are
+ * the same conditions tested by the core page allocator
+ */
+ may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
+
+ _memcg = memcg;
+ ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
+ &_memcg, may_oom);
+
+ if (ret == -EINTR) {
+ /*
+ * __mem_cgroup_try_charge() chose to bypass to root due to
+ * OOM kill or fatal signal. Since our only options are to
+ * either fail the allocation or charge it to this cgroup, do
+ * it as a temporary condition. But we can't fail. From a
+ * kmem/slab perspective, the cache has already been selected,
+ * by mem_cgroup_kmem_get_cache(), so it is too late to change
+ * our minds.
+ *
+ * This condition will only trigger if the task entered
+ * memcg_charge_kmem in a sane state, but was OOM-killed during
+ * __mem_cgroup_try_charge() above. Tasks that were already
+ * dying when the allocation triggers should have been already
+ * directed to the root cgroup in memcontrol.h
+ */
+ res_counter_charge_nofail(&memcg->res, size, &fail_res);
+ if (do_swap_account)
+ res_counter_charge_nofail(&memcg->memsw, size,
+ &fail_res);
+ ret = 0;
+ } else if (ret)
+ res_counter_uncharge(&memcg->kmem, size);
+
+ return ret;
+}
+
+static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
+{
+ res_counter_uncharge(&memcg->res, size);
+ if (do_swap_account)
+ res_counter_uncharge(&memcg->memsw, size);
+
+ /* Not down to 0 */
+ if (res_counter_uncharge(&memcg->kmem, size))
+ return;
+
+ if (memcg_kmem_test_and_clear_dead(memcg))
+ mem_cgroup_put(memcg);
+}
+
+void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
+{
+ if (!memcg)
+ return;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+/*
+ * helper for accessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+int memcg_cache_id(struct mem_cgroup *memcg)
+{
+ return memcg ? memcg->kmemcg_id : -1;
+}
+
+/*
+ * This ends up being protected by the set_limit mutex, during normal
+ * operation, because that is its main call site.
+ *
+ * But when we create a new cache, we can call this as well if its parent
+ * is kmem-limited. That will have to hold set_limit_mutex as well.
+ */
+int memcg_update_cache_sizes(struct mem_cgroup *memcg)
+{
+ int num, ret;
+
+ num = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (num < 0)
+ return num;
+ /*
+	 * After this point, kmem_accounted (which we test atomically at
+	 * the beginning of this conditional) is no longer 0. This
+	 * guarantees that only one process will set the following boolean
+ * to true. We don't need test_and_set because we're protected
+ * by the set_limit_mutex anyway.
+ */
+ memcg_kmem_set_activated(memcg);
+
+ ret = memcg_update_all_caches(num+1);
+ if (ret) {
+ ida_simple_remove(&kmem_limited_groups, num);
+ memcg_kmem_clear_activated(memcg);
+ return ret;
+ }
+
+ memcg->kmemcg_id = num;
+ INIT_LIST_HEAD(&memcg->memcg_slab_caches);
+ mutex_init(&memcg->slab_caches_mutex);
+ return 0;
+}
+
+static size_t memcg_caches_array_size(int num_groups)
+{
+ ssize_t size;
+ if (num_groups <= 0)
+ return 0;
+
+ size = 2 * num_groups;
+ if (size < MEMCG_CACHES_MIN_SIZE)
+ size = MEMCG_CACHES_MIN_SIZE;
+ else if (size > MEMCG_CACHES_MAX_SIZE)
+ size = MEMCG_CACHES_MAX_SIZE;
+
+ return size;
+}
+
+/*
+ * We should update the current array size iff all cache updates succeed. This
+ * can only be done from the slab side. The slab mutex needs to be held when
+ * calling this.
+ */
+void memcg_update_array_size(int num)
+{
+ if (num > memcg_limited_groups_array_size)
+ memcg_limited_groups_array_size = memcg_caches_array_size(num);
+}
+
+int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
+{
+ struct memcg_cache_params *cur_params = s->memcg_params;
+
+ VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+
+ if (num_groups > memcg_limited_groups_array_size) {
+ int i;
+ ssize_t size = memcg_caches_array_size(num_groups);
+
+ size *= sizeof(void *);
+ size += sizeof(struct memcg_cache_params);
+
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
+ if (!s->memcg_params) {
+ s->memcg_params = cur_params;
+ return -ENOMEM;
+ }
+
+ s->memcg_params->is_root_cache = true;
+
+ /*
+		 * There is a chance it will be bigger than
+		 * memcg_limited_groups_array_size if we failed an allocation
+		 * in a cache, in which case all caches updated before it will
+		 * have a bigger array.
+ *
+ * But if that is the case, the data after
+ * memcg_limited_groups_array_size is certainly unused
+ */
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
+ if (!cur_params->memcg_caches[i])
+ continue;
+ s->memcg_params->memcg_caches[i] =
+ cur_params->memcg_caches[i];
+ }
+
+ /*
+ * Ideally, we would wait until all caches succeed, and only
+ * then free the old one. But this is not worth the extra
+ * pointer per-cache we'd have to have for this.
+ *
+ * It is not a big deal if some caches are left with a size
+ * bigger than the others. And all updates will reset this
+ * anyway.
+ */
+ kfree(cur_params);
+ }
+ return 0;
+}
+
+int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
+ struct kmem_cache *root_cache)
+{
+ size_t size = sizeof(struct memcg_cache_params);
+
+ if (!memcg_kmem_enabled())
+ return 0;
+
+ if (!memcg)
+ size += memcg_limited_groups_array_size * sizeof(void *);
+
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
+ if (!s->memcg_params)
+ return -ENOMEM;
+
+ if (memcg) {
+ s->memcg_params->memcg = memcg;
+ s->memcg_params->root_cache = root_cache;
+ }
+ return 0;
+}
+
+void memcg_release_cache(struct kmem_cache *s)
+{
+ struct kmem_cache *root;
+ struct mem_cgroup *memcg;
+ int id;
+
+ /*
+ * This happens, for instance, when a root cache goes away before we
+ * add any memcg.
+ */
+ if (!s->memcg_params)
+ return;
+
+ if (s->memcg_params->is_root_cache)
+ goto out;
+
+ memcg = s->memcg_params->memcg;
+ id = memcg_cache_id(memcg);
+
+ root = s->memcg_params->root_cache;
+ root->memcg_params->memcg_caches[id] = NULL;
+ mem_cgroup_put(memcg);
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_del(&s->memcg_params->list);
+ mutex_unlock(&memcg->slab_caches_mutex);
+
+out:
+ kfree(s->memcg_params);
+}
+
+/*
+ * During the creation of a new cache, we need to disable our accounting
+ * mechanism altogether. This is true even if we are not creating, but rather
+ * just enqueueing new caches to be created.
+ *
+ * This is because that process will trigger allocations; some visible, like
+ * explicit kmallocs to auxiliary data structures, name strings and internal
+ * cache structures; some well concealed, like INIT_WORK() that can allocate
+ * objects during debug.
+ *
+ * If any allocation happens during memcg_kmem_get_cache, we will recurse back
+ * to it. This may not be a bounded recursion: since the first cache creation
+ * failed to complete (waiting on the allocation), we'll just try to create the
+ * cache again, failing at the same point.
+ *
+ * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
+ * memcg_kmem_skip_account. So we enclose anything that might allocate memory
+ * inside the following two functions.
+ */
+static inline void memcg_stop_kmem_account(void)
+{
+ VM_BUG_ON(!current->mm);
+ current->memcg_kmem_skip_account++;
+}
+
+static inline void memcg_resume_kmem_account(void)
+{
+ VM_BUG_ON(!current->mm);
+ current->memcg_kmem_skip_account--;
+}
+
+static void kmem_cache_destroy_work_func(struct work_struct *w)
+{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *p;
+
+ p = container_of(w, struct memcg_cache_params, destroy);
+
+ cachep = memcg_params_to_cache(p);
+
+ /*
+ * If we get down to 0 after shrink, we could delete right away.
+ * However, memcg_release_pages() already puts us back in the workqueue
+ * in that case. If we proceed deleting, we'll get a dangling
+ * reference, and removing the object from the workqueue in that case
+ * is unnecessary complication. We are not a fast path.
+ *
+ * Note that this case is fundamentally different from racing with
+	 * shrink_slab(): if mem_cgroup_destroy_cache() is called in
+	 * kmem_cache_shrink, not only would we be reinserting a dead cache
+ * into the queue, but doing so from inside the worker racing to
+ * destroy it.
+ *
+ * So if we aren't down to zero, we'll just schedule a worker and try
+ * again
+ */
+ if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
+ kmem_cache_shrink(cachep);
+ if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
+ return;
+ } else
+ kmem_cache_destroy(cachep);
+}
+
+void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
+{
+ if (!cachep->memcg_params->dead)
+ return;
+
+ /*
+ * There are many ways in which we can get here.
+ *
+ * We can get to a memory-pressure situation while the delayed work is
+ * still pending to run. The vmscan shrinkers can then release all
+ * cache memory and get us to destruction. If this is the case, we'll
+ * be executed twice, which is a bug (the second time will execute over
+ * bogus data). In this case, cancelling the work should be fine.
+ *
+ * But we can also get here from the worker itself, if
+ * kmem_cache_shrink is enough to shake all the remaining objects and
+ * get the page count to 0. In this case, we'll deadlock if we try to
+ * cancel the work (the worker runs with an internal lock held, which
+ * is the same lock we would hold for cancel_work_sync().)
+ *
+ * Since we can't possibly know who got us here, just refrain from
+ * running if there is already work pending
+ */
+ if (work_pending(&cachep->memcg_params->destroy))
+ return;
+ /*
+ * We have to defer the actual destroying to a workqueue, because
+ * we might currently be in a context that cannot sleep.
+ */
+ schedule_work(&cachep->memcg_params->destroy);
+}
+
+static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
+{
+ char *name;
+ struct dentry *dentry;
+
+ rcu_read_lock();
+ dentry = rcu_dereference(memcg->css.cgroup->dentry);
+ rcu_read_unlock();
+
+ BUG_ON(dentry == NULL);
+
+ name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
+ memcg_cache_id(memcg), dentry->d_name.name);
+
+ return name;
+}
+
+static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
+ struct kmem_cache *s)
+{
+ char *name;
+ struct kmem_cache *new;
+
+ name = memcg_cache_name(memcg, s);
+ if (!name)
+ return NULL;
+
+ new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
+ (s->flags & ~SLAB_PANIC), s->ctor, s);
+
+ if (new)
+ new->allocflags |= __GFP_KMEMCG;
+
+ kfree(name);
+ return new;
+}
+
+/*
+ * This lock protects updaters, not readers. We want readers to be as fast as
+ * they can, and they will either see NULL or a valid cache value. Our model
+ * allows them to see NULL, in which case the root memcg will be selected.
+ *
+ * We need this lock because multiple allocations to the same cache may span
+ * more than one worker. Only one of them can create the cache.
+ */
+static DEFINE_MUTEX(memcg_cache_mutex);
+static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ struct kmem_cache *new_cachep;
+ int idx;
+
+ BUG_ON(!memcg_can_account_kmem(memcg));
+
+ idx = memcg_cache_id(memcg);
+
+ mutex_lock(&memcg_cache_mutex);
+ new_cachep = cachep->memcg_params->memcg_caches[idx];
+ if (new_cachep)
+ goto out;
+
+ new_cachep = kmem_cache_dup(memcg, cachep);
+ if (new_cachep == NULL) {
+ new_cachep = cachep;
+ goto out;
+ }
+
+ mem_cgroup_get(memcg);
+ atomic_set(&new_cachep->memcg_params->nr_pages , 0);
+
+ cachep->memcg_params->memcg_caches[idx] = new_cachep;
+ /*
+	 * The readers won't take the lock, so make sure everybody sees the
+	 * updated value and won't queue the creation again for no reason.
+ */
+ wmb();
+out:
+ mutex_unlock(&memcg_cache_mutex);
+ return new_cachep;
+}
+
+void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+{
+ struct kmem_cache *c;
+ int i;
+
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
+ return;
+
+ /*
+ * If the cache is being destroyed, we trust that there is no one else
+ * requesting objects from it. Even if there are, the sanity checks in
+	 * kmem_cache_destroy should catch this ill case.
+ *
+ * Still, we don't want anyone else freeing memcg_caches under our
+ * noses, which can happen if a new memcg comes to life. As usual,
+ * we'll take the set_limit_mutex to protect ourselves against this.
+ */
+ mutex_lock(&set_limit_mutex);
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
+ c = s->memcg_params->memcg_caches[i];
+ if (!c)
+ continue;
+
+ /*
+ * We will now manually delete the caches, so to avoid races
+ * we need to cancel all pending destruction workers and
+ * proceed with destruction ourselves.
+ *
+ * kmem_cache_destroy() will call kmem_cache_shrink internally,
+ * and that could spawn the workers again: it is likely that
+		 * the cache still has active pages until this very moment.
+ * This would lead us back to mem_cgroup_destroy_cache.
+ *
+ * But that will not execute at all if the "dead" flag is not
+ * set, so flip it down to guarantee we are in control.
+ */
+ c->memcg_params->dead = false;
+ cancel_work_sync(&c->memcg_params->destroy);
+ kmem_cache_destroy(c);
+ }
+ mutex_unlock(&set_limit_mutex);
+}
+
+struct create_work {
+ struct mem_cgroup *memcg;
+ struct kmem_cache *cachep;
+ struct work_struct work;
+};
+
+static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+ struct kmem_cache *cachep;
+ struct memcg_cache_params *params;
+
+ if (!memcg_kmem_is_active(memcg))
+ return;
+
+ mutex_lock(&memcg->slab_caches_mutex);
+ list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
+ cachep = memcg_params_to_cache(params);
+ cachep->memcg_params->dead = true;
+ INIT_WORK(&cachep->memcg_params->destroy,
+ kmem_cache_destroy_work_func);
+ schedule_work(&cachep->memcg_params->destroy);
+ }
+ mutex_unlock(&memcg->slab_caches_mutex);
+}
+
+static void memcg_create_cache_work_func(struct work_struct *w)
+{
+ struct create_work *cw;
+
+ cw = container_of(w, struct create_work, work);
+ memcg_create_kmem_cache(cw->memcg, cw->cachep);
+ /* Drop the reference gotten when we enqueued. */
+ css_put(&cw->memcg->css);
+ kfree(cw);
+}
+
+/*
+ * Enqueue the creation of a per-memcg kmem_cache.
+ * Called with rcu_read_lock.
+ */
+static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ struct create_work *cw;
+
+ cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
+ if (cw == NULL)
+ return;
+
+ /* The corresponding put will be done in the workqueue. */
+ if (!css_tryget(&memcg->css)) {
+ kfree(cw);
+ return;
+ }
+
+ cw->memcg = memcg;
+ cw->cachep = cachep;
+
+ INIT_WORK(&cw->work, memcg_create_cache_work_func);
+ schedule_work(&cw->work);
+}
+
+static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
+{
+ /*
+ * We need to stop accounting when we kmalloc, because if the
+ * corresponding kmalloc cache is not yet created, the first allocation
+ * in __memcg_create_cache_enqueue will recurse.
+ *
+ * However, it is better to enclose the whole function. Depending on
+ * the debugging options enabled, INIT_WORK(), for instance, can
+ * trigger an allocation. This too, will make us recurse. Because at
+ * this point we can't allow ourselves back into memcg_kmem_get_cache,
+ * the safest choice is to do it like this, wrapping the whole function.
+ */
+ memcg_stop_kmem_account();
+ __memcg_create_cache_enqueue(memcg, cachep);
+ memcg_resume_kmem_account();
+}
+/*
+ * Return the kmem_cache we're supposed to use for a slab allocation.
+ * We try to use the current memcg's version of the cache.
+ *
+ * If the cache does not exist yet and we are the first user of it,
+ * we either create it immediately, if possible, or create it asynchronously
+ * in a workqueue.
+ * In the latter case, we will let the current allocation go through with
+ * the original cache.
+ *
+ * Can't be called in interrupt context or from kernel threads.
+ * This function needs to be called with rcu_read_lock() held.
+ */
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
+ gfp_t gfp)
+{
+ struct mem_cgroup *memcg;
+ int idx;
+
+ VM_BUG_ON(!cachep->memcg_params);
+ VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+
+ if (!current->mm || current->memcg_kmem_skip_account)
+ return cachep;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+ rcu_read_unlock();
+
+ if (!memcg_can_account_kmem(memcg))
+ return cachep;
+
+ idx = memcg_cache_id(memcg);
+
+ /*
+	 * barrier to make sure we're always seeing the up-to-date value. The
+ * code updating memcg_caches will issue a write barrier to match this.
+ */
+ read_barrier_depends();
+ if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
+ /*
+ * If we are in a safe context (can wait, and not in interrupt
+		 * context), we could be predictable and return right away.
+ * This would guarantee that the allocation being performed
+ * already belongs in the new cache.
+ *
+		 * However, there are some clashes that can arise from locking.
+ * For instance, because we acquire the slab_mutex while doing
+ * kmem_cache_dup, this means no further allocation could happen
+ * with the slab_mutex held.
+ *
+		 * Also, because cache creation issues get_online_cpus(), this
+ * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
+ * that ends up reversed during cpu hotplug. (cpuset allocates
+ * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
+ * better to defer everything.
+ */
+ memcg_create_cache_enqueue(memcg, cachep);
+ return cachep;
+ }
+
+ return cachep->memcg_params->memcg_caches[idx];
+}
+EXPORT_SYMBOL(__memcg_kmem_get_cache);
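The slab-side consumer of this helper is not part of this hunk. As a rough, non-authoritative sketch of the intended call pattern (the function name example_accounted_alloc and its placement are illustrative, not the actual slab/memcg glue), an allocation fast path would route through the per-memcg cache roughly like this:

/*
 * Illustrative sketch only: pick the per-memcg clone of a root cache
 * when kmem accounting is compiled in and active, then allocate from
 * whichever cache was selected. The real hook lives in the slab
 * allocators, not in this patch hunk.
 */
static void *example_accounted_alloc(struct kmem_cache *root_cachep, gfp_t flags)
{
	struct kmem_cache *cachep = root_cachep;

	if (memcg_kmem_enabled())
		cachep = __memcg_kmem_get_cache(root_cachep, flags);

	return kmem_cache_alloc(cachep, flags);
}

Since objects are handed out from the clone, frees naturally come back through the clone's own slab pages, which is what lets the destruction worker above drain them later.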
+
+/*
+ * We need to verify if the allocation against current->mm->owner's memcg is
+ * possible for the given order. But the page is not allocated yet, so we'll
+ * need a further commit step to do the final arrangements.
+ *
+ * It is possible for the task to switch cgroups in the meantime, so at
+ * commit time, we can't rely on task conversion any longer. We'll then use
+ * the handle argument to return to the caller which cgroup we should commit
+ * against. We could also return the memcg directly and avoid the pointer
+ * passing, but a boolean return value gives better semantics considering
+ * the compiled-out case as well.
+ *
+ * Returning true means the allocation is possible.
+ */
+bool
+__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ *_memcg = NULL;
+ memcg = try_get_mem_cgroup_from_mm(current->mm);
+
+ /*
+ * very rare case described in mem_cgroup_from_task. Unfortunately there
+ * isn't much we can do without complicating this too much, and it would
+ * be gfp-dependent anyway. Just let it go
+ */
+ if (unlikely(!memcg))
+ return true;
+
+ if (!memcg_can_account_kmem(memcg)) {
+ css_put(&memcg->css);
+ return true;
+ }
+
+ ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
+ if (!ret)
+ *_memcg = memcg;
+
+ css_put(&memcg->css);
+ return (ret == 0);
+}
+
+void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
+ int order)
+{
+ struct page_cgroup *pc;
+
+ VM_BUG_ON(mem_cgroup_is_root(memcg));
+
+ /* The page allocation failed. Revert */
+ if (!page) {
+ memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+ return;
+ }
+
+ pc = lookup_page_cgroup(page);
+ lock_page_cgroup(pc);
+ pc->mem_cgroup = memcg;
+ SetPageCgroupUsed(pc);
+ unlock_page_cgroup(pc);
+}
+
+void __memcg_kmem_uncharge_pages(struct page *page, int order)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup(page);
+ /*
+ * Fast unlocked return. Theoretically might have changed, have to
+ * check again after locking.
+ */
+ if (!PageCgroupUsed(pc))
+ return;
+
+ lock_page_cgroup(pc);
+ if (PageCgroupUsed(pc)) {
+ memcg = pc->mem_cgroup;
+ ClearPageCgroupUsed(pc);
+ }
+ unlock_page_cgroup(pc);
+
+ /*
+	 * We trust that the allocation is a valid, accounted one only if
+	 * there is a memcg associated with the page.
+ */
+ if (!memcg)
+ return;
+
+ VM_BUG_ON(mem_cgroup_is_root(memcg));
+ memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
+}
+#else
+static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
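For orientation, the three __memcg_kmem_* entry points above are meant to bracket a kmem page allocation as charge, then allocate, then commit, with a matching uncharge on free. The wrappers the page allocator actually uses are not in this hunk; the following is only a sketch of the expected ordering, with hypothetical function names:

/* Hypothetical caller showing the expected charge/commit/uncharge ordering. */
static struct page *example_kmemcg_alloc_pages(gfp_t gfp, int order)
{
	struct mem_cgroup *memcg = NULL;
	struct page *page;

	/* Reserve the charge first; memcg stays NULL if accounting is off. */
	if (!__memcg_kmem_newpage_charge(gfp, &memcg, order))
		return NULL;

	page = alloc_pages(gfp, order);

	/* Bind the page to memcg, or roll the charge back if !page. */
	if (memcg)
		__memcg_kmem_commit_charge(page, memcg, order);

	return page;
}

static void example_kmemcg_free_pages(struct page *page, int order)
{
	/* A no-op for pages that were never committed to a memcg. */
	__memcg_kmem_uncharge_pages(page, order);
	__free_pages(page, order);
}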
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
@@ -2682,13 +3616,6 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, anon, nr_pages);
- /*
- * We charges against "to" which may not have any tasks. Then, "to"
- * can be under rmdir(). But in current implementation, caller of
- * this function is just force_empty() and move charge, so it's
- * guaranteed that "to" is never removed. So, we don't check rmdir
- * status here.
- */
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
@@ -2702,10 +3629,27 @@ out:
return ret;
}
-/*
- * move charges to its parent.
+/**
+ * mem_cgroup_move_parent - moves page to the parent group
+ * @page: the page to move
+ * @pc: page_cgroup of the page
+ * @child: page's cgroup
+ *
+ * move charges to its parent or the root cgroup if the group has no
+ * parent (aka use_hierarchy==0).
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
+ * mem_cgroup_move_account fails) the failure is always temporary and
+ * it signals a race with a page removal/uncharge or migration. In the
+ * first case the page is on the way out and it will vanish from the LRU
+ * on the next attempt and the call should be retried later.
+ * Isolation from the LRU fails only if the page has been isolated from
+ * the LRU since we looked at it and that usually means either global
+ * reclaim or migration going on. The page will either get back to the
+ * LRU or vanish.
+ * Finally, mem_cgroup_move_account fails only if the page got uncharged
+ * (!PageCgroupUsed) or moved to a different group. The page will
+ * disappear in the next attempt.
*/
-
static int mem_cgroup_move_parent(struct page *page,
struct page_cgroup *pc,
struct mem_cgroup *child)
@@ -2715,9 +3659,7 @@ static int mem_cgroup_move_parent(struct page *page,
unsigned long uninitialized_var(flags);
int ret;
- /* Is ROOT ? */
- if (mem_cgroup_is_root(child))
- return -EINVAL;
+ VM_BUG_ON(mem_cgroup_is_root(child));
ret = -EBUSY;
if (!get_page_unless_zero(page))
@@ -2734,8 +3676,10 @@ static int mem_cgroup_move_parent(struct page *page,
if (!parent)
parent = root_mem_cgroup;
- if (nr_pages > 1)
+ if (nr_pages > 1) {
+ VM_BUG_ON(!PageTransHuge(page));
flags = compound_lock_irqsave(page);
+ }
ret = mem_cgroup_move_account(page, nr_pages,
pc, child, parent);
@@ -2877,7 +3821,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
return;
if (!memcg)
return;
- cgroup_exclude_rmdir(&memcg->css);
__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
/*
@@ -2891,12 +3834,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
swp_entry_t ent = {.val = page_private(page)};
mem_cgroup_uncharge_swap(ent);
}
- /*
- * At swapin, we may charge account against cgroup which has no tasks.
- * So, rmdir()->pre_destroy() can be called while we do this charge.
- * In that case, we need to call pre_destroy() again. check it here.
- */
- cgroup_release_and_wakeup_rmdir(&memcg->css);
}
void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3261,15 +4198,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
struct mem_cgroup **memcgp)
{
struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
struct page_cgroup *pc;
enum charge_type ctype;
*memcgp = NULL;
- VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return;
+ if (PageTransHuge(page))
+ nr_pages <<= compound_order(page);
+
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
@@ -3331,7 +4271,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
* charged to the res_counter since we plan on replacing the
* old one and only one page is going to be left afterwards.
*/
- __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
+ __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
}
/* remove redundant charge if migration failed*/
@@ -3344,8 +4284,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
if (!memcg)
return;
- /* blocks rmdir() */
- cgroup_exclude_rmdir(&memcg->css);
+
if (!migration_ok) {
used = oldpage;
unused = newpage;
@@ -3379,13 +4318,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
*/
if (anon)
mem_cgroup_uncharge_page(used);
- /*
- * At migration, we may charge account against cgroup which has no
- * tasks.
- * So, rmdir()->pre_destroy() can be called while we do this charge.
- * In that case, we need to call pre_destroy() again. check it here.
- */
- cgroup_release_and_wakeup_rmdir(&memcg->css);
}
/*
@@ -3463,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page)
}
#endif
-static DEFINE_MUTEX(set_limit_mutex);
-
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
@@ -3685,30 +4615,32 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return nr_reclaimed;
}
-/*
+/**
+ * mem_cgroup_force_empty_list - clears LRU of a group
+ * @memcg: group to clear
+ * @node: NUMA node
+ * @zid: zone id
+ * @lru: lru to clear
+ *
* Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - it just removes the page_cgroups.
- * Returns true if some page_cgroups were not freed, indicating that the caller
- * must retry this operation.
+ * reclaim the pages themselves - pages are moved to the parent (or root)
+ * group.
*/
-static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
int node, int zid, enum lru_list lru)
{
- struct mem_cgroup_per_zone *mz;
- unsigned long flags, loop;
+ struct lruvec *lruvec;
+ unsigned long flags;
struct list_head *list;
struct page *busy;
struct zone *zone;
zone = &NODE_DATA(node)->node_zones[zid];
- mz = mem_cgroup_zoneinfo(memcg, node, zid);
- list = &mz->lruvec.lists[lru];
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ list = &lruvec->lists[lru];
- loop = mz->lru_size[lru];
- /* give some margin against EBUSY etc...*/
- loop += 256;
busy = NULL;
- while (loop--) {
+ do {
struct page_cgroup *pc;
struct page *page;
@@ -3734,76 +4666,80 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
cond_resched();
} else
busy = NULL;
- }
- return !list_empty(list);
+ } while (!list_empty(list));
}
/*
- * make mem_cgroup's charge to be 0 if there is no task.
+ * make mem_cgroup's charge be 0 if there is no task by moving
+ * all the charges and pages to the parent.
* This enables deleting this mem_cgroup.
+ *
+ * Caller is responsible for holding css reference on the memcg.
*/
-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
{
- int ret;
- int node, zid, shrink;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct cgroup *cgrp = memcg->css.cgroup;
-
- css_get(&memcg->css);
+ int node, zid;
+ u64 usage;
- shrink = 0;
- /* should free all ? */
- if (free_all)
- goto try_to_free;
-move_account:
do {
- ret = -EBUSY;
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
- goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
drain_all_stock_sync(memcg);
- ret = 0;
mem_cgroup_start_move(memcg);
- for_each_node_state(node, N_HIGH_MEMORY) {
- for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
+ for_each_node_state(node, N_MEMORY) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
enum lru_list lru;
for_each_lru(lru) {
- ret = mem_cgroup_force_empty_list(memcg,
+ mem_cgroup_force_empty_list(memcg,
node, zid, lru);
- if (ret)
- break;
}
}
- if (ret)
- break;
}
mem_cgroup_end_move(memcg);
memcg_oom_recover(memcg);
cond_resched();
- /* "ret" should also be checked to ensure all lists are empty. */
- } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
-out:
- css_put(&memcg->css);
- return ret;
-try_to_free:
+ /*
+		 * Kernel memory may not necessarily be trackable to a specific
+		 * process, so such pages are not migrated, and therefore we
+		 * can't expect the usage to drop to 0 here.
+ * Having res filled up with kmem only is enough.
+ *
+ * This is a safety check because mem_cgroup_force_empty_list
+ * could have raced with mem_cgroup_replace_page_cache callers
+ * so the lru seemed empty but the page could have been added
+ * right after the check. RES_USAGE should be safe as we always
+ * charge before adding to the LRU.
+ */
+ usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
+ res_counter_read_u64(&memcg->kmem, RES_USAGE);
+ } while (usage > 0);
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible and moves
+ * the rest to the parent.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+ int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct cgroup *cgrp = memcg->css.cgroup;
+
/* returns EBUSY if there is a task or if we come here twice. */
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
- ret = -EBUSY;
- goto out;
- }
+ if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+ return -EBUSY;
+
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
/* try to free all pages in this cgroup */
- shrink = 1;
while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
int progress;
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
+ if (signal_pending(current))
+ return -EINTR;
+
progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
false);
if (!progress) {
@@ -3814,13 +4750,23 @@ try_to_free:
}
lru_add_drain();
- /* try move_account...there may be some *locked* pages. */
- goto move_account;
+ mem_cgroup_reparent_charges(memcg);
+
+ return 0;
}
static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
- return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ int ret;
+
+ if (mem_cgroup_is_root(memcg))
+ return -EINVAL;
+ css_get(&memcg->css);
+ ret = mem_cgroup_force_empty(memcg);
+ css_put(&memcg->css);
+
+ return ret;
}
@@ -3911,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
char str[64];
u64 val;
- int type, name, len;
+ int name, len;
+ enum res_type type;
type = MEMFILE_TYPE(cft->private);
name = MEMFILE_ATTR(cft->private);
@@ -3932,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
else
val = res_counter_read_u64(&memcg->memsw, name);
break;
+ case _KMEM:
+ val = res_counter_read_u64(&memcg->kmem, name);
+ break;
default:
BUG();
}
@@ -3939,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
return simple_read_from_buffer(buf, nbytes, ppos, str, len);
}
+
+static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
+{
+ int ret = -EINVAL;
+#ifdef CONFIG_MEMCG_KMEM
+ bool must_inc_static_branch = false;
+
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ /*
+ * For simplicity, we won't allow this to be disabled. It also can't
+ * be changed if the cgroup has children already, or if tasks had
+ * already joined.
+ *
+ * If tasks join before we set the limit, a person looking at
+ * kmem.usage_in_bytes will have no way to determine when it took
+ * place, which makes the value quite meaningless.
+ *
+ * After it first became limited, changes in the value of the limit are
+ * of course permitted.
+ *
+ * Taking the cgroup_lock is really offensive, but it is so far the only
+ * way to guarantee that no children will appear. There are plenty of
+ * other offenders, and they should all go away. Fine grained locking
+ * is probably the way to go here. When we are fully hierarchical, we
+ * can also get rid of the use_hierarchy check.
+ */
+ cgroup_lock();
+ mutex_lock(&set_limit_mutex);
+ if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
+ if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
+ !list_empty(&cont->children))) {
+ ret = -EBUSY;
+ goto out;
+ }
+ ret = res_counter_set_limit(&memcg->kmem, val);
+ VM_BUG_ON(ret);
+
+ ret = memcg_update_cache_sizes(memcg);
+ if (ret) {
+ res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
+ goto out;
+ }
+ must_inc_static_branch = true;
+ /*
+ * kmem charges can outlive the cgroup. In the case of slab
+		 * pages, for instance, a page may contain objects from various
+ * processes, so it is unfeasible to migrate them away. We
+ * need to reference count the memcg because of that.
+ */
+ mem_cgroup_get(memcg);
+ } else
+ ret = res_counter_set_limit(&memcg->kmem, val);
+out:
+ mutex_unlock(&set_limit_mutex);
+ cgroup_unlock();
+
+ /*
+ * We are by now familiar with the fact that we can't inc the static
+ * branch inside cgroup_lock. See disarm functions for details. A
+ * worker here is overkill, but also wrong: After the limit is set, we
+ * must start accounting right away. Since this operation can't fail,
+ * we can safely defer it to here - no rollback will be needed.
+ *
+ * The boolean used to control this is also safe, because
+ * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
+	 * able to set it to true.
+ */
+ if (must_inc_static_branch) {
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+ /*
+ * setting the active bit after the inc will guarantee no one
+ * starts accounting before all call sites are patched
+ */
+ memcg_kmem_set_active(memcg);
+ }
+
+#endif
+ return ret;
+}
+
+static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+{
+ int ret = 0;
+ struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ goto out;
+
+ memcg->kmem_account_flags = parent->kmem_account_flags;
+#ifdef CONFIG_MEMCG_KMEM
+ /*
+	 * When that happens, we need to disable the static branch only on those
+	 * memcgs that enabled it. To achieve this, we would be forced to
+	 * complicate the code by keeping track of which memcgs were the ones
+	 * that actually enabled limits, and which ones got it from their
+	 * parents.
+ *
+ * It is a lot simpler just to do static_key_slow_inc() on every child
+ * that is accounted.
+ */
+ if (!memcg_kmem_is_active(memcg))
+ goto out;
+
+ /*
+	 * destroy(), called if we fail, will issue static_key_slow_dec() and
+ * mem_cgroup_put() if kmem is enabled. We have to either call them
+ * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
+ * this more consistent, since it always leads to the same destroy path
+ */
+ mem_cgroup_get(memcg);
+ static_key_slow_inc(&memcg_kmem_enabled_key);
+
+ mutex_lock(&set_limit_mutex);
+ ret = memcg_update_cache_sizes(memcg);
+ mutex_unlock(&set_limit_mutex);
+#endif
+out:
+ return ret;
+}
+
/*
* The user of this function is...
* RES_LIMIT.
@@ -3947,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
const char *buffer)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- int type, name;
+ enum res_type type;
+ int name;
unsigned long long val;
int ret;
@@ -3969,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
break;
if (type == _MEM)
ret = mem_cgroup_resize_limit(memcg, val);
- else
+ else if (type == _MEMSWAP)
ret = mem_cgroup_resize_memsw_limit(memcg, val);
+ else if (type == _KMEM)
+ ret = memcg_update_kmem_limit(cont, val);
+ else
+ return -EINVAL;
break;
case RES_SOFT_LIMIT:
ret = res_counter_memparse_write_strategy(buffer, &val);
@@ -4023,7 +5097,8 @@ out:
static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- int type, name;
+ int name;
+ enum res_type type;
type = MEMFILE_TYPE(event);
name = MEMFILE_ATTR(event);
@@ -4035,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
case RES_MAX_USAGE:
if (type == _MEM)
res_counter_reset_max(&memcg->res);
- else
+ else if (type == _MEMSWAP)
res_counter_reset_max(&memcg->memsw);
+ else if (type == _KMEM)
+ res_counter_reset_max(&memcg->kmem);
+ else
+ return -EINVAL;
break;
case RES_FAILCNT:
if (type == _MEM)
res_counter_reset_failcnt(&memcg->res);
- else
+ else if (type == _MEMSWAP)
res_counter_reset_failcnt(&memcg->memsw);
+ else if (type == _KMEM)
+ res_counter_reset_failcnt(&memcg->kmem);
+ else
+ return -EINVAL;
break;
}
@@ -4093,7 +5176,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
seq_printf(m, "total=%lu", total_nr);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
seq_printf(m, " N%d=%lu", nid, node_nr);
}
@@ -4101,7 +5184,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
seq_printf(m, "file=%lu", file_nr);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_FILE);
seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4110,7 +5193,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
seq_printf(m, "anon=%lu", anon_nr);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_ANON);
seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4119,7 +5202,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
seq_printf(m, "unevictable=%lu", unevictable_nr);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
BIT(LRU_UNEVICTABLE));
seq_printf(m, " N%d=%lu", nid, node_nr);
@@ -4359,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
u64 threshold, usage;
int i, size, ret;
@@ -4442,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
u64 usage;
int i, j, size;
@@ -4520,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *event;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
event = kmalloc(sizeof(*event), GFP_KERNEL);
@@ -4545,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
struct mem_cgroup_eventfd_list *ev, *tmp;
- int type = MEMFILE_TYPE(cft->private);
+ enum res_type type = MEMFILE_TYPE(cft->private);
BUG_ON(type != _OOM_TYPE);
@@ -4604,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
#ifdef CONFIG_MEMCG_KMEM
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
+ int ret;
+
+ memcg->kmemcg_id = -1;
+ ret = memcg_propagate_kmem(memcg);
+ if (ret)
+ return ret;
+
return mem_cgroup_sockets_init(memcg, ss);
};
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
mem_cgroup_sockets_destroy(memcg);
+
+ memcg_kmem_mark_dead(memcg);
+
+ if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
+ return;
+
+ /*
+ * Charges already down to 0, undo mem_cgroup_get() done in the charge
+ * path here, being careful not to race with memcg_uncharge_kmem: it is
+ * possible that the charges went down to 0 between mark_dead and the
+ * res_counter read, so in that case, we don't need the put
+ */
+ if (memcg_kmem_test_and_clear_dead(memcg))
+ mem_cgroup_put(memcg);
}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4718,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = {
.read = mem_cgroup_read,
},
#endif
+#ifdef CONFIG_MEMCG_KMEM
+ {
+ .name = "kmem.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+ .write_string = mem_cgroup_write,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.failcnt",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "kmem.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
+ .trigger = mem_cgroup_reset,
+ .read = mem_cgroup_read,
+ },
+#ifdef CONFIG_SLABINFO
+ {
+ .name = "kmem.slabinfo",
+ .read_seq_string = mem_cgroup_slabinfo_read,
+ },
+#endif
+#endif
{ }, /* terminate */
};
@@ -4742,7 +5877,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
- lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+ lruvec_init(&mz->lruvec);
mz->usage_in_excess = 0;
mz->on_tree = false;
mz->memcg = memcg;
@@ -4785,16 +5920,29 @@ out_free:
}
/*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context. The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
+ * At destroying mem_cgroup, references from swap_cgroup can remain.
+ * (scanning all at force_empty is too costly...)
+ *
+ * Instead of clearing all references at force_empty, we remember
+ * the number of reference from swap_cgroup and free mem_cgroup when
+ * it goes down to 0.
+ *
+ * Removal of cgroup itself succeeds regardless of refs from swap.
*/
-static void free_work(struct work_struct *work)
+
+static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg;
+ int node;
int size = sizeof(struct mem_cgroup);
- memcg = container_of(work, struct mem_cgroup, work_freeing);
+ mem_cgroup_remove_from_trees(memcg);
+ free_css_id(&mem_cgroup_subsys, &memcg->css);
+
+ for_each_node(node)
+ free_mem_cgroup_per_zone_info(memcg, node);
+
+ free_percpu(memcg->stat);
+
/*
* We need to make sure that (at least for now), the jump label
* destruction code runs outside of the cgroup lock. This is because
@@ -4806,45 +5954,34 @@ static void free_work(struct work_struct *work)
* to move this code around, and make sure it is outside
* the cgroup_lock.
*/
- disarm_sock_keys(memcg);
+ disarm_static_keys(memcg);
if (size < PAGE_SIZE)
kfree(memcg);
else
vfree(memcg);
}
-static void free_rcu(struct rcu_head *rcu_head)
-{
- struct mem_cgroup *memcg;
-
- memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
- INIT_WORK(&memcg->work_freeing, free_work);
- schedule_work(&memcg->work_freeing);
-}
/*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
+ * but in process context. The work_freeing structure is overlaid
+ * on the rcu_freeing structure, which itself is overlaid on memsw.
*/
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void free_work(struct work_struct *work)
{
- int node;
+ struct mem_cgroup *memcg;
- mem_cgroup_remove_from_trees(memcg);
- free_css_id(&mem_cgroup_subsys, &memcg->css);
+ memcg = container_of(work, struct mem_cgroup, work_freeing);
+ __mem_cgroup_free(memcg);
+}
- for_each_node(node)
- free_mem_cgroup_per_zone_info(memcg, node);
+static void free_rcu(struct rcu_head *rcu_head)
+{
+ struct mem_cgroup *memcg;
- free_percpu(memcg->stat);
- call_rcu(&memcg->rcu_freeing, free_rcu);
+ memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
+ INIT_WORK(&memcg->work_freeing, free_work);
+ schedule_work(&memcg->work_freeing);
}
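The free_rcu()/free_work() pair above implements a common idiom: the object must survive an RCU grace period, but its final teardown has to run in process context (here, because the static-key disarming may block). A stripped-down sketch of the same two-stage pattern, with purely hypothetical names:

/* Hypothetical object freed via RCU first, then in process context. */
struct deferred_obj {
	struct rcu_head rcu;
	struct work_struct work;
};

static void deferred_obj_free_work(struct work_struct *work)
{
	struct deferred_obj *obj = container_of(work, struct deferred_obj, work);

	/* Process context: blocking teardown work is allowed here. */
	kfree(obj);
}

static void deferred_obj_free_rcu(struct rcu_head *rcu)
{
	struct deferred_obj *obj = container_of(rcu, struct deferred_obj, rcu);

	/* RCU callback (softirq) context: only hand off to a worker. */
	INIT_WORK(&obj->work, deferred_obj_free_work);
	schedule_work(&obj->work);
}

static void deferred_obj_release(struct deferred_obj *obj)
{
	/* Readers under rcu_read_lock() may still see obj until the grace period ends. */
	call_rcu(&obj->rcu, deferred_obj_free_rcu);
}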
static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -4856,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
{
if (atomic_sub_and_test(count, &memcg->refcnt)) {
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- __mem_cgroup_free(memcg);
+ call_rcu(&memcg->rcu_freeing, free_rcu);
if (parent)
mem_cgroup_put(parent);
}
@@ -4926,7 +6063,7 @@ err_cleanup:
}
static struct cgroup_subsys_state * __ref
-mem_cgroup_create(struct cgroup *cont)
+mem_cgroup_css_alloc(struct cgroup *cont)
{
struct mem_cgroup *memcg, *parent;
long error = -ENOMEM;
@@ -4963,6 +6100,8 @@ mem_cgroup_create(struct cgroup *cont)
if (parent && parent->use_hierarchy) {
res_counter_init(&memcg->res, &parent->res);
res_counter_init(&memcg->memsw, &parent->memsw);
+ res_counter_init(&memcg->kmem, &parent->kmem);
+
/*
* We increment refcnt of the parent to ensure that we can
* safely access it on res_counter_charge/uncharge.
@@ -4973,6 +6112,14 @@ mem_cgroup_create(struct cgroup *cont)
} else {
res_counter_init(&memcg->res, NULL);
res_counter_init(&memcg->memsw, NULL);
+ res_counter_init(&memcg->kmem, NULL);
+ /*
+		 * Deeper hierarchy with use_hierarchy == false doesn't make
+		 * much sense, so let the cgroup subsystem know about this
+ * unfortunate state in our controller.
+ */
+ if (parent && parent != root_mem_cgroup)
+ mem_cgroup_subsys.broken_hierarchy = true;
}
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
@@ -5000,14 +6147,15 @@ free_out:
return ERR_PTR(error);
}
-static int mem_cgroup_pre_destroy(struct cgroup *cont)
+static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- return mem_cgroup_force_empty(memcg, false);
+ mem_cgroup_reparent_charges(memcg);
+ mem_cgroup_destroy_all_caches(memcg);
}
-static void mem_cgroup_destroy(struct cgroup *cont)
+static void mem_cgroup_css_free(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -5597,16 +6745,15 @@ static void mem_cgroup_move_task(struct cgroup *cont,
struct cgroup_subsys mem_cgroup_subsys = {
.name = "memory",
.subsys_id = mem_cgroup_subsys_id,
- .create = mem_cgroup_create,
- .pre_destroy = mem_cgroup_pre_destroy,
- .destroy = mem_cgroup_destroy,
+ .css_alloc = mem_cgroup_css_alloc,
+ .css_offline = mem_cgroup_css_offline,
+ .css_free = mem_cgroup_css_free,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.attach = mem_cgroup_move_task,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
.use_id = 1,
- .__DEPRECATED_clear_css_refs = true,
};
#ifdef CONFIG_MEMCG_SWAP
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a6e2141a6610..c6e4dd3e1c08 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
+ pgoff_t pgoff;
- av = page_lock_anon_vma(page);
+ av = page_lock_anon_vma_read(page);
if (av == NULL) /* Not actually mapped anymore */
return;
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
if (!task_early_kill(tsk))
continue;
- list_for_each_entry(vmac, &av->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+ pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
@@ -420,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
}
}
read_unlock(&tasklist_lock);
- page_unlock_anon_vma(av);
+ page_unlock_anon_vma_read(av);
}
/*
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- struct prio_tree_iter iter;
struct address_space *mapping = page->mapping;
mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
if (!task_early_kill(tsk))
continue;
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
pgoff) {
/*
* Send early kill signal to tasks where a vma covers
@@ -779,16 +781,16 @@ static struct page_state {
{ compound, compound, "huge", me_huge_page },
#endif
- { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty },
- { sc|dirty, sc, "swapcache", me_swapcache_clean },
+ { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
+ { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
- { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
- { unevict, unevict, "unevictable LRU", me_pagecache_clean},
+ { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
+ { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
- { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty },
- { mlock, mlock, "mlocked LRU", me_pagecache_clean },
+ { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
+ { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
- { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty },
+ { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
/*
@@ -810,14 +812,14 @@ static struct page_state {
#undef slab
#undef reserved
+/*
+ * "Dirty/Clean" indication is not 100% accurate due to the possibility of
+ * setting PG_dirty outside page lock. See also comment above set_page_dirty().
+ */
static void action_result(unsigned long pfn, char *msg, int result)
{
- struct page *page = pfn_to_page(pfn);
-
- printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
- pfn,
- PageDirty(page) ? "dirty " : "",
- msg, action_name[result]);
+ pr_err("MCE %#lx: %s page recovery: %s\n",
+ pfn, msg, action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -1383,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
* Isolate the page, so that it doesn't get reallocated if it
* was free.
*/
- set_migratetype_isolate(p);
+ set_migratetype_isolate(p, true);
/*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
@@ -1474,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_trans_head(page);
if (PageHuge(page))
return soft_offline_huge_page(page, flags);
+ if (PageTransHuge(hpage)) {
+ if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ return -EBUSY;
+ }
+ }
ret = get_any_page(page, pfn, flags);
if (ret < 0)
@@ -1556,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMORY_FAILURE);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 57361708d1a5..e0a9b0ce4f10 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,8 @@
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
+#include <linux/migrate.h>
+#include <linux/string.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -712,25 +714,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
add_taint(TAINT_BAD_PAGE);
}
-static inline int is_cow_mapping(vm_flags_t flags)
+static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
- return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
@@ -1039,6 +1027,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long next;
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ bool is_cow;
int ret;
/*
@@ -1047,7 +1038,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+ if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
+ VM_PFNMAP | VM_MIXEDMAP))) {
if (!vma->anon_vma)
return 0;
}
@@ -1055,12 +1047,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
- if (unlikely(is_pfn_mapping(vma))) {
+ if (unlikely(vma->vm_flags & VM_PFNMAP)) {
/*
* We do not free on error cases below as remove_vma
* gets called on error from higher level routine
*/
- ret = track_pfn_vma_copy(vma);
+ ret = track_pfn_copy(vma);
if (ret)
return ret;
}
@@ -1071,8 +1063,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* parent mm. And a permission downgrade will only happen if
* is_cow_mapping() returns true.
*/
- if (is_cow_mapping(vma->vm_flags))
- mmu_notifier_invalidate_range_start(src_mm, addr, end);
+ is_cow = is_cow_mapping(vma->vm_flags);
+ mmun_start = addr;
+ mmun_end = end;
+ if (is_cow)
+ mmu_notifier_invalidate_range_start(src_mm, mmun_start,
+ mmun_end);
ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
@@ -1088,9 +1084,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
- if (is_cow_mapping(vma->vm_flags))
- mmu_notifier_invalidate_range_end(src_mm,
- vma->vm_start, end);
+ if (is_cow)
+ mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
return ret;
}
@@ -1243,7 +1238,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1327,8 +1322,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file)
uprobe_munmap(vma, start, end);
- if (unlikely(is_pfn_mapping(vma)))
- untrack_pfn_vma(vma, 0, 0);
+ if (unlikely(vma->vm_flags & VM_PFNMAP))
+ untrack_pfn(vma, 0, 0);
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1510,9 +1505,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ goto no_page_table;
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(mm, pmd);
+ split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
spin_lock(&mm->page_table_lock);
@@ -1521,7 +1518,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
spin_unlock(&mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
- page = follow_trans_huge_pmd(mm, address,
+ page = follow_trans_huge_pmd(vma, address,
pmd, flags);
spin_unlock(&mm->page_table_lock);
goto out;
@@ -1539,6 +1536,8 @@ split_fallthrough:
pte = *ptep;
if (!pte_present(pte))
goto no_page;
+ if ((flags & FOLL_NUMA) && pte_numa(pte))
+ goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -1576,12 +1575,12 @@ split_fallthrough:
if (page->mapping && trylock_page(page)) {
lru_add_drain(); /* push cached pages to LRU */
/*
- * Because we lock page here and migration is
- * blocked by the pte's page reference, we need
- * only check for file-cache page truncation.
+ * Because we lock page here, and migration is
+ * blocked by the pte's page reference, and we
+ * know the page is still mapped, we don't even
+ * need to check for file-cache page truncation.
*/
- if (page->mapping)
- mlock_vma_page(page);
+ mlock_vma_page(page);
unlock_page(page);
}
}
@@ -1690,6 +1689,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
vm_flags &= (gup_flags & FOLL_FORCE) ?
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ /*
+ * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+ * would be called on PROT_NONE ranges. We must never invoke
+ * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+ * page faults would unprotect the PROT_NONE ranges if
+ * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+ * bitflag. So to avoid that, don't set FOLL_NUMA if
+ * FOLL_FORCE is set.
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
i = 0;
do {
@@ -2085,6 +2097,11 @@ out:
* ask for a shared writable mapping!
*
* The page does not need to be reserved.
+ *
+ * Usually this function is called from f_op->mmap() handler
+ * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * Caller must set VM_MIXEDMAP on vma if it wants to call this
+ * function from other places, for example from page-fault handler.
*/
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
@@ -2093,7 +2110,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
return -EFAULT;
if (!page_count(page))
return -EINVAL;
- vma->vm_flags |= VM_INSERTPAGE;
+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
+ BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+ vma->vm_flags |= VM_MIXEDMAP;
+ }
return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
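As a usage illustration of the calling convention documented in the new comment above (a hypothetical driver sketch, not part of this patch; "mydev" and its vmalloc'ed buffer are made up):

static int mydev_vmalloc_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydev *dev = file->private_data;	/* hypothetical device */
	struct page *page = vmalloc_to_page(dev->buf);

	/*
	 * Called from f_op->mmap(), i.e. with mmap_sem held for write,
	 * so vm_insert_page() is allowed to set VM_MIXEDMAP itself.
	 */
	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;

	return vm_insert_page(vma, vma->vm_start, page);
}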
@@ -2132,7 +2153,7 @@ out:
* @addr: target user address of this page
* @pfn: source kernel pfn
*
- * Similar to vm_inert_page, this allows drivers to insert individual pages
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
* they've allocated into a user vma. Same comments apply.
*
* This function should only be called from a vm_ops->fault handler, and
@@ -2162,14 +2183,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
+ if (track_pfn_insert(vma, &pgprot, pfn))
return -EINVAL;
ret = insert_pfn(vma, addr, pfn, pgprot);
- if (ret)
- untrack_pfn_vma(vma, pfn, PAGE_SIZE);
-
return ret;
}
EXPORT_SYMBOL(vm_insert_pfn);
@@ -2290,37 +2308,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
- * VM_RESERVED is specified all over the place, because
- * in 2.4 it kept swapout's vma scan off this vma; but
- * in 2.6 the LRU scan won't even find its pages, so this
- * flag means no more than count its pages in reserved_vm,
- * and omit it from core dump, even when VM_IO turned off.
* VM_PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
+ * VM_DONTEXPAND
+ * Disable vma merging and expanding with mremap().
+ * VM_DONTDUMP
+ * Omit vma from core dump, even when VM_IO turned off.
*
* There's a horrible special case to handle copy-on-write
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
+ * See vm_normal_page() for details.
*/
- if (addr == vma->vm_start && end == vma->vm_end) {
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (addr != vma->vm_start || end != vma->vm_end)
+ return -EINVAL;
vma->vm_pgoff = pfn;
- vma->vm_flags |= VM_PFN_AT_MMAP;
- } else if (is_cow_mapping(vma->vm_flags))
- return -EINVAL;
-
- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ }
- err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
- if (err) {
- /*
- * To indicate that track_pfn related cleanup is not
- * needed from higher level routine calling unmap_vmas
- */
- vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
- vma->vm_flags &= ~VM_PFN_AT_MMAP;
+ err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+ if (err)
return -EINVAL;
- }
+
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -2335,7 +2346,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
} while (pgd++, addr = next, addr != end);
if (err)
- untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
+ untrack_pfn(vma, pfn, PAGE_ALIGN(size));
return err;
}
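For comparison, callers of remap_pfn_range() are not expected to change for the flag rework above; a typical ->mmap handler still looks roughly like this (a sketch with a made-up "mydev" MMIO region):

static int mydev_mmio_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydev *dev = file->private_data;	/* hypothetical device */
	unsigned long size = vma->vm_end - vma->vm_start;

	if (size > dev->mmio_size)
		return -EINVAL;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	/*
	 * remap_pfn_range() now sets VM_IO, VM_PFNMAP, VM_DONTEXPAND
	 * and VM_DONTDUMP on the vma for us.
	 */
	return remap_pfn_range(vma, vma->vm_start,
			       dev->mmio_phys >> PAGE_SHIFT,
			       size, vma->vm_page_prot);
}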
@@ -2516,11 +2527,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl, pte_t orig_pte)
__releases(ptl)
{
- struct page *old_page, *new_page;
+ struct page *old_page, *new_page = NULL;
pte_t entry;
int ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
+ unsigned long mmun_start = 0; /* For mmu_notifiers */
+ unsigned long mmun_end = 0; /* For mmu_notifiers */
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
@@ -2698,6 +2711,10 @@ gotten:
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
+ mmun_start = address & PAGE_MASK;
+ mmun_end = mmun_start + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
/*
* Re-check the pte - we dropped the lock
*/
@@ -2764,6 +2781,8 @@ gotten:
page_cache_release(new_page);
unlock:
pte_unmap_unlock(page_table, ptl);
+ if (mmun_end > mmun_start)
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
@@ -2780,13 +2799,8 @@ unlock:
oom_free_new:
page_cache_release(new_page);
oom:
- if (old_page) {
- if (page_mkwrite) {
- unlock_page(old_page);
- page_cache_release(old_page);
- }
+ if (old_page)
page_cache_release(old_page);
- }
return VM_FAULT_OOM;
unwritable_page:
@@ -2801,14 +2815,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
-static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
pgoff_t vba, vea, zba, zea;
- vma_prio_tree_foreach(vma, &iter, root,
+ vma_interval_tree_foreach(vma, root,
details->first_index, details->last_index) {
vba = vma->vm_pgoff;
@@ -2839,7 +2852,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
* across *all* the pages in each nonlinear VMA, not just the pages
* whose virtual address lies outside the file truncation point.
*/
- list_for_each_entry(vma, head, shared.vm_set.list) {
+ list_for_each_entry(vma, head, shared.nonlinear) {
details->nonlinear_vma = vma;
unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
}
@@ -2883,7 +2896,7 @@ void unmap_mapping_range(struct address_space *mapping,
mutex_lock(&mapping->i_mmap_mutex);
- if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
@@ -3418,6 +3431,170 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int current_nid)
+{
+ get_page(page);
+
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ int current_nid = -1;
+ int target_nid;
+ bool migrated = false;
+
+ /*
+ * The "pte" at this point cannot be used safely without
+ * validation through pte_unmap_same(). It's of NUMA type but
+ * the pfn may be screwed if the read is non-atomic.
+ *
+ * ptep_modify_prot_start is not called as this is clearing
+ * the _PAGE_NUMA bit and it is not really expected that there
+ * would be concurrent hardware modifications to the PTE.
+ */
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*ptep, pte))) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out;
+ }
+
+ pte = pte_mknonnuma(pte);
+ set_pte_at(mm, addr, ptep, pte);
+ update_mmu_cache(vma, addr, ptep);
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
+ current_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+ pte_unmap_unlock(ptep, ptl);
+ if (target_nid == -1) {
+ /*
+ * Account for the fault against the current node if it is not
+ * being replaced regardless of where the page is located.
+ */
+ current_nid = numa_node_id();
+ put_page(page);
+ goto out;
+ }
+
+ /* Migrate to the requested node */
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+
+out:
+ if (current_nid != -1)
+ task_numa_fault(current_nid, 1, migrated);
+ return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd;
+ pte_t *pte, *orig_pte;
+ unsigned long _addr = addr & PMD_MASK;
+ unsigned long offset;
+ spinlock_t *ptl;
+ bool numa = false;
+ int local_nid = numa_node_id();
+
+ spin_lock(&mm->page_table_lock);
+ pmd = *pmdp;
+ if (pmd_numa(pmd)) {
+ set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+ numa = true;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ if (!numa)
+ return 0;
+
+ /* we're in a page fault so some vma must be in the range */
+ BUG_ON(!vma);
+ BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+ offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+ VM_BUG_ON(offset >= PMD_SIZE);
+ orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+ pte += offset >> PAGE_SHIFT;
+ for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+ pte_t pteval = *pte;
+ struct page *page;
+ int curr_nid = local_nid;
+ int target_nid;
+ bool migrated;
+ if (!pte_present(pteval))
+ continue;
+ if (!pte_numa(pteval))
+ continue;
+ if (addr >= vma->vm_end) {
+ vma = find_vma(mm, addr);
+ /* there's a pte present so there must be a vma */
+ BUG_ON(!vma);
+ BUG_ON(addr < vma->vm_start);
+ }
+ if (pte_numa(pteval)) {
+ pteval = pte_mknonnuma(pteval);
+ set_pte_at(mm, addr, pte, pteval);
+ }
+ page = vm_normal_page(vma, addr, pteval);
+ if (unlikely(!page))
+ continue;
+ /* only check non-shared pages */
+ if (unlikely(page_mapcount(page) != 1))
+ continue;
+
+ /*
+ * Note that the NUMA fault is later accounted to either
+ * the node that is currently running or where the page is
+ * migrated to.
+ */
+ curr_nid = local_nid;
+ target_nid = numa_migrate_prep(page, vma, addr,
+ page_to_nid(page));
+ if (target_nid == -1) {
+ put_page(page);
+ continue;
+ }
+
+ /* Migrate to the requested node */
+ pte_unmap_unlock(pte, ptl);
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ curr_nid = target_nid;
+ task_numa_fault(curr_nid, 1, migrated);
+
+ pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ BUG();
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3456,6 +3633,9 @@ int handle_pte_fault(struct mm_struct *mm,
pte, pmd, flags, entry);
}
+ if (pte_numa(entry))
+ return do_numa_page(mm, vma, address, entry, pte, pmd);
+
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
@@ -3524,9 +3704,13 @@ retry:
barrier();
if (pmd_trans_huge(orig_pmd)) {
- if (flags & FAULT_FLAG_WRITE &&
- !pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd)) {
+ unsigned int dirty = flags & FAULT_FLAG_WRITE;
+
+ if (pmd_numa(orig_pmd))
+ return do_huge_pmd_numa_page(mm, vma, address,
+ orig_pmd, pmd);
+
+ if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
/*
@@ -3537,17 +3721,25 @@ retry:
if (unlikely(ret & VM_FAULT_OOM))
goto retry;
return ret;
+ } else {
+ huge_pmd_set_accessed(mm, vma, address, pmd,
+ orig_pmd, dirty);
}
+
return 0;
}
}
+ if (pmd_numa(*pmd))
+ return do_pmd_numa_page(mm, vma, address, pmd);
+
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -3927,15 +4119,12 @@ void print_vma_addr(char *prefix, unsigned long ip)
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_KERNEL);
if (buf) {
- char *p, *s;
+ char *p;
p = d_path(&f->f_path, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
- s = strrchr(p, '/');
- if (s)
- p = s+1;
- printk("%s%s[%lx+%lx]", prefix, p,
+ printk("%s%s[%lx+%lx]", prefix, kbasename(p),
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d0cfd7..d04ed87bfacb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
void __ref put_page_bootmem(struct page *page)
{
unsigned long type;
+ static DEFINE_MUTEX(ppb_lock);
type = (unsigned long) page->lru.next;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
+
+ /*
+ * Please refer to the comment for __free_pages_bootmem()
+ * for why we serialize here.
+ */
+ mutex_lock(&ppb_lock);
__free_pages_bootmem(page, 0);
+ mutex_unlock(&ppb_lock);
}
}
@@ -205,7 +213,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writelock(zone);
old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
- if (start_pfn < zone->zone_start_pfn)
+ if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
zone->zone_start_pfn = start_pfn;
zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +222,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writeunlock(zone);
}
+static void resize_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ zone_span_writelock(zone);
+
+ if (end_pfn - start_pfn) {
+ zone->zone_start_pfn = start_pfn;
+ zone->spanned_pages = end_pfn - start_pfn;
+ } else {
+ /*
+ * keep it consistent with free_area_init_core():
+ * if spanned_pages == 0, then keep zone_start_pfn = 0
+ */
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
+
+ zone_span_writeunlock(zone);
+}
+
+static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ enum zone_type zid = zone_idx(zone);
+ int nid = zone->zone_pgdat->node_id;
+ unsigned long pfn;
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++)
+ set_page_links(pfn_to_page(pfn), zid, nid, pfn);
+}
+
+static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int ret;
+ unsigned long flags;
+ unsigned long z1_start_pfn;
+
+ if (!z1->wait_table) {
+ ret = init_currently_empty_zone(z1, start_pfn,
+ end_pfn - start_pfn, MEMMAP_HOTPLUG);
+ if (ret)
+ return ret;
+ }
+
+ pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+ /* can't move pfns which are higher than @z2 */
+ if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+ goto out_fail;
+ /* the part being moved out must be at the leftmost of @z2 */
+ if (start_pfn > z2->zone_start_pfn)
+ goto out_fail;
+ /* must include/overlap */
+ if (end_pfn <= z2->zone_start_pfn)
+ goto out_fail;
+
+ /* use start_pfn for z1's start_pfn if z1 is empty */
+ if (z1->spanned_pages)
+ z1_start_pfn = z1->zone_start_pfn;
+ else
+ z1_start_pfn = start_pfn;
+
+ resize_zone(z1, z1_start_pfn, end_pfn);
+ resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+ fix_zone_id(z1, start_pfn, end_pfn);
+
+ return 0;
+out_fail:
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+ return -1;
+}
+
+static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ int ret;
+ unsigned long flags;
+ unsigned long z2_end_pfn;
+
+ if (!z2->wait_table) {
+ ret = init_currently_empty_zone(z2, start_pfn,
+ end_pfn - start_pfn, MEMMAP_HOTPLUG);
+ if (ret)
+ return ret;
+ }
+
+ pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+ /* can't move pfns which are lower than @z1 */
+ if (z1->zone_start_pfn > start_pfn)
+ goto out_fail;
+ /* the part being moved out must be at the rightmost of @z1 */
+ if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
+ goto out_fail;
+ /* must include/overlap */
+ if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+ goto out_fail;
+
+ /* use end_pfn for z2's end_pfn if z2 is empty */
+ if (z2->spanned_pages)
+ z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+ else
+ z2_end_pfn = end_pfn;
+
+ resize_zone(z1, z1->zone_start_pfn, start_pfn);
+ resize_zone(z2, start_pfn, z2_end_pfn);
+
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+ fix_zone_id(z2, start_pfn, end_pfn);
+
+ return 0;
+out_fail:
+ pgdat_resize_unlock(z1->zone_pgdat, &flags);
+ return -1;
+}
+
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long old_pgdat_end_pfn =
pgdat->node_start_pfn + pgdat->node_spanned_pages;
- if (start_pfn < pgdat->node_start_pfn)
+ if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -362,11 +491,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
+ release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
+
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
- release_mem_region(pfn << PAGE_SHIFT,
- PAGES_PER_SECTION << PAGE_SHIFT);
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
@@ -460,8 +589,99 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
return 0;
}
+#ifdef CONFIG_MOVABLE_NODE
+/*
+ * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have
+ * normal memory.
+ */
+static bool can_online_high_movable(struct zone *zone)
+{
+ return true;
+}
+#else /* CONFIG_MOVABLE_NODE */
+/* ensure every online node has NORMAL memory */
+static bool can_online_high_movable(struct zone *zone)
+{
+ return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+}
+#endif /* CONFIG_MOVABLE_NODE */
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
+/* check which state of node_states will be changed when online memory */
+static void node_states_check_changes_online(unsigned long nr_pages,
+ struct zone *zone, struct memory_notify *arg)
+{
+ int nid = zone_to_nid(zone);
+ enum zone_type zone_last = ZONE_NORMAL;
+
+ /*
+ * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_NORMAL,
+ * set zone_last to ZONE_NORMAL.
+ *
+ * If we don't have HIGHMEM nor movable node,
+ * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+ * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+ */
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * if the memory to be onlined is in a zone of 0...zone_last, and
+ * the zones of 0...zone_last don't have memory before online, we will
+ * need to set the node to node_states[N_NORMAL_MEMORY] after
+ * the memory is online.
+ */
+ if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
+ arg->status_change_nid_normal = nid;
+ else
+ arg->status_change_nid_normal = -1;
+
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If we have movable node, node_states[N_HIGH_MEMORY]
+ * contains nodes which have zones of 0...ZONE_HIGHMEM,
+ * set zone_last to ZONE_HIGHMEM.
+ *
+ * If we don't have movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_MOVABLE,
+ * set zone_last to ZONE_MOVABLE.
+ */
+ zone_last = ZONE_HIGHMEM;
+ if (N_MEMORY == N_HIGH_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
+ arg->status_change_nid_high = nid;
+ else
+ arg->status_change_nid_high = -1;
+#else
+ arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+ /*
+ * if the node doesn't have memory before online, we will need to
+ * set the node to node_states[N_MEMORY] after the memory
+ * is online.
+ */
+ if (!node_state(nid, N_MEMORY))
+ arg->status_change_nid = nid;
+ else
+ arg->status_change_nid = -1;
+}
+
+static void node_states_set_node(int node, struct memory_notify *arg)
+{
+ if (arg->status_change_nid_normal >= 0)
+ node_set_state(node, N_NORMAL_MEMORY);
+
+ if (arg->status_change_nid_high >= 0)
+ node_set_state(node, N_HIGH_MEMORY);
+
+ node_set_state(node, N_MEMORY);
+}
+
+
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long onlined_pages = 0;
struct zone *zone;
@@ -471,13 +691,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
struct memory_notify arg;
lock_memory_hotplug();
+ /*
+ * This doesn't need a lock to do pfn_to_page().
+ * The section can't be removed here because of the
+ * memory_block->state_mutex.
+ */
+ zone = page_zone(pfn_to_page(pfn));
+
+ if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
+ !can_online_high_movable(zone)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
+
+ if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
+ if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
+ }
+ if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
+ if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
+ unlock_memory_hotplug();
+ return -1;
+ }
+ }
+
+ /* The code above may have changed the zone of the pfn range */
+ zone = page_zone(pfn_to_page(pfn));
+
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
- arg.status_change_nid = -1;
+ node_states_check_changes_online(nr_pages, zone, &arg);
nid = page_to_nid(pfn_to_page(pfn));
- if (node_present_pages(nid) == 0)
- arg.status_change_nid = nid;
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
@@ -487,23 +734,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
return ret;
}
/*
- * This doesn't need a lock to do pfn_to_page().
- * The section can't be removed here because of the
- * memory_block->state_mutex.
- */
- zone = page_zone(pfn_to_page(pfn));
- /*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
*/
mutex_lock(&zonelists_mutex);
- if (!populated_zone(zone))
+ if (!populated_zone(zone)) {
need_zonelists_rebuild = 1;
+ build_all_zonelists(NULL, zone);
+ }
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ if (need_zonelists_rebuild)
+ zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
(unsigned long long) pfn << PAGE_SHIFT,
@@ -514,12 +759,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
return ret;
}
+ zone->managed_pages += onlined_pages;
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
if (onlined_pages) {
- node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+ node_states_set_node(zone_to_nid(zone), &arg);
if (need_zonelists_rebuild)
- build_all_zonelists(NULL, zone);
+ build_all_zonelists(NULL, NULL);
else
zone_pcp_update(zone);
}
@@ -756,13 +1002,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
return 0;
}
-static struct page *
-hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
-{
- /* This should be improooooved!! */
- return alloc_page(GFP_HIGHUSER_MOVABLE);
-}
-
#define NR_OFFLINE_AT_ONCE_PAGES (256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,9 +1052,14 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
putback_lru_pages(&source);
goto out;
}
- /* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
- true, MIGRATE_SYNC);
+
+ /*
+ * alloc_migrate_target should be improooooved!!
+ * migrate_pages returns # of failed pages.
+ */
+ ret = migrate_pages(&source, alloc_migrate_target, 0,
+ true, MIGRATE_SYNC,
+ MR_MEMORY_HOTPLUG);
if (ret)
putback_lru_pages(&source);
}
@@ -850,7 +1094,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
{
int ret;
long offlined = *(long *)data;
- ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
+ ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
offlined = nr_pages;
if (!ret)
*(long *)data += offlined;
@@ -870,7 +1114,133 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-static int __ref offline_pages(unsigned long start_pfn,
+#ifdef CONFIG_MOVABLE_NODE
+/*
+ * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have
+ * normal memory.
+ */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+ return true;
+}
+#else /* CONFIG_MOVABLE_NODE */
+/* ensure the node has NORMAL memory if it is still online */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long present_pages = 0;
+ enum zone_type zt;
+
+ for (zt = 0; zt <= ZONE_NORMAL; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+
+ if (present_pages > nr_pages)
+ return true;
+
+ present_pages = 0;
+ for (; zt <= ZONE_MOVABLE; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+
+ /*
+ * we can't offline the last normal memory until all
+ * higher memory is offlined.
+ */
+ return present_pages == 0;
+}
+#endif /* CONFIG_MOVABLE_NODE */
+
+/* check which state of node_states will be changed when offline memory */
+static void node_states_check_changes_offline(unsigned long nr_pages,
+ struct zone *zone, struct memory_notify *arg)
+{
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long present_pages = 0;
+ enum zone_type zt, zone_last = ZONE_NORMAL;
+
+ /*
+ * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_NORMAL,
+ * set zone_last to ZONE_NORMAL.
+ *
+ * If we don't have HIGHMEM nor movable node,
+ * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+ * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+ */
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * check whether node_states[N_NORMAL_MEMORY] will be changed.
+ * If the memory to be offlined is in a zone of 0...zone_last,
+ * and it is the last present memory, 0...zone_last will
+ * become empty after offlining, thus we can determine that we will
+ * need to clear the node from node_states[N_NORMAL_MEMORY].
+ */
+ for (zt = 0; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ arg->status_change_nid_normal = zone_to_nid(zone);
+ else
+ arg->status_change_nid_normal = -1;
+
+#ifdef CONFIG_HIGHMEM
+ /*
+ * If we have movable node, node_states[N_HIGH_MEMORY]
+ * contains nodes which have zones of 0...ZONE_HIGHMEM,
+ * set zone_last to ZONE_HIGHMEM.
+ *
+ * If we don't have movable node, node_states[N_NORMAL_MEMORY]
+ * contains nodes which have zones of 0...ZONE_MOVABLE,
+ * set zone_last to ZONE_MOVABLE.
+ */
+ zone_last = ZONE_HIGHMEM;
+ if (N_MEMORY == N_HIGH_MEMORY)
+ zone_last = ZONE_MOVABLE;
+
+ for (; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ arg->status_change_nid_high = zone_to_nid(zone);
+ else
+ arg->status_change_nid_high = -1;
+#else
+ arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+ /*
+ * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+ */
+ zone_last = ZONE_MOVABLE;
+
+ /*
+ * check whether node_states[N_HIGH_MEMORY] will be changed.
+ * If we try to offline the last present @nr_pages from the node,
+ * we can determine that we will need to clear the node from
+ * node_states[N_HIGH_MEMORY].
+ */
+ for (; zt <= zone_last; zt++)
+ present_pages += pgdat->node_zones[zt].present_pages;
+ if (nr_pages >= present_pages)
+ arg->status_change_nid = zone_to_nid(zone);
+ else
+ arg->status_change_nid = -1;
+}
+
+static void node_states_clear_node(int node, struct memory_notify *arg)
+{
+ if (arg->status_change_nid_normal >= 0)
+ node_clear_state(node, N_NORMAL_MEMORY);
+
+ if ((N_MEMORY != N_NORMAL_MEMORY) &&
+ (arg->status_change_nid_high >= 0))
+ node_clear_state(node, N_HIGH_MEMORY);
+
+ if ((N_MEMORY != N_HIGH_MEMORY) &&
+ (arg->status_change_nid >= 0))
+ node_clear_state(node, N_MEMORY);
+}
+
+static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn, unsigned long timeout)
{
unsigned long pfn, nr_pages, expire;
@@ -896,16 +1266,19 @@ static int __ref offline_pages(unsigned long start_pfn,
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
+ ret = -EINVAL;
+ if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
+ goto out;
+
/* set above range as isolated */
- ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+ ret = start_isolate_page_range(start_pfn, end_pfn,
+ MIGRATE_MOVABLE, true);
if (ret)
goto out;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
- arg.status_change_nid = -1;
- if (nr_pages >= node_present_pages(node))
- arg.status_change_nid = node;
+ node_states_check_changes_offline(nr_pages, zone, &arg);
ret = memory_notify(MEM_GOING_OFFLINE, &arg);
ret = notifier_to_errno(ret);
@@ -946,10 +1319,10 @@ repeat:
goto repeat;
}
}
- /* drain all zone's lru pagevec, this is asyncronous... */
+ /* drain all zone's lru pagevec, this is asynchronous... */
lru_add_drain_all();
yield();
- /* drain pcp pages , this is synchrouns. */
+ /* drain pcp pages, this is synchronous. */
drain_all_pages();
/* check again */
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
@@ -958,25 +1331,30 @@ repeat:
goto failed_removal;
}
printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
- /* Ok, all of our target is islaoted.
+ /* Ok, all of our target is isolated.
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
/* reset pagetype flags and makes migrate type to be MOVABLE */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
/* removal success */
+ zone->managed_pages -= offlined_pages;
zone->present_pages -= offlined_pages;
zone->zone_pgdat->node_present_pages -= offlined_pages;
totalram_pages -= offlined_pages;
init_per_zone_wmark_min();
- if (!populated_zone(zone))
+ if (!populated_zone(zone)) {
zone_pcp_reset(zone);
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL, NULL);
+ mutex_unlock(&zonelists_mutex);
+ } else
+ zone_pcp_update(zone);
- if (!node_present_pages(node)) {
- node_clear_state(node, N_HIGH_MEMORY);
+ node_states_clear_node(node, &arg);
+ if (arg.status_change_nid >= 0)
kswapd_stop(node);
- }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
@@ -998,15 +1376,55 @@ out:
return ret;
}
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+ return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+}
+
int remove_memory(u64 start, u64 size)
{
+ struct memory_block *mem = NULL;
+ struct mem_section *section;
unsigned long start_pfn, end_pfn;
+ unsigned long pfn, section_nr;
+ int ret;
start_pfn = PFN_DOWN(start);
end_pfn = start_pfn + PFN_DOWN(size);
- return offline_pages(start_pfn, end_pfn, 120 * HZ);
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ section_nr = pfn_to_section_nr(pfn);
+ if (!present_section_nr(section_nr))
+ continue;
+
+ section = __nr_to_section(section_nr);
+ /* same memblock? */
+ if (mem)
+ if ((section_nr >= mem->start_section_nr) &&
+ (section_nr <= mem->end_section_nr))
+ continue;
+
+ mem = find_memory_block_hinted(section, mem);
+ if (!mem)
+ continue;
+
+ ret = offline_memory_block(mem);
+ if (ret) {
+ kobject_put(&mem->dev.kobj);
+ return ret;
+ }
+ }
+
+ if (mem)
+ kobject_put(&mem->dev.kobj);
+
+ return 0;
}
#else
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+ return -EINVAL;
+}
int remove_memory(u64 start, u64 size)
{
return -EINVAL;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ada3be6e252..d1b315e98627 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
.flags = MPOL_F_LOCAL,
};
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+ struct mempolicy *pol = p->mempolicy;
+ int node;
+
+ if (!pol) {
+ node = numa_node_id();
+ if (node != -1)
+ pol = &preferred_node_policy[node];
+
+ /* preferred_node_policy is not initialised early in boot */
+ if (!pol->mode)
+ pol = NULL;
+ }
+
+ return pol;
+}
+
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
/*
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
if (pol == NULL)
return 0;
- /* Check N_HIGH_MEMORY */
+ /* Check N_MEMORY */
nodes_and(nsc->mask1,
- cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
+ cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- return NULL; /* simply delete any existing policy */
+ return NULL;
}
VM_BUG_ON(!nodes);
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
}
+ } else if (mode == MPOL_LOCAL) {
+ if (!nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- split_huge_page_pmd(vma->vm_mm, pmd);
+ split_huge_page_pmd(vma, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
return 0;
}
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ int nr_updated;
+ BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+
+ nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ if (nr_updated)
+ count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+ return nr_updated;
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
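Tying this helper to the fault-side changes in mm/memory.c earlier in this diff, the lazy NUMA placement flow can be summarised informally as follows (a sketch of the control flow, not literal kernel code):

/*
 * mbind(addr, len, ..., MPOL_MF_LAZY)
 *   -> check_range() calls change_prot_numa(vma, start, endvma)
 *      (the PTEs in the range are marked _PAGE_NUMA / PROT_NONE)
 *
 * later, on the next access to such a page:
 *   handle_pte_fault() sees pte_numa(entry) and calls do_numa_page()
 *     -> clears the NUMA bit (pte_mknonnuma)
 *     -> asks mpol_misplaced() for a target node
 *     -> migrate_misplaced_page() if the page sits on the wrong node
 */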
+
/*
* Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return ERR_PTR(-EFAULT);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ unsigned long endvma = vma->vm_end;
+
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
}
- if (!is_vm_hugetlb_page(vma) &&
- ((flags & MPOL_MF_STRICT) ||
+
+ if (is_vm_hugetlb_page(vma))
+ goto next;
+
+ if (flags & MPOL_MF_LAZY) {
+ change_prot_numa(vma, start, endvma);
+ goto next;
+ }
+
+ if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))) {
- unsigned long endvma = vma->vm_end;
+ vma_migratable(vma))) {
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
err = check_pgd_range(vma, start, endvma, nodes,
flags, private);
if (err) {
@@ -602,11 +667,48 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
break;
}
}
+next:
prev = vma;
}
return first;
}
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+ struct mempolicy *pol)
+{
+ int err;
+ struct mempolicy *old;
+ struct mempolicy *new;
+
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_ops, vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+ new = mpol_dup(pol);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ if (vma->vm_ops && vma->vm_ops->set_policy) {
+ err = vma->vm_ops->set_policy(vma, new);
+ if (err)
+ goto err_out;
+ }
+
+ old = vma->vm_policy;
+ vma->vm_policy = new; /* protected by mmap_sem */
+ mpol_put(old);
+
+ return 0;
+ err_out:
+ mpol_put(new);
+ return err;
+}
+
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +757,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (err)
goto out;
}
-
- /*
- * Apply policy to a single VMA. The reference counting of
- * policy for vma_policy linkages has already been handled by
- * vma_merge and split_vma as necessary. If this is a shared
- * policy then ->set_policy will increment the reference count
- * for an sp node.
- */
- pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
- vma->vm_start, vma->vm_end, vma->vm_pgoff,
- vma->vm_ops, vma->vm_file,
- vma->vm_ops ? vma->vm_ops->set_policy : NULL);
- if (vma->vm_ops && vma->vm_ops->set_policy) {
- err = vma->vm_ops->set_policy(vma, new_pol);
- if (err)
- goto out;
- }
+ err = vma_replace_policy(vma, new_pol);
+ if (err)
+ goto out;
}
out:
@@ -924,19 +1012,23 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
- struct vm_area_struct *vma;
nodes_clear(nmask);
node_set(source, nmask);
- vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ /*
+ * This does not "check" the range but isolates all pages that
+ * need migration. Between passing in the full user address
+ * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail.
+ */
+ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+ check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- if (IS_ERR(vma))
- return PTR_ERR(vma);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1108,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
int err;
LIST_HEAD(pagelist);
- if (flags & ~(unsigned long)(MPOL_MF_STRICT |
- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -1132,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (IS_ERR(new))
return PTR_ERR(new);
+ if (flags & MPOL_MF_LAZY)
+ new->flags |= MPOL_F_MOF;
+
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
@@ -1168,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
- err = PTR_ERR(vma);
- if (!IS_ERR(vma)) {
- int nr_failed = 0;
-
+ err = PTR_ERR(vma); /* maybe ... */
+ if (!IS_ERR(vma))
err = mbind_range(mm, start, end, new);
+ if (!err) {
+ int nr_failed = 0;
+
if (!list_empty(&pagelist)) {
+ WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_lru_pages(&pagelist);
}
- if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
putback_lru_pages(&pagelist);
@@ -1363,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
goto out_put;
}
- if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
+ if (!nodes_subset(*new, node_states[N_MEMORY])) {
err = -EINVAL;
goto out_put;
}
@@ -1511,9 +1608,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
*
* Returns effective policy for a VMA at specified address.
* Falls back to @task or system default policy, as necessary.
- * Current or other task's task mempolicy and non-shared vma policies
- * are protected by the task's mmap_sem, which must be held for read by
- * the caller.
+ * Current or other task's task mempolicy and non-shared vma policies must be
+ * protected by task_lock(task) by the caller.
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
* count--added by the get_policy() vm_op, as appropriate--to protect against
* freeing by another task. It is the caller's responsibility to free the
@@ -1522,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
- struct mempolicy *pol = task->mempolicy;
+ struct mempolicy *pol = get_task_policy(task);
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1530,8 +1626,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
addr);
if (vpol)
pol = vpol;
- } else if (vma->vm_policy)
+ } else if (vma->vm_policy) {
pol = vma->vm_policy;
+
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
+ }
}
if (!pol)
pol = &default_policy;
@@ -1873,7 +1979,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
{
struct mempolicy *pol;
- struct zonelist *zl;
struct page *page;
unsigned int cpuset_mems_cookie;
@@ -1892,23 +1997,11 @@ retry_cpuset:
return page;
}
- zl = policy_zonelist(gfp, pol, node);
- if (unlikely(mpol_needs_cond_ref(pol))) {
- /*
- * slow path: ref counted shared policy
- */
- struct page *page = __alloc_pages_nodemask(gfp, order,
- zl, policy_nodemask(gfp, pol));
- __mpol_put(pol);
- if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
- goto retry_cpuset;
- return page;
- }
- /*
- * fast path: default or task policy
- */
- page = __alloc_pages_nodemask(gfp, order, zl,
+ page = __alloc_pages_nodemask(gfp, order,
+ policy_zonelist(gfp, pol, node),
policy_nodemask(gfp, pol));
+ if (unlikely(mpol_needs_cond_ref(pol)))
+ __mpol_put(pol);
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
@@ -1935,7 +2028,7 @@ retry_cpuset:
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = get_task_policy(current);
struct page *page;
unsigned int cpuset_mems_cookie;
@@ -2003,28 +2096,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
return new;
}
-/*
- * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
- * eliminate the * MPOL_F_* flags that require conditional ref and
- * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
- * after return. Use the returned value.
- *
- * Allows use of a mempolicy for, e.g., multiple allocations with a single
- * policy lookup, even if the policy needs/has extra ref on lookup.
- * shmem_readahead needs this.
- */
-struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
- struct mempolicy *frompol)
-{
- if (!mpol_needs_cond_ref(frompol))
- return frompol;
-
- *tompol = *frompol;
- tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
- __mpol_put(frompol);
- return tompol;
-}
-
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -2061,7 +2132,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*/
/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2125,36 +2196,159 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ mutex_unlock(&sp->mutex);
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page - page to be checked
+ * @vma - vm area where page mapped
+ * @addr - virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Migrate the page towards the node whose CPU is referencing it */
+ if (pol->flags & MPOL_F_MORON) {
+ int last_nid;
+
+ polnid = numa_node_id();
+
+ /*
+ * Multi-stage node selection is used in conjunction
+ * with a periodic migration fault to build a temporal
+ * task<->page relation. By using a two-stage filter we
+ * remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist
+ * probability, we can equate a task's usage of a
+ * particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and
+ * getting the same result twice in a row, given these
+ * samples are fully independent, is then given by
+ * P(p)^2, provided our sample period is sufficiently
+ * short compared to the usage pattern.
+ *
+ * This quadratic squishes small probabilities, making
+ * it less likely we act on an unlikely task<->page
+ * relation.
+ */
+ last_nid = page_xchg_last_nid(page, polnid);
+ if (last_nid != polnid)
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+out:
+ mpol_cond_put(pol);
+
+ return ret;
+}
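As a rough numerical illustration of the two-stage filter described in the comment above (the probability used here is made up for the example):

/*
 * If a task accounts for p = 0.3 of all hinting faults on a page,
 * the chance that two consecutive independent samples both come from
 * that task is p^2 = 0.09, so a short-lived task<->page relation
 * rarely passes the last_nid check and rarely triggers a migration.
 */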
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_free(n);
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
- struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ struct sp_node *n;
+ struct mempolicy *newpol;
+ n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
+
+ newpol = mpol_dup(pol);
+ if (IS_ERR(newpol)) {
+ kmem_cache_free(sn_cache, n);
+ return NULL;
+ }
+ newpol->flags |= MPOL_F_SHARED;
+
n->start = start;
n->end = end;
- mpol_get(pol);
- pol->flags |= MPOL_F_SHARED; /* for unref */
- n->policy = pol;
+ n->policy = newpol;
+
return n;
}
@@ -2162,10 +2356,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
- struct sp_node *n, *new2 = NULL;
+ struct sp_node *n;
+ int ret = 0;
-restart:
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2178,16 +2372,14 @@ restart:
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
+ struct sp_node *new2;
+ new2 = sp_alloc(end, n->end, n->policy);
if (!new2) {
- spin_unlock(&sp->lock);
- new2 = sp_alloc(end, n->end, n->policy);
- if (!new2)
- return -ENOMEM;
- goto restart;
+ ret = -ENOMEM;
+ goto out;
}
n->end = start;
sp_insert(sp, new2);
- new2 = NULL;
break;
} else
n->end = start;
@@ -2198,12 +2390,9 @@ restart:
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
- if (new2) {
- mpol_put(new2->policy);
- kmem_cache_free(sn_cache, new2);
- }
- return 0;
+out:
+ mutex_unlock(&sp->mutex);
+ return ret;
}
/**
@@ -2221,7 +2410,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ mutex_init(&sp->mutex);
if (mpol) {
struct vm_area_struct pvma;
@@ -2275,7 +2464,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
- kmem_cache_free(sn_cache, new);
+ sp_free(new);
return err;
}
@@ -2287,18 +2476,60 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- spin_lock(&p->lock);
+ mutex_lock(&p->mutex);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- rb_erase(&n->nd, &p->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_delete(p, n);
}
- spin_unlock(&p->lock);
+ mutex_unlock(&p->mutex);
}
+#ifdef CONFIG_NUMA_BALANCING
+static bool __initdata numabalancing_override;
+
+static void __init check_numabalancing_enable(void)
+{
+ bool numabalancing_default = false;
+
+ if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+ numabalancing_default = true;
+
+ if (nr_node_ids > 1 && !numabalancing_override) {
+ printk(KERN_INFO "Enabling automatic NUMA balancing. "
+ "Configure with numa_balancing= or sysctl");
+ set_numabalancing_state(numabalancing_default);
+ }
+}
+
+static int __init setup_numabalancing(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ numabalancing_override = true;
+
+ if (!strcmp(str, "enable")) {
+ set_numabalancing_state(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_numabalancing_state(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+
+ return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
@@ -2314,13 +2545,22 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
+ for_each_node(nid) {
+ preferred_node_policy[nid] = (struct mempolicy) {
+ .refcnt = ATOMIC_INIT(1),
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .v = { .preferred_node = nid, },
+ };
+ }
+
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* fall back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
@@ -2340,6 +2580,8 @@ void __init numa_policy_init(void)
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
+
+ check_numabalancing_enable();
}
/* Reset policy of current process to default */
@@ -2356,14 +2598,13 @@ void numa_default_policy(void)
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
*/
-#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
- [MPOL_LOCAL] = "local"
+ [MPOL_LOCAL] = "local",
};
@@ -2401,7 +2642,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
*nodelist++ = '\0';
if (nodelist_parse(nodelist, nodes))
goto out;
- if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
+ if (!nodes_subset(nodes, node_states[N_MEMORY]))
goto out;
} else
nodes_clear(nodes);
@@ -2409,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+ for (mode = 0; mode < MPOL_MAX; mode++) {
if (!strcmp(str, policy_modes[mode])) {
break;
}
}
- if (mode > MPOL_LOCAL)
+ if (mode >= MPOL_MAX)
goto out;
switch (mode) {
@@ -2435,7 +2676,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
- nodes = node_states[N_HIGH_MEMORY];
+ nodes = node_states[N_MEMORY];
break;
case MPOL_LOCAL:
/*
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d773705..3b676b0c5c3e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,9 +35,13 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/balloon_compaction.h>
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/migrate.h>
+
#include "internal.h"
/*
@@ -79,7 +83,30 @@ void putback_lru_pages(struct list_head *l)
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- putback_lru_page(page);
+ putback_lru_page(page);
+ }
+}
+
+/*
+ * Put previously isolated pages back onto the appropriate lists
+ * from where they were once taken off for compaction/migration.
+ *
+ * This function shall be used instead of putback_lru_pages()
+ * whenever the isolated pageset has been built by isolate_migratepages_range()
+ */
+void putback_movable_pages(struct list_head *l)
+{
+ struct page *page;
+ struct page *page2;
+
+ list_for_each_entry_safe(page, page2, l, lru) {
+ list_del(&page->lru);
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ if (unlikely(balloon_page_movable(page)))
+ balloon_page_putback(page);
+ else
+ putback_lru_page(page);
}
}
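A usage sketch for the new helper (names are illustrative; this mirrors how a caller that built its list via isolate_migratepages_range() is expected to clean up on failure):

	if (err)
		putback_movable_pages(&cc->migratepages);

rather than putback_lru_pages(), so that isolated balloon pages are returned to their balloon device instead of the LRU.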
@@ -91,8 +118,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
swp_entry_t entry;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *ptep, pte;
spinlock_t *ptl;
@@ -103,19 +128,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
goto out;
ptl = &mm->page_table_lock;
} else {
- pgd = pgd_offset(mm, addr);
- if (!pgd_present(*pgd))
- goto out;
-
- pud = pud_offset(pgd, addr);
- if (!pud_present(*pud))
+ pmd = mm_find_pmd(mm, addr);
+ if (!pmd)
goto out;
-
- pmd = pmd_offset(pud, addr);
if (pmd_trans_huge(*pmd))
goto out;
- if (!pmd_present(*pmd))
- goto out;
ptep = pte_offset_map(pmd, addr);
@@ -279,14 +296,14 @@ static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
struct buffer_head *head, enum migrate_mode mode)
{
- int expected_count;
+ int expected_count = 0;
void **pslot;
if (!mapping) {
/* Anonymous page without mapping */
if (page_count(page) != 1)
return -EAGAIN;
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +373,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
}
spin_unlock_irq(&mapping->tree_lock);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
/*
@@ -372,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
if (!mapping) {
if (page_count(page) != 1)
return -EAGAIN;
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +416,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
page_unfreeze_refs(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
/*
@@ -407,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
- if (PageHuge(page))
+ if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
@@ -486,11 +503,11 @@ int migrate_page(struct address_space *mapping,
rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
return rc;
migrate_page_copy(newpage, page);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
@@ -513,7 +530,7 @@ int buffer_migrate_page(struct address_space *mapping,
rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
return rc;
/*
@@ -549,7 +566,7 @@ int buffer_migrate_page(struct address_space *mapping,
} while (bh != head);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(buffer_migrate_page);
#endif
@@ -628,7 +645,7 @@ static int fallback_migrate_page(struct address_space *mapping,
*
* Return value:
* < 0 - error code
- * == 0 - success
+ * MIGRATEPAGE_SUCCESS - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +682,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
else
rc = fallback_migrate_page(mapping, newpage, page, mode);
- if (rc) {
+ if (rc != MIGRATEPAGE_SUCCESS) {
newpage->mapping = NULL;
} else {
if (remap_swapcache)
@@ -751,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/
if (PageAnon(page)) {
/*
- * Only page_lock_anon_vma() understands the subtleties of
+ * Only page_lock_anon_vma_read() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
*/
anon_vma = page_get_anon_vma(page);
@@ -778,6 +795,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
}
+ if (unlikely(balloon_page_movable(page))) {
+ /*
+ * A ballooned page does not need any special attention from
+ * physical to virtual reverse mapping procedures.
+ * Skip any attempt to unmap PTEs or to remap swap cache,
+ * in order to avoid burning cycles at rmap level, and perform
+ * the page migration right away (protected by page lock).
+ */
+ rc = balloon_page_migrate(newpage, page, mode);
+ goto uncharge;
+ }
+
/*
* Corner case handling:
* 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +843,9 @@ skip_unmap:
put_anon_vma(anon_vma);
uncharge:
- mem_cgroup_end_migration(mem, page, newpage, rc == 0);
+ mem_cgroup_end_migration(mem, page, newpage,
+ (rc == MIGRATEPAGE_SUCCESS ||
+ rc == MIGRATEPAGE_BALLOON_SUCCESS));
unlock:
unlock_page(page);
out:
@@ -846,6 +877,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
goto out;
rc = __unmap_and_move(page, newpage, force, offlining, mode);
+
+ if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
+ /*
+ * A ballooned page has been migrated already.
+ * Now it is time to wrap up the counters,
+ * hand the page back to the buddy allocator and return.
+ */
+ dec_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
+ balloon_page_free(page);
+ return MIGRATEPAGE_SUCCESS;
+ }
out:
if (rc != -EAGAIN) {
/*
@@ -958,10 +1001,11 @@ out:
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
+ int nr_succeeded = 0;
int pass = 0;
struct page *page;
struct page *page2;
@@ -987,7 +1031,8 @@ int migrate_pages(struct list_head *from,
case -EAGAIN:
retry++;
break;
- case 0:
+ case MIGRATEPAGE_SUCCESS:
+ nr_succeeded++;
break;
default:
/* Permanent failure */
@@ -996,15 +1041,18 @@ int migrate_pages(struct list_head *from,
}
}
}
- rc = 0;
+ rc = nr_failed + retry;
out:
+ if (nr_succeeded)
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ if (nr_failed)
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
- if (rc)
- return rc;
-
- return nr_failed + retry;
+ return rc;
}
int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1072,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
/* try again */
cond_resched();
break;
- case 0:
+ case MIGRATEPAGE_SUCCESS:
goto out;
default:
rc = -EIO;
@@ -1139,7 +1187,8 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0, MIGRATE_SYNC);
+ (unsigned long)pm, 0, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1201,7 +1250,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
if (node < 0 || node >= MAX_NUMNODES)
goto out_pm;
- if (!node_state(node, N_HIGH_MEMORY))
+ if (!node_state(node, N_MEMORY))
goto out_pm;
err = -EACCES;
@@ -1403,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
}
return err;
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks, which is crude.
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+ int nr_migrate_pages)
+{
+ int z;
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable)
+ continue;
+
+ /* Avoid waking kswapd by allocating pages_to_migrate pages. */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone) +
+ nr_migrate_pages,
+ 0, 0))
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+ unsigned long data,
+ int **result)
+{
+ int nid = (int) data;
+ struct page *newpage;
+
+ newpage = alloc_pages_exact_node(nid,
+ (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN) &
+ ~GFP_IOFS, 0);
+ if (newpage)
+ page_xchg_last_nid(newpage, page_last_nid(page));
+
+ return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
+ * window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited. However,
+ * as it is faults that reset the window, PTE updates will happen unconditionally
+ * if there has not been a fault since @pteupdate_interval_millisecs after the
+ * throttle window closed.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
+
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+ msecs_to_jiffies(pteupdate_interval_millisecs)))
+ return false;
+
+ if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+ return false;
+
+ return true;
+}
+
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+{
+ bool rate_limited = false;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ spin_lock(&pgdat->numabalancing_migrate_lock);
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies +
+ msecs_to_jiffies(migrate_interval_millisecs);
+ }
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+ rate_limited = true;
+ else
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+ return rate_limited;
+}
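
The windowed throttle above can be illustrated in isolation. The following standalone userspace sketch reproduces the arithmetic of numamigrate_update_ratelimit() with numbers chosen for illustration (4K pages assumed); it is not kernel code:

	#include <stdbool.h>
	#include <stdio.h>

	#define WINDOW_MS	100UL			/* migrate_interval_millisecs */
	#define LIMIT_PAGES	(128UL << (20 - 12))	/* 128MB worth of 4K pages */

	struct node_rl {
		unsigned long window_end_ms;	/* end of the current window */
		unsigned long nr_pages;		/* pages migrated in this window */
	};

	/* returns true when the caller should skip this migration */
	static bool rl_update(struct node_rl *rl, unsigned long now_ms,
			      unsigned long nr_pages)
	{
		if (now_ms > rl->window_end_ms) {
			rl->nr_pages = 0;
			rl->window_end_ms = now_ms + WINDOW_MS;
		}
		if (rl->nr_pages > LIMIT_PAGES)
			return true;
		rl->nr_pages += nr_pages;
		return false;
	}

	int main(void)
	{
		struct node_rl rl = { 0, 0 };
		int throttled = 0;

		/* 256 huge-page-sized requests inside one 100ms window */
		for (int i = 0; i < 256; i++)
			throttled += rl_update(&rl, 10, 512);
		printf("%d of 256 requests throttled\n", throttled);
		return 0;
	}
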
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+ int ret = 0;
+
+ /* Avoid migrating to a node that is nearly full */
+ if (migrate_balanced_pgdat(pgdat, 1)) {
+ int page_lru;
+
+ if (isolate_lru_page(page)) {
+ put_page(page);
+ return 0;
+ }
+
+ /* Page is isolated */
+ ret = 1;
+ page_lru = page_is_file_cache(page);
+ if (!PageTransHuge(page))
+ inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+ else
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ HPAGE_PMD_NR);
+ }
+
+ /*
+ * Page is either isolated or there is not enough space on the target
+ * node. If isolated, then it has taken a reference count and the
+ * caller's reference can be safely dropped without the page
+ * disappearing underneath us during migration. Otherwise the page is
+ * not to be migrated but the caller's reference should still be
+ * dropped so it does not leak.
+ */
+ put_page(page);
+
+ return ret;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ int nr_remaining;
+ LIST_HEAD(migratepages);
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1) {
+ put_page(page);
+ goto out;
+ }
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, 1)) {
+ put_page(page);
+ goto out;
+ }
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated)
+ goto out;
+
+ list_add(&page->lru, &migratepages);
+ nr_remaining = migrate_pages(&migratepages,
+ alloc_misplaced_dst_page,
+ node, false, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
+ if (nr_remaining) {
+ putback_lru_pages(&migratepages);
+ isolated = 0;
+ } else
+ count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ BUG_ON(!list_empty(&migratepages));
+out:
+ return isolated;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node)
+{
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ struct page *new_page = NULL;
+ struct mem_cgroup *memcg = NULL;
+ int page_lru = page_is_file_cache(page);
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1)
+ goto out_dropref;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
+ goto out_dropref;
+
+ new_page = alloc_pages_node(node,
+ (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+ if (!new_page) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out_dropref;
+ }
+ page_xchg_last_nid(new_page, page_last_nid(page));
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ put_page(new_page);
+ goto out_keep_locked;
+ }
+
+ /* Prepare a page as a migration target */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+
+ /* anon mapping, we can simply copy page->mapping to the new page: */
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+ migrate_page_copy(new_page, page);
+ WARN_ON(PageLRU(new_page));
+
+ /* Recheck the target PMD */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry))) {
+ spin_unlock(&mm->page_table_lock);
+
+ /* Reverse changes made by migrate_page_copy() */
+ if (TestClearPageActive(new_page))
+ SetPageActive(page);
+ if (TestClearPageUnevictable(new_page))
+ SetPageUnevictable(page);
+ mlock_migrate_page(page, new_page);
+
+ unlock_page(new_page);
+ put_page(new_page); /* Free it */
+
+ unlock_page(page);
+ putback_lru_page(page);
+
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out;
+ }
+
+ /*
+ * Traditional migration needs to prepare the memcg charge
+ * transaction early to prevent the old page from being
+ * uncharged when installing migration entries. Here we can
+ * save the potential rollback and start the charge transfer
+ * only when migration is already known to end successfully.
+ */
+ mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = pmd_mknonnuma(entry);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+
+ page_add_new_anon_rmap(new_page, vma, haddr);
+
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+ page_remove_rmap(page);
+ /*
+ * Finish the charge transaction under the page table lock to
+ * prevent split_huge_page() from dividing up the charge
+ * before it's fully transferred to the new page.
+ */
+ mem_cgroup_end_migration(memcg, page, new_page, true);
+ spin_unlock(&mm->page_table_lock);
+
+ unlock_page(new_page);
+ unlock_page(page);
+ put_page(page); /* Drop the rmap reference */
+ put_page(page); /* Drop the LRU isolation reference */
+
+ count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+out:
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ -HPAGE_PMD_NR);
+ return isolated;
+
+out_dropref:
+ put_page(page);
+out_keep_locked:
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* CONFIG_NUMA */
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e9..f0b9ce572fc7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock);
/*
* LRU accounting for clear_page_mlock()
*/
-void __clear_page_mlock(struct page *page)
+void clear_page_mlock(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
-
- if (!page->mapping) { /* truncated ? */
+ if (!TestClearPageMlocked(page))
return;
- }
- dec_zone_page_state(page, NR_MLOCK);
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ -hpage_nr_pages(page));
count_vm_event(UNEVICTABLE_PGCLEARED);
if (!isolate_lru_page(page)) {
putback_lru_page(page);
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page)
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
- inc_zone_page_state(page, NR_MLOCK);
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
count_vm_event(UNEVICTABLE_PGMLOCKED);
if (!isolate_lru_page(page))
putback_lru_page(page);
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page)
BUG_ON(!PageLocked(page));
if (TestClearPageMlocked(page)) {
- dec_zone_page_state(page, NR_MLOCK);
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ -hpage_nr_pages(page));
if (!isolate_lru_page(page)) {
int ret = SWAP_AGAIN;
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
goto no_mlock;
- if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+ if (!((vma->vm_flags & VM_DONTEXPAND) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))) {
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
if (page && !IS_ERR(page)) {
lock_page(page);
- /*
- * Like in __mlock_vma_pages_range(),
- * because we lock page here and migration is
- * blocked by the elevated reference, we need
- * only check for file-cache page truncation.
- */
- if (page->mapping)
- munlock_vma_page(page);
+ munlock_vma_page(page);
unlock_page(page);
put_page(page);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index ae18a48e7e4e..f54b235f29a9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
+#include <linux/rbtree_augmented.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -51,12 +52,6 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
-/*
- * WARNING: the debugging will use recursive algorithms so never enable this
- * unless you know what you are doing.
- */
-#undef DEBUG_MM_RB
-
/* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
* behavior is in parens:
@@ -95,6 +90,20 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
/*
+ * The global memory commitment made in the system is a metric that can
+ * be used to drive ballooning decisions when Linux is hosted as a guest.
+ * On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across the competing virtual machines it hosts.
+ * Several metrics drive this policy engine, including the guest-reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as);
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
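
As a hedged sketch of the intended consumer, not part of this patch, a guest balloon driver could sample the commit level roughly as below; struct balloon_dev and report_pressure() are hypothetical names:

	static void example_post_status(struct balloon_dev *dev)
	{
		/* pages currently committed across the whole guest */
		unsigned long committed = vm_memory_committed();

		/* hypothetical helper: report the figure in bytes to the host */
		report_pressure(dev, committed << PAGE_SHIFT);
	}
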
+
+/*
* Check that a process has enough memory to allocate a new virtual
* mapping. 0 means there is enough memory for the allocation to
* succeed and -ENOMEM implies there is not.
@@ -199,14 +208,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
flush_dcache_mmap_lock(mapping);
if (unlikely(vma->vm_flags & VM_NONLINEAR))
- list_del_init(&vma->shared.vm_set.list);
+ list_del_init(&vma->shared.nonlinear);
else
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
/*
- * Unlink a file-based vm structure from its prio_tree, to hide
+ * Unlink a file-based vm structure from its interval tree, to hide
* vma from rmap and vmtruncate before freeing its page tables.
*/
void unlink_file_vma(struct vm_area_struct *vma)
@@ -231,11 +240,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file) {
+ if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
- }
mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
@@ -306,67 +312,202 @@ out:
return retval;
}
-#ifdef DEBUG_MM_RB
+static long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+ unsigned long max, subtree_gap;
+ max = vma->vm_start;
+ if (vma->vm_prev)
+ max -= vma->vm_prev->vm_end;
+ if (vma->vm_rb.rb_left) {
+ subtree_gap = rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb)->rb_subtree_gap;
+ if (subtree_gap > max)
+ max = subtree_gap;
+ }
+ if (vma->vm_rb.rb_right) {
+ subtree_gap = rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb)->rb_subtree_gap;
+ if (subtree_gap > max)
+ max = subtree_gap;
+ }
+ return max;
+}
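
A small worked example of the value being maintained, with numbers chosen for illustration rather than taken from the patch:

	/*
	 * Suppose prev->vm_end = 0x2000 and vma->vm_start = 0x6000, and the
	 * children cache rb_subtree_gap values of 0x1000 (left) and 0x8000
	 * (right).  The gap immediately below this vma is
	 * 0x6000 - 0x2000 = 0x4000, so vma_compute_subtree_gap() returns
	 * max(0x4000, 0x1000, 0x8000) = 0x8000: the largest free gap found
	 * anywhere in this vma's subtree.
	 */
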
+
+#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct rb_root *root)
{
- int i = 0, j;
+ int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev)
- printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1;
- if (vma->vm_start < pend)
+ if (vma->vm_start < prev) {
+ printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ bug = 1;
+ }
+ if (vma->vm_start < pend) {
printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
- if (vma->vm_start > vma->vm_end)
- printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start);
+ bug = 1;
+ }
+ if (vma->vm_start > vma->vm_end) {
+ printk("vm_end %lx < vm_start %lx\n",
+ vma->vm_end, vma->vm_start);
+ bug = 1;
+ }
+ if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
+ printk("free gap %lx, correct %lx\n",
+ vma->rb_subtree_gap,
+ vma_compute_subtree_gap(vma));
+ bug = 1;
+ }
i++;
pn = nd;
prev = vma->vm_start;
pend = vma->vm_end;
}
j = 0;
- for (nd = pn; nd; nd = rb_prev(nd)) {
+ for (nd = pn; nd; nd = rb_prev(nd))
j++;
+ if (i != j) {
+ printk("backwards %d, forwards %d\n", j, i);
+ bug = 1;
+ }
+ return bug ? -1 : i;
+}
+
+static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
+{
+ struct rb_node *nd;
+
+ for (nd = rb_first(root); nd; nd = rb_next(nd)) {
+ struct vm_area_struct *vma;
+ vma = rb_entry(nd, struct vm_area_struct, vm_rb);
+ BUG_ON(vma != ignore &&
+ vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
}
- if (i != j)
- printk("backwards %d, forwards %d\n", j, i), i = 0;
- return i;
}
void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- struct vm_area_struct *tmp = mm->mmap;
- while (tmp) {
- tmp = tmp->vm_next;
+ unsigned long highest_address = 0;
+ struct vm_area_struct *vma = mm->mmap;
+ while (vma) {
+ struct anon_vma_chain *avc;
+ vma_lock_anon_vma(vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ vma_unlock_anon_vma(vma);
+ highest_address = vma->vm_end;
+ vma = vma->vm_next;
i++;
}
- if (i != mm->map_count)
- printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
+ if (i != mm->map_count) {
+ printk("map_count %d vm_next %d\n", mm->map_count, i);
+ bug = 1;
+ }
+ if (highest_address != mm->highest_vm_end) {
+ printk("mm->highest_vm_end %lx, found %lx\n",
+ mm->highest_vm_end, highest_address);
+ bug = 1;
+ }
i = browse_rb(&mm->mm_rb);
- if (i != mm->map_count)
- printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
+ if (i != mm->map_count) {
+ printk("map_count %d rb %d\n", mm->map_count, i);
+ bug = 1;
+ }
BUG_ON(bug);
}
#else
+#define validate_mm_rb(root, ignore) do { } while (0)
#define validate_mm(mm) do { } while (0)
#endif
-static struct vm_area_struct *
-find_vma_prepare(struct mm_struct *mm, unsigned long addr,
- struct vm_area_struct **pprev, struct rb_node ***rb_link,
- struct rb_node ** rb_parent)
+RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
+ unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+
+/*
+ * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
+ * vma->vm_prev->vm_end values changed, without modifying the vma's position
+ * in the rbtree.
+ */
+static void vma_gap_update(struct vm_area_struct *vma)
{
- struct vm_area_struct * vma;
- struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
+ /*
+ * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
+ * function that does exactly what we want.
+ */
+ vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
+}
+
+static inline void vma_rb_insert(struct vm_area_struct *vma,
+ struct rb_root *root)
+{
+ /* All rb_subtree_gap values must be consistent prior to insertion */
+ validate_mm_rb(root, NULL);
+
+ rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
+static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+{
+ /*
+ * All rb_subtree_gap values must be consistent prior to erase,
+ * with the possible exception of the vma being erased.
+ */
+ validate_mm_rb(root, vma);
+
+ /*
+ * Note rb_erase_augmented is a fairly large inline function,
+ * so make sure we instantiate it only once with our desired
+ * augmented rbtree callbacks.
+ */
+ rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+}
+
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_sem and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
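
The callers in this file (vma_adjust() and the stack expanders) bracket every vm_start/vm_end/vm_pgoff update with the two helpers above; a sketch of the pattern, not a literal excerpt of those call sites, with new_start as a placeholder:

	anon_vma_lock_write(vma->anon_vma);
	anon_vma_interval_tree_pre_update_vma(vma);

	vma->vm_start = new_start;	/* and/or vm_end, vm_pgoff */

	anon_vma_interval_tree_post_update_vma(vma);
	anon_vma_unlock(vma->anon_vma);
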
+
+static int find_vma_links(struct mm_struct *mm, unsigned long addr,
+ unsigned long end, struct vm_area_struct **pprev,
+ struct rb_node ***rb_link, struct rb_node **rb_parent)
+{
+ struct rb_node **__rb_link, *__rb_parent, *rb_prev;
__rb_link = &mm->mm_rb.rb_node;
rb_prev = __rb_parent = NULL;
- vma = NULL;
while (*__rb_link) {
struct vm_area_struct *vma_tmp;
@@ -375,9 +516,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
+ /* Fail if an existing vma overlaps the area */
+ if (vma_tmp->vm_start < end)
+ return -ENOMEM;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
@@ -390,14 +531,31 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
*rb_link = __rb_link;
*rb_parent = __rb_parent;
- return vma;
+ return 0;
}
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
+ /* Update tracking information for the gap following the new vma. */
+ if (vma->vm_next)
+ vma_gap_update(vma->vm_next);
+ else
+ mm->highest_vm_end = vma->vm_end;
+
+ /*
+ * vma->vm_prev wasn't known when we followed the rbtree to find the
+ * correct insertion point for that vma. As a result, we could not
+ * update the rb_subtree_gap values of the vma's vm_rb parents on the way down.
+ * So, we first insert the vma with a zero rb_subtree_gap value
+ * (to be consistent with what we did on the way down), and then
+ * immediately update the gap to the correct value. Finally we
+ * rebalance the rbtree after all augmented values have been set.
+ */
rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+ vma->rb_subtree_gap = 0;
+ vma_gap_update(vma);
+ vma_rb_insert(vma, &mm->mm_rb);
}
static void __vma_link_file(struct vm_area_struct *vma)
@@ -417,7 +575,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
if (unlikely(vma->vm_flags & VM_NONLINEAR))
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
else
- vma_prio_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
@@ -455,15 +613,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree. It has already been inserted into the prio_tree.
+ * mm's list and rbtree. It has already been inserted into the interval tree.
*/
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *__vma, *prev;
+ struct vm_area_struct *prev;
struct rb_node **rb_link, *rb_parent;
- __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
- BUG_ON(__vma && __vma->vm_start < vma->vm_end);
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
+ BUG();
__vma_link(mm, vma, prev, rb_link, rb_parent);
mm->map_count++;
}
@@ -472,12 +631,12 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next;
- prev->vm_next = next;
+ vma_rb_erase(vma, &mm->mm_rb);
+ prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
}
@@ -496,9 +655,10 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *next = vma->vm_next;
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
- struct prio_tree_root *root = NULL;
+ struct rb_root *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
+ bool start_changed = false, end_changed = false;
long adjust_next = 0;
int remove_next = 0;
@@ -559,7 +719,7 @@ again: remove_next = 1 + (end > next->vm_end);
mutex_lock(&mapping->i_mmap_mutex);
if (insert) {
/*
- * Put into prio_tree now, so instantiated pages
+ * Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
@@ -570,26 +730,33 @@ again: remove_next = 1 + (end > next->vm_end);
vma_adjust_trans_huge(vma, start, end, adjust_next);
- /*
- * When changing only vma->vm_end, we don't really need anon_vma
- * lock. This is a fairly rare case by itself, but the anon_vma
- * lock may be shared between many sibling processes. Skipping
- * the lock for brk adjustments makes a difference sometimes.
- */
- if (vma->anon_vma && (importer || start != vma->vm_start)) {
- anon_vma = vma->anon_vma;
- anon_vma_lock(anon_vma);
+ anon_vma = vma->anon_vma;
+ if (!anon_vma && adjust_next)
+ anon_vma = next->anon_vma;
+ if (anon_vma) {
+ VM_BUG_ON(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma);
+ anon_vma_lock_write(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ if (adjust_next)
+ anon_vma_interval_tree_pre_update_vma(next);
}
if (root) {
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_remove(vma, root);
+ vma_interval_tree_remove(vma, root);
if (adjust_next)
- vma_prio_tree_remove(next, root);
+ vma_interval_tree_remove(next, root);
}
- vma->vm_start = start;
- vma->vm_end = end;
+ if (start != vma->vm_start) {
+ vma->vm_start = start;
+ start_changed = true;
+ }
+ if (end != vma->vm_end) {
+ vma->vm_end = end;
+ end_changed = true;
+ }
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next << PAGE_SHIFT;
@@ -598,8 +765,8 @@ again: remove_next = 1 + (end > next->vm_end);
if (root) {
if (adjust_next)
- vma_prio_tree_insert(next, root);
- vma_prio_tree_insert(vma, root);
+ vma_interval_tree_insert(next, root);
+ vma_interval_tree_insert(vma, root);
flush_dcache_mmap_unlock(mapping);
}
@@ -618,10 +785,23 @@ again: remove_next = 1 + (end > next->vm_end);
* (it may either follow vma or precede it).
*/
__insert_vm_struct(mm, insert);
+ } else {
+ if (start_changed)
+ vma_gap_update(vma);
+ if (end_changed) {
+ if (!next)
+ mm->highest_vm_end = end;
+ else if (!adjust_next)
+ vma_gap_update(next);
+ }
}
- if (anon_vma)
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ if (adjust_next)
+ anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock(anon_vma);
+ }
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
@@ -636,8 +816,6 @@ again: remove_next = 1 + (end > next->vm_end);
if (file) {
uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
- if (next->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
}
if (next->anon_vma)
anon_vma_merge(vma, next);
@@ -649,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
* we must remove another next too. It would clutter
* up the code too much to do both in one go.
*/
- if (remove_next == 2) {
- next = vma->vm_next;
+ next = vma->vm_next;
+ if (remove_next == 2)
goto again;
- }
+ else if (next)
+ vma_gap_update(next);
+ else
+ mm->highest_vm_end = end;
}
if (insert && file)
uprobe_mmap(insert);
@@ -669,8 +850,7 @@ again: remove_next = 1 + (end > next->vm_end);
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
- if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
+ if (vma->vm_flags ^ vm_flags)
return 0;
if (vma->vm_file != file)
return 0;
@@ -951,8 +1131,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
mm->exec_vm += pages;
} else if (flags & stack_flags)
mm->stack_vm += pages;
- if (flags & (VM_RESERVED|VM_IO))
- mm->reserved_vm += pages;
}
#endif /* CONFIG_PROC_FS */
@@ -1127,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
* memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
- VM_NORESERVE, &user,
- HUGETLB_ANONHUGE_INODE);
+ VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE,
+ (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
}
@@ -1190,7 +1369,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
return 0;
/* Specialty mapping? */
- if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+ if (vm_flags & VM_PFNMAP)
return 0;
/* Can the mapping track the dirty pages? */
@@ -1229,8 +1408,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Clear old maps */
error = -ENOMEM;
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
- if (vma && vma->vm_start < addr + len) {
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
@@ -1301,19 +1479,20 @@ munmap_back:
goto free_vma;
correct_wcount = 1;
}
- vma->vm_file = file;
- get_file(file);
+ vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if (vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
* f_op->mmap method. -DaveM
+ * Bug: If addr is changed, prev, rb_link, rb_parent should
+ * be updated for vma_link()
*/
+ WARN_ON_ONCE(addr != vma->vm_start);
+
addr = vma->vm_start;
pgoff = vma->vm_pgoff;
vm_flags = vma->vm_flags;
@@ -1378,6 +1557,206 @@ unacct_error:
return error;
}
+unsigned long unmapped_area(struct vm_unmapped_area_info *info)
+{
+ /*
+ * We implement the search by looking for an rbtree node that
+ * immediately follows a suitable gap. That is,
+ * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
+ * - gap_end = vma->vm_start >= info->low_limit + length;
+ * - gap_end - gap_start >= length
+ */
+
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask;
+ if (length < info->length)
+ return -ENOMEM;
+
+ /* Adjust search limits by the desired length */
+ if (info->high_limit < length)
+ return -ENOMEM;
+ high_limit = info->high_limit - length;
+
+ if (info->low_limit > high_limit)
+ return -ENOMEM;
+ low_limit = info->low_limit + length;
+
+ /* Check if rbtree root looks promising */
+ if (RB_EMPTY_ROOT(&mm->mm_rb))
+ goto check_highest;
+ vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+ if (vma->rb_subtree_gap < length)
+ goto check_highest;
+
+ while (true) {
+ /* Visit left subtree if it looks promising */
+ gap_end = vma->vm_start;
+ if (gap_end >= low_limit && vma->vm_rb.rb_left) {
+ struct vm_area_struct *left =
+ rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb);
+ if (left->rb_subtree_gap >= length) {
+ vma = left;
+ continue;
+ }
+ }
+
+ gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+check_current:
+ /* Check if current node has a suitable gap */
+ if (gap_start > high_limit)
+ return -ENOMEM;
+ if (gap_end >= low_limit && gap_end - gap_start >= length)
+ goto found;
+
+ /* Visit right subtree if it looks promising */
+ if (vma->vm_rb.rb_right) {
+ struct vm_area_struct *right =
+ rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb);
+ if (right->rb_subtree_gap >= length) {
+ vma = right;
+ continue;
+ }
+ }
+
+ /* Go back up the rbtree to find next candidate node */
+ while (true) {
+ struct rb_node *prev = &vma->vm_rb;
+ if (!rb_parent(prev))
+ goto check_highest;
+ vma = rb_entry(rb_parent(prev),
+ struct vm_area_struct, vm_rb);
+ if (prev == vma->vm_rb.rb_left) {
+ gap_start = vma->vm_prev->vm_end;
+ gap_end = vma->vm_start;
+ goto check_current;
+ }
+ }
+ }
+
+check_highest:
+ /* Check highest gap, which does not precede any rbtree node */
+ gap_start = mm->highest_vm_end;
+ gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
+ if (gap_start > high_limit)
+ return -ENOMEM;
+
+found:
+ /* We found a suitable gap. Clip it with the original low_limit. */
+ if (gap_start < info->low_limit)
+ gap_start = info->low_limit;
+
+ /* Adjust gap address to the desired alignment */
+ gap_start += (info->align_offset - gap_start) & info->align_mask;
+
+ VM_BUG_ON(gap_start + info->length > info->high_limit);
+ VM_BUG_ON(gap_start + info->length > gap_end);
+ return gap_start;
+}
+
+unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long length, low_limit, high_limit, gap_start, gap_end;
+
+ /* Adjust search length to account for worst case alignment overhead */
+ length = info->length + info->align_mask;
+ if (length < info->length)
+ return -ENOMEM;
+
+ /*
+ * Adjust search limits by the desired length.
+ * See implementation comment at top of unmapped_area().
+ */
+ gap_end = info->high_limit;
+ if (gap_end < length)
+ return -ENOMEM;
+ high_limit = gap_end - length;
+
+ if (info->low_limit > high_limit)
+ return -ENOMEM;
+ low_limit = info->low_limit + length;
+
+ /* Check highest gap, which does not precede any rbtree node */
+ gap_start = mm->highest_vm_end;
+ if (gap_start <= high_limit)
+ goto found_highest;
+
+ /* Check if rbtree root looks promising */
+ if (RB_EMPTY_ROOT(&mm->mm_rb))
+ return -ENOMEM;
+ vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
+ if (vma->rb_subtree_gap < length)
+ return -ENOMEM;
+
+ while (true) {
+ /* Visit right subtree if it looks promising */
+ gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+ if (gap_start <= high_limit && vma->vm_rb.rb_right) {
+ struct vm_area_struct *right =
+ rb_entry(vma->vm_rb.rb_right,
+ struct vm_area_struct, vm_rb);
+ if (right->rb_subtree_gap >= length) {
+ vma = right;
+ continue;
+ }
+ }
+
+check_current:
+ /* Check if current node has a suitable gap */
+ gap_end = vma->vm_start;
+ if (gap_end < low_limit)
+ return -ENOMEM;
+ if (gap_start <= high_limit && gap_end - gap_start >= length)
+ goto found;
+
+ /* Visit left subtree if it looks promising */
+ if (vma->vm_rb.rb_left) {
+ struct vm_area_struct *left =
+ rb_entry(vma->vm_rb.rb_left,
+ struct vm_area_struct, vm_rb);
+ if (left->rb_subtree_gap >= length) {
+ vma = left;
+ continue;
+ }
+ }
+
+ /* Go back up the rbtree to find next candidate node */
+ while (true) {
+ struct rb_node *prev = &vma->vm_rb;
+ if (!rb_parent(prev))
+ return -ENOMEM;
+ vma = rb_entry(rb_parent(prev),
+ struct vm_area_struct, vm_rb);
+ if (prev == vma->vm_rb.rb_right) {
+ gap_start = vma->vm_prev ?
+ vma->vm_prev->vm_end : 0;
+ goto check_current;
+ }
+ }
+ }
+
+found:
+ /* We found a suitable gap. Clip it with the original high_limit. */
+ if (gap_end > info->high_limit)
+ gap_end = info->high_limit;
+
+found_highest:
+ /* Compute highest gap address at the desired alignment */
+ gap_end -= info->length;
+ gap_end -= (gap_end - info->align_offset) & info->align_mask;
+
+ VM_BUG_ON(gap_end < info->low_limit);
+ VM_BUG_ON(gap_end < gap_start);
+ return gap_end;
+}
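
The callers below go through a small dispatch wrapper that lives in include/linux/mm.h as part of this series; roughly as follows, reproduced from memory, so treat the exact shape as an approximation:

	static inline unsigned long
	vm_unmapped_area(struct vm_unmapped_area_info *info)
	{
		if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
			return unmapped_area_topdown(info);
		else
			return unmapped_area(info);
	}
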
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -1396,7 +1775,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long start_addr;
+ struct vm_unmapped_area_info info;
if (len > TASK_SIZE)
return -ENOMEM;
@@ -1411,40 +1790,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
(!vma || addr + len <= vma->vm_start))
return addr;
}
- if (len > mm->cached_hole_size) {
- start_addr = addr = mm->free_area_cache;
- } else {
- start_addr = addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }
-full_search:
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- addr = TASK_UNMAPPED_BASE;
- start_addr = addr;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
- if (!vma || addr + len <= vma->vm_start) {
- /*
- * Remember the place where we stopped the search:
- */
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = vma->vm_end;
- }
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = 0;
+ return vm_unmapped_area(&info);
}
#endif
@@ -1469,7 +1821,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0, start_addr;
+ unsigned long addr = addr0;
+ struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
if (len > TASK_SIZE)
@@ -1487,53 +1840,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
}
- /* check if free_area_cache is useful for us */
- if (len <= mm->cached_hole_size) {
- mm->cached_hole_size = 0;
- mm->free_area_cache = mm->mmap_base;
- }
-
-try_again:
- /* either no address requested or can't fit in requested address hole */
- start_addr = addr = mm->free_area_cache;
-
- if (addr < len)
- goto fail;
-
- addr -= len;
- do {
- /*
- * Lookup failure means no vma is above this address,
- * else if new region fits below vma->vm_start,
- * return with success:
- */
- vma = find_vma(mm, addr);
- if (!vma || addr+len <= vma->vm_start)
- /* remember the address as a hint for next time */
- return (mm->free_area_cache = addr);
-
- /* remember the largest hole we saw so far */
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
-
- /* try just below the current vma->vm_start */
- addr = vma->vm_start-len;
- } while (len < vma->vm_start);
-
-fail:
- /*
- * if hint left us with no space for the requested
- * mapping then try again:
- *
- * Note: this is different with the case of bottomup
- * which does the fully line-search, but we use find_vma
- * here that causes some holes skipped.
- */
- if (start_addr != mm->mmap_base) {
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = 0;
- goto try_again;
- }
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = mm->mmap_base;
+ info.align_mask = 0;
+ addr = vm_unmapped_area(&info);
/*
* A failed mmap() very likely causes application failure,
@@ -1541,14 +1853,13 @@ fail:
* can happen with large stack limits and large mmap()
* allocations.
*/
- mm->cached_hole_size = ~0UL;
- mm->free_area_cache = TASK_UNMAPPED_BASE;
- addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
- /*
- * Restore the topdown base:
- */
- mm->free_area_cache = mm->mmap_base;
- mm->cached_hole_size = ~0UL;
+ if (addr & ~PAGE_MASK) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ addr = vm_unmapped_area(&info);
+ }
return addr;
}
@@ -1758,13 +2069,34 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
+ /*
+ * vma_gap_update() doesn't support concurrent
+ * updates, but we only hold a shared mmap_sem
+ * lock here, so we need to protect against
+ * concurrent vma expansions.
+ * vma_lock_anon_vma() doesn't help here, as
+ * we don't guarantee that all growable vmas
+ * in a mm share the same root anon vma.
+ * So, we reuse mm->page_table_lock to guard
+ * against concurrent vma expansions.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+ anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
+ anon_vma_interval_tree_post_update_vma(vma);
+ if (vma->vm_next)
+ vma_gap_update(vma->vm_next);
+ else
+ vma->vm_mm->highest_vm_end = address;
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
perf_event_mmap(vma);
}
}
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma);
+ validate_mm(vma->vm_mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,14 +2140,32 @@ int expand_downwards(struct vm_area_struct *vma,
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
+ /*
+ * vma_gap_update() doesn't support concurrent
+ * updates, but we only hold a shared mmap_sem
+ * lock here, so we need to protect against
+ * concurrent vma expansions.
+ * vma_lock_anon_vma() doesn't help here, as
+ * we don't guarantee that all growable vmas
+ * in a mm share the same root anon vma.
+ * So, we reuse mm->page_table_lock to guard
+ * against concurrent vma expansions.
+ */
+ spin_lock(&vma->vm_mm->page_table_lock);
+ anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ anon_vma_interval_tree_post_update_vma(vma);
+ vma_gap_update(vma);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
perf_event_mmap(vma);
}
}
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma);
+ validate_mm(vma->vm_mm);
return error;
}
@@ -1931,14 +2281,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
vma->vm_prev = NULL;
do {
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ vma_rb_erase(vma, &mm->mm_rb);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
- if (vma)
+ if (vma) {
vma->vm_prev = prev;
+ vma_gap_update(vma);
+ } else
+ mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
@@ -1989,11 +2342,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
if (anon_vma_clone(new, vma))
goto out_free_mpol;
- if (new->vm_file) {
+ if (new->vm_file)
get_file(new->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
- }
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
@@ -2011,11 +2361,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
/* Clean everything up if vma_adjust failed. */
if (new->vm_ops && new->vm_ops->close)
new->vm_ops->close(new);
- if (new->vm_file) {
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
+ if (new->vm_file)
fput(new->vm_file);
- }
unlink_anon_vmas(new);
out_free_mpol:
mpol_put(pol);
@@ -2200,8 +2547,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
* Clear old maps. this also does some error checking for us
*/
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
- if (vma && vma->vm_start < addr + len) {
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
@@ -2315,10 +2661,10 @@ void exit_mmap(struct mm_struct *mm)
* and into the inode's i_mmap tree. If vm_file is non-NULL
* then i_mmap_mutex is taken here.
*/
-int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *prev;
+ struct rb_node **rb_link, *rb_parent;
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2336,8 +2682,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
- if (__vma && __vma->vm_start < vma->vm_end)
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2352,7 +2698,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
* prior to moving page table entries, to effect an mremap move.
*/
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
- unsigned long addr, unsigned long len, pgoff_t pgoff)
+ unsigned long addr, unsigned long len, pgoff_t pgoff,
+ bool *need_rmap_locks)
{
struct vm_area_struct *vma = *vmap;
unsigned long vma_start = vma->vm_start;
@@ -2371,7 +2718,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
- find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
if (new_vma) {
@@ -2393,32 +2741,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
* linear if there are no pages mapped yet.
*/
VM_BUG_ON(faulted_in_anon_vma);
- *vmap = new_vma;
- } else
- anon_vma_moveto_tail(new_vma);
+ *vmap = vma = new_vma;
+ }
+ *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
} else {
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
*new_vma = *vma;
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol))
goto out_free_vma;
+ vma_set_policy(new_vma, pol);
INIT_LIST_HEAD(&new_vma->anon_vma_chain);
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
- vma_set_policy(new_vma, pol);
- new_vma->vm_start = addr;
- new_vma->vm_end = addr + len;
- new_vma->vm_pgoff = pgoff;
- if (new_vma->vm_file) {
+ if (new_vma->vm_file)
get_file(new_vma->vm_file);
-
- if (vma->vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
- }
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ *need_rmap_locks = false;
}
}
return new_vma;
@@ -2536,23 +2881,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
- if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
+ down_write(&anon_vma->root->rwsem);
/*
* We can safely modify head.next after taking the
- * anon_vma->root->mutex. If some other vma in this mm shares
+ * anon_vma->root->rwsem. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->root->mutex.
+ * anon_vma->root->rwsem.
*/
if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->root->head.next))
+ &anon_vma->root->rb_root.rb_node))
BUG();
}
}
@@ -2593,7 +2938,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* A single task can't take more than one mm_take_all_locks() in a row
* or it would deadlock.
*
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
* mapping->flags avoid to take the same lock twice, if more than one
* vma in this mm is backed by the same anon_vma or address_space.
*
@@ -2640,21 +2985,21 @@ out_unlock:
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
- if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+ if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
*
* We must however clear the bitflag before unlocking
- * the vma so the users using the anon_vma->head will
+ * the vma so the users using the anon_vma->rb_root will
* never see our bitflag.
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->root->mutex.
+ * anon_vma->root->rwsem.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
- &anon_vma->root->head.next))
+ &anon_vma->root->rb_root.rb_node))
BUG();
anon_vma_unlock(anon_vma);
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9f..8a5ac8c686b0 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
+#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
+/* global SRCU for all MMs */
+static struct srcu_struct srcu;
+
/*
* This function can't run concurrently against mmu_notifier_register
* because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
* in parallel despite there being no task using this mm any more,
* through the vmas outside of the exit_mmap context, such as with
* vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to RCU and it serializes
- * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
* can't go away from under us as exit_mmap holds an mm_count pin
* itself.
*/
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
/*
- * RCU here will block mmu_notifier_unregister until
+ * SRCU here will block mmu_notifier_unregister until
* ->release returns.
*/
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
/*
* if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
*/
if (mn->ops->release)
mn->ops->release(mn, mm);
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
spin_unlock(&mm->mmu_notifier_mm->lock);
/*
- * synchronize_rcu here prevents mmu_notifier_release to
+ * synchronize_srcu here prevents mmu_notifier_release to
* return to exit_mmap (which would proceed freeing all pages
* in the mm) until the ->release method returns, if it was
* invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
* The mmu_notifier_mm can't go away from under us because one
 * mm_count is held by exit_mmap.
*/
- synchronize_rcu();
+ synchronize_srcu(&srcu);
}
/*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int young = 0;
+ int young = 0, id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->clear_flush_young)
young |= mn->ops->clear_flush_young(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
return young;
}
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int young = 0;
+ int young = 0, id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->test_young) {
young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
break;
}
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
return young;
}
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->change_pte)
mn->ops->change_pte(mn, mm, address, pte);
- /*
- * Some drivers don't have change_pte,
- * so we must call invalidate_page in that case.
- */
- else if (mn->ops->invalidate_page)
- mn->ops->invalidate_page(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_page)
mn->ops->invalidate_page(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start)
mn->ops->invalidate_range_start(mn, mm, start, end);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_end)
mn->ops->invalidate_range_end(mn, mm, start, end);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,6 +195,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ /*
+ * Verify that mmu_notifier_init() has already run and the global srcu is
+ * initialized.
+ */
+ BUG_ON(!srcu.per_cpu_ref);
+
ret = -ENOMEM;
mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
if (unlikely(!mmu_notifier_mm))
@@ -201,11 +210,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
down_write(&mm->mmap_sem);
ret = mm_take_all_locks(mm);
if (unlikely(ret))
- goto out_cleanup;
+ goto out_clean;
if (!mm_has_notifiers(mm)) {
INIT_HLIST_HEAD(&mmu_notifier_mm->list);
spin_lock_init(&mmu_notifier_mm->lock);
+
mm->mmu_notifier_mm = mmu_notifier_mm;
mmu_notifier_mm = NULL;
}
@@ -224,10 +234,9 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
spin_unlock(&mm->mmu_notifier_mm->lock);
mm_drop_all_locks(mm);
-out_cleanup:
+out_clean:
if (take_mmap_sem)
up_write(&mm->mmap_sem);
- /* kfree() does nothing if mmu_notifier_mm is NULL */
kfree(mmu_notifier_mm);
out:
BUG_ON(atomic_read(&mm->mm_users) <= 0);
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
/*
* This releases the mm_count pin automatically and frees the mm
* structure if it was the last user of it. It serializes against
- * running mmu notifiers with RCU and against mmu_notifier_unregister
- * with the unregister lock + RCU. All sptes must be dropped before
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
* calling mmu_notifier_unregister. ->release or any other notifier
* method may be invoked concurrently with mmu_notifier_unregister,
* and only after mmu_notifier_unregister returned we're guaranteed
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
if (!hlist_unhashed(&mn->hlist)) {
/*
- * RCU here will force exit_mmap to wait ->release to finish
+ * SRCU here will force exit_mmap to wait ->release to finish
* before freeing the pages.
*/
- rcu_read_lock();
+ int id;
+ id = srcu_read_lock(&srcu);
/*
* exit_mmap will block in mmu_notifier_release to
* guarantee ->release is called before freeing the
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
*/
if (mn->ops->release)
mn->ops->release(mn, mm);
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
spin_lock(&mm->mmu_notifier_mm->lock);
hlist_del_rcu(&mn->hlist);
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
* Wait for any running method to finish, of course including
* ->release if it was run by mmu_notifier_release instead of us.
*/
- synchronize_rcu();
+ synchronize_srcu(&srcu);
BUG_ON(atomic_read(&mm->mm_count) <= 0);
mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static int __init mmu_notifier_init(void)
+{
+ return init_srcu_struct(&srcu);
+}
+
+module_init(mmu_notifier_init);
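The conversion above follows one pattern throughout: every read-side section walking mm->mmu_notifier_mm->list moves from rcu_read_lock()/rcu_read_unlock() to srcu_read_lock()/srcu_read_unlock() on a single file-global srcu_struct, and the grace-period waits become synchronize_srcu(), so the ->release and invalidation callbacks are allowed to sleep. A condensed kernel-style sketch of that pattern on a hypothetical "watchers" list (the names are made up; only the SRCU and RCU-list calls mirror the patch):

#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>

static struct srcu_struct watch_srcu;	/* init_srcu_struct() at module init */
static LIST_HEAD(watchers);
static DEFINE_SPINLOCK(watchers_lock);

struct watcher {
	struct list_head link;
	void (*notify)(struct watcher *w);
};

static void add_watcher(struct watcher *w)
{
	spin_lock(&watchers_lock);
	list_add_rcu(&w->link, &watchers);
	spin_unlock(&watchers_lock);
}

static void notify_all(void)
{
	struct watcher *w;
	int idx;

	idx = srcu_read_lock(&watch_srcu);
	list_for_each_entry_rcu(w, &watchers, link)
		w->notify(w);			/* may sleep under SRCU */
	srcu_read_unlock(&watch_srcu, idx);
}

static void remove_watcher(struct watcher *w)
{
	spin_lock(&watchers_lock);
	list_del_rcu(&w->link);
	spin_unlock(&watchers_lock);
	synchronize_srcu(&watch_srcu);		/* wait out in-flight notify_all() */
}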
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 3cef80f6ac79..4596d81b89b1 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pfn,
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
{
enum lru_list lru;
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
-
-#ifdef CONFIG_MEMCG
- lruvec->zone = zone;
-#endif
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index a40992610ab6..94722a4d6b43 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
-static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa, bool *ret_all_same_node)
{
+ struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
+ unsigned long pages = 0;
+ bool all_same_node = true;
+ int last_nid = -1;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
+ bool updated = false;
ptent = ptep_modify_prot_start(mm, addr, pte);
- ptent = pte_modify(ptent, newprot);
+ if (!prot_numa) {
+ ptent = pte_modify(ptent, newprot);
+ updated = true;
+ } else {
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page) {
+ int this_nid = page_to_nid(page);
+ if (last_nid == -1)
+ last_nid = this_nid;
+ if (last_nid != this_nid)
+ all_same_node = false;
+
+ /* only check non-shared pages */
+ if (!pte_numa(oldpte) &&
+ page_mapcount(page) == 1) {
+ ptent = pte_mknuma(ptent);
+ updated = true;
+ }
+ }
+ }
/*
* Avoid taking write faults for pages we know to be
* dirty.
*/
- if (dirty_accountable && pte_dirty(ptent))
+ if (dirty_accountable && pte_dirty(ptent)) {
ptent = pte_mkwrite(ptent);
+ updated = true;
+ }
+ if (updated)
+ pages++;
ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,61 +102,101 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
set_pte_at(mm, addr, pte,
swp_entry_to_pte(entry));
}
+ pages++;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
+
+ *ret_all_same_node = all_same_node;
+ return pages;
}
-static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+#ifdef CONFIG_NUMA_BALANCING
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ spin_lock(&mm->page_table_lock);
+ set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
+ spin_unlock(&mm->page_table_lock);
+}
+#else
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
+ pud_t *pud, unsigned long addr, unsigned long end,
+ pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
unsigned long next;
+ unsigned long pages = 0;
+ bool all_same_node;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma->vm_mm, pmd);
- else if (change_huge_pmd(vma, pmd, addr, newprot))
+ split_huge_page_pmd(vma, addr, pmd);
+ else if (change_huge_pmd(vma, pmd, addr, newprot,
+ prot_numa)) {
+ pages += HPAGE_PMD_NR;
continue;
+ }
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
- dirty_accountable);
+ pages += change_pte_range(vma, pmd, addr, next, newprot,
+ dirty_accountable, prot_numa, &all_same_node);
+
+ /*
+ * If we are changing protections for NUMA hinting faults then
+ * set pmd_numa if the examined pages were all on the same
+ * node. This allows a regular PMD to be handled as one fault
+ * and effectively batches the taking of the PTL
+ */
+ if (prot_numa && all_same_node)
+ change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
+
+ return pages;
}
-static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+static inline unsigned long change_pud_range(struct vm_area_struct *vma,
+ pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pud_t *pud;
unsigned long next;
+ unsigned long pages = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(vma, pud, addr, next, newprot,
- dirty_accountable);
+ pages += change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pud++, addr = next, addr != end);
+
+ return pages;
}
-static void change_protection(struct vm_area_struct *vma,
+static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
+ unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
@@ -135,10 +205,32 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(vma, pgd, addr, next, newprot,
- dirty_accountable);
+ pages += change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pgd++, addr = next, addr != end);
- flush_tlb_range(vma, start, end);
+
+ /* Only flush the TLB if we actually modified any entries: */
+ if (pages)
+ flush_tlb_range(vma, start, end);
+
+ return pages;
+}
+
+unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ int dirty_accountable, int prot_numa)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long pages;
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ if (is_vm_hugetlb_page(vma))
+ pages = hugetlb_change_protection(vma, start, end, newprot);
+ else
+ pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+
+ return pages;
}
int
@@ -213,12 +305,9 @@ success:
dirty_accountable = 1;
}
- mmu_notifier_invalidate_range_start(mm, start, end);
- if (is_vm_hugetlb_page(vma))
- hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
- else
- change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
- mmu_notifier_invalidate_range_end(mm, start, end);
+ change_protection(vma, start, end, vma->vm_page_prot,
+ dirty_accountable, 0);
+
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
perf_event_mmap(vma);
@@ -274,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = -EINVAL;
if (!(vma->vm_flags & VM_GROWSDOWN))
goto out;
- }
- else {
+ } else {
if (vma->vm_start > start)
goto out;
if (unlikely(grows & PROT_GROWSUP)) {
@@ -291,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
for (nstart = start ; ; ) {
unsigned long newflags;
- /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
+ /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
+ newflags = vm_flags;
+ newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
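With the reshuffle above, change_protection() now owns the mmu_notifier bracketing and reports how many entries it actually modified, and prot_numa selects the pte_mknuma() path instead of applying newprot. A sketch of how a NUMA-balancing caller might use the new signature; the helper name is hypothetical and the real caller lives outside this diff:

static unsigned long hint_faults_on_range(struct vm_area_struct *vma,
					  unsigned long start, unsigned long end)
{
	unsigned long nr_updated;

	/* newprot is not applied on the prot_numa path, but must be sane */
	nr_updated = change_protection(vma, start, end, vma->vm_page_prot,
				       0 /* dirty_accountable */,
				       1 /* prot_numa */);

	/* a real caller would account this, e.g. in per-node vmstat counters */
	return nr_updated;
}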
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d05..e1031e1f6a61 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr)
+ unsigned long new_addr, bool need_rmap_locks)
{
struct address_space *mapping = NULL;
+ struct anon_vma *anon_vma = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
- if (vma->vm_file) {
- /*
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, we must lock truncate_pagecache
- * out, since it might clean the dst vma before the src vma,
- * and we propagate stale pages into the dst afterward.
- */
- mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ /*
+ * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+ * locks to ensure that rmap will always observe either the old or the
+ * new ptes. This is the easiest way to avoid races with
+ * truncate_pagecache(), page migration, etc...
+ *
+ * When need_rmap_locks is false, we use other ways to avoid
+ * such races:
+ *
+ * - During exec() shift_arg_pages(), we use a specially tagged vma
+ * which rmap call sites look for using is_vma_temporary_stack().
+ *
+ * - During mremap(), new_vma is often known to be placed after vma
+ * in rmap traversal order. This ensures rmap will always observe
+ * either the old pte, or the new pte, or both (the page table locks
+ * serialize access to individual ptes, but only rmap traversal
+ * order guarantees that we won't miss both the old and new ptes).
+ */
+ if (need_rmap_locks) {
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ mutex_lock(&mapping->i_mmap_mutex);
+ }
+ if (vma->anon_vma) {
+ anon_vma = vma->anon_vma;
+ anon_vma_lock_write(anon_vma);
+ }
}
/*
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
+ if (anon_vma)
+ anon_vma_unlock(anon_vma);
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len)
+ unsigned long new_addr, unsigned long len,
+ bool need_rmap_locks)
{
unsigned long extent, next, old_end;
pmd_t *old_pmd, *new_pmd;
bool need_flush = false;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+ mmun_start = old_addr;
+ mmun_end = old_end;
+ mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
@@ -156,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
need_flush = true;
continue;
} else if (!err) {
- split_huge_page_pmd(vma->vm_mm, old_pmd);
+ split_huge_page_pmd(vma, old_addr, old_pmd);
}
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (extent > LATENCY_LIMIT)
extent = LATENCY_LIMIT;
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
- new_vma, new_pmd, new_addr);
+ new_vma, new_pmd, new_addr, need_rmap_locks);
need_flush = true;
}
if (likely(need_flush))
flush_tlb_range(vma, old_end-len, old_addr);
- mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
return len + old_addr - old_end; /* how much done */
}
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long hiwater_vm;
int split = 0;
int err;
+ bool need_rmap_locks;
/*
* We'd prefer to avoid failure later on in do_munmap:
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return err;
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
- new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+ new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+ &need_rmap_locks);
if (!new_vma)
return -ENOMEM;
- moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+ moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+ need_rmap_locks);
if (moved_len < old_len) {
/*
- * Before moving the page tables from the new vma to
- * the old vma, we need to be sure the old vma is
- * queued after new vma in the same_anon_vma list to
- * prevent SMP races with rmap_walk (that could lead
- * rmap_walk to miss some page table).
- */
- anon_vma_moveto_tail(vma);
-
- /*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
*/
- move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+ true);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
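Condensed from move_vma() above for clarity: copy_vma() now reports, via need_rmap_locks, whether the new vma could land before the old one in rmap traversal order, and only then does the forward move need the i_mmap/anon_vma locks; the error rollback always passes true because it moves the page tables backwards. A sketch of the calling convention, with error handling elided:

static long toy_move(struct vm_area_struct *vma, unsigned long old_addr,
		     unsigned long old_len, unsigned long new_addr,
		     unsigned long new_len, pgoff_t new_pgoff)
{
	struct vm_area_struct *new_vma;
	unsigned long moved_len;
	bool need_rmap_locks;

	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks);
	if (!new_vma)
		return -ENOMEM;

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr,
				     old_len, need_rmap_locks);
	if (moved_len < old_len)
		/* the backwards move always crosses rmap order: take the locks */
		move_page_tables(new_vma, new_addr, vma, old_addr,
				 moved_len, true);

	return moved_len;
}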
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 405573010f99..b8294fc03df8 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
return count;
}
+static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ /*
+ * In free_area_init_core(), highmem zone's managed_pages is set to
+ * present_pages, and the bootmem allocator doesn't allocate from highmem
+ * zones. So there's no need to recalculate managed_pages because all
+ * highmem pages will be managed by the buddy system. Here the highmem
+ * zone also includes the highmem movable zone.
+ */
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ if (!is_highmem(z))
+ z->managed_pages = 0;
+}
+
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
* @pgdat: node to be released
@@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
+ reset_node_lowmem_managed_pages(pgdat);
/* free_low_memory_core_early(MAX_NUMNODES) will be called later */
return 0;
@@ -158,12 +175,15 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
*/
unsigned long __init free_all_bootmem(void)
{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat)
+ reset_node_lowmem_managed_pages(pgdat);
+
/*
* We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
* because in some cases, such as when Node0 doesn't have RAM installed,
* low RAM will be on Node1.
- * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
- * will be used instead of only Node0 related
*/
return free_low_memory_core_early(MAX_NUMNODES);
}
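The managed_pages bookkeeping introduced here is easiest to see with numbers: lowmem zones are zeroed before the bootmem-to-buddy handover and re-counted page by page in __free_pages_bootmem() (see the page_alloc.c hunk further down), while highmem zones keep managed_pages equal to present_pages because bootmem never allocated from them. A small user-space model with made-up zone sizes:

#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	const char *name;
	bool highmem;
	unsigned long present_pages;
	unsigned long managed_pages;
};

int main(void)
{
	struct zone_model zones[] = {
		{ "DMA",     false, 4096,   4096 },
		{ "Normal",  false, 221184, 221184 },
		{ "HighMem", true,  40960,  40960 },
	};
	int i;

	/* reset_node_lowmem_managed_pages(): lowmem starts over from zero */
	for (i = 0; i < 3; i++)
		if (!zones[i].highmem)
			zones[i].managed_pages = 0;

	/*
	 * __free_pages_bootmem(): each block released to the buddy allocator
	 * bumps managed_pages; say bootmem kept 2048 pages of Normal for itself.
	 */
	zones[0].managed_pages += 4096;
	zones[1].managed_pages += 221184 - 2048;

	for (i = 0; i < 3; i++)
		printf("%-8s present=%lu managed=%lu\n", zones[i].name,
		       zones[i].present_pages, zones[i].managed_pages);
	return 0;
}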
diff --git a/mm/nommu.c b/mm/nommu.c
index d4b0c10872de..79c3cac87afa 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -66,6 +66,21 @@ int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as);
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(num_physpages);
@@ -698,7 +713,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -764,7 +779,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -789,11 +804,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
kenter("%p", vma);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file) {
+ if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
- }
put_nommu_region(vma->vm_region);
kmem_cache_free(vm_area_cachep, vma);
}
@@ -1282,14 +1294,8 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_pgoff = pgoff;
if (file) {
- region->vm_file = file;
- get_file(file);
- vma->vm_file = file;
- get_file(file);
- if (vm_flags & VM_EXECUTABLE) {
- added_exe_file_vma(current->mm);
- vma->vm_mm = current->mm;
- }
+ region->vm_file = get_file(file);
+ vma->vm_file = get_file(file);
}
down_write(&nommu_region_sem);
@@ -1442,8 +1448,6 @@ error:
kmem_cache_free(vm_region_jar, region);
if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
kmem_cache_free(vm_area_cachep, vma);
kleave(" = %d", ret);
return ret;
@@ -1822,7 +1826,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
if (addr != (pfn << PAGE_SHIFT))
return -EINVAL;
- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1963,6 +1967,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, pgoff_t pgoff)
+{
+ BUG();
+ return 0;
+}
+EXPORT_SYMBOL(generic_file_remap_pages);
+
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
@@ -2047,7 +2059,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
size_t newsize)
{
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
struct vm_region *region;
pgoff_t low, high;
size_t r_size, r_top;
@@ -2059,8 +2070,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
mutex_lock(&inode->i_mapping->i_mmap_mutex);
/* search for VMAs that fall within the dead zone */
- vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
- low, high) {
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
/* found one - only interested if it's shared out of the page
* cache */
if (vma->vm_flags & VM_SHARED) {
@@ -2076,8 +2086,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
* we don't check for any regions that start beyond the EOF as there
* shouldn't be any
*/
- vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
- 0, ULONG_MAX) {
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
+ 0, ULONG_MAX) {
if (!(vma->vm_flags & VM_SHARED))
continue;
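vm_memory_committed() (duplicated here for the nommu build) simply exposes the positive value of vm_committed_as, in pages. A kernel-style sketch of the kind of consumer the comment has in mind; the balloon driver and its report_to_host() transport are hypothetical:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mman.h>

static void balloon_post_status(void)
{
	unsigned long committed_pages = vm_memory_committed();
	u64 committed_bytes = (u64)committed_pages << PAGE_SHIFT;

	report_to_host(committed_bytes);	/* hypothetical transport */
}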
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 198600861638..0399f146ae49 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
-/*
- * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
- * @old_val: old oom_score_adj for compare
- * @new_val: new oom_score_adj for swap
- *
- * Sets the oom_score_adj value for current to @new_val iff its present value is
- * @old_val. Usually used to reinstate a previous value to prevent racing with
- * userspacing tuning the value in the interim.
- */
-void compare_swap_oom_score_adj(int old_val, int new_val)
-{
- struct sighand_struct *sighand = current->sighand;
-
- spin_lock_irq(&sighand->siglock);
- if (current->signal->oom_score_adj == old_val)
- current->signal->oom_score_adj = new_val;
- trace_oom_score_adj_update(current);
- spin_unlock_irq(&sighand->siglock);
-}
-
-/**
- * test_set_oom_score_adj() - set current's oom_score_adj and return old value
- * @new_val: new oom_score_adj value
- *
- * Sets the oom_score_adj value for current to @new_val with proper
- * synchronization and returns the old value. Usually used to temporarily
- * set a value, save the old value in the caller, and then reinstate it later.
- */
-int test_set_oom_score_adj(int new_val)
-{
- struct sighand_struct *sighand = current->sighand;
- int old_val;
-
- spin_lock_irq(&sighand->siglock);
- old_val = current->signal->oom_score_adj;
- current->signal->oom_score_adj = new_val;
- trace_oom_score_adj_update(current);
- spin_unlock_irq(&sighand->siglock);
-
- return old_val;
-}
-
#ifdef CONFIG_NUMA
/**
* has_intersects_mems_allowed() - check task eligibility for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
if (!p)
return 0;
- adj = p->signal->oom_score_adj;
+ adj = (long)p->signal->oom_score_adj;
if (adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
@@ -257,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+ if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
*totalpages = total_swap_pages;
for_each_node_mask(nid, *nodemask)
*totalpages += node_spanned_pages(nid);
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
if (!task->mm)
return OOM_SCAN_CONTINUE;
- if (task->flags & PF_EXITING) {
+ /*
+ * If task is allocating a lot of memory and has been marked to be
+ * killed first if it triggers an oom, then select it.
+ */
+ if (oom_task_origin(task))
+ return OOM_SCAN_SELECT;
+
+ if (task->flags & PF_EXITING && !force_kill) {
/*
- * If task is current and is in the process of releasing memory,
- * allow the "kill" to set TIF_MEMDIE, which will allow it to
- * access memory reserves. Otherwise, it may stall forever.
- *
- * The iteration isn't broken here, however, in case other
- * threads are found to have already been oom killed.
+ * If this task is not being ptraced on exit, then wait for it
+ * to finish before killing some other task unnecessarily.
*/
- if (task == current)
- return OOM_SCAN_SELECT;
- else if (!force_kill) {
- /*
- * If this task is not being ptraced on exit, then wait
- * for it to finish before killing some other task
- * unnecessarily.
- */
- if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
- return OOM_SCAN_ABORT;
- }
+ if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
+ return OOM_SCAN_ABORT;
}
return OOM_SCAN_OK;
}
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
task->mm->nr_ptes,
@@ -428,8 +380,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_adj=%d, oom_score_adj=%d\n",
- current->comm, gfp_mask, order, current->signal->oom_adj,
+ "oom_score_adj=%hd\n",
+ current->comm, gfp_mask, order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
@@ -639,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
spin_unlock(&zone_scan_lock);
}
-/*
- * Try to acquire the oom killer lock for all system zones. Returns zero if a
- * parallel oom killing is taking place, otherwise locks all zones and returns
- * non-zero.
- */
-static int try_set_system_oom(void)
-{
- struct zone *zone;
- int ret = 1;
-
- spin_lock(&zone_scan_lock);
- for_each_populated_zone(zone)
- if (zone_is_oom_locked(zone)) {
- ret = 0;
- goto out;
- }
- for_each_populated_zone(zone)
- zone_set_flag(zone, ZONE_OOM_LOCKED);
-out:
- spin_unlock(&zone_scan_lock);
- return ret;
-}
-
-/*
- * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
- * attempts or page faults may now recall the oom killer, if necessary.
- */
-static void clear_system_oom(void)
-{
- struct zone *zone;
-
- spin_lock(&zone_scan_lock);
- for_each_populated_zone(zone)
- zone_clear_flag(zone, ZONE_OOM_LOCKED);
- spin_unlock(&zone_scan_lock);
-}
-
/**
* out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
@@ -706,11 +621,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
return;
/*
- * If current has a pending SIGKILL, then automatically select it. The
- * goal is to allow it to allocate so that it may quickly exit and free
- * its memory.
+ * If current has a pending SIGKILL or is exiting, then automatically
+ * select it. The goal is to allow it to allocate so that it may
+ * quickly exit and free its memory.
*/
- if (fatal_signal_pending(current)) {
+ if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
set_thread_flag(TIF_MEMDIE);
return;
}
@@ -756,15 +671,16 @@ out:
/*
* The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
- * oom killing is already in progress so do nothing. If a task is found with
- * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
+ * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
+ * parallel oom killing is already in progress so do nothing.
*/
void pagefault_out_of_memory(void)
{
- if (try_set_system_oom()) {
+ struct zonelist *zonelist = node_zonelist(first_online_node,
+ GFP_KERNEL);
+
+ if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
out_of_memory(NULL, 0, 0, NULL, false);
- clear_system_oom();
+ clear_zonelist_oom(zonelist, GFP_KERNEL);
}
- schedule_timeout_killable(1);
}
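The net effect of the oom_kill.c changes on task selection is small but easy to misread in diff form: a task flagged by oom_task_origin() is now selected outright, and an exiting task only aborts the scan when we are not force-killing and it is not being ptraced on exit. A user-space model of that decision (simplified; the eligibility checks earlier in oom_scan_process_thread() are folded into one flag):

#include <stdbool.h>
#include <stdio.h>

enum oom_scan_t { OOM_SCAN_OK, OOM_SCAN_CONTINUE, OOM_SCAN_ABORT, OOM_SCAN_SELECT };

struct task_model {
	bool eligible;		/* killable and in the right memcg/nodemask */
	bool has_mm;
	bool oom_origin;	/* marked to be killed first (oom_task_origin) */
	bool exiting;		/* PF_EXITING */
	bool ptraced_on_exit;	/* PT_TRACE_EXIT on the group leader */
};

static enum oom_scan_t scan(const struct task_model *t, bool force_kill)
{
	if (!t->eligible || !t->has_mm)
		return OOM_SCAN_CONTINUE;
	if (t->oom_origin)
		return OOM_SCAN_SELECT;
	if (t->exiting && !force_kill && !t->ptraced_on_exit)
		return OOM_SCAN_ABORT;	/* let it finish dying on its own */
	return OOM_SCAN_OK;
}

int main(void)
{
	struct task_model exiting = { true, true, false, true, false };
	struct task_model hog     = { true, true, true,  false, false };

	printf("exiting task    -> %d (ABORT)\n", scan(&exiting, false));
	printf("oom-origin task -> %d (SELECT)\n", scan(&hog, false));
	return 0;
}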
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5ad5ce23c1e0..6f4271224493 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
}
/*
- * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
* will look to see if it needs to start dirty throttling.
*
* If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
- * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * balance_dirty_pages_ratelimited - balance dirty memory state
* @mapping: address_space which was dirtied
- * @nr_pages_dirtied: number of pages which the caller has just dirtied
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
* limit we decrease the ratelimiting by a lot, to prevent individual processes
* from overshooting the limit by (ratelimit_pages) each.
*/
-void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
- unsigned long nr_pages_dirtied)
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
int ratelimit;
@@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
*/
p = &__get_cpu_var(dirty_throttle_leaks);
if (*p > 0 && current->nr_dirtied < ratelimit) {
+ unsigned long nr_pages_dirtied;
nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
*p -= nr_pages_dirtied;
current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(mapping, current->nr_dirtied);
}
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
void throttle_vm_writeout(gfp_t gfp_mask)
{
@@ -1602,10 +1601,18 @@ void writeback_set_ratelimit(void)
}
static int __cpuinit
-ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+ratelimit_handler(struct notifier_block *self, unsigned long action,
+ void *hcpu)
{
- writeback_set_ratelimit();
- return NOTIFY_DONE;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ writeback_set_ratelimit();
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
}
static struct notifier_block __cpuinitdata ratelimit_nb = {
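After the rename, callers no longer pass a page count: balance_dirty_pages_ratelimited() is meant to be called once per freshly dirtied page and does its own cheap per-task ratelimiting before deciding to throttle. A kernel-style sketch of the typical call-site shape; dirty_one_page() is a made-up stand-in for the real copy/commit steps:

#include <linux/pagemap.h>
#include <linux/writeback.h>

static ssize_t toy_write_loop(struct address_space *mapping,
			      const char __user *buf, size_t count)
{
	size_t done = 0;

	while (done < count) {
		size_t chunk = min_t(size_t, count - done, PAGE_SIZE);

		if (dirty_one_page(mapping, buf + done, chunk))	/* hypothetical */
			break;

		done += chunk;
		balance_dirty_pages_ratelimited(mapping);
	}

	return done;
}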
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea7538891..2ad2ad168efe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
+#ifdef CONFIG_MOVABLE_NODE
+ [N_MEMORY] = { { [0] = 1UL } },
+#endif
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
@@ -368,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
int nr_pages = 1 << order;
int bad = 0;
- if (unlikely(compound_order(page) != order) ||
- unlikely(!PageHead(page))) {
+ if (unlikely(compound_order(page) != order)) {
bad_page(page);
bad++;
}
@@ -523,7 +525,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
- * -- wli
+ * -- nyc
*/
static inline void __free_one_page(struct page *page,
@@ -558,7 +560,8 @@ static inline void __free_one_page(struct page *page,
if (page_is_guard(buddy)) {
clear_page_guard_flag(buddy);
set_page_private(page, 0);
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ __mod_zone_freepage_state(zone, 1 << order,
+ migratetype);
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
@@ -597,17 +600,6 @@ out:
zone->free_area[order].nr_free++;
}
-/*
- * free_page_mlock() -- clean up attempts to free and mlocked() page.
- * Page should not be on lru, so no need to fix that up.
- * free_pages_check() will verify...
- */
-static inline void free_page_mlock(struct page *page)
-{
- __dec_zone_page_state(page, NR_MLOCK);
- __count_vm_event(UNEVICTABLE_MLOCKFREED);
-}
-
static inline int free_pages_check(struct page *page)
{
if (unlikely(page_mapcount(page) |
@@ -618,6 +610,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
+ reset_page_last_nid(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -668,15 +661,22 @@ static void free_pcppages_bulk(struct zone *zone, int count,
batch_free = to_free;
do {
+ int mt; /* migratetype of the to-be-freed page */
+
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
+ mt = get_freepage_migratetype(page);
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
- __free_one_page(page, zone, 0, page_private(page));
- trace_mm_page_pcpu_drain(page, 0, page_private(page));
+ __free_one_page(page, zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+ if (is_migrate_cma(mt))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+ }
} while (--to_free && --batch_free && !list_empty(list));
}
- __mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
}
@@ -688,7 +688,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
zone->pages_scanned = 0;
__free_one_page(page, zone, order, migratetype);
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ if (unlikely(migratetype != MIGRATE_ISOLATE))
+ __mod_zone_freepage_state(zone, 1 << order, migratetype);
spin_unlock(&zone->lock);
}
@@ -721,20 +722,26 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
- int wasMlocked = __TestClearPageMlocked(page);
+ int migratetype;
if (!free_pages_prepare(page, order))
return;
local_irq_save(flags);
- if (unlikely(wasMlocked))
- free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, order,
- get_pageblock_migratetype(page));
+ migratetype = get_pageblock_migratetype(page);
+ set_freepage_migratetype(page, migratetype);
+ free_one_page(page_zone(page), page, order, migratetype);
local_irq_restore(flags);
}
+/*
+ * Read access to zone->managed_pages is safe because it's unsigned long,
+ * but we still need to serialize writers. Currently, all callers of
+ * __free_pages_bootmem() other than put_page_bootmem() run only at boot
+ * time, so to keep boot time short we shift the burden of serializing
+ * writers to put_page_bootmem().
+ */
void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
@@ -750,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
set_page_count(p, 0);
}
+ page_zone(page)->managed_pages += 1 << order;
set_page_refcounted(page);
__free_pages(page, order);
}
@@ -785,7 +793,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
- * -- wli
+ * -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, struct free_area *area,
@@ -811,7 +819,8 @@ static inline void expand(struct zone *zone, struct page *page,
set_page_guard_flag(&page[size]);
set_page_private(&page[size], high);
/* Guard pages are not available for any usage */
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+ __mod_zone_freepage_state(zone, -(1 << high),
+ migratetype);
continue;
}
#endif
@@ -915,7 +924,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
-static int move_freepages(struct zone *zone,
+int move_freepages(struct zone *zone,
struct page *start_page, struct page *end_page,
int migratetype)
{
@@ -951,6 +960,7 @@ static int move_freepages(struct zone *zone,
order = page_order(page);
list_move(&page->lru,
&zone->free_area[order].free_list[migratetype]);
+ set_freepage_migratetype(page, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -1135,8 +1145,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
mt = migratetype;
}
- set_page_private(page, mt);
+ set_freepage_migratetype(page, mt);
list = &page->lru;
+ if (is_migrate_cma(mt))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+ -(1 << order));
}
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
@@ -1296,16 +1309,13 @@ void free_hot_cold_page(struct page *page, int cold)
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
- int wasMlocked = __TestClearPageMlocked(page);
if (!free_pages_prepare(page, 0))
return;
migratetype = get_pageblock_migratetype(page);
- set_page_private(page, migratetype);
+ set_freepage_migratetype(page, migratetype);
local_irq_save(flags);
- if (unlikely(wasMlocked))
- free_page_mlock(page);
__count_vm_event(PGFREE);
/*
@@ -1380,41 +1390,42 @@ void split_page(struct page *page, unsigned int order)
}
/*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
+ * Similar to the split_page family of functions, except that the page is
+ * required at the given order and is isolated now to prevent races with
+ * parallel allocators.
*/
-int split_free_page(struct page *page)
+int capture_free_page(struct page *page, int alloc_order, int migratetype)
{
unsigned int order;
unsigned long watermark;
struct zone *zone;
+ int mt;
BUG_ON(!PageBuddy(page));
zone = page_zone(page);
order = page_order(page);
+ mt = get_pageblock_migratetype(page);
- /* Obey watermarks as if the page was being allocated */
- watermark = low_wmark_pages(zone) + (1 << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
- return 0;
+ if (mt != MIGRATE_ISOLATE) {
+ /* Obey watermarks as if the page was being allocated */
+ watermark = low_wmark_pages(zone) + (1 << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return 0;
+
+ __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+ }
/* Remove page from free list */
list_del(&page->lru);
zone->free_area[order].nr_free--;
rmv_page_order(page);
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
- /* Split into individual pages */
- set_page_refcounted(page);
- split_page(page, order);
+ if (alloc_order != order)
+ expand(zone, page, alloc_order, order,
+ &zone->free_area[order], migratetype);
+ /* Set the pageblock if the captured page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1436,35 @@ int split_free_page(struct page *page)
}
}
- return 1 << order;
+ return 1UL << alloc_order;
+}
+
+/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+ unsigned int order;
+ int nr_pages;
+
+ BUG_ON(!PageBuddy(page));
+ order = page_order(page);
+
+ nr_pages = capture_free_page(page, order, 0);
+ if (!nr_pages)
+ return 0;
+
+ /* Split into individual pages */
+ set_page_refcounted(page);
+ split_page(page, order);
+ return nr_pages;
}
/*
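split_free_page() is now a thin wrapper: capture_free_page() does the watermark check, the freepage accounting and the expand(), and hands back a page still at the requested order that never revisits the free lists. A sketch of how a direct caller (compaction's capture path, which is outside this hunk) might use it; the surrounding checks are assumptions, and the caller must hold zone->lock with IRQs disabled:

static struct page *try_capture(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	lockdep_assert_held(&zone->lock);	/* IRQs disabled by the caller */

	if (!PageBuddy(page) || page_order(page) < order)
		return NULL;			/* not a large enough buddy block */

	if (!capture_free_page(page, order, migratetype))
		return NULL;			/* watermarks said no */

	set_page_refcounted(page);		/* now an ordinary allocated page */
	return page;
}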
@@ -1484,7 +1523,8 @@ again:
spin_unlock(&zone->lock);
if (!page)
goto failed;
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pageblock_migratetype(page));
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -1501,19 +1541,6 @@ failed:
return NULL;
}
-/* The ALLOC_WMARK bits are used as an index to zone->watermark */
-#define ALLOC_WMARK_MIN WMARK_MIN
-#define ALLOC_WMARK_LOW WMARK_LOW
-#define ALLOC_WMARK_HIGH WMARK_HIGH
-#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
-
-/* Mask to get the watermark bits */
-#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
-
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-
#ifdef CONFIG_FAIL_PAGE_ALLOC
static struct {
@@ -1608,7 +1635,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
-
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
if (free_pages <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
@@ -1675,7 +1706,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
*
* If the zonelist cache is present in the passed in zonelist, then
* returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
+ * tasks mems_allowed, or node_states[N_MEMORY].)
*
* If the zonelist cache is not available for this zonelist, does
* nothing and returns NULL.
@@ -1704,7 +1735,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
&cpuset_current_mems_allowed :
- &node_states[N_HIGH_MEMORY];
+ &node_states[N_MEMORY];
return allowednodes;
}
@@ -1782,6 +1813,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
}
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+ return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
+}
+
+static void __paginginit init_zone_allows_reclaim(int nid)
+{
+ int i;
+
+ for_each_online_node(i)
+ if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+ node_set(i, NODE_DATA(nid)->reclaim_nodes);
+ else
+ zone_reclaim_mode = 1;
+}
+
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1802,6 +1849,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
static void zlc_clear_zones_full(struct zonelist *zonelist)
{
}
+
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+ return true;
+}
+
+static inline void init_zone_allows_reclaim(int nid)
+{
+}
#endif /* CONFIG_NUMA */
/*
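init_zone_allows_reclaim() records, per node, which peers lie within RECLAIM_DISTANCE (zone_allows_reclaim() later limits zone reclaim to exactly those) and enables zone_reclaim_mode as soon as any node is farther away, replacing the old distance check in build_zonelists(). A user-space model with a made-up three-node distance table:

#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_DISTANCE 30
#define NR_NODES 3

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

int main(void)
{
	bool reclaim_nodes[NR_NODES] = { false };
	int zone_reclaim_mode = 0;
	int nid = 0, i;

	for (i = 0; i < NR_NODES; i++) {
		if (node_distance[nid][i] <= RECLAIM_DISTANCE)
			reclaim_nodes[i] = true;	/* zone reclaim may be tried here */
		else
			zone_reclaim_mode = 1;		/* a remote node exists: reclaim first */
	}

	for (i = 0; i < NR_NODES; i++)
		printf("node %d -> node %d: reclaim %s\n", nid, i,
		       reclaim_nodes[i] ? "allowed" : "skipped");
	printf("zone_reclaim_mode = %d\n", zone_reclaim_mode);
	return 0;
}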
@@ -1829,7 +1885,7 @@ zonelist_scan:
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
- if (NUMA_BUILD && zlc_active &&
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1875,7 +1931,8 @@ zonelist_scan:
classzone_idx, alloc_flags))
goto try_this_zone;
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ !did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
@@ -1886,14 +1943,15 @@ zonelist_scan:
did_zlc_setup = 1;
}
- if (zone_reclaim_mode == 0)
+ if (zone_reclaim_mode == 0 ||
+ !zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*/
- if (NUMA_BUILD && zlc_active &&
+ if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
@@ -1919,11 +1977,11 @@ try_this_zone:
if (page)
break;
this_zone_full:
- if (NUMA_BUILD)
+ if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}
- if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+ if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
@@ -2105,7 +2163,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
- struct page *page;
+ struct page *page = NULL;
if (!order)
return NULL;
@@ -2118,10 +2176,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration,
- contended_compaction);
+ contended_compaction, &page);
current->flags &= ~PF_MEMALLOC;
- if (*did_some_progress != COMPACT_SKIPPED) {
+ /* If compaction captured a page, prep and use it */
+ if (page) {
+ prep_new_page(page, order, gfp_mask);
+ goto got_page;
+ }
+
+ if (*did_some_progress != COMPACT_SKIPPED) {
/* Page migration frees to the PCP lists but we want merging */
drain_pages(get_cpu());
put_cpu();
@@ -2131,6 +2195,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page) {
+got_page:
+ preferred_zone->compact_blockskip_flush = false;
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
if (order >= preferred_zone->compact_order_failed)
@@ -2215,7 +2281,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
return NULL;
/* After successful reclaim, reconsider all zones for allocation */
- if (NUMA_BUILD)
+ if (IS_ENABLED(CONFIG_NUMA))
zlc_clear_zones_full(zonelist);
retry:
@@ -2315,7 +2381,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
-
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
return alloc_flags;
}
@@ -2358,7 +2427,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* allowed per node queues are empty and that nodes are
* over allocated.
*/
- if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
@@ -2541,6 +2611,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+ struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2559,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ /*
+ * Will only have any effect when __GFP_KMEMCG is set. This is
+ * verified in the (always inline) callee
+ */
+ if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+ return NULL;
+
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
@@ -2569,9 +2648,13 @@ retry_cpuset:
if (!preferred_zone)
goto out;
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
+ zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
if (unlikely(!page))
page = __alloc_pages_slowpath(gfp_mask, order,
@@ -2590,6 +2673,8 @@ out:
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
+ memcg_kmem_commit_charge(page, memcg, order);
+
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2642,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
+/*
+ * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
+ * pages allocated with __GFP_KMEMCG.
+ *
+ * Those pages are accounted to a particular memcg, embedded in the
+ * corresponding page_cgroup. To avoid a lookup in the allocator hot path that
+ * would, for users with no interest in this information, only find that it is
+ * NULL, we provide these functions.
+ *
+ * The caller knows better which flags it relies on.
+ */
+void __free_memcg_kmem_pages(struct page *page, unsigned int order)
+{
+ memcg_kmem_uncharge_pages(page, order);
+ __free_pages(page, order);
+}
+
+void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
+{
+ if (addr != 0) {
+ VM_BUG_ON(!virt_addr_valid((void *)addr));
+ __free_memcg_kmem_pages(virt_to_page((void *)addr), order);
+ }
+}
+
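The pairing rule these helpers encode: pages allocated with __GFP_KMEMCG get charged to the allocating task's memcg inside __alloc_pages_nodemask() (see the memcg_kmem_newpage_charge() hunk above), so they must be returned through the *_memcg_kmem_pages() variants or the charge leaks. A sketch of a hypothetical caller:

#include <linux/gfp.h>

static void *toy_kmem_buffer(unsigned int order)
{
	return (void *)__get_free_pages(GFP_KERNEL | __GFP_KMEMCG, order);
}

static void toy_kmem_buffer_free(void *buf, unsigned int order)
{
	/* uncharges the memcg, then frees the pages */
	free_memcg_kmem_pages((unsigned long)buf, order);
}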
static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
{
if (addr) {
@@ -2760,7 +2870,7 @@ unsigned int nr_free_pagecache_pages(void)
static inline void show_node(struct zone *zone)
{
- if (NUMA_BUILD)
+ if (IS_ENABLED(CONFIG_NUMA))
printk("Node %d ", zone_to_nid(zone));
}
@@ -2818,6 +2928,31 @@ out:
#define K(x) ((x) << (PAGE_SHIFT-10))
+static void show_migration_types(unsigned char type)
+{
+ static const char types[MIGRATE_TYPES] = {
+ [MIGRATE_UNMOVABLE] = 'U',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_MOVABLE] = 'M',
+ [MIGRATE_RESERVE] = 'R',
+#ifdef CONFIG_CMA
+ [MIGRATE_CMA] = 'C',
+#endif
+ [MIGRATE_ISOLATE] = 'I',
+ };
+ char tmp[MIGRATE_TYPES + 1];
+ char *p = tmp;
+ int i;
+
+ for (i = 0; i < MIGRATE_TYPES; i++) {
+ if (type & (1 << i))
+ *p++ = types[i];
+ }
+
+ *p = '\0';
+ printk("(%s) ", tmp);
+}
+
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -2852,7 +2987,8 @@ void show_free_areas(unsigned int filter)
" unevictable:%lu"
" dirty:%lu writeback:%lu unstable:%lu\n"
" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
+ " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON),
@@ -2869,7 +3005,8 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
- global_page_state(NR_BOUNCE));
+ global_page_state(NR_BOUNCE),
+ global_page_state(NR_FREE_CMA_PAGES));
for_each_populated_zone(zone) {
int i;
@@ -2890,6 +3027,7 @@ void show_free_areas(unsigned int filter)
" isolated(anon):%lukB"
" isolated(file):%lukB"
" present:%lukB"
+ " managed:%lukB"
" mlocked:%lukB"
" dirty:%lukB"
" writeback:%lukB"
@@ -2901,6 +3039,7 @@ void show_free_areas(unsigned int filter)
" pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB"
+ " free_cma:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
@@ -2918,6 +3057,7 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_ISOLATED_ANON)),
K(zone_page_state(zone, NR_ISOLATED_FILE)),
K(zone->present_pages),
+ K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_FILE_DIRTY)),
K(zone_page_state(zone, NR_WRITEBACK)),
@@ -2930,6 +3070,7 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
(zone->all_unreclaimable ? "yes" : "no")
@@ -2942,6 +3083,7 @@ void show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ unsigned char types[MAX_ORDER];
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
@@ -2950,12 +3092,24 @@ void show_free_areas(unsigned int filter)
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
- nr[order] = zone->free_area[order].nr_free;
+ struct free_area *area = &zone->free_area[order];
+ int type;
+
+ nr[order] = area->nr_free;
total += nr[order] << order;
+
+ types[order] = 0;
+ for (type = 0; type < MIGRATE_TYPES; type++) {
+ if (!list_empty(&area->free_list[type]))
+ types[order] |= 1 << type;
+ }
}
spin_unlock_irqrestore(&zone->lock, flags);
- for (order = 0; order < MAX_ORDER; order++)
+ for (order = 0; order < MAX_ORDER; order++) {
printk("%lu*%lukB ", nr[order], K(1UL) << order);
+ if (nr[order])
+ show_migration_types(types[order]);
+ }
printk("= %lukB\n", K(total));
}
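
The per-order dump above now tags each order with the migrate types whose free lists are non-empty, encoded as one bit per type and rendered as the letters U/E/M/R/C/I. A minimal user-space sketch of that encoding and rendering; the enum values and the 4 KiB page size are assumptions of the sketch, chosen to mirror the migrate types shown in the hunk:

#include <stdio.h>

/* Assumed to mirror the migrate types listed in show_migration_types(). */
enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, CMA, ISOLATE, NR_TYPES };
static const char letters[NR_TYPES] = { 'U', 'E', 'M', 'R', 'C', 'I' };

/* Render a bitmask of non-empty free lists as a "(UM)"-style annotation. */
static void show_types(unsigned char mask)
{
	char buf[NR_TYPES + 1], *p = buf;
	int t;

	for (t = 0; t < NR_TYPES; t++)
		if (mask & (1 << t))
			*p++ = letters[t];
	*p = '\0';
	printf("(%s) ", buf);
}

int main(void)
{
	/* e.g. two order-3 blocks, present on the UNMOVABLE and MOVABLE lists */
	unsigned long nr = 2, order = 3, kb_per_page = 4; /* assumes 4 KiB pages */

	printf("%lu*%lukB ", nr, kb_per_page << order);
	show_types((1 << UNMOVABLE) | (1 << MOVABLE));
	printf("= %lukB\n", (nr << order) * kb_per_page);
	return 0;
}

Compiled and run, this prints "2*32kB (UM) = 64kB", the same shape the patched show_free_areas() emits for each order.
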
@@ -3132,7 +3286,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
return node;
}
- for_each_node_state(n, N_HIGH_MEMORY) {
+ for_each_node_state(n, N_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -3274,7 +3428,7 @@ static int default_zonelist_order(void)
* local memory, NODE_ORDER may be suitable.
*/
average_size = total_size /
- (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
+ (nodes_weight(node_states[N_MEMORY]) + 1);
for_each_online_node(nid) {
low_kmem_size = 0;
total_size = 0;
@@ -3328,21 +3482,13 @@ static void build_zonelists(pg_data_t *pgdat)
j = 0;
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
- int distance = node_distance(local_node, node);
-
- /*
- * If another node is sufficiently far away then it is better
- * to reclaim pages in a zone before going off node.
- */
- if (distance > RECLAIM_DISTANCE)
- zone_reclaim_mode = 1;
-
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
- if (distance != node_distance(local_node, prev_node))
+ if (node_distance(local_node, node) !=
+ node_distance(local_node, prev_node))
node_load[node] = load;
prev_node = node;
@@ -3772,6 +3918,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
+ reset_page_last_nid(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -4378,6 +4525,26 @@ void __init set_pageblock_order(void)
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
+ unsigned long present_pages)
+{
+ unsigned long pages = spanned_pages;
+
+ /*
+ * Provide a more accurate estimation if there are holes within
+ * the zone and SPARSEMEM is in use. If there are holes within the
+ * zone, each populated memory region may cost us one or two extra
+ * memmap pages due to alignment because the memmap pages for each
+ * populated region may not be naturally aligned on a page boundary.
+ * So the (present_pages >> 4) heuristic is a tradeoff for that.
+ */
+ if (spanned_pages > present_pages + (present_pages >> 4) &&
+ IS_ENABLED(CONFIG_SPARSEMEM))
+ pages = present_pages;
+
+ return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
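
calc_memmap_size() falls back from spanned_pages to present_pages only when the holes exceed one sixteenth of the present memory and SPARSEMEM is enabled. A small user-space model of that arithmetic, assuming 4 KiB pages and a 64-byte struct page (illustrative values, not taken from this patch):

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed: 4 KiB pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define STRUCT_PAGE_SZ	64UL			/* assumed sizeof(struct page) */
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long calc_memmap_size(unsigned long spanned, unsigned long present,
				       int sparsemem)
{
	unsigned long pages = spanned;

	/* Use present_pages only when the holes exceed the 1/16 slack. */
	if (spanned > present + (present >> 4) && sparsemem)
		pages = present;

	return PAGE_ALIGN(pages * STRUCT_PAGE_SZ) >> PAGE_SHIFT;
}

int main(void)
{
	/* A 1 GiB span with half of it holes: the heuristic kicks in. */
	unsigned long spanned = 262144, present = 131072;

	printf("memmap pages: %lu\n", calc_memmap_size(spanned, present, 1));
	printf("without SPARSEMEM: %lu\n", calc_memmap_size(spanned, present, 0));
	return 0;
}

With these inputs the SPARSEMEM case charges 2048 memmap pages instead of 4096, which is exactly the saving the heuristic is after.
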
/*
* Set up the zone data structures:
* - mark all pages reserved
@@ -4395,59 +4562,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
int ret;
pgdat_resize_init(pgdat);
+#ifdef CONFIG_NUMA_BALANCING
+ spin_lock_init(&pgdat->numabalancing_migrate_lock);
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize, memmap_pages;
+ unsigned long size, realsize, freesize, memmap_pages;
size = zone_spanned_pages_in_node(nid, j, zones_size);
- realsize = size - zone_absent_pages_in_node(nid, j,
+ realsize = freesize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
/*
- * Adjust realsize so that it accounts for how much memory
+ * Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
- memmap_pages =
- PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
- if (realsize >= memmap_pages) {
- realsize -= memmap_pages;
+ memmap_pages = calc_memmap_size(size, realsize);
+ if (freesize >= memmap_pages) {
+ freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
- " %s zone: %lu pages exceeds realsize %lu\n",
- zone_names[j], memmap_pages, realsize);
+ " %s zone: %lu pages exceeds freesize %lu\n",
+ zone_names[j], memmap_pages, freesize);
/* Account for reserved pages */
- if (j == 0 && realsize > dma_reserve) {
- realsize -= dma_reserve;
+ if (j == 0 && freesize > dma_reserve) {
+ freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
- nr_kernel_pages += realsize;
- nr_all_pages += realsize;
+ nr_kernel_pages += freesize;
+ /* Charge for highmem memmap if there are enough kernel pages */
+ else if (nr_kernel_pages > memmap_pages * 2)
+ nr_kernel_pages -= memmap_pages;
+ nr_all_pages += freesize;
zone->spanned_pages = size;
- zone->present_pages = realsize;
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
- zone->compact_cached_free_pfn = zone->zone_start_pfn +
- zone->spanned_pages;
- zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
-#endif
+ zone->present_pages = freesize;
+ /*
+ * Set an approximate value for lowmem here; it will be adjusted
+ * when the bootmem allocator frees pages into the buddy system.
+ * And all highmem pages will be managed by the buddy system.
+ */
+ zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
+ zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+ zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
@@ -4456,7 +4631,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- lruvec_init(&zone->lruvec, zone);
+ lruvec_init(&zone->lruvec);
if (!size)
continue;
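
In the loop above each zone ends up with three numbers: spanned_pages, present_pages (with the memmap and DMA-reserve pages already deducted) and the new managed_pages, which keeps realsize for highmem because highmem's memmap lives in lowmem. A rough user-space model of that subtraction order, with made-up zone sizes:

#include <stdio.h>

struct zone_acct {
	unsigned long spanned, present, managed;
};

/*
 * Rough model of the accounting order above; all inputs are made up.
 * Pass a non-zero dma_reserve only for the first zone, as the kernel does.
 */
static struct zone_acct account(unsigned long spanned, unsigned long absent,
				unsigned long memmap_pages,
				unsigned long dma_reserve, int is_highmem)
{
	struct zone_acct z;
	unsigned long realsize = spanned - absent;
	unsigned long freesize = realsize;

	if (freesize >= memmap_pages)
		freesize -= memmap_pages;	/* the zone pays for its own memmap */
	if (freesize > dma_reserve)
		freesize -= dma_reserve;	/* reserved lowmem pages */

	z.spanned = spanned;
	z.present = freesize;
	/* highmem memmap lives in lowmem, so highmem keeps realsize here */
	z.managed = is_highmem ? realsize : freesize;
	return z;
}

int main(void)
{
	/* 1 GiB span, 16 MiB of holes, 4032 memmap pages, 4 MiB reserved */
	struct zone_acct z = account(262144, 4096, 4032, 1024, 0);

	printf("spanned=%lu present=%lu managed=%lu\n",
	       z.spanned, z.present, z.managed);
	return 0;
}
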
@@ -4521,6 +4696,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
+ init_zone_allows_reclaim(nid);
calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
@@ -4637,7 +4813,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
- * Populate N_HIGH_MEMORY for calculating usable_nodes.
+ * Populate N_MEMORY for calculating usable_nodes.
*/
static unsigned long __init early_calculate_totalpages(void)
{
@@ -4650,7 +4826,7 @@ static unsigned long __init early_calculate_totalpages(void)
totalpages += pages;
if (pages)
- node_set_state(nid, N_HIGH_MEMORY);
+ node_set_state(nid, N_MEMORY);
}
return totalpages;
}
@@ -4667,9 +4843,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
unsigned long usable_startpfn;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
- nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
+ nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
- int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+ int usable_nodes = nodes_weight(node_states[N_MEMORY]);
/*
* If movablecore was specified, calculate what size of
@@ -4704,7 +4880,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
/*
@@ -4796,23 +4972,27 @@ restart:
out:
/* restore the node_state */
- node_states[N_HIGH_MEMORY] = saved_node_state;
+ node_states[N_MEMORY] = saved_node_state;
}
-/* Any regular memory on that node ? */
-static void __init check_for_regular_memory(pg_data_t *pgdat)
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
{
-#ifdef CONFIG_HIGHMEM
enum zone_type zone_type;
- for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
+ if (N_MEMORY == N_NORMAL_MEMORY)
+ return;
+
+ for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (zone->present_pages) {
- node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
+ zone_type <= ZONE_NORMAL)
+ node_set_state(nid, N_NORMAL_MEMORY);
break;
}
}
-#endif
}
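
check_for_memory() above marks a node N_HIGH_MEMORY as soon as any zone below ZONE_MOVABLE is populated, and additionally N_NORMAL_MEMORY when that zone is ZONE_NORMAL or lower. A small sketch of that classification; the zone indices and state bits are illustrative constants, not the kernel's definitions:

#include <stdio.h>

/* Illustrative zone indices and node-state bits (not the kernel's values). */
enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE, MAX_ZONES };
#define NODE_HIGH_MEMORY	0x1
#define NODE_NORMAL_MEMORY	0x2

static int classify(const unsigned long present[MAX_ZONES])
{
	int state = 0, zt;

	for (zt = 0; zt <= ZONE_MOVABLE - 1; zt++) {
		if (present[zt]) {
			state |= NODE_HIGH_MEMORY;
			if (zt <= ZONE_NORMAL)
				state |= NODE_NORMAL_MEMORY;
			break;
		}
	}
	return state;
}

int main(void)
{
	unsigned long highmem_only[MAX_ZONES] = { 0, 0, 1024, 0 };
	unsigned long with_normal[MAX_ZONES]  = { 256, 1024, 0, 0 };

	printf("highmem-only node: 0x%x\n", classify(highmem_only));
	printf("node with normal:  0x%x\n", classify(with_normal));
	return 0;
}

A node whose only populated zone is highmem gets N_HIGH_MEMORY alone, which is what lets a ZONE_MOVABLE-only node stay out of N_NORMAL_MEMORY entirely.
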
/**
@@ -4879,7 +5059,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
zone_movable_pfn[i] << PAGE_SHIFT);
}
- /* Print out the early_node_map[] */
+ /* Print out the early node map */
printk("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
@@ -4895,8 +5075,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Any memory on that node */
if (pgdat->node_present_pages)
- node_set_state(nid, N_HIGH_MEMORY);
- check_for_regular_memory(pgdat);
+ node_set_state(nid, N_MEMORY);
+ check_for_memory(pgdat, nid);
}
}
@@ -5124,10 +5304,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
- zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
- zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
- zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
-
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -5525,7 +5701,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
* MIGRATE_MOVABLE block might include unmovable pages. This means you can't
* expect this function to be exact.
*/
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+ bool skip_hwpoisoned_pages)
{
unsigned long pfn, iter, found;
int mt;
@@ -5560,6 +5737,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
continue;
}
+ /*
+ * The HWPoisoned page may not be in the buddy system, and
+ * page_count() is not 0.
+ */
+ if (skip_hwpoisoned_pages && PageHWPoison(page))
+ continue;
+
if (!PageLRU(page))
found++;
/*
@@ -5602,7 +5786,7 @@ bool is_pageblock_removable_nolock(struct page *page)
zone->zone_start_pfn + zone->spanned_pages <= pfn)
return false;
- return !has_unmovable_pages(zone, page, 0);
+ return !has_unmovable_pages(zone, page, 0, true);
}
#ifdef CONFIG_CMA
@@ -5619,47 +5803,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
pageblock_nr_pages));
}
-static struct page *
-__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
- int **resultp)
-{
- gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
- return alloc_page(gfp_mask);
-}
-
/* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
-
+ unsigned long nr_reclaimed;
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
- struct compact_control cc = {
- .nr_migratepages = 0,
- .order = -1,
- .zone = page_zone(pfn_to_page(start)),
- .sync = true,
- };
- INIT_LIST_HEAD(&cc.migratepages);
-
- migrate_prep_local();
+ migrate_prep();
- while (pfn < end || !list_empty(&cc.migratepages)) {
+ while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
- if (list_empty(&cc.migratepages)) {
- cc.nr_migratepages = 0;
- pfn = isolate_migratepages_range(cc.zone, &cc,
- pfn, end);
+ if (list_empty(&cc->migratepages)) {
+ cc->nr_migratepages = 0;
+ pfn = isolate_migratepages_range(cc->zone, cc,
+ pfn, end, true);
if (!pfn) {
ret = -EINTR;
break;
@@ -5670,61 +5835,18 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
break;
}
- ret = migrate_pages(&cc.migratepages,
- __alloc_contig_migrate_alloc,
- 0, false, MIGRATE_SYNC);
- }
-
- putback_lru_pages(&cc.migratepages);
- return ret > 0 ? 0 : ret;
-}
-
-/*
- * Update zone's cma pages counter used for watermark level calculation.
- */
-static inline void __update_cma_watermarks(struct zone *zone, int count)
-{
- unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
- zone->min_cma_pages += count;
- spin_unlock_irqrestore(&zone->lock, flags);
- setup_per_zone_wmarks();
-}
-
-/*
- * Trigger memory pressure bump to reclaim some pages in order to be able to
- * allocate 'count' pages in single page units. Does similar work as
- *__alloc_pages_slowpath() function.
- */
-static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
-{
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zonelist *zonelist = node_zonelist(0, gfp_mask);
- int did_some_progress = 0;
- int order = 1;
-
- /*
- * Increase level of watermarks to force kswapd do his job
- * to stabilise at new watermark level.
- */
- __update_cma_watermarks(zone, count);
-
- /* Obey watermarks as if the page was being allocated */
- while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
- wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
+ nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
+ &cc->migratepages);
+ cc->nr_migratepages -= nr_reclaimed;
- did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
- NULL);
- if (!did_some_progress) {
- /* Exhausted what can be done so it's blamo time */
- out_of_memory(zonelist, gfp_mask, order, NULL, false);
- }
+ ret = migrate_pages(&cc->migratepages,
+ alloc_migrate_target,
+ 0, false, MIGRATE_SYNC,
+ MR_CMA);
}
- /* Restore original watermark levels. */
- __update_cma_watermarks(zone, -count);
-
- return count;
+ putback_movable_pages(&cc->migratepages);
+ return ret > 0 ? 0 : ret;
}
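
The rewritten loop above runs until the whole range has been scanned and the migrate list drained: isolate another batch of pages, drop the clean page-cache pages via reclaim_clean_pages_from_list(), then migrate what remains. A compact user-space model of that control flow; isolate_batch(), reclaim_clean() and migrate_list() are hypothetical stand-ins for the kernel helpers, and the 32-page batch and 25% clean ratio are made up:

#include <stdio.h>

struct range_state {
	unsigned long pfn, end;
	unsigned long on_list;		/* pages currently queued for migration */
};

static unsigned long isolate_batch(struct range_state *st)
{
	unsigned long batch = (st->end - st->pfn < 32) ? st->end - st->pfn : 32;

	st->pfn += batch;		/* advance the scan cursor */
	st->on_list += batch;		/* pages now sit on the migrate list */
	return batch;
}

static unsigned long reclaim_clean(struct range_state *st)
{
	unsigned long reclaimed = st->on_list / 4;	/* pretend 25% were clean */

	st->on_list -= reclaimed;
	return reclaimed;
}

static void migrate_list(struct range_state *st)
{
	st->on_list = 0;		/* assume every queued page migrates */
}

static int alloc_contig_migrate_range(unsigned long start, unsigned long end)
{
	struct range_state st = { .pfn = start, .end = end, .on_list = 0 };

	while (st.pfn < st.end || st.on_list) {
		if (!st.on_list && !isolate_batch(&st))
			return -1;	/* nothing left to isolate: give up */
		reclaim_clean(&st);	/* clean pages are simply dropped */
		migrate_list(&st);	/* the rest are moved elsewhere */
	}
	return 0;
}

int main(void)
{
	printf("result: %d\n", alloc_contig_migrate_range(1000, 1100));
	return 0;
}
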
/**
@@ -5750,10 +5872,18 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype)
{
- struct zone *zone = page_zone(pfn_to_page(start));
unsigned long outer_start, outer_end;
int ret = 0, order;
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(start)),
+ .sync = true,
+ .ignore_skip_hint = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -5779,11 +5909,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype);
+ pfn_max_align_up(end), migratetype,
+ false);
if (ret)
- goto done;
+ return ret;
- ret = __alloc_contig_migrate_range(start, end);
+ ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret)
goto done;
@@ -5818,21 +5949,16 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
/* Make sure the range is really isolated. */
- if (test_pages_isolated(outer_start, end)) {
+ if (test_pages_isolated(outer_start, end, false)) {
pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
outer_start, end);
ret = -EBUSY;
goto done;
}
- /*
- * Reclaim enough pages to make sure that contiguous allocation
- * will not starve the system.
- */
- __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
/* Grab isolated pages from freelists. */
- outer_end = isolate_freepages_range(outer_start, end);
+ outer_end = isolate_freepages_range(&cc, outer_start, end);
if (!outer_end) {
ret = -EBUSY;
goto done;
@@ -5874,6 +6000,7 @@ static int __meminit __zone_pcp_update(void *data)
local_irq_save(flags);
if (pcp->count > 0)
free_pcppages_bulk(zone, pcp->count, pcp);
+ drain_zonestat(zone, pset);
setup_pageset(pset, batch);
local_irq_restore(flags);
}
@@ -5886,20 +6013,26 @@ void __meminit zone_pcp_update(struct zone *zone)
}
#endif
-#ifdef CONFIG_MEMORY_HOTREMOVE
void zone_pcp_reset(struct zone *zone)
{
unsigned long flags;
+ int cpu;
+ struct per_cpu_pageset *pset;
/* avoid races with drain_pages() */
local_irq_save(flags);
if (zone->pageset != &boot_pageset) {
+ for_each_online_cpu(cpu) {
+ pset = per_cpu_ptr(zone->pageset, cpu);
+ drain_zonestat(zone, pset);
+ }
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
local_irq_restore(flags);
}
+#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* All pages in the range must be isolated before calling this.
*/
@@ -5926,6 +6059,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
continue;
}
page = pfn_to_page(pfn);
+ /*
+ * The HWPoisoned page may not be in the buddy system, and
+ * page_count() is not 0.
+ */
+ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
+ pfn++;
+ SetPageReserved(page);
+ continue;
+ }
+
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
@@ -5936,8 +6079,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
list_del(&page->lru);
rmv_page_order(page);
zone->free_area[order].nr_free--;
- __mod_zone_page_state(zone, NR_FREE_PAGES,
- - (1UL << order));
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c6daa6..6d757e3a872a 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
mn->nr_pages, mn->status_change_nid);
break;
case MEM_CANCEL_ONLINE:
+ offline_page_cgroup(mn->start_pfn,
+ mn->nr_pages, mn->status_change_nid);
+ break;
case MEM_GOING_OFFLINE:
break;
case MEM_ONLINE:
@@ -271,7 +274,7 @@ void __init page_cgroup_init(void)
if (mem_cgroup_disabled())
return;
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
start_pfn = node_start_pfn(nid);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 247d1f175739..9d2264ea4606 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype)
zone->nr_pageblock_isolate--;
}
-int set_migratetype_isolate(struct page *page)
+int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page)
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
- if (!has_unmovable_pages(zone, page, arg.pages_found))
+ if (!has_unmovable_pages(zone, page, arg.pages_found,
+ skip_hwpoisoned_pages))
ret = 0;
/*
@@ -76,8 +77,13 @@ int set_migratetype_isolate(struct page *page)
out:
if (!ret) {
+ unsigned long nr_pages;
+ int migratetype = get_pageblock_migratetype(page);
+
set_pageblock_isolate(page);
- move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+
+ __mod_zone_freepage_state(zone, -nr_pages, migratetype);
}
spin_unlock_irqrestore(&zone->lock, flags);
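
set_migratetype_isolate() and unset_migratetype_isolate() above now feed the number of pages that move_freepages_block() actually moved into __mod_zone_freepage_state(), so isolated pages stop counting as free. A tiny counter model of that bookkeeping, assuming __mod_zone_freepage_state() adjusts NR_FREE_PAGES and, for CMA pages, NR_FREE_CMA_PAGES as well (that counter appears earlier in this diff; the helper's body is not shown here):

#include <stdio.h>

enum { MIGRATE_MOVABLE, MIGRATE_CMA, MIGRATE_ISOLATE };

static long nr_free;		/* models NR_FREE_PAGES */
static long nr_free_cma;	/* models NR_FREE_CMA_PAGES */

/* Model of __mod_zone_freepage_state(): CMA pages are tracked twice. */
static void mod_freepage_state(long delta, int migratetype)
{
	nr_free += delta;
	if (migratetype == MIGRATE_CMA)
		nr_free_cma += delta;
}

int main(void)
{
	long moved = 128;	/* what move_freepages_block() reports */

	nr_free = 512;
	nr_free_cma = 256;

	/* Isolating a CMA pageblock removes its free pages from both counts. */
	mod_freepage_state(-moved, MIGRATE_CMA);
	printf("after isolate: free=%ld free_cma=%ld\n", nr_free, nr_free_cma);

	/* Undoing the isolation puts them back under the original type. */
	mod_freepage_state(moved, MIGRATE_CMA);
	printf("after undo:    free=%ld free_cma=%ld\n", nr_free, nr_free_cma);
	return 0;
}
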
@@ -89,12 +95,14 @@ out:
void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
- unsigned long flags;
+ unsigned long flags, nr_pages;
+
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
goto out;
- move_freepages_block(zone, page, migratetype);
+ nr_pages = move_freepages_block(zone, page, migratetype);
+ __mod_zone_freepage_state(zone, nr_pages, migratetype);
restore_pageblock_isolate(page, migratetype);
out:
spin_unlock_irqrestore(&zone->lock, flags);
@@ -127,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* Returns 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype)
+ unsigned migratetype, bool skip_hwpoisoned_pages)
{
unsigned long pfn;
unsigned long undo_pfn;
@@ -140,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && set_migratetype_isolate(page)) {
+ if (page &&
+ set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
undo_pfn = pfn;
goto undo;
}
@@ -183,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* Returns 1 if all pages in the range are isolated.
*/
static int
-__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
+__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
+ bool skip_hwpoisoned_pages)
{
struct page *page;
@@ -193,11 +203,34 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
continue;
}
page = pfn_to_page(pfn);
- if (PageBuddy(page))
+ if (PageBuddy(page)) {
+ /*
+ * If a race between isolation and allocation happens,
+ * some free pages could be left on the MIGRATE_MOVABLE list
+ * even though the pageblock's migration type is
+ * MIGRATE_ISOLATE. Catch them and move the pages onto the
+ * MIGRATE_ISOLATE list.
+ */
+ if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
+ struct page *end_page;
+
+ end_page = page + (1 << page_order(page)) - 1;
+ move_freepages(page_zone(page), page, end_page,
+ MIGRATE_ISOLATE);
+ }
pfn += 1 << page_order(page);
+ }
else if (page_count(page) == 0 &&
- page_private(page) == MIGRATE_ISOLATE)
+ get_freepage_migratetype(page) == MIGRATE_ISOLATE)
pfn += 1;
+ else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
+ /*
+ * The HWPoisoned page may not be in the buddy
+ * system, and page_count() is not 0.
+ */
+ pfn++;
+ continue;
+ }
else
break;
}
@@ -206,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
return 1;
}
-int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
+int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
+ bool skip_hwpoisoned_pages)
{
unsigned long pfn, flags;
struct page *page;
@@ -229,7 +263,19 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
/* Check all pages are free or Marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
+ ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
+ skip_hwpoisoned_pages);
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}
+
+struct page *alloc_migrate_target(struct page *page, unsigned long private,
+ int **resultp)
+{
+ gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ return alloc_page(gfp_mask);
+}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6c118d012bb5..35aa294656cd 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;
- split_huge_page_pmd(walk->mm, pmd);
+ split_huge_page_pmd_mm(walk->mm, addr, pmd);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/percpu.c b/mm/percpu.c
index bb4be7435ce3..8c8e08f3a692 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -631,7 +631,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
if (!chunk)
return;
pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
- kfree(chunk);
+ pcpu_mem_free(chunk, pcpu_chunk_struct_size);
}
/*
@@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
#ifdef CONFIG_SMP
-const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
+const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
[PCPU_FC_PAGE] = "page",
@@ -1380,6 +1380,9 @@ enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
static int __init percpu_alloc_setup(char *str)
{
+ if (!str)
+ return -EINVAL;
+
if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 74c0ddaa6fa0..0c8323fe6c8f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
- * Only sets the access flags (dirty, accessed, and
- * writable). Furthermore, we know it always gets set to a "more
+ * Only sets the access flags (dirty, accessed), as well as write
+ * permission. Furthermore, we know it always gets set to a "more
* permissive" setting, which allows most architectures to optimize
* this. We return whether the PTE actually changed, which in turn
* instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
- flush_tlb_page(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address);
}
return changed;
}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pte_t pte;
pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
- flush_tlb_page(vma, address);
+ if (pte_accessible(pte))
+ flush_tlb_page(vma, address);
return pte;
}
#endif
@@ -120,3 +121,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+{
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ if (!mm->pmd_huge_pte)
+ INIT_LIST_HEAD(&pgtable->lru);
+ else
+ list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+ mm->pmd_huge_pte = pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+{
+ pgtable_t pgtable;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ pgtable = mm->pmd_huge_pte;
+ if (list_empty(&pgtable->lru))
+ mm->pmd_huge_pte = NULL;
+ else {
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+ struct page, lru);
+ list_del(&pgtable->lru);
+ }
+ return pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
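
pgtable_trans_huge_deposit()/withdraw() above stash the preallocated pte page table that backs a huge pmd so a later split can reuse it without allocating, chaining the pages through their lru lists under mm->page_table_lock. A minimal user-space stand-in that models only the balanced deposit/withdraw contract (the kernel comment calls the order FIFO); the fixed-size array is an assumption of the sketch, not the kernel's data structure:

#include <assert.h>
#include <stdio.h>

#define STASH_MAX 16

struct stash {
	int slots[STASH_MAX];
	int head, tail;			/* FIFO indices */
};

/* Deposit one preallocated "page table", identified here by a plain tag. */
static void deposit(struct stash *s, int pgtable)
{
	assert(s->tail - s->head < STASH_MAX);
	s->slots[s->tail++ % STASH_MAX] = pgtable;
}

/* Withdraw a previously deposited table; callers must keep these balanced. */
static int withdraw(struct stash *s)
{
	assert(s->tail != s->head);
	return s->slots[s->head++ % STASH_MAX];
}

int main(void)
{
	struct stash s = { .head = 0, .tail = 0 };

	deposit(&s, 100);			/* collapse: stash the pte table */
	printf("withdrawn: %d\n", withdraw(&s));	/* split: get it back */
	return 0;
}
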
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index 799dcfd7cd8c..000000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * mm/prio_tree.c - priority search tree for mapping->i_mmap
- *
- * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
- *
- * This file is released under the GPL v2.
- *
- * Based on the radix priority search tree proposed by Edward M. McCreight
- * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
- *
- * 02Feb2004 Initial version
- */
-
-#include <linux/mm.h>
-#include <linux/prio_tree.h>
-#include <linux/prefetch.h>
-
-/*
- * See lib/prio_tree.c for details on the general radix priority search tree
- * code.
- */
-
-/*
- * The following #defines are mirrored from lib/prio_tree.c. They're only used
- * for debugging, and should be removed (along with the debugging code using
- * them) when switching also VMAs to the regular prio_tree code.
- */
-
-#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
-#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
-/* avoid overflow */
-#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
-
-/*
- * Radix priority search tree for address_space->i_mmap
- *
- * For each vma that map a unique set of file pages i.e., unique [radix_index,
- * heap_index] value, we have a corresponding priority search tree node. If
- * multiple vmas have identical [radix_index, heap_index] value, then one of
- * them is used as a tree node and others are stored in a vm_set list. The tree
- * node points to the first vma (head) of the list using vm_set.head.
- *
- * prio_tree_root
- * |
- * A vm_set.head
- * / \ /
- * L R -> H-I-J-K-M-N-O-P-Q-S
- * ^ ^ <-- vm_set.list -->
- * tree nodes
- *
- * We need some way to identify whether a vma is a tree node, head of a vm_set
- * list, or just a member of a vm_set list. We cannot use vm_flags to store
- * such information. The reason is, in the above figure, it is possible that
- * vm_flags' of R and H are covered by the different mmap_sems. When R is
- * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
- * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
- * That's why some trick involving shared.vm_set.parent is used for identifying
- * tree nodes and list head nodes.
- *
- * vma radix priority search tree node rules:
- *
- * vma->shared.vm_set.parent != NULL ==> a tree node
- * vma->shared.vm_set.head != NULL ==> list of others mapping same range
- * vma->shared.vm_set.head == NULL ==> no others map the same range
- *
- * vma->shared.vm_set.parent == NULL
- * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
- * vma->shared.vm_set.head == NULL ==> a list node
- */
-
-/*
- * Add a new vma known to map the same set of pages as the old vma:
- * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
- * Note that it just happens to work correctly on i_mmap_nonlinear too.
- */
-void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
-{
- /* Leave these BUG_ONs till prio_tree patch stabilizes */
- BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
- BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
-
- vma->shared.vm_set.head = NULL;
- vma->shared.vm_set.parent = NULL;
-
- if (!old->shared.vm_set.parent)
- list_add(&vma->shared.vm_set.list,
- &old->shared.vm_set.list);
- else if (old->shared.vm_set.head)
- list_add_tail(&vma->shared.vm_set.list,
- &old->shared.vm_set.head->shared.vm_set.list);
- else {
- INIT_LIST_HEAD(&vma->shared.vm_set.list);
- vma->shared.vm_set.head = old;
- old->shared.vm_set.head = vma;
- }
-}
-
-void vma_prio_tree_insert(struct vm_area_struct *vma,
- struct prio_tree_root *root)
-{
- struct prio_tree_node *ptr;
- struct vm_area_struct *old;
-
- vma->shared.vm_set.head = NULL;
-
- ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
- if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
- old = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- vma_prio_tree_add(vma, old);
- }
-}
-
-void vma_prio_tree_remove(struct vm_area_struct *vma,
- struct prio_tree_root *root)
-{
- struct vm_area_struct *node, *head, *new_head;
-
- if (!vma->shared.vm_set.head) {
- if (!vma->shared.vm_set.parent)
- list_del_init(&vma->shared.vm_set.list);
- else
- raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
- } else {
- /* Leave this BUG_ON till prio_tree patch stabilizes */
- BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
- if (vma->shared.vm_set.parent) {
- head = vma->shared.vm_set.head;
- if (!list_empty(&head->shared.vm_set.list)) {
- new_head = list_entry(
- head->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&head->shared.vm_set.list);
- } else
- new_head = NULL;
-
- raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
- &head->shared.prio_tree_node);
- head->shared.vm_set.head = new_head;
- if (new_head)
- new_head->shared.vm_set.head = head;
-
- } else {
- node = vma->shared.vm_set.head;
- if (!list_empty(&vma->shared.vm_set.list)) {
- new_head = list_entry(
- vma->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&vma->shared.vm_set.list);
- node->shared.vm_set.head = new_head;
- new_head->shared.vm_set.head = node;
- } else
- node->shared.vm_set.head = NULL;
- }
- }
-}
-
-/*
- * Helper function to enumerate vmas that map a given file page or a set of
- * contiguous file pages. The function returns vmas that at least map a single
- * page in the given range of contiguous file pages.
- */
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
- struct prio_tree_iter *iter)
-{
- struct prio_tree_node *ptr;
- struct vm_area_struct *next;
-
- if (!vma) {
- /*
- * First call is with NULL vma
- */
- ptr = prio_tree_next(iter);
- if (ptr) {
- next = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- prefetch(next->shared.vm_set.head);
- return next;
- } else
- return NULL;
- }
-
- if (vma->shared.vm_set.parent) {
- if (vma->shared.vm_set.head) {
- next = vma->shared.vm_set.head;
- prefetch(next->shared.vm_set.list.next);
- return next;
- }
- } else {
- next = list_entry(vma->shared.vm_set.list.next,
- struct vm_area_struct, shared.vm_set.list);
- if (!next->shared.vm_set.head) {
- prefetch(next->shared.vm_set.list.next);
- return next;
- }
- }
-
- ptr = prio_tree_next(iter);
- if (ptr) {
- next = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- prefetch(next->shared.vm_set.head);
- return next;
- } else
- return NULL;
-}
diff --git a/mm/readahead.c b/mm/readahead.c
index ea8f8fa21649..7963f2391236 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -579,19 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp,
SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
{
ssize_t ret;
- struct file *file;
+ struct fd f;
ret = -EBADF;
- file = fget(fd);
- if (file) {
- if (file->f_mode & FMODE_READ) {
- struct address_space *mapping = file->f_mapping;
+ f = fdget(fd);
+ if (f.file) {
+ if (f.file->f_mode & FMODE_READ) {
+ struct address_space *mapping = f.file->f_mapping;
pgoff_t start = offset >> PAGE_CACHE_SHIFT;
pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
unsigned long len = end - start + 1;
- ret = do_readahead(mapping, file, start, len);
+ ret = do_readahead(mapping, f.file, start, len);
}
- fput(file);
+ fdput(f);
}
return ret;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a24..2c78f8cadc95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_mutex
- * anon_vma->mutex
+ * anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
* in arch-dependent flush_dcache_mmap_lock,
* within bdi.wb->list_lock in __sync_single_inode)
*
- * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*/
@@ -56,6 +56,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
+#include <linux/backing-dev.h>
#include <asm/tlbflush.h>
@@ -86,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
VM_BUG_ON(atomic_read(&anon_vma->refcount));
/*
- * Synchronize against page_lock_anon_vma() such that
+ * Synchronize against page_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
* Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by
- * mutex_trylock() from page_lock_anon_vma(). This orders:
+ * down_read_trylock() from page_lock_anon_vma_read(). This orders:
*
- * page_lock_anon_vma() VS put_anon_vma()
- * mutex_trylock() atomic_dec_and_test()
+ * page_lock_anon_vma_read() VS put_anon_vma()
+ * down_read_trylock() atomic_dec_and_test()
* LOCK MB
- * atomic_read() mutex_is_locked()
+ * atomic_read() rwsem_is_locked()
*
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
- if (mutex_is_locked(&anon_vma->root->mutex)) {
- anon_vma_lock(anon_vma);
+ if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+ anon_vma_lock_write(anon_vma);
anon_vma_unlock(anon_vma);
}
@@ -127,12 +128,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
-
- /*
- * It's critical to add new vmas to the tail of the anon_vma,
- * see comment in huge_memory.c:__split_huge_page().
- */
- list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
@@ -150,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
* and that may actually touch the spinlock even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
@@ -185,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
allocated = anon_vma;
}
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
@@ -223,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
struct anon_vma *new_root = anon_vma->root;
if (new_root != root) {
if (WARN_ON_ONCE(root))
- mutex_unlock(&root->mutex);
+ up_write(&root->rwsem);
root = new_root;
- mutex_lock(&root->mutex);
+ down_write(&root->rwsem);
}
return root;
}
@@ -233,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
if (root)
- mutex_unlock(&root->mutex);
+ up_write(&root->rwsem);
}
/*
@@ -269,51 +265,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
}
/*
- * Some rmap walk that needs to find all ptes/hugepmds without false
- * negatives (like migrate and split_huge_page) running concurrent
- * with operations that copy or move pagetables (like mremap() and
- * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
- * list to be in a certain order: the dst_vma must be placed after the
- * src_vma in the list. This is always guaranteed by fork() but
- * mremap() needs to call this function to enforce it in case the
- * dst_vma isn't newly allocated and chained with the anon_vma_clone()
- * function but just an extension of a pre-existing vma through
- * vma_merge.
- *
- * NOTE: the same_anon_vma list can still be changed by other
- * processes while mremap runs because mremap doesn't hold the
- * anon_vma mutex to prevent modifications to the list while it
- * runs. All we need to enforce is that the relative order of this
- * process vmas isn't changing (we don't care about other vmas
- * order). Each vma corresponds to an anon_vma_chain structure so
- * there's no risk that other processes calling anon_vma_moveto_tail()
- * and changing the same_anon_vma list under mremap() will screw with
- * the relative order of this process vmas in the list, because we
- * they can't alter the order of any vma that belongs to this
- * process. And there can't be another anon_vma_moveto_tail() running
- * concurrently with mremap() coming from this process because we hold
- * the mmap_sem for the whole mremap(). fork() ordering dependency
- * also shouldn't be affected because fork() only cares that the
- * parent vmas are placed in the list before the child vmas and
- * anon_vma_moveto_tail() won't reorder vmas from either the fork()
- * parent or child.
- */
-void anon_vma_moveto_tail(struct vm_area_struct *dst)
-{
- struct anon_vma_chain *pavc;
- struct anon_vma *root = NULL;
-
- list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma = pavc->anon_vma;
- VM_BUG_ON(pavc->vma != dst);
- root = lock_anon_vma_root(root, anon_vma);
- list_del(&pavc->same_anon_vma);
- list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
- }
- unlock_anon_vma_root(root);
-}
-
-/*
* Attach vma to its own anon_vma, as well as to the anon_vmas that
* the corresponding VMA in the parent process is attached to.
* Returns 0 on success, non-zero on failure.
@@ -355,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
anon_vma_unlock(anon_vma);
@@ -381,13 +332,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
struct anon_vma *anon_vma = avc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
- list_del(&avc->same_anon_vma);
+ anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
/*
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
- if (list_empty(&anon_vma->head))
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root))
continue;
list_del(&avc->same_vma);
@@ -398,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
/*
* Iterate the list once more, it now only contains empty and unlinked
* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
- * needing to acquire the anon_vma->root->mutex.
+ * needing to write-acquire the anon_vma->root->rwsem.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
@@ -414,9 +365,9 @@ static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
- mutex_init(&anon_vma->mutex);
+ init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
- INIT_LIST_HEAD(&anon_vma->head);
+ anon_vma->rb_root = RB_ROOT;
}
void __init anon_vma_init(void)
@@ -491,7 +442,7 @@ out:
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
@@ -506,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = ACCESS_ONCE(anon_vma->root);
- if (mutex_trylock(&root_anon_vma->mutex)) {
+ if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
*/
if (!page_mapped(page)) {
- mutex_unlock(&root_anon_vma->mutex);
+ up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
goto out;
@@ -533,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
/* we pinned the anon_vma, it's safe to sleep */
rcu_read_unlock();
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
if (atomic_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
- * we'll deadlock on the anon_vma_lock() recursion.
+ * we'll deadlock on the anon_vma_lock_write() recursion.
*/
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
__put_anon_vma(anon_vma);
anon_vma = NULL;
}
@@ -553,29 +504,33 @@ out:
return anon_vma;
}
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
/*
* At what user virtual address is page expected in @vma?
- * Returns virtual address or -EFAULT if page's index/offset is not
- * within the range mapped the @vma.
*/
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- unsigned long address;
if (unlikely(is_vm_hugetlb_page(vma)))
pgoff = page->index << huge_page_order(page_hstate(page));
- address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- /* page should be within @vma mapping range */
- return -EFAULT;
- }
+
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
return address;
}
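
__vma_address() above is pure arithmetic: take the page's file offset in pages, subtract the VMA's starting offset, scale to bytes and add vm_start; vma_address() then asserts the result lies inside the VMA, while callers that may legitimately see out-of-range pages (page_address_in_vma(), page_mapped_in_vma()) do the bounds check themselves. A small user-space rendition, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed: 4 KiB pages */

struct vma {
	unsigned long vm_start, vm_end;	/* user virtual address range */
	unsigned long vm_pgoff;		/* file offset of vm_start, in pages */
};

/* Where would the page at file offset 'pgoff' be mapped in this VMA? */
static unsigned long vma_address(const struct vma *vma, unsigned long pgoff)
{
	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}

static int in_range(const struct vma *vma, unsigned long addr)
{
	return addr >= vma->vm_start && addr < vma->vm_end;
}

int main(void)
{
	/* Maps file pages [16, 48) at 0x400000..0x420000. */
	struct vma vma = { 0x400000, 0x420000, 16 };
	unsigned long addr = vma_address(&vma, 20);

	printf("pgoff 20 -> %#lx (in range: %d)\n", addr, in_range(&vma, addr));
	printf("pgoff 64 -> in range: %d\n", in_range(&vma, vma_address(&vma, 64)));
	return 0;
}
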
@@ -585,6 +540,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
+ unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
@@ -600,7 +556,31 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
} else
return -EFAULT;
- return vma_address(page, vma);
+ address = __vma_address(page, vma);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ return -EFAULT;
+ return address;
+}
+
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd = NULL;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ pmd = NULL;
+out:
+ return pmd;
}
/*
@@ -615,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
unsigned long address, spinlock_t **ptlp, int sync)
{
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte;
spinlock_t *ptl;
@@ -627,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
goto check;
}
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
return NULL;
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- return NULL;
if (pmd_trans_huge(*pmd))
return NULL;
@@ -674,8 +645,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
pte_t *pte;
spinlock_t *ptl;
- address = vma_address(page, vma);
- if (address == -EFAULT) /* out of vma range */
+ address = __vma_address(page, vma);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
return 0;
pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
if (!pte) /* the page is not in this mm */
@@ -769,19 +740,19 @@ static int page_referenced_anon(struct page *page,
{
unsigned int mapcount;
struct anon_vma *anon_vma;
+ pgoff_t pgoff;
struct anon_vma_chain *avc;
int referenced = 0;
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return referenced;
mapcount = page_mapcount(page);
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@@ -795,7 +766,7 @@ static int page_referenced_anon(struct page *page,
break;
}
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
return referenced;
}
@@ -820,7 +791,6 @@ static int page_referenced_file(struct page *page,
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int referenced = 0;
/*
@@ -846,10 +816,8 @@ static int page_referenced_file(struct page *page,
*/
mapcount = page_mapcount(page);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@@ -929,7 +897,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
pte_t entry;
flush_cache_page(vma, address, pte_pfn(*pte));
- entry = ptep_clear_flush_notify(vma, address, pte);
+ entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(mm, address, pte, entry);
@@ -937,6 +905,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
pte_unmap_unlock(pte, ptl);
+
+ if (ret)
+ mmu_notifier_invalidate_page(mm, address);
out:
return ret;
}
@@ -945,17 +916,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = 0;
BUG_ON(PageAnon(page));
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret += page_mkclean_one(page, vma, address);
}
}
@@ -971,11 +939,8 @@ int page_mkclean(struct page *page)
if (page_mapped(page)) {
struct address_space *mapping = page_mapping(page);
- if (mapping) {
+ if (mapping)
ret = page_mkclean_file(mapping, page);
- if (page_test_and_clear_dirty(page_to_pfn(page), 1))
- ret = 1;
- }
}
return ret;
@@ -1128,7 +1093,7 @@ void page_add_new_anon_rmap(struct page *page,
else
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__page_set_anon_rmap(page, vma, address, 1);
- if (page_evictable(page, vma))
+ if (!mlocked_vma_newpage(vma, page))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
else
add_page_to_unevictable_list(page);
@@ -1161,6 +1126,7 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
bool anon = PageAnon(page);
bool locked;
unsigned long flags;
@@ -1183,8 +1149,21 @@ void page_remove_rmap(struct page *page)
* this if the page is anon, so about to be freed; but perhaps
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
+ *
+ * And we can skip it on file pages, so long as the filesystem
+ * participates in dirty tracking (note that this is not only an
+ * optimization but also solves problems caused by the dirty flag in
+ * the storage key getting set by a write from inside the kernel); but we
+ * need to catch shm, tmpfs and ramfs pages which have been modified
+ * since creation by a read fault.
+ *
+ * Note that mapping must be decided above, before decrementing
+ * mapcount (which luckily provides a barrier): once page is unmapped,
+ * it could be truncated and page->mapping reset to NULL at any moment.
+ * Note also that we are relying on page_mapping(page) to set mapping
+ * to &swapper_space when PageSwapCache(page).
*/
- if ((!anon || PageSwapCache(page)) &&
+ if (mapping && !mapping_cap_account_dirty(mapping) &&
page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
/*
@@ -1203,7 +1182,10 @@ void page_remove_rmap(struct page *page)
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1213,6 +1195,7 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
+ return;
out:
if (!anon)
mem_cgroup_end_update_page_stat(page, &locked, &flags);
@@ -1256,7 +1239,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -1266,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
update_hiwater_rss(mm);
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ if (!PageHuge(page)) {
+ if (PageAnon(page))
+ dec_mm_counter(mm, MM_ANONPAGES);
+ else
+ dec_mm_counter(mm, MM_FILEPAGES);
+ }
set_pte_at(mm, address, pte,
- swp_entry_to_pte(make_hwpoison_entry(page)));
+ swp_entry_to_pte(make_hwpoison_entry(page)));
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(page) };
@@ -1318,6 +1303,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
out_unmap:
pte_unmap_unlock(pte, ptl);
+ if (ret != SWAP_FAIL)
+ mmu_notifier_invalidate_page(mm, address);
out:
return ret;
@@ -1328,7 +1315,7 @@ out_mlock:
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
+ * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
* if trylock failed, the page remain in evictable lru and later
* vmscan could retry to move the page to unevictable lru if the
* page is actually mlocked.
@@ -1374,14 +1361,14 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
struct vm_area_struct *vma, struct page *check_page)
{
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pte_t pteval;
spinlock_t *ptl;
struct page *page;
unsigned long address;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
unsigned long end;
int ret = SWAP_AGAIN;
int locked_vma = 0;
@@ -1393,17 +1380,13 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
if (end > vma->vm_end)
end = vma->vm_end;
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return ret;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
+ pmd = mm_find_pmd(mm, address);
+ if (!pmd)
return ret;
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- return ret;
+ mmun_start = address;
+ mmun_end = end;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
* If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
@@ -1438,7 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
@@ -1454,6 +1437,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (locked_vma)
up_read(&vma->vm_mm->mmap_sem);
return ret;
@@ -1492,14 +1476,16 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
struct anon_vma *anon_vma;
+ pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return ret;
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address;
@@ -1516,14 +1502,12 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
continue;
address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = try_to_unmap_one(page, vma, address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page))
break;
}
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
return ret;
}
@@ -1547,7 +1531,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
@@ -1555,10 +1538,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
unsigned int mapcount;
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = try_to_unmap_one(page, vma, address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page))
goto out;
@@ -1576,7 +1557,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
goto out;
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
+ shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -1608,7 +1589,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
do {
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
+ shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
while ( cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1612,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
vma->vm_private_data = NULL;
out:
mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,11 +1697,12 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
struct vm_area_struct *, unsigned long, void *), void *arg)
{
struct anon_vma *anon_vma;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
/*
- * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+ * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
* are holding mmap_sem. Users without mmap_sem are required to
* take a reference count to prevent the anon_vma disappearing
@@ -1728,17 +1710,15 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
anon_vma = page_anon_vma(page);
if (!anon_vma)
return ret;
- anon_vma_lock(anon_vma);
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = rmap_one(page, vma, address, arg);
if (ret != SWAP_AGAIN)
break;
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
return ret;
}
@@ -1748,16 +1728,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = SWAP_AGAIN;
if (!mapping)
return ret;
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = rmap_one(page, vma, address, arg);
if (ret != SWAP_AGAIN)
break;
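
Throughout the rmap walkers above, the prio_tree iteration is replaced by interval-tree lookups keyed on the page's file offset, so vma_address() no longer needs an -EFAULT check: the tree only yields VMAs that actually cover the offset. Below is a minimal userspace model of the address calculation those walkers rely on; the struct vma here is a simplified stand-in holding only the two fields the formula uses.

#include <stdio.h>

#define PAGE_SHIFT	12

/* Simplified stand-in holding only the two fields the formula needs. */
struct vma {
	unsigned long vm_start;		/* first virtual address mapped */
	unsigned long vm_pgoff;		/* file offset of vm_start, in pages */
};

/*
 * Virtual address of file page 'pgoff' inside 'vma'.  The interval-tree
 * lookup only hands the walker VMAs whose page range covers pgoff.
 */
static unsigned long vma_address(const struct vma *vma, unsigned long pgoff)
{
	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
	struct vma v = { .vm_start = 0x10000000UL, .vm_pgoff = 16 };

	/* File page 20 sits four pages past vm_start. */
	printf("%#lx\n", vma_address(&v, 20));
	return 0;
}
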
diff --git a/mm/shmem.c b/mm/shmem.c
index d4e184e2a38e..5c90d84c2b02 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt;
/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
-struct shmem_xattr {
- struct list_head list; /* anchored by shmem_inode_info->xattr_list */
- char *name; /* xattr name */
- size_t size;
- char value[0];
-};
-
/*
* shmem_fallocate and shmem_writepage communicate via inode->i_private
* (with i_mutex making sure that it has only one user at a time):
@@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_xattr *xattr, *nxattr;
if (inode->i_mapping->a_ops == &shmem_aops) {
shmem_unacct_size(info->flags, inode->i_size);
@@ -650,11 +642,8 @@ static void shmem_evict_inode(struct inode *inode)
} else
kfree(info->symlink);
- list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
- kfree(xattr->name);
- kfree(xattr);
- }
- BUG_ON(inode->i_blocks);
+ simple_xattrs_free(&info->xattrs);
+ WARN_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
clear_inode(inode);
}
@@ -921,25 +910,29 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct mempolicy mpol, *spol;
struct vm_area_struct pvma;
-
- spol = mpol_cond_copy(&mpol,
- mpol_shared_policy_lookup(&info->policy, index));
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
/* Bias interleave by inode number to distribute better across nodes */
pvma.vm_pgoff = index + info->vfs_inode.i_ino;
pvma.vm_ops = NULL;
- pvma.vm_policy = spol;
- return swapin_readahead(swap, gfp, &pvma, 0);
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+
+ page = swapin_readahead(swap, gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
+ return page;
}
static struct page *shmem_alloc_page(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
@@ -948,10 +941,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
- /*
- * alloc_page_vma() will drop the shared policy reference
- */
- return alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_page_vma(gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
+ return page;
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
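
shmem_swapin() and shmem_alloc_page() now keep the reference returned by mpol_shared_policy_lookup() across the allocation and drop it explicitly with mpol_cond_put(), instead of copying the policy onto the stack. The snippet below is only a generic model of that lookup-takes-a-reference pattern; the policy struct and helpers are hypothetical.

#include <stdio.h>

/* Hypothetical refcounted policy object. */
struct policy {
	int refcount;
	int nodemask;			/* placeholder payload */
};

/* The lookup returns the object with one reference taken for the caller. */
static struct policy *policy_lookup(struct policy *shared)
{
	shared->refcount++;
	return shared;
}

static void policy_put(struct policy *pol)
{
	if (pol)
		pol->refcount--;
}

int main(void)
{
	struct policy shared = { .refcount = 1, .nodemask = 3 };
	struct policy *pol = policy_lookup(&shared);

	/* Reference held across the "allocation" ... */
	printf("allocating with nodemask %d (refcount %d)\n",
	       pol->nodemask, pol->refcount);

	/* ... and dropped explicitly afterwards. */
	policy_put(pol);
	printf("final refcount %d\n", shared.refcount);
	return 0;
}
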
@@ -1156,8 +1151,20 @@ repeat:
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
gfp, swp_to_radix_entry(swap));
- /* We already confirmed swap, and make no allocation */
- VM_BUG_ON(error);
+ /*
+ * We already confirmed swap under page lock, and make
+ * no memory allocation here, so usually no possibility
+ * of error; but free_swap_and_cache() only trylocks a
+ * page, so it is just possible that the entry has been
+ * truncated or holepunched since swap was confirmed.
+ * shmem_undo_range() will have done some of the
+ * unaccounting, now delete_from_swap_cache() will do
+ * the rest (including mem_cgroup_uncharge_swapcache).
+ * Reset swap.val? No, leave it so "failed" goes back to
+ * "repeat": reading a hole and writing should succeed.
+ */
+ if (error)
+ delete_from_swap_cache(page);
}
if (error)
goto failed;
@@ -1350,7 +1357,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
@@ -1377,7 +1383,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
spin_lock_init(&info->lock);
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
- INIT_LIST_HEAD(&info->xattr_list);
+ simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
switch (mode & S_IFMT) {
@@ -1709,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
return error;
}
+/*
+ * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ */
+static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
+ pgoff_t index, pgoff_t end, int whence)
+{
+ struct page *page;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool done = false;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ pvec.nr = 1; /* start small: we may be there already */
+ while (!done) {
+ pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+ pvec.nr, pvec.pages, indices);
+ if (!pvec.nr) {
+ if (whence == SEEK_DATA)
+ index = end;
+ break;
+ }
+ for (i = 0; i < pvec.nr; i++, index++) {
+ if (index < indices[i]) {
+ if (whence == SEEK_HOLE) {
+ done = true;
+ break;
+ }
+ index = indices[i];
+ }
+ page = pvec.pages[i];
+ if (page && !radix_tree_exceptional_entry(page)) {
+ if (!PageUptodate(page))
+ page = NULL;
+ }
+ if (index >= end ||
+ (page && whence == SEEK_DATA) ||
+ (!page && whence == SEEK_HOLE)) {
+ done = true;
+ break;
+ }
+ }
+ shmem_deswap_pagevec(&pvec);
+ pagevec_release(&pvec);
+ pvec.nr = PAGEVEC_SIZE;
+ cond_resched();
+ }
+ return index;
+}
+
+static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ pgoff_t start, end;
+ loff_t new_offset;
+
+ if (whence != SEEK_DATA && whence != SEEK_HOLE)
+ return generic_file_llseek_size(file, offset, whence,
+ MAX_LFS_FILESIZE, i_size_read(inode));
+ mutex_lock(&inode->i_mutex);
+ /* We're holding i_mutex so we can access i_size directly */
+
+ if (offset < 0)
+ offset = -EINVAL;
+ else if (offset >= inode->i_size)
+ offset = -ENXIO;
+ else {
+ start = offset >> PAGE_CACHE_SHIFT;
+ end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ new_offset = shmem_seek_hole_data(mapping, start, end, whence);
+ new_offset <<= PAGE_CACHE_SHIFT;
+ if (new_offset > offset) {
+ if (new_offset < inode->i_size)
+ offset = new_offset;
+ else if (whence == SEEK_DATA)
+ offset = -ENXIO;
+ else
+ offset = inode->i_size;
+ }
+ }
+
+ if (offset >= 0 && offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ }
+ mutex_unlock(&inode->i_mutex);
+ return offset;
+}
+
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -2060,28 +2156,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
*/
/*
- * Allocate new xattr and copy in the value; but leave the name to callers.
- */
-static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
-{
- struct shmem_xattr *new_xattr;
- size_t len;
-
- /* wrap around? */
- len = sizeof(*new_xattr) + size;
- if (len <= sizeof(*new_xattr))
- return NULL;
-
- new_xattr = kmalloc(len, GFP_KERNEL);
- if (!new_xattr)
- return NULL;
-
- new_xattr->size = size;
- memcpy(new_xattr->value, value, size);
- return new_xattr;
-}
-
-/*
* Callback for security_inode_init_security() for acquiring xattrs.
*/
static int shmem_initxattrs(struct inode *inode,
@@ -2090,11 +2164,11 @@ static int shmem_initxattrs(struct inode *inode,
{
struct shmem_inode_info *info = SHMEM_I(inode);
const struct xattr *xattr;
- struct shmem_xattr *new_xattr;
+ struct simple_xattr *new_xattr;
size_t len;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
+ new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
if (!new_xattr)
return -ENOMEM;
@@ -2111,91 +2185,12 @@ static int shmem_initxattrs(struct inode *inode,
memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
xattr->name, len);
- spin_lock(&info->lock);
- list_add(&new_xattr->list, &info->xattr_list);
- spin_unlock(&info->lock);
+ simple_xattr_list_add(&info->xattrs, new_xattr);
}
return 0;
}
-static int shmem_xattr_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
-{
- struct shmem_inode_info *info;
- struct shmem_xattr *xattr;
- int ret = -ENODATA;
-
- info = SHMEM_I(dentry->d_inode);
-
- spin_lock(&info->lock);
- list_for_each_entry(xattr, &info->xattr_list, list) {
- if (strcmp(name, xattr->name))
- continue;
-
- ret = xattr->size;
- if (buffer) {
- if (size < xattr->size)
- ret = -ERANGE;
- else
- memcpy(buffer, xattr->value, xattr->size);
- }
- break;
- }
- spin_unlock(&info->lock);
- return ret;
-}
-
-static int shmem_xattr_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_xattr *xattr;
- struct shmem_xattr *new_xattr = NULL;
- int err = 0;
-
- /* value == NULL means remove */
- if (value) {
- new_xattr = shmem_xattr_alloc(value, size);
- if (!new_xattr)
- return -ENOMEM;
-
- new_xattr->name = kstrdup(name, GFP_KERNEL);
- if (!new_xattr->name) {
- kfree(new_xattr);
- return -ENOMEM;
- }
- }
-
- spin_lock(&info->lock);
- list_for_each_entry(xattr, &info->xattr_list, list) {
- if (!strcmp(name, xattr->name)) {
- if (flags & XATTR_CREATE) {
- xattr = new_xattr;
- err = -EEXIST;
- } else if (new_xattr) {
- list_replace(&xattr->list, &new_xattr->list);
- } else {
- list_del(&xattr->list);
- }
- goto out;
- }
- }
- if (flags & XATTR_REPLACE) {
- xattr = new_xattr;
- err = -ENODATA;
- } else {
- list_add(&new_xattr->list, &info->xattr_list);
- xattr = NULL;
- }
-out:
- spin_unlock(&info->lock);
- if (xattr)
- kfree(xattr->name);
- kfree(xattr);
- return err;
-}
-
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
&generic_acl_access_handler,
@@ -2226,6 +2221,7 @@ static int shmem_xattr_validate(const char *name)
static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
int err;
/*
@@ -2240,12 +2236,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
if (err)
return err;
- return shmem_xattr_get(dentry, name, buffer, size);
+ return simple_xattr_get(&info->xattrs, name, buffer, size);
}
static int shmem_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
int err;
/*
@@ -2260,15 +2257,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
if (err)
return err;
- if (size == 0)
- value = ""; /* empty EA, do not remove */
-
- return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
-
+ return simple_xattr_set(&info->xattrs, name, value, size, flags);
}
static int shmem_removexattr(struct dentry *dentry, const char *name)
{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
int err;
/*
@@ -2283,45 +2277,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
if (err)
return err;
- return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
-}
-
-static bool xattr_is_trusted(const char *name)
-{
- return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+ return simple_xattr_remove(&info->xattrs, name);
}
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- bool trusted = capable(CAP_SYS_ADMIN);
- struct shmem_xattr *xattr;
- struct shmem_inode_info *info;
- size_t used = 0;
-
- info = SHMEM_I(dentry->d_inode);
-
- spin_lock(&info->lock);
- list_for_each_entry(xattr, &info->xattr_list, list) {
- size_t len;
-
- /* skip "trusted." attributes for unprivileged callers */
- if (!trusted && xattr_is_trusted(xattr->name))
- continue;
-
- len = strlen(xattr->name) + 1;
- used += len;
- if (buffer) {
- if (size < used) {
- used = -ERANGE;
- break;
- }
- memcpy(buffer, xattr->name, len);
- buffer += len;
- }
- }
- spin_unlock(&info->lock);
-
- return used;
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ return simple_xattr_list(&info->xattrs, buffer, size);
}
#endif /* CONFIG_TMPFS_XATTR */
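
With the open-coded xattr list gone, tmpfs gets the generic XATTR_CREATE/XATTR_REPLACE semantics from the shared simple_xattr code: EEXIST when creating a name that already exists, ENODATA when replacing one that does not. The userspace illustration below exercises those flags; note that tmpfs at this point only accepts trusted.* (CAP_SYS_ADMIN) and security.* names, so the user.* attribute used here is only expected to work on filesystems that allow it.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *name = "user.demo";		/* illustrative attribute name */

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	/* XATTR_CREATE fails with EEXIST once the attribute exists. */
	if (setxattr(argv[1], name, "one", 3, XATTR_CREATE) < 0)
		printf("create:  %s\n", strerror(errno));

	/* XATTR_REPLACE fails with ENODATA if it does not exist yet. */
	if (setxattr(argv[1], name, "two", 3, XATTR_REPLACE) < 0)
		printf("replace: %s\n", strerror(errno));

	return 0;
}
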
@@ -2366,12 +2328,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
{
struct inode *inode;
struct dentry *dentry = NULL;
- u64 inum = fid->raw[2];
- inum = (inum << 32) | fid->raw[1];
+ u64 inum;
if (fh_len < 3)
return NULL;
+ inum = fid->raw[2];
+ inum = (inum << 32) | fid->raw[1];
+
inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
shmem_match, fid->raw);
if (inode) {
@@ -2712,7 +2676,7 @@ static const struct address_space_operations shmem_aops = {
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
- .llseek = generic_file_llseek,
+ .llseek = shmem_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = shmem_file_aio_read,
@@ -2788,6 +2752,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+ .remap_pages = generic_file_remap_pages,
};
static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2981,7 +2946,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index c6854759bcf1..e7667a3584bc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -87,7 +87,6 @@
*/
#include <linux/slab.h>
-#include "slab.h"
#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/swap.h>
@@ -128,6 +127,8 @@
#include "internal.h"
+#include "slab.h"
+
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -162,23 +163,6 @@
*/
static bool pfmemalloc_active __read_mostly;
-/* Legal flag mask for kmem_cache_create(). */
-#if DEBUG
-# define CREATE_MASK (SLAB_RED_ZONE | \
- SLAB_POISON | SLAB_HWCACHE_ALIGN | \
- SLAB_CACHE_DMA | \
- SLAB_STORE_USER | \
- SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
- SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
-#else
-# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
- SLAB_CACHE_DMA | \
- SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
- SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
- SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK)
-#endif
-
/*
* kmem_bufctl_t:
*
@@ -498,14 +482,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#ifdef CONFIG_TRACING
-size_t slab_buffer_size(struct kmem_cache *cachep)
-{
- return cachep->size;
-}
-EXPORT_SYMBOL(slab_buffer_size);
-#endif
-
/*
* Do not go above this order unless 0 objects fit into the slab or
* overridden on the command line.
@@ -515,13 +491,6 @@ EXPORT_SYMBOL(slab_buffer_size);
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;
-static inline struct kmem_cache *page_get_cache(struct page *page)
-{
- page = compound_head(page);
- BUG_ON(!PageSlab(page));
- return page->slab_cache;
-}
-
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
struct page *page = virt_to_head_page(obj);
@@ -579,15 +548,11 @@ static struct cache_names __initdata cache_names[] = {
#undef CACHE
};
-static struct arraycache_init initarray_cache __initdata =
- { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
-static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
-static struct kmem_cache cache_cache = {
- .nodelists = cache_cache_nodelists,
+static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
@@ -677,6 +642,26 @@ static void init_node_lock_keys(int q)
}
}
+static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q)
+{
+ struct kmem_list3 *l3;
+ l3 = cachep->nodelists[q];
+ if (!l3)
+ return;
+
+ slab_set_lock_classes(cachep, &on_slab_l3_key,
+ &on_slab_alc_key, q);
+}
+
+static inline void on_slab_lock_classes(struct kmem_cache *cachep)
+{
+ int node;
+
+ VM_BUG_ON(OFF_SLAB(cachep));
+ for_each_node(node)
+ on_slab_lock_classes_node(cachep, node);
+}
+
static inline void init_lock_keys(void)
{
int node;
@@ -693,6 +678,14 @@ static inline void init_lock_keys(void)
{
}
+static inline void on_slab_lock_classes(struct kmem_cache *cachep)
+{
+}
+
+static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node)
+{
+}
+
static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
{
}
@@ -810,6 +803,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
+#if DEBUG
#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
static void __slab_error(const char *function, struct kmem_cache *cachep,
@@ -818,7 +812,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
dump_stack();
+ add_taint(TAINT_BAD_PAGE);
}
+#endif
/*
* By default on NUMA we use alien caches to stage the freeing of
@@ -900,7 +896,7 @@ static void __cpuinit start_cpu_timer(int cpu)
*/
if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
+ INIT_DEFERRABLE_WORK(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
__round_jiffies_relative(HZ, cpu));
}
@@ -1418,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu)
free_alien_cache(alien);
if (cachep->flags & SLAB_DEBUG_OBJECTS)
slab_set_debugobj_lock_classes_node(cachep, node);
+ else if (!OFF_SLAB(cachep) &&
+ !(cachep->flags & SLAB_DESTROY_BY_RCU))
+ on_slab_lock_classes_node(cachep, node);
}
init_node_lock_keys(node);
@@ -1589,27 +1588,34 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
}
/*
+ * The memory after the last cpu cache pointer is used for
+ * the nodelists pointer.
+ */
+static void setup_nodelists_pointer(struct kmem_cache *cachep)
+{
+ cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
+}
+
+/*
 * Initialisation. Called after the page allocator has been initialised and
* before smp_init().
*/
void __init kmem_cache_init(void)
{
- size_t left_over;
struct cache_sizes *sizes;
struct cache_names *names;
int i;
- int order;
- int node;
+
+ kmem_cache = &kmem_cache_boot;
+ setup_nodelists_pointer(kmem_cache);
if (num_possible_nodes() == 1)
use_alien_caches = 0;
- for (i = 0; i < NUM_INIT_LISTS; i++) {
+ for (i = 0; i < NUM_INIT_LISTS; i++)
kmem_list3_init(&initkmem_list3[i]);
- if (i < MAX_NUMNODES)
- cache_cache.nodelists[i] = NULL;
- }
- set_up_list3s(&cache_cache, CACHE_CACHE);
+
+ set_up_list3s(kmem_cache, CACHE_CACHE);
/*
* Fragmentation resistance on low memory - only use bigger
@@ -1621,9 +1627,9 @@ void __init kmem_cache_init(void)
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
- * 1) initialize the cache_cache cache: it contains the struct
- * kmem_cache structures of all caches, except cache_cache itself:
- * cache_cache is statically allocated.
+ * 1) initialize the kmem_cache cache: it contains the struct
+ * kmem_cache structures of all caches, except kmem_cache itself:
+ * kmem_cache is statically allocated.
* Initially an __init data area is used for the head array and the
* kmem_list3 structures, it's replaced with a kmalloc allocated
* array at the end of the bootstrap.
@@ -1632,44 +1638,23 @@ void __init kmem_cache_init(void)
* An __init data area is used for the head array.
* 3) Create the remaining kmalloc caches, with minimally sized
* head arrays.
- * 4) Replace the __init data head arrays for cache_cache and the first
+ * 4) Replace the __init data head arrays for kmem_cache and the first
* kmalloc cache with kmalloc allocated arrays.
- * 5) Replace the __init data for kmem_list3 for cache_cache and
+ * 5) Replace the __init data for kmem_list3 for kmem_cache and
* the other cache's with kmalloc allocated memory.
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
- node = numa_mem_id();
-
- /* 1) create the cache_cache */
- INIT_LIST_HEAD(&slab_caches);
- list_add(&cache_cache.list, &slab_caches);
- cache_cache.colour_off = cache_line_size();
- cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
- cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
+ /* 1) create the kmem_cache */
/*
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
- cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
- nr_node_ids * sizeof(struct kmem_list3 *);
- cache_cache.object_size = cache_cache.size;
- cache_cache.size = ALIGN(cache_cache.size,
- cache_line_size());
- cache_cache.reciprocal_buffer_size =
- reciprocal_value(cache_cache.size);
-
- for (order = 0; order < MAX_ORDER; order++) {
- cache_estimate(order, cache_cache.size,
- cache_line_size(), 0, &left_over, &cache_cache.num);
- if (cache_cache.num)
- break;
- }
- BUG_ON(!cache_cache.num);
- cache_cache.gfporder = order;
- cache_cache.colour = left_over / cache_cache.colour_off;
- cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
- sizeof(struct slab), cache_line_size());
+ create_boot_cache(kmem_cache, "kmem_cache",
+ offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ nr_node_ids * sizeof(struct kmem_list3 *),
+ SLAB_HWCACHE_ALIGN);
+ list_add(&kmem_cache->list, &slab_caches);
/* 2+3) create the kmalloc caches */
sizes = malloc_sizes;
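
create_boot_cache() sizes the static kmem_cache as the end of the per-CPU array plus room for nr_node_ids node-list pointers, and setup_nodelists_pointer() then aims nodelists at the memory right after that array. The standalone model below mirrors only that single-allocation layout; the struct, counts and names are hypothetical.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct nodelist;			/* opaque, only pointers are stored */

struct cache {
	int batchcount;
	struct nodelist **nodelists;	/* points into the same allocation */
	void *array[];			/* one slot per possible CPU */
};

int main(void)
{
	int nr_cpu_ids = 4, nr_node_ids = 2;
	/* Equivalent of offsetof(struct cache, array[nr_cpu_ids]) + ... */
	size_t size = offsetof(struct cache, array) +
		      nr_cpu_ids * sizeof(void *) +
		      nr_node_ids * sizeof(struct nodelist *);
	struct cache *c = calloc(1, size);

	if (!c)
		return 1;

	/* The node-list pointer array lives right after the cpu array. */
	c->nodelists = (struct nodelist **)&c->array[nr_cpu_ids];

	printf("allocation: %zu bytes, nodelists at offset %ld\n",
	       size, (long)((char *)c->nodelists - (char *)c));
	free(c);
	return 0;
}
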
@@ -1681,20 +1666,13 @@ void __init kmem_cache_init(void)
* bug.
*/
- sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
- sizes[INDEX_AC].cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL);
+ sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name,
+ sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS);
- if (INDEX_AC != INDEX_L3) {
+ if (INDEX_AC != INDEX_L3)
sizes[INDEX_L3].cs_cachep =
- __kmem_cache_create(names[INDEX_L3].name,
- sizes[INDEX_L3].cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL);
- }
+ create_kmalloc_cache(names[INDEX_L3].name,
+ sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS);
slab_early_init = 0;
@@ -1706,21 +1684,14 @@ void __init kmem_cache_init(void)
* Note for systems short on memory removing the alignment will
* allow tighter packing of the smaller caches.
*/
- if (!sizes->cs_cachep) {
- sizes->cs_cachep = __kmem_cache_create(names->name,
- sizes->cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL);
- }
+ if (!sizes->cs_cachep)
+ sizes->cs_cachep = create_kmalloc_cache(names->name,
+ sizes->cs_size, ARCH_KMALLOC_FLAGS);
+
#ifdef CONFIG_ZONE_DMA
- sizes->cs_dmacachep = __kmem_cache_create(
- names->name_dma,
- sizes->cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
- SLAB_PANIC,
- NULL);
+ sizes->cs_dmacachep = create_kmalloc_cache(
+ names->name_dma, sizes->cs_size,
+ SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS);
#endif
sizes++;
names++;
@@ -1731,15 +1702,14 @@ void __init kmem_cache_init(void)
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
- BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
- memcpy(ptr, cpu_cache_get(&cache_cache),
+ memcpy(ptr, cpu_cache_get(kmem_cache),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
- cache_cache.array[smp_processor_id()] = ptr;
+ kmem_cache->array[smp_processor_id()] = ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
@@ -1760,7 +1730,7 @@ void __init kmem_cache_init(void)
int nid;
for_each_online_node(nid) {
- init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
+ init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
init_list(malloc_sizes[INDEX_AC].cs_cachep,
&initkmem_list3[SIZE_AC + nid], nid);
@@ -1781,9 +1751,6 @@ void __init kmem_cache_init_late(void)
slab_state = UP;
- /* Annotate slab for lockdep -- annotate the malloc caches */
- init_lock_keys();
-
/* 6) resize the head arrays to their final sizes */
mutex_lock(&slab_mutex);
list_for_each_entry(cachep, &slab_caches, list)
@@ -1791,6 +1758,9 @@ void __init kmem_cache_init_late(void)
BUG();
mutex_unlock(&slab_mutex);
+ /* Annotate slab for lockdep -- annotate the malloc caches */
+ init_lock_keys();
+
/* Done! */
slab_state = FULL;
@@ -1925,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
if (page->pfmemalloc)
SetPageSlabPfmemalloc(page + i);
}
+ memcg_bind_pages(cachep, cachep->gfporder);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1961,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
__ClearPageSlab(page);
page++;
}
+
+ memcg_release_pages(cachep, cachep->gfporder);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- free_pages((unsigned long)addr, cachep->gfporder);
+ free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -2209,27 +2182,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
}
}
-static void __kmem_cache_destroy(struct kmem_cache *cachep)
-{
- int i;
- struct kmem_list3 *l3;
-
- for_each_online_cpu(i)
- kfree(cachep->array[i]);
-
- /* NUMA: free the list3 structures */
- for_each_online_node(i) {
- l3 = cachep->nodelists[i];
- if (l3) {
- kfree(l3->shared);
- free_alien_cache(l3->alien);
- kfree(l3);
- }
- }
- kmem_cache_free(&cache_cache, cachep);
-}
-
-
/**
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
@@ -2307,7 +2259,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
if (slab_state == DOWN) {
/*
- * Note: the first kmem_cache_create must create the cache
+ * Note: Creation of first cache (kmem_cache).
+ * The setup_list3s is taken care
+ * of by the caller of __kmem_cache_create
+ */
+ cachep->array[smp_processor_id()] = &initarray_generic.cache;
+ slab_state = PARTIAL;
+ } else if (slab_state == PARTIAL) {
+ /*
+ * Note: the second kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
* further caches will BUG().
*/
@@ -2315,7 +2275,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
/*
* If the cache that's used by kmalloc(sizeof(kmem_list3)) is
- * the first cache, then we need to set up all its list3s,
+ * the second cache, then we need to set up all its list3s,
* otherwise the creation of further caches will BUG().
*/
set_up_list3s(cachep, SIZE_AC);
@@ -2324,6 +2284,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
else
slab_state = PARTIAL_ARRAYCACHE;
} else {
+ /* Remaining boot caches */
cachep->array[smp_processor_id()] =
kmalloc(sizeof(struct arraycache_init), gfp);
@@ -2356,19 +2317,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
/**
* __kmem_cache_create - Create a cache.
- * @name: A string which is used in /proc/slabinfo to identify this cache.
- * @size: The size of objects to be created in this cache.
- * @align: The required alignment for the objects.
+ * @cachep: cache management descriptor
* @flags: SLAB flags
- * @ctor: A constructor for the objects.
*
 * Returns zero on success, or a negative errno on failure.
 * Cannot be called within an interrupt, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
- * @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting unloaded.
- *
* The flags are
*
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -2381,13 +2336,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
-struct kmem_cache *
-__kmem_cache_create (const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+int
+__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
size_t left_over, slab_size, ralign;
- struct kmem_cache *cachep = NULL;
gfp_t gfp;
+ int err;
+ size_t size = cachep->size;
#if DEBUG
#if FORCED_DEBUG
@@ -2406,11 +2361,6 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
if (flags & SLAB_DESTROY_BY_RCU)
BUG_ON(flags & SLAB_POISON);
#endif
- /*
- * Always checks flags, a caller might be expecting debug support which
- * isn't available.
- */
- BUG_ON(flags & ~CREATE_MASK);
/*
* Check that size is in terms of words. This is needed to avoid
@@ -2422,22 +2372,6 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
size &= ~(BYTES_PER_WORD - 1);
}
- /* calculate the final buffer alignment: */
-
- /* 1) arch recommendation: can be overridden for debug */
- if (flags & SLAB_HWCACHE_ALIGN) {
- /*
- * Default alignment: as specified by the arch code. Except if
- * an object is really small, then squeeze multiple objects into
- * one cacheline.
- */
- ralign = cache_line_size();
- while (size <= ralign / 2)
- ralign /= 2;
- } else {
- ralign = BYTES_PER_WORD;
- }
-
/*
* Redzoning and user store require word alignment or possibly larger.
* Note this will be overridden by architecture or caller mandated
@@ -2454,13 +2388,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
size &= ~(REDZONE_ALIGN - 1);
}
- /* 2) arch mandated alignment */
- if (ralign < ARCH_SLAB_MINALIGN) {
- ralign = ARCH_SLAB_MINALIGN;
- }
/* 3) caller mandated alignment */
- if (ralign < align) {
- ralign = align;
+ if (ralign < cachep->align) {
+ ralign = cachep->align;
}
/* disable debug if necessary */
if (ralign > __alignof__(unsigned long long))
@@ -2468,21 +2398,14 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
/*
* 4) Store it.
*/
- align = ralign;
+ cachep->align = ralign;
if (slab_is_available())
gfp = GFP_KERNEL;
else
gfp = GFP_NOWAIT;
- /* Get cache's description obj. */
- cachep = kmem_cache_zalloc(&cache_cache, gfp);
- if (!cachep)
- return NULL;
-
- cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
- cachep->object_size = size;
- cachep->align = align;
+ setup_nodelists_pointer(cachep);
#if DEBUG
/*
@@ -2506,8 +2429,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
+ && cachep->object_size > cache_line_size()
+ && ALIGN(size, cachep->align) < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
size = PAGE_SIZE;
}
#endif
@@ -2527,18 +2451,15 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
*/
flags |= CFLGS_OFF_SLAB;
- size = ALIGN(size, align);
+ size = ALIGN(size, cachep->align);
- left_over = calculate_slab_order(cachep, size, align, flags);
+ left_over = calculate_slab_order(cachep, size, cachep->align, flags);
+
+ if (!cachep->num)
+ return -E2BIG;
- if (!cachep->num) {
- printk(KERN_ERR
- "kmem_cache_create: couldn't create cache %s.\n", name);
- kmem_cache_free(&cache_cache, cachep);
- return NULL;
- }
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
- + sizeof(struct slab), align);
+ + sizeof(struct slab), cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
@@ -2566,8 +2487,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < align)
- cachep->colour_off = align;
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
cachep->slab_size = slab_size;
cachep->flags = flags;
@@ -2588,12 +2509,11 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
- cachep->ctor = ctor;
- cachep->name = name;
- if (setup_cpu_cache(cachep, gfp)) {
- __kmem_cache_destroy(cachep);
- return NULL;
+ err = setup_cpu_cache(cachep, gfp);
+ if (err) {
+ __kmem_cache_shutdown(cachep);
+ return err;
}
if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2604,11 +2524,10 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
slab_set_debugobj_lock_classes(cachep);
- }
+ } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU))
+ on_slab_lock_classes(cachep);
- /* cache setup completed, link it into the list */
- list_add(&cachep->list, &slab_caches);
- return cachep;
+ return 0;
}
#if DEBUG
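
__kmem_cache_create() now receives a preallocated, partially initialised kmem_cache (name, object size, alignment and constructor are filled in by the common code in slab_common.c) and reports failure with an error code instead of allocating and returning the descriptor itself. Below is a rough standalone model of that generic/backend split; every name in it is an illustrative stand-in.

#include <stdio.h>
#include <stdlib.h>

struct cache {
	const char *name;
	size_t object_size;
	size_t align;
	unsigned long flags;
};

/* Allocator-specific part: finishes setup, reports failure by error code. */
static int cache_create_backend(struct cache *c, unsigned long flags)
{
	if (c->object_size == 0)
		return -1;		/* stands in for -E2BIG etc. */
	c->flags = flags;
	return 0;
}

/*
 * Common part: allocates the descriptor, fills the generic fields, then
 * hands it to the backend and unwinds if that fails.
 */
static struct cache *cache_create(const char *name, size_t size,
				  size_t align, unsigned long flags)
{
	struct cache *c = calloc(1, sizeof(*c));

	if (!c)
		return NULL;
	c->name = name;
	c->object_size = size;
	c->align = align;
	if (cache_create_backend(c, flags)) {
		free(c);
		return NULL;
	}
	return c;
}

int main(void)
{
	struct cache *c = cache_create("demo", 64, 8, 0);

	printf("%s\n", c ? c->name : "creation failed");
	free(c);
	return 0;
}
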
@@ -2767,49 +2686,29 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-/**
- * kmem_cache_destroy - delete a cache
- * @cachep: the cache to destroy
- *
- * Remove a &struct kmem_cache object from the slab cache.
- *
- * It is expected this function will be called by a module when it is
- * unloaded. This will remove the cache completely, and avoid a duplicate
- * cache being allocated each time a module is loaded and unloaded, if the
- * module doesn't have persistent in-kernel storage across loads and unloads.
- *
- * The cache must be empty before calling this function.
- *
- * The caller must guarantee that no one will allocate memory from the cache
- * during the kmem_cache_destroy().
- */
-void kmem_cache_destroy(struct kmem_cache *cachep)
+int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
- BUG_ON(!cachep || in_interrupt());
+ int i;
+ struct kmem_list3 *l3;
+ int rc = __cache_shrink(cachep);
- /* Find the cache in the chain of caches. */
- get_online_cpus();
- mutex_lock(&slab_mutex);
- /*
- * the chain is never empty, cache_cache is never destroyed
- */
- list_del(&cachep->list);
- if (__cache_shrink(cachep)) {
- slab_error(cachep, "Can't free all objects");
- list_add(&cachep->list, &slab_caches);
- mutex_unlock(&slab_mutex);
- put_online_cpus();
- return;
- }
+ if (rc)
+ return rc;
- if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
- rcu_barrier();
+ for_each_online_cpu(i)
+ kfree(cachep->array[i]);
- __kmem_cache_destroy(cachep);
- mutex_unlock(&slab_mutex);
- put_online_cpus();
+ /* NUMA: free the list3 structures */
+ for_each_online_node(i) {
+ l3 = cachep->nodelists[i];
+ if (l3) {
+ kfree(l3->shared);
+ free_alien_cache(l3->alien);
+ kfree(l3);
+ }
+ }
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_destroy);
/*
* Get the memory for a slab management obj.
@@ -3098,7 +2997,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
}
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
- void *caller)
+ unsigned long caller)
{
struct page *page;
unsigned int objnr;
@@ -3118,7 +3017,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
if (cachep->flags & SLAB_STORE_USER)
- *dbg_userword(cachep, objp) = caller;
+ *dbg_userword(cachep, objp) = (void *)caller;
objnr = obj_to_index(cachep, slabp, objp);
@@ -3131,7 +3030,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
- store_stackinfo(cachep, objp, (unsigned long)caller);
+ store_stackinfo(cachep, objp, caller);
kernel_map_pages(virt_to_page(objp),
cachep->size / PAGE_SIZE, 0);
} else {
@@ -3285,7 +3184,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
#if DEBUG
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
- gfp_t flags, void *objp, void *caller)
+ gfp_t flags, void *objp, unsigned long caller)
{
if (!objp)
return objp;
@@ -3302,7 +3201,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
- *dbg_userword(cachep, objp) = caller;
+ *dbg_userword(cachep, objp) = (void *)caller;
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
@@ -3343,7 +3242,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
{
- if (cachep == &cache_cache)
+ if (cachep == kmem_cache)
return false;
return should_failslab(cachep->object_size, flags, cachep->flags);
@@ -3576,8 +3475,8 @@ done:
* Fallback to other node is possible if __GFP_THISNODE is not set.
*/
static __always_inline void *
-__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
- void *caller)
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+ unsigned long caller)
{
unsigned long save_flags;
void *ptr;
@@ -3590,6 +3489,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (slab_should_failslab(cachep, flags))
return NULL;
+ cachep = memcg_kmem_get_cache(cachep, flags);
+
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
@@ -3663,7 +3564,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
@@ -3675,6 +3576,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
if (slab_should_failslab(cachep, flags))
return NULL;
+ cachep = memcg_kmem_get_cache(cachep, flags);
+
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
@@ -3799,7 +3702,7 @@ free_done:
* be in this state _before_ it is released. Called with disabled ints.
*/
static inline void __cache_free(struct kmem_cache *cachep, void *objp,
- void *caller)
+ unsigned long caller)
{
struct array_cache *ac = cpu_cache_get(cachep);
@@ -3839,7 +3742,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
+ void *ret = slab_alloc(cachep, flags, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3850,14 +3753,14 @@ EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
void *
-kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
+kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
void *ret;
- ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
+ ret = slab_alloc(cachep, flags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret,
- size, slab_buffer_size(cachep), flags);
+ size, cachep->size, flags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -3866,8 +3769,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- void *ret = __cache_alloc_node(cachep, flags, nodeid,
- __builtin_return_address(0));
+ void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
@@ -3878,17 +3780,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(size_t size,
- struct kmem_cache *cachep,
+void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
gfp_t flags,
- int nodeid)
+ int nodeid,
+ size_t size)
{
void *ret;
- ret = __cache_alloc_node(cachep, flags, nodeid,
- __builtin_return_address(0));
+ ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+
trace_kmalloc_node(_RET_IP_, ret,
- size, slab_buffer_size(cachep),
+ size, cachep->size,
flags, nodeid);
return ret;
}
@@ -3896,34 +3798,33 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
+__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
{
struct kmem_cache *cachep;
cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- return kmem_cache_alloc_node_trace(size, cachep, flags, node);
+ return kmem_cache_alloc_node_trace(cachep, flags, node, size);
}
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
- return __do_kmalloc_node(size, flags, node,
- __builtin_return_address(0));
+ return __do_kmalloc_node(size, flags, node, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc_node);
void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
int node, unsigned long caller)
{
- return __do_kmalloc_node(size, flags, node, (void *)caller);
+ return __do_kmalloc_node(size, flags, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#else
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
- return __do_kmalloc_node(size, flags, node, NULL);
+ return __do_kmalloc_node(size, flags, node, 0);
}
EXPORT_SYMBOL(__kmalloc_node);
#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
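
The allocation and free paths above now record the caller as an unsigned long _RET_IP_ rather than a void * from __builtin_return_address(0); in the kernel, _RET_IP_ is simply that builtin cast to unsigned long. The snippet below shows the same value captured both ways and assumes a GCC or Clang compiler for the builtin.

#include <stdio.h>

/* What the kernel's _RET_IP_ boils down to. */
#define RET_IP	((unsigned long)__builtin_return_address(0))

static void __attribute__((noinline)) record_caller(void)
{
	void *caller_ptr = __builtin_return_address(0);	/* old style */
	unsigned long caller_ip = RET_IP;		/* new style */

	printf("caller as pointer %p, as address %#lx\n",
	       caller_ptr, caller_ip);
}

int main(void)
{
	record_caller();
	return 0;
}
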
@@ -3936,7 +3837,7 @@ EXPORT_SYMBOL(__kmalloc_node);
* @caller: function caller for debug tracking of the caller
*/
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
- void *caller)
+ unsigned long caller)
{
struct kmem_cache *cachep;
void *ret;
@@ -3949,9 +3850,9 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
cachep = __find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = __cache_alloc(cachep, flags, caller);
+ ret = slab_alloc(cachep, flags, caller);
- trace_kmalloc((unsigned long) caller, ret,
+ trace_kmalloc(caller, ret,
size, cachep->size, flags);
return ret;
@@ -3961,20 +3862,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc(size_t size, gfp_t flags)
{
- return __do_kmalloc(size, flags, __builtin_return_address(0));
+ return __do_kmalloc(size, flags, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc);
void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
{
- return __do_kmalloc(size, flags, (void *)caller);
+ return __do_kmalloc(size, flags, caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);
#else
void *__kmalloc(size_t size, gfp_t flags)
{
- return __do_kmalloc(size, flags, NULL);
+ return __do_kmalloc(size, flags, 0);
}
EXPORT_SYMBOL(__kmalloc);
#endif
@@ -3990,12 +3891,15 @@ EXPORT_SYMBOL(__kmalloc);
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
unsigned long flags;
+ cachep = cache_from_obj(cachep, objp);
+ if (!cachep)
+ return;
local_irq_save(flags);
debug_check_no_locks_freed(objp, cachep->object_size);
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, cachep->object_size);
- __cache_free(cachep, objp, __builtin_return_address(0));
+ __cache_free(cachep, objp, _RET_IP_);
local_irq_restore(flags);
trace_kmem_cache_free(_RET_IP_, objp);
@@ -4026,17 +3930,11 @@ void kfree(const void *objp)
debug_check_no_locks_freed(objp, c->object_size);
debug_check_no_obj_freed(objp, c->object_size);
- __cache_free(c, (void *)objp, __builtin_return_address(0));
+ __cache_free(c, (void *)objp, _RET_IP_);
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
-unsigned int kmem_cache_size(struct kmem_cache *cachep)
-{
- return cachep->object_size;
-}
-EXPORT_SYMBOL(kmem_cache_size);
-
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
@@ -4143,7 +4041,7 @@ static void do_ccupdate_local(void *info)
}
/* Always called with the slab_mutex held */
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
struct ccupdate_struct *new;
@@ -4186,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
return alloc_kmemlist(cachep, gfp);
}
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared, gfp_t gfp)
+{
+ int ret;
+ struct kmem_cache *c = NULL;
+ int i = 0;
+
+ ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+
+ if (slab_state < FULL)
+ return ret;
+
+ if ((ret < 0) || !is_root_cache(cachep))
+ return ret;
+
+ VM_BUG_ON(!mutex_is_locked(&slab_mutex));
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg(cachep, i);
+ if (c)
+ /* return value determined by the parent cache only */
+ __do_tune_cpucache(c, limit, batchcount, shared, gfp);
+ }
+
+ return ret;
+}
+
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
- int limit, shared;
+ int limit = 0;
+ int shared = 0;
+ int batchcount = 0;
+
+ if (!is_root_cache(cachep)) {
+ struct kmem_cache *root = memcg_root_cache(cachep);
+ limit = root->limit;
+ shared = root->shared;
+ batchcount = root->batchcount;
+ }
+ if (limit && shared && batchcount)
+ goto skip_setup;
/*
* The head array serves three purposes:
* - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -4233,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (limit > 32)
limit = 32;
#endif
- err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
+ batchcount = (limit + 1) / 2;
+skip_setup:
+ err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
cachep->name, -err);
@@ -4338,54 +4275,8 @@ out:
}
#ifdef CONFIG_SLABINFO
-
-static void print_slabinfo_header(struct seq_file *m)
-{
- /*
- * Output format version, so at least we can change it
- * without _too_ many complaints.
- */
-#if STATS
- seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
-#else
- seq_puts(m, "slabinfo - version: 2.1\n");
-#endif
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
- "<objperslab> <pagesperslab>");
- seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
- seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
-#if STATS
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
- "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
- seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
-#endif
- seq_putc(m, '\n');
-}
-
-static void *s_start(struct seq_file *m, loff_t *pos)
-{
- loff_t n = *pos;
-
- mutex_lock(&slab_mutex);
- if (!n)
- print_slabinfo_header(m);
-
- return seq_list_start(&slab_caches, *pos);
-}
-
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &slab_caches, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
+void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
- mutex_unlock(&slab_mutex);
-}
-
-static int s_show(struct seq_file *m, void *p)
-{
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
@@ -4440,13 +4331,20 @@ static int s_show(struct seq_file *m, void *p)
if (error)
printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
- seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
- name, active_objs, num_objs, cachep->size,
- cachep->num, (1 << cachep->gfporder));
- seq_printf(m, " : tunables %4u %4u %4u",
- cachep->limit, cachep->batchcount, cachep->shared);
- seq_printf(m, " : slabdata %6lu %6lu %6lu",
- active_slabs, num_slabs, shared_avail);
+ sinfo->active_objs = active_objs;
+ sinfo->num_objs = num_objs;
+ sinfo->active_slabs = active_slabs;
+ sinfo->num_slabs = num_slabs;
+ sinfo->shared_avail = shared_avail;
+ sinfo->limit = cachep->limit;
+ sinfo->batchcount = cachep->batchcount;
+ sinfo->shared = cachep->shared;
+ sinfo->objects_per_slab = cachep->num;
+ sinfo->cache_order = cachep->gfporder;
+}
+
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
+{
#if STATS
{ /* list3 stats */
unsigned long high = cachep->high_mark;
@@ -4476,31 +4374,8 @@ static int s_show(struct seq_file *m, void *p)
allochit, allocmiss, freehit, freemiss);
}
#endif
- seq_putc(m, '\n');
- return 0;
}
-/*
- * slabinfo_op - iterator that generates /proc/slabinfo
- *
- * Output layout:
- * cache-name
- * num-active-objs
- * total-objs
- * object size
- * num-active-slabs
- * total-slabs
- * num-pages-per-slab
- * + further values on SMP and with statistics enabled
- */
-
-static const struct seq_operations slabinfo_op = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
#define MAX_SLABINFO_WRITE 128
/**
* slabinfo_write - Tuning for the slab allocator
@@ -4509,7 +4384,7 @@ static const struct seq_operations slabinfo_op = {
* @count: data length
* @ppos: unused
*/
-static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
@@ -4552,19 +4427,6 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
return res;
}
-static int slabinfo_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &slabinfo_op);
-}
-
-static const struct file_operations proc_slabinfo_operations = {
- .open = slabinfo_open,
- .read = seq_read,
- .write = slabinfo_write,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#ifdef CONFIG_DEBUG_SLAB_LEAK
static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4693,6 +4555,16 @@ static int leaks_show(struct seq_file *m, void *p)
return 0;
}
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex);
+}
+
static const struct seq_operations slabstats_op = {
.start = leaks_start,
.next = s_next,
@@ -4727,7 +4599,6 @@ static const struct file_operations proc_slabstats_operations = {
static int __init slab_proc_init(void)
{
- proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
#ifdef CONFIG_DEBUG_SLAB_LEAK
proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
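
The seq_file boilerplate removed here moves into mm/slab_common.c: the allocator only fills a struct slabinfo via get_slabinfo() and appends its per-allocator statistics in slabinfo_show_stats(), while the /proc/slabinfo format itself stays the same. A small userspace reader of that unchanged format, parsing only the first three columns; reading the file may require root depending on configuration.

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/proc/slabinfo", "r");	/* may require root */
	char line[512], name[64];
	unsigned long active_objs, num_objs;

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Skip the "slabinfo - version:" and column-header lines. */
		if (line[0] == '#' || !strncmp(line, "slabinfo", 8))
			continue;
		if (sscanf(line, "%63s %lu %lu",
			   name, &active_objs, &num_objs) == 3)
			printf("%-24s %8lu / %-8lu objects\n",
			       name, active_objs, num_objs);
	}
	fclose(f);
	return 0;
}
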
diff --git a/mm/slab.h b/mm/slab.h
index db7848caaa25..34a98d642196 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -25,9 +25,208 @@ extern enum slab_state slab_state;
/* The slab cache mutex protects the management structures during changes */
extern struct mutex slab_mutex;
+
+/* The list of all slab caches on the system */
extern struct list_head slab_caches;
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *));
+/* The slab cache that manages slab cache information */
+extern struct kmem_cache *kmem_cache;
+
+unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size);
+
+/* Functions provided by the slab allocators */
+extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
+
+extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
+ unsigned long flags);
+extern void create_boot_cache(struct kmem_cache *, const char *name,
+ size_t size, unsigned long flags);
+
+struct mem_cgroup;
+#ifdef CONFIG_SLUB
+struct kmem_cache *
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *));
+#else
+static inline struct kmem_cache *
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *))
+{ return NULL; }
+#endif
+
+
+/* Legal flag mask for kmem_cache_create(), for various configurations */
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
+ SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS )
+
+#if defined(CONFIG_DEBUG_SLAB)
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+#elif defined(CONFIG_SLUB_DEBUG)
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DEBUG_FREE)
+#else
+#define SLAB_DEBUG_FLAGS (0)
+#endif
+
+#if defined(CONFIG_SLAB)
+#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
+ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
+#elif defined(CONFIG_SLUB)
+#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_TEMPORARY | SLAB_NOTRACK)
+#else
+#define SLAB_CACHE_FLAGS (0)
+#endif
+
+#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
+
+int __kmem_cache_shutdown(struct kmem_cache *);
+
+struct seq_file;
+struct file;
+
+struct slabinfo {
+ unsigned long active_objs;
+ unsigned long num_objs;
+ unsigned long active_slabs;
+ unsigned long num_slabs;
+ unsigned long shared_avail;
+ unsigned int limit;
+ unsigned int batchcount;
+ unsigned int shared;
+ unsigned int objects_per_slab;
+ unsigned int cache_order;
+};
+
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo);
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos);
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool is_root_cache(struct kmem_cache *s)
+{
+ return !s->memcg_params || s->memcg_params->is_root_cache;
+}
+
+static inline bool cache_match_memcg(struct kmem_cache *cachep,
+ struct mem_cgroup *memcg)
+{
+ return (is_root_cache(cachep) && !memcg) ||
+ (cachep->memcg_params->memcg == memcg);
+}
+
+static inline void memcg_bind_pages(struct kmem_cache *s, int order)
+{
+ if (!is_root_cache(s))
+ atomic_add(1 << order, &s->memcg_params->nr_pages);
+}
+
+static inline void memcg_release_pages(struct kmem_cache *s, int order)
+{
+ if (is_root_cache(s))
+ return;
+
+ if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
+ mem_cgroup_destroy_cache(s);
+}
+
+static inline bool slab_equal_or_root(struct kmem_cache *s,
+ struct kmem_cache *p)
+{
+ return (p == s) ||
+ (s->memcg_params && (p == s->memcg_params->root_cache));
+}
+
+/*
+ * We use suffixes to the name in memcg because we can't have caches
+ * created in the system with the same name. But when we print them
+ * locally, it is better to refer to them by the base name.
+ */
+static inline const char *cache_name(struct kmem_cache *s)
+{
+ if (!is_root_cache(s))
+ return s->memcg_params->root_cache->name;
+ return s->name;
+}
+
+static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+{
+ return s->memcg_params->memcg_caches[idx];
+}
+
+static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+{
+ if (is_root_cache(s))
+ return s;
+ return s->memcg_params->root_cache;
+}
+#else
+static inline bool is_root_cache(struct kmem_cache *s)
+{
+ return true;
+}
+
+static inline bool cache_match_memcg(struct kmem_cache *cachep,
+ struct mem_cgroup *memcg)
+{
+ return true;
+}
+
+static inline void memcg_bind_pages(struct kmem_cache *s, int order)
+{
+}
+
+static inline void memcg_release_pages(struct kmem_cache *s, int order)
+{
+}
+
+static inline bool slab_equal_or_root(struct kmem_cache *s,
+ struct kmem_cache *p)
+{
+ return true;
+}
+
+static inline const char *cache_name(struct kmem_cache *s)
+{
+ return s->name;
+}
+
+static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+{
+ return NULL;
+}
+
+static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+{
+ return s;
+}
+#endif
+
+static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
+{
+ struct kmem_cache *cachep;
+ struct page *page;
+
+	/*
+	 * When kmemcg is not being used, both assignments should return the
+	 * same value, but we don't want to pay the assignment price in that
+	 * case. If kmemcg is not compiled in, the compiler should be smart
+	 * enough to elide the assignment entirely; in that case,
+	 * slab_equal_or_root will also be a constant.
+	 */
+ if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
+ return s;
+
+ page = virt_to_head_page(x);
+ cachep = page->slab_cache;
+ if (slab_equal_or_root(cachep, s))
+ return cachep;
+ pr_err("%s: Wrong slab cache. %s but object is from %s\n",
+ __FUNCTION__, cachep->name, s->name);
+ WARN_ON_ONCE(1);
+ return s;
+}
#endif
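
A rough sketch (not part of the patch) of how the memcg helpers declared above fit together; it assumes CONFIG_MEMCG_KMEM and the for_each_memcg_cache_index() iterator provided by the memcg side of this series:

/* Sketch: visit every per-memcg child of a root cache. */
static void walk_memcg_children(struct kmem_cache *root)
{
	int i;

	if (!is_root_cache(root))
		return;

	for_each_memcg_cache_index(i) {
		struct kmem_cache *c = cache_from_memcg(root, i);

		if (!c)
			continue;
		/* cache_name() reports the base (root) name for children */
		pr_info("%s: child %s, root %s\n",
			cache_name(c), c->name, memcg_root_cache(c)->name);
	}
}
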
diff --git a/mm/slab_common.c b/mm/slab_common.c
index aa3ca5bb01b5..3f3cd97d3fdf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -13,15 +13,129 @@
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
+#include <linux/memcontrol.h>
#include "slab.h"
enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
+struct kmem_cache *kmem_cache;
+
+#ifdef CONFIG_DEBUG_VM
+static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
+ size_t size)
+{
+ struct kmem_cache *s = NULL;
+
+ if (!name || in_interrupt() || size < sizeof(void *) ||
+ size > KMALLOC_MAX_SIZE) {
+ pr_err("kmem_cache_create(%s) integrity check failed\n", name);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(s, &slab_caches, list) {
+ char tmp;
+ int res;
+
+ /*
+ * This happens when the module gets unloaded and doesn't
+ * destroy its slab cache and no-one else reuses the vmalloc
+ * area of the module. Print a warning.
+ */
+ res = probe_kernel_address(s->name, tmp);
+ if (res) {
+ pr_err("Slab cache with size %d has lost its name\n",
+ s->object_size);
+ continue;
+ }
+
+ /*
+ * For simplicity, we won't check this in the list of memcg
+ * caches. We have control over memcg naming, and if there
+ * aren't duplicates in the global list, there won't be any
+ * duplicates in the memcg lists as well.
+ */
+ if (!memcg && !strcmp(s->name, name)) {
+ pr_err("%s (%s): Cache name already exists.\n",
+ __func__, name);
+ dump_stack();
+ s = NULL;
+ return -EINVAL;
+ }
+ }
+
+ WARN_ON(strchr(name, ' ')); /* It confuses parsers */
+ return 0;
+}
+#else
+static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
+ const char *name, size_t size)
+{
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_update_all_caches(int num_memcgs)
+{
+ struct kmem_cache *s;
+ int ret = 0;
+ mutex_lock(&slab_mutex);
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!is_root_cache(s))
+ continue;
+
+ ret = memcg_update_cache_size(s, num_memcgs);
+ /*
+ * See comment in memcontrol.c, memcg_update_cache_size:
+ * Instead of freeing the memory, we'll just leave the caches
+ * up to this point in an updated state.
+ */
+ if (ret)
+ goto out;
+ }
+
+ memcg_update_array_size(num_memcgs);
+out:
+ mutex_unlock(&slab_mutex);
+ return ret;
+}
+#endif
+
+/*
+ * Figure out what the alignment of the objects will be given a set of
+ * flags, a user specified alignment and the size of the objects.
+ */
+unsigned long calculate_alignment(unsigned long flags,
+ unsigned long align, unsigned long size)
+{
+ /*
+ * If the user wants hardware cache aligned objects then follow that
+ * suggestion if the object is sufficiently large.
+ *
+	 * The hardware cache alignment cannot override the specified
+	 * alignment, though. If that is greater, use it.
+ */
+ if (flags & SLAB_HWCACHE_ALIGN) {
+ unsigned long ralign = cache_line_size();
+ while (size <= ralign / 2)
+ ralign /= 2;
+ align = max(align, ralign);
+ }
+
+ if (align < ARCH_SLAB_MINALIGN)
+ align = ARCH_SLAB_MINALIGN;
+
+ return ALIGN(align, sizeof(void *));
+}
+
/*
* kmem_cache_create - Create a cache.
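
A quick worked example for the calculate_alignment() helper moved above (illustrative numbers; ARCH_SLAB_MINALIGN assumed to be 8):

/*
 * flags = SLAB_HWCACHE_ALIGN, align = 8, size = 24, cache_line_size() = 64:
 *   ralign: 64 -> 32 (since 24 <= 32), loop stops (24 > 16)
 *   align  = max(8, 32) = 32, which is >= ARCH_SLAB_MINALIGN
 *   result = ALIGN(32, sizeof(void *)) = 32
 */
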
@@ -48,73 +162,305 @@ DEFINE_MUTEX(slab_mutex);
* as davem.
*/
-struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+struct kmem_cache *
+kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *),
+ struct kmem_cache *parent_cache)
{
struct kmem_cache *s = NULL;
-
-#ifdef CONFIG_DEBUG_VM
- if (!name || in_interrupt() || size < sizeof(void *) ||
- size > KMALLOC_MAX_SIZE) {
- printk(KERN_ERR "kmem_cache_create(%s) integrity check"
- " failed\n", name);
- goto out;
- }
-#endif
+ int err = 0;
get_online_cpus();
mutex_lock(&slab_mutex);
-#ifdef CONFIG_DEBUG_VM
- list_for_each_entry(s, &slab_caches, list) {
- char tmp;
- int res;
+	if (kmem_cache_sanity_check(memcg, name, size))
+ goto out_locked;
- /*
- * This happens when the module gets unloaded and doesn't
- * destroy its slab cache and no-one else reuses the vmalloc
- * area of the module. Print a warning.
- */
- res = probe_kernel_address(s->name, tmp);
- if (res) {
- printk(KERN_ERR
- "Slab cache with size %d has lost its name\n",
- s->object_size);
- continue;
- }
+ /*
+ * Some allocators will constraint the set of valid flags to a subset
+ * of all flags. We expect them to define CACHE_CREATE_MASK in this
+ * case, and we'll just provide them with a sanitized version of the
+ * passed flags.
+ */
+ flags &= CACHE_CREATE_MASK;
- if (!strcmp(s->name, name)) {
- printk(KERN_ERR "kmem_cache_create(%s): Cache name"
- " already exists.\n",
- name);
- dump_stack();
- s = NULL;
- goto oops;
+ s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
+ if (s)
+ goto out_locked;
+
+ s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+ if (s) {
+ s->object_size = s->size = size;
+ s->align = calculate_alignment(flags, align, size);
+ s->ctor = ctor;
+
+ if (memcg_register_cache(memcg, s, parent_cache)) {
+ kmem_cache_free(kmem_cache, s);
+ err = -ENOMEM;
+ goto out_locked;
}
- }
- WARN_ON(strchr(name, ' ')); /* It confuses parsers */
-#endif
+ s->name = kstrdup(name, GFP_KERNEL);
+ if (!s->name) {
+ kmem_cache_free(kmem_cache, s);
+ err = -ENOMEM;
+ goto out_locked;
+ }
- s = __kmem_cache_create(name, size, align, flags, ctor);
+ err = __kmem_cache_create(s, flags);
+ if (!err) {
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
+ memcg_cache_list_add(memcg, s);
+ } else {
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ }
+ } else
+ err = -ENOMEM;
-#ifdef CONFIG_DEBUG_VM
-oops:
-#endif
+out_locked:
mutex_unlock(&slab_mutex);
put_online_cpus();
-#ifdef CONFIG_DEBUG_VM
-out:
-#endif
- if (!s && (flags & SLAB_PANIC))
- panic("kmem_cache_create: Failed to create slab '%s'\n", name);
+ if (err) {
+
+ if (flags & SLAB_PANIC)
+ panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
+ name, err);
+ else {
+			printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d\n",
+ name, err);
+ dump_stack();
+ }
+
+ return NULL;
+ }
return s;
}
+
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
+{
+ return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
+}
EXPORT_SYMBOL(kmem_cache_create);
+void kmem_cache_destroy(struct kmem_cache *s)
+{
+ /* Destroy all the children caches if we aren't a memcg cache */
+ kmem_cache_destroy_memcg_children(s);
+
+ get_online_cpus();
+ mutex_lock(&slab_mutex);
+ s->refcount--;
+ if (!s->refcount) {
+ list_del(&s->list);
+
+ if (!__kmem_cache_shutdown(s)) {
+ mutex_unlock(&slab_mutex);
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
+
+ memcg_release_cache(s);
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ } else {
+ list_add(&s->list, &slab_caches);
+ mutex_unlock(&slab_mutex);
+ printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
+ s->name);
+ dump_stack();
+ }
+ } else {
+ mutex_unlock(&slab_mutex);
+ }
+ put_online_cpus();
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
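
The create/destroy interface seen by callers is unchanged by the move into common code; a minimal usage sketch (hypothetical cache name and struct, shown only for illustration):

struct foo {
	int a;
	struct list_head list;
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}

static void __exit foo_exit(void)
{
	/* Warns (and keeps the cache on the list) if objects remain. */
	kmem_cache_destroy(foo_cachep);
}
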
+
int slab_is_available(void)
{
return slab_state >= UP;
}
+
+#ifndef CONFIG_SLOB
+/* Create a cache during boot when no slab services are available yet */
+void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
+ unsigned long flags)
+{
+ int err;
+
+ s->name = name;
+ s->size = s->object_size = size;
+ s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+ err = __kmem_cache_create(s, flags);
+
+ if (err)
+ panic("Creation of kmalloc slab %s size=%zd failed. Reason %d\n",
+ name, size, err);
+
+ s->refcount = -1; /* Exempt from merging for now */
+}
+
+struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
+ unsigned long flags)
+{
+ struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+
+ if (!s)
+ panic("Out of memory when creating slab %s\n", name);
+
+ create_boot_cache(s, name, size, flags);
+ list_add(&s->list, &slab_caches);
+ s->refcount = 1;
+ return s;
+}
+
+#endif /* !CONFIG_SLOB */
+
+
+#ifdef CONFIG_SLABINFO
+void print_slabinfo_header(struct seq_file *m)
+{
+ /*
+ * Output format version, so at least we can change it
+ * without _too_ many complaints.
+ */
+#ifdef CONFIG_DEBUG_SLAB
+ seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+#else
+ seq_puts(m, "slabinfo - version: 2.1\n");
+#endif
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
+ "<objperslab> <pagesperslab>");
+ seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
+ seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
+#ifdef CONFIG_DEBUG_SLAB
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
+ "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
+#endif
+ seq_putc(m, '\n');
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+ loff_t n = *pos;
+
+ mutex_lock(&slab_mutex);
+ if (!n)
+ print_slabinfo_header(m);
+
+ return seq_list_start(&slab_caches, *pos);
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ return seq_list_next(p, &slab_caches, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+{
+ mutex_unlock(&slab_mutex);
+}
+
+static void
+memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
+{
+ struct kmem_cache *c;
+ struct slabinfo sinfo;
+ int i;
+
+ if (!is_root_cache(s))
+ return;
+
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg(s, i);
+ if (!c)
+ continue;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ get_slabinfo(c, &sinfo);
+
+ info->active_slabs += sinfo.active_slabs;
+ info->num_slabs += sinfo.num_slabs;
+ info->shared_avail += sinfo.shared_avail;
+ info->active_objs += sinfo.active_objs;
+ info->num_objs += sinfo.num_objs;
+ }
+}
+
+int cache_show(struct kmem_cache *s, struct seq_file *m)
+{
+ struct slabinfo sinfo;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ get_slabinfo(s, &sinfo);
+
+ memcg_accumulate_slabinfo(s, &sinfo);
+
+ seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+ cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
+ sinfo.objects_per_slab, (1 << sinfo.cache_order));
+
+ seq_printf(m, " : tunables %4u %4u %4u",
+ sinfo.limit, sinfo.batchcount, sinfo.shared);
+ seq_printf(m, " : slabdata %6lu %6lu %6lu",
+ sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
+ slabinfo_show_stats(m, s);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static int s_show(struct seq_file *m, void *p)
+{
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+ if (!is_root_cache(s))
+ return 0;
+ return cache_show(s, m);
+}
+
+/*
+ * slabinfo_op - iterator that generates /proc/slabinfo
+ *
+ * Output layout:
+ * cache-name
+ * num-active-objs
+ * total-objs
+ * object size
+ * num-active-slabs
+ * total-slabs
+ * num-pages-per-slab
+ * + further values on SMP and with statistics enabled
+ */
+static const struct seq_operations slabinfo_op = {
+ .start = s_start,
+ .next = s_next,
+ .stop = s_stop,
+ .show = s_show,
+};
+
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &slabinfo_op);
+}
+
+static const struct file_operations proc_slabinfo_operations = {
+ .open = slabinfo_open,
+ .read = seq_read,
+ .write = slabinfo_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static int __init slab_proc_init(void)
+{
+ proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
+ return 0;
+}
+module_init(slab_proc_init);
+#endif /* CONFIG_SLABINFO */
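
With the common iterator above, each root cache contributes one /proc/slabinfo line in the version 2.1 layout printed by cache_show(); an illustrative line (values invented for the example, SLUB leaves the tunables at zero) looks like:

dentry            97934 102816    192   21    1 : tunables    0    0    0 : slabdata   4896   4896      0
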
diff --git a/mm/slob.c b/mm/slob.c
index 45d4ca79933a..a99fdf7a0907 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -28,9 +28,8 @@
* from kmalloc are prepended with a 4-byte header with the kmalloc size.
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
* alloc_pages() directly, allocating compound pages so the page order
- * does not have to be separately tracked, and also stores the exact
- * allocation size in page->private so that it can be used to accurately
- * provide ksize(). These objects are detected in kfree() because slob_page()
+ * does not have to be separately tracked.
+ * These objects are detected in kfree() because PageSlab()
* is false for them.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
@@ -59,7 +58,6 @@
#include <linux/kernel.h>
#include <linux/slab.h>
-#include "slab.h"
#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
@@ -74,6 +72,7 @@
#include <linux/atomic.h>
+#include "slab.h"
/*
* slob_block has a field 'units', which indicates size of block if +ve,
* or offset of next block if -ve (in SLOB_UNITs).
@@ -124,7 +123,6 @@ static inline void clear_slob_page_free(struct page *sp)
#define SLOB_UNIT sizeof(slob_t)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
-#define SLOB_ALIGN L1_CACHE_BYTES
/*
* struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -194,7 +192,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
void *page;
#ifdef CONFIG_NUMA
- if (node != -1)
+ if (node != NUMA_NO_NODE)
page = alloc_pages_exact_node(node, gfp, order);
else
#endif
@@ -290,7 +288,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
* If there's a node specification, search for a partial
* page with a matching node id in the freelist.
*/
- if (node != -1 && page_to_nid(sp) != node)
+ if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
continue;
#endif
/* Enough room on this page? */
@@ -425,10 +423,11 @@ out:
* End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
*/
-void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
unsigned int *m;
- int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
gfp &= gfp_allowed_mask;
@@ -446,7 +445,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
*m = size;
ret = (void *)m + align;
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(caller, ret,
size, size + align, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -454,21 +453,36 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
if (likely(order))
gfp |= __GFP_COMP;
ret = slob_new_pages(gfp, order, node);
- if (ret) {
- struct page *page;
- page = virt_to_page(ret);
- page->private = size;
- }
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(caller, ret,
size, PAGE_SIZE << order, gfp, node);
}
kmemleak_alloc(ret, size, 1, gfp);
return ret;
}
+
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+{
+ return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+}
EXPORT_SYMBOL(__kmalloc_node);
+#ifdef CONFIG_TRACING
+void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
+{
+ return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
+}
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, gfp, node, caller);
+}
+#endif
+#endif
+
void kfree(const void *block)
{
struct page *sp;
@@ -481,11 +495,11 @@ void kfree(const void *block)
sp = virt_to_page(block);
if (PageSlab(sp)) {
- int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
} else
- put_page(sp);
+ __free_pages(sp, compound_order(sp));
}
EXPORT_SYMBOL(kfree);
@@ -493,59 +507,32 @@ EXPORT_SYMBOL(kfree);
size_t ksize(const void *block)
{
struct page *sp;
+ int align;
+ unsigned int *m;
BUG_ON(!block);
if (unlikely(block == ZERO_SIZE_PTR))
return 0;
sp = virt_to_page(block);
- if (PageSlab(sp)) {
- int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
- unsigned int *m = (unsigned int *)(block - align);
- return SLOB_UNITS(*m) * SLOB_UNIT;
- } else
- return sp->private;
+ if (unlikely(!PageSlab(sp)))
+ return PAGE_SIZE << compound_order(sp);
+
+ align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ m = (unsigned int *)(block - align);
+ return SLOB_UNITS(*m) * SLOB_UNIT;
}
EXPORT_SYMBOL(ksize);
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
{
- struct kmem_cache *c;
-
- c = slob_alloc(sizeof(struct kmem_cache),
- GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1);
-
- if (c) {
- c->name = name;
- c->size = size;
- if (flags & SLAB_DESTROY_BY_RCU) {
- /* leave room for rcu footer at the end of object */
- c->size += sizeof(struct slob_rcu);
- }
- c->flags = flags;
- c->ctor = ctor;
- /* ignore alignment unless it's forced */
- c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
- if (c->align < ARCH_SLAB_MINALIGN)
- c->align = ARCH_SLAB_MINALIGN;
- if (c->align < align)
- c->align = align;
-
- kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
- c->refcount = 1;
+ if (flags & SLAB_DESTROY_BY_RCU) {
+ /* leave room for rcu footer at the end of object */
+ c->size += sizeof(struct slob_rcu);
}
- return c;
-}
-
-void kmem_cache_destroy(struct kmem_cache *c)
-{
- kmemleak_free(c);
- if (c->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
- slob_free(c, sizeof(struct kmem_cache));
+ c->flags = flags;
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_destroy);
void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
@@ -557,12 +544,12 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
+ trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
PAGE_SIZE << get_order(c->size),
flags, node);
}
@@ -607,11 +594,11 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
}
EXPORT_SYMBOL(kmem_cache_free);
-unsigned int kmem_cache_size(struct kmem_cache *c)
+int __kmem_cache_shutdown(struct kmem_cache *c)
{
- return c->size;
+ /* No way to check for remaining objects */
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_size);
int kmem_cache_shrink(struct kmem_cache *d)
{
@@ -619,8 +606,16 @@ int kmem_cache_shrink(struct kmem_cache *d)
}
EXPORT_SYMBOL(kmem_cache_shrink);
+struct kmem_cache kmem_cache_boot = {
+ .name = "kmem_cache",
+ .size = sizeof(struct kmem_cache),
+ .flags = SLAB_PANIC,
+ .align = ARCH_KMALLOC_MINALIGN,
+};
+
void __init kmem_cache_init(void)
{
+ kmem_cache = &kmem_cache_boot;
slab_state = UP;
}
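
For reference, the SLOB kmalloc paths above keep using the same in-band size header; a sketch of the layout as handled by __do_kmalloc_node(), kfree() and ksize():

/*
 * Small (< PAGE_SIZE) kmalloc allocation in SLOB:
 *
 *   m                      m + align
 *   |                      |
 *   [ size | padding ... ] [ object ............ ]
 *
 * align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 * kfree() reads the size back from (block - align), ksize() reports
 * SLOB_UNITS(size) * SLOB_UNIT.  Requests of PAGE_SIZE and larger go to
 * the page allocator as compound pages, and ksize() then uses
 * compound_order().
 */
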
diff --git a/mm/slub.c b/mm/slub.c
index 2fdd96f9e998..ba2ca53f6c3a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -31,6 +31,7 @@
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
+#include <linux/memcontrol.h>
#include <trace/events/kmem.h>
@@ -112,9 +113,6 @@
* the fast path and disables lockless freelists.
*/
-#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DEBUG_FREE)
-
static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
@@ -179,8 +177,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#define __OBJECT_POISON 0x80000000UL /* Poison object */
#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
-static int kmem_size = sizeof(struct kmem_cache);
-
#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif
@@ -205,17 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
-
+static void memcg_propagate_slab_attrs(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
-static inline void sysfs_slab_remove(struct kmem_cache *s)
-{
- kfree(s->name);
- kfree(s);
-}
+static inline void sysfs_slab_remove(struct kmem_cache *s) { }
+static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -568,6 +561,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
printk(KERN_ERR "----------------------------------------"
"-------------------------------------\n\n");
+
+ add_taint(TAINT_BAD_PAGE);
}
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
@@ -624,7 +619,7 @@ static void object_err(struct kmem_cache *s, struct page *page,
print_trailer(s, page, object);
}
-static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
+static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...)
{
va_list args;
char buf[100];
@@ -1069,13 +1064,13 @@ bad:
return 0;
}
-static noinline int free_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, unsigned long addr)
+static noinline struct kmem_cache_node *free_debug_processing(
+ struct kmem_cache *s, struct page *page, void *object,
+ unsigned long addr, unsigned long *flags)
{
- unsigned long flags;
- int rc = 0;
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- local_irq_save(flags);
+ spin_lock_irqsave(&n->list_lock, *flags);
slab_lock(page);
if (!check_slab(s, page))
@@ -1094,11 +1089,11 @@ static noinline int free_debug_processing(struct kmem_cache *s,
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
goto out;
- if (unlikely(s != page->slab)) {
+ if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
slab_err(s, page, "Attempt to free object(0x%p) "
"outside of slab", object);
- } else if (!page->slab) {
+ } else if (!page->slab_cache) {
printk(KERN_ERR
"SLUB <none>: no slab for object 0x%p.\n",
object);
@@ -1113,15 +1108,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
init_object(s, object, SLUB_RED_INACTIVE);
- rc = 1;
out:
slab_unlock(page);
- local_irq_restore(flags);
- return rc;
+ /*
+ * Keep node_lock to preserve integrity
+ * until the object is actually freed
+ */
+ return n;
fail:
+ slab_unlock(page);
+ spin_unlock_irqrestore(&n->list_lock, *flags);
slab_fix(s, "Object at 0x%p not freed", object);
- goto out;
+ return NULL;
}
static int __init setup_slub_debug(char *str)
@@ -1214,8 +1213,9 @@ static inline void setup_object_debug(struct kmem_cache *s,
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
-static inline int free_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, unsigned long addr) { return 0; }
+static inline struct kmem_cache_node *free_debug_processing(
+ struct kmem_cache *s, struct page *page, void *object,
+ unsigned long addr, unsigned long *flags) { return NULL; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
@@ -1345,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
void *start;
void *last;
void *p;
+ int order;
BUG_ON(flags & GFP_SLAB_BUG_MASK);
@@ -1353,8 +1354,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!page)
goto out;
+ order = compound_order(page);
inc_slabs_node(s, page_to_nid(page), page->objects);
- page->slab = s;
+ memcg_bind_pages(s, order);
+ page->slab_cache = s;
__SetPageSlab(page);
if (page->pfmemalloc)
SetPageSlabPfmemalloc(page);
@@ -1362,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
start = page_address(page);
if (unlikely(s->flags & SLAB_POISON))
- memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
+ memset(start, POISON_INUSE, PAGE_SIZE << order);
last = start;
for_each_object(p, s, start, page->objects) {
@@ -1403,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
+
+ memcg_release_pages(s, order);
reset_page_mapcount(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_pages(page, order);
+ __free_memcg_kmem_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -1421,7 +1426,7 @@ static void rcu_free_slab(struct rcu_head *h)
else
page = container_of((struct list_head *)h, struct page, lru);
- __free_slab(page->slab, page);
+ __free_slab(page->slab_cache, page);
}
static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1714,7 +1719,7 @@ static inline void note_cmpxchg_failure(const char *n,
stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}
-void init_kmem_cache_cpus(struct kmem_cache *s)
+static void init_kmem_cache_cpus(struct kmem_cache *s)
{
int cpu;
@@ -1869,12 +1874,14 @@ redo:
/*
* Unfreeze all the cpu partial slabs.
*
- * This function must be called with interrupt disabled.
+ * This function must be called with interrupts disabled
+ * for the cpu using c (or some other guarantee of exclusion
+ * must be in place to prevent concurrent accesses).
*/
-static void unfreeze_partials(struct kmem_cache *s)
+static void unfreeze_partials(struct kmem_cache *s,
+ struct kmem_cache_cpu *c)
{
struct kmem_cache_node *n = NULL, *n2 = NULL;
- struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
struct page *page, *discard_page = NULL;
while ((page = c->partial)) {
@@ -1939,7 +1946,7 @@ static void unfreeze_partials(struct kmem_cache *s)
* If we did not find a slot then simply move all the partials to the
* per node partial list.
*/
-int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
{
struct page *oldpage;
int pages;
@@ -1960,8 +1967,9 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
* set to the per node partial list.
*/
local_irq_save(flags);
- unfreeze_partials(s);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
local_irq_restore(flags);
+ oldpage = NULL;
pobjects = 0;
pages = 0;
stat(s, CPU_PARTIAL_DRAIN);
@@ -2002,7 +2010,7 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
if (c->page)
flush_slab(s, c);
- unfreeze_partials(s);
+ unfreeze_partials(s, c);
}
}
@@ -2310,7 +2318,7 @@ new_slab:
*
* Otherwise we can simply pick the next object from the lockless free list.
*/
-static __always_inline void *slab_alloc(struct kmem_cache *s,
+static __always_inline void *slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr)
{
void **object;
@@ -2321,6 +2329,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
+ s = memcg_kmem_get_cache(s, gfpflags);
redo:
/*
@@ -2380,9 +2389,15 @@ redo:
return object;
}
+static __always_inline void *slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, unsigned long addr)
+{
+ return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+}
+
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
@@ -2393,7 +2408,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
- void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
return ret;
}
@@ -2411,7 +2426,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
s->object_size, s->size, gfpflags, node);
@@ -2425,7 +2440,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size)
{
- void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
@@ -2449,7 +2464,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
void *prior;
void **object = (void *)x;
int was_frozen;
- int inuse;
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
@@ -2457,17 +2471,22 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
- if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
+ if (kmem_cache_debug(s) &&
+ !(n = free_debug_processing(s, page, x, addr, &flags)))
return;
do {
+ if (unlikely(n)) {
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ n = NULL;
+ }
prior = page->freelist;
counters = page->counters;
set_freepointer(s, object, prior);
new.counters = counters;
was_frozen = new.frozen;
new.inuse--;
- if ((!new.inuse || !prior) && !was_frozen && !n) {
+ if ((!new.inuse || !prior) && !was_frozen) {
if (!kmem_cache_debug(s) && !prior)
@@ -2492,7 +2511,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
}
}
- inuse = new.inuse;
} while (!cmpxchg_double_slab(s, page,
prior, counters,
@@ -2518,25 +2536,17 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
return;
}
+ if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+ goto slab_empty;
+
/*
- * was_frozen may have been set after we acquired the list_lock in
- * an earlier loop. So we need to check it here again.
+ * Objects left in the slab. If it was not on the partial list before
+ * then add it.
*/
- if (was_frozen)
- stat(s, FREE_FROZEN);
- else {
- if (unlikely(!inuse && n->nr_partial > s->min_partial))
- goto slab_empty;
-
- /*
- * Objects left in the slab. If it was not on the partial list before
- * then add it.
- */
- if (unlikely(!prior)) {
- remove_full(s, page);
- add_partial(n, page, DEACTIVATE_TO_TAIL);
- stat(s, FREE_ADD_PARTIAL);
- }
+ if (kmem_cache_debug(s) && unlikely(!prior)) {
+ remove_full(s, page);
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
return;
@@ -2608,12 +2618,10 @@ redo:
void kmem_cache_free(struct kmem_cache *s, void *x)
{
- struct page *page;
-
- page = virt_to_head_page(x);
-
- slab_free(s, page, x, _RET_IP_);
-
+ s = cache_from_obj(s, x);
+ if (!s)
+ return;
+ slab_free(s, virt_to_head_page(x), x, _RET_IP_);
trace_kmem_cache_free(_RET_IP_, x);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -2751,32 +2759,6 @@ static inline int calculate_order(int size, int reserved)
return -ENOSYS;
}
-/*
- * Figure out what the alignment of the objects will be.
- */
-static unsigned long calculate_alignment(unsigned long flags,
- unsigned long align, unsigned long size)
-{
- /*
- * If the user wants hardware cache aligned objects then follow that
- * suggestion if the object is sufficiently large.
- *
- * The hardware cache alignment cannot override the specified
- * alignment though. If that is greater then use it.
- */
- if (flags & SLAB_HWCACHE_ALIGN) {
- unsigned long ralign = cache_line_size();
- while (size <= ralign / 2)
- ralign /= 2;
- align = max(align, ralign);
- }
-
- if (align < ARCH_SLAB_MINALIGN)
- align = ARCH_SLAB_MINALIGN;
-
- return ALIGN(align, sizeof(void *));
-}
-
static void
init_kmem_cache_node(struct kmem_cache_node *n)
{
@@ -2910,7 +2892,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
unsigned long flags = s->flags;
unsigned long size = s->object_size;
- unsigned long align = s->align;
int order;
/*
@@ -2982,19 +2963,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
#endif
/*
- * Determine the alignment based on various parameters that the
- * user specified and the dynamic determination of cache line size
- * on bootup.
- */
- align = calculate_alignment(flags, align, s->object_size);
- s->align = align;
-
- /*
* SLUB stores one object immediately after another beginning from
* offset 0. In order to align the objects we have to simply size
* each object to conform to the alignment.
*/
- size = ALIGN(size, align);
+ size = ALIGN(size, s->align);
s->size = size;
if (forced_order >= 0)
order = forced_order;
@@ -3023,20 +2996,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
s->max = s->oo;
return !!oo_objects(s->oo);
-
}
-static int kmem_cache_open(struct kmem_cache *s,
- const char *name, size_t size,
- size_t align, unsigned long flags,
- void (*ctor)(void *))
+static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
{
- memset(s, 0, kmem_size);
- s->name = name;
- s->ctor = ctor;
- s->object_size = size;
- s->align = align;
- s->flags = kmem_cache_flags(size, flags, name, ctor);
+ s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
s->reserved = 0;
if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
@@ -3098,7 +3062,6 @@ static int kmem_cache_open(struct kmem_cache *s,
else
s->cpu_partial = 30;
- s->refcount = 1;
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
@@ -3106,26 +3069,17 @@ static int kmem_cache_open(struct kmem_cache *s,
goto error;
if (alloc_kmem_cache_cpus(s))
- return 1;
+ return 0;
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
panic("Cannot create slab %s size=%lu realsize=%u "
"order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)size, s->size, oo_order(s->oo),
+ s->name, (unsigned long)s->size, s->size, oo_order(s->oo),
s->offset, flags);
- return 0;
-}
-
-/*
- * Determine the size of a slab object
- */
-unsigned int kmem_cache_size(struct kmem_cache *s)
-{
- return s->object_size;
+ return -EINVAL;
}
-EXPORT_SYMBOL(kmem_cache_size);
static void list_slab_objects(struct kmem_cache *s, struct page *page,
const char *text)
@@ -3137,7 +3091,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
sizeof(long), GFP_ATOMIC);
if (!map)
return;
- slab_err(s, page, "%s", text);
+ slab_err(s, page, text, s->name);
slab_lock(page);
get_map(s, page, map);
@@ -3169,7 +3123,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
discard_slab(s, page);
} else {
list_slab_objects(s, page,
- "Objects remaining on kmem_cache_close()");
+ "Objects remaining in %s on kmem_cache_close()");
}
}
}
@@ -3182,7 +3136,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
int node;
flush_all(s);
- free_percpu(s->cpu_slab);
/* Attempt to free all objects */
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3191,33 +3144,31 @@ static inline int kmem_cache_close(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
+ free_percpu(s->cpu_slab);
free_kmem_cache_nodes(s);
return 0;
}
-/*
- * Close a cache and release the kmem_cache structure
- * (must be used for caches created using kmem_cache_create)
- */
-void kmem_cache_destroy(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
{
- mutex_lock(&slab_mutex);
- s->refcount--;
- if (!s->refcount) {
- list_del(&s->list);
+ int rc = kmem_cache_close(s);
+
+ if (!rc) {
+ /*
+	 * We use the same locking strategy around sysfs_slab_add as
+	 * __kmem_cache_create does. Because this is pretty much the last
+ * operation we do and the lock will be released shortly after
+ * that in slab_common.c, we could just move sysfs_slab_remove
+ * to a later point in common code. We should do that when we
+ * have a common sysfs framework for all allocators.
+ */
mutex_unlock(&slab_mutex);
- if (kmem_cache_close(s)) {
- printk(KERN_ERR "SLUB %s: %s called for cache that "
- "still has objects.\n", s->name, __func__);
- dump_stack();
- }
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
sysfs_slab_remove(s);
- } else
- mutex_unlock(&slab_mutex);
+ mutex_lock(&slab_mutex);
+ }
+
+ return rc;
}
-EXPORT_SYMBOL(kmem_cache_destroy);
/********************************************************************
* Kmalloc subsystem
@@ -3226,8 +3177,6 @@ EXPORT_SYMBOL(kmem_cache_destroy);
struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
EXPORT_SYMBOL(kmalloc_caches);
-static struct kmem_cache *kmem_cache;
-
#ifdef CONFIG_ZONE_DMA
static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
#endif
@@ -3268,29 +3217,6 @@ static int __init setup_slub_nomerge(char *str)
__setup("slub_nomerge", setup_slub_nomerge);
-static struct kmem_cache *__init create_kmalloc_cache(const char *name,
- int size, unsigned int flags)
-{
- struct kmem_cache *s;
-
- s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
-
- /*
- * This function is called with IRQs disabled during early-boot on
- * single CPU so there's no need to take slab_mutex here.
- */
- if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
- flags, NULL))
- goto panic;
-
- list_add(&s->list, &slab_caches);
- return s;
-
-panic:
- panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
- return NULL;
-}
-
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -3362,7 +3288,7 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
+ ret = slab_alloc(s, flags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -3376,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
struct page *page;
void *ptr = NULL;
- flags |= __GFP_COMP | __GFP_NOTRACK;
+ flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
page = alloc_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
@@ -3405,7 +3331,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, node, _RET_IP_);
+ ret = slab_alloc_node(s, flags, node, _RET_IP_);
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
@@ -3428,7 +3354,7 @@ size_t ksize(const void *object)
return PAGE_SIZE << compound_order(page);
}
- return slab_ksize(page->slab);
+ return slab_ksize(page->slab_cache);
}
EXPORT_SYMBOL(ksize);
@@ -3453,8 +3379,8 @@ bool verify_mem_not_deleted(const void *x)
}
slab_lock(page);
- if (on_freelist(page->slab, page, object)) {
- object_err(page->slab, page, object, "Object is on free-list");
+ if (on_freelist(page->slab_cache, page, object)) {
+ object_err(page->slab_cache, page, object, "Object is on free-list");
rv = false;
} else {
rv = true;
@@ -3482,10 +3408,10 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kmemleak_free(x);
- put_page(page);
+ __free_memcg_kmem_pages(page, compound_order(page));
return;
}
- slab_free(page->slab, page, object, _RET_IP_);
+ slab_free(page->slab_cache, page, object, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
@@ -3577,7 +3503,7 @@ static void slab_mem_offline_callback(void *arg)
struct memory_notify *marg = arg;
int offline_node;
- offline_node = marg->status_change_nid;
+ offline_node = marg->status_change_nid_normal;
/*
* If the node still has available memory. we need kmem_cache_node
@@ -3610,7 +3536,7 @@ static int slab_mem_going_online_callback(void *arg)
struct kmem_cache_node *n;
struct kmem_cache *s;
struct memory_notify *marg = arg;
- int nid = marg->status_change_nid;
+ int nid = marg->status_change_nid_normal;
int ret = 0;
/*
@@ -3680,15 +3606,16 @@ static int slab_memory_callback(struct notifier_block *self,
/*
* Used for early kmem_cache structures that were allocated using
- * the page allocator
+ * the page allocator. Allocate them properly then fix up the pointers
+ * that may be pointing to the wrong kmem_cache structure.
*/
-static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
+static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
{
int node;
+ struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
- list_add(&s->list, &slab_caches);
- s->refcount = -1;
+ memcpy(s, static_cache, kmem_cache->object_size);
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3696,75 +3623,52 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
if (n) {
list_for_each_entry(p, &n->partial, lru)
- p->slab = s;
+ p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
list_for_each_entry(p, &n->full, lru)
- p->slab = s;
+ p->slab_cache = s;
#endif
}
}
+ list_add(&s->list, &slab_caches);
+ return s;
}
void __init kmem_cache_init(void)
{
+ static __initdata struct kmem_cache boot_kmem_cache,
+ boot_kmem_cache_node;
int i;
- int caches = 0;
- struct kmem_cache *temp_kmem_cache;
- int order;
- struct kmem_cache *temp_kmem_cache_node;
- unsigned long kmalloc_size;
+ int caches = 2;
if (debug_guardpage_minorder())
slub_max_order = 0;
- kmem_size = offsetof(struct kmem_cache, node) +
- nr_node_ids * sizeof(struct kmem_cache_node *);
-
- /* Allocate two kmem_caches from the page allocator */
- kmalloc_size = ALIGN(kmem_size, cache_line_size());
- order = get_order(2 * kmalloc_size);
- kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
+ kmem_cache_node = &boot_kmem_cache_node;
+ kmem_cache = &boot_kmem_cache;
- /*
- * Must first have the slab cache available for the allocations of the
- * struct kmem_cache_node's. There is special bootstrap code in
- * kmem_cache_open for slab_state == DOWN.
- */
- kmem_cache_node = (void *)kmem_cache + kmalloc_size;
-
- kmem_cache_open(kmem_cache_node, "kmem_cache_node",
- sizeof(struct kmem_cache_node),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ create_boot_cache(kmem_cache_node, "kmem_cache_node",
+ sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN);
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
/* Able to allocate the per node structures */
slab_state = PARTIAL;
- temp_kmem_cache = kmem_cache;
- kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
- kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
- memcpy(kmem_cache, temp_kmem_cache, kmem_size);
+ create_boot_cache(kmem_cache, "kmem_cache",
+ offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *),
+ SLAB_HWCACHE_ALIGN);
+
+ kmem_cache = bootstrap(&boot_kmem_cache);
/*
* Allocate kmem_cache_node properly from the kmem_cache slab.
* kmem_cache_node is separately allocated so no need to
* update any list pointers.
*/
- temp_kmem_cache_node = kmem_cache_node;
-
- kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
- memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
-
- kmem_cache_bootstrap_fixup(kmem_cache_node);
-
- caches++;
- kmem_cache_bootstrap_fixup(kmem_cache);
- caches++;
- /* Free temporary boot structure */
- free_pages((unsigned long)temp_kmem_cache, order);
+ kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
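
A brief recap of the new SLUB boot sequence implemented in the hunk above (descriptive only):

/*
 * 1. boot_kmem_cache and boot_kmem_cache_node live in __initdata and are
 *    made usable via create_boot_cache(), with no dynamic allocation.
 * 2. Once kmem_cache itself can allocate, bootstrap() copies each static
 *    cache into a properly allocated struct kmem_cache, fixes up
 *    page->slab_cache on any slabs already handed out, and adds the copy
 *    to slab_caches.
 * 3. The static __initdata originals are discarded along with the rest of
 *    init memory.
 */
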
@@ -3892,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s)
return 0;
}
-static struct kmem_cache *find_mergeable(size_t size,
+static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
size_t align, unsigned long flags, const char *name,
void (*ctor)(void *))
{
@@ -3928,18 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size,
if (s->size - size >= sizeof(void *))
continue;
+ if (!cache_match_memcg(s, memcg))
+ continue;
+
return s;
}
return NULL;
}
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+struct kmem_cache *
+__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- char *n;
- s = find_mergeable(size, align, flags, name, ctor);
+ s = find_mergeable(memcg, size, align, flags, name, ctor);
if (s) {
s->refcount++;
/*
@@ -3951,36 +3858,34 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
if (sysfs_slab_alias(s, name)) {
s->refcount--;
- return NULL;
+ s = NULL;
}
- return s;
}
- n = kstrdup(name, GFP_KERNEL);
- if (!n)
- return NULL;
+ return s;
+}
- s = kmalloc(kmem_size, GFP_KERNEL);
- if (s) {
- if (kmem_cache_open(s, n,
- size, align, flags, ctor)) {
- int r;
+int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
+{
+ int err;
+
+ err = kmem_cache_open(s, flags);
+ if (err)
+ return err;
- list_add(&s->list, &slab_caches);
- mutex_unlock(&slab_mutex);
- r = sysfs_slab_add(s);
- mutex_lock(&slab_mutex);
+ /* Mutex is not taken during early boot */
+ if (slab_state <= UP)
+ return 0;
- if (!r)
- return s;
+ memcg_propagate_slab_attrs(s);
+ mutex_unlock(&slab_mutex);
+ err = sysfs_slab_add(s);
+ mutex_lock(&slab_mutex);
- list_del(&s->list);
- kmem_cache_close(s);
- }
- kfree(s);
- }
- kfree(n);
- return NULL;
+ if (err)
+ kmem_cache_close(s);
+
+ return err;
}
#ifdef CONFIG_SMP
@@ -4033,7 +3938,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
+ ret = slab_alloc(s, gfpflags, caller);
/* Honor the call site pointer we received. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4063,7 +3968,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, node, caller);
+ ret = slab_alloc_node(s, gfpflags, node, caller);
/* Honor the call site pointer we received. */
trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
@@ -5206,16 +5111,93 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
+#ifdef CONFIG_MEMCG_KMEM
+ if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
+ int i;
+
+ mutex_lock(&slab_mutex);
+ if (s->max_attr_size < len)
+ s->max_attr_size = len;
+ /*
+ * This is a best effort propagation, so this function's return
+ * value will be determined by the parent cache only. This is
+ * basically because not all attributes will have a well
+ * defined semantics for rollbacks - most of the actions will
+ * have permanent effects.
+ *
+ * Returning the error value of any of the children that fail
+ * is not 100 % defined, in the sense that users seeing the
+ * error code won't be able to know anything about the state of
+ * the cache.
+ *
+ * Only returning the error code for the parent cache at least
+ * has well defined semantics. The cache being written to
+ * directly either failed or succeeded, in which case we loop
+ * through the descendants with best-effort propagation.
+ */
+ for_each_memcg_cache_index(i) {
+ struct kmem_cache *c = cache_from_memcg(s, i);
+ if (c)
+ attribute->store(c, buf, len);
+ }
+ mutex_unlock(&slab_mutex);
+ }
+#endif
return err;
}
-static void kmem_cache_release(struct kobject *kobj)
+static void memcg_propagate_slab_attrs(struct kmem_cache *s)
{
- struct kmem_cache *s = to_slab(kobj);
+#ifdef CONFIG_MEMCG_KMEM
+ int i;
+ char *buffer = NULL;
+
+ if (!is_root_cache(s))
+ return;
- kfree(s->name);
- kfree(s);
+ /*
+	 * This means the cache has had no attribute written, so there is
+	 * no point in copying default values around.
+ */
+ if (!s->max_attr_size)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
+ char mbuf[64];
+ char *buf;
+ struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
+
+ if (!attr || !attr->store || !attr->show)
+ continue;
+
+ /*
+ * It is really bad that we have to allocate here, so we will
+ * do it only as a fallback. If we actually allocate, though,
+ * we can just use the allocated buffer until the end.
+ *
+ * Most of the slub attributes will tend to be very small in
+ * size, but sysfs allows buffers up to a page, so they can
+ * theoretically happen.
+ */
+ if (buffer)
+ buf = buffer;
+ else if (s->max_attr_size < ARRAY_SIZE(mbuf))
+ buf = mbuf;
+ else {
+ buffer = (char *) get_zeroed_page(GFP_KERNEL);
+ if (WARN_ON(!buffer))
+ continue;
+ buf = buffer;
+ }
+
+ attr->show(s->memcg_params->root_cache, buf);
+ attr->store(s, buf, strlen(buf));
+ }
+
+ if (buffer)
+ free_page((unsigned long)buffer);
+#endif
}
static const struct sysfs_ops slab_sysfs_ops = {
@@ -5225,7 +5207,6 @@ static const struct sysfs_ops slab_sysfs_ops = {
static struct kobj_type slab_ktype = {
.sysfs_ops = &slab_sysfs_ops,
- .release = kmem_cache_release
};
static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -5275,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s)
if (p != name + 1)
*p++ = '-';
p += sprintf(p, "%07d", s->size);
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s))
+ p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
+#endif
+
BUG_ON(p > name + ID_STR_LENGTH - 1);
return name;
}
@@ -5283,13 +5270,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
const char *name;
- int unmergeable;
-
- if (slab_state < FULL)
- /* Defer until later */
- return 0;
+ int unmergeable = slab_unmergeable(s);
- unmergeable = slab_unmergeable(s);
if (unmergeable) {
/*
* Slabcache can never be merged so we can use the name proper.
@@ -5423,49 +5405,14 @@ __initcall(slab_sysfs_init);
* The /proc/slabinfo ABI
*/
#ifdef CONFIG_SLABINFO
-static void print_slabinfo_header(struct seq_file *m)
-{
- seq_puts(m, "slabinfo - version: 2.1\n");
- seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
- "<objperslab> <pagesperslab>");
- seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
- seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
- seq_putc(m, '\n');
-}
-
-static void *s_start(struct seq_file *m, loff_t *pos)
-{
- loff_t n = *pos;
-
- mutex_lock(&slab_mutex);
- if (!n)
- print_slabinfo_header(m);
-
- return seq_list_start(&slab_caches, *pos);
-}
-
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
- return seq_list_next(p, &slab_caches, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&slab_mutex);
-}
-
-static int s_show(struct seq_file *m, void *p)
+void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
unsigned long nr_partials = 0;
unsigned long nr_slabs = 0;
- unsigned long nr_inuse = 0;
unsigned long nr_objs = 0;
unsigned long nr_free = 0;
- struct kmem_cache *s;
int node;
- s = list_entry(p, struct kmem_cache, list);
-
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
@@ -5478,41 +5425,21 @@ static int s_show(struct seq_file *m, void *p)
nr_free += count_partial(n, count_free);
}
- nr_inuse = nr_objs - nr_free;
-
- seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
- nr_objs, s->size, oo_objects(s->oo),
- (1 << oo_order(s->oo)));
- seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
- seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
- 0UL);
- seq_putc(m, '\n');
- return 0;
+ sinfo->active_objs = nr_objs - nr_free;
+ sinfo->num_objs = nr_objs;
+ sinfo->active_slabs = nr_slabs;
+ sinfo->num_slabs = nr_slabs;
+ sinfo->objects_per_slab = oo_objects(s->oo);
+ sinfo->cache_order = oo_order(s->oo);
}
-static const struct seq_operations slabinfo_op = {
- .start = s_start,
- .next = s_next,
- .stop = s_stop,
- .show = s_show,
-};
-
-static int slabinfo_open(struct inode *inode, struct file *file)
+void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s)
{
- return seq_open(file, &slabinfo_op);
}
-static const struct file_operations proc_slabinfo_operations = {
- .open = slabinfo_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static int __init slab_proc_init(void)
+ssize_t slabinfo_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
{
- proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
- return 0;
+ return -EIO;
}
-module_init(slab_proc_init);
#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse.c b/mm/sparse.c
index fac95f2888f2..6b5fb762e2ca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
return; /* XXX: Not implemented yet */
}
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
}
#else
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
got_map_page:
ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:
- memset(ret, 0, memmap_size);
return ret;
}
@@ -658,10 +657,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
get_order(sizeof(struct page) * nr_pages));
}
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
unsigned long magic;
+ struct page *page = virt_to_page(memmap);
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -710,13 +710,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
*/
if (memmap) {
- struct page *memmap_page;
- memmap_page = virt_to_page(memmap);
-
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
- free_map_bootmem(memmap_page, nr_pages);
+ free_map_bootmem(memmap, nr_pages);
}
}
@@ -760,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
goto out;
}
+ memset(memmap, 0, sizeof(struct page) * nr_pages);
+
ms->section_mem_map |= SECTION_MARKED_PRESENT;
ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -773,6 +772,27 @@ out:
return ret;
}
+#ifdef CONFIG_MEMORY_FAILURE
+static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+ int i;
+
+ if (!memmap)
+ return;
+
+ for (i = 0; i < PAGES_PER_SECTION; i++) {
+ if (PageHWPoison(&memmap[i])) {
+ atomic_long_sub(1, &mce_bad_pages);
+ ClearPageHWPoison(&memmap[i]);
+ }
+ }
+}
+#else
+static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+}
+#endif
+
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
{
struct page *memmap = NULL;
@@ -786,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
ms->pageblock_flags = NULL;
}
+ clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
free_section_usemap(memmap, usemap);
}
#endif
diff --git a/mm/swap.c b/mm/swap.c
index 77825883298f..6310dc2008ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
}
EXPORT_SYMBOL(mark_page_accessed);
+/*
+ * Order of operations is important: flush the pagevec when it's already
+ * full, not when adding the last page, so that the last page is not
+ * added to the LRU directly when passed to this function. Because
+ * mark_page_accessed() (called after this when writing) only activates
+ * pages that are already on the LRU, linear writes in subpage chunks
+ * would otherwise see every PAGEVEC_SIZEth page activated, which is
+ * unexpected.
+ */
void __lru_cache_add(struct page *page, enum lru_list lru)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
page_cache_get(page);
- if (!pagevec_add(pvec, page))
+ if (!pagevec_space(pvec))
__pagevec_lru_add(pvec, lru);
+ pagevec_add(pvec, page);
put_cpu_var(lru_add_pvecs);
}
EXPORT_SYMBOL(__lru_cache_add);
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
SetPageLRU(page_tail);
- if (page_evictable(page_tail, NULL)) {
+ if (page_evictable(page_tail)) {
if (PageActive(page)) {
SetPageActive(page_tail);
active = 1;
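For reference, the ordering described in the __lru_cache_add() comment above can be pictured with a simplified write path; the helper below is illustrative (real callers reach this through the page cache helpers). The page lands in the per-CPU pagevec first and is marked accessed afterwards. With the old flush-on-add behaviour, the page that filled the pagevec was already on the LRU by the time mark_page_accessed() ran, so every PAGEVEC_SIZEth page written in sub-page chunks came back activated; flushing before adding keeps that last page in the pagevec instead.

#include <linux/mm.h>
#include <linux/swap.h>

/* Illustrative only: the page reaches the pagevec first, then is marked accessed. */
static void example_touch_written_page(struct page *page)
{
	lru_cache_add_file(page);	/* ends up in __lru_cache_add() */
	mark_page_accessed(page);	/* activates only pages already on the LRU */
}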
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14e254c768fc..e97a0e5aea91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
-static void enable_swap_info(struct swap_info_struct *p, int prio,
+static void _enable_swap_info(struct swap_info_struct *p, int prio,
unsigned char *swap_map,
unsigned long *frontswap_map)
{
int i, prev;
- spin_lock(&swap_lock);
if (prio >= 0)
p->prio = prio;
else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
swap_list.head = swap_list.next = p->type;
else
swap_info[prev]->next = p->type;
+}
+
+static void enable_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ unsigned long *frontswap_map)
+{
+ spin_lock(&swap_lock);
+ _enable_swap_info(p, prio, swap_map, frontswap_map);
frontswap_init(p->type);
spin_unlock(&swap_lock);
}
+static void reinsert_swap_info(struct swap_info_struct *p)
+{
+ spin_lock(&swap_lock);
+ _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
+ spin_unlock(&swap_lock);
+}
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -1483,8 +1497,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
- char *pathname;
- int oom_score_adj;
+ struct filename *pathname;
int i, type, prev;
int err;
@@ -1494,12 +1507,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
BUG_ON(!current->mm);
pathname = getname(specialfile);
- err = PTR_ERR(pathname);
if (IS_ERR(pathname))
- goto out;
+ return PTR_ERR(pathname);
- victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
- putname(pathname);
+ victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
err = PTR_ERR(victim);
if (IS_ERR(victim))
goto out;
@@ -1545,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->flags &= ~SWP_WRITEOK;
spin_unlock(&swap_lock);
- oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
+ set_current_oom_origin();
err = try_to_unuse(type, false, 0); /* force all pages to be unused */
- compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
+ clear_current_oom_origin();
if (err) {
- /*
- * reading p->prio and p->swap_map outside the lock is
- * safe here because only sys_swapon and sys_swapoff
- * change them, and there can be no other sys_swapon or
- * sys_swapoff for this swap_info_struct at this point.
- */
/* re-insert swap space back into swap_list */
- enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
+ reinsert_swap_info(p);
goto out_dput;
}
@@ -1609,6 +1614,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
out_dput:
filp_close(victim, NULL);
out:
+ putname(pathname);
return err;
}
@@ -1936,7 +1942,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct *p;
- char *name;
+ struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
int i;
@@ -1967,7 +1973,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
name = NULL;
goto bad_swap;
}
- swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+ swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
if (IS_ERR(swap_file)) {
error = PTR_ERR(swap_file);
swap_file = NULL;
@@ -2053,7 +2059,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
printk(KERN_INFO "Adding %uk swap on %s. "
"Priority:%d extents:%d across:%lluk %s%s%s\n",
- p->pages<<(PAGE_SHIFT-10), name, p->prio,
+ p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
(p->flags & SWP_DISCARDABLE) ? "D" : "",
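The swapon/swapoff changes above switch from a plain pathname string to the struct filename returned by getname(), so the same object can be passed to file_open_name() and later printed through name->name, with putname() deferred to the common exit path. A minimal sketch of that pattern, using an illustrative function rather than the real syscall bodies:

#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/fs.h>

static int example_open_by_user_path(const char __user *upath)
{
	struct filename *name;
	struct file *filp;
	int err = 0;

	name = getname(upath);
	if (IS_ERR(name))
		return PTR_ERR(name);

	filp = file_open_name(name, O_RDWR | O_LARGEFILE, 0);
	if (IS_ERR(filp))
		err = PTR_ERR(filp);
	else
		filp_close(filp, NULL);

	putname(name);		/* name->name stays valid until this point */
	return err;
}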
diff --git a/mm/truncate.c b/mm/truncate.c
index 75801acdaac7..d51ce92d6e83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
cancel_dirty_page(page, PAGE_CACHE_SIZE);
- clear_page_mlock(page);
ClearPageMappedToDisk(page);
delete_from_page_cache(page);
return 0;
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, 0))
return 0;
- clear_page_mlock(page);
ret = remove_mapping(mapping, page);
return ret;
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (PageDirty(page))
goto failed;
- clear_page_mlock(page);
BUG_ON(page_has_private(page));
__delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/util.c b/mm/util.c
index 8c7265afa29f..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len)
}
EXPORT_SYMBOL(memdup_user);
+static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+ gfp_t flags)
+{
+ void *ret;
+ size_t ks = 0;
+
+ if (p)
+ ks = ksize(p);
+
+ if (ks >= new_size)
+ return (void *)p;
+
+ ret = kmalloc_track_caller(new_size, flags);
+ if (ret && p)
+ memcpy(ret, p, ks);
+
+ return ret;
+}
+
/**
* __krealloc - like krealloc() but don't free @p.
* @p: object to reallocate memory for.
@@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user);
*/
void *__krealloc(const void *p, size_t new_size, gfp_t flags)
{
- void *ret;
- size_t ks = 0;
-
if (unlikely(!new_size))
return ZERO_SIZE_PTR;
- if (p)
- ks = ksize(p);
+ return __do_krealloc(p, new_size, flags);
- if (ks >= new_size)
- return (void *)p;
-
- ret = kmalloc_track_caller(new_size, flags);
- if (ret && p)
- memcpy(ret, p, ks);
-
- return ret;
}
EXPORT_SYMBOL(__krealloc);
@@ -145,7 +152,7 @@ EXPORT_SYMBOL(__krealloc);
*
* The contents of the object pointed to are preserved up to the
* lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @size is 0 and @p is not a
+ * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
* %NULL pointer, the object pointed to is freed.
*/
void *krealloc(const void *p, size_t new_size, gfp_t flags)
@@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
return ZERO_SIZE_PTR;
}
- ret = __krealloc(p, new_size, flags);
+ ret = __do_krealloc(p, new_size, flags);
if (ret && p != ret)
kfree(p);
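The krealloc() contract documented above (contents preserved up to the smaller size, the old buffer left untouched on allocation failure, a non-NULL pointer freed when new_size is 0) is easiest to read from the caller's side; a small illustrative sketch:

#include <linux/errno.h>
#include <linux/slab.h>

/* Illustrative: grow an int array, following the krealloc() contract above. */
static int grow_array(int **bufp, size_t new_n)
{
	int *tmp = krealloc(*bufp, new_n * sizeof(**bufp), GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;	/* the old buffer is still valid and unchanged */

	*bufp = tmp;		/* contents preserved up to the smaller size */
	return 0;
}

Passing new_size == 0 with a non-NULL pointer frees it and returns ZERO_SIZE_PTR, as the kernel-doc notes.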
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bb90b1d241c..5123a169ab7b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
usize -= PAGE_SIZE;
} while (usize > 0);
- /* Prevent "things" like memory migration? VM_flags need a cleanup... */
- vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
return 0;
}
@@ -2551,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
- if (NUMA_BUILD) {
+ if (IS_ENABLED(CONFIG_NUMA)) {
unsigned int nr, *counters = m->private;
if (!counters)
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p)
{
struct vm_struct *v = p;
- seq_printf(m, "0x%p-0x%p %7ld",
+ seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
if (v->caller)
@@ -2616,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
unsigned int *ptr = NULL;
int ret;
- if (NUMA_BUILD) {
+ if (IS_ENABLED(CONFIG_NUMA)) {
ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
if (ptr == NULL)
return -ENOMEM;
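NUMA_BUILD gives way to IS_ENABLED(CONFIG_NUMA) here. The macro evaluates to 1 when the option is enabled and 0 otherwise, so both branches stay visible to the compiler for type checking while the dead one is optimised away. A trivial illustration (the function itself is made up):

#include <linux/kernel.h>	/* IS_ENABLED() comes from <linux/kconfig.h>, included by the build */
#include <linux/printk.h>

static void example_report_numa_build(void)
{
	if (IS_ENABLED(CONFIG_NUMA))
		pr_info("built with NUMA support\n");
	else
		pr_info("built without NUMA support\n");
}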
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b434b674c0..adc7e9058181 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page)
redo:
ClearPageUnevictable(page);
- if (page_evictable(page, NULL)) {
+ if (page_evictable(page)) {
/*
* For evictable pages, we can use the cache.
* In event of a race, worst case is we end up with an
@@ -587,7 +587,7 @@ redo:
* page is on unevictable list, it never be freed. To avoid that,
* check after we added it to the list, again.
*/
- if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+ if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
if (!isolate_lru_page(page)) {
put_page(page);
goto redo;
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
static unsigned long shrink_page_list(struct list_head *page_list,
struct zone *zone,
struct scan_control *sc,
+ enum ttu_flags ttu_flags,
unsigned long *ret_nr_dirty,
- unsigned long *ret_nr_writeback)
+ unsigned long *ret_nr_writeback,
+ bool force_reclaim)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
mem_cgroup_uncharge_start();
while (!list_empty(page_list)) {
- enum page_references references;
struct address_space *mapping;
struct page *page;
int may_enter_fs;
+ enum page_references references = PAGEREF_RECLAIM_CLEAN;
cond_resched();
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
sc->nr_scanned++;
- if (unlikely(!page_evictable(page, NULL)))
+ if (unlikely(!page_evictable(page)))
goto cull_mlocked;
if (!sc->may_unmap && page_mapped(page))
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
wait_on_page_writeback(page);
}
- references = page_check_references(page, sc);
+ if (!force_reclaim)
+ references = page_check_references(page, sc);
+
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, TTU_UNMAP)) {
+ switch (try_to_unmap(page, ttu_flags)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
return nr_reclaimed;
}
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ struct list_head *page_list)
+{
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_unmap = 1,
+ };
+ unsigned long ret, dummy1, dummy2;
+ struct page *page, *next;
+ LIST_HEAD(clean_pages);
+
+ list_for_each_entry_safe(page, next, page_list, lru) {
+ if (page_is_file_cache(page) && !PageDirty(page)) {
+ ClearPageActive(page);
+ list_move(&page->lru, &clean_pages);
+ }
+ }
+
+ ret = shrink_page_list(&clean_pages, zone, &sc,
+ TTU_UNMAP|TTU_IGNORE_ACCESS,
+ &dummy1, &dummy2, true);
+ list_splice(&clean_pages, page_list);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ return ret;
+}
+
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
if (!PageLRU(page))
return ret;
- /* Do not give back unevictable pages for compaction */
- if (PageUnevictable(page))
+ /* Compaction should not handle unevictable pages but CMA can do so */
+ if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
return ret;
ret = -EBUSY;
@@ -1146,7 +1177,11 @@ int isolate_lru_page(struct page *page)
}
/*
- * Are there way too many processes in the direct reclaim path already?
+ * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
+ * then get resheduled. When there are massive number of tasks doing page
+ * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
+ * the LRU list will go small and be scanned faster than necessary, leading to
+ * unnecessary swapping, thrashing and OOM.
*/
static int too_many_isolated(struct zone *zone, int file,
struct scan_control *sc)
@@ -1167,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file,
isolated = zone_page_state(zone, NR_ISOLATED_ANON);
}
+ /*
+ * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so that
+ * they do not end up blocked behind normal direct reclaimers and form a
+ * circular deadlock.
+ */
+ if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ inactive >>= 3;
+
return isolated > inactive;
}
@@ -1186,7 +1229,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
VM_BUG_ON(PageLRU(page));
list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
+ if (unlikely(!page_evictable(page))) {
spin_unlock_irq(&zone->lru_lock);
putback_lru_page(page);
spin_lock_irq(&zone->lru_lock);
@@ -1278,8 +1321,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, zone, sc,
- &nr_dirty, &nr_writeback);
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+ &nr_dirty, &nr_writeback, false);
spin_lock_irq(&zone->lru_lock);
@@ -1439,7 +1482,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
page = lru_to_page(&l_hold);
list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
+ if (unlikely(!page_evictable(page))) {
putback_lru_page(page);
continue;
}
@@ -1648,13 +1691,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
if (global_reclaim(sc)) {
free = zone_page_state(zone, NR_FREE_PAGES);
- /* If we have very few page cache pages,
- force-scan anon pages. */
if (unlikely(file + free <= high_wmark_pages(zone))) {
+ /*
+ * If we have very few page cache pages, force-scan
+ * anon pages.
+ */
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
goto out;
+ } else if (!inactive_file_is_low_global(zone)) {
+ /*
+ * There is enough inactive page cache, do not
+ * reclaim anything from the working set right now.
+ */
+ fraction[0] = 0;
+ fraction[1] = 1;
+ denominator = 1;
+ goto out;
}
}
@@ -1721,7 +1775,7 @@ out:
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
- if (COMPACTION_BUILD && sc->order &&
+ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
@@ -1974,7 +2028,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
if (zone->all_unreclaimable &&
sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- if (COMPACTION_BUILD) {
+ if (IS_ENABLED(CONFIG_COMPACTION)) {
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
@@ -2176,9 +2230,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
* Throttle direct reclaimers if backing storage is backed by the network
* and the PFMEMALLOC reserve for the preferred node is getting dangerously
* depleted. kswapd will continue to make progress and wake the processes
- * when the low watermark is reached
+ * when the low watermark is reached.
+ *
+ * Returns true if a fatal signal was delivered during throttling. If this
+ * happens, the page allocator should not consider triggering the OOM killer.
*/
-static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
nodemask_t *nodemask)
{
struct zone *zone;
@@ -2193,13 +2250,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* processes to block on log_wait_commit().
*/
if (current->flags & PF_KTHREAD)
- return;
+ goto out;
+
+ /*
+ * If a fatal signal is pending, this process should not throttle.
+ * It should return quickly so it can exit and free its memory.
+ */
+ if (fatal_signal_pending(current))
+ goto out;
/* Check if the pfmemalloc reserves are ok */
first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
pgdat = zone->zone_pgdat;
if (pfmemalloc_watermark_ok(pgdat))
- return;
+ goto out;
/* Account for the throttling */
count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2215,12 +2279,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
if (!(gfp_mask & __GFP_FS)) {
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
pfmemalloc_watermark_ok(pgdat), HZ);
- return;
+
+ goto check_pending;
}
/* Throttle until kswapd wakes the process */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
pfmemalloc_watermark_ok(pgdat));
+
+check_pending:
+ if (fatal_signal_pending(current))
+ return true;
+
+out:
+ return false;
}
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2242,13 +2314,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.gfp_mask = sc.gfp_mask,
};
- throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
-
/*
- * Do not enter reclaim if fatal signal is pending. 1 is returned so
- * that the page allocator does not consider triggering OOM
+ * Do not enter reclaim if a fatal signal was delivered while throttled.
+ * 1 is returned so that the page allocator does not OOM kill at this
+ * point.
*/
- if (fatal_signal_pending(current))
+ if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
return 1;
trace_mm_vmscan_direct_reclaim_begin(order,
@@ -2366,6 +2437,20 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
+static bool zone_balanced(struct zone *zone, int order,
+ unsigned long balance_gap, int classzone_idx)
+{
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
+ balance_gap, classzone_idx, 0))
+ return false;
+
+ if (IS_ENABLED(CONFIG_COMPACTION) && order &&
+ !compaction_suitable(zone, order))
+ return false;
+
+ return true;
+}
+
/*
* pgdat_balanced is used when checking if a node is balanced for high-order
* allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2444,8 +2529,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
continue;
}
- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
- i, 0))
+ if (!zone_balanced(zone, order, 0, i))
all_zones_ok = false;
else
balanced += zone->present_pages;
@@ -2486,7 +2570,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
int *classzone_idx)
{
- int all_zones_ok;
+ struct zone *unbalanced_zone;
unsigned long balanced;
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -2520,7 +2604,7 @@ loop_again:
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
- all_zones_ok = 1;
+ unbalanced_zone = NULL;
balanced = 0;
/*
@@ -2554,8 +2638,7 @@ loop_again:
break;
}
- if (!zone_watermark_ok_safe(zone, order,
- high_wmark_pages(zone), 0, 0)) {
+ if (!zone_balanced(zone, order, 0, 0)) {
end_zone = i;
break;
} else {
@@ -2625,15 +2708,14 @@ loop_again:
* Do not reclaim more than needed for compaction.
*/
testorder = order;
- if (COMPACTION_BUILD && order &&
+ if (IS_ENABLED(CONFIG_COMPACTION) && order &&
compaction_suitable(zone, order) !=
COMPACT_SKIPPED)
testorder = 0;
if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
- !zone_watermark_ok_safe(zone, testorder,
- high_wmark_pages(zone) + balance_gap,
- end_zone, 0)) {
+ !zone_balanced(zone, testorder,
+ balance_gap, end_zone)) {
shrink_zone(zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2660,9 +2742,8 @@ loop_again:
continue;
}
- if (!zone_watermark_ok_safe(zone, testorder,
- high_wmark_pages(zone), end_zone, 0)) {
- all_zones_ok = 0;
+ if (!zone_balanced(zone, testorder, 0, end_zone)) {
+ unbalanced_zone = zone;
/*
* We are still under min water mark. This
* means that we have a GFP_ATOMIC allocation
@@ -2695,7 +2776,7 @@ loop_again:
pfmemalloc_watermark_ok(pgdat))
wake_up(&pgdat->pfmemalloc_wait);
- if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
+ if (!unbalanced_zone || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
@@ -2705,7 +2786,7 @@ loop_again:
if (has_under_min_watermark_zone)
count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
else
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
}
/*
@@ -2724,7 +2805,7 @@ out:
* high-order: Balanced zones must make up at least 25% of the node
* for the node to be balanced
*/
- if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
+ if (unbalanced_zone && (!order || !pgdat_balanced(pgdat, balanced, *classzone_idx))) {
cond_resched();
try_to_freeze();
@@ -2766,29 +2847,10 @@ out:
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable &&
- sc.priority != DEF_PRIORITY)
- continue;
-
- /* Would compaction fail due to lack of free memory? */
- if (COMPACTION_BUILD &&
- compaction_suitable(zone, order) == COMPACT_SKIPPED)
- goto loop_again;
-
- /* Confirm the zone is balanced for order-0 */
- if (!zone_watermark_ok(zone, 0,
- high_wmark_pages(zone), 0, 0)) {
- order = sc.order = 0;
- goto loop_again;
- }
-
/* Check if the memory needs to be defragmented. */
if (zone_watermark_ok(zone, order,
low_wmark_pages(zone), *classzone_idx, 0))
zones_need_compaction = 0;
-
- /* If balanced, clear the congested flag */
- zone_clear_flag(zone, ZONE_CONGESTED);
}
if (zones_need_compaction)
@@ -2839,6 +2901,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+ /*
+ * Compaction records which pageblocks it recently failed to
+ * isolate pages from and skips them in future scans.
+ * When kswapd is going to sleep, it is reasonable to assume
+ * that isolation and compaction may now succeed, so reset the cache.
+ */
+ reset_isolation_suitable(pgdat);
+
if (!kthread_should_stop())
schedule();
@@ -2905,7 +2975,7 @@ static int kswapd(void *p)
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
- int ret;
+ bool ret;
/*
* If the last balance_pgdat was unsuccessful it's unlikely a
@@ -2953,6 +3023,8 @@ static int kswapd(void *p)
&balanced_classzone_idx);
}
}
+
+ current->reclaim_state = NULL;
return 0;
}
@@ -3071,7 +3143,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
int nid;
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
- for_each_node_state(nid, N_HIGH_MEMORY) {
+ for_each_node_state(nid, N_MEMORY) {
pg_data_t *pgdat = NODE_DATA(nid);
const struct cpumask *mask;
@@ -3101,9 +3173,9 @@ int kswapd_run(int nid)
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state == SYSTEM_BOOTING);
- printk("Failed to start kswapd on node %d\n",nid);
pgdat->kswapd = NULL;
- ret = -1;
+ pr_err("Failed to start kswapd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kswapd);
}
return ret;
}
@@ -3127,7 +3199,7 @@ static int __init kswapd_init(void)
int nid;
swap_setup();
- for_each_node_state(nid, N_HIGH_MEMORY)
+ for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
@@ -3350,27 +3422,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
/*
* page_evictable - test whether a page is evictable
* @page: the page to test
- * @vma: the VMA in which the page is or will be mapped, may be NULL
*
* Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list. The vma argument is !NULL when called from the
- * fault path to determine how to instantate a new page.
+ * lists vs unevictable list.
*
* Reasons page might not be evictable:
* (1) page's mapping marked unevictable
* (2) page is part of an mlocked VMA
*
*/
-int page_evictable(struct page *page, struct vm_area_struct *vma)
+int page_evictable(struct page *page)
{
-
- if (mapping_unevictable(page_mapping(page)))
- return 0;
-
- if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
- return 0;
-
- return 1;
+ return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
}
#ifdef CONFIG_SHMEM
@@ -3408,7 +3471,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
if (!PageLRU(page) || !PageUnevictable(page))
continue;
- if (page_evictable(page, NULL)) {
+ if (page_evictable(page)) {
enum lru_list lru = page_lru_base_type(page);
VM_BUG_ON(PageActive(page));
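One consumer-side note on reclaim_clean_pages_from_list(), added earlier in this vmscan.c diff: it lets a caller that has already isolated a list of pages reclaim the clean file-backed ones outright instead of migrating them. The sketch below is hedged; the real caller is presumably the contiguous-allocation (CMA) path, and everything here except the helper itself is illustrative:

#include <linux/list.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include "internal.h"		/* assumed: declares reclaim_clean_pages_from_list() */

static void example_trim_before_migrate(struct zone *zone,
					struct list_head *migratepages)
{
	unsigned long nr_dropped;

	/* Clean file pages are reclaimed here; everything else stays on the list. */
	nr_dropped = reclaim_clean_pages_from_list(zone, migratepages);

	pr_debug("reclaimed %lu clean pages before migration\n", nr_dropped);
	/* whatever remains on migratepages must still be migrated as before */
}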
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df7a6748231d..9800306c8195 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu)
atomic_long_add(global_diff[i], &vm_stat[i]);
}
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+{
+ int i;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (pset->vm_stat_diff[i]) {
+ int v = pset->vm_stat_diff[i];
+ pset->vm_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_stat[i]);
+ atomic_long_add(v, &vm_stat[i]);
+ }
+}
#endif
#ifdef CONFIG_NUMA
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = {
"numa_other",
#endif
"nr_anon_transparent_hugepages",
+ "nr_free_cma",
"nr_dirty_threshold",
"nr_dirty_background_threshold",
@@ -761,10 +774,20 @@ const char * const vmstat_text[] = {
"pgrotated",
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_pte_updates",
+ "numa_hint_faults",
+ "numa_hint_faults_local",
+ "numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+ "pgmigrate_success",
+ "pgmigrate_fail",
+#endif
#ifdef CONFIG_COMPACTION
- "compact_blocks_moved",
- "compact_pages_moved",
- "compact_pagemigrate_failed",
+ "compact_migrate_scanned",
+ "compact_free_scanned",
+ "compact_isolated",
"compact_stall",
"compact_fail",
"compact_success",
@@ -781,7 +804,6 @@ const char * const vmstat_text[] = {
"unevictable_pgs_munlocked",
"unevictable_pgs_cleared",
"unevictable_pgs_stranded",
- "unevictable_pgs_mlockfreed",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"thp_fault_alloc",
@@ -789,6 +811,8 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
"thp_split",
+ "thp_zero_page_alloc",
+ "thp_zero_page_alloc_failed",
#endif
#endif /* CONFIG_VM_EVENTS_COUNTERS */
@@ -918,7 +942,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
pg_data_t *pgdat = (pg_data_t *)arg;
/* check memoryless node */
- if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+ if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
seq_printf(m, "Page block order: %d\n", pageblock_order);
@@ -980,14 +1004,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n high %lu"
"\n scanned %lu"
"\n spanned %lu"
- "\n present %lu",
+ "\n present %lu"
+ "\n managed %lu",
zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
zone->pages_scanned,
zone->spanned_pages,
- zone->present_pages);
+ zone->present_pages,
+ zone->managed_pages);
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", vmstat_text[i],
@@ -1157,7 +1183,7 @@ static void __cpuinit start_cpu_timer(int cpu)
{
struct delayed_work *work = &per_cpu(vmstat_work, cpu);
- INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
+ INIT_DEFERRABLE_WORK(work, vmstat_update);
schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
@@ -1280,7 +1306,7 @@ static int unusable_show(struct seq_file *m, void *arg)
pg_data_t *pgdat = (pg_data_t *)arg;
/* check memoryless node */
- if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
+ if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
walk_zones_in_node(m, pgdat, unusable_show_print);
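Finally, drain_zonestat() above folds a CPU's pending vm_stat_diff[] deltas into the zone and global counters. A hedged sketch of a caller, assuming the declaration lives in <linux/vmstat.h>; the real user is presumably the per-cpu pageset teardown path, and the function below is illustrative:

#include <linux/cpumask.h>
#include <linux/mmzone.h>
#include <linux/percpu.h>
#include <linux/vmstat.h>	/* assumed: declares drain_zonestat() */

static void example_drain_zone_counters(struct zone *zone)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct per_cpu_pageset *pset = per_cpu_ptr(zone->pageset, cpu);

		/* fold this CPU's vm_stat_diff[] into zone and global vm_stat */
		drain_zonestat(zone, pset);
	}
}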