summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c11
-rw-r--r--mm/huge_memory.c129
-rw-r--r--mm/hugetlb.c17
-rw-r--r--mm/list_lru.c3
-rw-r--r--mm/memcontrol.c262
-rw-r--r--mm/memory.c178
-rw-r--r--mm/mempolicy.c82
-rw-r--r--mm/migrate.c51
-rw-r--r--mm/mm_init.c18
-rw-r--r--mm/mmzone.c14
-rw-r--r--mm/mprotect.c72
-rw-r--r--mm/mremap.c5
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page-writeback.c10
-rw-r--r--mm/page_alloc.c4
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu.c5
-rw-r--r--mm/slab_common.c2
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/vmscan.c1
-rw-r--r--mm/zswap.c4
21 files changed, 467 insertions, 409 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..ae4846ff4849 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
- bool memcg_oom;
pgoff_t size;
int ret = 0;
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
/*
- * Do we have something in the page cache already? Either
- * way, try readahead, but disable the memcg OOM killer for it
- * as readahead is optional and no errors are propagated up
- * the fault stack. The OOM killer is enabled while trying to
- * instantiate the faulting page individually below.
+ * Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* We found the page, so try async readahead before
* waiting for the lock.
*/
- memcg_oom = mem_cgroup_toggle_oom(false);
do_async_mmap_readahead(vma, ra, file, page, offset);
- mem_cgroup_toggle_oom(memcg_oom);
} else if (!page) {
/* No page in the page cache at all */
- memcg_oom = mem_cgroup_toggle_oom(false);
do_sync_mmap_readahead(vma, ra, file, offset);
- mem_cgroup_toggle_oom(memcg_oom);
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884682d8..2612f60f53ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1278,64 +1278,105 @@ out:
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
+ struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
- int target_nid;
- int current_nid = -1;
- bool migrated;
+ int page_nid = -1, this_nid = numa_node_id();
+ int target_nid, last_cpupid = -1;
+ bool page_locked;
+ bool migrated = false;
+ int flags = 0;
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
page = pmd_page(pmd);
- get_page(page);
- current_nid = page_to_nid(page);
+ BUG_ON(is_huge_zero_page(page));
+ page_nid = page_to_nid(page);
+ last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == this_nid) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ flags |= TNF_FAULT_LOCAL;
+ }
+
+ /*
+ * Avoid grouping on DSO/COW pages in specific and RO pages
+ * in general, RO pages shouldn't hurt as much anyway since
+ * they can be in shared cache state.
+ */
+ if (!pmd_write(pmd))
+ flags |= TNF_NO_GROUP;
+ /*
+ * Acquire the page lock to serialise THP migrations but avoid dropping
+ * page_table_lock if at all possible
+ */
+ page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
- put_page(page);
- goto clear_pmdnuma;
+ /* If the page was locked, there are no parallel migrations */
+ if (page_locked)
+ goto clear_pmdnuma;
+
+ /*
+ * Otherwise wait for potential migrations and retry. We do
+ * relock and check_same as the page may no longer be mapped.
+ * As the fault is being retried, do not account for it.
+ */
+ spin_unlock(&mm->page_table_lock);
+ wait_on_page_locked(page);
+ page_nid = -1;
+ goto out;
}
- /* Acquire the page lock to serialise THP migrations */
+ /* Page is misplaced, serialise migrations and parallel THP splits */
+ get_page(page);
spin_unlock(&mm->page_table_lock);
- lock_page(page);
+ if (!page_locked)
+ lock_page(page);
+ anon_vma = page_lock_anon_vma_read(page);
- /* Confirm the PTE did not while locked */
+ /* Confirm the PMD did not change while page_table_lock was released */
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
+ page_nid = -1;
goto out_unlock;
}
- spin_unlock(&mm->page_table_lock);
- /* Migrate the THP to the requested node */
+ /*
+ * Migrate the THP to the requested node, returns with page unlocked
+ * and pmd_numa cleared.
+ */
+ spin_unlock(&mm->page_table_lock);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (!migrated)
- goto check_same;
-
- task_numa_fault(target_nid, HPAGE_PMD_NR, true);
- return 0;
+ if (migrated) {
+ flags |= TNF_MIGRATED;
+ page_nid = target_nid;
+ }
-check_same:
- spin_lock(&mm->page_table_lock);
- if (unlikely(!pmd_same(pmd, *pmdp)))
- goto out_unlock;
+ goto out;
clear_pmdnuma:
+ BUG_ON(!PageLocked(page));
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
+ unlock_page(page);
out_unlock:
spin_unlock(&mm->page_table_lock);
- if (current_nid != -1)
- task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+
+out:
+ if (anon_vma)
+ page_unlock_anon_vma_read(anon_vma);
+
+ if (page_nid != -1)
+ task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+
return 0;
}
@@ -1432,6 +1473,12 @@ out:
return ret;
}
+/*
+ * Returns
+ * - 0 if PMD could not be locked
+ * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
+ * - HPAGE_PMD_NR is protections changed and TLB flush necessary
+ */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
@@ -1440,22 +1487,34 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
- entry = pmdp_get_and_clear(mm, addr, pmd);
+ ret = 1;
if (!prot_numa) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
+ ret = HPAGE_PMD_NR;
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
- /* only check non-shared pages */
- if (page_mapcount(page) == 1 &&
+ /*
+ * Do not trap faults against the zero page. The
+ * read-only data is likely to be read-cached on the
+ * local CPU cache and it is less useful to know about
+ * local vs remote hits on the zero page.
+ */
+ if (!is_huge_zero_page(page) &&
!pmd_numa(*pmd)) {
+ entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_mknuma(entry);
+ ret = HPAGE_PMD_NR;
}
}
- set_pmd_at(mm, addr, pmd, entry);
+
+ /* Set PMD if cleared earlier */
+ if (ret == HPAGE_PMD_NR)
+ set_pmd_at(mm, addr, pmd, entry);
+
spin_unlock(&vma->vm_mm->page_table_lock);
- ret = 1;
}
return ret;
@@ -1636,7 +1695,7 @@ static void __split_huge_page_refcount(struct page *page,
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
- page_nid_xchg_last(page_tail, page_nid_last(page));
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -2697,6 +2756,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
+again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2719,7 +2779,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
split_huge_page(page);
put_page(page);
- BUG_ON(pmd_trans_huge(*pmd));
+
+ /*
+ * We don't always have down_write of mmap_sem here: a racing
+ * do_huge_pmd_wp_page() might have copied-on-write to another
+ * huge page before our split_huge_page() got the anon_vma lock.
+ */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ goto again;
}
void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b49579c7f2a5..0b7656e804d1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -653,6 +653,7 @@ static void free_huge_page(struct page *page)
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
restore_reserve = PagePrivate(page);
+ ClearPagePrivate(page);
spin_lock(&hugetlb_lock);
hugetlb_cgroup_uncharge_page(hstate_index(h),
@@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
__SetPageHead(page);
+ __ClearPageReserved(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
__SetPageTail(p);
+ /*
+ * For gigantic hugepages allocated through bootmem at
+ * boot, it's safer to be consistent with the not-gigantic
+ * hugepages and clear the PG_reserved bit from all tail pages
+ * too. Otherwse drivers using get_user_pages() to access tail
+ * pages may get the reference counting wrong if they see
+ * PG_reserved set on a tail page (despite the head page not
+ * having PG_reserved set). Enforcing this consistency between
+ * head and tail pages allows drivers to optimize away a check
+ * on the head page when they need know if put_page() is needed
+ * after get_user_pages().
+ */
+ __ClearPageReserved(p);
set_page_count(p, 0);
p->first_page = page;
}
@@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void)
#else
page = virt_to_page(m);
#endif
- __ClearPageReserved(page);
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
+ WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
/*
* If we had gigantic hugepages allocated at boot time, we need
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 72467914b856..72f9decb0104 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -81,8 +81,9 @@ restart:
* decrement nr_to_walk first so that we don't livelock if we
* get stuck on large numbesr of LRU_RETRY items
*/
- if (--(*nr_to_walk) == 0)
+ if (!*nr_to_walk)
break;
+ --*nr_to_walk;
ret = isolate(item, &nlru->lock, cb_arg);
switch (ret) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c52ddbc839b..796820925de0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -54,6 +54,7 @@
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/lockdep.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -498,6 +499,29 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
+/*
+ * We restrict the id in the range of [1, 65535], so it can fit into
+ * an unsigned short.
+ */
+#define MEM_CGROUP_ID_MAX USHRT_MAX
+
+static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+{
+ /*
+ * The ID of the root cgroup is 0, but memcg treat 0 as an
+ * invalid ID, so we return (cgroup_id + 1).
+ */
+ return memcg->css.cgroup->id + 1;
+}
+
+static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_from_id(id - 1, &mem_cgroup_subsys);
+ return mem_cgroup_from_css(css);
+}
+
/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
@@ -569,16 +593,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- * There are two main reasons for not using the css_id for this:
- * 1) this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
- *
- * 2) In order not to violate the cgroup API, we would like to do all memory
- * allocation in ->create(). At that point, we haven't yet allocated the
- * css_id. Having a separate index prevents us from messing with the cgroup
- * core for this
+ * The main reason for not using cgroup id for this:
+ * this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
@@ -593,14 +612,14 @@ int memcg_limited_groups_array_size;
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
- * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
- * css_id space is not getting any smaller, and we don't have to necessarily
+ * cgrp_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE 65535
+#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
/*
* A lot of the calls to the cache allocation functions are expected to be
@@ -866,6 +885,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
unsigned long val = 0;
int cpu;
+ get_online_cpus();
for_each_online_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
@@ -873,6 +893,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
val += memcg->nocpu_base.events[idx];
spin_unlock(&memcg->pcp_counter_lock);
#endif
+ put_online_cpus();
return val;
}
@@ -1405,7 +1426,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;
- return css_is_ancestor(&memcg->css, &root_memcg->css);
+ return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -2044,6 +2065,12 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
return total;
}
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+ .name = "memcg_oom_lock",
+};
+#endif
+
static DEFINE_SPINLOCK(memcg_oom_lock);
/*
@@ -2081,7 +2108,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
}
iter->oom_lock = false;
}
- }
+ } else
+ mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
spin_unlock(&memcg_oom_lock);
@@ -2093,6 +2121,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
spin_lock(&memcg_oom_lock);
+ mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
for_each_mem_cgroup_tree(iter, memcg)
iter->oom_lock = false;
spin_unlock(&memcg_oom_lock);
@@ -2159,110 +2188,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
memcg_wakeup_oom(memcg);
}
-/*
- * try to call OOM killer
- */
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- bool locked;
- int wakeups;
-
if (!current->memcg_oom.may_oom)
return;
-
- current->memcg_oom.in_memcg_oom = 1;
-
/*
- * As with any blocking lock, a contender needs to start
- * listening for wakeups before attempting the trylock,
- * otherwise it can miss the wakeup from the unlock and sleep
- * indefinitely. This is just open-coded because our locking
- * is so particular to memcg hierarchies.
+ * We are in the middle of the charge context here, so we
+ * don't want to block when potentially sitting on a callstack
+ * that holds all kinds of filesystem and mm locks.
+ *
+ * Also, the caller may handle a failed allocation gracefully
+ * (like optional page cache readahead) and so an OOM killer
+ * invocation might not even be necessary.
+ *
+ * That's why we don't do anything here except remember the
+ * OOM context and then deal with it at the end of the page
+ * fault when the stack is unwound, the locks are released,
+ * and when we know whether the fault was overall successful.
*/
- wakeups = atomic_read(&memcg->oom_wakeups);
- mem_cgroup_mark_under_oom(memcg);
-
- locked = mem_cgroup_oom_trylock(memcg);
-
- if (locked)
- mem_cgroup_oom_notify(memcg);
-
- if (locked && !memcg->oom_kill_disable) {
- mem_cgroup_unmark_under_oom(memcg);
- mem_cgroup_out_of_memory(memcg, mask, order);
- mem_cgroup_oom_unlock(memcg);
- /*
- * There is no guarantee that an OOM-lock contender
- * sees the wakeups triggered by the OOM kill
- * uncharges. Wake any sleepers explicitely.
- */
- memcg_oom_recover(memcg);
- } else {
- /*
- * A system call can just return -ENOMEM, but if this
- * is a page fault and somebody else is handling the
- * OOM already, we need to sleep on the OOM waitqueue
- * for this memcg until the situation is resolved.
- * Which can take some time because it might be
- * handled by a userspace task.
- *
- * However, this is the charge context, which means
- * that we may sit on a large call stack and hold
- * various filesystem locks, the mmap_sem etc. and we
- * don't want the OOM handler to deadlock on them
- * while we sit here and wait. Store the current OOM
- * context in the task_struct, then return -ENOMEM.
- * At the end of the page fault handler, with the
- * stack unwound, pagefault_out_of_memory() will check
- * back with us by calling
- * mem_cgroup_oom_synchronize(), possibly putting the
- * task to sleep.
- */
- current->memcg_oom.oom_locked = locked;
- current->memcg_oom.wakeups = wakeups;
- css_get(&memcg->css);
- current->memcg_oom.wait_on_memcg = memcg;
- }
+ css_get(&memcg->css);
+ current->memcg_oom.memcg = memcg;
+ current->memcg_oom.gfp_mask = mask;
+ current->memcg_oom.order = order;
}
/**
* mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
*
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
*
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
* sleep on a waitqueue until the userspace task resolves the
* situation. Sleeping directly in the charge context with all kinds
* of locks held is not a good idea, instead we remember an OOM state
* in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
*
* Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
*/
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
{
+ struct mem_cgroup *memcg = current->memcg_oom.memcg;
struct oom_wait_info owait;
- struct mem_cgroup *memcg;
+ bool locked;
/* OOM is global, do not handle */
- if (!current->memcg_oom.in_memcg_oom)
- return false;
-
- /*
- * We invoked the OOM killer but there is a chance that a kill
- * did not free up any charges. Everybody else might already
- * be sleeping, so restart the fault and keep the rampage
- * going until some charges are released.
- */
- memcg = current->memcg_oom.wait_on_memcg;
if (!memcg)
- goto out;
+ return false;
- if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
- goto out_memcg;
+ if (!handle)
+ goto cleanup;
owait.memcg = memcg;
owait.wait.flags = 0;
@@ -2271,13 +2249,25 @@ bool mem_cgroup_oom_synchronize(void)
INIT_LIST_HEAD(&owait.wait.task_list);
prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
- /* Only sleep if we didn't miss any wakeups since OOM */
- if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ if (locked && !memcg->oom_kill_disable) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+ current->memcg_oom.order);
+ } else {
schedule();
- finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
- mem_cgroup_unmark_under_oom(memcg);
- if (current->memcg_oom.oom_locked) {
+ mem_cgroup_unmark_under_oom(memcg);
+ finish_wait(&memcg_oom_waitq, &owait.wait);
+ }
+
+ if (locked) {
mem_cgroup_oom_unlock(memcg);
/*
* There is no guarantee that an OOM-lock contender
@@ -2286,10 +2276,9 @@ out_memcg:
*/
memcg_oom_recover(memcg);
}
+cleanup:
+ current->memcg_oom.memcg = NULL;
css_put(&memcg->css);
- current->memcg_oom.wait_on_memcg = NULL;
-out:
- current->memcg_oom.in_memcg_oom = 0;
return true;
}
@@ -2703,6 +2692,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
|| fatal_signal_pending(current)))
goto bypass;
+ if (unlikely(task_in_memcg_oom(current)))
+ goto bypass;
+
/*
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
@@ -2800,8 +2792,10 @@ done:
*ptr = memcg;
return 0;
nomem:
- *ptr = NULL;
- return -ENOMEM;
+ if (!(gfp_mask & __GFP_NOFAIL)) {
+ *ptr = NULL;
+ return -ENOMEM;
+ }
bypass:
*ptr = root_mem_cgroup;
return -EINTR;
@@ -2850,15 +2844,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
- struct cgroup_subsys_state *css;
-
/* ID 0 is unused ID */
if (!id)
return NULL;
- css = css_lookup(&mem_cgroup_subsys, id);
- if (!css)
- return NULL;
- return mem_cgroup_from_css(css);
+ return mem_cgroup_from_id(id);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -3806,8 +3795,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
{
/* Update stat data for mem_cgroup */
preempt_disable();
- WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
- __this_cpu_add(from->stat->count[idx], -nr_pages);
+ __this_cpu_sub(from->stat->count[idx], nr_pages);
__this_cpu_add(to->stat->count[idx], nr_pages);
preempt_enable();
}
@@ -4375,7 +4363,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
* css_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, css_id(&memcg->css));
+ swap_cgroup_record(ent, mem_cgroup_id(memcg));
}
#endif
@@ -4427,8 +4415,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
{
unsigned short old_id, new_id;
- old_id = css_id(&from->css);
- new_id = css_id(&to->css);
+ old_id = mem_cgroup_id(from);
+ new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
@@ -4983,31 +4971,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
} while (usage > 0);
}
-/*
- * This mainly exists for tests during the setting of set of use_hierarchy.
- * Since this is the very setting we are changing, the current hierarchy value
- * is meaningless
- */
-static inline bool __memcg_has_children(struct mem_cgroup *memcg)
-{
- struct cgroup_subsys_state *pos;
-
- /* bounce at first found */
- css_for_each_child(pos, &memcg->css)
- return true;
- return false;
-}
-
-/*
- * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
- * to be already dead (as in mem_cgroup_force_empty, for instance). This is
- * from mem_cgroup_count_children(), in the sense that we don't really care how
- * many children we have; we only need to know if we have any. It also counts
- * any memcg without hierarchy as infertile.
- */
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
- return memcg->use_hierarchy && __memcg_has_children(memcg);
+ lockdep_assert_held(&memcg_create_mutex);
+ /*
+ * The lock does not prevent addition or deletion to the list
+ * of children, but it prevents a new child from being
+ * initialized based on this parent in css_online(), so it's
+ * enough to decide whether hierarchically inherited
+ * attributes can still be changed or not.
+ */
+ return memcg->use_hierarchy &&
+ !list_empty(&memcg->css.cgroup->children);
}
/*
@@ -5087,7 +5062,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
*/
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
(val == 1 || val == 0)) {
- if (!__memcg_has_children(memcg))
+ if (list_empty(&memcg->css.cgroup->children))
memcg->use_hierarchy = val;
else
retval = -EBUSY;
@@ -6204,7 +6179,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
size_t size = memcg_size();
mem_cgroup_remove_from_trees(memcg);
- free_css_id(&mem_cgroup_subsys, &memcg->css);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
@@ -6307,6 +6281,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
int error = 0;
+ if (css->cgroup->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;
+
if (!parent)
return 0;
@@ -6578,7 +6555,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
- css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
+ mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
@@ -6998,7 +6975,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
.bind = mem_cgroup_bind,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
- .use_id = 1,
};
#ifdef CONFIG_MEMCG_SWAP
diff --git a/mm/memory.c b/mm/memory.c
index e2bbba42604c..33a3dbec3cc8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
#include "internal.h"
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
make_migration_entry_read(&entry);
pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*src_pte))
+ pte = pte_swp_mksoft_dirty(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
}
@@ -2719,6 +2721,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
get_page(dirty_page);
reuse:
+ /*
+ * Clear the pages cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
+ */
+ if (old_page)
+ page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3519,13 +3529,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int current_nid)
+ unsigned long addr, int page_nid,
+ int *flags)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (current_nid == numa_node_id())
+ if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ *flags |= TNF_FAULT_LOCAL;
+ }
return mpol_misplaced(page, vma, addr);
}
@@ -3535,9 +3548,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page = NULL;
spinlock_t *ptl;
- int current_nid = -1;
+ int page_nid = -1;
+ int last_cpupid;
int target_nid;
bool migrated = false;
+ int flags = 0;
/*
* The "pte" at this point cannot be used safely without
@@ -3564,123 +3579,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(ptep, ptl);
return 0;
}
+ BUG_ON(is_zero_pfn(page_to_pfn(page)));
+
+ /*
+ * Avoid grouping on DSO/COW pages in specific and RO pages
+ * in general, RO pages shouldn't hurt as much anyway since
+ * they can be in shared cache state.
+ */
+ if (!pte_write(pte))
+ flags |= TNF_NO_GROUP;
- current_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+ /*
+ * Flag if the page is shared between multiple address spaces. This
+ * is later used when determining whether to group tasks together
+ */
+ if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+ flags |= TNF_SHARED;
+
+ last_cpupid = page_cpupid_last(page);
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
- /*
- * Account for the fault against the current node if it not
- * being replaced regardless of where the page is located.
- */
- current_nid = numa_node_id();
put_page(page);
goto out;
}
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, target_nid);
- if (migrated)
- current_nid = target_nid;
-
-out:
- if (current_nid != -1)
- task_numa_fault(current_nid, 1, migrated);
- return 0;
-}
-
-/* NUMA hinting page fault entry point for regular pmds */
-#ifdef CONFIG_NUMA_BALANCING
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
-{
- pmd_t pmd;
- pte_t *pte, *orig_pte;
- unsigned long _addr = addr & PMD_MASK;
- unsigned long offset;
- spinlock_t *ptl;
- bool numa = false;
- int local_nid = numa_node_id();
-
- spin_lock(&mm->page_table_lock);
- pmd = *pmdp;
- if (pmd_numa(pmd)) {
- set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
- numa = true;
+ migrated = migrate_misplaced_page(page, vma, target_nid);
+ if (migrated) {
+ page_nid = target_nid;
+ flags |= TNF_MIGRATED;
}
- spin_unlock(&mm->page_table_lock);
- if (!numa)
- return 0;
-
- /* we're in a page fault so some vma must be in the range */
- BUG_ON(!vma);
- BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
- offset = max(_addr, vma->vm_start) & ~PMD_MASK;
- VM_BUG_ON(offset >= PMD_SIZE);
- orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
- pte += offset >> PAGE_SHIFT;
- for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
- pte_t pteval = *pte;
- struct page *page;
- int curr_nid = local_nid;
- int target_nid;
- bool migrated;
- if (!pte_present(pteval))
- continue;
- if (!pte_numa(pteval))
- continue;
- if (addr >= vma->vm_end) {
- vma = find_vma(mm, addr);
- /* there's a pte present so there must be a vma */
- BUG_ON(!vma);
- BUG_ON(addr < vma->vm_start);
- }
- if (pte_numa(pteval)) {
- pteval = pte_mknonnuma(pteval);
- set_pte_at(mm, addr, pte, pteval);
- }
- page = vm_normal_page(vma, addr, pteval);
- if (unlikely(!page))
- continue;
- /* only check non-shared pages */
- if (unlikely(page_mapcount(page) != 1))
- continue;
-
- /*
- * Note that the NUMA fault is later accounted to either
- * the node that is currently running or where the page is
- * migrated to.
- */
- curr_nid = local_nid;
- target_nid = numa_migrate_prep(page, vma, addr,
- page_to_nid(page));
- if (target_nid == -1) {
- put_page(page);
- continue;
- }
-
- /* Migrate to the requested node */
- pte_unmap_unlock(pte, ptl);
- migrated = migrate_misplaced_page(page, target_nid);
- if (migrated)
- curr_nid = target_nid;
- task_numa_fault(curr_nid, 1, migrated);
-
- pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
- }
- pte_unmap_unlock(orig_pte, ptl);
-
- return 0;
-}
-#else
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp)
-{
- BUG();
+out:
+ if (page_nid != -1)
+ task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
-#endif /* CONFIG_NUMA_BALANCING */
/*
* These routines also need to handle stuff like marking pages dirty
@@ -3820,8 +3756,8 @@ retry:
}
}
- if (pmd_numa(*pmd))
- return do_pmd_numa_page(mm, vma, address, pmd);
+ /* THP should already have been handled */
+ BUG_ON(pmd_numa(*pmd));
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
@@ -3863,15 +3799,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
- mem_cgroup_enable_oom();
+ mem_cgroup_oom_enable();
ret = __handle_mm_fault(mm, vma, address, flags);
- if (flags & FAULT_FLAG_USER)
- mem_cgroup_disable_oom();
-
- if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
- mem_cgroup_oom_synchronize();
+ if (flags & FAULT_FLAG_USER) {
+ mem_cgroup_oom_disable();
+ /*
+ * The task may have entered a memcg OOM situation but
+ * if the allocation error was handled gracefully (no
+ * VM_FAULT_OOM), there is no need to kill anything.
+ * Just clean up the OOM state peacefully.
+ */
+ if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+ mem_cgroup_oom_synchronize(false);
+ }
return ret;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04729647f359..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
return pol;
}
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+{
+ struct mempolicy *pol = get_task_policy(task);
+ if (vma) {
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
+
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
+ } else if (vma->vm_policy) {
+ pol = vma->vm_policy;
+ }
+ }
+
+ if (!pol)
+ return default_policy.flags & MPOL_F_MOF;
+
+ return pol->flags & MPOL_F_MOF;
+}
+
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
@@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+ /* Never defer a private fault */
+ if (cpupid_match_pid(p, last_cpupid))
+ return false;
+
+ if (p->numa_migrate_deferred) {
+ p->numa_migrate_deferred--;
+ return true;
+ }
+ return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+ p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+ return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* mpol_misplaced - check whether current page node is valid in policy
*
@@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
+ int thiscpu = raw_smp_processor_id();
+ int thisnid = cpu_to_node(thiscpu);
int polnid = -1;
int ret = -1;
@@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
- int last_nid;
+ int last_cpupid;
+ int this_cpupid;
- polnid = numa_node_id();
+ polnid = thisnid;
+ this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
/*
* Multi-stage node selection is used in conjunction
@@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
* it less likely we act on an unlikely task<->page
* relation.
*/
- last_nid = page_nid_xchg_last(page, polnid);
- if (last_nid != polnid)
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+ /* See sysctl_numa_balancing_migrate_deferred comment */
+ if (!cpupid_match_pid(current, last_cpupid))
+ defer_numa_migrate(current);
+
+ goto out;
+ }
+
+ /*
+ * The quadratic filter above reduces extraneous migration
+ * of shared pages somewhat. This code reduces it even more,
+ * reducing the overhead of page migrations of shared pages.
+ * This makes workloads with shared pages rely more on
+ * "move task near its memory", and less on "move memory
+ * towards its task", which is exactly what we want.
+ */
+ if (numa_migrate_deferred(current, last_cpupid))
goto out;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index a26bccd44ccb..dfc8300ecbb2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
get_page(new);
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+ if (pte_swp_soft_dirty(*ptep))
+ pte = pte_mksoft_dirty(pte);
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
#ifdef CONFIG_HUGETLB_PAGE
@@ -443,6 +445,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
+ int cpupid;
+
if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
@@ -479,6 +483,13 @@ void migrate_page_copy(struct page *newpage, struct page *page)
__set_page_dirty_nobuffers(newpage);
}
+ /*
+ * Copy NUMA information to the new page, to prevent over-eager
+ * future migrations of this same page.
+ */
+ cpupid = page_cpupid_xchg_last(page, -1);
+ page_cpupid_xchg_last(newpage, cpupid);
+
mlock_migrate_page(newpage, page);
ksm_migrate_page(newpage, page);
/*
@@ -1498,7 +1509,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
__GFP_NOWARN) &
~GFP_IOFS, 0);
if (newpage)
- page_nid_xchg_last(newpage, page_nid_last(page));
+ page_cpupid_xchg_last(newpage, page_cpupid_last(page));
return newpage;
}
@@ -1599,7 +1610,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
* node. Caller is expected to have an elevated reference count on
* the page that will be dropped by this function before returning.
*/
-int migrate_misplaced_page(struct page *page, int node)
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+ int node)
{
pg_data_t *pgdat = NODE_DATA(node);
int isolated;
@@ -1607,10 +1619,11 @@ int migrate_misplaced_page(struct page *page, int node)
LIST_HEAD(migratepages);
/*
- * Don't migrate pages that are mapped in multiple processes.
- * TODO: Handle false sharing detection instead of this hammer
+ * Don't migrate file pages that are mapped in multiple processes
+ * with execute permissions as they are probably shared libraries.
*/
- if (page_mapcount(page) != 1)
+ if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+ (vma->vm_flags & VM_EXEC))
goto out;
/*
@@ -1661,13 +1674,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_cache(page);
/*
- * Don't migrate pages that are mapped in multiple processes.
- * TODO: Handle false sharing detection instead of this hammer
- */
- if (page_mapcount(page) != 1)
- goto out_dropref;
-
- /*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
@@ -1680,7 +1686,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
if (!new_page)
goto out_fail;
- page_nid_xchg_last(new_page, page_nid_last(page));
+ page_cpupid_xchg_last(new_page, page_cpupid_last(page));
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
@@ -1713,12 +1719,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
unlock_page(new_page);
put_page(new_page); /* Free it */
- unlock_page(page);
+ /* Retake the callers reference and putback on LRU */
+ get_page(page);
putback_lru_page(page);
-
- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
- isolated = 0;
- goto out;
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+ goto out_fail;
}
/*
@@ -1735,9 +1741,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
- page_add_new_anon_rmap(new_page, vma, haddr);
-
+ pmdp_clear_flush(vma, haddr, pmd);
set_pmd_at(mm, haddr, pmd, entry);
+ page_add_new_anon_rmap(new_page, vma, haddr);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(page);
/*
@@ -1756,7 +1762,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-out:
mod_zone_page_state(page_zone(page),
NR_ISOLATED_ANON + page_lru,
-HPAGE_PMD_NR);
@@ -1765,6 +1770,10 @@ out:
out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
out_dropref:
+ entry = pmd_mknonnuma(entry);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+
unlock_page(page);
put_page(page);
return 0;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 633c08863fd8..68562e92d50c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastnid %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
- LAST_NID_WIDTH,
+ LAST_CPUPID_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastnid %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d\n",
SECTIONS_SHIFT,
NODES_SHIFT,
ZONES_SHIFT,
- LAST_NID_SHIFT);
+ LAST_CPUPID_SHIFT);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
- "Section %lu Node %lu Zone %lu Lastnid %lu\n",
+ "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
(unsigned long)SECTIONS_PGSHIFT,
(unsigned long)NODES_PGSHIFT,
(unsigned long)ZONES_PGSHIFT,
- (unsigned long)LAST_NID_PGSHIFT);
+ (unsigned long)LAST_CPUPID_PGSHIFT);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
"Node/Zone ID: %lu -> %lu\n",
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
"Node not in page flags");
#endif
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
- "Last nid not in page flags");
+ "Last cpupid not in page flags");
#endif
if (SECTIONS_WIDTH) {
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afbd68f3..bf34fb8556db 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
INIT_LIST_HEAD(&lruvec->lists[lru]);
}
-#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
-int page_nid_xchg_last(struct page *page, int nid)
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
+int page_cpupid_xchg_last(struct page *page, int cpupid)
{
unsigned long old_flags, flags;
- int last_nid;
+ int last_cpupid;
do {
old_flags = flags = page->flags;
- last_nid = page_nid_last(page);
+ last_cpupid = page_cpupid_last(page);
- flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
- flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
+ flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
+ flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
- return last_nid;
+ return last_cpupid;
}
#endif
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4d6b43..a597f2ffcd6f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa, bool *ret_all_same_node)
+ int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
- bool all_same_node = true;
- int last_nid = -1;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = vm_normal_page(vma, addr, oldpte);
if (page) {
- int this_nid = page_to_nid(page);
- if (last_nid == -1)
- last_nid = this_nid;
- if (last_nid != this_nid)
- all_same_node = false;
-
- /* only check non-shared pages */
- if (!pte_numa(oldpte) &&
- page_mapcount(page) == 1) {
+ if (!pte_numa(oldpte)) {
ptent = pte_mknuma(ptent);
updated = true;
}
@@ -94,40 +84,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
+ pte_t newpte;
/*
* A protection check is difficult so
* just be safe and disable write
*/
make_migration_entry_read(&entry);
- set_pte_at(mm, addr, pte,
- swp_entry_to_pte(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ set_pte_at(mm, addr, pte, newpte);
+
+ pages++;
}
- pages++;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
- *ret_all_same_node = all_same_node;
return pages;
}
-#ifdef CONFIG_NUMA_BALANCING
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmd)
-{
- spin_lock(&mm->page_table_lock);
- set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
- spin_unlock(&mm->page_table_lock);
-}
-#else
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
- pmd_t *pmd)
-{
- BUG();
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -135,34 +112,33 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pmd_t *pmd;
unsigned long next;
unsigned long pages = 0;
- bool all_same_node;
pmd = pmd_offset(pud, addr);
do {
+ unsigned long this_pages;
+
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
- else if (change_huge_pmd(vma, pmd, addr, newprot,
- prot_numa)) {
- pages += HPAGE_PMD_NR;
- continue;
+ else {
+ int nr_ptes = change_huge_pmd(vma, pmd, addr,
+ newprot, prot_numa);
+
+ if (nr_ptes) {
+ if (nr_ptes == HPAGE_PMD_NR)
+ pages++;
+
+ continue;
+ }
}
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
continue;
- pages += change_pte_range(vma, pmd, addr, next, newprot,
- dirty_accountable, prot_numa, &all_same_node);
-
- /*
- * If we are changing protections for NUMA hinting faults then
- * set pmd_numa if the examined pages were all on the same
- * node. This allows a regular PMD to be handled as one fault
- * and effectively batches the taking of the PTL
- */
- if (prot_numa && all_same_node)
- change_pmd_protnuma(vma->vm_mm, addr, pmd);
+ this_pages = change_pte_range(vma, pmd, addr, next, newprot,
+ dirty_accountable, prot_numa);
+ pages += this_pages;
} while (pmd++, addr = next, addr != end);
return pages;
diff --git a/mm/mremap.c b/mm/mremap.c
index 91b13d6a16d4..0843feb66f3d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,7 +25,6 @@
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
#include "internal.h"
@@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
pmd = pmd_alloc(mm, pud, addr);
- if (!pmd) {
- pud_free(mm, pud);
+ if (!pmd)
return NULL;
- }
VM_BUG_ON(pmd_trans_huge(*pmd));
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d274381..6738c47f1f72 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
{
struct zonelist *zonelist;
- if (mem_cgroup_oom_synchronize())
+ if (mem_cgroup_oom_synchronize(true))
return;
zonelist = node_zonelist(first_online_node, GFP_KERNEL);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f5236f804aa6..63807583d8e8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
return 1;
}
-static long bdi_max_pause(struct backing_dev_info *bdi,
- unsigned long bdi_dirty)
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
+ unsigned long bdi_dirty)
{
- long bw = bdi->avg_write_bandwidth;
- long t;
+ unsigned long bw = bdi->avg_write_bandwidth;
+ unsigned long t;
/*
* Limit pause time for small memory systems. If sleeping for too long
@@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi,
t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
- return min_t(long, t, MAX_PAUSE);
+ return min_t(unsigned long, t, MAX_PAUSE);
}
static long bdi_min_pause(struct backing_dev_info *bdi,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd886fac451a..73d812f16dde 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
- page_nid_reset_last(page);
+ page_cpupid_reset_last(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -4015,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
- page_nid_reset_last(page);
+ page_cpupid_reset_last(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 5da2cbcfdbb5..2beeabf502c5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -242,7 +242,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
if (err)
break;
pgd++;
- } while (addr = next, addr != end);
+ } while (addr = next, addr < end);
return err;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 8c8e08f3a692..0d10defe951e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1706,8 +1706,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
- free_fn(areas[group],
- ai->groups[group].nr_units * ai->unit_size);
+ if (areas[group])
+ free_fn(areas[group],
+ ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
if (areas)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a3443278ce3a..e2e98af703ea 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -56,6 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
continue;
}
+#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
/*
* For simplicity, we won't check this in the list of memcg
* caches. We have control over memcg naming, and if there
@@ -69,6 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
s = NULL;
return -EINVAL;
}
+#endif
}
WARN_ON(strchr(name, ' ')); /* It confuses parsers */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3963fc24fcc1..de7c904e52e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct filename *pathname;
int i, type, prev;
int err;
+ unsigned int old_block_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
}
swap_file = p->swap_file;
+ old_block_size = p->old_block_size;
p->swap_file = NULL;
p->max = 0;
swap_map = p->swap_map;
@@ -1938,7 +1940,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
- set_blocksize(bdev, p->old_block_size);
+ set_blocksize(bdev, old_block_size);
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
} else {
mutex_lock(&inode->i_mutex);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 53f2f82f83ae..eea668d9cff6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -211,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker)
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
+ kfree(shrinker->nr_deferred);
}
EXPORT_SYMBOL(unregister_shrinker);
diff --git a/mm/zswap.c b/mm/zswap.c
index 841e35f1db22..d93510c6aa2d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -804,6 +804,10 @@ static void zswap_frontswap_invalidate_area(unsigned type)
}
tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);
+
+ zbud_destroy_pool(tree->pool);
+ kfree(tree);
+ zswap_trees[type] = NULL;
}
static struct zbud_ops zswap_zbud_ops = {