summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorMarcel Ziswiler <marcel.ziswiler@toradex.com>2020-02-04 11:31:11 +0100
committerMarcel Ziswiler <marcel.ziswiler@toradex.com>2020-02-07 13:50:20 +0100
commit500f9a04cd76f504285ad99b15e390e20ab17e0c (patch)
treee1c5ff3a78e75f0847b47fbac63d279dd411e7e5 /mm
parent0f549d8c4d5edd68a2a492afd48228458d3bca4a (diff)
parent6d0c334a400db31751c787c411e7187ab59a3f1d (diff)
Merge tag 'v4.14.164' into 4.14-2.3.x-imx
This is the 4.14.164 stable release Conflicts: arch/arm/Kconfig.debug arch/arm/boot/dts/imx7s.dtsi arch/arm/mach-imx/cpuidle-imx6q.c arch/arm/mach-imx/cpuidle-imx6sx.c arch/arm64/kernel/cpu_errata.c arch/arm64/kvm/hyp/tlb.c drivers/crypto/caam/caamalg.c drivers/crypto/mxs-dcp.c drivers/dma/imx-sdma.c drivers/gpio/gpio-vf610.c drivers/gpu/drm/bridge/adv7511/adv7511_drv.c drivers/input/keyboard/imx_keypad.c drivers/input/keyboard/snvs_pwrkey.c drivers/mmc/core/block.c drivers/mmc/core/queue.h drivers/mmc/host/sdhci-esdhc-imx.c drivers/net/can/flexcan.c drivers/net/can/rx-offload.c drivers/net/ethernet/freescale/fec_main.c drivers/net/wireless/ath/ath10k/pci.c drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c drivers/pci/dwc/pci-imx6.c drivers/spi/spi-fsl-lpspi.c drivers/usb/dwc3/gadget.c include/net/tcp.h sound/soc/fsl/Kconfig sound/soc/fsl/fsl_esai.c
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/cma.c40
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/compaction.c35
-rw-r--r--mm/filemap.c25
-rw-r--r--mm/gup.c60
-rw-r--r--mm/huge_memory.c4
-rw-r--r--mm/hugetlb.c78
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h10
-rw-r--r--mm/khugepaged.c3
-rw-r--r--mm/kmemleak.c20
-rw-r--r--mm/ksm.c14
-rw-r--r--mm/list_lru.c10
-rw-r--r--mm/memcontrol.c51
-rw-r--r--mm/memory-failure.c14
-rw-r--r--mm/memory.c20
-rw-r--r--mm/memory_hotplug.c131
-rw-r--r--mm/mempolicy.c50
-rw-r--r--mm/migrate.c22
-rw-r--r--mm/mincore.c23
-rw-r--r--mm/mlock.c4
-rw-r--r--mm/mmap.c20
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/oom_kill.c5
-rw-r--r--mm/page-writeback.c33
-rw-r--r--mm/page_alloc.c16
-rw-r--r--mm/page_ext.c5
-rw-r--r--mm/page_idle.c4
-rw-r--r--mm/page_owner.c5
-rw-r--r--mm/page_poison.c4
-rw-r--r--mm/percpu-km.c5
-rw-r--r--mm/percpu.c17
-rw-r--r--mm/shmem.c34
-rw-r--r--mm/slab.c26
-rw-r--r--mm/slub.c13
-rw-r--r--mm/usercopy.c10
-rw-r--r--mm/userfaultfd.c3
-rw-r--r--mm/vmalloc.c17
-rw-r--r--mm/vmscan.c73
-rw-r--r--mm/vmstat.c14
-rw-r--r--mm/zsmalloc.c85
43 files changed, 660 insertions, 352 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 59efbd3337e0..cc4d633947bf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -708,12 +708,12 @@ config MIGRATE_VMA_HELPER
config HMM
bool
+ select MMU_NOTIFIER
select MIGRATE_VMA_HELPER
config HMM_MIRROR
bool "HMM mirror CPU page table into a device page table"
depends on ARCH_HAS_HMM
- select MMU_NOTIFIER
select HMM
help
Select HMM_MIRROR if you want to mirror range of the CPU page table of a
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 9386c98dac12..6fa31754eadd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -684,6 +684,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
+ init_rwsem(&bdi->wb_switch_rwsem);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
diff --git a/mm/cma.c b/mm/cma.c
index 022e52bd8370..c4a34c813d47 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -105,8 +105,10 @@ static int __init cma_activate_area(struct cma *cma)
cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!cma->bitmap)
+ if (!cma->bitmap) {
+ cma->count = 0;
return -ENOMEM;
+ }
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -275,6 +277,12 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+ if (fixed && base & (alignment - 1)) {
+ ret = -EINVAL;
+ pr_err("Region at %pa must be aligned to %pa bytes\n",
+ &base, &alignment);
+ goto err;
+ }
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
@@ -305,6 +313,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
if (limit == 0 || limit > memblock_end)
limit = memblock_end;
+ if (base + size > limit) {
+ ret = -EINVAL;
+ pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+ &size, &base, &limit);
+ goto err;
+ }
+
/* Reserve memory */
if (fixed) {
if (memblock_is_region_reserved(base, size) ||
@@ -348,12 +363,14 @@ int __init cma_declare_contiguous(phys_addr_t base,
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
if (ret)
- goto err;
+ goto free_mem;
pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
&base);
return 0;
+free_mem:
+ memblock_free(base, size);
err:
pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
return ret;
@@ -362,23 +379,26 @@ err:
#ifdef CONFIG_CMA_DEBUG
static void cma_debug_show_areas(struct cma *cma)
{
- unsigned long next_zero_bit, next_set_bit;
+ unsigned long next_zero_bit, next_set_bit, nr_zero;
unsigned long start = 0;
- unsigned int nr_zero, nr_total = 0;
+ unsigned long nr_part, nr_total = 0;
+ unsigned long nbits = cma_bitmap_maxno(cma);
mutex_lock(&cma->lock);
pr_info("number of available pages: ");
for (;;) {
- next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start);
- if (next_zero_bit >= cma->count)
+ next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
+ if (next_zero_bit >= nbits)
break;
- next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit);
+ next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
nr_zero = next_set_bit - next_zero_bit;
- pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit);
- nr_total += nr_zero;
+ nr_part = nr_zero << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
+ next_zero_bit);
+ nr_total += nr_part;
start = next_zero_bit + nr_zero;
}
- pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count);
+ pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
mutex_unlock(&cma->lock);
}
#else
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 275df8b5b22e..c47ea3cfee79 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -58,7 +58,7 @@ static int cma_maxchunk_get(void *data, u64 *val)
mutex_lock(&cma->lock);
for (;;) {
start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
- if (start >= cma->count)
+ if (start >= bitmap_maxno)
break;
end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
diff --git a/mm/compaction.c b/mm/compaction.c
index 85395dc6eb13..eb8e7f5d3a08 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1517,6 +1517,17 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
unsigned long end_pfn = zone_end_pfn(zone);
const bool sync = cc->mode != MIGRATE_ASYNC;
+ /*
+ * These counters track activities during zone compaction. Initialize
+ * them before compacting a new zone.
+ */
+ cc->total_migrate_scanned = 0;
+ cc->total_free_scanned = 0;
+ cc->nr_migratepages = 0;
+ cc->nr_freepages = 0;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
+
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
@@ -1680,10 +1691,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
{
enum compact_result ret;
struct compact_control cc = {
- .nr_freepages = 0,
- .nr_migratepages = 0,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.order = order,
.gfp_mask = gfp_mask,
.zone = zone,
@@ -1696,8 +1703,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
};
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
ret = compact_zone(zone, &cc);
@@ -1796,8 +1801,6 @@ static void compact_node(int nid)
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
@@ -1811,11 +1814,7 @@ static void compact_node(int nid)
if (!populated_zone(zone))
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
compact_zone(zone, &cc);
@@ -1924,8 +1923,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct zone *zone;
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = true,
@@ -1950,16 +1947,10 @@ static void kcompactd_do_work(pg_data_t *pgdat)
COMPACT_CONTINUE)
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
- cc.total_migrate_scanned = 0;
- cc.total_free_scanned = 0;
- cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
-
if (kthread_should_stop())
return;
+
+ cc.zone = zone;
status = compact_zone(zone, &cc);
if (status == COMPACT_SUCCESS) {
diff --git a/mm/filemap.c b/mm/filemap.c
index e2e738cc08b1..a30dbf93de99 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -338,7 +338,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping))
+ if (!mapping_cap_writeback_dirty(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -464,6 +465,28 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
+ * filemap_fdatawait_range_keep_errors - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space in the
+ * given range and wait for all of them. Unlike filemap_fdatawait_range(),
+ * this function does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves. Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return filemap_check_and_keep_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
+
+/**
* file_fdatawait_range - wait for writeback to complete
* @file: file pointing to address space structure to wait for
* @start_byte: offset in bytes where the range starts
diff --git a/mm/gup.c b/mm/gup.c
index 4cc8a6ff0f56..12b9626b1a9e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -153,7 +153,10 @@ retry:
}
if (flags & FOLL_GET) {
- get_page(page);
+ if (unlikely(!try_get_page(page))) {
+ page = ERR_PTR(-ENOMEM);
+ goto out;
+ }
/* drop the pgmap reference now that we hold the page */
if (pgmap) {
@@ -280,7 +283,10 @@ retry_locked:
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
} else {
- get_page(page);
+ if (unlikely(!try_get_page(page))) {
+ spin_unlock(ptl);
+ return ERR_PTR(-ENOMEM);
+ }
spin_unlock(ptl);
lock_page(page);
ret = split_huge_page(page);
@@ -436,11 +442,14 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pgd = pgd_offset_k(address);
else
pgd = pgd_offset_gate(mm, address);
- BUG_ON(pgd_none(*pgd));
+ if (pgd_none(*pgd))
+ return -EFAULT;
p4d = p4d_offset(pgd, address);
- BUG_ON(p4d_none(*p4d));
+ if (p4d_none(*p4d))
+ return -EFAULT;
pud = pud_offset(p4d, address);
- BUG_ON(pud_none(*pud));
+ if (pud_none(*pud))
+ return -EFAULT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return -EFAULT;
@@ -464,7 +473,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if (is_device_public_page(*page))
goto unmap;
}
- get_page(*page);
+ if (unlikely(!try_get_page(*page))) {
+ ret = -ENOMEM;
+ goto unmap;
+ }
out:
ret = 0;
unmap:
@@ -1355,7 +1367,8 @@ static inline pte_t gup_get_pte(pte_t *ptep)
}
#endif
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+ struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];
@@ -1365,6 +1378,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
}
+/*
+ * Return the compund head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
@@ -1399,9 +1426,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ head = try_get_compound_head(page, 1);
+ if (!head)
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
@@ -1537,8 +1564,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pmd_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pmd_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1575,8 +1602,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pud_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pud_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1612,8 +1639,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pgd_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pgd_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1643,7 +1670,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (!pmd_present(pmd))
return 0;
- if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+ if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+ pmd_devmap(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 930f2aa3bb4d..1adc2e6c50f9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,7 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
+#include <linux/page_owner.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -2387,6 +2388,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
ClearPageCompound(head);
+
+ split_page_owner(head, HPAGE_PMD_ORDER);
+
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to radix tree of swap cache */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 224cdd953a79..310656b4ede6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1081,11 +1081,10 @@ static bool pfn_range_valid_gigantic(struct zone *z,
struct page *page;
for (i = start_pfn; i < end_pfn; i++) {
- if (!pfn_valid(i))
+ page = pfn_to_online_page(i);
+ if (!page)
return false;
- page = pfn_to_page(i);
-
if (page_zone(page) != z)
return false;
@@ -1271,12 +1270,23 @@ void free_huge_page(struct page *page)
ClearPagePrivate(page);
/*
- * A return code of zero implies that the subpool will be under its
- * minimum size if the reservation is not restored after page is free.
- * Therefore, force restore_reserve operation.
+ * If PagePrivate() was set on page, page allocation consumed a
+ * reservation. If the page was associated with a subpool, there
+ * would have been a page reserved in the subpool before allocation
+ * via hugepage_subpool_get_pages(). Since we are 'restoring' the
+ * reservtion, do not call hugepage_subpool_put_pages() as this will
+ * remove the reserved page from the subpool.
*/
- if (hugepage_subpool_put_pages(spool, 1) == 0)
- restore_reserve = true;
+ if (!restore_reserve) {
+ /*
+ * A return code of zero implies that the subpool will be
+ * under its minimum size if the reservation is not restored
+ * after page is free. Therefore, force restore_reserve
+ * operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+ }
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
@@ -3577,7 +3587,6 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
mmun_start = address & huge_page_mask(h);
mmun_end = mmun_start + huge_page_size(h);
@@ -3600,6 +3609,7 @@ retry_avoidcopy:
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, address);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -3682,6 +3692,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
pte_t new_pte;
spinlock_t *ptl;
+ bool new_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -3728,8 +3739,8 @@ retry:
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, address);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx,
+ address);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3747,7 +3758,7 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
+ new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3818,6 +3829,15 @@ retry:
}
spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
@@ -3832,21 +3852,14 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
unsigned long key[2];
u32 hash;
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
@@ -3857,9 +3870,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
return 0;
@@ -3905,7 +3916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, address);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4053,7 +4064,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* the set_pte_at() write.
*/
__SetPageUptodate(page);
- set_page_huge_active(page);
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -4121,6 +4131,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
+ set_page_huge_active(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4245,6 +4256,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
+
+ /*
+ * Instead of doing 'try_get_page()' below in the same_page
+ * loop, just check the count once here.
+ */
+ if (unlikely(page_count(page) <= 0)) {
+ if (pages) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
+ }
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index eec1150125b9..e430e04997ee 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
- if (!css_tryget_online(&h_cg->css)) {
+ if (!css_tryget(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
diff --git a/mm/internal.h b/mm/internal.h
index 1df011f62480..a182506242c4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -455,6 +455,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define NODE_RECLAIM_SOME 0
#define NODE_RECLAIM_SUCCESS 1
+#ifdef CONFIG_NUMA
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+#else
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
+{
+ return NODE_RECLAIM_NOSCAN;
+}
+#endif
+
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d27a73737f1a..acf66c5dc9bd 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1006,6 +1006,9 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
+ result = SCAN_ANY_PROCESS;
+ if (!mmget_still_valid(mm))
+ goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d9e0be2a8189..d779181bed4d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -577,7 +577,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
if (in_irq()) {
object->pid = 0;
strncpy(object->comm, "hardirq", sizeof(object->comm));
- } else if (in_softirq()) {
+ } else if (in_serving_softirq()) {
object->pid = 0;
strncpy(object->comm, "softirq", sizeof(object->comm));
} else {
@@ -1364,6 +1364,7 @@ static void scan_block(void *_start, void *_end,
/*
* Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
*/
+#ifdef CONFIG_SMP
static void scan_large_block(void *start, void *end)
{
void *next;
@@ -1375,6 +1376,7 @@ static void scan_large_block(void *start, void *end)
cond_resched();
}
}
+#endif
/*
* Scan a memory block corresponding to a kmemleak_object. A condition is
@@ -1492,11 +1494,6 @@ static void kmemleak_scan(void)
}
rcu_read_unlock();
- /* data/bss scanning */
- scan_large_block(_sdata, _edata);
- scan_large_block(__bss_start, __bss_stop);
- scan_large_block(__start_ro_after_init, __end_ro_after_init);
-
#ifdef CONFIG_SMP
/* per-cpu sections scanning */
for_each_possible_cpu(i)
@@ -2027,6 +2024,17 @@ void __init kmemleak_init(void)
}
local_irq_restore(flags);
+ /* register the data/bss sections */
+ create_object((unsigned long)_sdata, _edata - _sdata,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+ create_object((unsigned long)__bss_start, __bss_stop - __bss_start,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+ /* only register .data..ro_after_init if not within .data */
+ if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata)
+ create_object((unsigned long)__start_ro_after_init,
+ __end_ro_after_init - __start_ro_after_init,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+
/*
* This is the point where tracking allocations is safe. Automatic
* scanning is started during the late initcall. Add the early logged
diff --git a/mm/ksm.c b/mm/ksm.c
index f50cc573815f..764486ffcd16 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -849,13 +849,13 @@ static int remove_stable_node(struct stable_node *stable_node)
return 0;
}
- if (WARN_ON_ONCE(page_mapped(page))) {
- /*
- * This should not happen: but if it does, just refuse to let
- * merge_across_nodes be switched - there is no need to panic.
- */
- err = -EBUSY;
- } else {
+ /*
+ * Page could be still mapped if this races with __mmput() running in
+ * between ksm_exit() and exit_mmap(). Just refuse to let
+ * merge_across_nodes/max_page_sharing be switched.
+ */
+ err = -EBUSY;
+ if (!page_mapped(page)) {
/*
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f141f0c80ff3..d67348afd1dc 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -42,11 +42,7 @@ static void list_lru_unregister(struct list_lru *lru)
#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
- /*
- * This needs node 0 to be always present, even
- * in the systems supporting sparse numa ids.
- */
- return !!lru->node[0].memcg_lrus;
+ return lru->memcg_aware;
}
static inline struct list_lru_one *
@@ -317,7 +313,7 @@ static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
}
return 0;
fail:
- __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
+ __memcg_destroy_list_lru_node(memcg_lrus, begin, i);
return -ENOMEM;
}
@@ -389,6 +385,8 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
int i;
+ lru->memcg_aware = memcg_aware;
+
if (!memcg_aware)
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6a9a7e1066ef..326525a97c47 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -725,7 +725,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (unlikely(!memcg))
memcg = root_mem_cgroup;
}
- } while (!css_tryget_online(&memcg->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -871,26 +871,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
css_put(&prev->css);
}
-static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
+ struct mem_cgroup *dead_memcg)
{
- struct mem_cgroup *memcg = dead_memcg;
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
int i;
- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ for_each_node(nid) {
+ mz = mem_cgroup_nodeinfo(from, nid);
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
}
}
}
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup *last;
+
+ do {
+ __invalidate_reclaim_iterators(memcg, dead_memcg);
+ last = memcg;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ /*
+ * When cgruop1 non-hierarchy mode is used,
+ * parent_mem_cgroup() does not walk all the way up to the
+ * cgroup root (root_mem_cgroup). So we have to handle
+ * dead_memcg from cgroup root separately.
+ */
+ if (last != root_mem_cgroup)
+ __invalidate_reclaim_iterators(root_mem_cgroup,
+ dead_memcg);
+}
+
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -2333,6 +2352,16 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+ /*
+ * Enforce __GFP_NOFAIL allocation because callers are not
+ * prepared to see failures and likely do not have any failure
+ * handling code.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->kmem, nr_pages);
+ return 0;
+ }
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ef080fa682a6..001b6bfccbfb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1701,19 +1701,17 @@ static int soft_offline_in_use_page(struct page *page, int flags)
struct page *hpage = compound_head(page);
if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(hpage);
- if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
- unlock_page(hpage);
- if (!PageAnon(hpage))
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unlock_page(page);
+ if (!PageAnon(page))
pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
else
pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(hpage);
+ put_hwpoison_page(page);
return -EBUSY;
}
- unlock_page(hpage);
- get_hwpoison_page(page);
- put_hwpoison_page(hpage);
+ unlock_page(page);
}
if (PageHuge(page))
diff --git a/mm/memory.c b/mm/memory.c
index fb9f7737c1ff..e9bce27bc18c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1804,14 +1804,21 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* in may not match the PFN we have mapped if the
* mapped PFN is a writeable COW page. In the mkwrite
* case we are creating a writable PTE for a shared
- * mapping and we expect the PFNs to match.
+ * mapping and we expect the PFNs to match. If they
+ * don't match, we are likely racing with block
+ * allocation and mapping invalidation so just skip the
+ * update.
*/
- if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
goto out_unlock;
- entry = *pte;
- goto out_mkwrite;
- } else
- goto out_unlock;
+ }
+ entry = pte_mkyoung(*pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+ update_mmu_cache(vma, addr, pte);
+ }
+ goto out_unlock;
}
/* Ok, finally just insert the thing.. */
@@ -1820,7 +1827,6 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
-out_mkwrite:
if (mkwrite) {
entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c7c74a927d6f..2d6626ab29d1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -343,12 +343,8 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
-
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(start_pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -368,15 +364,12 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
unsigned long pfn;
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
@@ -398,7 +391,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
unsigned long zone_end_pfn = z;
unsigned long pfn;
- struct mem_section *ms;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
@@ -436,9 +428,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
*/
pfn = zone_start_pfn;
for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (page_zone(pfn_to_page(pfn)) != zone)
@@ -459,70 +449,33 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writeunlock(zone);
}
-static void shrink_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
+static void update_pgdat_span(struct pglist_data *pgdat)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
- unsigned long pfn;
- struct mem_section *ms;
- int nid = pgdat->node_id;
-
- if (pgdat_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the pgdat, it need
- * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
- pgdat_end_pfn);
- if (pfn) {
- pgdat->node_start_pfn = pfn;
- pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
- }
- } else if (pgdat_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the pgdat, it need
- * shrink pgdat->node_spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
- start_pfn);
- if (pfn)
- pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
- }
-
- /*
- * If the section is not biggest or smallest mem_section in the pgdat,
- * it only creates a hole in the pgdat. So in this case, we need not
- * change the pgdat.
- * But perhaps, the pgdat has only hole data. Thus it check the pgdat
- * has only hole or not.
- */
- pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
+ unsigned long node_start_pfn = 0, node_end_pfn = 0;
+ struct zone *zone;
- if (unlikely(!valid_section(ms)))
- continue;
+ for (zone = pgdat->node_zones;
+ zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ unsigned long zone_end_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
- if (pfn_to_nid(pfn) != nid)
+ /* No need to lock the zones, they can't change. */
+ if (!zone->spanned_pages)
continue;
-
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
+ if (!node_end_pfn) {
+ node_start_pfn = zone->zone_start_pfn;
+ node_end_pfn = zone_end_pfn;
continue;
+ }
- /* If we find valid section, we have nothing to do */
- return;
+ if (zone_end_pfn > node_end_pfn)
+ node_end_pfn = zone_end_pfn;
+ if (zone->zone_start_pfn < node_start_pfn)
+ node_start_pfn = zone->zone_start_pfn;
}
- /* The pgdat has no valid section */
- pgdat->node_start_pfn = 0;
- pgdat->node_spanned_pages = 0;
+ pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
static void __remove_zone(struct zone *zone, unsigned long start_pfn)
@@ -531,9 +484,19 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
int nr_pages = PAGES_PER_SECTION;
unsigned long flags;
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+ update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
@@ -1110,7 +1073,12 @@ static int online_memory_block(struct memory_block *mem, void *arg)
return device_online(&mem->dev);
}
-/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+/*
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations (triggered e.g. by sysfs).
+ *
+ * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
+ */
int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
@@ -1203,9 +1171,9 @@ out:
mem_hotplug_done();
return ret;
}
-EXPORT_SYMBOL_GPL(add_memory_resource);
-int __ref add_memory(int nid, u64 start, u64 size)
+/* requires device_hotplug_lock, see add_memory_resource() */
+int __ref __add_memory(int nid, u64 start, u64 size)
{
struct resource *res;
int ret;
@@ -1219,6 +1187,17 @@ int __ref add_memory(int nid, u64 start, u64 size)
release_memory_resource(res);
return ret;
}
+
+int add_memory(int nid, u64 start, u64 size)
+{
+ int rc;
+
+ lock_device_hotplug();
+ rc = __add_memory(nid, start, size);
+ unlock_device_hotplug();
+
+ return rc;
+}
EXPORT_SYMBOL_GPL(add_memory);
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -1256,7 +1235,8 @@ static struct page *next_active_pageblock(struct page *page)
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
struct page *page = pfn_to_page(start_pfn);
- struct page *end_page = page + nr_pages;
+ unsigned long end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(page)));
+ struct page *end_page = pfn_to_page(end_pfn);
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
@@ -1296,6 +1276,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
i++;
if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
continue;
+ /* Check if we got outside of the zone */
+ if (zone && !zone_spans_pfn(zone, pfn + i))
+ return 0;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1b93535d875f..a37cfa88669e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -305,7 +305,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
else {
nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
*nodes);
- pol->w.cpuset_mems_allowed = tmp;
+ pol->w.cpuset_mems_allowed = *nodes;
}
if (nodes_empty(tmp))
@@ -349,7 +349,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -427,6 +427,13 @@ static inline bool queue_pages_required(struct page *page,
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
+/*
+ * queue_pages_pmd() has three possible return values:
+ * 1 - pages are placed on the right node or queued successfully.
+ * 0 - THP was split.
+ * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing
+ * page was already on a node that does not follow the policy.
+ */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
@@ -436,7 +443,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long flags;
if (unlikely(is_pmd_migration_entry(*pmd))) {
- ret = 1;
+ ret = -EIO;
goto unlock;
}
page = pmd_page(*pmd);
@@ -462,8 +469,15 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
ret = 1;
flags = qp->flags;
/* go to thp migration */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ if (!vma_migratable(walk->vma)) {
+ ret = -EIO;
+ goto unlock;
+ }
+
migrate_page_add(page, qp->pagelist, flags);
+ } else
+ ret = -EIO;
unlock:
spin_unlock(ptl);
out:
@@ -488,8 +502,10 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
- if (ret)
+ if (ret > 0)
return 0;
+ else if (ret < 0)
+ return ret;
}
if (pmd_trans_unstable(pmd))
@@ -526,11 +542,16 @@ retry:
goto retry;
}
- migrate_page_add(page, qp->pagelist, flags);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ if (!vma_migratable(vma))
+ break;
+ migrate_page_add(page, qp->pagelist, flags);
+ } else
+ break;
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
- return 0;
+ return addr != end ? -EIO : 0;
}
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
@@ -600,7 +621,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
- if (!vma_migratable(vma))
+ /*
+ * Need check MPOL_MF_STRICT to return -EIO if possible
+ * regardless of vma_migratable
+ */
+ if (!vma_migratable(vma) &&
+ !(flags & MPOL_MF_STRICT))
return 1;
if (endvma > end)
@@ -627,7 +653,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
}
/* queue pages from current vma */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & MPOL_MF_VALID)
return 0;
return 1;
}
@@ -1325,7 +1351,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
- const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+ unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
@@ -1486,7 +1512,7 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
int uninitialized_var(pval);
nodemask_t nodes;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
+ if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1515,7 +1541,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask)
diff --git a/mm/migrate.c b/mm/migrate.c
index 8c57cdd77ba5..9a3ce8847308 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -247,10 +247,8 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
pte = swp_entry_to_pte(entry);
} else if (is_device_public_page(new)) {
pte = pte_mkdevmap(pte);
- flush_dcache_page(new);
}
- } else
- flush_dcache_page(new);
+ }
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
@@ -971,6 +969,13 @@ static int move_to_new_page(struct page *newpage, struct page *page,
*/
if (!PageMappingFlags(page))
page->mapping = NULL;
+
+ if (unlikely(is_zone_device_page(newpage))) {
+ if (is_device_public_page(newpage))
+ flush_dcache_page(newpage);
+ } else
+ flush_dcache_page(newpage);
+
}
out:
return rc;
@@ -1303,6 +1308,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
lock_page(hpage);
}
+ /*
+ * Check for pages which are in the process of being freed. Without
+ * page_mapping() set, hugetlbfs specific move page routine will not
+ * be called and we could leak usage counts for subpools.
+ */
+ if (page_private(hpage) && !page_mapping(hpage)) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
@@ -1334,6 +1349,7 @@ put_anon:
set_page_owner_migrate_reason(new_hpage, reason);
}
+out_unlock:
unlock_page(hpage);
out:
if (rc != -EAGAIN)
diff --git a/mm/mincore.c b/mm/mincore.c
index fc37afe226e6..2732c8c0764c 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -169,6 +169,22 @@ out:
return 0;
}
+static inline bool can_do_mincore(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * Reveal pagecache information only for non-anonymous mappings that
+ * correspond to the files the calling process could (if tried) open
+ * for writing; otherwise we'd be including shared non-exclusive
+ * mappings, which opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
- mincore_walk.mm = vma->vm_mm;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+ if (!can_do_mincore(vma)) {
+ unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
+ memset(vec, 1, pages);
+ return pages;
+ }
+ mincore_walk.mm = vma->vm_mm;
err = walk_page_range(addr, end, &mincore_walk);
if (err < 0)
return err;
diff --git a/mm/mlock.c b/mm/mlock.c
index 46af369c13e5..1f9ee86672e8 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -629,11 +629,11 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
* is also counted.
* Return value: previously mlocked page counts
*/
-static int count_mm_mlocked_page_nr(struct mm_struct *mm,
+static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
unsigned long start, size_t len)
{
struct vm_area_struct *vma;
- int count = 0;
+ unsigned long count = 0;
if (mm == NULL)
mm = current->mm;
diff --git a/mm/mmap.c b/mm/mmap.c
index 2398776195d2..8c6ed06983f9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -45,6 +45,7 @@
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
+#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -88,12 +89,6 @@ static void unmap_region(struct mm_struct *mm,
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
- * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
- * MAP_PRIVATE:
- * r: (no) no
- * w: (no) no
- * x: (yes) yes
*/
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
@@ -2348,12 +2343,11 @@ int expand_downwards(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
- int error;
+ int error = 0;
address &= PAGE_MASK;
- error = security_mmap_addr(address);
- if (error)
- return error;
+ if (address < mmap_min_addr)
+ return -EPERM;
/* Enforce stack_guard_gap */
prev = vma->vm_prev;
@@ -2449,7 +2443,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
- if (!prev || expand_stack(prev, addr))
+ /* don't alter vm_end if the coredump is running */
+ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2475,6 +2470,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
+ /* don't alter vm_start if the coredump is running */
+ if (!mmget_still_valid(mm))
+ return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 314285284e6e..70d0efb06374 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -267,7 +267,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
* thanks to mm_take_all_locks().
*/
spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
+ hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
spin_unlock(&mm->mmu_notifier_mm->lock);
mm_drop_all_locks(mm);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fe0aac2348e5..7a5c0b229c6a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1050,9 +1050,10 @@ bool out_of_memory(struct oom_control *oc)
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
* make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here.
+ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+ * invoke the OOM killer even if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e001de5ac50c..a40c075fd8f1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2150,6 +2150,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -2164,7 +2171,6 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- int cycled;
int range_whole = 0;
int tag;
@@ -2172,23 +2178,17 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
-retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
@@ -2296,17 +2296,14 @@ continue_unlock:
pagevec_release(&pvec);
cond_resched();
}
- if (!cycled && !done) {
- /*
- * range_cyclic:
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- cycled = 1;
- index = 0;
- end = writeback_index - 1;
- goto retry;
- }
+
+ /*
+ * If we hit the last page and there is more work to be done: wrap
+ * back the index back to the start of the file for the next
+ * time we are called.
+ */
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a2f365f40433..6f71518a4558 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1764,8 +1764,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
@@ -4325,11 +4325,11 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size - 1);
+ page_ref_add(page, size);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
nc->offset = size;
}
@@ -4345,10 +4345,10 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
+ set_page_count(page, size + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
offset = size - fragsz;
}
@@ -5727,13 +5727,15 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long *zone_end_pfn,
unsigned long *ignored)
{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
zone_start_pfn, zone_end_pfn);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2c16216c29b6..dece2bdf86fe 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -271,6 +271,7 @@ static void free_page_ext(void *addr)
table_size = get_entry_size() * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
@@ -396,10 +397,8 @@ void __init page_ext_init(void)
* We know some arch can have a nodes layout such as
* -------------pfn-------------->
* N0 | N1 | N2 | N0 | N1 | N2|....
- *
- * Take into account DEFERRED_STRUCT_PAGE_INIT.
*/
- if (early_pfn_to_nid(pfn) != nid)
+ if (pfn_to_nid(pfn) != nid)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index e412a63b2b74..504684181827 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -136,7 +136,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
end_pfn = pfn + count * BITS_PER_BYTE;
if (end_pfn > max_pfn)
- end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+ end_pfn = max_pfn;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
@@ -181,7 +181,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
end_pfn = pfn + count * BITS_PER_BYTE;
if (end_pfn > max_pfn)
- end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+ end_pfn = max_pfn;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index a71fe4c623ef..6ac05a6ff2d1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -273,7 +273,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
- if (!pfn_valid(pfn)) {
+ page = pfn_to_online_page(pfn);
+ if (!page) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
}
@@ -281,13 +282,13 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- page = pfn_to_page(pfn);
pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
+ /* The pageblock is online, no need to recheck. */
page = pfn_to_page(pfn);
if (page_zone(page) != zone)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index e83fd44867de..a7ba9e315a12 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,6 +6,7 @@
#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
+#include <linux/kasan.h>
static bool want_page_poisoning __read_mostly;
@@ -34,7 +35,10 @@ static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
+ /* KASAN still think the page is in-use, so skip it. */
+ kasan_disable_current();
memset(addr, PAGE_POISON, PAGE_SIZE);
+ kasan_enable_current();
kunmap_atomic(addr);
}
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 0d88d7bd5706..c22d959105b6 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -50,6 +50,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
struct page *pages;
+ unsigned long flags;
int i;
chunk = pcpu_alloc_chunk(gfp);
@@ -68,9 +69,9 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
chunk->data = pages;
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
- spin_lock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
pcpu_chunk_populated(chunk, 0, nr_pages, false);
- spin_unlock_irq(&pcpu_lock);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(chunk->base_addr);
diff --git a/mm/percpu.c b/mm/percpu.c
index 3074148b7e0d..9beb84800d8d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -984,7 +984,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
/*
* Search to find a fit.
*/
- end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
+ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
+ pcpu_chunk_map_bits(chunk));
bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
alloc_bits, align_mask);
if (bit_off >= end)
@@ -1702,6 +1703,7 @@ void free_percpu(void __percpu *ptr)
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
+ bool need_balance = false;
if (!ptr)
return;
@@ -1723,7 +1725,7 @@ void free_percpu(void __percpu *ptr)
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
if (pos != chunk) {
- pcpu_schedule_balance_work();
+ need_balance = true;
break;
}
}
@@ -1731,6 +1733,9 @@ void free_percpu(void __percpu *ptr)
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ if (need_balance)
+ pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);
@@ -2507,8 +2512,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
ai->groups[group].base_offset = areas[group] - base;
}
- pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
- PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+ pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
+ PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
rc = pcpu_setup_first_chunk(ai, base);
@@ -2629,8 +2634,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
}
/* we're ready, commit */
- pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
- unit_pages, psize_str, vm.addr, ai->static_size,
+ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
+ unit_pages, psize_str, ai->static_size,
ai->reserved_size, ai->dyn_size);
rc = pcpu_setup_first_chunk(ai, vm.addr);
diff --git a/mm/shmem.c b/mm/shmem.c
index 6c10f1d92251..69106c600692 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2657,31 +2657,33 @@ static void shmem_tag_pins(struct address_space *mapping)
void **slot;
pgoff_t start;
struct page *page;
+ unsigned int tagged = 0;
lru_add_drain();
start = 0;
- rcu_read_lock();
+ spin_lock_irq(&mapping->tree_lock);
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- page = radix_tree_deref_slot(slot);
+ page = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
if (!page || radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
slot = radix_tree_iter_retry(&iter);
continue;
}
} else if (page_count(page) - page_mapcount(page) > 1) {
- spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_set(&mapping->page_tree, iter.index,
SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
}
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
+ if (++tagged % 1024)
+ continue;
+
+ slot = radix_tree_iter_resume(slot, &iter);
+ spin_unlock_irq(&mapping->tree_lock);
+ cond_resched();
+ spin_lock_irq(&mapping->tree_lock);
}
- rcu_read_unlock();
+ spin_unlock_irq(&mapping->tree_lock);
}
/*
@@ -2893,7 +2895,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
}
shmem_falloc.waitq = &shmem_falloc_waitq;
- shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
spin_lock(&inode->i_lock);
inode->i_private = &shmem_falloc;
@@ -3096,16 +3098,20 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
- int ret;
+ int ret = 0;
/*
* No ordinary (disk based) filesystem counts links as inodes;
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
+ * But if an O_TMPFILE file is linked into the tmpfs, the
+ * first link must skip that, to get the accounting right.
*/
- ret = shmem_reserve_inode(inode->i_sb);
- if (ret)
- goto out;
+ if (inode->i_nlink) {
+ ret = shmem_reserve_inode(inode->i_sb);
+ if (ret)
+ goto out;
+ }
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
diff --git a/mm/slab.c b/mm/slab.c
index 09df506ae830..a04aeae42306 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -563,14 +563,6 @@ static void start_cpu_timer(int cpu)
static void init_arraycache(struct array_cache *ac, int limit, int batch)
{
- /*
- * The array_cache structures contain pointers to free object.
- * However, when such objects are allocated or transferred to another
- * cache the pointers are not cleared and they could be counted as
- * valid references during a kmemleak scan. Therefore, kmemleak must
- * not scan such objects.
- */
- kmemleak_no_scan(ac);
if (ac) {
ac->avail = 0;
ac->limit = limit;
@@ -586,6 +578,14 @@ static struct array_cache *alloc_arraycache(int node, int entries,
struct array_cache *ac = NULL;
ac = kmalloc_node(memsize, gfp, node);
+ /*
+ * The array_cache structures contain pointers to free object.
+ * However, when such objects are allocated or transferred to another
+ * cache the pointers are not cleared and they could be counted as
+ * valid references during a kmemleak scan. Therefore, kmemleak must
+ * not scan such objects.
+ */
+ kmemleak_no_scan(ac);
init_arraycache(ac, entries, batchcount);
return ac;
}
@@ -680,6 +680,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
alc = kmalloc_node(memsize, gfp, node);
if (alc) {
+ kmemleak_no_scan(alc);
init_arraycache(&alc->ac, entries, batch);
spin_lock_init(&alc->lock);
}
@@ -4298,7 +4299,8 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
+ root_caches_node);
struct page *page;
struct kmem_cache_node *n;
const char *name;
@@ -4318,8 +4320,12 @@ static int leaks_show(struct seq_file *m, void *p)
* whole processing.
*/
do {
- set_store_user_clean(cachep);
drain_cpu_caches(cachep);
+ /*
+ * drain_cpu_caches() could make kmemleak_object and
+ * debug_objects_cache dirty, so reset afterwards.
+ */
+ set_store_user_clean(cachep);
x[1] = 0;
diff --git a/mm/slub.c b/mm/slub.c
index 220d42e592ef..07aeb129f3f8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4790,7 +4790,17 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
- get_online_mems();
+ /*
+ * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
+ * already held which will conflict with an existing lock order:
+ *
+ * mem_hotplug_lock->slab_mutex->kernfs_mutex
+ *
+ * We don't really need mem_hotplug_lock (to hold off
+ * slab_mem_going_offline_callback) here because slab's memory hot
+ * unplug code doesn't destroy the kmem_cache->node[] data.
+ */
+
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
struct kmem_cache_node *n;
@@ -4831,7 +4841,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
diff --git a/mm/usercopy.c b/mm/usercopy.c
index a9852b24715d..f8d74e09f8e4 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -15,6 +15,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -121,7 +122,7 @@ static inline const char *check_kernel_text_object(const void *ptr,
static inline const char *check_bogus_address(const void *ptr, unsigned long n)
{
/* Reject if object wraps past end of memory. */
- if ((unsigned long)ptr + n < (unsigned long)ptr)
+ if ((unsigned long)ptr + (n - 1) < (unsigned long)ptr)
return "<wrapped address>";
/* Reject if NULL or ZERO-allocation. */
@@ -203,7 +204,12 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n,
if (!virt_addr_valid(ptr))
return NULL;
- page = virt_to_head_page(ptr);
+ /*
+ * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
+ * highmem page or fallback to virt_to_page(). The following
+ * is effectively a highmem-aware virt_to_head_page().
+ */
+ page = compound_head(kmap_to_page((void *)ptr));
/* Check slab allocator for flags and size. */
if (PageSlab(page))
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 5d70fdbd8bc0..d3b4a78d79b6 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -272,8 +272,7 @@ retry:
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9ff21a12ea00..0b8852d80f44 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -498,7 +498,11 @@ nocache:
}
found:
- if (addr + size > vend)
+ /*
+ * Check also calculated address against the vstart,
+ * because it can be 0 because of big align request.
+ */
+ if (addr + size > vend || addr < vstart)
goto overflow;
va->va_start = addr;
@@ -1762,6 +1766,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return NULL;
/*
+ * First make sure the mappings are removed from all page-tables
+ * before they are freed.
+ */
+ vmalloc_sync_all();
+
+ /*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
@@ -2262,7 +2272,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!(area->flags & VM_USERMAP))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (kaddr + size > area->addr + get_vm_area_size(area))
return -EINVAL;
do {
@@ -2310,6 +2320,9 @@ EXPORT_SYMBOL(remap_vmalloc_range);
/*
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
+ *
+ * The purpose of this function is to make sure the vmalloc area
+ * mappings are identical in all page-tables in the system.
*/
void __weak vmalloc_sync_all(void)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9734e62654fa..0cc3c1eb15f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -502,6 +502,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
sc.nid = 0;
freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+ /*
+ * Bail out if someone want to register a new shrinker to
+ * prevent the regsitration from being stalled for long periods
+ * by parallel ongoing shrinking.
+ */
+ if (rwsem_is_contended(&shrinker_rwsem)) {
+ freed = freed ? : 1;
+ break;
+ }
}
up_read(&shrinker_rwsem);
@@ -1384,7 +1393,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !__PageMovable(page)) {
+ !__PageMovable(page) && !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -2111,8 +2120,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct mem_cgroup *memcg,
- struct scan_control *sc, bool actual_reclaim)
+ struct scan_control *sc, bool trace)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2132,17 +2140,13 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
/*
* When refaults are being observed, it means a new workingset
* is being established. Disable active list protection to get
* rid of the stale workingset quickly.
*/
- if (file && actual_reclaim && lruvec->refaults != refaults) {
+ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+ if (file && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2152,7 +2156,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
inactive_ratio = 1;
}
- if (actual_reclaim)
+ if (trace)
trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
@@ -2162,12 +2166,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc)
+ struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru),
- memcg, sc, true))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2267,7 +2269,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anonymous pages on the LRU in eligible zones.
* Otherwise, the small LRU gets thrashed.
*/
- if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, false, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
>> sc->priority) {
scan_balance = SCAN_ANON;
@@ -2285,7 +2287,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, true, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2438,7 +2440,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, memcg, sc);
+ lruvec, sc);
}
}
@@ -2505,7 +2507,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2830,12 +2832,8 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
unsigned long refaults;
struct lruvec *lruvec;
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
lruvec->refaults = refaults;
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
}
@@ -3183,7 +3181,7 @@ static void age_active_anon(struct pglist_data *pgdat,
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3441,19 +3439,18 @@ out:
}
/*
- * pgdat->kswapd_classzone_idx is the highest zone index that a recent
- * allocation request woke kswapd for. When kswapd has not woken recently,
- * the value is MAX_NR_ZONES which is not a valid index. This compares a
- * given classzone and returns it or the highest classzone index kswapd
- * was recently woke for.
+ * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
+ * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
+ * a valid index then either kswapd runs for first time or kswapd couldn't sleep
+ * after previous reclaim attempt (node is still unbalanced). In that case
+ * return the zone index of the previous kswapd reclaim cycle.
*/
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type classzone_idx)
+ enum zone_type prev_classzone_idx)
{
if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- return classzone_idx;
-
- return max(pgdat->kswapd_classzone_idx, classzone_idx);
+ return prev_classzone_idx;
+ return pgdat->kswapd_classzone_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3594,7 +3591,7 @@ kswapd_try_sleep:
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = kswapd_classzone_idx(pgdat, 0);
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
@@ -3645,8 +3642,12 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
- classzone_idx);
+
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ pgdat->kswapd_classzone_idx = classzone_idx;
+ else
+ pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6389e876c7a7..e2197b03da57 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1201,13 +1201,8 @@ const char * const vmstat_text[] = {
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
-#ifdef CONFIG_SMP
"nr_tlb_remote_flush",
"nr_tlb_remote_flush_received",
-#else
- "", /* nr_tlb_remote_flush */
- "", /* nr_tlb_remote_flush_received */
-#endif /* CONFIG_SMP */
"nr_tlb_local_flush_all",
"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
@@ -1810,12 +1805,13 @@ static bool need_update(int cpu)
/*
* The fast way of checking if there are any vmstat diffs.
- * This works because the diffs are byte sized items.
*/
- if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+ sizeof(p->vm_stat_diff[0])))
return true;
#ifdef CONFIG_NUMA
- if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+ if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
+ sizeof(p->vm_numa_stat_diff[0])))
return true;
#endif
}
@@ -1956,7 +1952,7 @@ void __init init_mm_internals(void)
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
- proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
+ proc_create("pagetypeinfo", 0400, NULL, &pagetypeinfo_file_operations);
proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
#endif
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 685049a9048d..c6df483b3751 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -52,6 +52,7 @@
#include <linux/zpool.h>
#include <linux/mount.h>
#include <linux/migrate.h>
+#include <linux/wait.h>
#include <linux/pagemap.h>
#define ZSPAGE_MAGIC 0x58
@@ -267,6 +268,10 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
+ /* A wait queue for when migration races with async_free_zspage() */
+ struct wait_queue_head migration_wait;
+ atomic_long_t isolated_pages;
+ bool destroying;
#endif
};
@@ -1878,6 +1883,31 @@ static void dec_zspage_isolation(struct zspage *zspage)
zspage->isolated--;
}
+static void putback_zspage_deferred(struct zs_pool *pool,
+ struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fg;
+
+ fg = putback_zspage(class, zspage);
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+
+}
+
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+ VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+ atomic_long_dec(&pool->isolated_pages);
+ /*
+ * There's no possibility of racing, since wait_for_isolated_drain()
+ * checks the isolated count under &class->lock after enqueuing
+ * on migration_wait.
+ */
+ if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ wake_up_all(&pool->migration_wait);
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1947,6 +1977,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
*/
if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
get_zspage_mapping(zspage, &class_idx, &fullness);
+ atomic_long_inc(&pool->isolated_pages);
remove_zspage(class, zspage, fullness);
}
@@ -2046,8 +2077,21 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
* Page migration is done so let's putback isolated zspage to
* the list if @page is final isolated subpage in the zspage.
*/
- if (!is_zspage_isolated(zspage))
- putback_zspage(class, zspage);
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * We cannot race with zs_destroy_pool() here because we wait
+ * for isolation to hit zero before we start destroying.
+ * Also, we ensure that everyone can see pool->destroying before
+ * we start waiting.
+ */
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
+
+ if (page_zone(newpage) != page_zone(page)) {
+ dec_zone_page_state(page, NR_ZSPAGES);
+ inc_zone_page_state(newpage, NR_ZSPAGES);
+ }
reset_page(page);
put_page(page);
@@ -2093,13 +2137,12 @@ void zs_page_putback(struct page *page)
spin_lock(&class->lock);
dec_zspage_isolation(zspage);
if (!is_zspage_isolated(zspage)) {
- fg = putback_zspage(class, zspage);
/*
* Due to page_lock, we cannot free zspage immediately
* so let's defer.
*/
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
}
spin_unlock(&class->lock);
}
@@ -2123,8 +2166,36 @@ static int zs_register_migration(struct zs_pool *pool)
return 0;
}
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+ /*
+ * We're in the process of destroying the pool, so there are no
+ * active allocations. zs_page_isolate() fails for completely free
+ * zspages, so we need only wait for the zs_pool's isolated
+ * count to hit zero.
+ */
+ wait_event(pool->migration_wait,
+ pool_isolated_are_drained(pool));
+}
+
static void zs_unregister_migration(struct zs_pool *pool)
{
+ pool->destroying = true;
+ /*
+ * We need a memory barrier here to ensure global visibility of
+ * pool->destroying. Thus pool->isolated pages will either be 0 in which
+ * case we don't care, or it will be > 0 and pool->destroying will
+ * ensure that we wake up once isolation hits 0.
+ */
+ smp_mb();
+ wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
@@ -2365,6 +2436,10 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pool->migration_wait);
+#endif
+
if (create_cache(pool))
goto err;