diff options
author | Marcel Ziswiler <marcel.ziswiler@toradex.com> | 2020-02-04 11:31:11 +0100 |
---|---|---|
committer | Marcel Ziswiler <marcel.ziswiler@toradex.com> | 2020-02-07 13:50:20 +0100 |
commit | 500f9a04cd76f504285ad99b15e390e20ab17e0c (patch) | |
tree | e1c5ff3a78e75f0847b47fbac63d279dd411e7e5 /mm | |
parent | 0f549d8c4d5edd68a2a492afd48228458d3bca4a (diff) | |
parent | 6d0c334a400db31751c787c411e7187ab59a3f1d (diff) |
Merge tag 'v4.14.164' into 4.14-2.3.x-imx
This is the 4.14.164 stable release
Conflicts:
arch/arm/Kconfig.debug
arch/arm/boot/dts/imx7s.dtsi
arch/arm/mach-imx/cpuidle-imx6q.c
arch/arm/mach-imx/cpuidle-imx6sx.c
arch/arm64/kernel/cpu_errata.c
arch/arm64/kvm/hyp/tlb.c
drivers/crypto/caam/caamalg.c
drivers/crypto/mxs-dcp.c
drivers/dma/imx-sdma.c
drivers/gpio/gpio-vf610.c
drivers/gpu/drm/bridge/adv7511/adv7511_drv.c
drivers/input/keyboard/imx_keypad.c
drivers/input/keyboard/snvs_pwrkey.c
drivers/mmc/core/block.c
drivers/mmc/core/queue.h
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/net/can/flexcan.c
drivers/net/can/rx-offload.c
drivers/net/ethernet/freescale/fec_main.c
drivers/net/wireless/ath/ath10k/pci.c
drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
drivers/pci/dwc/pci-imx6.c
drivers/spi/spi-fsl-lpspi.c
drivers/usb/dwc3/gadget.c
include/net/tcp.h
sound/soc/fsl/Kconfig
sound/soc/fsl/fsl_esai.c
Diffstat (limited to 'mm')
43 files changed, 660 insertions, 352 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 59efbd3337e0..cc4d633947bf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -708,12 +708,12 @@ config MIGRATE_VMA_HELPER config HMM bool + select MMU_NOTIFIER select MIGRATE_VMA_HELPER config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" depends on ARCH_HAS_HMM - select MMU_NOTIFIER select HMM help Select HMM_MIRROR if you want to mirror range of the CPU page table of a diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 9386c98dac12..6fa31754eadd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -684,6 +684,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; mutex_init(&bdi->cgwb_release_mutex); + init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { @@ -105,8 +105,10 @@ static int __init cma_activate_area(struct cma *cma) cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!cma->bitmap) + if (!cma->bitmap) { + cma->count = 0; return -ENOMEM; + } WARN_ON_ONCE(!pfn_valid(pfn)); zone = page_zone(pfn_to_page(pfn)); @@ -275,6 +277,12 @@ int __init cma_declare_contiguous(phys_addr_t base, */ alignment = max(alignment, (phys_addr_t)PAGE_SIZE << max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); + if (fixed && base & (alignment - 1)) { + ret = -EINVAL; + pr_err("Region at %pa must be aligned to %pa bytes\n", + &base, &alignment); + goto err; + } base = ALIGN(base, alignment); size = ALIGN(size, alignment); limit &= ~(alignment - 1); @@ -305,6 +313,13 @@ int __init cma_declare_contiguous(phys_addr_t base, if (limit == 0 || limit > memblock_end) limit = memblock_end; + if (base + size > limit) { + ret = -EINVAL; + pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", + &size, &base, &limit); + goto err; + } + /* Reserve memory */ if (fixed) { if (memblock_is_region_reserved(base, size) || @@ -348,12 +363,14 @@ int __init cma_declare_contiguous(phys_addr_t base, ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) - goto err; + goto free_mem; pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, &base); return 0; +free_mem: + memblock_free(base, size); err: pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; @@ -362,23 +379,26 @@ err: #ifdef CONFIG_CMA_DEBUG static void cma_debug_show_areas(struct cma *cma) { - unsigned long next_zero_bit, next_set_bit; + unsigned long next_zero_bit, next_set_bit, nr_zero; unsigned long start = 0; - unsigned int nr_zero, nr_total = 0; + unsigned long nr_part, nr_total = 0; + unsigned long nbits = cma_bitmap_maxno(cma); mutex_lock(&cma->lock); pr_info("number of available pages: "); for (;;) { - next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start); - if (next_zero_bit >= cma->count) + next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); + if (next_zero_bit >= nbits) break; - next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit); + next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); nr_zero = next_set_bit - next_zero_bit; - pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit); - nr_total += nr_zero; + nr_part = nr_zero << cma->order_per_bit; + pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, + next_zero_bit); + nr_total += nr_part; start = next_zero_bit + nr_zero; } - pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count); + pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); mutex_unlock(&cma->lock); } #else diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 275df8b5b22e..c47ea3cfee79 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -58,7 +58,7 @@ static int cma_maxchunk_get(void *data, u64 *val) mutex_lock(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); - if (start >= cma->count) + if (start >= bitmap_maxno) break; end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); diff --git a/mm/compaction.c b/mm/compaction.c index 85395dc6eb13..eb8e7f5d3a08 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1517,6 +1517,17 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro unsigned long end_pfn = zone_end_pfn(zone); const bool sync = cc->mode != MIGRATE_ASYNC; + /* + * These counters track activities during zone compaction. Initialize + * them before compacting a new zone. + */ + cc->total_migrate_scanned = 0; + cc->total_free_scanned = 0; + cc->nr_migratepages = 0; + cc->nr_freepages = 0; + INIT_LIST_HEAD(&cc->freepages); + INIT_LIST_HEAD(&cc->migratepages); + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); @@ -1680,10 +1691,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, { enum compact_result ret; struct compact_control cc = { - .nr_freepages = 0, - .nr_migratepages = 0, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .order = order, .gfp_mask = gfp_mask, .zone = zone, @@ -1696,8 +1703,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) }; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); ret = compact_zone(zone, &cc); @@ -1796,8 +1801,6 @@ static void compact_node(int nid) struct zone *zone; struct compact_control cc = { .order = -1, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .mode = MIGRATE_SYNC, .ignore_skip_hint = true, .whole_zone = true, @@ -1811,11 +1814,7 @@ static void compact_node(int nid) if (!populated_zone(zone)) continue; - cc.nr_freepages = 0; - cc.nr_migratepages = 0; cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); compact_zone(zone, &cc); @@ -1924,8 +1923,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) struct zone *zone; struct compact_control cc = { .order = pgdat->kcompactd_max_order, - .total_migrate_scanned = 0, - .total_free_scanned = 0, .classzone_idx = pgdat->kcompactd_classzone_idx, .mode = MIGRATE_SYNC_LIGHT, .ignore_skip_hint = true, @@ -1950,16 +1947,10 @@ static void kcompactd_do_work(pg_data_t *pgdat) COMPACT_CONTINUE) continue; - cc.nr_freepages = 0; - cc.nr_migratepages = 0; - cc.total_migrate_scanned = 0; - cc.total_free_scanned = 0; - cc.zone = zone; - INIT_LIST_HEAD(&cc.freepages); - INIT_LIST_HEAD(&cc.migratepages); - if (kthread_should_stop()) return; + + cc.zone = zone; status = compact_zone(zone, &cc); if (status == COMPACT_SUCCESS) { diff --git a/mm/filemap.c b/mm/filemap.c index e2e738cc08b1..a30dbf93de99 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -338,7 +338,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_cap_writeback_dirty(mapping)) + if (!mapping_cap_writeback_dirty(mapping) || + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(&wbc, mapping->host); @@ -464,6 +465,28 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, EXPORT_SYMBOL(filemap_fdatawait_range); /** + * filemap_fdatawait_range_keep_errors - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space in the + * given range and wait for all of them. Unlike filemap_fdatawait_range(), + * this function does not clear error status of the address space. + * + * Use this function if callers don't handle errors themselves. Expected + * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), + * fsfreeze(8) + */ +int filemap_fdatawait_range_keep_errors(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return filemap_check_and_keep_errors(mapping); +} +EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors); + +/** * file_fdatawait_range - wait for writeback to complete * @file: file pointing to address space structure to wait for * @start_byte: offset in bytes where the range starts @@ -153,7 +153,10 @@ retry: } if (flags & FOLL_GET) { - get_page(page); + if (unlikely(!try_get_page(page))) { + page = ERR_PTR(-ENOMEM); + goto out; + } /* drop the pgmap reference now that we hold the page */ if (pgmap) { @@ -280,7 +283,10 @@ retry_locked: if (pmd_trans_unstable(pmd)) ret = -EBUSY; } else { - get_page(page); + if (unlikely(!try_get_page(page))) { + spin_unlock(ptl); + return ERR_PTR(-ENOMEM); + } spin_unlock(ptl); lock_page(page); ret = split_huge_page(page); @@ -436,11 +442,14 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, pgd = pgd_offset_k(address); else pgd = pgd_offset_gate(mm, address); - BUG_ON(pgd_none(*pgd)); + if (pgd_none(*pgd)) + return -EFAULT; p4d = p4d_offset(pgd, address); - BUG_ON(p4d_none(*p4d)); + if (p4d_none(*p4d)) + return -EFAULT; pud = pud_offset(p4d, address); - BUG_ON(pud_none(*pud)); + if (pud_none(*pud)) + return -EFAULT; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return -EFAULT; @@ -464,7 +473,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if (is_device_public_page(*page)) goto unmap; } - get_page(*page); + if (unlikely(!try_get_page(*page))) { + ret = -ENOMEM; + goto unmap; + } out: ret = 0; unmap: @@ -1355,7 +1367,8 @@ static inline pte_t gup_get_pte(pte_t *ptep) } #endif -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, + struct page **pages) { while ((*nr) - nr_start) { struct page *page = pages[--(*nr)]; @@ -1365,6 +1378,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } +/* + * Return the compund head page with ref appropriately incremented, + * or NULL if that failed. + */ +static inline struct page *try_get_compound_head(struct page *page, int refs) +{ + struct page *head = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(head) < 0)) + return NULL; + if (unlikely(!page_cache_add_speculative(head, refs))) + return NULL; + return head; +} + #ifdef __HAVE_ARCH_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) @@ -1399,9 +1426,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) + head = try_get_compound_head(page, 1); + if (!head) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { @@ -1537,8 +1564,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pmd_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pmd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1575,8 +1602,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pud_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pud_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1612,8 +1639,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pgd_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pgd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1643,7 +1670,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (!pmd_present(pmd)) return 0; - if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { + if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || + pmd_devmap(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 930f2aa3bb4d..1adc2e6c50f9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -33,6 +33,7 @@ #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/oom.h> +#include <linux/page_owner.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -2387,6 +2388,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); + + split_page_owner(head, HPAGE_PMD_ORDER); + /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to radix tree of swap cache */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 224cdd953a79..310656b4ede6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1081,11 +1081,10 @@ static bool pfn_range_valid_gigantic(struct zone *z, struct page *page; for (i = start_pfn; i < end_pfn; i++) { - if (!pfn_valid(i)) + page = pfn_to_online_page(i); + if (!page) return false; - page = pfn_to_page(i); - if (page_zone(page) != z) return false; @@ -1271,12 +1270,23 @@ void free_huge_page(struct page *page) ClearPagePrivate(page); /* - * A return code of zero implies that the subpool will be under its - * minimum size if the reservation is not restored after page is free. - * Therefore, force restore_reserve operation. + * If PagePrivate() was set on page, page allocation consumed a + * reservation. If the page was associated with a subpool, there + * would have been a page reserved in the subpool before allocation + * via hugepage_subpool_get_pages(). Since we are 'restoring' the + * reservtion, do not call hugepage_subpool_put_pages() as this will + * remove the reserved page from the subpool. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) - restore_reserve = true; + if (!restore_reserve) { + /* + * A return code of zero implies that the subpool will be + * under its minimum size if the reservation is not restored + * after page is free. Therefore, force restore_reserve + * operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + } spin_lock(&hugetlb_lock); clear_page_huge_active(page); @@ -3577,7 +3587,6 @@ retry_avoidcopy: copy_user_huge_page(new_page, old_page, address, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); - set_page_huge_active(new_page); mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); @@ -3600,6 +3609,7 @@ retry_avoidcopy: make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, address); + set_page_huge_active(new_page); /* Make the old page be freed below */ new_page = old_page; } @@ -3682,6 +3692,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t new_pte; spinlock_t *ptl; + bool new_page = false; /* * Currently, we are forced to kill the process in the event the @@ -3728,8 +3739,8 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, address); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, + address); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3747,7 +3758,7 @@ retry: } clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); - set_page_huge_active(page); + new_page = true; if (vma->vm_flags & VM_MAYSHARE) { int err = huge_add_to_page_cache(page, mapping, idx); @@ -3818,6 +3829,15 @@ retry: } spin_unlock(ptl); + + /* + * Only make newly allocated pages active. Existing pages found + * in the pagecache could be !page_huge_active() if they have been + * isolated for migration. + */ + if (new_page) + set_page_huge_active(page); + unlock_page(page); out: return ret; @@ -3832,21 +3852,14 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { unsigned long key[2]; u32 hash; - if (vma->vm_flags & VM_SHARED) { - key[0] = (unsigned long) mapping; - key[1] = idx; - } else { - key[0] = (unsigned long) mm; - key[1] = address >> huge_page_shift(h); - } + key[0] = (unsigned long) mapping; + key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); @@ -3857,9 +3870,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { return 0; @@ -3905,7 +3916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, address); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -4053,7 +4064,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * the set_pte_at() write. */ __SetPageUptodate(page); - set_page_huge_active(page); mapping = dst_vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, dst_vma, dst_addr); @@ -4121,6 +4131,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); + set_page_huge_active(page); if (vm_shared) unlock_page(page); ret = 0; @@ -4245,6 +4256,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); + + /* + * Instead of doing 'try_get_page()' below in the same_page + * loop, just check the count once here. + */ + if (unlikely(page_count(page) <= 0)) { + if (pages) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } + } same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150125b9..e430e04997ee 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); - if (!css_tryget_online(&h_cg->css)) { + if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } diff --git a/mm/internal.h b/mm/internal.h index 1df011f62480..a182506242c4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -455,6 +455,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 +#ifdef CONFIG_NUMA +extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +#else +static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, + unsigned int order) +{ + return NODE_RECLAIM_NOSCAN; +} +#endif + extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d27a73737f1a..acf66c5dc9bd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1006,6 +1006,9 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ down_write(&mm->mmap_sem); + result = SCAN_ANY_PROCESS; + if (!mmget_still_valid(mm)) + goto out; result = hugepage_vma_revalidate(mm, address, &vma); if (result) goto out; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d9e0be2a8189..d779181bed4d 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -577,7 +577,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, if (in_irq()) { object->pid = 0; strncpy(object->comm, "hardirq", sizeof(object->comm)); - } else if (in_softirq()) { + } else if (in_serving_softirq()) { object->pid = 0; strncpy(object->comm, "softirq", sizeof(object->comm)); } else { @@ -1364,6 +1364,7 @@ static void scan_block(void *_start, void *_end, /* * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency. */ +#ifdef CONFIG_SMP static void scan_large_block(void *start, void *end) { void *next; @@ -1375,6 +1376,7 @@ static void scan_large_block(void *start, void *end) cond_resched(); } } +#endif /* * Scan a memory block corresponding to a kmemleak_object. A condition is @@ -1492,11 +1494,6 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - /* data/bss scanning */ - scan_large_block(_sdata, _edata); - scan_large_block(__bss_start, __bss_stop); - scan_large_block(__start_ro_after_init, __end_ro_after_init); - #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) @@ -2027,6 +2024,17 @@ void __init kmemleak_init(void) } local_irq_restore(flags); + /* register the data/bss sections */ + create_object((unsigned long)_sdata, _edata - _sdata, + KMEMLEAK_GREY, GFP_ATOMIC); + create_object((unsigned long)__bss_start, __bss_stop - __bss_start, + KMEMLEAK_GREY, GFP_ATOMIC); + /* only register .data..ro_after_init if not within .data */ + if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata) + create_object((unsigned long)__start_ro_after_init, + __end_ro_after_init - __start_ro_after_init, + KMEMLEAK_GREY, GFP_ATOMIC); + /* * This is the point where tracking allocations is safe. Automatic * scanning is started during the late initcall. Add the early logged @@ -849,13 +849,13 @@ static int remove_stable_node(struct stable_node *stable_node) return 0; } - if (WARN_ON_ONCE(page_mapped(page))) { - /* - * This should not happen: but if it does, just refuse to let - * merge_across_nodes be switched - there is no need to panic. - */ - err = -EBUSY; - } else { + /* + * Page could be still mapped if this races with __mmput() running in + * between ksm_exit() and exit_mmap(). Just refuse to let + * merge_across_nodes/max_page_sharing be switched. + */ + err = -EBUSY; + if (!page_mapped(page)) { /* * The stable node did not yet appear stale to get_ksm_page(), * since that allows for an unmapped ksm page to be recognized diff --git a/mm/list_lru.c b/mm/list_lru.c index f141f0c80ff3..d67348afd1dc 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -42,11 +42,7 @@ static void list_lru_unregister(struct list_lru *lru) #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { - /* - * This needs node 0 to be always present, even - * in the systems supporting sparse numa ids. - */ - return !!lru->node[0].memcg_lrus; + return lru->memcg_aware; } static inline struct list_lru_one * @@ -317,7 +313,7 @@ static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, } return 0; fail: - __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); + __memcg_destroy_list_lru_node(memcg_lrus, begin, i); return -ENOMEM; } @@ -389,6 +385,8 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { int i; + lru->memcg_aware = memcg_aware; + if (!memcg_aware) return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a9a7e1066ef..326525a97c47 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -725,7 +725,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) if (unlikely(!memcg)) memcg = root_mem_cgroup; } - } while (!css_tryget_online(&memcg->css)); + } while (!css_tryget(&memcg->css)); rcu_read_unlock(); return memcg; } @@ -871,26 +871,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } -static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +static void __invalidate_reclaim_iterators(struct mem_cgroup *from, + struct mem_cgroup *dead_memcg) { - struct mem_cgroup *memcg = dead_memcg; struct mem_cgroup_reclaim_iter *iter; struct mem_cgroup_per_node *mz; int nid; int i; - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - for_each_node(nid) { - mz = mem_cgroup_nodeinfo(memcg, nid); - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + for_each_node(nid) { + mz = mem_cgroup_nodeinfo(from, nid); + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); } } } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup *last; + + do { + __invalidate_reclaim_iterators(memcg, dead_memcg); + last = memcg; + } while ((memcg = parent_mem_cgroup(memcg))); + + /* + * When cgruop1 non-hierarchy mode is used, + * parent_mem_cgroup() does not walk all the way up to the + * cgroup root (root_mem_cgroup). So we have to handle + * dead_memcg from cgroup root separately. + */ + if (last != root_mem_cgroup) + __invalidate_reclaim_iterators(root_mem_cgroup, + dead_memcg); +} + /* * Iteration constructs for visiting all cgroups (under a tree). If * loops are exited prematurely (break), mem_cgroup_iter_break() must @@ -2333,6 +2352,16 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { + + /* + * Enforce __GFP_NOFAIL allocation because callers are not + * prepared to see failures and likely do not have any failure + * handling code. + */ + if (gfp & __GFP_NOFAIL) { + page_counter_charge(&memcg->kmem, nr_pages); + return 0; + } cancel_charge(memcg, nr_pages); return -ENOMEM; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ef080fa682a6..001b6bfccbfb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1701,19 +1701,17 @@ static int soft_offline_in_use_page(struct page *page, int flags) struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) { - lock_page(hpage); - if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { - unlock_page(hpage); - if (!PageAnon(hpage)) + lock_page(page); + if (!PageAnon(page) || unlikely(split_huge_page(page))) { + unlock_page(page); + if (!PageAnon(page)) pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); else pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); - put_hwpoison_page(hpage); + put_hwpoison_page(page); return -EBUSY; } - unlock_page(hpage); - get_hwpoison_page(page); - put_hwpoison_page(hpage); + unlock_page(page); } if (PageHuge(page)) diff --git a/mm/memory.c b/mm/memory.c index fb9f7737c1ff..e9bce27bc18c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1804,14 +1804,21 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared - * mapping and we expect the PFNs to match. + * mapping and we expect the PFNs to match. If they + * don't match, we are likely racing with block + * allocation and mapping invalidation so just skip the + * update. */ - if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; - entry = *pte; - goto out_mkwrite; - } else - goto out_unlock; + } + entry = pte_mkyoung(*pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, addr, pte, entry, 1)) + update_mmu_cache(vma, addr, pte); + } + goto out_unlock; } /* Ok, finally just insert the thing.. */ @@ -1820,7 +1827,6 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); -out_mkwrite: if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c7c74a927d6f..2d6626ab29d1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -343,12 +343,8 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - struct mem_section *ms; - for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(start_pfn); - - if (unlikely(!valid_section(ms))) + if (unlikely(!pfn_to_online_page(start_pfn))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) @@ -368,15 +364,12 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - struct mem_section *ms; unsigned long pfn; /* pfn is the end pfn of a memory section. */ pfn = end_pfn - 1; for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); - - if (unlikely(!valid_section(ms))) + if (unlikely(!pfn_to_online_page(pfn))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) @@ -398,7 +391,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ unsigned long zone_end_pfn = z; unsigned long pfn; - struct mem_section *ms; int nid = zone_to_nid(zone); zone_span_writelock(zone); @@ -436,9 +428,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, */ pfn = zone_start_pfn; for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); - - if (unlikely(!valid_section(ms))) + if (unlikely(!pfn_to_online_page(pfn))) continue; if (page_zone(pfn_to_page(pfn)) != zone) @@ -459,70 +449,33 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writeunlock(zone); } -static void shrink_pgdat_span(struct pglist_data *pgdat, - unsigned long start_pfn, unsigned long end_pfn) +static void update_pgdat_span(struct pglist_data *pgdat) { - unsigned long pgdat_start_pfn = pgdat->node_start_pfn; - unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ - unsigned long pgdat_end_pfn = p; - unsigned long pfn; - struct mem_section *ms; - int nid = pgdat->node_id; - - if (pgdat_start_pfn == start_pfn) { - /* - * If the section is smallest section in the pgdat, it need - * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. - * In this case, we find second smallest valid mem_section - * for shrinking zone. - */ - pfn = find_smallest_section_pfn(nid, NULL, end_pfn, - pgdat_end_pfn); - if (pfn) { - pgdat->node_start_pfn = pfn; - pgdat->node_spanned_pages = pgdat_end_pfn - pfn; - } - } else if (pgdat_end_pfn == end_pfn) { - /* - * If the section is biggest section in the pgdat, it need - * shrink pgdat->node_spanned_pages. - * In this case, we find second biggest valid mem_section for - * shrinking zone. - */ - pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, - start_pfn); - if (pfn) - pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; - } - - /* - * If the section is not biggest or smallest mem_section in the pgdat, - * it only creates a hole in the pgdat. So in this case, we need not - * change the pgdat. - * But perhaps, the pgdat has only hole data. Thus it check the pgdat - * has only hole or not. - */ - pfn = pgdat_start_pfn; - for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); + unsigned long node_start_pfn = 0, node_end_pfn = 0; + struct zone *zone; - if (unlikely(!valid_section(ms))) - continue; + for (zone = pgdat->node_zones; + zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { + unsigned long zone_end_pfn = zone->zone_start_pfn + + zone->spanned_pages; - if (pfn_to_nid(pfn) != nid) + /* No need to lock the zones, they can't change. */ + if (!zone->spanned_pages) continue; - - /* If the section is current section, it continues the loop */ - if (start_pfn == pfn) + if (!node_end_pfn) { + node_start_pfn = zone->zone_start_pfn; + node_end_pfn = zone_end_pfn; continue; + } - /* If we find valid section, we have nothing to do */ - return; + if (zone_end_pfn > node_end_pfn) + node_end_pfn = zone_end_pfn; + if (zone->zone_start_pfn < node_start_pfn) + node_start_pfn = zone->zone_start_pfn; } - /* The pgdat has no valid section */ - pgdat->node_start_pfn = 0; - pgdat->node_spanned_pages = 0; + pgdat->node_start_pfn = node_start_pfn; + pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; } static void __remove_zone(struct zone *zone, unsigned long start_pfn) @@ -531,9 +484,19 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) int nr_pages = PAGES_PER_SECTION; unsigned long flags; +#ifdef CONFIG_ZONE_DEVICE + /* + * Zone shrinking code cannot properly deal with ZONE_DEVICE. So + * we will not try to shrink the zones - which is okay as + * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. + */ + if (zone_idx(zone) == ZONE_DEVICE) + return; +#endif + pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); - shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); + update_pgdat_span(pgdat); pgdat_resize_unlock(zone->zone_pgdat, &flags); } @@ -1110,7 +1073,12 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } -/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +/* + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations (triggered e.g. by sysfs). + * + * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG + */ int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; @@ -1203,9 +1171,9 @@ out: mem_hotplug_done(); return ret; } -EXPORT_SYMBOL_GPL(add_memory_resource); -int __ref add_memory(int nid, u64 start, u64 size) +/* requires device_hotplug_lock, see add_memory_resource() */ +int __ref __add_memory(int nid, u64 start, u64 size) { struct resource *res; int ret; @@ -1219,6 +1187,17 @@ int __ref add_memory(int nid, u64 start, u64 size) release_memory_resource(res); return ret; } + +int add_memory(int nid, u64 start, u64 size) +{ + int rc; + + lock_device_hotplug(); + rc = __add_memory(nid, start, size); + unlock_device_hotplug(); + + return rc; +} EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE @@ -1256,7 +1235,8 @@ static struct page *next_active_pageblock(struct page *page) bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { struct page *page = pfn_to_page(start_pfn); - struct page *end_page = page + nr_pages; + unsigned long end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(page))); + struct page *end_page = pfn_to_page(end_pfn); /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { @@ -1296,6 +1276,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, i++; if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn) continue; + /* Check if we got outside of the zone */ + if (zone && !zone_spans_pfn(zone, pfn + i)) + return 0; page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1b93535d875f..a37cfa88669e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -305,7 +305,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) else { nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, *nodes); - pol->w.cpuset_mems_allowed = tmp; + pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) @@ -349,7 +349,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; @@ -427,6 +427,13 @@ static inline bool queue_pages_required(struct page *page, return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } +/* + * queue_pages_pmd() has three possible return values: + * 1 - pages are placed on the right node or queued successfully. + * 0 - THP was split. + * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing + * page was already on a node that does not follow the policy. + */ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -436,7 +443,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long flags; if (unlikely(is_pmd_migration_entry(*pmd))) { - ret = 1; + ret = -EIO; goto unlock; } page = pmd_page(*pmd); @@ -462,8 +469,15 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = 1; flags = qp->flags; /* go to thp migration */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + if (!vma_migratable(walk->vma)) { + ret = -EIO; + goto unlock; + } + migrate_page_add(page, qp->pagelist, flags); + } else + ret = -EIO; unlock: spin_unlock(ptl); out: @@ -488,8 +502,10 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { ret = queue_pages_pmd(pmd, ptl, addr, end, walk); - if (ret) + if (ret > 0) return 0; + else if (ret < 0) + return ret; } if (pmd_trans_unstable(pmd)) @@ -526,11 +542,16 @@ retry: goto retry; } - migrate_page_add(page, qp->pagelist, flags); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + if (!vma_migratable(vma)) + break; + migrate_page_add(page, qp->pagelist, flags); + } else + break; } pte_unmap_unlock(pte - 1, ptl); cond_resched(); - return 0; + return addr != end ? -EIO : 0; } static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, @@ -600,7 +621,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (!vma_migratable(vma)) + /* + * Need check MPOL_MF_STRICT to return -EIO if possible + * regardless of vma_migratable + */ + if (!vma_migratable(vma) && + !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) @@ -627,7 +653,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } /* queue pages from current vma */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & MPOL_MF_VALID) return 0; return 1; } @@ -1325,7 +1351,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1486,7 +1512,7 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, int uninitialized_var(pval); nodemask_t nodes; - if (nmask != NULL && maxnode < MAX_NUMNODES) + if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; err = do_get_mempolicy(&pval, &nodes, addr, flags); @@ -1515,7 +1541,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) diff --git a/mm/migrate.c b/mm/migrate.c index 8c57cdd77ba5..9a3ce8847308 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -247,10 +247,8 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, pte = swp_entry_to_pte(entry); } else if (is_device_public_page(new)) { pte = pte_mkdevmap(pte); - flush_dcache_page(new); } - } else - flush_dcache_page(new); + } #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { @@ -971,6 +969,13 @@ static int move_to_new_page(struct page *newpage, struct page *page, */ if (!PageMappingFlags(page)) page->mapping = NULL; + + if (unlikely(is_zone_device_page(newpage))) { + if (is_device_public_page(newpage)) + flush_dcache_page(newpage); + } else + flush_dcache_page(newpage); + } out: return rc; @@ -1303,6 +1308,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, lock_page(hpage); } + /* + * Check for pages which are in the process of being freed. Without + * page_mapping() set, hugetlbfs specific move page routine will not + * be called and we could leak usage counts for subpools. + */ + if (page_private(hpage) && !page_mapping(hpage)) { + rc = -EBUSY; + goto out_unlock; + } + if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); @@ -1334,6 +1349,7 @@ put_anon: set_page_owner_migrate_reason(new_hpage, reason); } +out_unlock: unlock_page(hpage); out: if (rc != -EAGAIN) diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..2732c8c0764c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -169,6 +169,22 @@ out: return 0; } +static inline bool can_do_mincore(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * Reveal pagecache information only for non-anonymous mappings that + * correspond to the files the calling process could (if tried) open + * for writing; otherwise we'd be including shared non-exclusive + * mappings, which opens a side channel. + */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; - mincore_walk.mm = vma->vm_mm; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + if (!can_do_mincore(vma)) { + unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE); + memset(vec, 1, pages); + return pages; + } + mincore_walk.mm = vma->vm_mm; err = walk_page_range(addr, end, &mincore_walk); if (err < 0) return err; diff --git a/mm/mlock.c b/mm/mlock.c index 46af369c13e5..1f9ee86672e8 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -629,11 +629,11 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, * is also counted. * Return value: previously mlocked page counts */ -static int count_mm_mlocked_page_nr(struct mm_struct *mm, +static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; - int count = 0; + unsigned long count = 0; if (mm == NULL) mm = current->mm; diff --git a/mm/mmap.c b/mm/mmap.c index 2398776195d2..8c6ed06983f9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -45,6 +45,7 @@ #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> +#include <linux/sched/mm.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -88,12 +89,6 @@ static void unmap_region(struct mm_struct *mm, * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and - * MAP_PRIVATE: - * r: (no) no - * w: (no) no - * x: (yes) yes */ pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, @@ -2348,12 +2343,11 @@ int expand_downwards(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; - int error; + int error = 0; address &= PAGE_MASK; - error = security_mmap_addr(address); - if (error) - return error; + if (address < mmap_min_addr) + return -EPERM; /* Enforce stack_guard_gap */ prev = vma->vm_prev; @@ -2449,7 +2443,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + /* don't alter vm_end if the coredump is running */ + if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2475,6 +2470,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; + /* don't alter vm_start if the coredump is running */ + if (!mmget_still_valid(mm)) + return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 314285284e6e..70d0efb06374 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -267,7 +267,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, * thanks to mm_take_all_locks(). */ spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); + hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); mm_drop_all_locks(mm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fe0aac2348e5..7a5c0b229c6a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1050,9 +1050,10 @@ bool out_of_memory(struct oom_control *oc) * The OOM killer does not compensate for IO-less reclaim. * pagefault_out_of_memory lost its gfp context so we have to * make sure exclude 0 mask - all other users should have at least - * ___GFP_DIRECT_RECLAIM to get here. + * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to + * invoke the OOM killer even if it is a GFP_NOFS allocation. */ - if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) return true; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e001de5ac50c..a40c075fd8f1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2150,6 +2150,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback); * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). + * + * To avoid deadlocks between range_cyclic writeback and callers that hold + * pages in PageWriteback to aggregate IO until write_cache_pages() returns, + * we do not loop back to the start of the file. Doing so causes a page + * lock/page writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping back to the start + * of the file violates that rule and causes deadlocks. */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -2164,7 +2171,6 @@ int write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; int tag; @@ -2172,23 +2178,17 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; -retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; @@ -2296,17 +2296,14 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } + + /* + * If we hit the last page and there is more work to be done: wrap + * back the index back to the start of the file for the next + * time we are called. + */ + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2f365f40433..6f71518a4558 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1764,8 +1764,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); - kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); + kernel_poison_pages(page, 1 << order, 1); set_page_owner(page, order, gfp_flags); } @@ -4325,11 +4325,11 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - page_ref_add(page, size - 1); + page_ref_add(page, size); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); - nc->pagecnt_bias = size; + nc->pagecnt_bias = size + 1; nc->offset = size; } @@ -4345,10 +4345,10 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - set_page_count(page, size); + set_page_count(page, size + 1); /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; + nc->pagecnt_bias = size + 1; offset = size - fragsz; } @@ -5727,13 +5727,15 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long *zone_end_pfn, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, zone_start_pfn, zone_end_pfn); diff --git a/mm/page_ext.c b/mm/page_ext.c index 2c16216c29b6..dece2bdf86fe 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -271,6 +271,7 @@ static void free_page_ext(void *addr) table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); + kmemleak_free(addr); free_pages_exact(addr, table_size); } } @@ -396,10 +397,8 @@ void __init page_ext_init(void) * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... - * - * Take into account DEFERRED_STRUCT_PAGE_INIT. */ - if (early_pfn_to_nid(pfn) != nid) + if (pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; diff --git a/mm/page_idle.c b/mm/page_idle.c index e412a63b2b74..504684181827 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -136,7 +136,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, end_pfn = pfn + count * BITS_PER_BYTE; if (end_pfn > max_pfn) - end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + end_pfn = max_pfn; for (; pfn < end_pfn; pfn++) { bit = pfn % BITMAP_CHUNK_BITS; @@ -181,7 +181,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, end_pfn = pfn + count * BITS_PER_BYTE; if (end_pfn > max_pfn) - end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + end_pfn = max_pfn; for (; pfn < end_pfn; pfn++) { bit = pfn % BITMAP_CHUNK_BITS; diff --git a/mm/page_owner.c b/mm/page_owner.c index a71fe4c623ef..6ac05a6ff2d1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -273,7 +273,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { - if (!pfn_valid(pfn)) { + page = pfn_to_online_page(pfn); + if (!page) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; } @@ -281,13 +282,13 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - page = pfn_to_page(pfn); pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { if (!pfn_valid_within(pfn)) continue; + /* The pageblock is online, no need to recheck. */ page = pfn_to_page(pfn); if (page_zone(page) != zone) diff --git a/mm/page_poison.c b/mm/page_poison.c index e83fd44867de..a7ba9e315a12 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,6 +6,7 @@ #include <linux/page_ext.h> #include <linux/poison.h> #include <linux/ratelimit.h> +#include <linux/kasan.h> static bool want_page_poisoning __read_mostly; @@ -34,7 +35,10 @@ static void poison_page(struct page *page) { void *addr = kmap_atomic(page); + /* KASAN still think the page is in-use, so skip it. */ + kasan_disable_current(); memset(addr, PAGE_POISON, PAGE_SIZE); + kasan_enable_current(); kunmap_atomic(addr); } diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 0d88d7bd5706..c22d959105b6 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -50,6 +50,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; struct pcpu_chunk *chunk; struct page *pages; + unsigned long flags; int i; chunk = pcpu_alloc_chunk(gfp); @@ -68,9 +69,9 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_populated(chunk, 0, nr_pages, false); - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(chunk->base_addr); diff --git a/mm/percpu.c b/mm/percpu.c index 3074148b7e0d..9beb84800d8d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -984,7 +984,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, /* * Search to find a fit. */ - end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS; + end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, + pcpu_chunk_map_bits(chunk)); bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, alloc_bits, align_mask); if (bit_off >= end) @@ -1702,6 +1703,7 @@ void free_percpu(void __percpu *ptr) struct pcpu_chunk *chunk; unsigned long flags; int off; + bool need_balance = false; if (!ptr) return; @@ -1723,7 +1725,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - pcpu_schedule_balance_work(); + need_balance = true; break; } } @@ -1731,6 +1733,9 @@ void free_percpu(void __percpu *ptr) trace_percpu_free_percpu(chunk->base_addr, off, ptr); spin_unlock_irqrestore(&pcpu_lock, flags); + + if (need_balance) + pcpu_schedule_balance_work(); } EXPORT_SYMBOL_GPL(free_percpu); @@ -2507,8 +2512,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, ai->groups[group].base_offset = areas[group] - base; } - pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); @@ -2629,8 +2634,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, ai->static_size, + pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", + unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); rc = pcpu_setup_first_chunk(ai, vm.addr); diff --git a/mm/shmem.c b/mm/shmem.c index 6c10f1d92251..69106c600692 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2657,31 +2657,33 @@ static void shmem_tag_pins(struct address_space *mapping) void **slot; pgoff_t start; struct page *page; + unsigned int tagged = 0; lru_add_drain(); start = 0; - rcu_read_lock(); + spin_lock_irq(&mapping->tree_lock); radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - page = radix_tree_deref_slot(slot); + page = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!page || radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { slot = radix_tree_iter_retry(&iter); continue; } } else if (page_count(page) - page_mapcount(page) > 1) { - spin_lock_irq(&mapping->tree_lock); radix_tree_tag_set(&mapping->page_tree, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); } - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + if (++tagged % 1024) + continue; + + slot = radix_tree_iter_resume(slot, &iter); + spin_unlock_irq(&mapping->tree_lock); + cond_resched(); + spin_lock_irq(&mapping->tree_lock); } - rcu_read_unlock(); + spin_unlock_irq(&mapping->tree_lock); } /* @@ -2893,7 +2895,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, } shmem_falloc.waitq = &shmem_falloc_waitq; - shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; @@ -3096,16 +3098,20 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - int ret; + int ret = 0; /* * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. + * But if an O_TMPFILE file is linked into the tmpfs, the + * first link must skip that, to get the accounting right. */ - ret = shmem_reserve_inode(inode->i_sb); - if (ret) - goto out; + if (inode->i_nlink) { + ret = shmem_reserve_inode(inode->i_sb); + if (ret) + goto out; + } dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); diff --git a/mm/slab.c b/mm/slab.c index 09df506ae830..a04aeae42306 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -563,14 +563,6 @@ static void start_cpu_timer(int cpu) static void init_arraycache(struct array_cache *ac, int limit, int batch) { - /* - * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transferred to another - * cache the pointers are not cleared and they could be counted as - * valid references during a kmemleak scan. Therefore, kmemleak must - * not scan such objects. - */ - kmemleak_no_scan(ac); if (ac) { ac->avail = 0; ac->limit = limit; @@ -586,6 +578,14 @@ static struct array_cache *alloc_arraycache(int node, int entries, struct array_cache *ac = NULL; ac = kmalloc_node(memsize, gfp, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transferred to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. + */ + kmemleak_no_scan(ac); init_arraycache(ac, entries, batchcount); return ac; } @@ -680,6 +680,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, alc = kmalloc_node(memsize, gfp, node); if (alc) { + kmemleak_no_scan(alc); init_arraycache(&alc->ac, entries, batch); spin_lock_init(&alc->lock); } @@ -4298,7 +4299,8 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); + struct kmem_cache *cachep = list_entry(p, struct kmem_cache, + root_caches_node); struct page *page; struct kmem_cache_node *n; const char *name; @@ -4318,8 +4320,12 @@ static int leaks_show(struct seq_file *m, void *p) * whole processing. */ do { - set_store_user_clean(cachep); drain_cpu_caches(cachep); + /* + * drain_cpu_caches() could make kmemleak_object and + * debug_objects_cache dirty, so reset afterwards. + */ + set_store_user_clean(cachep); x[1] = 0; diff --git a/mm/slub.c b/mm/slub.c index 220d42e592ef..07aeb129f3f8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4790,7 +4790,17 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - get_online_mems(); + /* + * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" + * already held which will conflict with an existing lock order: + * + * mem_hotplug_lock->slab_mutex->kernfs_mutex + * + * We don't really need mem_hotplug_lock (to hold off + * slab_mem_going_offline_callback) here because slab's memory hot + * unplug code doesn't destroy the kmem_cache->node[] data. + */ + #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { struct kmem_cache_node *n; @@ -4831,7 +4841,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } diff --git a/mm/usercopy.c b/mm/usercopy.c index a9852b24715d..f8d74e09f8e4 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -15,6 +15,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> +#include <linux/highmem.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -121,7 +122,7 @@ static inline const char *check_kernel_text_object(const void *ptr, static inline const char *check_bogus_address(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. */ - if ((unsigned long)ptr + n < (unsigned long)ptr) + if ((unsigned long)ptr + (n - 1) < (unsigned long)ptr) return "<wrapped address>"; /* Reject if NULL or ZERO-allocation. */ @@ -203,7 +204,12 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, if (!virt_addr_valid(ptr)) return NULL; - page = virt_to_head_page(ptr); + /* + * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the + * highmem page or fallback to virt_to_page(). The following + * is effectively a highmem-aware virt_to_head_page(). + */ + page = compound_head(kmap_to_page((void *)ptr)); /* Check slab allocator for flags and size. */ if (PageSlab(page)) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 5d70fdbd8bc0..d3b4a78d79b6 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -272,8 +272,7 @@ retry: */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; - hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, - idx, dst_addr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9ff21a12ea00..0b8852d80f44 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -498,7 +498,11 @@ nocache: } found: - if (addr + size > vend) + /* + * Check also calculated address against the vstart, + * because it can be 0 because of big align request. + */ + if (addr + size > vend || addr < vstart) goto overflow; va->va_start = addr; @@ -1762,6 +1766,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return NULL; /* + * First make sure the mappings are removed from all page-tables + * before they are freed. + */ + vmalloc_sync_all(); + + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. @@ -2262,7 +2272,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (kaddr + size > area->addr + area->size) + if (kaddr + size > area->addr + get_vm_area_size(area)) return -EINVAL; do { @@ -2310,6 +2320,9 @@ EXPORT_SYMBOL(remap_vmalloc_range); /* * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. + * + * The purpose of this function is to make sure the vmalloc area + * mappings are identical in all page-tables in the system. */ void __weak vmalloc_sync_all(void) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 9734e62654fa..0cc3c1eb15f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -502,6 +502,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, sc.nid = 0; freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); + /* + * Bail out if someone want to register a new shrinker to + * prevent the regsitration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } up_read(&shrinker_rwsem); @@ -1384,7 +1393,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, list_for_each_entry_safe(page, next, page_list, lru) { if (page_is_file_cache(page) && !PageDirty(page) && - !__PageMovable(page)) { + !__PageMovable(page) && !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } @@ -2111,8 +2120,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct mem_cgroup *memcg, - struct scan_control *sc, bool actual_reclaim) + struct scan_control *sc, bool trace) { enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -2132,17 +2140,13 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - if (memcg) - refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); - else - refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); - /* * When refaults are being observed, it means a new workingset * is being established. Disable active list protection to get * rid of the stale workingset quickly. */ - if (file && actual_reclaim && lruvec->refaults != refaults) { + refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); + if (file && lruvec->refaults != refaults) { inactive_ratio = 0; } else { gb = (inactive + active) >> (30 - PAGE_SHIFT); @@ -2152,7 +2156,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive_ratio = 1; } - if (actual_reclaim) + if (trace) trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, @@ -2162,12 +2166,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct mem_cgroup *memcg, - struct scan_control *sc) + struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), - memcg, sc, true)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2267,7 +2269,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anonymous pages on the LRU in eligible zones. * Otherwise, the small LRU gets thrashed. */ - if (!inactive_list_is_low(lruvec, false, memcg, sc, false) && + if (!inactive_list_is_low(lruvec, false, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_ANON; @@ -2285,7 +2287,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && + if (!inactive_list_is_low(lruvec, true, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2438,7 +2440,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, memcg, sc); + lruvec, sc); } } @@ -2505,7 +2507,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, memcg, sc, true)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2830,12 +2832,8 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) unsigned long refaults; struct lruvec *lruvec; - if (memcg) - refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); - else - refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); - lruvec = mem_cgroup_lruvec(pgdat, memcg); + refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); lruvec->refaults = refaults; } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); } @@ -3183,7 +3181,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, memcg, sc, true)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -3441,19 +3439,18 @@ out: } /* - * pgdat->kswapd_classzone_idx is the highest zone index that a recent - * allocation request woke kswapd for. When kswapd has not woken recently, - * the value is MAX_NR_ZONES which is not a valid index. This compares a - * given classzone and returns it or the highest classzone index kswapd - * was recently woke for. + * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be + * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not + * a valid index then either kswapd runs for first time or kswapd couldn't sleep + * after previous reclaim attempt (node is still unbalanced). In that case + * return the zone index of the previous kswapd reclaim cycle. */ static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, - enum zone_type classzone_idx) + enum zone_type prev_classzone_idx) { if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) - return classzone_idx; - - return max(pgdat->kswapd_classzone_idx, classzone_idx); + return prev_classzone_idx; + return pgdat->kswapd_classzone_idx; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, @@ -3594,7 +3591,7 @@ kswapd_try_sleep: /* Read the new order and classzone_idx */ alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = kswapd_classzone_idx(pgdat, 0); + classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); pgdat->kswapd_order = 0; pgdat->kswapd_classzone_idx = MAX_NR_ZONES; @@ -3645,8 +3642,12 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, - classzone_idx); + + if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) + pgdat->kswapd_classzone_idx = classzone_idx; + else + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, + classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; diff --git a/mm/vmstat.c b/mm/vmstat.c index 6389e876c7a7..e2197b03da57 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1201,13 +1201,8 @@ const char * const vmstat_text[] = { #endif #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH -#ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#else - "", /* nr_tlb_remote_flush */ - "", /* nr_tlb_remote_flush_received */ -#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ @@ -1810,12 +1805,13 @@ static bool need_update(int cpu) /* * The fast way of checking if there are any vmstat diffs. - * This works because the diffs are byte sized items. */ - if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * + sizeof(p->vm_stat_diff[0]))) return true; #ifdef CONFIG_NUMA - if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) + if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * + sizeof(p->vm_numa_stat_diff[0]))) return true; #endif } @@ -1956,7 +1952,7 @@ void __init init_mm_internals(void) #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations); - proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations); + proc_create("pagetypeinfo", 0400, NULL, &pagetypeinfo_file_operations); proc_create("vmstat", 0444, NULL, &vmstat_file_operations); proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations); #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 685049a9048d..c6df483b3751 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -52,6 +52,7 @@ #include <linux/zpool.h> #include <linux/mount.h> #include <linux/migrate.h> +#include <linux/wait.h> #include <linux/pagemap.h> #define ZSPAGE_MAGIC 0x58 @@ -267,6 +268,10 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; + /* A wait queue for when migration races with async_free_zspage() */ + struct wait_queue_head migration_wait; + atomic_long_t isolated_pages; + bool destroying; #endif }; @@ -1878,6 +1883,31 @@ static void dec_zspage_isolation(struct zspage *zspage) zspage->isolated--; } +static void putback_zspage_deferred(struct zs_pool *pool, + struct size_class *class, + struct zspage *zspage) +{ + enum fullness_group fg; + + fg = putback_zspage(class, zspage); + if (fg == ZS_EMPTY) + schedule_work(&pool->free_work); + +} + +static inline void zs_pool_dec_isolated(struct zs_pool *pool) +{ + VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); + atomic_long_dec(&pool->isolated_pages); + /* + * There's no possibility of racing, since wait_for_isolated_drain() + * checks the isolated count under &class->lock after enqueuing + * on migration_wait. + */ + if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) + wake_up_all(&pool->migration_wait); +} + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1947,6 +1977,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode) */ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { get_zspage_mapping(zspage, &class_idx, &fullness); + atomic_long_inc(&pool->isolated_pages); remove_zspage(class, zspage, fullness); } @@ -2046,8 +2077,21 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, * Page migration is done so let's putback isolated zspage to * the list if @page is final isolated subpage in the zspage. */ - if (!is_zspage_isolated(zspage)) - putback_zspage(class, zspage); + if (!is_zspage_isolated(zspage)) { + /* + * We cannot race with zs_destroy_pool() here because we wait + * for isolation to hit zero before we start destroying. + * Also, we ensure that everyone can see pool->destroying before + * we start waiting. + */ + putback_zspage_deferred(pool, class, zspage); + zs_pool_dec_isolated(pool); + } + + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); + } reset_page(page); put_page(page); @@ -2093,13 +2137,12 @@ void zs_page_putback(struct page *page) spin_lock(&class->lock); dec_zspage_isolation(zspage); if (!is_zspage_isolated(zspage)) { - fg = putback_zspage(class, zspage); /* * Due to page_lock, we cannot free zspage immediately * so let's defer. */ - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); + putback_zspage_deferred(pool, class, zspage); + zs_pool_dec_isolated(pool); } spin_unlock(&class->lock); } @@ -2123,8 +2166,36 @@ static int zs_register_migration(struct zs_pool *pool) return 0; } +static bool pool_isolated_are_drained(struct zs_pool *pool) +{ + return atomic_long_read(&pool->isolated_pages) == 0; +} + +/* Function for resolving migration */ +static void wait_for_isolated_drain(struct zs_pool *pool) +{ + + /* + * We're in the process of destroying the pool, so there are no + * active allocations. zs_page_isolate() fails for completely free + * zspages, so we need only wait for the zs_pool's isolated + * count to hit zero. + */ + wait_event(pool->migration_wait, + pool_isolated_are_drained(pool)); +} + static void zs_unregister_migration(struct zs_pool *pool) { + pool->destroying = true; + /* + * We need a memory barrier here to ensure global visibility of + * pool->destroying. Thus pool->isolated pages will either be 0 in which + * case we don't care, or it will be > 0 and pool->destroying will + * ensure that we wake up once isolation hits 0. + */ + smp_mb(); + wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @@ -2365,6 +2436,10 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pool->migration_wait); +#endif + if (create_cache(pool)) goto err; |