From e43c3afb367112a5b357f9adfac7817255129c88 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 29 Sep 2009 13:16:20 +0800 Subject: HWPOISON: return early on non-LRU pages Right now we have some trouble with non atomic access to page flags when locking the page. To plug this hole for now, limit error recovery to LRU pages for now. This could be better fixed by defining a suitable protocol, but let's go this simple way for now This avoids unnecessary races with __set_page_locked() and __SetPageSlab*() and maybe more non-atomic page flag operations. This loses isolated pages which are currently in page reclaim, but these are relatively limited compared to the total memory. Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen [AK: new description, bug fixes, cleanups] --- mm/memory-failure.c | 49 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 729d4b15b645..e17ec3f1c637 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -370,9 +370,6 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) int ret = FAILED; struct address_space *mapping; - if (!isolate_lru_page(p)) - page_cache_release(p); - /* * For anonymous pages we're done the only reference left * should be the one m_f() holds. @@ -498,30 +495,18 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { - int ret = FAILED; - ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!isolate_lru_page(p)) { - page_cache_release(p); - ret = DELAYED; - } - - return ret; + return DELAYED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { - int ret = FAILED; - - if (!isolate_lru_page(p)) { - page_cache_release(p); - ret = RECOVERED; - } delete_from_swap_cache(p); - return ret; + + return RECOVERED; } /* @@ -611,8 +596,6 @@ static struct page_state { { 0, 0, "unknown page state", me_unknown }, }; -#undef lru - static void action_result(unsigned long pfn, char *msg, int result) { struct page *page = NULL; @@ -664,9 +647,6 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, if (PageReserved(p) || PageCompound(p) || PageSlab(p)) return; - if (!PageLRU(p)) - lru_add_drain_all(); - /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. @@ -738,6 +718,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, int __memory_failure(unsigned long pfn, int trapno, int ref) { + unsigned long lru_flag; struct page_state *ps; struct page *p; int res; @@ -774,6 +755,24 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) return PageBuddy(compound_head(p)) ? 0 : -EBUSY; } + /* + * We ignore non-LRU pages for good reasons. + * - PG_locked is only well defined for LRU pages and a few others + * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageSlab*() (and more non-atomic ops) + * The check (unnecessarily) ignores LRU pages being isolated and + * walked by the page reclaim code, however that's not a big loss. + */ + if (!PageLRU(p)) + lru_add_drain_all(); + lru_flag = p->flags & lru; + if (isolate_lru_page(p)) { + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; + } + page_cache_release(p); + /* * Lock the page and wait for writeback to finish. * It's very difficult to mess with pages currently under IO @@ -790,7 +789,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) /* * Torn down by someone else? */ - if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, "already truncated LRU", IGNORED); res = 0; goto out; @@ -798,7 +797,7 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) res = -EBUSY; for (ps = error_states;; ps++) { - if ((p->flags & ps->mask) == ps->res) { + if (((p->flags | lru_flag)& ps->mask) == ps->res) { res = page_action(ps, p, pfn, ref); break; } -- cgit v1.2.3 From 4779cb31c0ee3b355116745edca3f3e5fe865553 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 14 Oct 2009 01:51:41 +0200 Subject: HWPOISON: Fix page count leak in hwpoison late kill in do_swap_page When returning due to a poisoned page drop the page count. It wasn't a fatal problem because noone cares about the page count on a poisoned page (except when it wraps), but it's cleaner to fix it. Pointed out by Linus. Signed-off-by: Andi Kleen --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7e91b5f9f690..7a3b0ad5594a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2539,7 +2539,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } else if (PageHWPoison(page)) { ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - goto out; + goto out_release; } lock_page(page); @@ -2611,6 +2611,7 @@ out_nomap: pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); +out_release: page_cache_release(page); return ret; } -- cgit v1.2.3 From 01e00f880ca700376e1845cf7a2524ebe68e47d6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 13 Oct 2009 15:02:11 +0100 Subject: HWPOISON: fix oops on ksm pages Memory failure on a KSM page currently oopses on its NULL anon_vma in page_lock_anon_vma(): that may not be much worse than the consequence of ignoring it, but it is better to be consistent with how ZERO_PAGE and hugetlb pages and other awkward cases are treated. Just skip it. Signed-off-by: Hugh Dickins Signed-off-by: Andi Kleen --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e17ec3f1c637..e354b9f2f389 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -644,7 +645,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, int i; int kill = 1; - if (PageReserved(p) || PageCompound(p) || PageSlab(p)) + if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) return; /* -- cgit v1.2.3 From 7456b0405d8fc063c49628f969cdb23be060fc80 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 19 Oct 2009 08:15:01 +0200 Subject: HWPOISON: fix invalid page count in printk output The madvise injector already holds a reference when passing in a page to the memory-failure code. The code corrects for this additional reference for its checks, but the final printk output didn't. Fix that. Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/memory-failure.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e354b9f2f389..dacc64183874 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -613,13 +613,16 @@ static int page_action(struct page_state *ps, struct page *p, unsigned long pfn, int ref) { int result; + int count; result = ps->action(p, pfn); action_result(pfn, ps->msg, result); - if (page_count(p) != 1 + ref) + + count = page_count(p) - 1 - ref; + if (count != 0) printk(KERN_ERR "MCE %#lx: %s page still referenced by %d users\n", - pfn, ps->msg, page_count(p) - 1); + pfn, ps->msg, count); /* Could do more checks here if page looks ok */ /* -- cgit v1.2.3 From ed84a07a124bf3b1aab2fd7fdb6e9534838087ac Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Fri, 16 Oct 2009 07:21:36 +0000 Subject: powerpc: Limit memory hotplug support to PPC64 Book-3S machines Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 57963c6063d1..7ca3777bdede 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -129,7 +129,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC64 || SUPERH || S390) + depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 -- cgit v1.2.3 From 403a91b1659cb149dbddc5885f892734ae4542d8 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 29 Oct 2009 00:25:59 +0900 Subject: percpu: allow pcpu_alloc() to be called with IRQs off pcpu_alloc() and pcpu_extend_area_map() perform a series of spin_lock_irq()/spin_unlock_irq() calls, which make them unsafe with respect to being called from contexts which have IRQs off. This patch converts the code to perform save/restore of flags instead, making pcpu_alloc() (or __alloc_percpu() respectively) to be called from early kernel startup stage, where IRQs are off. This is needed for proper initialization of per-cpu rq_weight data from sched_init(). tj: added comment explaining why irqsave/restore is used in alloc path. Signed-off-by: Jiri Kosina Acked-by: Ingo Molnar Signed-off-by: Tejun Heo --- mm/percpu.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 6af78c1ee704..d90797160c2a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -153,7 +153,10 @@ static int pcpu_reserved_chunk_limit; * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. + * allocations are done using GFP_KERNEL with pcpu_lock released. In + * general, percpu memory can't be allocated with irq off but + * irqsave/restore are still used in alloc path so that it can be used + * from early init path - sched_init() specifically. * * Free path accesses and alters only the index data structures, so it * can be safely called from atomic context. When memory needs to be @@ -366,7 +369,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * RETURNS: * 0 if noop, 1 if successfully extended, -errno on failure. */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk) +static int pcpu_extend_area_map(struct pcpu_chunk *chunk, unsigned long *flags) { int new_alloc; int *new; @@ -376,7 +379,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) if (chunk->map_alloc >= chunk->map_used + 2) return 0; - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, *flags); new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < chunk->map_used + 2) @@ -384,7 +387,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); if (!new) { - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, *flags); return -ENOMEM; } @@ -393,7 +396,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) * could have happened inbetween, so map_used couldn't have * grown. */ - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, *flags); BUG_ON(new_alloc < chunk->map_used + 2); size = chunk->map_alloc * sizeof(chunk->map[0]); @@ -1047,6 +1050,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) struct pcpu_chunk *chunk; const char *err; int slot, off; + unsigned long flags; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " @@ -1055,13 +1059,13 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) } mutex_lock(&pcpu_alloc_mutex); - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) { + pcpu_extend_area_map(chunk, &flags) < 0) { err = "failed to extend area map of reserved chunk"; goto fail_unlock; } @@ -1079,7 +1083,7 @@ restart: if (size > chunk->contig_hint) continue; - switch (pcpu_extend_area_map(chunk)) { + switch (pcpu_extend_area_map(chunk, &flags)) { case 0: break; case 1: @@ -1096,7 +1100,7 @@ restart: } /* hmmm... no space left, create a new chunk */ - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); chunk = alloc_pcpu_chunk(); if (!chunk) { @@ -1104,16 +1108,16 @@ restart: goto fail_unlock_mutex; } - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); goto restart; area_found: - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); /* populate, map and clear the area */ if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; @@ -1125,7 +1129,7 @@ area_found: return __addr_to_pcpu_ptr(chunk->base_addr + off); fail_unlock: - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); if (warn_limit) { -- cgit v1.2.3 From 592b09a42fc3ae6737a0f3ecf4fee42ecd0296f8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Oct 2009 11:46:12 +0100 Subject: backing-dev: ensure that a removed bdi no longer has super_block referencing it When the bdi is being removed, we have to ensure that no super_blocks currently have that cached in sb->s_bdi. Normally this is ensured by the sb having a longer life span than the bdi, but if the device is suddenly yanked, we have to kill this reference. sb->s_bdi is pointed to freed memory at that point. This fixes a problem with sync(1) hanging when a USB stick is pulled without cleanly umounting it first. Reported-by: Pavel Machek Signed-off-by: Jens Axboe --- mm/backing-dev.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5a37e2055717..1065b715ef64 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -610,6 +610,21 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) kthread_stop(wb->task); } +/* + * This bdi is going away now, make sure that no super_blocks point to it + */ +static void bdi_prune_sb(struct backing_dev_info *bdi) +{ + struct super_block *sb; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdi == bdi) + sb->s_bdi = NULL; + } + spin_unlock(&sb_lock); +} + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { @@ -682,6 +697,7 @@ void bdi_destroy(struct backing_dev_info *bdi) spin_unlock(&inode_lock); } + bdi_prune_sb(bdi); bdi_unregister(bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) -- cgit v1.2.3 From 92f7ba70eecf4da8264a767b181cc2090f62d4ad Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 26 Oct 2009 16:49:31 -0700 Subject: hwpoison: fix oops on ksm pages Memory failure on a KSM page currently oopses on its NULL anon_vma in page_lock_anon_vma(): that may not be much worse than the consequence of ignoring it, but it is better to be consistent with how ZERO_PAGE and hugetlb pages and other awkward cases are treated. Just skip it. We could fix it for 2.6.32 at the KSM end, by putting a dummy anon_vma pointer in there; but that would get harder next time, when KSM will put a pointer to something else there (and I'm not currently planning to do any work to open that up to memory_failure). So I would prefer this simple PageKsm test, until the other exceptions are handled. Signed-off-by: Hugh Dickins Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 729d4b15b645..7fc2130d2737 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -661,7 +662,7 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, int i; int kill = 1; - if (PageReserved(p) || PageCompound(p) || PageSlab(p)) + if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) return; if (!PageLRU(p)) -- cgit v1.2.3 From 58355c7876a0754377c37c8af948b4cd423410e2 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 26 Oct 2009 16:49:35 -0700 Subject: congestion_wait(): don't use WRITE commit 8aa7e847d (Fix congestion_wait() sync/async vs read/write confusion) replace WRITE with BLK_RW_ASYNC. Unfortunately, concurrent mm development made the unchanged place accidentally. This patch fixes it too. Signed-off-by: KOSAKI Motohiro Acked-by: Jens Axboe Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 64e438898832..fbb9f6bdad6e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1088,7 +1088,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, int lumpy_reclaim = 0; while (unlikely(too_many_isolated(zone, file, sc))) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) -- cgit v1.2.3 From b76146ed1ae7d7acae1d51f9342e31d00c8d5a12 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 26 Oct 2009 16:49:52 -0700 Subject: revert "mm: oom analysis: add buffer cache information to show_free_areas()" Revert commit 71de1ccbe1fb40203edd3beb473f8580d917d2ca Author: KOSAKI Motohiro AuthorDate: Mon Sep 21 17:01:31 2009 -0700 Commit: Linus Torvalds CommitDate: Tue Sep 22 07:17:27 2009 -0700 mm: oom analysis: add buffer cache information to show_free_areas() show_free_areas() is called during page allocation failures, and page allocation failures can occur in any calling context. But nr_blockdev_pages() takes VFS locks which should not be taken from hard IRQ context (at least). The result is lockdep warnings (and deadlockability) during page allocation failures. Cc: KOSAKI Motohiro Cc: Wu Fengguang Cc: Rik van Riel Cc: David Rientjes Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bf720550b44d..cdcedf661616 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2183,7 +2183,7 @@ void show_free_areas(void) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" + " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), @@ -2196,7 +2196,6 @@ void show_free_areas(void) global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), - nr_blockdev_pages(), global_page_state(NR_FREE_PAGES), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), -- cgit v1.2.3 From 41e20983fe553b39bc2b00e07c7a379f0c86a4bc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 26 Oct 2009 16:49:53 -0700 Subject: vmscan: limit VM_EXEC protection to file pages It is possible to have !Anon but SwapBacked pages, and some apps could create huge number of such pages with MAP_SHARED|MAP_ANONYMOUS. These pages go into the ANON lru list, and hence shall not be protected: we only care mapped executable files. Failing to do so may trigger OOM. Tested-by: Christian Borntraeger Reviewed-by: Rik van Riel Signed-off-by: Wu Fengguang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index fbb9f6bdad6e..fbcac3bdcf19 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1356,7 +1356,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * IO, plus JVM can create lots of anon VM_EXEC pages, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { list_add(&page->lru, &l_active); continue; } -- cgit v1.2.3 From ab8a3e14e6f8e567560f664bbd29aefb306a274e Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 26 Oct 2009 16:49:58 -0700 Subject: mbind(): fix leak of never putback pages If mbind() receives an invalid address, do_mbind leaks a page. The following test program detects this leak. This patch fixes it. migrate_efault.c ======================================= #include #include #include #include #include #include #include static unsigned long pagesize; static void* make_hole_mapping(void) { void* addr; addr = mmap(NULL, pagesize*3, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, 0, 0); if (addr == MAP_FAILED) return NULL; /* make page populate */ memset(addr, 0, pagesize*3); /* make memory hole */ munmap(addr+pagesize, pagesize); return addr; } int main(int argc, char** argv) { void* addr; int ch; int node; struct bitmask *nmask = numa_allocate_nodemask(); int err; int node_set = 0; while ((ch = getopt(argc, argv, "n:")) != -1){ switch (ch){ case 'n': node = strtol(optarg, NULL, 0); numa_bitmask_setbit(nmask, node); node_set = 1; break; default: ; } } argc -= optind; argv += optind; if (!node_set) numa_bitmask_setbit(nmask, 0); pagesize = getpagesize(); addr = make_hole_mapping(); err = mbind(addr, pagesize*3, MPOL_BIND, nmask->maskp, nmask->size, MPOL_MF_MOVE_ALL); if (err) perror("mbind "); return 0; } ======================================= Signed-off-by: KOSAKI Motohiro Acked-by: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7dd9d9f80694..d49956d30259 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1058,7 +1058,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; - } + } else + putback_lru_pages(&pagelist); up_write(&mm->mmap_sem); mpol_put(new); -- cgit v1.2.3 From b05ca7385a2848abdc72051f832722641daed8b0 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 26 Oct 2009 16:49:59 -0700 Subject: do_mbind(): fix memory leak If migrate_prep is failed, new variable is leaked. This patch fixes it. Signed-off-by: KOSAKI Motohiro Acked-by: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d49956d30259..4545d5944243 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1024,7 +1024,7 @@ static long do_mbind(unsigned long start, unsigned long len, err = migrate_prep(); if (err) - return err; + goto mpol_out; } { NODEMASK_SCRATCH(scratch); @@ -1039,10 +1039,9 @@ static long do_mbind(unsigned long start, unsigned long len, err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } - if (err) { - mpol_put(new); - return err; - } + if (err) + goto mpol_out; + vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1062,6 +1061,7 @@ static long do_mbind(unsigned long start, unsigned long len, putback_lru_pages(&pagelist); up_write(&mm->mmap_sem); + mpol_out: mpol_put(new); return err; } -- cgit v1.2.3 From 6a7b95481d49f73991d3dbf8c1e696a24684ac05 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Oct 2009 16:50:00 -0700 Subject: vmscan: order evictable rescue in LRU putback Isolators putting a page back to the LRU do not hold the page lock, and if the page is mlocked, another thread might munlock it concurrently. Expecting this, the putback code re-checks the evictability of a page when it just moved it to the unevictable list in order to correct its decision. The problem, however, is that ordering is not garuanteed between setting PG_lru when moving the page to the list and checking PG_mlocked afterwards: #0: #1 spin_lock() if (TestClearPageMlocked()) if (PageLRU()) move to evictable list SetPageLRU() spin_unlock() if (!PageMlocked()) move to evictable list The PageMlocked() check may get reordered before SetPageLRU() in #0, resulting in #0 not moving the still mlocked page, and in #1 failing to isolate and move the page as well. The page is now stranded on the unevictable list. The race condition is very unlikely. The consequence currently is one page falling off the reclaim grid and eventually getting freed with PG_unevictable set, which triggers a warning in the page allocator. TestClearPageMlocked() in #1 already provides full memory barrier semantics. This patch adds an explicit full barrier to force ordering between SetPageLRU() and PageMlocked() so that either one of the competitors rescues the page. Signed-off-by: Johannes Weiner Reviewed-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: Mel Gorman Cc: Lee Schermerhorn Cc: Peter Zijlstra Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index fbcac3bdcf19..777af57fd8c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -544,6 +544,16 @@ redo: */ lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); + /* + * When racing with an mlock clearing (page is + * unlocked), make sure that if the other thread does + * not observe our setting of PG_lru and fails + * isolation, we see PG_mlocked cleared below and move + * the page back to the evictable list. + * + * The other side is TestClearPageMlocked(). + */ + smp_mb(); } /* -- cgit v1.2.3 From 1a83e175dc2c7be931a3ea9c7fb0769e6de55e90 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 26 Oct 2009 16:50:12 -0700 Subject: mm: fix sparsemem configuration Currently, sparsemem is only available if EXPERIMENTAL is enabled. However, it hasn't ever been marked experimental. It's been about four years since sparsemem was merged, and we have platforms which depend on it; allow architectures to decide whether sparsemem should be the default memory model. Signed-off-by: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 57963c6063d1..f791196cee8c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -67,7 +67,7 @@ config DISCONTIGMEM config SPARSEMEM def_bool y - depends on SPARSEMEM_MANUAL + depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL config FLATMEM def_bool y -- cgit v1.2.3 From c36987e2ef32e1bb7850379515f21187cba44754 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Mon, 26 Oct 2009 16:50:23 -0700 Subject: mm: don't call pte_unmap() against an improper pte There are some places where we do like: pte = pte_map(); do { (do break in some conditions) } while (pte++, ...); pte_unmap(pte - 1); But if the loop breaks at the first loop, pte_unmap() unmaps invalid pte. This patch is a fix for this problem. Signed-off-by: Daisuke Nishimura Reviewd-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7e91b5f9f690..60ea601e03ea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -641,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; @@ -654,6 +655,8 @@ again: src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + orig_src_pte = src_pte; + orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { @@ -677,9 +680,9 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); - pte_unmap_nested(src_pte - 1); + pte_unmap_nested(orig_src_pte); add_mm_rss(dst_mm, rss[0], rss[1]); - pte_unmap_unlock(dst_pte - 1, dst_ptl); + pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (addr != end) goto again; @@ -1820,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, token = pmd_pgtable(*pmd); do { - err = fn(pte, token, addr, data); + err = fn(pte++, token, addr, data); if (err) break; - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); -- cgit v1.2.3 From 89a8640279f8bb78aaf778d1fc5c4a6778f18064 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 30 Oct 2009 13:13:26 +0000 Subject: NOMMU: Don't pass NULL pointers to fput() in do_mmap_pgoff() Don't pass NULL pointers to fput() in the error handling paths of the NOMMU do_mmap_pgoff() as it can't handle it. The following can be used as a test program: int main() { static long long a[1024 * 1024 * 20] = { 0 }; return a;} Without the patch, the code oopses in atomic_long_dec_and_test() as called by fput() after the kernel complains that it can't allocate that big a chunk of memory. With the patch, the kernel just complains about the allocation size and then the program segfaults during execve() as execve() can't complete the allocation of all the new ELF program segments. Reported-by: Robin Getz Signed-off-by: David Howells Acked-by: Robin Getz Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/nommu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 5189b5aed8c0..9876fa0c3ad3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1362,9 +1362,11 @@ share: error_just_free: up_write(&nommu_region_sem); error: - fput(region->vm_file); + if (region->vm_file) + fput(region->vm_file); kmem_cache_free(vm_region_jar, region); - fput(vma->vm_file); + if (vma->vm_file) + fput(vma->vm_file); if (vma->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(vma->vm_mm); kmem_cache_free(vm_area_cachep, vma); -- cgit v1.2.3 From 32c5fc10e79a7053ac5728b01a0bff55cbcb9d49 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Mon, 2 Nov 2009 16:50:33 +0000 Subject: mm: remove incorrect swap_count() from try_to_unuse() In try_to_unuse(), swcount is a local copy of *swap_map, including the SWAP_HAS_CACHE bit; but a wrong comparison against swap_count(*swap_map), which masks off the SWAP_HAS_CACHE bit, succeeded where it should fail. That had the effect of resetting the mm from which to start searching for the next swap page, to an irrelevant mm instead of to an mm in which this swap page had been found: which may increase search time by ~20%. But we're used to swapoff being slow, so never noticed the slowdown. Remove that one spurious use of swap_count(): Bo Liu thought it merely redundant, Hugh rewrote the description since it was measurably wrong. Signed-off-by: Bo Liu Signed-off-by: Hugh Dickins Reviewed-by: KAMEZAWA Hiroyuki Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index a1bc6b9af9a2..9c590eef7912 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1151,8 +1151,7 @@ static int try_to_unuse(unsigned int type) } else retval = unuse_mm(mm, entry, page); - if (set_start_mm && - swap_count(*swap_map) < swcount) { + if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; -- cgit v1.2.3