From 9c56751271e7a917783fb57ec49fe8382e0dc867 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 16 Oct 2013 13:46:43 -0700 Subject: mm, memcg: protect mem_cgroup_read_events for cpu hotplug for_each_online_cpu() needs the protection of {get,put}_online_cpus() so cpu_online_mask doesn't change during the iteration. cpu_hotplug.lock is held while a cpu is going down, it's a coarse lock that is used kernel-wide to synchronize cpu hotplug activity. Memcg has a cpu hotplug notifier, called while there may not be any cpu hotplug refcounts, which drains per-cpu event counts to memcg->nocpu_base.events to maintain a cumulative event count as cpus disappear. Without get_online_cpus() in mem_cgroup_read_events(), it's possible to account for the event count on a dying cpu twice, and this value may be significantly large. In fact, all memcg->pcp_counter_lock use should be nested by {get,put}_online_cpus(). This fixes that issue and ensures the reported statistics are not vastly over-reported during cpu hotplug. Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1c52ddbc839b..5335b2b6be77 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -866,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, unsigned long val = 0; int cpu; + get_online_cpus(); for_each_online_cpu(cpu) val += per_cpu(memcg->stat->events[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU @@ -873,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, val += memcg->nocpu_base.events[idx]; spin_unlock(&memcg->pcp_counter_lock); #endif + put_online_cpus(); return val; } -- cgit v1.2.3 From 18ccee263c7e250a57f01c9434658f11f4118a64 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 16 Oct 2013 13:46:45 -0700 Subject: ipc: update locking scheme comments The initial documentation was a bit incomplete, update accordingly. [akpm@linux-foundation.org: make it more readable in 80 columns] Signed-off-by: Davidlohr Bueso Acked-by: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index fdb8ae740775..7684f41bce76 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -17,12 +17,27 @@ * Pavel Emelianov * * General sysv ipc locking scheme: - * when doing ipc id lookups, take the ids->rwsem - * rcu_read_lock() - * obtain the ipc object (kern_ipc_perm) - * perform security, capabilities, auditing and permission checks, etc. - * acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object() - * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) + * rcu_read_lock() + * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr + * tree. + * - perform initial checks (capabilities, auditing and permission, + * etc). + * - perform read-only operations, such as STAT, INFO commands. + * acquire the ipc lock (kern_ipc_perm.lock) through + * ipc_lock_object() + * - perform data updates, such as SET, RMID commands and + * mechanism-specific operations (semop/semtimedop, + * msgsnd/msgrcv, shmat/shmdt). + * drop the ipc lock, through ipc_unlock_object(). + * rcu_read_unlock() + * + * The ids->rwsem must be taken when: + * - creating, removing and iterating the existing entries in ipc + * identifier sets. + * - iterating through files under /proc/sysvipc/ + * + * Note that sems have a special fast path that avoids kern_ipc_perm.lock - + * see sem_lock(). */ #include -- cgit v1.2.3 From 6e224f94597842c5eb17f1fc2208d20b6f7f7d49 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 16 Oct 2013 13:46:45 -0700 Subject: ipc/sem.c: synchronize semop and semctl with IPC_RMID After acquiring the semlock spinlock, operations must test that the array is still valid. - semctl() and exit_sem() would walk stale linked lists (ugly, but should be ok: all lists are empty) - semtimedop() would sleep forever - and if woken up due to a signal - access memory after free. The patch also: - standardizes the tests for .deleted, so that all tests in one function leave the function with the same approach. - unconditionally tests for .deleted immediately after every call to sem_lock - even it it means that for semctl(GETALL), .deleted will be tested twice. Both changes make the review simpler: After every sem_lock, there must be a test of .deleted, followed by a goto to the cleanup code (if the function uses "goto cleanup"). The only exception is semctl_down(): If sem_ids().rwsem is locked, then the presence in ids->ipcs_idr is equivalent to !.deleted, thus no additional test is required. Signed-off-by: Manfred Spraul Cc: Mike Galbraith Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index 8c4f59b0204a..db9d241af133 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1282,6 +1282,12 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + sem_unlock(sma, -1); + rcu_read_unlock(); + return -EIDRM; + } + curr = &sma->sem_base[semnum]; ipc_assert_locked_object(&sma->sem_perm); @@ -1336,12 +1342,14 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, int i; sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + err = -EIDRM; + goto out_unlock; + } if(nsems > SEMMSL_FAST) { if (!ipc_rcu_getref(sma)) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } sem_unlock(sma, -1); rcu_read_unlock(); @@ -1354,10 +1362,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, rcu_read_lock(); sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } } for (i = 0; i < sma->sem_nsems; i++) @@ -1375,8 +1381,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, struct sem_undo *un; if (!ipc_rcu_getref(sma)) { - rcu_read_unlock(); - return -EIDRM; + err = -EIDRM; + goto out_rcu_wakeup; } rcu_read_unlock(); @@ -1404,10 +1410,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, rcu_read_lock(); sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { - sem_unlock(sma, -1); - rcu_read_unlock(); err = -EIDRM; - goto out_free; + goto out_unlock; } for (i = 0; i < nsems; i++) @@ -1431,6 +1435,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, goto out_rcu_wakeup; sem_lock(sma, NULL, -1); + if (sma->sem_perm.deleted) { + err = -EIDRM; + goto out_unlock; + } curr = &sma->sem_base[semnum]; switch (cmd) { @@ -1836,6 +1844,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, if (error) goto out_rcu_wakeup; + error = -EIDRM; + locknum = sem_lock(sma, sops, nsops); + if (sma->sem_perm.deleted) + goto out_unlock_free; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID @@ -1843,8 +1855,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, * This case can be detected checking un->semid. The existence of * "un" itself is guaranteed by rcu. */ - error = -EIDRM; - locknum = sem_lock(sma, sops, nsops); if (un && un->semid == -1) goto out_unlock_free; @@ -2057,6 +2067,12 @@ void exit_sem(struct task_struct *tsk) } sem_lock(sma, NULL, -1); + /* exit_sem raced with IPC_RMID, nothing to do */ + if (sma->sem_perm.deleted) { + sem_unlock(sma, -1); + rcu_read_unlock(); + continue; + } un = __lookup_undo(ulp, semid); if (un == NULL) { /* exit_sem raced with IPC_RMID+semget() that created -- cgit v1.2.3 From ae39332162a837c3791bb21172d22382a90a6fd1 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Wed, 16 Oct 2013 13:46:46 -0700 Subject: mm/vmscan.c: don't forget to free shrinker->nr_deferred This leak was added by commit 1d3d4437eae1 ("vmscan: per-node deferred work"). unreferenced object 0xffff88006ada3bd0 (size 8): comm "criu", pid 14781, jiffies 4295238251 (age 105.641s) hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace: [] kmemleak_alloc+0x5e/0xc0 [] __kmalloc+0x247/0x310 [] register_shrinker+0x3c/0xa0 [] sget+0x5ab/0x670 [] proc_mount+0x54/0x170 [] mount_fs+0x43/0x1b0 [] vfs_kern_mount+0x72/0x110 [] kern_mount_data+0x19/0x30 [] pid_ns_prepare_proc+0x20/0x40 [] alloc_pid+0x466/0x4a0 [] copy_process+0xc6a/0x1860 [] do_fork+0x8b/0x370 [] SyS_clone+0x16/0x20 [] stub_clone+0x69/0x90 [] 0xffffffffffffffff Signed-off-by: Andrew Vagin Cc: Mel Gorman Cc: Michal Hocko Cc: Rik van Riel Cc: Johannes Weiner Cc: Glauber Costa Cc: Chuck Lever Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 53f2f82f83ae..eea668d9cff6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -211,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); -- cgit v1.2.3 From 16c794b4f3ff38e97a71ce2472c7792f859d1022 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 16 Oct 2013 13:46:48 -0700 Subject: mm/hugetlb.c: correct missing private flag clearing We should clear the page's private flag when returing the page to the hugepage pool. Otherwise, marked hugepage can be allocated to the user who tries to allocate the non-reserved hugepage. If this user fail to map this hugepage, he would try to return the page to the hugepage pool. Since this page has a private flag, resv_huge_pages would mistakenly increase. This patch fixes this situation. Signed-off-by: Joonsoo Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Naoya Horiguchi Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b49579c7f2a5..691f2264a6ce 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -653,6 +653,7 @@ static void free_huge_page(struct page *page) BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); restore_reserve = PagePrivate(page); + ClearPagePrivate(page); spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), -- cgit v1.2.3 From d8e2162cf0bbc49bb3fe1ac689618f6629c9150b Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Wed, 16 Oct 2013 13:46:49 -0700 Subject: gcov: MAINTAINERS: Add an entry for gcov Signed-off-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 72b1e5c2378a..a7c34ef3509d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3624,6 +3624,12 @@ L: linux-scsi@vger.kernel.org S: Odd Fixes (e.g., new signatures) F: drivers/scsi/fdomain.* +GCOV BASED KERNEL PROFILING +M: Peter Oberparleiter +S: Maintained +F: kernel/gcov/ +F: Documentation/gcov.txt + GDT SCSI DISK ARRAY CONTROLLER DRIVER M: Achim Leubner L: linux-scsi@vger.kernel.org -- cgit v1.2.3 From c3d16e16522fe3fe8759735850a0676da18f4b1d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 16 Oct 2013 13:46:51 -0700 Subject: mm: migration: do not lose soft dirty bit if page is in migration state If page migration is turned on in config and the page is migrating, we may lose the soft dirty bit. If fork and mprotect are called on migrating pages (once migration is complete) pages do not obtain the soft dirty bit in the correspond pte entries. Fix it adding an appropriate test on swap entries. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andy Lutomirski Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 ++ mm/migrate.c | 2 ++ mm/mprotect.c | 7 +++++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ca0003947115..f7b7692c05ed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); set_pte_at(src_mm, addr, src_pte, pte); } } diff --git a/mm/migrate.c b/mm/migrate.c index a26bccd44ccb..7a7325ee1d08 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (pte_swp_soft_dirty(*ptep)) + pte = pte_mksoft_dirty(pte); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..a3af058f68e4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -94,13 +94,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { + pte_t newpte; /* * A protection check is difficult so * just be safe and disable write */ make_migration_entry_read(&entry); - set_pte_at(mm, addr, pte, - swp_entry_to_pte(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + set_pte_at(mm, addr, pte, newpte); } pages++; } -- cgit v1.2.3 From e9cdd6e771580e6ff872e5c64e8b766972c7d1bc Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 16 Oct 2013 13:46:53 -0700 Subject: mm: /proc/pid/pagemap: inspect _PAGE_SOFT_DIRTY only on present pages If a page we are inspecting is in swap we may occasionally report it as having soft dirty bit (even if it is clean). The pte_soft_dirty helper should be called on present pte only. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andy Lutomirski Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7366e9d63cee..390bdab01c3c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -941,6 +941,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, frame = pte_pfn(pte); flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) @@ -960,7 +962,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) + if ((vma->vm_flags & VM_SOFTDIRTY)) flags2 |= __PM_SOFT_DIRTY; *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); -- cgit v1.2.3 From aa9bca05a467c61dcea4142b2877d5392de5bdce Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 16 Oct 2013 13:46:54 -0700 Subject: mm/zswap: bugfix: memory leak when re-swapon zswap_tree is not freed when swapoff, and it got re-kmalloced in swapon, so a memory leak occurs. Free the memory of zswap_tree in zswap_frontswap_invalidate_area(). Signed-off-by: Weijie Yang Reviewed-by: Bob Liu Cc: Minchan Kim Reviewed-by: Minchan Kim Cc: From: Weijie Yang Subject: mm/zswap: bugfix: memory leak when invalidate and reclaim occur concurrently Consider the following scenario: thread 0: reclaim entry x (get refcount, but not call zswap_get_swap_cache_page) thread 1: call zswap_frontswap_invalidate_page to invalidate entry x. finished, entry x and its zbud is not freed as its refcount != 0 now, the swap_map[x] = 0 thread 0: now call zswap_get_swap_cache_page swapcache_prepare return -ENOENT because entry x is not used any more zswap_get_swap_cache_page return ZSWAP_SWAPCACHE_NOMEM zswap_writeback_entry do nothing except put refcount Now, the memory of zswap_entry x and its zpage leak. Modify: - check the refcount in fail path, free memory if it is not referenced. - use ZSWAP_SWAPCACHE_FAIL instead of ZSWAP_SWAPCACHE_NOMEM as the fail path can be not only caused by nomem but also by invalidate. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Weijie Yang Reviewed-by: Bob Liu Cc: Minchan Kim Cc: Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index 841e35f1db22..d93510c6aa2d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -804,6 +804,10 @@ static void zswap_frontswap_invalidate_area(unsigned type) } tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); + + zbud_destroy_pool(tree->pool); + kfree(tree); + zswap_trees[type] = NULL; } static struct zbud_ops zswap_zbud_ops = { -- cgit v1.2.3 From ef5a22be2c525293b777ccd879a8017c41c7ed5a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 16 Oct 2013 13:46:56 -0700 Subject: mm: hugetlb: initialize PG_reserved for tail pages of gigantic compound pages Commit 11feeb498086 ("kvm: optimize away THP checks in kvm_is_mmio_pfn()") introduced a memory leak when KVM is run on gigantic compound pages. That commit depends on the assumption that PG_reserved is identical for all head and tail pages of a compound page. So that if get_user_pages returns a tail page, we don't need to check the head page in order to know if we deal with a reserved page that requires different refcounting. The assumption that PG_reserved is the same for head and tail pages is certainly correct for THP and regular hugepages, but gigantic hugepages allocated through bootmem don't clear the PG_reserved on the tail pages (the clearing of PG_reserved is done later only if the gigantic hugepage is freed). This patch corrects the gigantic compound page initialization so that we can retain the optimization in 11feeb498086. The cacheline was already modified in order to set PG_tail so this won't affect the boot time of large memory systems. [akpm@linux-foundation.org: tweak comment layout and grammar] Signed-off-by: Andrea Arcangeli Reported-by: andy123 Acked-by: Rik van Riel Cc: Gleb Natapov Cc: Mel Gorman Cc: Hugh Dickins Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 691f2264a6ce..0b7656e804d1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -696,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); __SetPageHead(page); + __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + /* + * For gigantic hugepages allocated through bootmem at + * boot, it's safer to be consistent with the not-gigantic + * hugepages and clear the PG_reserved bit from all tail pages + * too. Otherwse drivers using get_user_pages() to access tail + * pages may get the reference counting wrong if they see + * PG_reserved set on a tail page (despite the head page not + * having PG_reserved set). Enforcing this consistency between + * head and tail pages allows drivers to optimize away a check + * on the head page when they need know if put_page() is needed + * after get_user_pages(). + */ + __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; } @@ -1330,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void) #else page = virt_to_page(m); #endif - __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); + WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); /* * If we had gigantic hugepages allocated at boot time, we need -- cgit v1.2.3 From 87fc0ad2ad8a15de653f4cef7760fa35e689077f Mon Sep 17 00:00:00 2001 From: Doug Anderson Date: Wed, 16 Oct 2013 13:46:57 -0700 Subject: block/partitions/efi.c: treat size mismatch as a warning, not an error In commit 27a7c642174e ("partitions/efi: account for pmbr size in lba") we started treating bad sizes in lba field of the partition that has the 0xEE (GPT protective) as errors. However, we may run into these "bad sizes" in the real world if someone uses dd to copy an image from a smaller disk to a bigger disk. Since this case used to work (even without using force_gpt), keep it working and treat the size mismatch as a warning instead of an error. Reported-by: Josh Triplett Reported-by: Sean Paul Signed-off-by: Doug Anderson Reviewed-by: Josh Triplett Acked-by: Davidlohr Bueso Tested-by: Artem Bityutskiy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 1eb09ee5311b..a8287b49d062 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -222,11 +222,16 @@ check_hybrid: * the disk size. * * Hybrid MBRs do not necessarily comply with this. + * + * Consider a bad value here to be a warning to support dd'ing + * an image from a smaller disk to a larger disk. */ if (ret == GPT_MBR_PROTECTIVE) { sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) - ret = 0; + pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", + sz, min_t(uint32_t, + total_sectors - 1, 0xFFFFFFFF)); } done: return ret; -- cgit v1.2.3 From c88b05b2cd07221cdefd56f7f7422c1459eb60c9 Mon Sep 17 00:00:00 2001 From: Felipe Pena Date: Wed, 16 Oct 2013 13:46:58 -0700 Subject: tools/testing/selftests: fix uninitialized variable The err variable is intended to receive the timer_create() return before checking it Signed-off-by: Felipe Pena Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/timers/posix_timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 4fa655d68a81..41bd85559d4b 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -151,7 +151,7 @@ static int check_timer_create(int which) fflush(stdout); done = 0; - timer_create(which, NULL, &id); + err = timer_create(which, NULL, &id); if (err < 0) { perror("Can't create timer\n"); return -1; -- cgit v1.2.3 From 4942642080ea82d99ab5b653abb9a12b7ba31f4a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 16 Oct 2013 13:46:59 -0700 Subject: mm: memcg: handle non-error OOM situations more gracefully Commit 3812c8c8f395 ("mm: memcg: do not trap chargers with full callstack on OOM") assumed that only a few places that can trigger a memcg OOM situation do not return VM_FAULT_OOM, like optional page cache readahead. But there are many more and it's impractical to annotate them all. First of all, we don't want to invoke the OOM killer when the failed allocation is gracefully handled, so defer the actual kill to the end of the fault handling as well. This simplifies the code quite a bit for added bonus. Second, since a failed allocation might not be the abrupt end of the fault, the memcg OOM handler needs to be re-entrant until the fault finishes for subsequent allocation attempts. If an allocation is attempted after the task already OOMed, allow it to bypass the limit so that it can quickly finish the fault and invoke the OOM killer. Reported-by: azurIt Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 50 ++++------------ include/linux/sched.h | 7 +-- mm/filemap.c | 11 +--- mm/memcontrol.c | 139 +++++++++++++++++---------------------------- mm/memory.c | 18 ++++-- mm/oom_kill.c | 2 +- 6 files changed, 79 insertions(+), 148 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ecc82b37c4cc..b3e7a667e03c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); -/** - * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task - * @new: true to enable, false to disable - * - * Toggle whether a failed memcg charge should invoke the OOM killer - * or just return -ENOMEM. Returns the previous toggle state. - * - * NOTE: Any path that enables the OOM killer before charging must - * call mem_cgroup_oom_synchronize() afterward to finalize the - * OOM handling and clean up. - */ -static inline bool mem_cgroup_toggle_oom(bool new) +static inline void mem_cgroup_oom_enable(void) { - bool old; - - old = current->memcg_oom.may_oom; - current->memcg_oom.may_oom = new; - - return old; + WARN_ON(current->memcg_oom.may_oom); + current->memcg_oom.may_oom = 1; } -static inline void mem_cgroup_enable_oom(void) +static inline void mem_cgroup_oom_disable(void) { - bool old = mem_cgroup_toggle_oom(true); - - WARN_ON(old == true); -} - -static inline void mem_cgroup_disable_oom(void) -{ - bool old = mem_cgroup_toggle_oom(false); - - WARN_ON(old == false); + WARN_ON(!current->memcg_oom.may_oom); + current->memcg_oom.may_oom = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { - return p->memcg_oom.in_memcg_oom; + return p->memcg_oom.memcg; } -bool mem_cgroup_oom_synchronize(void); +bool mem_cgroup_oom_synchronize(bool wait); #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; @@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, { } -static inline bool mem_cgroup_toggle_oom(bool new) -{ - return false; -} - -static inline void mem_cgroup_enable_oom(void) +static inline void mem_cgroup_oom_enable(void) { } -static inline void mem_cgroup_disable_oom(void) +static inline void mem_cgroup_oom_disable(void) { } @@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p) return false; } -static inline bool mem_cgroup_oom_synchronize(void) +static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b293..e27baeeda3f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1394,11 +1394,10 @@ struct task_struct { } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { + struct mem_cgroup *memcg; + gfp_t gfp_mask; + int order; unsigned int may_oom:1; - unsigned int in_memcg_oom:1; - unsigned int oom_locked:1; - int wakeups; - struct mem_cgroup *wait_on_memcg; } memcg_oom; #endif #ifdef CONFIG_UPROBES diff --git a/mm/filemap.c b/mm/filemap.c index 1e6aec4a2d2e..ae4846ff4849 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - bool memcg_oom; pgoff_t size; int ret = 0; @@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* - * Do we have something in the page cache already? Either - * way, try readahead, but disable the memcg OOM killer for it - * as readahead is optional and no errors are propagated up - * the fault stack. The OOM killer is enabled while trying to - * instantiate the faulting page individually below. + * Do we have something in the page cache already? */ page = find_get_page(mapping, offset); if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { @@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * We found the page, so try async readahead before * waiting for the lock. */ - memcg_oom = mem_cgroup_toggle_oom(false); do_async_mmap_readahead(vma, ra, file, page, offset); - mem_cgroup_toggle_oom(memcg_oom); } else if (!page) { /* No page in the page cache at all */ - memcg_oom = mem_cgroup_toggle_oom(false); do_sync_mmap_readahead(vma, ra, file, offset); - mem_cgroup_toggle_oom(memcg_oom); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5335b2b6be77..65fc6a449841 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2161,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) memcg_wakeup_oom(memcg); } -/* - * try to call OOM killer - */ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - bool locked; - int wakeups; - if (!current->memcg_oom.may_oom) return; - - current->memcg_oom.in_memcg_oom = 1; - /* - * As with any blocking lock, a contender needs to start - * listening for wakeups before attempting the trylock, - * otherwise it can miss the wakeup from the unlock and sleep - * indefinitely. This is just open-coded because our locking - * is so particular to memcg hierarchies. + * We are in the middle of the charge context here, so we + * don't want to block when potentially sitting on a callstack + * that holds all kinds of filesystem and mm locks. + * + * Also, the caller may handle a failed allocation gracefully + * (like optional page cache readahead) and so an OOM killer + * invocation might not even be necessary. + * + * That's why we don't do anything here except remember the + * OOM context and then deal with it at the end of the page + * fault when the stack is unwound, the locks are released, + * and when we know whether the fault was overall successful. */ - wakeups = atomic_read(&memcg->oom_wakeups); - mem_cgroup_mark_under_oom(memcg); - - locked = mem_cgroup_oom_trylock(memcg); - - if (locked) - mem_cgroup_oom_notify(memcg); - - if (locked && !memcg->oom_kill_disable) { - mem_cgroup_unmark_under_oom(memcg); - mem_cgroup_out_of_memory(memcg, mask, order); - mem_cgroup_oom_unlock(memcg); - /* - * There is no guarantee that an OOM-lock contender - * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitely. - */ - memcg_oom_recover(memcg); - } else { - /* - * A system call can just return -ENOMEM, but if this - * is a page fault and somebody else is handling the - * OOM already, we need to sleep on the OOM waitqueue - * for this memcg until the situation is resolved. - * Which can take some time because it might be - * handled by a userspace task. - * - * However, this is the charge context, which means - * that we may sit on a large call stack and hold - * various filesystem locks, the mmap_sem etc. and we - * don't want the OOM handler to deadlock on them - * while we sit here and wait. Store the current OOM - * context in the task_struct, then return -ENOMEM. - * At the end of the page fault handler, with the - * stack unwound, pagefault_out_of_memory() will check - * back with us by calling - * mem_cgroup_oom_synchronize(), possibly putting the - * task to sleep. - */ - current->memcg_oom.oom_locked = locked; - current->memcg_oom.wakeups = wakeups; - css_get(&memcg->css); - current->memcg_oom.wait_on_memcg = memcg; - } + css_get(&memcg->css); + current->memcg_oom.memcg = memcg; + current->memcg_oom.gfp_mask = mask; + current->memcg_oom.order = order; } /** * mem_cgroup_oom_synchronize - complete memcg OOM handling + * @handle: actually kill/wait or just clean up the OOM state * - * This has to be called at the end of a page fault if the the memcg - * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. + * This has to be called at the end of a page fault if the memcg OOM + * handler was enabled. * - * Memcg supports userspace OOM handling, so failed allocations must + * Memcg supports userspace OOM handling where failed allocations must * sleep on a waitqueue until the userspace task resolves the * situation. Sleeping directly in the charge context with all kinds * of locks held is not a good idea, instead we remember an OOM state * in the task and mem_cgroup_oom_synchronize() has to be called at - * the end of the page fault to put the task to sleep and clean up the - * OOM state. + * the end of the page fault to complete the OOM handling. * * Returns %true if an ongoing memcg OOM situation was detected and - * finalized, %false otherwise. + * completed, %false otherwise. */ -bool mem_cgroup_oom_synchronize(void) +bool mem_cgroup_oom_synchronize(bool handle) { + struct mem_cgroup *memcg = current->memcg_oom.memcg; struct oom_wait_info owait; - struct mem_cgroup *memcg; + bool locked; /* OOM is global, do not handle */ - if (!current->memcg_oom.in_memcg_oom) - return false; - - /* - * We invoked the OOM killer but there is a chance that a kill - * did not free up any charges. Everybody else might already - * be sleeping, so restart the fault and keep the rampage - * going until some charges are released. - */ - memcg = current->memcg_oom.wait_on_memcg; if (!memcg) - goto out; + return false; - if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) - goto out_memcg; + if (!handle) + goto cleanup; owait.memcg = memcg; owait.wait.flags = 0; @@ -2273,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void) INIT_LIST_HEAD(&owait.wait.task_list); prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - /* Only sleep if we didn't miss any wakeups since OOM */ - if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) + mem_cgroup_mark_under_oom(memcg); + + locked = mem_cgroup_oom_trylock(memcg); + + if (locked) + mem_cgroup_oom_notify(memcg); + + if (locked && !memcg->oom_kill_disable) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, + current->memcg_oom.order); + } else { schedule(); - finish_wait(&memcg_oom_waitq, &owait.wait); -out_memcg: - mem_cgroup_unmark_under_oom(memcg); - if (current->memcg_oom.oom_locked) { + mem_cgroup_unmark_under_oom(memcg); + finish_wait(&memcg_oom_waitq, &owait.wait); + } + + if (locked) { mem_cgroup_oom_unlock(memcg); /* * There is no guarantee that an OOM-lock contender @@ -2288,10 +2249,9 @@ out_memcg: */ memcg_oom_recover(memcg); } +cleanup: + current->memcg_oom.memcg = NULL; css_put(&memcg->css); - current->memcg_oom.wait_on_memcg = NULL; -out: - current->memcg_oom.in_memcg_oom = 0; return true; } @@ -2705,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, || fatal_signal_pending(current))) goto bypass; + if (unlikely(task_in_memcg_oom(current))) + goto bypass; + /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the diff --git a/mm/memory.c b/mm/memory.c index f7b7692c05ed..1311f26497e6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3865,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, * space. Kernel faults are handled more gracefully. */ if (flags & FAULT_FLAG_USER) - mem_cgroup_enable_oom(); + mem_cgroup_oom_enable(); ret = __handle_mm_fault(mm, vma, address, flags); - if (flags & FAULT_FLAG_USER) - mem_cgroup_disable_oom(); - - if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) - mem_cgroup_oom_synchronize(); + if (flags & FAULT_FLAG_USER) { + mem_cgroup_oom_disable(); + /* + * The task may have entered a memcg OOM situation but + * if the allocation error was handled gracefully (no + * VM_FAULT_OOM), there is no need to kill anything. + * Just clean up the OOM state peacefully. + */ + if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) + mem_cgroup_oom_synchronize(false); + } return ret; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d274381..6738c47f1f72 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -680,7 +680,7 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; - if (mem_cgroup_oom_synchronize()) + if (mem_cgroup_oom_synchronize(true)) return; zonelist = node_zonelist(first_online_node, GFP_KERNEL); -- cgit v1.2.3 From 84235de394d9775bfaa7fa9762a59d91fef0c1fc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 16 Oct 2013 13:47:00 -0700 Subject: fs: buffer: move allocation failure loop into the allocator Buffer allocation has a very crude indefinite loop around waking the flusher threads and performing global NOFS direct reclaim because it can not handle allocation failures. The most immediate problem with this is that the allocation may fail due to a memory cgroup limit, where flushers + direct reclaim might not make any progress towards resolving the situation at all. Because unlike the global case, a memory cgroup may not have any cache at all, only anonymous pages but no swap. This situation will lead to a reclaim livelock with insane IO from waking the flushers and thrashing unrelated filesystem cache in a tight loop. Use __GFP_NOFAIL allocations for buffers for now. This makes sure that any looping happens in the page allocator, which knows how to orchestrate kswapd, direct reclaim, and the flushers sensibly. It also allows memory cgroups to detect allocations that can't handle failure and will allow them to ultimately bypass the limit if reclaim can not make progress. Reported-by: azurIt Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 14 ++++++++++++-- mm/memcontrol.c | 2 ++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 4d7433534f5c..6024877335ca 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; /* Will call free_more_memory() */ + gfp_t gfp_mask; - page = find_or_create_page(inode->i_mapping, index, - (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; + gfp_mask |= __GFP_MOVABLE; + /* + * XXX: __getblk_slow() can not really deal with failure and + * will endlessly loop on improvised global reclaim. Prefer + * looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp_mask |= __GFP_NOFAIL; + + page = find_or_create_page(inode->i_mapping, index, gfp_mask); if (!page) return ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 65fc6a449841..34d3ca9572d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2766,6 +2766,8 @@ done: return 0; nomem: *ptr = NULL; + if (gfp_mask & __GFP_NOFAIL) + return 0; return -ENOMEM; bypass: *ptr = root_mem_cgroup; -- cgit v1.2.3 From 5e9dd373dea43f61f5d3ce8b6667377a02c6fd42 Mon Sep 17 00:00:00 2001 From: Matias Bjorling Date: Wed, 16 Oct 2013 13:47:01 -0700 Subject: percpu_refcount: export symbols Export the interface to be used within modules. Signed-off-by: Matias Bjorling Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu-refcount.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 7deeb6297a48..1a53d497a8c5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -53,6 +53,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) ref->release = release; return 0; } +EXPORT_SYMBOL_GPL(percpu_ref_init); /** * percpu_ref_cancel_init - cancel percpu_ref_init() @@ -84,6 +85,7 @@ void percpu_ref_cancel_init(struct percpu_ref *ref) free_percpu(ref->pcpu_count); } } +EXPORT_SYMBOL_GPL(percpu_ref_cancel_init); static void percpu_ref_kill_rcu(struct rcu_head *rcu) { @@ -156,3 +158,4 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); } +EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); -- cgit v1.2.3 From e3b6c655b91e01a1dade056cfa358581b47a5351 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 16 Oct 2013 13:47:03 -0700 Subject: writeback: fix negative bdi max pause MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Toralf runs trinity on UML/i386. After some time it hangs and the last message line is BUG: soft lockup - CPU#0 stuck for 22s! [trinity-child0:1521] It's found that pages_dirtied becomes very large. More than 1000000000 pages in this case: period = HZ * pages_dirtied / task_ratelimit; BUG_ON(pages_dirtied > 2000000000); BUG_ON(pages_dirtied > 1000000000); <--------- UML debug printf shows that we got negative pause here: ick: pause : -984 ick: pages_dirtied : 0 ick: task_ratelimit: 0 pause: + if (pause < 0) { + extern int printf(char *, ...); + printf("ick : pause : %li\n", pause); + printf("ick: pages_dirtied : %lu\n", pages_dirtied); + printf("ick: task_ratelimit: %lu\n", task_ratelimit); + BUG_ON(1); + } trace_balance_dirty_pages(bdi, Since pause is bounded by [min_pause, max_pause] where min_pause is also bounded by max_pause. It's suspected and demonstrated that the max_pause calculation goes wrong: ick: pause : -717 ick: min_pause : -177 ick: max_pause : -717 ick: pages_dirtied : 14 ick: task_ratelimit: 0 The problem lies in the two "long = unsigned long" assignments in bdi_max_pause() which might go negative if the highest bit is 1, and the min_t(long, ...) check failed to protect it falling under 0. Fix all of them by using "unsigned long" throughout the function. Signed-off-by: Fengguang Wu Reported-by: Toralf Förster Tested-by: Toralf Förster Reviewed-by: Jan Kara Cc: Richard Weinberger Cc: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f5236f804aa6..63807583d8e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long bdi_max_pause(struct backing_dev_info *bdi, + unsigned long bdi_dirty) { - long bw = bdi->avg_write_bandwidth; - long t; + unsigned long bw = bdi->avg_write_bandwidth; + unsigned long t; /* * Limit pause time for small memory systems. If sleeping for too long @@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi, t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; - return min_t(long, t, MAX_PAUSE); + return min_t(unsigned long, t, MAX_PAUSE); } static long bdi_min_pause(struct backing_dev_info *bdi, -- cgit v1.2.3 From 2cbe3b0af82279f14cfb3195f2406651f28ee9b8 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 16 Oct 2013 13:47:04 -0700 Subject: procfs: fix unintended truncation of returned mapped address Currently, proc_reg_get_unmapped_area truncates upper 32-bit of the mapped virtual address returned from get_unmapped_area method in pde->proc_fops due to the variable rv of signed integer on x86_64. This is too small to have vitual address of unsigned long on x86_64 since on x86_64, signed integer is of 4 bytes while unsigned long is of 8 bytes. To fix this issue, use unsigned long instead. Fixes a regression added in commit c4fe24485729 ("sparc: fix PCI device proc file mmap(2)"). Signed-off-by: HATAYAMA Daisuke Cc: Alexey Dobriyan Cc: David S. Miller Tested-by: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9f8ef9b7674d..6c501c4d996d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -288,7 +288,7 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); - int rv = -EIO; + unsigned long rv = -EIO; unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); if (use_pde(pde)) { get_unmapped_area = pde->proc_fops->get_unmapped_area; -- cgit v1.2.3 From fad1a86e25e0a1f85635ed06ef62ddadd5b8fa4c Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 16 Oct 2013 13:47:05 -0700 Subject: procfs: call default get_unmapped_area on MMU-present architectures Commit c4fe24485729 ("sparc: fix PCI device proc file mmap(2)") added proc_reg_get_unmapped_area in proc_reg_file_ops and proc_reg_file_ops_no_compat, by which now mmap always returns EIO if get_unmapped_area method is not defined for the target procfs file, which causes regression of mmap on /proc/vmcore. To address this issue, like get_unmapped_area(), call default current->mm->get_unmapped_area on MMU-present architectures if pde->proc_fops->get_unmapped_area, i.e. the one in actual file operation in the procfs file, is not defined. Reported-by: Michael Holzheu Signed-off-by: HATAYAMA Daisuke Cc: Alexey Dobriyan Cc: David S. Miller Tested-by: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6c501c4d996d..8eaa1ba793fc 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -289,9 +289,13 @@ static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; if (use_pde(pde)) { - get_unmapped_area = pde->proc_fops->get_unmapped_area; +#ifdef CONFIG_MMU + get_unmapped_area = current->mm->get_unmapped_area; +#endif + if (pde->proc_fops->get_unmapped_area) + get_unmapped_area = pde->proc_fops->get_unmapped_area; if (get_unmapped_area) rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); unuse_pde(pde); -- cgit v1.2.3 From 5b808a2300a5ac45f4798ebfac8b367e98a4b692 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 16 Oct 2013 13:47:06 -0700 Subject: swap: fix set_blocksize race during swapon/swapoff Fix race between swapoff and swapon. Swapoff used old_block_size from swap_info outside of swapon_mutex so it could be overwritten by concurrent swapon. The race has visible effect only if more than one swap block device exists with different block sizes (e.g. /dev/sda1 with block size 4096 and /dev/sdb1 with 512). In such case it leads to setting the blocksize of swapped off device with wrong blocksize. The bug can be triggered with multiple concurrent swapoff and swapon: 0. Swap for some device is on. 1. swapoff: First the swapoff is called on this device and "struct swap_info_struct *p" is assigned. This is done under swap_lock however this lock is released for the call try_to_unuse(). 2. swapon: After the assignment above (and before acquiring swapon_mutex & swap_lock by swapoff) the swapon is called on the same device. The p->old_block_size is assigned to the value of block_size the device. This block size should be the same as previous but sometimes it is not. The swapon ends successfully. 3. swapoff: Swapoff resumes, grabs the locks and mutex and continues to disable this swap device. Now it sets the block size to value taken from swap_info which was overwritten by swapon in 2. Signed-off-by: Krzysztof Kozlowski Reported-by: Weijie Yang Cc: Bob Liu Cc: Konrad Rzeszutek Wilk Cc: Shaohua Li Cc: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 3963fc24fcc1..de7c904e52e5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct filename *pathname; int i, type, prev; int err; + unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } swap_file = p->swap_file; + old_block_size = p->old_block_size; p->swap_file = NULL; p->max = 0; swap_map = p->swap_map; @@ -1938,7 +1940,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); - set_blocksize(bdev, p->old_block_size); + set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); -- cgit v1.2.3 From 750e8165f5e87b6a142be953640eabb13a9d350a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Oct 2013 13:47:08 -0700 Subject: mm: fix BUG in __split_huge_page_pmd Occasionally we hit the BUG_ON(pmd_trans_huge(*pmd)) at the end of __split_huge_page_pmd(): seen when doing madvise(,,MADV_DONTNEED). It's invalid: we don't always have down_write of mmap_sem there: a racing do_huge_pmd_wp_page() might have copied-on-write to another huge page before our split_huge_page() got the anon_vma lock. Forget the BUG_ON, just go back and try again if this happens. Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Naoya Horiguchi Cc: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884682d8..610e3df2768a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2697,6 +2697,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, mmun_start = haddr; mmun_end = haddr + HPAGE_PMD_SIZE; +again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); spin_lock(&mm->page_table_lock); if (unlikely(!pmd_trans_huge(*pmd))) { @@ -2719,7 +2720,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, split_huge_page(page); put_page(page); - BUG_ON(pmd_trans_huge(*pmd)); + + /* + * We don't always have down_write of mmap_sem here: a racing + * do_huge_pmd_wp_page() might have copied-on-write to another + * huge page before our split_huge_page() got the anon_vma lock. + */ + if (unlikely(pmd_trans_huge(*pmd))) + goto again; } void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, -- cgit v1.2.3 From 57a8f0cdb87da776bf0e4ce7554a9133854fa779 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 16 Oct 2013 13:47:09 -0700 Subject: mm: revert mremap pud_free anti-fix Revert commit 1ecfd533f4c5 ("mm/mremap.c: call pud_free() after fail calling pmd_alloc()"). The original code was correct: pud_alloc(), pmd_alloc(), pte_alloc_map() ensure that the pud, pmd, pt is already allocated, and seldom do they need to allocate; on failure, upper levels are freed if appropriate by the subsequent do_munmap(). Whereas commit 1ecfd533f4c5 did an unconditional pud_free() of a most-likely still-in-use pud: saved only by the near-impossiblity of pmd_alloc() failing. Signed-off-by: Hugh Dickins Cc: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 91b13d6a16d4..0843feb66f3d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,7 +25,6 @@ #include #include #include -#include #include "internal.h" @@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; pmd = pmd_alloc(mm, pud, addr); - if (!pmd) { - pud_free(mm, pud); + if (!pmd) return NULL; - } VM_BUG_ON(pmd_trans_huge(*pmd)); -- cgit v1.2.3