From 4356f21d09283dc6d39a6f7287a65ddab61e2808 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:06:47 -0700 Subject: mm: change isolate mode from #define to bitwise type Change ISOLATE_XXX macro with bitwise isolate_mode_t type. Normally, macro isn't recommended as it's type-unsafe and making debugging harder as symbol cannot be passed throught to the debugger. Quote from Johannes " Hmm, it would probably be cleaner to fully convert the isolation mode into independent flags. INACTIVE, ACTIVE, BOTH is currently a tri-state among flags, which is a bit ugly." This patch moves isolate mode from swap.h to mmzone.h by memcontrol.h Signed-off-by: Minchan Kim Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Michal Hocko Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9fdfce7ba403..ec6dbcb976d1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1012,23 +1012,27 @@ keep_lumpy: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode, int file) +int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) { + bool all_lru_mode; int ret = -EINVAL; /* Only take pages on the LRU. */ if (!PageLRU(page)) return ret; + all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == + (ISOLATE_ACTIVE|ISOLATE_INACTIVE); + /* * When checking the active state, we need to be sure we are * dealing with comparible boolean values. Take the logical not * of each. */ - if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) + if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) return ret; - if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) + if (!all_lru_mode && !!page_is_file_cache(page) != file) return ret; /* @@ -1076,7 +1080,8 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode, int file) + unsigned long *scanned, int order, isolate_mode_t mode, + int file) { unsigned long nr_taken = 0; unsigned long nr_lumpy_taken = 0; @@ -1201,8 +1206,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, static unsigned long isolate_pages_global(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, - int mode, struct zone *z, - int active, int file) + isolate_mode_t mode, + struct zone *z, int active, int file) { int lru = LRU_BASE; if (active) @@ -1448,6 +1453,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_taken; unsigned long nr_anon; unsigned long nr_file; + isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1458,15 +1464,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } set_reclaim_mode(priority, sc, false); + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) + reclaim_mode |= ISOLATE_ACTIVE; + lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_to_scan, - &page_list, &nr_scanned, sc->order, - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 
- ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, 0, file); + nr_taken = isolate_pages_global(nr_to_scan, &page_list, + &nr_scanned, sc->order, reclaim_mode, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1475,12 +1481,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, - &page_list, &nr_scanned, sc->order, - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, - 0, file); + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, + &nr_scanned, sc->order, reclaim_mode, zone, + sc->mem_cgroup, 0, file); /* * mem_cgroup_isolate_pages() keeps track of * scanned pages on its own. -- cgit v1.2.3 From 39deaf8585152f1a35c1676d3d7dc6ae0fb65967 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:06:51 -0700 Subject: mm: compaction: make isolate_lru_page() filter-aware In async mode, compaction doesn't migrate dirty or writeback pages. So, it's meaningless to pick the page and re-add it to lru list. Of course, when we isolate the page in compaction, the page might be dirty or writeback but when we try to migrate the page, the page would be not dirty, writeback. So it could be migrated. But it's very unlikely as isolate and migration cycle is much faster than writeout. So, this patch helps cpu overhead and prevent unnecessary LRU churning. Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Michal Hocko Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index ec6dbcb976d1..c007e78d7078 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1045,6 +1045,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) ret = -EBUSY; + if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) + return ret; + if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're -- cgit v1.2.3 From f80c0673610e36ae29d63e3297175e22f70dde5f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:06:55 -0700 Subject: mm: zone_reclaim: make isolate_lru_page() filter-aware In __zone_reclaim case, we don't want to shrink mapped page. Nonetheless, we have isolated mapped page and re-add it into LRU's head. It's unnecessary CPU overhead and makes LRU churning. Of course, when we isolate the page, the page might be mapped but when we try to migrate the page, the page would be not mapped. So it could be migrated. But race is rare and although it happens, it's no big deal. 
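As an illustration (not part of the original patch), the two filter checks introduced by this patch and the previous one reduce the front of __isolate_lru_page() to roughly the following shape; the flag and helper names match the diffs, while the PageLRU/active/file checks and the reference-count handling are elided:

    /* Sketch only: the real function also checks PageLRU, the
     * active/inactive and file/anon constraints, and then takes a
     * reference with get_page_unless_zero(). */
    static int isolate_filter_sketch(struct page *page, isolate_mode_t mode)
    {
            /* Async compaction passes ISOLATE_CLEAN: skip pages it
             * could not migrate anyway. */
            if ((mode & ISOLATE_CLEAN) &&
                (PageDirty(page) || PageWriteback(page)))
                    return -EBUSY;

            /* __zone_reclaim with !may_unmap passes ISOLATE_UNMAPPED:
             * skip mapped pages instead of churning the LRU. */
            if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
                    return -EBUSY;

            return 0;
    }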
Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index c007e78d7078..b68a9342d5a3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1048,6 +1048,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) return ret; + if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) + return ret; + if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're @@ -1471,6 +1474,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, reclaim_mode |= ISOLATE_ACTIVE; lru_add_drain(); + + if (!sc->may_unmap) + reclaim_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + reclaim_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { @@ -1588,19 +1597,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); unsigned long nr_rotated = 0; + isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; lru_add_drain(); + + if (!sc->may_unmap) + reclaim_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + reclaim_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, + reclaim_mode, zone, 1, file); zone->pages_scanned += pgscanned; } else { nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, + reclaim_mode, zone, sc->mem_cgroup, 1, file); /* * mem_cgroup_isolate_pages() keeps track of -- cgit v1.2.3 From 3da367c3e5fca71d4e778fa565d9b098d5518f4a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 31 Oct 2011 17:07:03 -0700 Subject: vmscan: add block plug for page reclaim per-task block plug can reduce block queue lock contention and increase request merge. Currently page reclaim doesn't support it. I originally thought page reclaim doesn't need it, because kswapd thread count is limited and file cache write is done at flusher mostly. When I test a workload with heavy swap in a 4-node machine, each CPU is doing direct page reclaim and swap. This causes block queue lock contention. In my test, without below patch, the CPU utilization is about 2% ~ 7%. With the patch, the CPU utilization is about 1% ~ 3%. Disk throughput isn't changed. This should improve normal kswapd write and file cache write too (increase request merge for example), but might not be so obvious as I explain above. 
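The plugging interface the patch uses is the standard per-task block plug; a minimal usage sketch (the function names are the real block-layer API, the middle section stands in for whatever I/O the caller submits):

    struct blk_plug plug;

    blk_start_plug(&plug);     /* requests collect on the per-task plug list */
    /* ... submit I/O here, e.g. the pageout()/swap writes issued while
     *     walking the LRU lists ... */
    blk_finish_plug(&plug);    /* unplug: hand the batched requests to the
                                * queue, giving the elevator a chance to
                                * merge them */

Batching the submissions this way is what reduces the block queue lock contention and improves request merging, as described above.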
Signed-off-by: Shaohua Li Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index b68a9342d5a3..b1520b077858 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2005,12 +2005,14 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; restart: nr_reclaimed = 0; nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { @@ -2034,6 +2036,7 @@ restart: if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } + blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; /* -- cgit v1.2.3 From f11c0ca501af89fc07b0d9f17531ba3b68a4ef39 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 31 Oct 2011 17:07:27 -0700 Subject: mm: vmscan: drop nr_force_scan[] from get_scan_count The nr_force_scan[] tuple holds the effective scan numbers for anon and file pages in case the situation called for a forced scan and the regularly calculated scan numbers turned out zero. However, the effective scan number can always be assumed to be SWAP_CLUSTER_MAX right before the division into anon and file. The numerators and denominator are properly set up for all cases, be it force scan for just file, just anon, or both, to do the right thing. Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Cc: Ying Han Cc: Balbir Singh Cc: KOSAKI Motohiro Cc: Daisuke Nishimura Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index b1520b077858..d29b2bdb9e03 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1817,12 +1817,19 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, enum lru_list l; int noswap = 0; bool force_scan = false; - unsigned long nr_force_scan[2]; - /* kswapd does zone balancing and needs to scan this zone */ + /* + * If the zone or memcg is small, nr[l] can be 0. This + * results in no scanning on this priority and a potential + * priority drop. Global direct reclaim can go to the next + * zone and tends to have no problems. Global kswapd is for + * zone balancing and it needs to scan a minimum amount. When + * reclaiming for a memcg, a priority drop can cause high + * latencies, so it's better to scan a minimum amount there as + * well. 
+ */ if (scanning_global_lru(sc) && current_is_kswapd()) force_scan = true; - /* memcg may have small limit and need to avoid priority drop */ if (!scanning_global_lru(sc)) force_scan = true; @@ -1832,8 +1839,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 0; fraction[1] = 1; denominator = 1; - nr_force_scan[0] = 0; - nr_force_scan[1] = SWAP_CLUSTER_MAX; goto out; } @@ -1850,8 +1855,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 1; fraction[1] = 0; denominator = 1; - nr_force_scan[0] = SWAP_CLUSTER_MAX; - nr_force_scan[1] = 0; goto out; } } @@ -1900,11 +1903,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; - if (force_scan) { - unsigned long scan = SWAP_CLUSTER_MAX; - nr_force_scan[0] = div64_u64(scan * ap, denominator); - nr_force_scan[1] = div64_u64(scan * fp, denominator); - } out: for_each_evictable_lru(l) { int file = is_file_lru(l); @@ -1913,20 +1911,10 @@ out: scan = zone_nr_lru_pages(zone, sc, l); if (priority || noswap) { scan >>= priority; + if (!scan && force_scan) + scan = SWAP_CLUSTER_MAX; scan = div64_u64(scan * fraction[file], denominator); } - - /* - * If zone is small or memcg is small, nr[l] can be 0. - * This results no-scan on this priority and priority drop down. - * For global direct reclaim, it can visit next zone and tend - * not to have problems. For global kswapd, it's for zone - * balancing and it need to scan a small amounts. When using - * memcg, priority drop can cause big latency. So, it's better - * to scan small amount. See may_noscan above. - */ - if (!scan && force_scan) - scan = nr_force_scan[file]; nr[l] = scan; } } -- cgit v1.2.3 From ee72886d8ed5d9de3fa0ed3b99a7ca7702576a96 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:38 -0700 Subject: mm: vmscan: do not writeback filesystem pages in direct reclaim Testing from the XFS folk revealed that there is still too much I/O from the end of the LRU in kswapd. Previously it was considered acceptable by VM people for a small number of pages to be written back from reclaim with testing generally showing about 0.3% of pages reclaimed were written back (higher if memory was low). That writing back a small number of pages is ok has been heavily disputed for quite some time and Dave Chinner explained it well; It doesn't have to be a very high number to be a problem. IO is orders of magnitude slower than the CPU time it takes to flush a page, so the cost of making a bad flush decision is very high. And single page writeback from the LRU is almost always a bad flush decision. To complicate matters, filesystems respond very differently to requests from reclaim according to Christoph Hellwig; xfs tries to write it back if the requester is kswapd ext4 ignores the request if it's a delayed allocation btrfs ignores the request As a result, each filesystem has different performance characteristics when under memory pressure and there are many pages being dirtied. In some cases, the request is ignored entirely so the VM cannot depend on the IO being dispatched. The objective of this series is to reduce writing of filesystem-backed pages from reclaim, play nicely with writeback that is already in progress and throttle reclaim appropriately when writeback pages are encountered. The assumption is that the flushers will always write pages faster than if reclaim issues the IO. 
A secondary goal is to avoid the problem whereby direct reclaim splices two potentially deep call stacks together. There is a potential new problem as reclaim has less control over how long before a page in a particularly zone or container is cleaned and direct reclaimers depend on kswapd or flusher threads to do the necessary work. However, as filesystems sometimes ignore direct reclaim requests already, it is not expected to be a serious issue. Patch 1 disables writeback of filesystem pages from direct reclaim entirely. Anonymous pages are still written. Patch 2 removes dead code in lumpy reclaim as it is no longer able to synchronously write pages. This hurts lumpy reclaim but there is an expectation that compaction is used for hugepage allocations these days and lumpy reclaim's days are numbered. Patches 3-4 add warnings to XFS and ext4 if called from direct reclaim. With patch 1, this "never happens" and is intended to catch regressions in this logic in the future. Patch 5 disables writeback of filesystem pages from kswapd unless the priority is raised to the point where kswapd is considered to be in trouble. Patch 6 throttles reclaimers if too many dirty pages are being encountered and the zones or backing devices are congested. Patch 7 invalidates dirty pages found at the end of the LRU so they are reclaimed quickly after being written back rather than waiting for a reclaimer to find them I consider this series to be orthogonal to the writeback work but it is worth noting that the writeback work affects the viability of patch 8 in particular. I tested this on ext4 and xfs using fs_mark, a simple writeback test based on dd and a micro benchmark that does a streaming write to a large mapping (exercises use-once LRU logic) followed by streaming writes to a mix of anonymous and file-backed mappings. The command line for fs_mark when botted with 512M looked something like ./fs_mark -d /tmp/fsmark-2676 -D 100 -N 150 -n 150 -L 25 -t 1 -S0 -s 10485760 The number of files was adjusted depending on the amount of available memory so that the files created was about 3xRAM. For multiple threads, the -d switch is specified multiple times. The test machine is x86-64 with an older generation of AMD processor with 4 cores. The underlying storage was 4 disks configured as RAID-0 as this was the best configuration of storage I had available. Swap is on a separate disk. Dirty ratio was tuned to 40% instead of the default of 20%. Testing was run with and without monitors to both verify that the patches were operating as expected and that any performance gain was real and not due to interference from monitors. Here is a summary of results based on testing XFS. 
512M1P-xfs Files/s mean 32.69 ( 0.00%) 34.44 ( 5.08%) 512M1P-xfs Elapsed Time fsmark 51.41 48.29 512M1P-xfs Elapsed Time simple-wb 114.09 108.61 512M1P-xfs Elapsed Time mmap-strm 113.46 109.34 512M1P-xfs Kswapd efficiency fsmark 62% 63% 512M1P-xfs Kswapd efficiency simple-wb 56% 61% 512M1P-xfs Kswapd efficiency mmap-strm 44% 42% 512M-xfs Files/s mean 30.78 ( 0.00%) 35.94 (14.36%) 512M-xfs Elapsed Time fsmark 56.08 48.90 512M-xfs Elapsed Time simple-wb 112.22 98.13 512M-xfs Elapsed Time mmap-strm 219.15 196.67 512M-xfs Kswapd efficiency fsmark 54% 56% 512M-xfs Kswapd efficiency simple-wb 54% 55% 512M-xfs Kswapd efficiency mmap-strm 45% 44% 512M-4X-xfs Files/s mean 30.31 ( 0.00%) 33.33 ( 9.06%) 512M-4X-xfs Elapsed Time fsmark 63.26 55.88 512M-4X-xfs Elapsed Time simple-wb 100.90 90.25 512M-4X-xfs Elapsed Time mmap-strm 261.73 255.38 512M-4X-xfs Kswapd efficiency fsmark 49% 50% 512M-4X-xfs Kswapd efficiency simple-wb 54% 56% 512M-4X-xfs Kswapd efficiency mmap-strm 37% 36% 512M-16X-xfs Files/s mean 60.89 ( 0.00%) 65.22 ( 6.64%) 512M-16X-xfs Elapsed Time fsmark 67.47 58.25 512M-16X-xfs Elapsed Time simple-wb 103.22 90.89 512M-16X-xfs Elapsed Time mmap-strm 237.09 198.82 512M-16X-xfs Kswapd efficiency fsmark 45% 46% 512M-16X-xfs Kswapd efficiency simple-wb 53% 55% 512M-16X-xfs Kswapd efficiency mmap-strm 33% 33% Up until 512-4X, the FSmark improvements were statistically significant. For the 4X and 16X tests the results were within standard deviations but just barely. The time to completion for all tests is improved which is an important result. In general, kswapd efficiency is not affected by skipping dirty pages. 1024M1P-xfs Files/s mean 39.09 ( 0.00%) 41.15 ( 5.01%) 1024M1P-xfs Elapsed Time fsmark 84.14 80.41 1024M1P-xfs Elapsed Time simple-wb 210.77 184.78 1024M1P-xfs Elapsed Time mmap-strm 162.00 160.34 1024M1P-xfs Kswapd efficiency fsmark 69% 75% 1024M1P-xfs Kswapd efficiency simple-wb 71% 77% 1024M1P-xfs Kswapd efficiency mmap-strm 43% 44% 1024M-xfs Files/s mean 35.45 ( 0.00%) 37.00 ( 4.19%) 1024M-xfs Elapsed Time fsmark 94.59 91.00 1024M-xfs Elapsed Time simple-wb 229.84 195.08 1024M-xfs Elapsed Time mmap-strm 405.38 440.29 1024M-xfs Kswapd efficiency fsmark 79% 71% 1024M-xfs Kswapd efficiency simple-wb 74% 74% 1024M-xfs Kswapd efficiency mmap-strm 39% 42% 1024M-4X-xfs Files/s mean 32.63 ( 0.00%) 35.05 ( 6.90%) 1024M-4X-xfs Elapsed Time fsmark 103.33 97.74 1024M-4X-xfs Elapsed Time simple-wb 204.48 178.57 1024M-4X-xfs Elapsed Time mmap-strm 528.38 511.88 1024M-4X-xfs Kswapd efficiency fsmark 81% 70% 1024M-4X-xfs Kswapd efficiency simple-wb 73% 72% 1024M-4X-xfs Kswapd efficiency mmap-strm 39% 38% 1024M-16X-xfs Files/s mean 42.65 ( 0.00%) 42.97 ( 0.74%) 1024M-16X-xfs Elapsed Time fsmark 103.11 99.11 1024M-16X-xfs Elapsed Time simple-wb 200.83 178.24 1024M-16X-xfs Elapsed Time mmap-strm 397.35 459.82 1024M-16X-xfs Kswapd efficiency fsmark 84% 69% 1024M-16X-xfs Kswapd efficiency simple-wb 74% 73% 1024M-16X-xfs Kswapd efficiency mmap-strm 39% 40% All FSMark tests up to 16X had statistically significant improvements. For the most part, tests are completing faster with the exception of the streaming writes to a mixture of anonymous and file-backed mappings which were slower in two cases In the cases where the mmap-strm tests were slower, there was more swapping due to dirty pages being skipped. The number of additional pages swapped is almost identical to the fewer number of pages written from reclaim. In other words, roughly the same number of pages were reclaimed but swapping was slower. 
As the test is a bit unrealistic and stresses memory heavily, the small shift is acceptable. 4608M1P-xfs Files/s mean 29.75 ( 0.00%) 30.96 ( 3.91%) 4608M1P-xfs Elapsed Time fsmark 512.01 492.15 4608M1P-xfs Elapsed Time simple-wb 618.18 566.24 4608M1P-xfs Elapsed Time mmap-strm 488.05 465.07 4608M1P-xfs Kswapd efficiency fsmark 93% 86% 4608M1P-xfs Kswapd efficiency simple-wb 88% 84% 4608M1P-xfs Kswapd efficiency mmap-strm 46% 45% 4608M-xfs Files/s mean 27.60 ( 0.00%) 28.85 ( 4.33%) 4608M-xfs Elapsed Time fsmark 555.96 532.34 4608M-xfs Elapsed Time simple-wb 659.72 571.85 4608M-xfs Elapsed Time mmap-strm 1082.57 1146.38 4608M-xfs Kswapd efficiency fsmark 89% 91% 4608M-xfs Kswapd efficiency simple-wb 88% 82% 4608M-xfs Kswapd efficiency mmap-strm 48% 46% 4608M-4X-xfs Files/s mean 26.00 ( 0.00%) 27.47 ( 5.35%) 4608M-4X-xfs Elapsed Time fsmark 592.91 564.00 4608M-4X-xfs Elapsed Time simple-wb 616.65 575.07 4608M-4X-xfs Elapsed Time mmap-strm 1773.02 1631.53 4608M-4X-xfs Kswapd efficiency fsmark 90% 94% 4608M-4X-xfs Kswapd efficiency simple-wb 87% 82% 4608M-4X-xfs Kswapd efficiency mmap-strm 43% 43% 4608M-16X-xfs Files/s mean 26.07 ( 0.00%) 26.42 ( 1.32%) 4608M-16X-xfs Elapsed Time fsmark 602.69 585.78 4608M-16X-xfs Elapsed Time simple-wb 606.60 573.81 4608M-16X-xfs Elapsed Time mmap-strm 1549.75 1441.86 4608M-16X-xfs Kswapd efficiency fsmark 98% 98% 4608M-16X-xfs Kswapd efficiency simple-wb 88% 82% 4608M-16X-xfs Kswapd efficiency mmap-strm 44% 42% Unlike the other tests, the fsmark results are not statistically significant but the min and max times are both improved and for the most part, tests completed faster. There are other indications that this is an improvement as well. For example, in the vast majority of cases, there were fewer pages scanned by direct reclaim implying in many cases that stalls due to direct reclaim are reduced. KSwapd is scanning more due to skipping dirty pages which is unfortunate but the CPU usage is still acceptable In an earlier set of tests, I used blktrace and in almost all cases throughput throughout the entire test was higher. However, I ended up discarding those results as recording blktrace data was too heavy for my liking. On a laptop, I plugged in a USB stick and ran a similar tests of tests using it as backing storage. A desktop environment was running and for the entire duration of the tests, firefox and gnome terminal were launching and exiting to vaguely simulate a user. 1024M-xfs Files/s mean 0.41 ( 0.00%) 0.44 ( 6.82%) 1024M-xfs Elapsed Time fsmark 2053.52 1641.03 1024M-xfs Elapsed Time simple-wb 1229.53 768.05 1024M-xfs Elapsed Time mmap-strm 4126.44 4597.03 1024M-xfs Kswapd efficiency fsmark 84% 85% 1024M-xfs Kswapd efficiency simple-wb 92% 81% 1024M-xfs Kswapd efficiency mmap-strm 60% 51% 1024M-xfs Avg wait ms fsmark 5404.53 4473.87 1024M-xfs Avg wait ms simple-wb 2541.35 1453.54 1024M-xfs Avg wait ms mmap-strm 3400.25 3852.53 The mmap-strm results were hurt because firefox launching had a tendency to push the test out of memory. On the postive side, firefox launched marginally faster with the patches applied. Time to completion for many tests was faster but more importantly - the "Avg wait" time as measured by iostat was far lower implying the system would be more responsive. It was also the case that "Avg wait ms" on the root filesystem was lower. I tested it manually and while the system felt slightly more responsive while copying data to a USB stick, it was marginal enough that it could be my imagination. 
This patch: do not writeback filesystem pages in direct reclaim. When kswapd is failing to keep zones above the min watermark, a process will enter direct reclaim in the same manner kswapd does. If a dirty page is encountered during the scan, this page is written to backing storage using mapping->writepage. This causes two problems. First, it can result in very deep call stacks, particularly if the target storage or filesystem are complex. Some filesystems ignore write requests from direct reclaim as a result. The second is that a single-page flush is inefficient in terms of IO. While there is an expectation that the elevator will merge requests, this does not always happen. Quoting Christoph Hellwig; The elevator has a relatively small window it can operate on, and can never fix up a bad large scale writeback pattern. This patch prevents direct reclaim writing back filesystem pages by checking if current is kswapd. Anonymous pages are still written to swap as there is not the equivalent of a flusher thread for anonymous pages. If the dirty pages cannot be written back, they are placed back on the LRU lists. There is now a direct dependency on dirty page balancing to prevent too many pages in the system being dirtied which would prevent reclaim making forward progress. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Cc: Dave Chinner Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Wu Fengguang Cc: Jan Kara Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index d29b2bdb9e03..10f9c59aed55 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -865,6 +865,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { nr_dirty++; + /* + * Only kswapd can writeback filesystem pages to + * avoid risk of stack overflow + */ + if (page_is_file_cache(page) && !current_is_kswapd()) { + inc_zone_page_state(page, NR_VMSCAN_WRITE_SKIP); + goto keep_locked; + } + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) -- cgit v1.2.3 From a18bba061c789f5815c3efc3c80e6ac269911964 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:42 -0700 Subject: mm: vmscan: remove dead code related to lumpy reclaim waiting on pages under writeback Lumpy reclaim worked with two passes - the first which queued pages for IO and the second which waited on writeback. As direct reclaim can no longer write pages there is some dead code. This patch removes it but direct reclaim will continue to wait on pages under writeback while in synchronous reclaim mode. Signed-off-by: Mel Gorman Cc: Dave Chinner Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Wu Fengguang Cc: Jan Kara Cc: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 10f9c59aed55..5c596654bd37 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -495,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, return PAGE_ACTIVATE; } - /* - * Wait on writeback if requested to. 
This happens when - * direct reclaiming a large contiguous area and the - * first attempt to free a range of pages fails. - */ - if (PageWriteback(page) && - (sc->reclaim_mode & RECLAIM_MODE_SYNC)) - wait_on_page_writeback(page); - if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); @@ -804,12 +795,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageWriteback(page)) { /* - * Synchronous reclaim is performed in two passes, - * first an asynchronous pass over the list to - * start parallel writeback, and a second synchronous - * pass to wait for the IO to complete. Wait here - * for any page for which writeback has already - * started. + * Synchronous reclaim cannot queue pages for + * writeback due to the possibility of stack overflow + * but if it encounters a page under writeback, wait + * for the IO to complete. */ if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs) @@ -1414,7 +1403,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, } /* - * Returns true if the caller should wait to clean dirty/writeback pages. + * Returns true if a direct reclaim should wait on pages under writeback. * * If we are direct reclaiming for contiguous pages and we do not reclaim * everything in the list, try again and wait for writeback IO to complete. -- cgit v1.2.3 From f84f6e2b0868f198f97a32ba503d6f9f319a249a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:51 -0700 Subject: mm: vmscan: do not writeback filesystem pages in kswapd except in high priority It is preferable that no dirty pages are dispatched for cleaning from the page reclaim path. At normal priorities, this patch prevents kswapd writing pages. However, page reclaim does have a requirement that pages be freed in a particular zone. If it is failing to make sufficient progress (reclaiming < SWAP_CLUSTER_MAX at any priority priority), the priority is raised to scan more pages. A priority of DEF_PRIORITY - 3 is considered to be the point where kswapd is getting into trouble reclaiming pages. If this priority is reached, kswapd will dispatch pages for writing. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Cc: Dave Chinner Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Wu Fengguang Cc: Jan Kara Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5c596654bd37..15e3a29fdb23 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -750,7 +750,8 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) */ static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, - struct scan_control *sc) + struct scan_control *sc, + int priority) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -856,9 +857,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow + * avoid risk of stack overflow but do not writeback + * unless under significant pressure. 
*/ - if (page_is_file_cache(page) && !current_is_kswapd()) { + if (page_is_file_cache(page) && + (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { inc_zone_page_state(page, NR_VMSCAN_WRITE_SKIP); goto keep_locked; } @@ -1509,12 +1512,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc); + nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc); + nr_reclaimed += shrink_page_list(&page_list, zone, sc, priority); } local_irq_disable(); -- cgit v1.2.3 From 92df3a723f84cdf8133560bbff950a7a99e92bc9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:56 -0700 Subject: mm: vmscan: throttle reclaim if encountering too many dirty pages under writeback Workloads that are allocating frequently and writing files place a large number of dirty pages on the LRU. With use-once logic, it is possible for them to reach the end of the LRU quickly requiring the reclaimer to scan more to find clean pages. Ordinarily, processes that are dirtying memory will get throttled by dirty balancing but this is a global heuristic and does not take into account that LRUs are maintained on a per-zone basis. This can lead to a situation whereby reclaim is scanning heavily, skipping over a large number of pages under writeback and recycling them around the LRU consuming CPU. This patch checks how many of the number of pages isolated from the LRU were dirty and under writeback. If a percentage of them under writeback, the process will be throttled if a backing device or the zone is congested. Note that this applies whether it is anonymous or file-backed pages that are under writeback meaning that swapping is potentially throttled. This is intentional due to the fact if the swap device is congested, scanning more pages and dispatching more IO is not going to help matters. The percentage that must be in writeback depends on the priority. At default priority, all of them must be dirty. At DEF_PRIORITY-1, 50% of them must be, DEF_PRIORITY-2, 25% etc. i.e. as pressure increases the greater the likelihood the process will get throttled to allow the flusher threads to make some progress. 
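Worked out for nr_taken = SWAP_CLUSTER_MAX (32) isolated pages, the throttling condition added below, nr_writeback >= nr_taken >> (DEF_PRIORITY - priority), behaves as follows (the leading nr_writeback check keeps a zero threshold from firing when nothing is under writeback):

    priority == DEF_PRIORITY        threshold = 32 >> 0 = 32  -> all isolated
                                    pages must be under writeback to throttle
    priority == DEF_PRIORITY - 1    threshold = 32 >> 1 = 16  -> half of them
    priority == DEF_PRIORITY - 2    threshold = 32 >> 2 =  8  -> a quarter
    ...
    priority <= DEF_PRIORITY - 5    threshold reaches 1 (then 0) -> a single
                                    writeback page triggers wait_iff_congested()

So the throttle only has real effect once reclaim pressure has already pushed the scanning priority a few steps below DEF_PRIORITY, matching the DEF_PRIORITY..DEF_PRIORITY-2 range described above.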
Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Acked-by: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Jan Kara Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 15e3a29fdb23..7b0573f33a27 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -751,7 +751,9 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, - int priority) + int priority, + unsigned long *ret_nr_dirty, + unsigned long *ret_nr_writeback) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -759,6 +761,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; + unsigned long nr_writeback = 0; cond_resched(); @@ -795,6 +798,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); if (PageWriteback(page)) { + nr_writeback++; /* * Synchronous reclaim cannot queue pages for * writeback due to the possibility of stack overflow @@ -1000,6 +1004,8 @@ keep_lumpy: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + *ret_nr_dirty += nr_dirty; + *ret_nr_writeback += nr_writeback; return nr_reclaimed; } @@ -1460,6 +1466,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_taken; unsigned long nr_anon; unsigned long nr_file; + unsigned long nr_dirty = 0; + unsigned long nr_writeback = 0; isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; while (unlikely(too_many_isolated(zone, file, sc))) { @@ -1512,12 +1520,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority); + nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, + &nr_dirty, &nr_writeback); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc, priority); + nr_reclaimed += shrink_page_list(&page_list, zone, sc, + priority, &nr_dirty, &nr_writeback); } local_irq_disable(); @@ -1527,6 +1537,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + /* + * If reclaim is isolating dirty pages under writeback, it implies + * that the long-lived page allocation rate is exceeding the page + * laundering rate. Either the global limits are not being effective + * at throttling processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing device. The + * only option is to throttle from reclaim context which is not ideal + * as there is no guarantee the dirtying process is throttled in the + * same way balance_dirty_pages() manages. + * + * This scales the number of dirty pages that must be under writeback + * before throttling depending on priority. 
It is a simple backoff + * function that has the most effect in the range DEF_PRIORITY to + * DEF_PRIORITY-2 which is the priority reclaim is considered to be + * in trouble and reclaim is considered to be in trouble. + * + * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle + * DEF_PRIORITY-1 50% must be PageWriteback + * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble + * ... + * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any + * isolated page is PageWriteback + */ + if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, -- cgit v1.2.3 From 49ea7eb65e7c5060807fb9312b1ad4c3eab82e2c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:07:59 -0700 Subject: mm: vmscan: immediately reclaim end-of-LRU dirty pages when writeback completes When direct reclaim encounters a dirty page, it gets recycled around the LRU for another cycle. This patch marks the page PageReclaim similar to deactivate_page() so that the page gets reclaimed almost immediately after the page gets cleaned. This is to avoid reclaiming clean pages that are younger than a dirty page encountered at the end of the LRU that might have been something like a use-once page. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Cc: Dave Chinner Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Jan Kara Cc: Minchan Kim Cc: Rik van Riel Cc: Mel Gorman Cc: Alex Elder Cc: Theodore Ts'o Cc: Chris Mason Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7b0573f33a27..a297603d35bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -866,7 +866,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { - inc_zone_page_state(page, NR_VMSCAN_WRITE_SKIP); + /* + * Immediately reclaim when written back. + * Similar in principal to deactivate_page() + * except we already have the page isolated + * and know it's dirty + */ + inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + SetPageReclaim(page); + goto keep_locked; } -- cgit v1.2.3 From 16fb951237c2b0b28037b992ee44e7ee401c30d1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 31 Oct 2011 17:08:02 -0700 Subject: vmscan: count pages into balanced for zone with good watermark It's possible a zone watermark is ok when entering the balance_pgdat() loop, while the zone is within the requested classzone_idx. Count pages from this zone into `balanced'. In this way, we can skip shrinking zones too much for high order allocation. 
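For orientation only, the hunk below lands in the final order > 0 pass of balance_pgdat() that decides whether kswapd may stop rebalancing. Loosely paraphrased (the surrounding checks may differ in detail from the exact source of that kernel), the loop looks something like:

    /* rough paraphrase, not the literal kernel code */
    for (i = 0; i <= end_zone; i++) {
            struct zone *zone = pgdat->node_zones + i;

            if (!populated_zone(zone))
                    continue;
            /* ... go around again if this zone is not yet balanced ... */

            /* If balanced, clear the congested flag */
            zone_clear_flag(zone, ZONE_CONGESTED);
            /* new: a zone whose watermark was already fine and which sits
             * within the requested classzone_idx now counts toward
             * `balanced`, so high-order balancing can finish earlier */
            if (i <= *classzone_idx)
                    balanced += zone->present_pages;
    }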
Signed-off-by: Shaohua Li Acked-by: Mel Gorman Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index a297603d35bc..77ee24fc891a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2748,6 +2748,8 @@ out: /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= *classzone_idx) + balanced += zone->present_pages; } } -- cgit v1.2.3 From d2ebd0f6b89567eb93ead4e2ca0cbe03021f344b Mon Sep 17 00:00:00 2001 From: "Alex,Shi" Date: Mon, 31 Oct 2011 17:08:39 -0700 Subject: kswapd: avoid unnecessary rebalance after an unsuccessful balancing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 215ddd66 ("mm: vmscan: only read new_classzone_idx from pgdat when reclaiming successfully") , Mel Gorman said kswapd is better to sleep after a unsuccessful balancing if there is tighter reclaim request pending in the balancing. But in the following scenario, kswapd do something that is not matched our expectation. The patch fixes this issue. 1, Read pgdat request A (classzone_idx, order = 3) 2, balance_pgdat() 3, During pgdat, a new pgdat request B (classzone_idx, order = 5) is placed 4, balance_pgdat() returns but failed since returned order = 0 5, pgdat of request A assigned to balance_pgdat(), and do balancing again. While the expectation behavior of kswapd should try to sleep. Signed-off-by: Alex Shi Reviewed-by: Tim Chen Acked-by: Mel Gorman Tested-by: Pádraig Brady Cc: Rik van Riel Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 77ee24fc891a..dd5fc86dbb82 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2823,7 +2823,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; + unsigned balanced_order; int classzone_idx, new_classzone_idx; + int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2854,7 +2856,9 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; + balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; + balanced_classzone_idx = classzone_idx; for ( ; ; ) { int ret; @@ -2863,7 +2867,8 @@ static int kswapd(void *p) * new request of a similar or harder type will succeed soon * so consider going to sleep on the basis we reclaimed at */ - if (classzone_idx >= new_classzone_idx && order == new_order) { + if (balanced_classzone_idx >= new_classzone_idx && + balanced_order == new_order) { new_order = pgdat->kswapd_max_order; new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; @@ -2878,7 +2883,8 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, order, classzone_idx); + kswapd_try_to_sleep(pgdat, balanced_order, + balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; @@ -2895,7 +2901,9 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - order = balance_pgdat(pgdat, order, &classzone_idx); + balanced_classzone_idx = classzone_idx; + balanced_order = balance_pgdat(pgdat, order, + &balanced_classzone_idx); } } return 
0; -- cgit v1.2.3 From f0dfcde099453aa4c0dc42473828d15a6d492936 Mon Sep 17 00:00:00 2001 From: "Alex,Shi" Date: Mon, 31 Oct 2011 17:08:45 -0700 Subject: kswapd: assign new_order and new_classzone_idx after wakeup in sleeping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There 2 places to read pgdat in kswapd. One is return from a successful balance, another is waked up from kswapd sleeping. The new_order and new_classzone_idx represent the balance input order and classzone_idx. But current new_order and new_classzone_idx are not assigned after kswapd_try_to_sleep(), that will cause a bug in the following scenario. 1: after a successful balance, kswapd goes to sleep, and new_order = 0; new_classzone_idx = __MAX_NR_ZONES - 1; 2: kswapd waked up with order = 3 and classzone_idx = ZONE_NORMAL 3: in the balance_pgdat() running, a new balance wakeup happened with order = 5, and classzone_idx = ZONE_NORMAL 4: the first wakeup(order = 3) finished successufly, return order = 3 but, the new_order is still 0, so, this balancing will be treated as a failed balance. And then the second tighter balancing will be missed. So, to avoid the above problem, the new_order and new_classzone_idx need to be assigned for later successful comparison. Signed-off-by: Alex Shi Acked-by: Mel Gorman Reviewed-by: Minchan Kim Tested-by: Pádraig Brady Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index dd5fc86dbb82..51bc4bf3f723 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2887,6 +2887,8 @@ static int kswapd(void *p) balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; + new_order = order; + new_classzone_idx = classzone_idx; pgdat->kswapd_max_order = 0; pgdat->classzone_idx = pgdat->nr_zones - 1; } -- cgit v1.2.3 From 3f380998aeb51b99d5d22cadb41162e1e9db70d2 Mon Sep 17 00:00:00 2001 From: Kautuk Consul Date: Mon, 31 Oct 2011 17:09:11 -0700 Subject: vmscan.c: fix invalid strict_strtoul() check in write_scan_unevictable_node() write_scan_unevictable_node() checks the value req returned by strict_strtoul() and returns 1 if req is 0. However, when strict_strtoul() returns 0, it means successful conversion of buf to unsigned long. Due to this, the function was not proceeding to scan the zones for unevictable pages even though we write a valid value to the scan_unevictable_pages sys file. Change this check slightly to check for invalid value in buf as well as 0 value stored in res after successful conversion via strict_strtoul. In both cases, we do not perform the scanning of this node's zones. 
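As a reminder of the convention the fix relies on: strict_strtoul() returns 0 when the string parses cleanly and a negative errno otherwise, with the converted value stored through its third argument, so the usual calling pattern is (a generic sketch, not the patched function itself):

    unsigned long val;
    int err = strict_strtoul(buf, 10, &val);

    if (err)        /* non-zero return: buf was not a valid number */
            return err;
    if (!val)       /* parsed fine, but zero means "no-op" for this interface */
            return 1;
    /* ... act on val ... */

The original check conflated the two: it tested the return code as if it were the parsed value, so a successful conversion (return value 0) was treated as the zero no-op case.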
Signed-off-by: Kautuk Consul Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 51bc4bf3f723..ac644fe85589 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3520,8 +3520,8 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, unsigned long res; unsigned long req = strict_strtoul(buf, 10, &res); - if (!req) - return 1; /* zero is no-op */ + if (req || !res) + return 1; /* Invalid input or zero is no-op */ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!populated_zone(zone)) -- cgit v1.2.3 From 264e56d8247ef6e31ed4386926cae86c61ddcb18 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 31 Oct 2011 17:09:13 -0700 Subject: mm: disable user interface to manually rescue unevictable pages At one point, anonymous pages were supposed to go on the unevictable list when no swap space was configured, and the idea was to manually rescue those pages after adding swap and making them evictable again. But nowadays, swap-backed pages on the anon LRU list are not scanned without available swap space anyway, so there is no point in moving them to a separate list anymore. The manual rescue could also be used in case pages were stranded on the unevictable list due to race conditions. But the code has been around for a while now and newly discovered bugs should be properly reported and dealt with instead of relying on such a manual fixup. In addition to the lack of a usecase, the sysfs interface to rescue pages from a specific NUMA node has been broken since its introduction, so it's unlikely that anybody ever relied on that. This patch removes the functionality behind the sysctl and the node-interface and emits a one-time warning when somebody tries to access either of them. Signed-off-by: Johannes Weiner Reported-by: Kautuk Consul Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 84 ++++++------------------------------------------------------- 1 file changed, 8 insertions(+), 76 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index ac644fe85589..3886b0bd7869 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3417,66 +3417,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) } -/** - * scan_zone_unevictable_pages - check unevictable list for evictable pages - * @zone - zone of which to scan the unevictable list - * - * Scan @zone's unevictable LRU lists to check for pages that have become - * evictable. Move those that have to @zone's inactive list where they - * become candidates for reclaim, unless shrink_inactive_zone() decides - * to reactivate them. Pages that are still unevictable are rotated - * back onto @zone's unevictable list. 
- */ -#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -static void scan_zone_unevictable_pages(struct zone *zone) +static void warn_scan_unevictable_pages(void) { - struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; - unsigned long scan; - unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); - - while (nr_to_scan > 0) { - unsigned long batch_size = min(nr_to_scan, - SCAN_UNEVICTABLE_BATCH_SIZE); - - spin_lock_irq(&zone->lru_lock); - for (scan = 0; scan < batch_size; scan++) { - struct page *page = lru_to_page(l_unevictable); - - if (!trylock_page(page)) - continue; - - prefetchw_prev_lru_page(page, l_unevictable, flags); - - if (likely(PageLRU(page) && PageUnevictable(page))) - check_move_unevictable_page(page, zone); - - unlock_page(page); - } - spin_unlock_irq(&zone->lru_lock); - - nr_to_scan -= batch_size; - } -} - - -/** - * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages - * - * A really big hammer: scan all zones' unevictable LRU lists to check for - * pages that have become evictable. Move those back to the zones' - * inactive list where they become candidates for reclaim. - * This occurs when, e.g., we have unswappable pages on the unevictable lists, - * and we add swap to the system. As such, it runs in the context of a task - * that has possibly/probably made some previously unevictable pages - * evictable. - */ -static void scan_all_zones_unevictable_pages(void) -{ - struct zone *zone; - - for_each_zone(zone) { - scan_zone_unevictable_pages(zone); - } + printk_once(KERN_WARNING + "The scan_unevictable_pages sysctl/node-interface has been " + "disabled for lack of a legitimate use case. If you have " + "one, please send an email to linux-mm@kvack.org.\n"); } /* @@ -3489,11 +3435,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + warn_scan_unevictable_pages(); proc_doulongvec_minmax(table, write, buffer, length, ppos); - - if (write && *(unsigned long *)table->data) - scan_all_zones_unevictable_pages(); - scan_unevictable_pages = 0; return 0; } @@ -3508,6 +3451,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev, struct sysdev_attribute *attr, char *buf) { + warn_scan_unevictable_pages(); return sprintf(buf, "0\n"); /* always zero; should fit... */ } @@ -3515,19 +3459,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct zone *node_zones = NODE_DATA(dev->id)->node_zones; - struct zone *zone; - unsigned long res; - unsigned long req = strict_strtoul(buf, 10, &res); - - if (req || !res) - return 1; /* Invalid input or zero is no-op */ - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - scan_zone_unevictable_pages(zone); - } + warn_scan_unevictable_pages(); return 1; } -- cgit v1.2.3 From 21ee9f398be209ccbb62929d35961ca1ed48eec3 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 31 Oct 2011 17:09:28 -0700 Subject: vmscan: add barrier to prevent evictable page in unevictable list When a race between putback_lru_page() and shmem_lock with lock=0 happens, progrom execution order is as follows, but clear_bit in processor #1 could be reordered right before spin_unlock of processor #1. Then, the page would be stranded on the unevictable list. 
spin_lock SetPageLRU spin_unlock clear_bit(AS_UNEVICTABLE) spin_lock if PageLRU() if !test_bit(AS_UNEVICTABLE) move evictable list smp_mb if !test_bit(AS_UNEVICTABLE) move evictable list spin_unlock But, pagevec_lookup() in scan_mapping_unevictable_pages() has rcu_read_[un]lock() so it could protect reordering before reaching test_bit(AS_UNEVICTABLE) on processor #1 so this problem never happens. But it's a unexpected side effect and we should solve this problem properly. This patch adds a barrier after mapping_clear_unevictable. I didn't meet this problem but just found during review. Signed-off-by: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Lee Schermerhorn Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3886b0bd7869..f51a33e8ed89 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -633,13 +633,14 @@ redo: lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); /* - * When racing with an mlock clearing (page is - * unlocked), make sure that if the other thread does - * not observe our setting of PG_lru and fails - * isolation, we see PG_mlocked cleared below and move + * When racing with an mlock or AS_UNEVICTABLE clearing + * (page is unlocked) make sure that if the other thread + * does not observe our setting of PG_lru and fails + * isolation/check_move_unevictable_page, + * we see PG_mlocked/AS_UNEVICTABLE cleared below and move * the page back to the evictable list. * - * The other side is TestClearPageMlocked(). + * The other side is TestClearPageMlocked() or shmem_lock(). */ smp_mb(); } -- cgit v1.2.3 From e0887c19b2daa140f20ca8104bdc5740f39dbb86 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 31 Oct 2011 17:09:31 -0700 Subject: vmscan: limit direct reclaim for higher order allocations When suffering from memory fragmentation due to unfreeable pages, THP page faults will repeatedly try to compact memory. Due to the unfreeable pages, compaction fails. Needless to say, at that point page reclaim also fails to create free contiguous 2MB areas. However, that doesn't stop the current code from trying, over and over again, and freeing a minimum of 4MB (2UL << sc->order pages) at every single invocation. This resulted in my 12GB system having 2-3GB free memory, a corresponding amount of used swap and very sluggish response times. This can be avoided by having the direct reclaim code not reclaim from zones that already have plenty of free memory available for compaction. If compaction still fails due to unmovable memory, doing additional reclaim will only hurt the system, not help. [jweiner@redhat.com: change comment to explain the order check] Signed-off-by: Rik van Riel Acked-by: Johannes Weiner Acked-by: Mel Gorman Cc: Andrea Arcangeli Reviewed-by: Minchan Kim Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index f51a33e8ed89..7e0f05797388 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2125,6 +2125,22 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + if (COMPACTION_BUILD) { + /* + * If we already have plenty of memory + * free for compaction, don't free any + * more. 
Even though compaction is + * invoked for any non-zero order, + * only frequent costly order + * reclamation is disruptive enough to + * become a noticable problem, like + * transparent huge page allocations. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER && + (compaction_suitable(zone, sc->order) || + compaction_deferred(zone))) + continue; + } /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and -- cgit v1.2.3 From e0c23279c9f800c403f37511484d9014ac83adec Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 31 Oct 2011 17:09:33 -0700 Subject: vmscan: abort reclaim/compaction if compaction can proceed If compaction can proceed, shrink_zones() stops doing any work but its callers still call shrink_slab() which raises the priority and potentially sleeps. This is unnecessary and wasteful so this patch aborts direct reclaim/compaction entirely if compaction can proceed. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Minchan Kim Acked-by: Johannes Weiner Cc: Josh Boyer Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index 7e0f05797388..a90c603a8d02 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2103,14 +2103,19 @@ restart: * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly + * high-order allocation and compaction is either ready to begin or deferred. + * This indicates to the caller that it should retry the allocation or fail. */ -static void shrink_zones(int priority, struct zonelist *zonelist, +static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + bool should_abort_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2127,19 +2132,20 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; /* Let kswapd poll it */ if (COMPACTION_BUILD) { /* - * If we already have plenty of memory - * free for compaction, don't free any - * more. Even though compaction is - * invoked for any non-zero order, - * only frequent costly order - * reclamation is disruptive enough to - * become a noticable problem, like - * transparent huge page allocations. + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticable problem, like transparent huge page + * allocations. 
*/ if (sc->order > PAGE_ALLOC_COSTLY_ORDER && (compaction_suitable(zone, sc->order) || - compaction_deferred(zone))) + compaction_deferred(zone))) { + should_abort_reclaim = true; continue; + } } /* * This steals pages from memory cgroups over softlimit @@ -2158,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } + + return should_abort_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2222,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->mem_cgroup); - shrink_zones(priority, zonelist, sc); + if (shrink_zones(priority, zonelist, sc)) + break; + /* * Don't shrink slabs when reclaiming memory from * over limit cgroups -- cgit v1.2.3