From 07b4e2bc9c35ea88cbd36d806fcd5e3bcbf022be Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:51 -0700 Subject: mm: sparse: fix section usemap placement calculation Commit 238305bb4d41 ("mm: remove sparsemem allocation details from the bootmem allocator") introduced a bug in the allocation goal calculation that put section usemaps not in the same section as the node descriptors, creating unnecessary hotplug dependencies between them: node 0 must be removed before remove section 16399 node 1 must be removed before remove section 16399 node 2 must be removed before remove section 16399 node 3 must be removed before remove section 16399 node 4 must be removed before remove section 16399 node 5 must be removed before remove section 16399 node 6 must be removed before remove section 16399 The reason is that it applies PAGE_SECTION_MASK to the physical address of the node descriptor when finding a suitable place to put the usemap, when this mask is actually intended to be used with PFNs. Because the PFN mask is wider, the target address will point beyond the wanted section holding the node descriptor and the node must be offlined before the section holding the usemap can go. Fix this by extending the mask to address width before use. Signed-off-by: Yinghai Lu Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index 6a4bf9160e85..e861397016a9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -287,7 +287,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * from the same section as the pgdat where possible to avoid * this problem. */ - goal = __pa(pgdat) & PAGE_SECTION_MASK; + goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); return __alloc_bootmem_node_nopanic(host_pgdat, size, SMP_CACHE_BYTES, goal); -- cgit v1.2.3 From 99ab7b19440a72ebdf225f99b20f8ef40decee86 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:53 -0700 Subject: mm: sparse: fix usemap allocation above node descriptor section After commit f5bf18fa22f8 ("bootmem/sparsemem: remove limit constraint in alloc_bootmem_section"), usemap allocations may easily be placed outside the optimal section that holds the node descriptor, even if there is space available in that section. This results in unnecessary hotplug dependencies that need to have the node unplugged before the section holding the usemap. The reason is that the bootmem allocator doesn't guarantee a linear search starting from the passed allocation goal but may start out at a much higher address absent an upper limit. Fix this by trying the allocation with the limit at the section end, then retry without if that fails. This keeps the fix from f5bf18fa22f8 of not panicking if the allocation does not fit in the section, but still makes sure to try to stay within the section at first. Signed-off-by: Yinghai Lu Signed-off-by: Johannes Weiner Cc: [3.3.x, 3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index e861397016a9..c7bb952400c8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -275,8 +275,9 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - pg_data_t *host_pgdat; - unsigned long goal; + unsigned long goal, limit; + unsigned long *p; + int nid; /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while @@ -288,9 +289,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * this problem. */ goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); - host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); - return __alloc_bootmem_node_nopanic(host_pgdat, size, - SMP_CACHE_BYTES, goal); + limit = goal + (1UL << PA_SECTION_SHIFT); + nid = early_pfn_to_nid(goal >> PAGE_SHIFT); +again: + p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, + SMP_CACHE_BYTES, goal, limit); + if (!p && limit) { + limit = 0; + goto again; + } + return p; } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) -- cgit v1.2.3 From ca57df79d4f64e1a4886606af4289d40636189c5 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 31 Jul 2012 16:43:19 -0700 Subject: mm: setup pageblock_order before it's used by sparsemem On architectures with CONFIG_HUGETLB_PAGE_SIZE_VARIABLE set, such as Itanium, pageblock_order is a variable with default value of 0. It's set to the right value by set_pageblock_order() in function free_area_init_core(). But pageblock_order may be used by sparse_init() before free_area_init_core() is called along path: sparse_init() ->sparse_early_usemaps_alloc_node() ->usemap_size() ->SECTION_BLOCKFLAGS_BITS ->((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) The uninitialized pageblock_size will cause memory wasting because usemap_size() returns a much bigger value then it's really needed. For example, on an Itanium platform, sparse_init() pageblock_order=0 usemap_size=24576 free_area_init_core() before pageblock_order=0, usemap_size=24576 free_area_init_core() after pageblock_order=12, usemap_size=8 That means 24K memory has been wasted for each section, so fix it by calling set_pageblock_order() from sparse_init(). Signed-off-by: Xishi Qiu Signed-off-by: Jiang Liu Cc: Tony Luck Cc: Yinghai Lu Cc: KAMEZAWA Hiroyuki Cc: Benjamin Herrenschmidt Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index c7bb952400c8..950981fd07c5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -493,6 +493,9 @@ void __init sparse_init(void) struct page **map_map; #endif + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); + /* * map is using big page (aka 2M in x86 64 bit) * usemap is less one page (aka 24 bytes) -- cgit v1.2.3 From 5b760e64a64c8940cdccd0ba6fce19a9bd010d20 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:46:02 -0700 Subject: mm/sparse: optimize sparse_index_alloc With CONFIG_SPARSEMEM_EXTREME, the two levels of memory section descriptors are allocated from slab or bootmem. When allocating from slab, let slab/bootmem allocator clear the memory chunk. We needn't clear it explicitly. Signed-off-by: Gavin Shan Reviewed-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index 950981fd07c5..fa933f43b2c9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -65,14 +65,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) if (slab_is_available()) { if (node_state(nid, N_HIGH_MEMORY)) - section = kmalloc_node(array_size, GFP_KERNEL, nid); + section = kzalloc_node(array_size, GFP_KERNEL, nid); else - section = kmalloc(array_size, GFP_KERNEL); - } else + section = kzalloc(array_size, GFP_KERNEL); + } else { section = alloc_bootmem_node(NODE_DATA(nid), array_size); - - if (section) - memset(section, 0, array_size); + } return section; } -- cgit v1.2.3 From db36a46113e101a8aa2d6ede41e78f2eaabed3f1 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:46:04 -0700 Subject: mm/sparse: more checks on mem_section number __section_nr() was implemented to retrieve the corresponding memory section number according to its descriptor. It's possible that the specified memory section descriptor doesn't exist in the global array. So add more checking on that and report an error for a wrong case. Signed-off-by: Gavin Shan Acked-by: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index fa933f43b2c9..42ca0ea9af1b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -130,6 +130,8 @@ int __section_nr(struct mem_section* ms) break; } + VM_BUG_ON(root_nr == NR_SECTION_ROOTS); + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } -- cgit v1.2.3 From c1c9518331969f97ea403bac66f0fd4a85d204d5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:46:06 -0700 Subject: mm/sparse: remove index_init_lock sparse_index_init() uses the index_init_lock spinlock to protect root mem_section assignment. The lock is not necessary anymore because the function is called only during boot (during paging init which is executed only from a single CPU) and from the hotplug code (by add_memory() via arch_add_memory()) which uses mem_hotplug_mutex. The lock was introduced by 28ae55c9 ("sparsemem extreme: hotplug preparation") and sparse_index_init() was used only during boot at that time. Later when the hotplug code (and add_memory()) was introduced there was no synchronization so it was possible to online more sections from the same root probably (though I am not 100% sure about that). The first synchronization has been added by 6ad696d2 ("mm: allow memory hotplug and hibernation in the same kernel") which was later replaced by the mem_hotplug_mutex - 20d6c96b ("mem-hotplug: introduce {un}lock_memory_hotplug()"). Let's remove the lock as it is not needed and it makes the code more confusing. [mhocko@suse.cz: changelog] Signed-off-by: Gavin Shan Reviewed-by: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'mm/sparse.c') diff --git a/mm/sparse.c b/mm/sparse.c index 42ca0ea9af1b..fac95f2888f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -77,7 +77,6 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) static int __meminit sparse_index_init(unsigned long section_nr, int nid) { - static DEFINE_SPINLOCK(index_init_lock); unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; int ret = 0; @@ -88,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) section = sparse_index_alloc(nid); if (!section) return -ENOMEM; - /* - * This lock keeps two different sections from - * reallocating for the same index - */ - spin_lock(&index_init_lock); - - if (mem_section[root]) { - ret = -EEXIST; - goto out; - } mem_section[root] = section; -out: - spin_unlock(&index_init_lock); + return ret; } #else /* !SPARSEMEM_EXTREME */ -- cgit v1.2.3