From bbd08c72c6b3344a230e15264d6cd444ed76b958 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Mon, 19 May 2014 23:04:55 -0500 Subject: powerpc/e6500: hw tablewalk: clear TID in kernel indirect entries Previously TID was being cleared before the tlbsx, but not after. This can lead to a multiway hit between a TLB entry with TID=0 (previously inserted when PID=0) and a TLB entry with TID!=0 that matches PID. This can theoretically result in undefined behavior, though we probably get lucky due to the details of the overlap. It also results in the inability to use multihit detection to detect other conflicting TLB entries, as well as poorer TLB utilization due to duplicating kernel TLB entries. Rather than try to patch up MAS1 after tlbsx, the entire value is saved/restored as with MAS2. I observed a slight improvement in TLB miss performance with this patch applied. Signed-off-by: Scott Wood Reported-by: Ed Swarthout --- arch/powerpc/mm/tlb_low_64e.S | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 356e8b41fb09..3298d1039e80 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -322,19 +322,17 @@ tlb_miss_common_e6500: b 1b .previous - mfspr r15,SPRN_MAS2 + mfspr r15,SPRN_MAS1 + mfspr r10,SPRN_MAS2 tlbsx 0,r16 + mtspr SPRN_MAS2,r10 mfspr r10,SPRN_MAS1 + mtspr SPRN_MAS1,r15 + andis. r10,r10,MAS1_VALID@h bne tlb_miss_done_e6500 - /* Undo MAS-damage from the tlbsx */ - mfspr r10,SPRN_MAS1 - oris r10,r10,MAS1_VALID@h - mtspr SPRN_MAS1,r10 - mtspr SPRN_MAS2,r15 - /* Now, we need to walk the page tables. First check if we are in * range. */ -- cgit v1.2.3 From 1cb4ed92f6a5a8961b122d11e651870ba741245b Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 22 May 2014 16:10:35 -0500 Subject: powerpc/e6500: hw tablewalk: fix recursive tlb lock on cpu 0 Commit 82d86de25b9c99db546e17c6f7ebf9a691da557e "TLB lock recursive" introduced a bug whereby cpu 0 uses the same value for "lock held" as is used to indicate that the lock is free. This means that cpu 1 can acquire the lock whenever it wants, regardless of whether cpu 0 has it locked, which in turn means we can get duplicate TLB entries. Add one to the CPU value to ensure we do not use zero as a "lock held" value. Signed-off-by: Scott Wood Reported-by: Ed Swarthout --- arch/powerpc/mm/tlb_low_64e.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 3298d1039e80..131f1f412bdd 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -296,7 +296,7 @@ itlb_miss_fault_bolted: * r14 = page table base * r13 = PACA * r11 = tlb_per_core ptr - * r10 = cpu number + * r10 = crap (free to use) */ tlb_miss_common_e6500: /* @@ -309,6 +309,7 @@ tlb_miss_common_e6500: lhz r10,PACAPACAINDEX(r13) cmpdi r15,0 cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ + addi r10,r10,1 bne 2f stbcx. r10,0,r11 bne 1b -- cgit v1.2.3 From 7c48005036a09142a662487ac52000ab845a07e8 Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Fri, 30 May 2014 17:59:15 +0300 Subject: powerpc/booke64: wrap tlb lock and search in htw miss with FTR_SMT Virtualized environments may expose a e6500 dual-threaded core as two single-threaded e6500 cores. 
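As an aside on the "recursive tlb lock" fix above: a minimal C sketch of the per-core lock protocol after the change, assuming a single byte lock where 0 means free and ignoring the recursive-acquire path the real assembly handles via cr1. The names (tcd_lock, tcd_lock_acquire) are illustrative, not the kernel's, and the atomics stand in for the lbarx/stbcx. sequence.

/* Illustrative only: the "lock held" token is cpu + 1, so CPU 0 no
 * longer stores the same value (0) that also means "unlocked".
 */
static unsigned char tcd_lock;			/* one per core in practice */

static void tcd_lock_acquire(unsigned int cpu)
{
	unsigned char token = cpu + 1;		/* never zero */

	/* spin until we atomically swap 0 -> token */
	while (!__sync_bool_compare_and_swap(&tcd_lock, 0, token))
		;
}

static void tcd_lock_release(void)
{
	__sync_synchronize();			/* order TLB updates before release */
	tcd_lock = 0;
}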
Take advantage of this and get rid of the tlb lock and the trap-causing tlbsx in the htw miss handler by guarding with CPU_FTR_SMT, as it's already being done in the bolted tlb1 miss handler. As seen in the results below, measurements done with lmbench random memory access latency test running under Freescale's Embedded Hypervisor, there is a ~34% improvement. Memory latencies in nanoseconds - smaller is better (WARNING - may not be correct, check graphs) ---------------------------------------------------- Host Mhz L1 $ L2 $ Main mem Rand mem --------- --- ---- ---- -------- -------- smt 1665 1.8020 13.2 83.0 1149.7 nosmt 1665 1.8020 13.2 83.0 758.1 Signed-off-by: Laurentiu Tudor Cc: Scott Wood [scottwood@freescale.com: commit message tweak] Signed-off-by: Scott Wood --- arch/powerpc/mm/tlb_low_64e.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 131f1f412bdd..57c4d662be33 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -299,6 +299,7 @@ itlb_miss_fault_bolted: * r10 = crap (free to use) */ tlb_miss_common_e6500: +BEGIN_FTR_SECTION /* * Search if we already have an indirect entry for that virtual * address, and if we do, bail out. @@ -333,6 +334,7 @@ tlb_miss_common_e6500: andis. r10,r10,MAS1_VALID@h bne tlb_miss_done_e6500 +END_FTR_SECTION_IFSET(CPU_FTR_SMT) /* Now, we need to walk the page tables. First check if we are in * range. @@ -393,11 +395,13 @@ tlb_miss_common_e6500: tlb_miss_done_e6500: .macro tlb_unlock_e6500 +BEGIN_FTR_SECTION beq cr1,1f /* no unlock if lock was recursively grabbed */ li r15,0 isync stb r15,0(r11) 1: +END_FTR_SECTION_IFSET(CPU_FTR_SMT) .endm tlb_unlock_e6500 -- cgit v1.2.3 From 376af5947c0e441ccbf98f0212d4ffbf171528f6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 10 Jul 2014 12:29:19 +1000 Subject: powerpc: Remove STAB code Old cpus didn't have a Segment Lookaside Buffer (SLB), instead they had a Segment Table (STAB). Now that we've dropped support for those cpus, we can remove the STAB support entirely. 
Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/Makefile | 4 +- arch/powerpc/mm/hash_utils_64.c | 18 +-- arch/powerpc/mm/stab.c | 286 ---------------------------------------- 3 files changed, 5 insertions(+), 303 deletions(-) delete mode 100644 arch/powerpc/mm/stab.c (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 51230ee6a407..d0130fff20e5 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,9 +13,7 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o -obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ - slb_low.o slb.o stab.o \ - $(hash64-y) +obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y) obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ tlb_hash$(CONFIG_WORD_SIZE).o \ diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 88fdd9d25077..fb8bea71327d 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -821,21 +821,15 @@ static void __init htab_initialize(void) void __init early_init_mmu(void) { - /* Setup initial STAB address in the PACA */ - get_paca()->stab_real = __pa((u64)&initial_stab); - get_paca()->stab_addr = (u64)&initial_stab; - /* Initialize the MMU Hash table and create the linear mapping - * of memory. Has to be done before stab/slb initialization as - * this is currently where the page size encoding is obtained + * of memory. Has to be done before SLB initialization as this is + * currently where the page size encoding is obtained. */ htab_initialize(); - /* Initialize stab / SLB management */ + /* Initialize SLB management */ if (mmu_has_feature(MMU_FTR_SLB)) slb_initialize(); - else - stab_initialize(get_paca()->stab_real); } #ifdef CONFIG_SMP @@ -845,13 +839,9 @@ void early_init_mmu_secondary(void) if (!firmware_has_feature(FW_FEATURE_LPAR)) mtspr(SPRN_SDR1, _SDR1); - /* Initialize STAB/SLB. We use a virtual address as it works - * in real mode on pSeries. - */ + /* Initialize SLB */ if (mmu_has_feature(MMU_FTR_SLB)) slb_initialize(); - else - stab_initialize(get_paca()->stab_addr); } #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c deleted file mode 100644 index 3f8efa6f2997..000000000000 --- a/arch/powerpc/mm/stab.c +++ /dev/null @@ -1,286 +0,0 @@ -/* - * PowerPC64 Segment Translation Support. - * - * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com - * Copyright (c) 2001 Dave Engebretsen - * - * Copyright (C) 2002 Anton Blanchard , IBM - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include - -#include -#include -#include -#include -#include -#include - -struct stab_entry { - unsigned long esid_data; - unsigned long vsid_data; -}; - -#define NR_STAB_CACHE_ENTRIES 8 -static DEFINE_PER_CPU(long, stab_cache_ptr); -static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache); - -/* - * Create a segment table entry for the given esid/vsid pair. 
- */ -static int make_ste(unsigned long stab, unsigned long esid, unsigned long vsid) -{ - unsigned long esid_data, vsid_data; - unsigned long entry, group, old_esid, castout_entry, i; - unsigned int global_entry; - struct stab_entry *ste, *castout_ste; - unsigned long kernel_segment = (esid << SID_SHIFT) >= PAGE_OFFSET; - - vsid_data = vsid << STE_VSID_SHIFT; - esid_data = esid << SID_SHIFT | STE_ESID_KP | STE_ESID_V; - if (! kernel_segment) - esid_data |= STE_ESID_KS; - - /* Search the primary group first. */ - global_entry = (esid & 0x1f) << 3; - ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); - - /* Find an empty entry, if one exists. */ - for (group = 0; group < 2; group++) { - for (entry = 0; entry < 8; entry++, ste++) { - if (!(ste->esid_data & STE_ESID_V)) { - ste->vsid_data = vsid_data; - eieio(); - ste->esid_data = esid_data; - return (global_entry | entry); - } - } - /* Now search the secondary group. */ - global_entry = ((~esid) & 0x1f) << 3; - ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); - } - - /* - * Could not find empty entry, pick one with a round robin selection. - * Search all entries in the two groups. - */ - castout_entry = get_paca()->stab_rr; - for (i = 0; i < 16; i++) { - if (castout_entry < 8) { - global_entry = (esid & 0x1f) << 3; - ste = (struct stab_entry *)(stab | ((esid & 0x1f) << 7)); - castout_ste = ste + castout_entry; - } else { - global_entry = ((~esid) & 0x1f) << 3; - ste = (struct stab_entry *)(stab | (((~esid) & 0x1f) << 7)); - castout_ste = ste + (castout_entry - 8); - } - - /* Dont cast out the first kernel segment */ - if ((castout_ste->esid_data & ESID_MASK) != PAGE_OFFSET) - break; - - castout_entry = (castout_entry + 1) & 0xf; - } - - get_paca()->stab_rr = (castout_entry + 1) & 0xf; - - /* Modify the old entry to the new value. */ - - /* Force previous translations to complete. DRENG */ - asm volatile("isync" : : : "memory"); - - old_esid = castout_ste->esid_data >> SID_SHIFT; - castout_ste->esid_data = 0; /* Invalidate old entry */ - - asm volatile("sync" : : : "memory"); /* Order update */ - - castout_ste->vsid_data = vsid_data; - eieio(); /* Order update */ - castout_ste->esid_data = esid_data; - - asm volatile("slbie %0" : : "r" (old_esid << SID_SHIFT)); - /* Ensure completion of slbie */ - asm volatile("sync" : : : "memory"); - - return (global_entry | (castout_entry & 0x7)); -} - -/* - * Allocate a segment table entry for the given ea and mm - */ -static int __ste_allocate(unsigned long ea, struct mm_struct *mm) -{ - unsigned long vsid; - unsigned char stab_entry; - unsigned long offset; - - /* Kernel or user address? */ - if (is_kernel_addr(ea)) { - vsid = get_kernel_vsid(ea, MMU_SEGSIZE_256M); - } else { - if ((ea >= TASK_SIZE_USER64) || (! 
mm)) - return 1; - - vsid = get_vsid(mm->context.id, ea, MMU_SEGSIZE_256M); - } - - stab_entry = make_ste(get_paca()->stab_addr, GET_ESID(ea), vsid); - - if (!is_kernel_addr(ea)) { - offset = __get_cpu_var(stab_cache_ptr); - if (offset < NR_STAB_CACHE_ENTRIES) - __get_cpu_var(stab_cache[offset++]) = stab_entry; - else - offset = NR_STAB_CACHE_ENTRIES+1; - __get_cpu_var(stab_cache_ptr) = offset; - - /* Order update */ - asm volatile("sync":::"memory"); - } - - return 0; -} - -int ste_allocate(unsigned long ea) -{ - return __ste_allocate(ea, current->mm); -} - -/* - * Do the segment table work for a context switch: flush all user - * entries from the table, then preload some probably useful entries - * for the new task - */ -void switch_stab(struct task_struct *tsk, struct mm_struct *mm) -{ - struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr; - struct stab_entry *ste; - unsigned long offset; - unsigned long pc = KSTK_EIP(tsk); - unsigned long stack = KSTK_ESP(tsk); - unsigned long unmapped_base; - - /* Force previous translations to complete. DRENG */ - asm volatile("isync" : : : "memory"); - - /* - * We need interrupts hard-disabled here, not just soft-disabled, - * so that a PMU interrupt can't occur, which might try to access - * user memory (to get a stack trace) and possible cause an STAB miss - * which would update the stab_cache/stab_cache_ptr per-cpu variables. - */ - hard_irq_disable(); - - offset = __get_cpu_var(stab_cache_ptr); - if (offset <= NR_STAB_CACHE_ENTRIES) { - int i; - - for (i = 0; i < offset; i++) { - ste = stab + __get_cpu_var(stab_cache[i]); - ste->esid_data = 0; /* invalidate entry */ - } - } else { - unsigned long entry; - - /* Invalidate all entries. */ - ste = stab; - - /* Never flush the first entry. */ - ste += 1; - for (entry = 1; - entry < (HW_PAGE_SIZE / sizeof(struct stab_entry)); - entry++, ste++) { - unsigned long ea; - ea = ste->esid_data & ESID_MASK; - if (!is_kernel_addr(ea)) { - ste->esid_data = 0; - } - } - } - - asm volatile("sync; slbia; sync":::"memory"); - - __get_cpu_var(stab_cache_ptr) = 0; - - /* Now preload some entries for the new task */ - if (test_tsk_thread_flag(tsk, TIF_32BIT)) - unmapped_base = TASK_UNMAPPED_BASE_USER32; - else - unmapped_base = TASK_UNMAPPED_BASE_USER64; - - __ste_allocate(pc, mm); - - if (GET_ESID(pc) == GET_ESID(stack)) - return; - - __ste_allocate(stack, mm); - - if ((GET_ESID(pc) == GET_ESID(unmapped_base)) - || (GET_ESID(stack) == GET_ESID(unmapped_base))) - return; - - __ste_allocate(unmapped_base, mm); - - /* Order update */ - asm volatile("sync" : : : "memory"); -} - -/* - * Allocate segment tables for secondary CPUs. These must all go in - * the first (bolted) segment, so that do_stab_bolted won't get a - * recursive segment miss on the segment table itself. - */ -void __init stabs_alloc(void) -{ - int cpu; - - if (mmu_has_feature(MMU_FTR_SLB)) - return; - - for_each_possible_cpu(cpu) { - unsigned long newstab; - - if (cpu == 0) - continue; /* stab for CPU 0 is statically allocated */ - - newstab = memblock_alloc_base(HW_PAGE_SIZE, HW_PAGE_SIZE, - 1<stab_real | 0x1ul; - - mtspr(SPRN_ASR, stabreal); -} -- cgit v1.2.3 From 13b3d13b813ab834fac67dc05f8b86dbcc29c134 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 10 Jul 2014 12:29:20 +1000 Subject: powerpc: Remove MMU_FTR_SLB We now only support cpus that use an SLB, so we don't need an MMU feature to indicate that. 
Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_utils_64.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fb8bea71327d..6b7c1c824cf9 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -828,8 +828,7 @@ void __init early_init_mmu(void) htab_initialize(); /* Initialize SLB management */ - if (mmu_has_feature(MMU_FTR_SLB)) - slb_initialize(); + slb_initialize(); } #ifdef CONFIG_SMP @@ -840,8 +839,7 @@ void early_init_mmu_secondary(void) mtspr(SPRN_SDR1, _SDR1); /* Initialize SLB */ - if (mmu_has_feature(MMU_FTR_SLB)) - slb_initialize(); + slb_initialize(); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From c3993f10076ae1fa479d5b3e227fe15e001c45a9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 10 Jul 2014 12:29:22 +1000 Subject: powerpc: Remove CONFIG_POWER3 Now that we have dropped power3 support we can remove CONFIG_POWER3. The usage in pgtable_32.c was already dead code as CONFIG_POWER3 was not selectable on PPC32. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 343a87fa78b5..cf11342bf519 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -41,7 +41,7 @@ unsigned long ioremap_base; unsigned long ioremap_bot; EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ -#if defined(CONFIG_6xx) || defined(CONFIG_POWER3) +#ifdef CONFIG_6xx #define HAVE_BATS 1 #endif -- cgit v1.2.3 From 0f369103ce78c9948cfab8611031e6ac084b75a8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 10 Jul 2014 12:29:24 +1000 Subject: powerpc: Remove power3 from comments There are still a few occurences where it remains, because it helps to explain something that persists. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mmu_context_hash32.c | 2 +- arch/powerpc/mm/ppc_mmu_32.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c index 78fef6726e10..aa5a7fd89461 100644 --- a/arch/powerpc/mm/mmu_context_hash32.c +++ b/arch/powerpc/mm/mmu_context_hash32.c @@ -2,7 +2,7 @@ * This file contains the routines for handling the MMU on those * PowerPC implementations where the MMU substantially follows the * architecture specification. This includes the 6xx, 7xx, 7xxx, - * 8260, and POWER3 implementations but excludes the 8xx and 4xx. + * and 8260 implementations but excludes the 8xx and 4xx. * -- paulus * * Derived from arch/ppc/mm/init.c: diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index 11571e118831..5029dc19b517 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -2,7 +2,7 @@ * This file contains the routines for handling the MMU on those * PowerPC implementations where the MMU substantially follows the * architecture specification. This includes the 6xx, 7xx, 7xxx, - * 8260, and POWER3 implementations but excludes the 8xx and 4xx. + * and 8260 implementations but excludes the 8xx and 4xx. 
* -- paulus * * Derived from arch/ppc/mm/init.c: -- cgit v1.2.3 From 48cd9b5d590aee1664170968a9eae068e36761eb Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 11 Jun 2014 16:09:32 -0500 Subject: powerpc/e6500: Work around erratum A-008139 Erratum A-008139 can cause duplicate TLB entries if an indirect entry is overwritten using tlbwe while the other thread is using it to do a lookup. Work around this by using tlbilx to invalidate prior to overwriting. To avoid the need to save another register to hold MAS1 during the workaround code, TID clearing has been moved from tlb_miss_kernel_e6500 until after the SMT section. Signed-off-by: Scott Wood --- arch/powerpc/mm/tlb_low_64e.S | 68 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 57c4d662be33..89bf95bd63b1 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -299,7 +299,9 @@ itlb_miss_fault_bolted: * r10 = crap (free to use) */ tlb_miss_common_e6500: -BEGIN_FTR_SECTION + crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ + +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ /* * Search if we already have an indirect entry for that virtual * address, and if we do, bail out. @@ -324,17 +326,62 @@ BEGIN_FTR_SECTION b 1b .previous + /* + * Erratum A-008139 says that we can't use tlbwe to change + * an indirect entry in any way (including replacing or + * invalidating) if the other thread could be in the process + * of a lookup. The workaround is to invalidate the entry + * with tlbilx before overwriting. + */ + + lbz r15,TCD_ESEL_NEXT(r11) + rlwinm r10,r15,16,0xff0000 + oris r10,r10,MAS0_TLBSEL(1)@h + mtspr SPRN_MAS0,r10 + isync + tlbre mfspr r15,SPRN_MAS1 - mfspr r10,SPRN_MAS2 + andis. r15,r15,MAS1_VALID@h + beq 5f + +BEGIN_FTR_SECTION_NESTED(532) + mfspr r10,SPRN_MAS8 + rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */ + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) - tlbsx 0,r16 - mtspr SPRN_MAS2,r10 mfspr r10,SPRN_MAS1 - mtspr SPRN_MAS1,r15 + rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */ + rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */ + mfspr r10,SPRN_MAS6 + mtspr SPRN_MAS6,r15 + + mfspr r15,SPRN_MAS2 + isync + tlbilxva 0,r15 + isync + + mtspr SPRN_MAS6,r10 - andis. r10,r10,MAS1_VALID@h +5: +BEGIN_FTR_SECTION_NESTED(532) + li r10,0 + mtspr SPRN_MAS8,r10 + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + tlbsx 0,r16 + mfspr r10,SPRN_MAS1 + andis. r15,r10,MAS1_VALID@h bne tlb_miss_done_e6500 -END_FTR_SECTION_IFSET(CPU_FTR_SMT) +FTR_SECTION_ELSE + mfspr r10,SPRN_MAS1 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) + + oris r10,r10,MAS1_VALID@h + beq cr2,4f + rlwinm r10,r10,0,16,1 /* Clear TID */ +4: mtspr SPRN_MAS1,r10 /* Now, we need to walk the page tables. First check if we are in * range. 
@@ -410,12 +457,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT) rfi tlb_miss_kernel_e6500: - mfspr r10,SPRN_MAS1 ld r14,PACA_KERNELPGD(r13) - cmpldi cr0,r15,8 /* Check for vmalloc region */ - rlwinm r10,r10,0,16,1 /* Clear TID */ - mtspr SPRN_MAS1,r10 - beq+ tlb_miss_common_e6500 + cmpldi cr1,r15,8 /* Check for vmalloc region */ + beq+ cr1,tlb_miss_common_e6500 tlb_miss_fault_e6500: tlb_unlock_e6500 -- cgit v1.2.3 From b00fc6ec1f24f9d7af9b8988b6a198186eb3408c Mon Sep 17 00:00:00 2001 From: Andrey Utkin Date: Mon, 4 Aug 2014 23:13:10 +0300 Subject: powerpc/mm/numa: Fix break placement Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=81631 Reported-by: David Binderman Signed-off-by: Andrey Utkin CC: Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 3b181b22cd46..d3e9a78eaed3 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -611,8 +611,8 @@ static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action, case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: unmap_cpu_from_node(lcpu); - break; ret = NOTIFY_OK; + break; #endif } return ret; -- cgit v1.2.3 From 7d17622159a3f0e9d4bf02a819cf003241e1c688 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 1 Aug 2014 22:07:40 -0500 Subject: powerpc/64e: Add __ref to early_alloc_pgtable() This silences a section mismatch warning. early_alloc_pgtable() is called from map_kernel_page() which cannot be __init, but only when slab_is_available() returns false which can only happen during early boot. Signed-off-by: Scott Wood Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index f6ce1f111f5b..3b3c4d34c7a0 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -68,7 +68,7 @@ unsigned long ioremap_bot = IOREMAP_BASE; #ifdef CONFIG_PPC_MMU_NOHASH -static void *early_alloc_pgtable(unsigned long size) +static __ref void *early_alloc_pgtable(unsigned long size) { void *pt; -- cgit v1.2.3 From bd8cb03dbe77a529945aa270a18c1ba074f729c6 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 11 Jun 2014 16:23:36 +0800 Subject: powerpc: implement vmemmap_list_free() This patch implements vmemmap_list_free() for vmemmap_free(). The freed entries will be removed from vmemmap_list, and form a freed list, with next as the header. The next position in the last allocated page is kept at the list tail. When allocation, if there are freed entries left, get it from the freed list; if no freed entries left, get it like before from the last allocated pages. With this change, realmode_pfn_to_page() also needs to be changed to walk all the entries in the vmemmap_list, as the virt_addr of the entries might not be stored in order anymore. It helps to reuse the memory when continuous doing memory hot-plug/remove operations, but didn't reclaim the pages already allocated, so the memory usage will only increase, but won't exceed the value for the largest memory configuration. 
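A self-contained sketch of the recycling scheme described above, assuming the three-field vmemmap_backing layout (list pointer, physical address, virtual address) this code tracks. Only the freed-list handling is shown; the chunked page allocator and the walk of vmemmap_list are left out, and where the real code reuses its "next" pointer for both the current chunk and the freed list, the sketch keeps a separate "freed" head for clarity.

struct vmemmap_backing {
	struct vmemmap_backing *list;		/* links entries together */
	unsigned long phys;
	unsigned long virt_addr;
};

static struct vmemmap_backing *freed;		/* head of the freed list */
static int num_freed;

/* Allocation: prefer a previously freed tracking entry. */
static struct vmemmap_backing *backing_alloc(void)
{
	struct vmemmap_backing *vb;

	if (!num_freed)
		return NULL;			/* caller falls back to a fresh chunk */

	num_freed--;
	vb = freed;
	freed = freed->list;
	return vb;
}

/* Free: after unlinking from vmemmap_list, push onto the freed list. */
static void backing_free(struct vmemmap_backing *vb)
{
	vb->list = freed;
	freed = vb;
	num_freed++;
}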
Signed-off-by: Li Zhong Cc: Nathan Fontenot Acked-by: Nathan Fontenot Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/init_64.c | 62 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 52 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index e3734edffa69..fa5d28b4e726 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -226,14 +226,24 @@ static void __meminit vmemmap_create_mapping(unsigned long start, #endif /* CONFIG_PPC_BOOK3E */ struct vmemmap_backing *vmemmap_list; +static struct vmemmap_backing *next; +static int num_left; +static int num_freed; static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) { - static struct vmemmap_backing *next; - static int num_left; + struct vmemmap_backing *vmem_back; + /* get from freed entries first */ + if (num_freed) { + num_freed--; + vmem_back = next; + next = next->list; + + return vmem_back; + } /* allocate a page when required and hand out chunks */ - if (!next || !num_left) { + if (!num_left) { next = vmemmap_alloc_block(PAGE_SIZE, node); if (unlikely(!next)) { WARN_ON(1); @@ -266,6 +276,38 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } +static unsigned long vmemmap_list_free(unsigned long start) +{ + struct vmemmap_backing *vmem_back, *vmem_back_prev; + + vmem_back_prev = vmem_back = vmemmap_list; + + /* look for it with prev pointer recorded */ + for (; vmem_back; vmem_back = vmem_back->list) { + if (vmem_back->virt_addr == start) + break; + vmem_back_prev = vmem_back; + } + + if (unlikely(!vmem_back)) { + WARN_ON(1); + return 0; + } + + /* remove it from vmemmap_list */ + if (vmem_back == vmemmap_list) /* remove head */ + vmemmap_list = vmem_back->list; + else + vmem_back_prev->list = vmem_back->list; + + /* next point to this freed entry */ + vmem_back->list = next; + next = vmem_back; + num_freed++; + + return vmem_back->phys; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; @@ -331,16 +373,16 @@ struct page *realmode_pfn_to_page(unsigned long pfn) if (pg_va < vmem_back->virt_addr) continue; - /* Check that page struct is not split between real pages */ - if ((pg_va + sizeof(struct page)) > - (vmem_back->virt_addr + page_size)) - return NULL; - - page = (struct page *) (vmem_back->phys + pg_va - + /* After vmemmap_list entry free is possible, need check all */ + if ((pg_va + sizeof(struct page)) <= + (vmem_back->virt_addr + page_size)) { + page = (struct page *) (vmem_back->phys + pg_va - vmem_back->virt_addr); - return page; + return page; + } } + /* Probably that page struct is split between real pages */ return NULL; } EXPORT_SYMBOL_GPL(realmode_pfn_to_page); -- cgit v1.2.3 From ed5694a8464a133582c632f8081324408bcc486d Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 11 Jun 2014 16:23:37 +0800 Subject: powerpc: implement vmemmap_remove_mapping() for BOOK3S This is to be called in vmemmap_free(), leave the implementation on BOOK3E empty as before. 
Signed-off-by: Li Zhong Cc: Nathan Fontenot Acked-by: Nathan Fontenot Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_utils_64.c | 2 +- arch/powerpc/mm/init_64.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 6b7c1c824cf9..daee7f4e5a14 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -243,7 +243,7 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, } #ifdef CONFIG_MEMORY_HOTPLUG -static int htab_remove_mapping(unsigned long vstart, unsigned long vend, +int htab_remove_mapping(unsigned long vstart, unsigned long vend, int psize, int ssize) { unsigned long vaddr; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index fa5d28b4e726..69203c8afb51 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -212,6 +212,13 @@ static void __meminit vmemmap_create_mapping(unsigned long start, for (i = 0; i < page_size; i += PAGE_SIZE) BUG_ON(map_kernel_page(start + i, phys, flags)); } + +#ifdef CONFIG_MEMORY_HOTPLUG +static void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif #else /* CONFIG_PPC_BOOK3E */ static void __meminit vmemmap_create_mapping(unsigned long start, unsigned long page_size, @@ -223,6 +230,21 @@ static void __meminit vmemmap_create_mapping(unsigned long start, mmu_kernel_ssize); BUG_ON(mapped < 0); } + +#ifdef CONFIG_MEMORY_HOTPLUG +extern int htab_remove_mapping(unsigned long vstart, unsigned long vend, + int psize, int ssize); + +static void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int mapped = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(mapped < 0); +} +#endif + #endif /* CONFIG_PPC_BOOK3E */ struct vmemmap_backing *vmemmap_list; -- cgit v1.2.3 From 71b0bfe4f1608dbabb54a1e964046267a2c7f7b3 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 11 Jun 2014 16:23:38 +0800 Subject: powerpc: implement vmemmap_free() vmemmap_free() does the opposite of vmemap_populate(). This patch also puts vmemmap_free() and vmemmap_list_free() into CONFIG_MEMMORY_HOTPLUG. Signed-off-by: Li Zhong Cc: Nathan Fontenot Acked-by: Nathan Fontenot Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/init_64.c | 85 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 69203c8afb51..496379013873 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -298,6 +298,37 @@ static __meminit void vmemmap_list_populate(unsigned long phys, vmemmap_list = vmem_back; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + + /* Align to the page size of the linear mapping. 
*/ + start = _ALIGN_DOWN(start, page_size); + + pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); + + for (; start < end; start += page_size) { + void *p; + + if (vmemmap_populated(start, page_size)) + continue; + + p = vmemmap_alloc_block(page_size, node); + if (!p) + return -ENOMEM; + + vmemmap_list_populate(__pa(p), start, node); + + pr_debug(" * %016lx..%016lx allocated at %p\n", + start, start + page_size, p); + + vmemmap_create_mapping(start, page_size, __pa(p)); + } + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG static unsigned long vmemmap_list_free(unsigned long start) { struct vmemmap_backing *vmem_back, *vmem_back_prev; @@ -330,40 +361,52 @@ static unsigned long vmemmap_list_free(unsigned long start) return vmem_back->phys; } -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +void __ref vmemmap_free(unsigned long start, unsigned long end) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; - /* Align to the page size of the linear mapping. */ start = _ALIGN_DOWN(start, page_size); - pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); + pr_debug("vmemmap_free %lx...%lx\n", start, end); for (; start < end; start += page_size) { - void *p; + unsigned long addr; + /* + * the section has already be marked as invalid, so + * vmemmap_populated() true means some other sections still + * in this page, so skip it. + */ if (vmemmap_populated(start, page_size)) continue; - p = vmemmap_alloc_block(page_size, node); - if (!p) - return -ENOMEM; - - vmemmap_list_populate(__pa(p), start, node); - - pr_debug(" * %016lx..%016lx allocated at %p\n", - start, start + page_size, p); - - vmemmap_create_mapping(start, page_size, __pa(p)); + addr = vmemmap_list_free(start); + if (addr) { + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + + if (PageReserved(page)) { + /* allocated from bootmem */ + if (page_size < PAGE_SIZE) { + /* + * this shouldn't happen, but if it is + * the case, leave the memory there + */ + WARN_ON_ONCE(1); + } else { + unsigned int nr_pages = + 1 << get_order(page_size); + while (nr_pages--) + free_reserved_page(page++); + } + } else + free_pages((unsigned long)(__va(addr)), + get_order(page_size)); + + vmemmap_remove_mapping(start, page_size); + } } - - return 0; -} - -void vmemmap_free(unsigned long start, unsigned long end) -{ } - +#endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long size) { -- cgit v1.2.3 From 16a05bff128de196fc17edd2beaa40d0f07ae04a Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 11 Jun 2014 16:23:39 +0800 Subject: powerpc: start loop at section start of start in vmemmap_populated() vmemmap_populated() checks whether the [start, start + page_size) has valid pfn numbers, to know whether a vmemmap mapping has been created that includes this range. Some range before end might not be checked by this loop: sec11start......start11..sec11end/sec12start..end....start12..sec12end as the above, for start11(section 11), it checks [sec11start, sec11end), and loop ends as the next start(start12) is bigger than end. However, [sec11end/sec12start, end) is not checked here. So before the loop, adjust the start to be the start of the section, so we don't miss ranges like the above. After we adjust start to be the start of the section, it also means it's aligned with vmemmap as of the sizeof struct page, so we could use page_to_pfn directly in the loop. 
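A sketch of the corrected check with the alignment step made explicit. pfn_valid(), page_to_pfn() and PAGES_PER_SECTION are the real interfaces; rounddown() of the virtual address stands in for the pfn-based vmemmap_section_start() used above and assumes the vmemmap base itself is section-aligned, and the function name is illustrative.

/* Each loop step covers one memory section's worth of struct pages.
 * With an unaligned start the second step can already be past end, so
 * the following section is never checked even though part of
 * [start, end) lies in it.  Aligning start down to its section start
 * closes that hole, and the aligned address is also a real struct page
 * boundary, so page_to_pfn() can be used on it directly.
 */
static int range_backed_by_valid_section(unsigned long start, unsigned long end)
{
	unsigned long span = PAGES_PER_SECTION * sizeof(struct page);

	start = rounddown(start, span);		/* back to the section start */
	for (; start < end; start += span)
		if (pfn_valid(page_to_pfn((struct page *)start)))
			return 1;
	return 0;
}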
Signed-off-by: Li Zhong Cc: Nathan Fontenot Acked-by: Nathan Fontenot Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/init_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 496379013873..253b4b971c8a 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -175,9 +175,10 @@ static unsigned long __meminit vmemmap_section_start(unsigned long page) static int __meminit vmemmap_populated(unsigned long start, int page_size) { unsigned long end = start + page_size; + start = (unsigned long)(pfn_to_page(vmemmap_section_start(start))); for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page))) - if (pfn_valid(vmemmap_section_start(start))) + if (pfn_valid(page_to_pfn((struct page *)start))) return 1; return 0; -- cgit v1.2.3 From f51202de8ffd9dc8ca76ea81a916349138e234ca Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Wed, 6 Aug 2014 16:07:45 -0700 Subject: memory-hotplug: ppc: suitable memory should go to ZONE_MOVABLE This patch introduces zone_for_memory() to arch_add_memory() on powerpc to ensure new, higher memory added into ZONE_MOVABLE if movable zone has already setup. Signed-off-by: Wang Nan Cc: Zhang Yanfei Cc: Dave Hansen Cc: Ingo Molnar Cc: Yinghai Lu Cc: "Mel Gorman" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 2c8e90f5789e..e0f7a189c48e 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,7 +128,8 @@ int arch_add_memory(int nid, u64 start, u64 size) return -EINVAL; /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones; + zone = pgdata->node_zones + + zone_for_memory(nid, start, size, 0); return __add_pages(nid, zone, start_pfn, nr_pages); } -- cgit v1.2.3 From 308c09f17da4adc53935115dbeb5bce4f067d8f9 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Fri, 8 Aug 2014 14:23:25 -0700 Subject: lib/scatterlist: make ARCH_HAS_SG_CHAIN an actual Kconfig Rather than have architectures #define ARCH_HAS_SG_CHAIN in an architecture specific scatterlist.h, make it a proper Kconfig option and use that instead. At same time, remove the header files are are now mostly useless and just include asm-generic/scatterlist.h. [sfr@canb.auug.org.au: powerpc files now need asm/dma.h] Signed-off-by: Laura Abbott Acked-by: Thomas Gleixner [x86] Acked-by: Benjamin Herrenschmidt [powerpc] Acked-by: Heiko Carstens Cc: Russell King Cc: Tony Luck Cc: Fenghua Yu Cc: Paul Mackerras Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: "James E.J. 
Bottomley" Cc: Martin Schwidefsky Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/dma-noncoherent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 7b6c10750179..d85e86aac7fb 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -33,6 +33,7 @@ #include #include +#include #include "mmu_decl.h" -- cgit v1.2.3 From 5d61a2172a0142c635ab6d7c3b1589af85a3603e Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 8 Aug 2014 18:44:01 -0500 Subject: powerpc/nohash: Split __early_init_mmu() into boot and secondary __early_init_mmu() does some things that are really only needed by the boot cpu. On FSL booke, This includes calling memblock_enforce_memory_limit(), which is labelled __init. Secondary cpu init code can't be __init as that would break CPU hotplug. While it's probably a bug that memblock_enforce_memory_limit() isn't __init_memblock instead, there's no reason why we should be doing this stuff for secondary cpus in the first place. Signed-off-by: Scott Wood Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/tlb_nohash.c | 111 +++++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 45 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 92cb18d52ea8..f38ea4df6a85 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -581,42 +581,10 @@ static void setup_mmu_htw(void) /* * Early initialization of the MMU TLB code */ -static void __early_init_mmu(int boot_cpu) +static void early_init_this_mmu(void) { unsigned int mas4; - /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it. Ideally - * we should find out a suitable page size and patch the - * TLB miss code (either that or use the PACA to store - * the value we want) - */ - mmu_linear_psize = MMU_PAGE_1G; - - /* XXX This should be decided at runtime based on supported - * page sizes in the TLB, but for now let's assume 16M is - * always there and a good fit (which it probably is) - * - * Freescale booke only supports 4K pages in TLB0, so use that. - */ - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) - mmu_vmemmap_psize = MMU_PAGE_4K; - else - mmu_vmemmap_psize = MMU_PAGE_16M; - - /* XXX This code only checks for TLB 0 capabilities and doesn't - * check what page size combos are supported by the HW. It - * also doesn't handle the case where a separate array holds - * the IND entries from the array loaded by the PT. 
- */ - if (boot_cpu) { - /* Look for supported page sizes */ - setup_page_sizes(); - - /* Look for HW tablewalk support */ - setup_mmu_htw(); - } - /* Set MAS4 based on page table setting */ mas4 = 0x4 << MAS4_WIMGED_SHIFT; @@ -650,11 +618,6 @@ static void __early_init_mmu(int boot_cpu) } mtspr(SPRN_MAS4, mas4); - /* Set the global containing the top of the linear mapping - * for use by the TLB miss code - */ - linear_map_top = memblock_end_of_DRAM(); - #ifdef CONFIG_PPC_FSL_BOOK3E if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned int num_cams; @@ -662,10 +625,49 @@ static void __early_init_mmu(int boot_cpu) /* use a quarter of the TLBCAM for bolted linear map */ num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; linear_map_top = map_mem_in_cams(linear_map_top, num_cams); + } +#endif - /* limit memory so we dont have linear faults */ - memblock_enforce_memory_limit(linear_map_top); + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} +static void __init early_init_mmu_global(void) +{ + /* XXX This will have to be decided at runtime, but right + * now our boot and TLB miss code hard wires it. Ideally + * we should find out a suitable page size and patch the + * TLB miss code (either that or use the PACA to store + * the value we want) + */ + mmu_linear_psize = MMU_PAGE_1G; + + /* XXX This should be decided at runtime based on supported + * page sizes in the TLB, but for now let's assume 16M is + * always there and a good fit (which it probably is) + * + * Freescale booke only supports 4K pages in TLB0, so use that. + */ + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) + mmu_vmemmap_psize = MMU_PAGE_4K; + else + mmu_vmemmap_psize = MMU_PAGE_16M; + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + /* Look for supported page sizes */ + setup_page_sizes(); + + /* Look for HW tablewalk support */ + setup_mmu_htw(); + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { if (book3e_htw_mode == PPC_HTW_NONE) { extlb_level_exc = EX_TLB_SIZE; patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); @@ -675,22 +677,41 @@ static void __early_init_mmu(int boot_cpu) } #endif - /* A sync won't hurt us after mucking around with - * the MMU configuration + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code */ - mb(); + linear_map_top = memblock_end_of_DRAM(); +} + +static void __init early_mmu_set_memory_limit(void) +{ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + /* + * Limit memory so we dont have linear faults. + * Unlike memblock_set_current_limit, which limits + * memory available during early boot, this permanently + * reduces the memory available to Linux. We need to + * do this because highmem is not supported on 64-bit. 
+ */ + memblock_enforce_memory_limit(linear_map_top); + } +#endif memblock_set_current_limit(linear_map_top); } +/* boot cpu only */ void __init early_init_mmu(void) { - __early_init_mmu(1); + early_init_mmu_global(); + early_init_this_mmu(); + early_mmu_set_memory_limit(); } void early_init_mmu_secondary(void) { - __early_init_mmu(0); + early_init_this_mmu(); } void setup_initial_memory_limit(phys_addr_t first_memblock_base, -- cgit v1.2.3 From 2fabf084b6ad6337675d700b159a6091023544f2 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 17 Jul 2014 16:15:12 -0700 Subject: powerpc: reorder per-cpu NUMA information's initialization There is an issue currently where NUMA information is used on powerpc (and possibly ia64) before it has been read from the device-tree, which leads to large slab consumption with CONFIG_SLUB and memoryless nodes. NUMA powerpc non-boot CPU's cpu_to_node/cpu_to_mem is only accurate after start_secondary(), similar to ia64, which is invoked via smp_init(). Commit 6ee0578b4daae ("workqueue: mark init_workqueues() as early_initcall()") made init_workqueues() be invoked via do_pre_smp_initcalls(), which is obviously before the secondary processors are online. Additionally, the following commits changed init_workqueues() to use cpu_to_node to determine the node to use for kthread_create_on_node: bce903809ab3f ("workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[]") f3f90ad469342 ("workqueue: determine NUMA node of workers accourding to the allowed cpumask") Therefore, when init_workqueues() runs, it sees all CPUs as being on Node 0. On LPARs or KVM guests where Node 0 is memoryless, this leads to a high number of slab deactivations (http://www.spinics.net/lists/linux-mm/msg67489.html). Fix this by initializing the powerpc-specific CPU<->node/local memory node mapping as early as possible, which on powerpc is do_init_bootmem(). Currently that function initializes the mapping for the boot CPU, but we extend it to setup the mapping for all possible CPUs. Then, in smp_prepare_cpus(), we can correspondingly set the per-cpu values for all possible CPUs. That ensures that before the early_initcalls run (and really as early as possible), the per-cpu NUMA mapping is accurate. 
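A rough sketch of the ordering this patch establishes, assuming the device tree can already tell us each possible CPU's node at do_init_bootmem() time. numa_setup_all_possible_cpus() and node_for_cpu_from_device_tree() are illustrative names, not the kernel's; the real change funnels through cpu_numa_callback() and smp_prepare_cpus(), as the diff below shows.

/* Called from do_init_bootmem(), i.e. well before do_pre_smp_initcalls()
 * runs init_workqueues(), so that cpu_to_node()/cpu_to_mem() already
 * return real nodes for offline CPUs instead of defaulting to node 0.
 */
static void __init numa_setup_all_possible_cpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		int nid = node_for_cpu_from_device_tree(cpu);	/* hypothetical helper */

		numa_cpu_lookup_table[cpu] = nid;
		set_cpu_numa_node(cpu, nid);			/* backs cpu_to_node() */
		set_cpu_numa_mem(cpu, local_memory_node(nid));	/* backs cpu_to_mem() */
	}
}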
While testing memoryless nodes on PowerKVM guests with a fix to the workqueue logic to use cpu_to_mem() instead of cpu_to_node(), with a guest topology of: available: 2 nodes (0-1) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 node 0 size: 0 MB node 0 free: 0 MB node 1 cpus: 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 node 1 size: 16336 MB node 1 free: 15329 MB node distances: node 0 1 0: 10 40 1: 40 10 the slab consumption decreases from Slab: 932416 kB SUnreclaim: 902336 kB to Slab: 395264 kB SUnreclaim: 359424 kB And we a corresponding increase in the slab efficiency from slab mem objs slabs used active active ------------------------------------------------------------ kmalloc-16384 337 MB 11.28% 100.00% task_struct 288 MB 9.93% 100.00% to slab mem objs slabs used active active ------------------------------------------------------------ kmalloc-16384 37 MB 100.00% 100.00% task_struct 31 MB 100.00% 100.00% Powerpc didn't support memoryless nodes until recently (64bb80d87f01 "powerpc/numa: Enable CONFIG_HAVE_MEMORYLESS_NODES" and 8c272261194d "powerpc/numa: Enable USE_PERCPU_NUMA_NODE_ID"). Those commits also helped improve memory consumption with these kind of environments. Signed-off-by: Nishanth Aravamudan Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/numa.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index d3e9a78eaed3..d7737a542fd7 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1049,7 +1049,7 @@ static void __init mark_reserved_regions_for_nid(int nid) void __init do_init_bootmem(void) { - int nid; + int nid, cpu; min_low_pfn = 0; max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; @@ -1122,8 +1122,15 @@ void __init do_init_bootmem(void) reset_numa_cpu_lookup_table(); register_cpu_notifier(&ppc64_numa_nb); - cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, - (void *)(unsigned long)boot_cpuid); + /* + * We need the numa_cpu_lookup_table to be accurate for all CPUs, + * even before we online them, so that we can use cpu_to_{node,mem} + * early in boot, cf. smp_prepare_cpus(). + */ + for_each_possible_cpu(cpu) { + cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, + (void *)(unsigned long)cpu); + } } void __init paging_init(void) -- cgit v1.2.3 From b0aa44a3dfae3d8f45bd1264349aa87f87b7774f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Aug 2014 12:31:57 +0530 Subject: powerpc/thp: Add write barrier after updating the valid bit With hugepages, we store the hpte valid information in the pte page whose address is stored in the second half of the PMD. Use a write barrier to make sure clearing pmd busy bit and updating hpte valid info are ordered properly. 
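The ordering requirement reduces to a three-line publish pattern; hpte_slot_array, smp_wmb() and _PAGE_BUSY are the real names, while slot_info stands for the value mark_hpte_slot_valid() records and the surrounding code is elided.

hpte_slot_array[index] = slot_info;	/* 1: record where the HPTE went       */
smp_wmb();				/* make 1 visible before 2              */
*pmdp = __pmd(new_pmd & ~_PAGE_BUSY);	/* 2: publish the PMD with busy clear   */

/* A reader that sees _PAGE_BUSY clear may immediately trust the slot
 * array, so the store in step 1 must not be reordered after step 2.
 */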
CC: Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hugepage-hash64.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 826893fcb3a7..11f9a37ca2c6 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -172,8 +172,11 @@ repeat: mark_hpte_slot_valid(hpte_slot_array, index, slot); } /* - * No need to use ldarx/stdcx here + * The hpte valid is stored in the pgtable whose address is in the + * second half of the PMD. Order this against clearing of the busy bit in + * huge pmd. */ + smp_wmb(); *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); return 0; } -- cgit v1.2.3 From fa1f8ae80f8bb996594167ff4750a0b0a5a5bb5d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Aug 2014 12:31:58 +0530 Subject: powerpc/thp: Don't recompute vsid and ssize in loop on invalidate The segment identifier and segment size will remain the same in the loop, So we can compute it outside. We also change the hugepage_invalidate interface so that we can use it the later patch CC: Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_native_64.c | 19 +++++-------------- arch/powerpc/mm/pgtable_64.c | 24 ++++++++++++------------ 2 files changed, 17 insertions(+), 26 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index cf1d325eae8b..fb89d7695a9a 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -412,18 +412,18 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, local_irq_restore(flags); } -static void native_hugepage_invalidate(struct mm_struct *mm, +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, unsigned char *hpte_slot_array, - unsigned long addr, int psize) + int psize, int ssize) { - int ssize = 0, i; - int lock_tlbie; + int i, lock_tlbie; struct hash_pte *hptep; int actual_psize = MMU_PAGE_16M; unsigned int max_hpte_count, valid; unsigned long flags, s_addr = addr; unsigned long hpte_v, want_v, shift; - unsigned long hidx, vpn = 0, vsid, hash, slot; + unsigned long hidx, vpn = 0, hash, slot; shift = mmu_psize_defs[psize].shift; max_hpte_count = 1U << (PMD_SHIFT - shift); @@ -437,15 +437,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm, /* get the vpn */ addr = s_addr + (i * (1ul << shift)); - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - vpn = hpt_vpn(addr, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); if (hidx & _PTEIDX_SECONDARY) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 3b3c4d34c7a0..5039f3b04d6e 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -745,12 +745,21 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, if (!hpte_slot_array) return; - /* get the base page size */ + /* get the base page size,vsid and segment size */ psize = get_slice_psize(mm, s_addr); + if (!is_kernel_addr(s_addr)) { + ssize = user_segment_size(s_addr); + vsid = get_vsid(mm->context.id, s_addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } if 
(ppc_md.hugepage_invalidate) - return ppc_md.hugepage_invalidate(mm, hpte_slot_array, - s_addr, psize); + return ppc_md.hugepage_invalidate(vsid, s_addr, + hpte_slot_array, + psize, ssize); /* * No bluk hpte removal support, invalidate each entry */ @@ -768,15 +777,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, /* get the vpn */ addr = s_addr + (i * (1ul << shift)); - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - vpn = hpt_vpn(addr, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); if (hidx & _PTEIDX_SECONDARY) -- cgit v1.2.3 From 629149fae478f0ac6bf705a535708b192e9c6b59 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Aug 2014 12:31:59 +0530 Subject: powerpc/thp: Invalidate old 64K based hash page mapping before insert of 4k pte If we changed base page size of the segment, either via sub_page_protect or via remap_4k_pfn, we do a demote_segment which doesn't flush the hash table entries. We do a lazy hash page table flush for all mapped pages in the demoted segment. This happens when we handle hash page fault for these pages. We use _PAGE_COMBO bit along with _PAGE_HASHPTE to indicate whether a pte is backed by 4K hash pte. If we find _PAGE_COMBO not set on the pte, that implies that we could possibly have older 64K hash pte entries in the hash page table and we need to invalidate those entries. Handle this correctly for 16M pages CC: Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hugepage-hash64.c | 79 ++++++++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 11f9a37ca2c6..1fb609dcc49b 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -18,6 +18,57 @@ #include #include +static void invalidate_old_hpte(unsigned long vsid, unsigned long addr, + pmd_t *pmdp, unsigned int psize, int ssize) +{ + int i, max_hpte_count, valid; + unsigned long s_addr; + unsigned char *hpte_slot_array; + unsigned long hidx, shift, vpn, hash, slot; + + s_addr = addr & HPAGE_PMD_MASK; + hpte_slot_array = get_hpte_slot_array(pmdp); + /* + * IF we try to do a HUGE PTE update after a withdraw is done. + * we will find the below NULL. 
This happens when we do + * split_huge_page_pmd + */ + if (!hpte_slot_array) + return; + + if (ppc_md.hugepage_invalidate) + return ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array, + psize, ssize); + /* + * No bluk hpte removal support, invalidate each entry + */ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE >> shift; + for (i = 0; i < max_hpte_count; i++) { + /* + * 8 bits per each hpte entries + * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] + */ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + ppc_md.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, 0); + } +} + + int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, pmd_t *pmdp, unsigned long trap, int local, int ssize, unsigned int psize) @@ -85,6 +136,15 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, vpn = hpt_vpn(ea, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); hpte_slot_array = get_hpte_slot_array(pmdp); + if (psize == MMU_PAGE_4K) { + /* + * invalidate the old hpte entry if we have that mapped via 64K + * base page size. This is because demote_segment won't flush + * hash page table entries. + */ + if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) + invalidate_old_hpte(vsid, ea, pmdp, MMU_PAGE_64K, ssize); + } valid = hpte_valid(hpte_slot_array, index); if (valid) { @@ -107,11 +167,8 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * safely update this here. */ valid = 0; - new_pmd &= ~_PAGE_HPTEFLAGS; hpte_slot_array[index] = 0; - } else - /* clear the busy bits and set the hash pte bits */ - new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + } } if (!valid) { @@ -119,11 +176,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, /* insert new entry */ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; -repeat: - hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; - - /* clear the busy bits and set the hash pte bits */ - new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + new_pmd |= _PAGE_HASHPTE; /* Add in WIMG bits */ rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | @@ -132,6 +185,8 @@ repeat: * enable the memory coherence always */ rflags |= HPTE_R_M; +repeat: + hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; /* Insert into the hash table, primary slot */ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, @@ -171,6 +226,12 @@ repeat: */ mark_hpte_slot_valid(hpte_slot_array, index, slot); } + /* + * Mark the pte with _PAGE_COMBO, if we are trying to hash it with + * base page size 4k. + */ + if (psize == MMU_PAGE_4K) + new_pmd |= _PAGE_COMBO; /* * The hpte valid is stored in the pgtable whose address is in the * second half of the PMD. Order this against clearing of the busy bit in -- cgit v1.2.3 From fc0479557572375100ef16c71170b29a98e0d69a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Aug 2014 12:32:00 +0530 Subject: powerpc/thp: Handle combo pages in invalidate If we changed base page size of the segment, either via sub_page_protect or via remap_4k_pfn, we do a demote_segment which doesn't flush the hash table entries. 
We do a lazy hash page table flush for all mapped pages in the demoted segment. This happens when we handle hash page fault for these pages. We use _PAGE_COMBO bit along with _PAGE_HASHPTE to indicate whether a pte is backed by 4K hash pte. If we find _PAGE_COMBO not set on the pte, that implies that we could possibly have older 64K hash pte entries in the hash page table and we need to invalidate those entries. Use _PAGE_COMBO to determine the page size with which we should invalidate the hash table entries on unmap. CC: Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable_64.c | 14 +++++++++++--- arch/powerpc/mm/tlb_hash64.c | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 5039f3b04d6e..948a81e02ddb 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -538,7 +538,7 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, *pmdp = __pmd((old & ~clr) | set); #endif if (old & _PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp); + hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; } @@ -645,7 +645,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, if (!(old & _PAGE_SPLITTING)) { /* We need to flush the hpte */ if (old & _PAGE_HASHPTE) - hpte_do_hugepage_flush(vma->vm_mm, address, pmdp); + hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old); } /* * This ensures that generic code that rely on IRQ disabling @@ -723,7 +723,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, * neesd to be flushed. */ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) + pmd_t *pmdp, unsigned long old_pmd) { int ssize, i; unsigned long s_addr; @@ -746,7 +746,15 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, return; /* get the base page size,vsid and segment size */ +#ifdef CONFIG_DEBUG_VM psize = get_slice_psize(mm, s_addr); + BUG_ON(psize == MMU_PAGE_16M); +#endif + if (old_pmd & _PAGE_COMBO) + psize = MMU_PAGE_4K; + else + psize = MMU_PAGE_64K; + if (!is_kernel_addr(s_addr)) { ssize = user_segment_size(s_addr); vsid = get_vsid(mm->context.id, s_addr, ssize); diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index c99f6510a0b2..9adda5790463 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -216,7 +216,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, if (!(pte & _PAGE_HASHPTE)) continue; if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) - hpte_do_hugepage_flush(mm, start, (pmd_t *)pte); + hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); else hpte_need_flush(mm, start, ptep, pte, 0); } -- cgit v1.2.3 From 969b7b208f7408712a3526856e4ae60ad13f6928 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Aug 2014 12:32:01 +0530 Subject: powerpc/thp: Invalidate with vpn in loop As per ISA, for 4k base page size we compare 14..65 bits of VA specified with the entry_VA in tlb. That implies we need to make sure we do a tlbie with all the possible 4k va we used to access the 16MB hugepage. With 64k base page size we compare 14..57 bits of VA. Hence we cannot ignore the lower 24 bits of va while tlbie .We also cannot tlb invalidate a 16MB entry with just one tlbie instruction because we don't track which va was used to instantiate the tlb entry. 
CC: Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hash_native_64.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index fb89d7695a9a..afc0a8295f84 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -417,7 +417,7 @@ static void native_hugepage_invalidate(unsigned long vsid, unsigned char *hpte_slot_array, int psize, int ssize) { - int i, lock_tlbie; + int i; struct hash_pte *hptep; int actual_psize = MMU_PAGE_16M; unsigned int max_hpte_count, valid; @@ -456,22 +456,13 @@ static void native_hugepage_invalidate(unsigned long vsid, else /* Invalidate the hpte. NOTE: this also unlocks it */ hptep->v = 0; + /* + * We need to do tlb invalidate for all the address, tlbie + * instruction compares entry_VA in tlb with the VA specified + * here + */ + tlbie(vpn, psize, actual_psize, ssize, 0); } - /* - * Since this is a hugepage, we just need a single tlbie. - * use the last vpn. - */ - lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); - if (lock_tlbie) - raw_spin_lock(&native_tlbie_lock); - - asm volatile("ptesync":::"memory"); - __tlbie(vpn, psize, actual_psize, ssize); - asm volatile("eieio; tlbsync; ptesync":::"memory"); - - if (lock_tlbie) - raw_spin_unlock(&native_tlbie_lock); - local_irq_restore(flags); } -- cgit v1.2.3 From 7e467245bf5226db34c4b12d3cbacfa2f7a15a8b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Aug 2014 12:32:02 +0530 Subject: powerpc/thp: Use ACCESS_ONCE when loading pmdp We would get wrong results in compiler recomputed old_pmd. Avoid that by using ACCESS_ONCE CC: Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/hugepage-hash64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 1fb609dcc49b..5f5e6328c21c 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -84,7 +84,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * atomically mark the linux large page PMD busy and dirty */ do { - old_pmd = pmd_val(*pmdp); + pmd_t pmd = ACCESS_ONCE(*pmdp); + + old_pmd = pmd_val(pmd); /* If PMD busy, retry the access */ if (unlikely(old_pmd & _PAGE_BUSY)) return 0; -- cgit v1.2.3 From 9e813308a5c18c58f9ccae1ec72ed4e14eaf9025 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 Aug 2014 12:32:04 +0530 Subject: powerpc/thp: Add tracepoints to track hugepage invalidate Add tracepoint to track hugepage invalidate. This help us in debugging difficult to track bugs. 
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable_64.c | 6 ++++++ arch/powerpc/mm/tlb_hash64.c | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 948a81e02ddb..c8d709ab489d 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -54,6 +54,9 @@ #include "mmu_decl.h" +#define CREATE_TRACE_POINTS +#include + /* Some sanity checking */ #if TASK_SIZE_USER64 > PGTABLE_RANGE #error TASK_SIZE_USER64 exceeds pagetable range @@ -537,6 +540,7 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, old = pmd_val(*pmdp); *pmdp = __pmd((old & ~clr) | set); #endif + trace_hugepage_update(addr, old, clr, set); if (old & _PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; @@ -642,6 +646,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, * If we didn't had the splitting flag set, go and flush the * HPTE entries. */ + trace_hugepage_splitting(address, old); if (!(old & _PAGE_SPLITTING)) { /* We need to flush the hpte */ if (old & _PAGE_HASHPTE) @@ -709,6 +714,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, assert_spin_locked(&mm->page_table_lock); WARN_ON(!pmd_trans_huge(pmd)); #endif + trace_hugepage_set_pmd(addr, pmd); return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 9adda5790463..d2a94b85dbc2 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -30,6 +30,8 @@ #include #include +#include + DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); /* @@ -213,6 +215,8 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, if (ptep == NULL) continue; pte = pte_val(*ptep); + if (hugepage_shift) + trace_hugepage_invalidate(start, pte_val(pte)); if (!(pte & _PAGE_HASHPTE)) continue; if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) -- cgit v1.2.3 From 1c98025c6c95bc057a25e2c6596de23288c68160 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 8 Aug 2014 18:40:42 -0500 Subject: powerpc: Dynamic DMA zone limits Platform code can call limit_zone_pfn() to set appropriate limits for ZONE_DMA and ZONE_DMA32, and dma_direct_alloc_coherent() will select a suitable zone based on a device's mask and the pfn limits that platform code has configured. Signed-off-by: Scott Wood Cc: Shaohui Xie --- arch/powerpc/mm/mem.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 2c8e90f5789e..687e7f7f7751 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -260,6 +260,54 @@ static int __init mark_nonram_nosave(void) return 0; } +static bool zone_limits_final; + +static unsigned long max_zone_pfns[MAX_NR_ZONES] = { + [0 ... MAX_NR_ZONES - 1] = ~0UL +}; + +/* + * Restrict the specified zone and all more restrictive zones + * to be below the specified pfn. May not be called after + * paging_init(). + */ +void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit) +{ + int i; + + if (WARN_ON(zone_limits_final)) + return; + + for (i = zone; i >= 0; i--) { + if (max_zone_pfns[i] > pfn_limit) + max_zone_pfns[i] = pfn_limit; + } +} + +/* + * Find the least restrictive zone that is entirely below the + * specified pfn limit. Returns < 0 if no suitable zone is found. 
+ * + * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit + * systems -- the DMA limit can be higher than any possible real pfn. + */ +int dma_pfn_limit_to_zone(u64 pfn_limit) +{ + enum zone_type top_zone = ZONE_NORMAL; + int i; + +#ifdef CONFIG_HIGHMEM + top_zone = ZONE_HIGHMEM; +#endif + + for (i = top_zone; i >= 0; i--) { + if (max_zone_pfns[i] <= pfn_limit) + return i; + } + + return -EPERM; +} + /* * paging_init() sets up the page tables - in fact we've already done this. */ @@ -267,7 +315,7 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); - unsigned long max_zone_pfns[MAX_NR_ZONES]; + enum zone_type top_zone; #ifdef CONFIG_PPC32 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); @@ -289,13 +337,16 @@ void __init paging_init(void) (unsigned long long)top_of_ram, total_ram); printk(KERN_DEBUG "Memory hole size: %ldMB\n", (long int)((top_of_ram - total_ram) >> 20)); - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); + #ifdef CONFIG_HIGHMEM - max_zone_pfns[ZONE_DMA] = lowmem_end_addr >> PAGE_SHIFT; - max_zone_pfns[ZONE_HIGHMEM] = top_of_ram >> PAGE_SHIFT; + top_zone = ZONE_HIGHMEM; + limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); #else - max_zone_pfns[ZONE_DMA] = top_of_ram >> PAGE_SHIFT; + top_zone = ZONE_NORMAL; #endif + + limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); + zone_limits_final = true; free_area_init_nodes(max_zone_pfns); mark_nonram_nosave(); -- cgit v1.2.3 From d4311ff1a8da48d609db9500f121c15580dfeeb7 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:17 +0100 Subject: init/main.c: Give init_task a canary Tasks get their end of stack set to STACK_END_MAGIC with the aim to catch stack overruns. Currently this feature does not apply to init_task. This patch removes this restriction. Note that a similar patch was posted by Prarit Bhargava some time ago but was never merged: http://marc.info/?l=linux-kernel&m=127144305403241&w=2 Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Acked-by: Oleg Nesterov Acked-by: Michael Ellerman Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: tglx@linutronix.de Cc: hannes@cmpxchg.org Cc: Alex Thorlton Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Daeseok Youn Cc: David Rientjes Cc: Fabian Frederick Cc: Geert Uytterhoeven Cc: Jiri Olsa Cc: Kees Cook Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Opdenacker Cc: Paul Mackerras Cc: Prarit Bhargava Cc: Rik van Riel Cc: Rusty Russell Cc: Seiji Aguchi Cc: Steven Rostedt Cc: Vladimir Davydov Cc: Yasuaki Ishimatsu Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1410527779-8133-2-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 51ab9e7e6c39..35d0760c3fa4 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -538,7 +537,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) regs->nip); stackend = end_of_stack(current); - if (current != &init_task && *stackend != STACK_END_MAGIC) + if (*stackend != STACK_END_MAGIC) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); -- cgit v1.2.3 From a70857e46dd13e87ae06bf0e64cb6a2d4f436265 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Fri, 12 Sep 2014 14:16:18 +0100 Subject: sched: Add helper for task stack page overrun checking This facility is used in a few places so let's introduce a helper function to improve code readability. Signed-off-by: Aaron Tomlin Signed-off-by: Peter Zijlstra (Intel) Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dzickus@redhat.com Cc: bmr@redhat.com Cc: jcastillo@redhat.com Cc: oleg@redhat.com Cc: riel@redhat.com Cc: prarit@redhat.com Cc: jgh@redhat.com Cc: minchan@kernel.org Cc: mpe@ellerman.id.au Cc: tglx@linutronix.de Cc: hannes@cmpxchg.org Cc: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Michael Ellerman Cc: Paul Mackerras Cc: Seiji Aguchi Cc: Steven Rostedt Cc: Yasuaki Ishimatsu Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/1410527779-8133-3-git-send-email-atomlin@redhat.com Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 35d0760c3fa4..99b2f2775658 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -507,7 +507,6 @@ bail: void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { const struct exception_table_entry *entry; - unsigned long *stackend; /* Are we prepared to handle this fault? */ if ((entry = search_exception_tables(regs->nip)) != NULL) { @@ -536,8 +535,7 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", regs->nip); - stackend = end_of_stack(current); - if (*stackend != STACK_END_MAGIC) + if (task_stack_end_corrupted(current)) printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); die("Kernel access of bad area", regs, sig); -- cgit v1.2.3 From 6db35ad2373eed5deb3b105ae7c1e9de3e34ae94 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Thu, 18 Sep 2014 14:05:02 -0500 Subject: powerpc/mm: Use common paging_init() for NUMA Commit 1c98025c6c95bc057a25e2c6596de23288c68160 "powerpc: Dynamic DMA zone limits" updated how zones are created in paging_init(), but missed the NUMA version of paging_init(). This was noticed via a linker error, since dma_pfn_limit_to_zone() was, like the non-NUMA paging_init(), limited by #ifndef CONFIG_NEED_MULTIPLE_NODES. 
It turns out that the NUMA paging_init() was not actually doing anything different from the standard paging_init(), other than a couple debug prints, a couple 32-bit-only ifdef sections, and a call to mark_nonram_nosave(). It's not clear whether mark_nonram_nosave() is inherently wrong to do for NUMA, or just not useful on targets that have NUMA, but for now I'm preserving the existing behavior. Fixes: 1c98025c6c9 "powerpc: Dynamic DMA zone limits" Reported-by: Stephen Rothwell Signed-off-by: Scott Wood --- arch/powerpc/mm/mem.c | 7 ++++++- arch/powerpc/mm/numa.c | 8 -------- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 687e7f7f7751..420dfff22ba5 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -259,6 +259,12 @@ static int __init mark_nonram_nosave(void) } return 0; } +#else /* CONFIG_NEED_MULTIPLE_NODES */ +static int __init mark_nonram_nosave(void) +{ + return 0; +} +#endif static bool zone_limits_final; @@ -351,7 +357,6 @@ void __init paging_init(void) mark_nonram_nosave(); } -#endif /* ! CONFIG_NEED_MULTIPLE_NODES */ static void __init register_page_bootmem_info(void) { diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 3b181b22cd46..5eb07f332de1 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1126,14 +1126,6 @@ void __init do_init_bootmem(void) (void *)(unsigned long)boot_cpuid); } -void __init paging_init(void) -{ - unsigned long max_zone_pfns[MAX_NR_ZONES]; - memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT; - free_area_init_nodes(max_zone_pfns); -} - static int __init early_numa(char *p) { if (!p) -- cgit v1.2.3 From e1802b065d189cdfa25eaf6d019c222a91618b9c Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 20 Aug 2014 08:00:02 +1000 Subject: powerpc: Move more symbol exports next to function definitions Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index daee7f4e5a14..bc6cc2ad037b 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -92,6 +92,7 @@ extern unsigned long dart_tablebase; static unsigned long _SDR1; struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); struct hash_pte *htab_address; unsigned long htab_size_bytes; -- cgit v1.2.3 From e51df2c170efaeadce4d416e1825b0830de0a795 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 20 Aug 2014 08:55:18 +1000 Subject: powerpc: Make a bunch of things static Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 2 +- arch/powerpc/mm/pgtable.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index bc6cc2ad037b..5786f2a81efe 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -868,7 +868,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) } #ifdef CONFIG_PPC_MM_SLICES -unsigned int get_paca_psize(unsigned long addr) +static unsigned int get_paca_psize(unsigned long addr) { u64 lpsizes; unsigned char *hpsizes; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index c695943a513c..c90e602677c9 100644 --- 
a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -48,7 +48,7 @@ static inline int pte_looks_normal(pte_t pte) (_PAGE_PRESENT | _PAGE_USER); } -struct page * maybe_pte_to_page(pte_t pte) +static struct page *maybe_pte_to_page(pte_t pte) { unsigned long pfn = pte_pfn(pte); struct page *page; -- cgit v1.2.3 From 1217d34b531c76362217057ca70a8ce8950574e0 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 20 Aug 2014 08:55:19 +1000 Subject: powerpc: Ensure global functions include their prototype Fix a number of places where global functions were not including their prototype. This ensures the prototype and the function match. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index b0c75cc15efc..86f6a755af0b 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -30,9 +30,11 @@ #include #include #include +#include #include #include #include +#include /* some sanity checks */ #if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -- cgit v1.2.3 From f6026df1a4997db64e8201627421758585a15f55 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 20 Aug 2014 08:55:21 +1000 Subject: powerpc: Move htab_remove_mapping function prototype into header file A recent patch added a function prototype for htab_remove_mapping in c code. Fix it. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init_64.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 253b4b971c8a..3481556a1880 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -233,9 +233,6 @@ static void __meminit vmemmap_create_mapping(unsigned long start, } #ifdef CONFIG_MEMORY_HOTPLUG -extern int htab_remove_mapping(unsigned long vstart, unsigned long vend, - int psize, int ssize); - static void vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { -- cgit v1.2.3 From 70ad237515d99595ed03848bd8e549e50e83c4f2 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 27 Aug 2014 17:33:59 +0800 Subject: powerpc: Fix warning reported by verify_cpu_node_mapping() With commit 2fabf084b6ad ("powerpc: reorder per-cpu NUMA information's initialization"), during boottime, cpu_numa_callback() is called earlier(before their online) for each cpu, and verify_cpu_node_mapping() uses cpu_to_node() to check whether siblings are in the same node. It skips the checking for siblings that are not online yet. So the only check done here is for the bootcpu, which is online at that time. But the per-cpu numa_node cpu_to_node() uses hasn't been set up yet (which will be set up in smp_prepare_cpus()). So I saw something like following reported: [ 0.000000] CPU thread siblings 1/2/3 and 0 don't belong to the same node! As we don't actually do the checking during this early stage, so maybe we could directly call numa_setup_cpu() in do_init_bootmem(). Cc: Nishanth Aravamudan Cc: Nathan Fontenot Signed-off-by: Li Zhong Acked-by: Nishanth Aravamudan Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index d7737a542fd7..9918c0200857 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1128,8 +1128,7 @@ void __init do_init_bootmem(void) * early in boot, cf. 
smp_prepare_cpus(). */ for_each_possible_cpu(cpu) { - cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE, - (void *)(unsigned long)cpu); + numa_setup_cpu((unsigned long)cpu); } } -- cgit v1.2.3 From bc3c4327c92b9ceb9a6356ec64d1b2ab2dc851f9 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 27 Aug 2014 17:34:00 +0800 Subject: powerpc: Only set numa node information for present cpus at boottime As Nish suggested, it makes more sense to init the numa node informatiion for present cpus at boottime, which could also avoid WARN_ON(1) in numa_setup_cpu(). With this change, we also need to change the smp_prepare_cpus() to set up numa information only on present cpus. For those possible, but not present cpus, their numa information will be set up after they are started, as the original code did before commit 2fabf084b6ad. Cc: Nishanth Aravamudan Cc: Nathan Fontenot Signed-off-by: Li Zhong Acked-by: Nishanth Aravamudan Tested-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 9918c0200857..3a9061e9f5dd 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1127,7 +1127,7 @@ void __init do_init_bootmem(void) * even before we online them, so that we can use cpu_to_{node,mem} * early in boot, cf. smp_prepare_cpus(). */ - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { numa_setup_cpu((unsigned long)cpu); } } -- cgit v1.2.3 From 297cf5025b3bda59e15d6ba1f84022ebd409925b Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 27 Aug 2014 17:34:01 +0800 Subject: powerpc: some changes in numa_setup_cpu() this patches changes some error handling logics in numa_setup_cpu(), when cpu node is not found, so: if the cpu is possible, but not present, -1 is kept in numa_cpu_lookup_table, so later, if the cpu is added, we could set correct numa information for it. if the cpu is present, then we set the first online node to numa_cpu_lookup_table instead of 0 ( in case 0 might not be an online node? ) Cc: Nishanth Aravamudan Cc: Nathan Fontenot Signed-off-by: Li Zhong Acked-by: Nishanth Aravamudan Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 3a9061e9f5dd..ec32d463cad9 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -538,7 +538,7 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, */ static int numa_setup_cpu(unsigned long lcpu) { - int nid; + int nid = -1; struct device_node *cpu; /* @@ -555,19 +555,21 @@ static int numa_setup_cpu(unsigned long lcpu) if (!cpu) { WARN_ON(1); - nid = 0; - goto out; + if (cpu_present(lcpu)) + goto out_present; + else + goto out; } nid = of_node_to_nid_single(cpu); +out_present: if (nid < 0 || !node_online(nid)) nid = first_online_node; -out: - map_cpu_to_node(lcpu, nid); + map_cpu_to_node(lcpu, nid); of_node_put(cpu); - +out: return nid; } -- cgit v1.2.3 From 9e34992a622a15fb915471ccdebed19a87d8d531 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 7 Aug 2014 17:26:33 +1000 Subject: powerpc/mm: Unindent htab_dt_scan_page_sizes() We can unindent the bulk of htab_dt_scan_page_sizes() by returning early if the property is not found. 
That is nice in and of itself, but also has the advantage of making it clear that we always return success once we have found the ibm,segment-page-sizes property. Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 121 ++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 61 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 5786f2a81efe..dd73b63f9479 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -334,70 +334,69 @@ static int __init htab_dt_scan_page_sizes(unsigned long node, return 0; prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); - if (prop != NULL) { - pr_info("Page sizes from device-tree:\n"); - size /= 4; - cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); - while(size > 0) { - unsigned int base_shift = be32_to_cpu(prop[0]); - unsigned int slbenc = be32_to_cpu(prop[1]); - unsigned int lpnum = be32_to_cpu(prop[2]); - struct mmu_psize_def *def; - int idx, base_idx; - - size -= 3; prop += 3; - base_idx = get_idx_from_shift(base_shift); - if (base_idx < 0) { - /* - * skip the pte encoding also - */ - prop += lpnum * 2; size -= lpnum * 2; + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + size /= 4; + cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); + while(size > 0) { + unsigned int base_shift = be32_to_cpu(prop[0]); + unsigned int slbenc = be32_to_cpu(prop[1]); + unsigned int lpnum = be32_to_cpu(prop[2]); + struct mmu_psize_def *def; + int idx, base_idx; + + size -= 3; prop += 3; + base_idx = get_idx_from_shift(base_shift); + if (base_idx < 0) { + /* skip the pte encoding also */ + prop += lpnum * 2; size -= lpnum * 2; + continue; + } + def = &mmu_psize_defs[base_idx]; + if (base_idx == MMU_PAGE_16M) + cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; + + def->shift = base_shift; + if (base_shift <= 23) + def->avpnm = 0; + else + def->avpnm = (1 << (base_shift - 23)) - 1; + def->sllp = slbenc; + /* + * We don't know for sure what's up with tlbiel, so + * for now we only set it for 4K and 64K pages + */ + if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) + def->tlbiel = 1; + else + def->tlbiel = 0; + + while (size > 0 && lpnum) { + unsigned int shift = be32_to_cpu(prop[0]); + int penc = be32_to_cpu(prop[1]); + + prop += 2; size -= 2; + lpnum--; + + idx = get_idx_from_shift(shift); + if (idx < 0) continue; - } - def = &mmu_psize_defs[base_idx]; - if (base_idx == MMU_PAGE_16M) - cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; - - def->shift = base_shift; - if (base_shift <= 23) - def->avpnm = 0; - else - def->avpnm = (1 << (base_shift - 23)) - 1; - def->sllp = slbenc; - /* - * We don't know for sure what's up with tlbiel, so - * for now we only set it for 4K and 64K pages - */ - if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) - def->tlbiel = 1; - else - def->tlbiel = 0; - - while (size > 0 && lpnum) { - unsigned int shift = be32_to_cpu(prop[0]); - int penc = be32_to_cpu(prop[1]); - - prop += 2; size -= 2; - lpnum--; - - idx = get_idx_from_shift(shift); - if (idx < 0) - continue; - - if (penc == -1) - pr_err("Invalid penc for base_shift=%d " - "shift=%d\n", base_shift, shift); - - def->penc[idx] = penc; - pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," - " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", - base_shift, shift, def->sllp, - def->avpnm, def->tlbiel, def->penc[idx]); - } + + if (penc == -1) + pr_err("Invalid penc for base_shift=%d " + "shift=%d\n", base_shift, shift); + + 
def->penc[idx] = penc; + pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," + " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", + base_shift, shift, def->sllp, + def->avpnm, def->tlbiel, def->penc[idx]); } - return 1; } - return 0; + + return 1; } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From 63af52629adcd1313c7db252f085263012ecd9db Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 24 Sep 2014 16:59:56 +1000 Subject: powerpc: Simplify do_sigbus Exit out early for a kernel fault, avoiding indenting of most of the function. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/mm/fault.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 51ab9e7e6c39..abc8c816a326 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -120,16 +120,16 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address) up_read(¤t->mm->mmap_sem); - if (user_mode(regs)) { - current->thread.trap_nr = BUS_ADRERR; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, current); - return MM_FAULT_RETURN; - } - return MM_FAULT_ERR(SIGBUS); + if (!user_mode(regs)) + return MM_FAULT_ERR(SIGBUS); + + current->thread.trap_nr = BUS_ADRERR; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, current); + return MM_FAULT_RETURN; } static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) -- cgit v1.2.3 From 3913fdd7a23d9d8480ce3a6ca9cdf78bf0dec5a0 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 24 Sep 2014 16:59:57 +1000 Subject: powerpc: Add VM_FAULT_HWPOISON handling to powerpc page fault handler do_page_fault was missing knowledge of HWPOISON, and we would oops if userspace tried to access a poisoned page: kernel BUG at arch/powerpc/mm/fault.c:180! Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/mm/fault.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index abc8c816a326..588b6ccc0569 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -114,7 +114,8 @@ static int store_updates_sp(struct pt_regs *regs) #define MM_FAULT_CONTINUE -1 #define MM_FAULT_ERR(sig) (sig) -static int do_sigbus(struct pt_regs *regs, unsigned long address) +static int do_sigbus(struct pt_regs *regs, unsigned long address, + unsigned int fault) { siginfo_t info; @@ -128,6 +129,13 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address) info.si_errno = 0; info.si_code = BUS_ADRERR; info.si_addr = (void __user *)address; +#ifdef CONFIG_MEMORY_FAILURE + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + current->comm, current->pid, address); + info.si_code = BUS_MCEERR_AR; + } +#endif force_sig_info(SIGBUS, &info, current); return MM_FAULT_RETURN; } @@ -170,11 +178,8 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) return MM_FAULT_RETURN; } - /* Bus error. 
x86 handles HWPOISON here, we'll add this if/when - * we support the feature in HW - */ - if (fault & VM_FAULT_SIGBUS) - return do_sigbus(regs, addr); + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) + return do_sigbus(regs, addr, fault); /* We don't understand the fault code, this is fatal */ BUG(); -- cgit v1.2.3 From 9d57472f61acd7c3a33ebf5a79361e316d8ffbef Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 24 Sep 2014 16:59:58 +1000 Subject: powerpc: Fill in si_addr_lsb siginfo field Fill in the si_addr_lsb siginfo field so the hwpoison code can pass to userspace the length of memory that has been corrupted. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/mm/fault.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 588b6ccc0569..24b3f4949df4 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -118,6 +119,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, unsigned int fault) { siginfo_t info; + unsigned int lsb = 0; up_read(¤t->mm->mmap_sem); @@ -135,7 +137,13 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, current->comm, current->pid, address); info.si_code = BUS_MCEERR_AR; } + + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; #endif + info.si_addr_lsb = lsb; force_sig_info(SIGBUS, &info, current); return MM_FAULT_RETURN; } -- cgit v1.2.3 From 3e47d1474c2b4099f0fadd12a6553fdb2e8feaae Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 17 Sep 2014 14:39:36 +1000 Subject: powerpc: Remove powerpc specific cmd_line There is no need for yet another copy of the command line, just use boot_command_line like everyone else. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index cff59f1bec23..cad68ff8eca5 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -106,11 +106,11 @@ unsigned long __max_low_memory = MAX_LOW_MEM; void MMU_setup(void) { /* Check for nobats option (used in mapin_ram). */ - if (strstr(cmd_line, "nobats")) { + if (strstr(boot_command_line, "nobats")) { __map_without_bats = 1; } - if (strstr(cmd_line, "noltlbs")) { + if (strstr(boot_command_line, "noltlbs")) { __map_without_ltlbs = 1; } #ifdef CONFIG_DEBUG_PAGEALLOC -- cgit v1.2.3 From e83d01697583d8610d1d62279758c2a881e3396f Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 8 Oct 2014 19:54:50 +1100 Subject: powerpc/cell: Move spu_handle_mm_fault() out of cell platform Currently spu_handle_mm_fault() is in the cell platform. This code is generically useful for other non-cell co-processors on powerpc. This patch moves this function out of the cell platform into arch/powerpc/mm so that others may use it. 
Signed-off-by: Ian Munsie Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/mm/Makefile | 1 + arch/powerpc/mm/copro_fault.c | 92 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 arch/powerpc/mm/copro_fault.c (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index d0130fff20e5..325e861616a1 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -34,3 +34,4 @@ obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o +obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c new file mode 100644 index 000000000000..ba7df14c6b82 --- /dev/null +++ b/arch/powerpc/mm/copro_fault.c @@ -0,0 +1,92 @@ +/* + * CoProcessor (SPU/AFU) mm fault handler + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2007 + * + * Author: Arnd Bergmann + * Author: Jeremy Kerr + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#include +#include + +/* + * This ought to be kept in sync with the powerpc specific do_page_fault + * function. Currently, there are a few corner cases that we haven't had + * to handle fortunately. + */ +int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, + unsigned long dsisr, unsigned *flt) +{ + struct vm_area_struct *vma; + unsigned long is_write; + int ret; + + if (mm == NULL) + return -EFAULT; + + if (mm->pgd == NULL) + return -EFAULT; + + down_read(&mm->mmap_sem); + ret = -EFAULT; + vma = find_vma(mm, ea); + if (!vma) + goto out_unlock; + + if (ea < vma->vm_start) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ea)) + goto out_unlock; + } + + is_write = dsisr & DSISR_ISSTORE; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock; + } else { + if (dsisr & DSISR_PROTFAULT) + goto out_unlock; + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto out_unlock; + } + + ret = 0; + *flt = handle_mm_fault(mm, vma, ea, is_write ? 
FAULT_FLAG_WRITE : 0); + if (unlikely(*flt & VM_FAULT_ERROR)) { + if (*flt & VM_FAULT_OOM) { + ret = -ENOMEM; + goto out_unlock; + } else if (*flt & VM_FAULT_SIGBUS) { + ret = -EFAULT; + goto out_unlock; + } + BUG(); + } + + if (*flt & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; + +out_unlock: + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL_GPL(copro_handle_mm_fault); -- cgit v1.2.3 From 73d16a6e0e51990cbe13f8d8f43bd5329bbab30a Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 8 Oct 2014 19:54:51 +1100 Subject: powerpc/cell: Move data segment faulting code out of cell platform __spu_trap_data_seg() currently contains code to determine the VSID and ESID required for a particular EA and mm struct. This code is generically useful for other co-processors. This moves the code of the cell platform so it can be used by other powerpc code. It also adds 1TB segment handling which Cell didn't support. The new function is called copro_calculate_slb(). This also moves the internal struct spu_slb to a generic struct copro_slb which is now used in the Cell and copro code. We use this new struct instead of passing around esid and vsid parameters. Signed-off-by: Ian Munsie Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/mm/copro_fault.c | 46 +++++++++++++++++++++++++++++++++++++++++++ arch/powerpc/mm/slb.c | 3 --- 2 files changed, 46 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index ba7df14c6b82..a15a23efc0e2 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * This ought to be kept in sync with the powerpc specific do_page_fault @@ -90,3 +91,48 @@ out_unlock: return ret; } EXPORT_SYMBOL_GPL(copro_handle_mm_fault); + +int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) +{ + u64 vsid; + int psize, ssize; + + slb->esid = (ea & ESID_MASK) | SLB_ESID_V; + + switch (REGION_ID(ea)) { + case USER_REGION_ID: + pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea); + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + break; + case VMALLOC_REGION_ID: + pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); + if (ea < VMALLOC_END) + psize = mmu_vmalloc_psize; + else + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + break; + case KERNEL_REGION_ID: + pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea); + psize = mmu_linear_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + break; + default: + pr_debug("%s: invalid region access at %016llx\n", __func__, ea); + return 1; + } + + vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER; + + vsid |= mmu_psize_defs[psize].sllp | + ((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0); + + slb->vsid = vsid; + + return 0; +} +EXPORT_SYMBOL_GPL(copro_calculate_slb); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 0399a6702958..6e450ca66526 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -46,9 +46,6 @@ static inline unsigned long mk_esid_data(unsigned long ea, int ssize, return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; } -#define slb_vsid_shift(ssize) \ - ((ssize) == MMU_SEGSIZE_256M? 
SLB_VSID_SHIFT: SLB_VSID_SHIFT_1T) - static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, unsigned long flags) { -- cgit v1.2.3 From be3ebfe8215392b714349554c5138b8b6592fe20 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 8 Oct 2014 19:54:52 +1100 Subject: powerpc/cell: Make spu_flush_all_slbs() generic This moves spu_flush_all_slbs() into a generic call copro_flush_all_slbs(). This will be useful when we add cxl which also needs a similar SLB flush call. Signed-off-by: Ian Munsie Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/mm/copro_fault.c | 9 +++++++++ arch/powerpc/mm/hash_utils_64.c | 10 +++------- arch/powerpc/mm/slice.c | 10 +++------- 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index a15a23efc0e2..f2aa5a81ad9c 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * This ought to be kept in sync with the powerpc specific do_page_fault @@ -136,3 +137,11 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) return 0; } EXPORT_SYMBOL_GPL(copro_calculate_slb); + +void copro_flush_all_slbs(struct mm_struct *mm) +{ +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif +} +EXPORT_SYMBOL_GPL(copro_flush_all_slbs); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index dd73b63f9479..0842b726b03f 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include @@ -901,9 +901,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) if (get_slice_psize(mm, addr) == MMU_PAGE_4K) return; slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); -#endif + copro_flush_all_slbs(mm); if (get_paca_psize(addr) != MMU_PAGE_4K) { get_paca()->context = mm->context; slb_flush_and_rebolt(); @@ -1141,9 +1139,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) "to 4kB pages because of " "non-cacheable mapping\n"); psize = mmu_vmalloc_psize = MMU_PAGE_4K; -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); -#endif + copro_flush_all_slbs(mm); } } diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 86f6a755af0b..8d7bda94d196 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include /* some sanity checks */ @@ -234,9 +234,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz spin_unlock_irqrestore(&slice_convert_lock, flags); -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); -#endif + copro_flush_all_slbs(mm); } /* @@ -673,9 +671,7 @@ void slice_set_psize(struct mm_struct *mm, unsigned long address, spin_unlock_irqrestore(&slice_convert_lock, flags); -#ifdef CONFIG_SPU_BASE - spu_flush_all_slbs(mm); -#endif + copro_flush_all_slbs(mm); } void slice_set_range_psize(struct mm_struct *mm, unsigned long start, -- cgit v1.2.3 From 8ca7a82f7b64ad29206daba7bdaf2f8766437243 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 8 Oct 2014 19:54:54 +1100 Subject: powerpc/mm: Export mmu_kernel_ssize and mmu_linear_psize Export mmu_kernel_ssize and mmu_linear_psize. These are needed by the cxl driver which has it's own MMU. To setup the MMU cxl needs access to these. 
Signed-off-by: Ian Munsie Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 0842b726b03f..fe5609d7a9d7 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -99,6 +99,7 @@ unsigned long htab_size_bytes; unsigned long htab_hash_mask; EXPORT_SYMBOL_GPL(htab_hash_mask); int mmu_linear_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_linear_psize); int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; #ifdef CONFIG_SPARSEMEM_VMEMMAP @@ -106,6 +107,7 @@ int mmu_vmemmap_psize = MMU_PAGE_4K; #endif int mmu_io_psize = MMU_PAGE_4K; int mmu_kernel_ssize = MMU_SEGSIZE_256M; +EXPORT_SYMBOL_GPL(mmu_kernel_ssize); int mmu_highuser_ssize = MMU_SEGSIZE_256M; u16 mmu_slb_size = 64; EXPORT_SYMBOL_GPL(mmu_slb_size); -- cgit v1.2.3 From a1dca3465a7be53980abab1e0d6646176fdc075f Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 8 Oct 2014 19:54:58 +1100 Subject: powerpc/mm: Add new hash_page_mm() This adds a new function hash_page_mm() based on the existing hash_page(). This version allows any struct mm to be passed in, rather than assuming current. This is useful for servicing co-processor faults which are not in the context of the current running process. We need to be careful here as the current hash_page() assumes current in a few places. Signed-off-by: Ian Munsie Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fe5609d7a9d7..d5339a3b9945 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -904,7 +904,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) return; slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); copro_flush_all_slbs(mm); - if (get_paca_psize(addr) != MMU_PAGE_4K) { + if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { get_paca()->context = mm->context; slb_flush_and_rebolt(); } @@ -989,12 +989,11 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, * -1 - critical hash insertion error * -2 - access not permitted by subpage protection mechanism */ -int hash_page(unsigned long ea, unsigned long access, unsigned long trap) +int hash_page_mm(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap) { enum ctx_state prev_state = exception_enter(); pgd_t *pgdir; unsigned long vsid; - struct mm_struct *mm; pte_t *ptep; unsigned hugeshift; const struct cpumask *tmp; @@ -1008,7 +1007,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) switch (REGION_ID(ea)) { case USER_REGION_ID: user_region = 1; - mm = current->mm; if (! 
mm) { DBG_LOW(" user region with no mm !\n"); rc = 1; @@ -1019,7 +1017,6 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) vsid = get_vsid(mm->context.id, ea, ssize); break; case VMALLOC_REGION_ID: - mm = &init_mm; vsid = get_kernel_vsid(ea, mmu_kernel_ssize); if (ea < VMALLOC_END) psize = mmu_vmalloc_psize; @@ -1104,7 +1101,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) WARN_ON(1); } #endif - check_paca_psize(ea, mm, psize, user_region); + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); goto bail; } @@ -1145,7 +1143,8 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) } } - check_paca_psize(ea, mm, psize, user_region); + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); #endif /* CONFIG_PPC_64K_PAGES */ #ifdef CONFIG_PPC_HAS_HASH_64K @@ -1180,6 +1179,17 @@ bail: exception_exit(prev_state); return rc; } +EXPORT_SYMBOL_GPL(hash_page_mm); + +int hash_page(unsigned long ea, unsigned long access, unsigned long trap) +{ + struct mm_struct *mm = current->mm; + + if (REGION_ID(ea) == VMALLOC_REGION_ID) + mm = &init_mm; + + return hash_page_mm(mm, ea, access, trap); +} EXPORT_SYMBOL_GPL(hash_page); void hash_preload(struct mm_struct *mm, unsigned long ea, -- cgit v1.2.3 From 4c6d9acce1f4ca815881f0e3adca192795eca1c5 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Wed, 8 Oct 2014 19:55:00 +1100 Subject: powerpc/mm: Add hooks for cxl This adds hooks into the core powerpc mm code for cxl. The core powerpc code sometimes uses local tlbie. Unfortunately this won't work with the current cxl driver as it relies on snooping tlbie broadcasts. The cxl hardware can have TLB entries invalidated via MMIO but this is not currently supported by the driver. In future we can make local tlbie smarter so that it invalidates cxl contexts via MMIO when it needs to but for now we have this workaround. This workaround checks for any active cxl contexts and if so, disables local tlbie. This also adds a hook for when SLBs are invalidated. This ensures any corresponding SLBs in cxl are also invalidated at the same time. This is required for segment demotion. Signed-off-by: Ian Munsie Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/mm/copro_fault.c | 2 ++ arch/powerpc/mm/hash_native_64.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index f2aa5a81ad9c..0f9939e693df 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -26,6 +26,7 @@ #include #include #include +#include /* * This ought to be kept in sync with the powerpc specific do_page_fault @@ -143,5 +144,6 @@ void copro_flush_all_slbs(struct mm_struct *mm) #ifdef CONFIG_SPU_BASE spu_flush_all_slbs(mm); #endif + cxl_slbia(mm); } EXPORT_SYMBOL_GPL(copro_flush_all_slbs); diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index afc0a8295f84..ae4962a06476 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -29,6 +29,8 @@ #include #include +#include + #ifdef DEBUG_LOW #define DBG_LOW(fmt...) 
udbg_printf(fmt) #else @@ -149,9 +151,11 @@ static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) static inline void tlbie(unsigned long vpn, int psize, int apsize, int ssize, int local) { - unsigned int use_local = local && mmu_has_feature(MMU_FTR_TLBIEL); + unsigned int use_local; int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); + if (use_local) use_local = mmu_psize_defs[psize].tlbiel; if (lock_tlbie && !use_local) -- cgit v1.2.3 From 2d15b9b479512f05680541acffd9acbbc831a47c Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 9 Oct 2014 16:41:28 -0700 Subject: powerpc/numa: check error return from proc_create proc_create can fail, we should check the return value and pass up the failure. Suggested-by: Michael Ellerman Signed-off-by: Nishanth Aravamudan Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 649666d5d1c2..51d707d85b2d 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1801,7 +1801,8 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { start_topology_update(); - proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops); + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) + return -ENOMEM; return 0; } -- cgit v1.2.3 From 2d73bae12b26db6eba074b70406c707961b6cda9 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Fri, 10 Oct 2014 09:04:49 -0700 Subject: powerpc/numa: Add ability to disable and debug topology updates We have hit a few customer issues with the topology update code (VPHN and PRRN). It would be nice to be able to debug the notifications coming from the hypervisor in both cases to the LPAR, as well as to disable responding to the notifications at boot-time, to narrow down the source of the problems. Add a basic level of such functionality, similar to the numa= command-line parameter. We already have a toggle in /proc/powerpc/topology_updates that allows run-time enabling/disabling, so the updates can be started at run-time if desired. But the bugs we've run into have occured during boot or very shortly after coming to login, and have resulted in a broken NUMA topology. Signed-off-by: Nishanth Aravamudan Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/mm') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 51d707d85b2d..177659050fa0 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -8,6 +8,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. 
*/ +#define pr_fmt(fmt) "numa: " fmt + #include #include #include @@ -1153,6 +1155,22 @@ static int __init early_numa(char *p) } early_param("numa", early_numa); +static bool topology_updates_enabled = true; + +static int __init early_topology_updates(char *p) +{ + if (!p) + return 0; + + if (!strcmp(p, "off")) { + pr_info("Disabling topology updates\n"); + topology_updates_enabled = false; + } + + return 0; +} +early_param("topology_updates", early_topology_updates); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Find the node associated with a hot added memory section for @@ -1539,6 +1557,9 @@ int arch_update_cpu_topology(void) struct device *dev; int weight, new_nid, i = 0; + if (!prrn_enabled && !vphn_enabled) + return 0; + weight = cpumask_weight(&cpu_associativity_changes_mask); if (!weight) return 0; @@ -1592,6 +1613,15 @@ int arch_update_cpu_topology(void) cpu = cpu_last_thread_sibling(cpu); } + pr_debug("Topology update for the following CPUs:\n"); + if (cpumask_weight(&updated_cpus)) { + for (ud = &updates[0]; ud; ud = ud->next) { + pr_debug("cpu %d moving from node %d " + "to %d\n", ud->cpu, + ud->old_nid, ud->new_nid); + } + } + /* * In cases where we have nothing to update (because the updates list * is too short or because the new topology is same as the old one), @@ -1800,7 +1830,10 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { - start_topology_update(); + /* Do not poll for changes if disabled at boot */ + if (topology_updates_enabled) + start_topology_update(); + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) return -ENOMEM; -- cgit v1.2.3 From 5c9fb1899400096c6818181c525897a31d57e488 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Wed, 15 Oct 2014 12:42:58 +0200 Subject: powerpc/vphn: NUMA node code expects big-endian The associativity domain numbers are obtained from the hypervisor through registers and written into memory by the guest: the packed array passed to vphn_unpack_associativity() is then native-endian, unlike what was assumed in the following commit: commit b08a2a12e44eaec5024b2b969f4fcb98169d1ca3 Author: Alistair Popple Date: Wed Aug 7 02:01:44 2013 +1000 powerpc: Make NUMA device node code endian safe This issue fills the topology with bogus data and makes it unusable. It may lead to severe performance breakdowns. We should ideally patch the vphn_unpack_associativity() function to do the 64-bit loads, but this requires some more brain storming. In the meantime, let's go for a suboptimal and temporary bug fix: this patch converts each 64-bit value of the packed array to big endian, as expected by the current parsing code in vphn_unpack_associativity(). 
From 6643773ce1a17c99a3ce29ee8ac2b114d2ba771f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Tue, 21 Oct 2014 14:25:38 +1100
Subject: powerpc/mm: Fix build error with hugetlbfs disabled
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

arch/powerpc/mm/slice.c:704:5: error: expected identifier or ‘(’ before numeric constant
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
     ^
make[1]: *** [arch/powerpc/mm/slice.o] Error 1
make: *** [arch/powerpc/mm/slice.o] Error 2

This was introduced via 1217d34b531c76362217057ca70a8ce8950574e0
"powerpc: Ensure global functions include their prototype". We started
including linux/hugetlb.h with that patch, and with hugetlbfs disabled
we now have:

    #define is_hugepage_only_range(mm, addr, len) 0

Fixes: 1217d34b531c ("powerpc: Ensure global functions include their prototype")
Signed-off-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/slice.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 8d7bda94d196..ded0ea1afde4 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -682,6 +682,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 	slice_convert(mm, mask, psize);
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
 /*
  * is_hugepage_only_range() is used by generic code to verify whether
  * a normal mmap mapping (non hugetlbfs) is valid on a given area.
@@ -726,4 +727,4 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 #endif
 	return !slice_check_fit(mask, available);
 }
-
+#endif
--
cgit v1.2.3


From 03f54397976581e71a3294ac0e6dfcf4aa36e539 Mon Sep 17 00:00:00 2001
From: Ian Munsie
Date: Tue, 28 Oct 2014 14:25:29 +1100
Subject: powerpc/mm: Use appropriate ESID mask in copro_calculate_slb()

This patch makes copro_calculate_slb() mask the ESID by the correct
mask for 1T vs. 256M segments. This has no effect by itself, as the
extra bits were ignored, but it makes debugging the segment table
entries easier and means that we can directly compare the ESID values
for duplicates without needing to worry about masking in the
comparison.

This will be used to simplify a comparison in the following patch.

Signed-off-by: Ian Munsie
Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/copro_fault.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 0f9939e693df..5a236f082c78 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -99,8 +99,6 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 	u64 vsid;
 	int psize, ssize;
 
-	slb->esid = (ea & ESID_MASK) | SLB_ESID_V;
-
 	switch (REGION_ID(ea)) {
 	case USER_REGION_ID:
 		pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea);
@@ -133,6 +131,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 	vsid |= mmu_psize_defs[psize].sllp |
 		((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);
 
+	slb->esid = (ea & (ssize == MMU_SEGSIZE_1T ? ESID_MASK_1T : ESID_MASK)) | SLB_ESID_V;
 	slb->vsid = vsid;
 
 	return 0;
--
cgit v1.2.3
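
A quick way to see why the segment-size-aware mask matters for duplicate detection: two effective addresses that fall inside the same 1T segment but in different 256M sub-ranges only produce identical ESIDs when masked with the 1T mask. The sketch below is plain user-space C; the mask constants are intended to mirror the kernel's ESID_MASK (SID_SHIFT = 28) and ESID_MASK_1T (SID_SHIFT_1T = 40) but are reproduced here purely for illustration, and the example addresses are arbitrary.

    #include <stdint.h>
    #include <stdio.h>

    #define ESID_MASK_256M	0xfffffffff0000000ULL	/* keep bits above 2^28 */
    #define ESID_MASK_1T	0xffffff0000000000ULL	/* keep bits above 2^40 */

    int main(void)
    {
    	uint64_t ea1 = 0x0000010012345678ULL;	/* same 1T segment...        */
    	uint64_t ea2 = 0x0000010098765432ULL;	/* ...different 256M segment */

    	/* With the 256M mask the two ESIDs differ; with the 1T mask they are
    	 * equal, so masked ESIDs can be compared directly for duplicates. */
    	printf("256M-masked ESIDs %s\n",
    	       (ea1 & ESID_MASK_256M) == (ea2 & ESID_MASK_256M) ? "match" : "differ");
    	printf("1T-masked   ESIDs %s\n",
    	       (ea1 & ESID_MASK_1T) == (ea2 & ESID_MASK_1T) ? "match" : "differ");
    	return 0;
    }
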
From 49f8d8c04368d2e29cd26752715367ccafec5b1d Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan
Date: Fri, 17 Oct 2014 17:49:44 -0700
Subject: powerpc/numa: use cached value of update->cpu in update_cpu_topology

There is no need to keep referring to update->cpu, as we have already
checked cpu == update->cpu at this point.

Signed-off-by: Nishanth Aravamudan
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/numa.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index e5236c24dc07..ef45abf7319e 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1512,8 +1512,8 @@ static int update_cpu_topology(void *data)
 		if (cpu != update->cpu)
 			continue;
 
-		unmap_cpu_from_node(update->cpu);
-		map_cpu_to_node(update->cpu, update->new_nid);
+		unmap_cpu_from_node(cpu);
+		map_cpu_to_node(cpu, update->new_nid);
 		vdso_getcpu_init();
 	}
 
--
cgit v1.2.3


From 2c0a33f9861d38631245f7ef434ecad3413324fb Mon Sep 17 00:00:00 2001
From: Nishanth Aravamudan
Date: Fri, 17 Oct 2014 17:50:40 -0700
Subject: powerpc/numa: ensure per-cpu NUMA mappings are correct on topology update

We received a report of a warning in kernel/sched/core.c where the
sched group was NULL on an LPAR after a topology update. This seems to
occur because, after the topology update has moved the CPUs,
cpu_to_node() still returns the old value, which breaks the consistency
of the NUMA topology in the per-cpu maps. Ensure that we update the
per-cpu fields when we re-map CPUs.

Signed-off-by: Nishanth Aravamudan
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/numa.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index ef45abf7319e..b9d1dfdbe5bb 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1509,11 +1509,14 @@ static int update_cpu_topology(void *data)
 	cpu = smp_processor_id();
 
 	for (update = data; update; update = update->next) {
+		int new_nid = update->new_nid;
 		if (cpu != update->cpu)
 			continue;
 
 		unmap_cpu_from_node(cpu);
-		map_cpu_to_node(cpu, update->new_nid);
+		map_cpu_to_node(cpu, new_nid);
+		set_cpu_numa_node(cpu, new_nid);
+		set_cpu_numa_mem(cpu, local_memory_node(new_nid));
 		vdso_getcpu_init();
 	}
 
--
cgit v1.2.3
From 94966b712c6875939fcdd83cb2707a797e131a43 Mon Sep 17 00:00:00 2001
From: Fabian Frederick
Date: Wed, 29 Oct 2014 21:14:27 +0100
Subject: powerpc: Fix section mismatch warning

Add __init to MMU_setup() which uses __initdata boot_command_line.
Also MMU_setup() is only called from MMU_init(), which is also __init.

Warning appeared since commit 3e47d1474c2b.

Fixes: 3e47d1474c2b ("powerpc: Remove powerpc specific cmd_line")
Signed-off-by: Fabian Frederick
[mpe: Update changelog]
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/init_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc/mm')

diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index cad68ff8eca5..415a51b028b9 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -103,7 +103,7 @@ unsigned long __max_low_memory = MAX_LOW_MEM;
 /*
  * Check for command-line options that affect what MMU_init will do.
  */
-void MMU_setup(void)
+void __init MMU_setup(void)
 {
 	/* Check for nobats option (used in mapin_ram). */
 	if (strstr(boot_command_line, "nobats")) {
--
cgit v1.2.3
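
For context on why the annotation matters (a generic illustration, not taken from the patch): __init and __initdata are, at bottom, section-placement attributes, and the section-mismatch check warns when code outside the init text section references init data, since that memory is freed after boot. The user-space sketch below mimics the pattern with made-up section names; it compiles and runs, but it only stands in for the kernel mechanism.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical stand-ins for the kernel's __init/__initdata annotations,
     * which place code and data into init sections that the kernel can
     * discard once boot has finished. */
    #define my_init     __attribute__((section(".my_init.text")))
    #define my_initdata __attribute__((section(".my_init.data")))

    static char boot_cmdline[] my_initdata = "root=/dev/sda1 nobats";

    /* Like MMU_setup(): it reads init-only data, so it must live in the init
     * text section too, otherwise a mismatch checker would flag the reference. */
    static void my_init mmu_setup_sketch(void)
    {
    	if (strstr(boot_cmdline, "nobats"))
    		printf("nobats given: BAT mappings would be disabled\n");
    }

    int main(void)
    {
    	mmu_setup_sketch();
    	return 0;
    }
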