summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
authorSachin Nikam <snikam@nvidia.com>2013-03-06 19:12:15 +0530
committerSachin Nikam <snikam@nvidia.com>2013-03-06 19:31:07 +0530
commitd3dd6b4227fe3e59df0a55f679ec196e4ccb1e3a (patch)
tree0137a07ecab729a19cea68b671c88def5bf83a2d /arch/x86
parentbb36790494c382ec26250f351c8cea76613f0f02 (diff)
parent2713e2797a78a0fdda765bc5ad7fed41e94818ba (diff)
Merge branch 'linux-3.4.35' into rel-17
Bug 1243631 Change-Id: I915826047b2e20f0ad0a7d75df295c6cbf6e5b0a
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig4
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/boot/compressed/Makefile3
-rw-r--r--arch/x86/boot/compressed/eboot.c2
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/include/asm/efi.h2
-rw-r--r--arch/x86/include/asm/fpu-internal.h15
-rw-r--r--arch/x86/include/asm/mmzone_32.h6
-rw-r--r--arch/x86/include/asm/pgtable.h16
-rw-r--r--arch/x86/include/asm/ptrace.h15
-rw-r--r--arch/x86/include/asm/xen/page.h3
-rw-r--r--arch/x86/kernel/alternative.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c21
-rw-r--r--arch/x86/kernel/cpu/amd.c28
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c3
-rw-r--r--arch/x86/kernel/e820.c3
-rw-r--r--arch/x86/kernel/entry_32.S9
-rw-r--r--arch/x86/kernel/entry_64.S2
-rw-r--r--arch/x86/kernel/head.c53
-rw-r--r--arch/x86/kernel/hpet.c4
-rw-r--r--arch/x86/kernel/microcode_amd.c11
-rw-r--r--arch/x86/kernel/msr.c3
-rw-r--r--arch/x86/kernel/ptrace.c30
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c128
-rw-r--r--arch/x86/kernel/smpboot.c5
-rw-r--r--arch/x86/kvm/cpuid.h3
-rw-r--r--arch/x86/kvm/x86.c3
-rw-r--r--arch/x86/mm/fault.c8
-rw-r--r--arch/x86/mm/hugetlbpage.c21
-rw-r--r--arch/x86/mm/init.c69
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--arch/x86/mm/numa.c32
-rw-r--r--arch/x86/mm/numa_32.c161
-rw-r--r--arch/x86/mm/numa_internal.h6
-rw-r--r--arch/x86/oprofile/nmi_int.c2
-rw-r--r--arch/x86/pci/fixup.c17
-rw-r--r--arch/x86/platform/efi/efi.c100
-rw-r--r--arch/x86/platform/efi/efi_64.c22
-rw-r--r--arch/x86/power/hibernate_32.c2
-rw-r--r--arch/x86/syscalls/syscall_64.tbl6
-rw-r--r--arch/x86/xen/enlighten.c18
-rw-r--r--arch/x86/xen/mmu.c21
-rw-r--r--arch/x86/xen/p2m.c27
-rw-r--r--arch/x86/xen/setup.c13
-rw-r--r--arch/x86/xen/spinlock.c1
-rw-r--r--arch/x86/xen/xen-asm_32.S14
47 files changed, 535 insertions, 390 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7cbdfdac3c7c..29cf70ff6b43 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1244,10 +1244,6 @@ config HAVE_ARCH_BOOTMEM
def_bool y
depends on X86_32 && NUMA
-config HAVE_ARCH_ALLOC_REMAP
- def_bool y
- depends on X86_32 && NUMA
-
config ARCH_HAVE_MEMORY_PRESENT
def_bool y
depends on X86_32 && DISCONTIGMEM
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index b1c611e6da67..f1276aaeffdb 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -85,7 +85,7 @@ endif
ifdef CONFIG_X86_X32
x32_ld_ok := $(call try-run,\
/bin/echo -e '1: .quad 1b' | \
- $(CC) $(KBUILD_AFLAGS) -c -xassembler -o "$$TMP" - && \
+ $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" - && \
$(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \
$(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n)
ifeq ($(x32_ld_ok),y)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index e398bb5d63bb..8a84501acb1b 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -28,6 +28,9 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
$(obj)/piggy.o
+$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
+$(obj)/efi_stub_$(BITS).o: KBUILD_CLFAGS += -fshort-wchar -mno-red-zone
+
ifeq ($(CONFIG_EFI_STUB), y)
VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
endif
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 0cdfc0d2315e..8bb90707cb55 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -12,6 +12,8 @@
#include <asm/setup.h>
#include <asm/desc.h>
+#undef memcpy /* Use memcpy from misc.c */
+
#include "eboot.h"
static efi_system_table_t *sys_table;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e3e734005e19..66c4cae439c9 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -205,7 +205,7 @@ sysexit_from_sys_call:
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
movl %eax,%esi /* second arg, syscall return value */
cmpl $-MAX_ERRNO,%eax /* is it an error ? */
jbe 1f
@@ -215,7 +215,7 @@ sysexit_from_sys_call:
call __audit_syscall_exit
movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz \exit
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index c9dcc181d4d1..da37433e3628 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -94,10 +94,12 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
#endif /* CONFIG_X86_32 */
extern int add_efi_memmap;
+extern unsigned long x86_efi_facility;
extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
extern int efi_memblock_x86_reserve_range(void);
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
+extern void efi_unmap_memmap(void);
#ifndef CONFIG_EFI
/*
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 4fa88154e4de..92e05b6c84f8 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -334,14 +334,17 @@ static inline void __thread_fpu_begin(struct task_struct *tsk)
typedef struct { int preload; } fpu_switch_t;
/*
- * FIXME! We could do a totally lazy restore, but we need to
- * add a per-cpu "this was the task that last touched the FPU
- * on this CPU" variable, and the task needs to have a "I last
- * touched the FPU on this CPU" and check them.
+ * Must be run with preemption disabled: this clears the fpu_owner_task,
+ * on this CPU.
*
- * We don't do that yet, so "fpu_lazy_restore()" always returns
- * false, but some day..
+ * This will disable any lazy FPU state restore of the current FPU state,
+ * but if the current thread owns the FPU, it will still be saved by.
*/
+static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+{
+ per_cpu(fpu_owner_task, cpu) = NULL;
+}
+
static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
{
return new == percpu_read_stable(fpu_owner_task) &&
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 55728e121473..5e0286f8081a 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -14,12 +14,6 @@ extern struct pglist_data *node_data[];
#include <asm/numaq.h>
-extern void resume_map_numa_kva(pgd_t *pgd);
-
-#else /* !CONFIG_NUMA */
-
-static inline void resume_map_numa_kva(pgd_t *pgd) {}
-
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DISCONTIGMEM
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 49afb3f41eb6..3f3dd526b3b8 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -142,12 +142,16 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT;
}
+static inline unsigned long pud_pfn(pud_t pud)
+{
+ return (pud_val(pud) & PTE_PFN_MASK) >> PAGE_SHIFT;
+}
+
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
static inline int pmd_large(pmd_t pte)
{
- return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
- (_PAGE_PSE | _PAGE_PRESENT);
+ return pmd_flags(pte) & _PAGE_PSE;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -415,7 +419,13 @@ static inline int pte_hidden(pte_t pte)
static inline int pmd_present(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_PRESENT;
+ /*
+ * Checking for _PAGE_PSE is needed too because
+ * split_huge_page will temporarily clear the present bit (but
+ * the _PAGE_PSE flag will remain set at all times while the
+ * _PAGE_PRESENT bit is clear).
+ */
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
}
static inline int pmd_none(pmd_t pmd)
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index dcfde52979c3..19f16ebaf4fa 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -205,21 +205,14 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
}
#endif
-/*
- * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
- * when it traps. The previous stack will be directly underneath the saved
- * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
- *
- * This is valid only for kernel mode traps.
- */
-static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
-{
#ifdef CONFIG_X86_32
- return (unsigned long)(&regs->sp);
+extern unsigned long kernel_stack_pointer(struct pt_regs *regs);
#else
+static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
return regs->sp;
-#endif
}
+#endif
#define GET_IP(regs) ((regs)->ip)
#define GET_FP(regs) ((regs)->bp)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c34f96c2f7a0..1bd321d00f26 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -50,7 +50,8 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s,
extern int m2p_add_override(unsigned long mfn, struct page *page,
struct gnttab_map_grant_ref *kmap_op);
-extern int m2p_remove_override(struct page *page, bool clear_pte);
+extern int m2p_remove_override(struct page *page,
+ struct gnttab_map_grant_ref *kmap_op);
extern struct page *m2p_find_override(unsigned long mfn);
extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 73ef56c5a8b3..bda833c5f6ce 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -160,7 +160,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
#endif
#ifdef P6_NOP1
-static const unsigned char __initconst_or_module p6nops[] =
+static const unsigned char p6nops[] =
{
P6_NOP1,
P6_NOP2,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 991e315f4227..db31a2cfc9a9 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -20,18 +20,19 @@ static int set_x2apic_phys_mode(char *arg)
}
early_param("x2apic_phys", set_x2apic_phys_mode);
-static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+static bool x2apic_fadt_phys(void)
{
- if (x2apic_phys)
- return x2apic_enabled();
- else if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) &&
- x2apic_enabled()) {
+ if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
+ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
printk(KERN_DEBUG "System requires x2apic physical mode\n");
- return 1;
+ return true;
}
- else
- return 0;
+ return false;
+}
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
static void
@@ -114,7 +115,7 @@ static void init_x2apic_ldr(void)
static int x2apic_phys_probe(void)
{
- if (x2apic_mode && x2apic_phys)
+ if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
return 1;
return apic == &apic_x2apic_phys;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 146bb6218eec..a9c8a46dd7d3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -598,6 +598,34 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
+ /*
+ * The way access filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if ((c->x86 == 0x15) &&
+ (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+ u64 val;
+
+ if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) {
+ val |= 0x1E;
+ checking_wrmsrl(0xc0011021, val);
+ }
+ }
+
+ /*
+ * The way access filter has a performance penalty on some workloads.
+ * Disable it on the affected CPUs.
+ */
+ if ((c->x86 == 0x15) &&
+ (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
+ u64 val;
+
+ if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) {
+ val |= 0x1E;
+ checking_wrmsrl(0xc0011021, val);
+ }
+ }
+
cpu_detect_cache_sizes(c);
/* Multi core CPU? */
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0a630dd4b620..646d192b18a2 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -68,7 +68,8 @@ static void __init ms_hyperv_init_platform(void)
printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
- clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+ if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
+ clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 62d61e9976eb..298dc000378a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1076,6 +1076,9 @@ void __init memblock_x86_fill(void)
memblock_add(ei->addr, ei->size);
}
+ /* throw away partial pages */
+ memblock_trim_memory(PAGE_SIZE);
+
memblock_dump_all();
}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7b784f4ef1e4..2af4ccd88d16 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1025,7 +1025,7 @@ ENTRY(xen_sysenter_target)
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
- pushl_cfi $0
+ pushl_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
TRACE_IRQS_OFF
@@ -1067,14 +1067,15 @@ ENTRY(xen_failsafe_callback)
2: mov 8(%esp),%es
3: mov 12(%esp),%fs
4: mov 16(%esp),%gs
+ /* EAX == 0 => Category 1 (Bad segment)
+ EAX != 0 => Category 2 (Bad IRET) */
testl %eax,%eax
popl_cfi %eax
lea 16(%esp),%esp
CFI_ADJUST_CFA_OFFSET -16
jz 5f
- addl $16,%esp
- jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
-5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
+ jmp iret_exc
+5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
jmp ret_from_exception
CFI_ENDPROC
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index cdc79b5cfcd9..bd6f59203b5f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1351,7 +1351,7 @@ ENTRY(xen_failsafe_callback)
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0
+ pushq_cfi $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
jmp error_exit
CFI_ENDPROC
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 48d9d4ea1020..992f442ca155 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -5,8 +5,6 @@
#include <asm/setup.h>
#include <asm/bios_ebda.h>
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
/*
* The BIOS places the EBDA/XBDA at the top of conventional
* memory, and usually decreases the reported amount of
@@ -16,17 +14,30 @@
* chipset: reserve a page before VGA to prevent PCI prefetch
* into it (errata #56). Usually the page is reserved anyways,
* unless you have no PS/2 mouse plugged in.
+ *
+ * This functions is deliberately very conservative. Losing
+ * memory in the bottom megabyte is rarely a problem, as long
+ * as we have enough memory to install the trampoline. Using
+ * memory that is in use by the BIOS or by some DMA device
+ * the BIOS didn't shut down *is* a big problem.
*/
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+#define LOWMEM_CAP 0x9f000U /* Absolute maximum */
+#define INSANE_CUTOFF 0x20000U /* Less than this = insane */
+
void __init reserve_ebda_region(void)
{
unsigned int lowmem, ebda_addr;
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
+ /*
+ * To determine the position of the EBDA and the
+ * end of conventional memory, we need to look at
+ * the BIOS data area. In a paravirtual environment
+ * that area is absent. We'll just have to assume
+ * that the paravirt case can handle memory setup
+ * correctly, without our help.
+ */
if (paravirt_enabled())
return;
@@ -37,19 +48,23 @@ void __init reserve_ebda_region(void)
/* start of EBDA area */
ebda_addr = get_bios_ebda();
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
+ /*
+ * Note: some old Dells seem to need 4k EBDA without
+ * reporting so, so just consider the memory above 0x9f000
+ * to be off limits (bugzilla 2990).
+ */
+
+ /* If the EBDA address is below 128K, assume it is bogus */
+ if (ebda_addr < INSANE_CUTOFF)
+ ebda_addr = LOWMEM_CAP;
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
+ /* If lowmem is less than 128K, assume it is bogus */
+ if (lowmem < INSANE_CUTOFF)
+ lowmem = LOWMEM_CAP;
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
+ /* Use the lower of the lowmem and EBDA markers as the cutoff */
+ lowmem = min(lowmem, ebda_addr);
+ lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */
/* reserve all memory between lowmem and the 1MB mark */
memblock_reserve(lowmem, 0x100000 - lowmem);
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ad0de0c2714e..2861b4cafd2e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -431,7 +431,7 @@ void hpet_msi_unmask(struct irq_data *data)
/* unmask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg |= HPET_TN_FSB;
+ cfg |= HPET_TN_ENABLE | HPET_TN_FSB;
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
@@ -442,7 +442,7 @@ void hpet_msi_mask(struct irq_data *data)
/* mask it */
cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
- cfg &= ~HPET_TN_FSB;
+ cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB);
hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
}
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 8a2ce8fd41c0..5d8cf0d6796c 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -97,6 +97,7 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size,
#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096
+#define F16H_MPB_MAX_SIZE 3458
switch (c->x86) {
case 0x14:
@@ -105,6 +106,9 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size,
case 0x15:
max_size = F15H_MPB_MAX_SIZE;
break;
+ case 0x16:
+ max_size = F16H_MPB_MAX_SIZE;
+ break;
default:
max_size = F1XH_MPB_MAX_SIZE;
break;
@@ -143,11 +147,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
unsigned int *current_size)
{
struct microcode_header_amd *mc_hdr;
- unsigned int actual_size;
+ unsigned int actual_size, patch_size;
u16 equiv_cpu_id;
/* size of the current patch we're staring at */
- *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
+ patch_size = *(u32 *)(ucode_ptr + 4);
+ *current_size = patch_size + SECTION_HDR_SIZE;
equiv_cpu_id = find_equiv_id();
if (!equiv_cpu_id)
@@ -174,7 +179,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
/*
* now that the header looks sane, verify its size
*/
- actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
+ actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
if (!actual_size)
return 0;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index eb113693f043..8563b64e18cf 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -174,6 +174,9 @@ static int msr_open(struct inode *inode, struct file *file)
unsigned int cpu;
struct cpuinfo_x86 *c;
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
cpu = iminor(file->f_path.dentry->d_inode);
if (cpu >= nr_cpu_ids || !cpu_online(cpu))
return -ENXIO; /* No such CPU */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index cf1178332bc0..c4410fb4938f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -165,6 +166,35 @@ static inline bool invalid_selector(u16 value)
#define FLAG_MASK FLAG_MASK_32
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps. The previous stack will be directly underneath the saved
+ * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
+ *
+ * Now, if the stack is empty, '&regs->sp' is out of range. In this
+ * case we try to take the previous stack. To always return a non-null
+ * stack pointer we fall back to regs as stack if no previous stack
+ * exists.
+ *
+ * This is valid only for kernel mode traps.
+ */
+unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+ unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
+ unsigned long sp = (unsigned long)&regs->sp;
+ struct thread_info *tinfo;
+
+ if (context == (sp & ~(THREAD_SIZE - 1)))
+ return sp;
+
+ tinfo = (struct thread_info *)context;
+ if (tinfo->previous_esp)
+ return tinfo->previous_esp;
+
+ return (unsigned long)regs;
+}
+EXPORT_SYMBOL_GPL(kernel_stack_pointer);
+
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 3034ee5afb0f..df1b604c0c1f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -619,7 +619,7 @@ static void native_machine_emergency_restart(void)
break;
case BOOT_EFI:
- if (efi_enabled)
+ if (efi_enabled(EFI_RUNTIME_SERVICES))
efi.reset_system(reboot_mode ?
EFI_RESET_WARM :
EFI_RESET_COLD,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d6c956e674cc..dc29333e1bb9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -621,6 +621,83 @@ static __init void reserve_ibft_region(void)
static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
+static bool __init snb_gfx_workaround_needed(void)
+{
+#ifdef CONFIG_PCI
+ int i;
+ u16 vendor, devid;
+ static const u16 snb_ids[] = {
+ 0x0102,
+ 0x0112,
+ 0x0122,
+ 0x0106,
+ 0x0116,
+ 0x0126,
+ 0x010a,
+ };
+
+ /* Assume no if something weird is going on with PCI */
+ if (!early_pci_allowed())
+ return false;
+
+ vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
+ if (vendor != 0x8086)
+ return false;
+
+ devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
+ for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
+ if (devid == snb_ids[i])
+ return true;
+#endif
+
+ return false;
+}
+
+/*
+ * Sandy Bridge graphics has trouble with certain ranges, exclude
+ * them from allocation.
+ */
+static void __init trim_snb_memory(void)
+{
+ static const unsigned long bad_pages[] = {
+ 0x20050000,
+ 0x20110000,
+ 0x20130000,
+ 0x20138000,
+ 0x40004000,
+ };
+ int i;
+
+ if (!snb_gfx_workaround_needed())
+ return;
+
+ printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
+
+ /*
+ * Reserve all memory below the 1 MB mark that has not
+ * already been reserved.
+ */
+ memblock_reserve(0, 1<<20);
+
+ for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
+ if (memblock_reserve(bad_pages[i], PAGE_SIZE))
+ printk(KERN_WARNING "failed to reserve 0x%08lx\n",
+ bad_pages[i]);
+ }
+}
+
+/*
+ * Here we put platform-specific memory range workarounds, i.e.
+ * memory known to be corrupt or otherwise in need to be reserved on
+ * specific platforms.
+ *
+ * If this gets used more widely it could use a real dispatch mechanism.
+ */
+static void __init trim_platform_memory_ranges(void)
+{
+ trim_snb_memory();
+}
+
static void __init trim_bios_range(void)
{
/*
@@ -641,6 +718,7 @@ static void __init trim_bios_range(void)
* take them out.
*/
e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
+
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
}
@@ -741,15 +819,15 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
"EL32", 4)) {
- efi_enabled = 1;
- efi_64bit = false;
+ set_bit(EFI_BOOT, &x86_efi_facility);
} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
"EL64", 4)) {
- efi_enabled = 1;
- efi_64bit = true;
+ set_bit(EFI_BOOT, &x86_efi_facility);
+ set_bit(EFI_64BIT, &x86_efi_facility);
}
- if (efi_enabled && efi_memblock_x86_reserve_range())
- efi_enabled = 0;
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
#endif
x86_init.oem.arch_setup();
@@ -822,7 +900,7 @@ void __init setup_arch(char **cmdline_p)
finish_e820_parsing();
- if (efi_enabled)
+ if (efi_enabled(EFI_BOOT))
efi_init();
dmi_scan_machine();
@@ -905,7 +983,7 @@ void __init setup_arch(char **cmdline_p)
* The EFI specification says that boot service code won't be called
* after ExitBootServices(). This is, in fact, a lie.
*/
- if (efi_enabled)
+ if (efi_enabled(EFI_MEMMAP))
efi_reserve_boot_services();
/* preallocate 4k for mptable mpc */
@@ -920,6 +998,8 @@ void __init setup_arch(char **cmdline_p)
setup_trampolines();
+ trim_platform_memory_ranges();
+
init_gbpages();
/* max_pfn_mapped is updated here */
@@ -928,8 +1008,22 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
- max_pfn_mapped = init_memory_mapping(1UL<<32,
- max_pfn<<PAGE_SHIFT);
+ int i;
+ unsigned long start, end;
+ unsigned long start_pfn, end_pfn;
+
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
+ NULL) {
+
+ end = PFN_PHYS(end_pfn);
+ if (end <= (1UL<<32))
+ continue;
+
+ start = PFN_PHYS(start_pfn);
+ max_pfn_mapped = init_memory_mapping(
+ max((1UL<<32), start), end);
+ }
+
/* can we preseve max_low_pfn ?*/
max_low_pfn = max_pfn;
}
@@ -1027,7 +1121,7 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
conswitchp = &dummy_con;
@@ -1042,6 +1136,18 @@ void __init setup_arch(char **cmdline_p)
mcheck_init();
arch_init_ideal_nops();
+
+#ifdef CONFIG_EFI
+ /* Once setup is done above, unmap the EFI memory map on
+ * mismatched firmware/kernel archtectures since there is no
+ * support for runtime services.
+ */
+ if (efi_enabled(EFI_BOOT) &&
+ IS_ENABLED(CONFIG_X86_64) != efi_enabled(EFI_64BIT)) {
+ pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
+ efi_unmap_memmap();
+ }
+#endif
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6e1e406038c2..849cdcf2e76a 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -66,6 +66,8 @@
#include <asm/mwait.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
+#include <asm/i387.h>
+#include <asm/fpu-internal.h>
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
@@ -851,6 +853,9 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /* the FPU context is blank, nobody can own it */
+ __cpu_disable_lazy_restore(cpu);
+
err = do_boot_cpu(apicid, cpu);
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26d1fb437eb5..638de3efd7a3 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -23,6 +23,9 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
+ if (!static_cpu_has(X86_FEATURE_XSAVE))
+ return 0;
+
best = kvm_find_cpuid_entry(vcpu, 1, 0);
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 185a2b823a2d..e28fb97a1a88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5697,6 +5697,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int pending_vec, max_bits, idx;
struct desc_ptr dt;
+ if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
+ return -EINVAL;
+
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3ecfd1aaf214..e922e01868fb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -747,13 +747,15 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
return;
}
#endif
+ /* Kernel addresses are always protection faults: */
+ if (address >= TASK_SIZE)
+ error_code |= PF_PROT;
- if (unlikely(show_unhandled_signals))
+ if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
- /* Kernel addresses are always protection faults: */
tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+ tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f6679a7fb8ca..b91e48512425 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -56,9 +56,16 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
}
/*
- * search for a shareable pmd page for hugetlb.
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
-static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+static pte_t *
+huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
struct vm_area_struct *vma = find_vma(mm, addr);
struct address_space *mapping = vma->vm_file->f_mapping;
@@ -68,9 +75,10 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
struct vm_area_struct *svma;
unsigned long saddr;
pte_t *spte = NULL;
+ pte_t *pte;
if (!vma_shareable(vma, addr))
- return;
+ return (pte_t *)pmd_alloc(mm, pud, addr);
mutex_lock(&mapping->i_mmap_mutex);
vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
@@ -97,7 +105,9 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
mutex_unlock(&mapping->i_mmap_mutex);
+ return pte;
}
/*
@@ -142,8 +152,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
} else {
BUG_ON(sz != PMD_SIZE);
if (pud_none(*pud))
- huge_pmd_share(mm, addr, pud);
- pte = (pte_t *) pmd_alloc(mm, pud, addr);
+ pte = huge_pmd_share(mm, addr, pud);
+ else
+ pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 4f0cec7e4ffb..fae70904ed08 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,36 +29,50 @@ int direct_gbpages
#endif
;
-static void __init find_early_table_space(unsigned long end, int use_pse,
- int use_gbpages)
+struct map_range {
+ unsigned long start;
+ unsigned long end;
+ unsigned page_size_mask;
+};
+
+/*
+ * First calculate space needed for kernel direct mapping page tables to cover
+ * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
+ * pages. Then find enough contiguous space for those page tables.
+ */
+static void __init find_early_table_space(struct map_range *mr, int nr_range)
{
- unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
+ int i;
+ unsigned long puds = 0, pmds = 0, ptes = 0, tables;
+ unsigned long start = 0, good_end;
phys_addr_t base;
- puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ for (i = 0; i < nr_range; i++) {
+ unsigned long range, extra;
- if (use_gbpages) {
- unsigned long extra;
+ range = mr[i].end - mr[i].start;
+ puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
- extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
- pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
- } else
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-
- tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+ if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
+ extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
+ pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+ } else {
+ pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
+ }
- if (use_pse) {
- unsigned long extra;
-
- extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+ if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
+ extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
#ifdef CONFIG_X86_32
- extra += PMD_SIZE;
+ extra += PMD_SIZE;
#endif
- ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
- } else
- ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else {
+ ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ }
+ }
+ tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+ tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
#ifdef CONFIG_X86_32
@@ -75,8 +89,9 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
- printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
+ printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n",
+ mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
+ (pgt_buf_top << PAGE_SHIFT) - 1);
}
void __init native_pagetable_reserve(u64 start, u64 end)
@@ -84,12 +99,6 @@ void __init native_pagetable_reserve(u64 start, u64 end)
memblock_reserve(start, end - start);
}
-struct map_range {
- unsigned long start;
- unsigned long end;
- unsigned page_size_mask;
-};
-
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
@@ -261,7 +270,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* nodes are discovered.
*/
if (!after_bootmem)
- find_early_table_space(end, use_pse, use_gbpages);
+ find_early_table_space(mr, nr_range);
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fc18be0f6f29..faf7a6830c0b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -821,6 +821,9 @@ int kern_addr_valid(unsigned long addr)
if (pud_none(*pud))
return 0;
+ if (pud_large(*pud))
+ return pfn_valid(pud_pfn(*pud));
+
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return 0;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 19d3fa08b119..c1e83944abfe 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
static void __init setup_node_data(int nid, u64 start, u64 end)
{
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
- bool remapped = false;
u64 nd_pa;
void *nd;
int tnid;
@@ -205,37 +204,28 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
if (end && (end - start) < NODE_MIN_SIZE)
return;
- /* initialize remap allocator before aligning to ZONE_ALIGN */
- init_alloc_remap(nid, start, end);
-
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
nid, start, end);
/*
- * Allocate node data. Try remap allocator first, node-local
- * memory and then any node. Never allocate in DMA zone.
+ * Allocate node data. Try node-local memory and then any node.
+ * Never allocate in DMA zone.
*/
- nd = alloc_remap(nid, nd_size);
- if (nd) {
- nd_pa = __pa(nd);
- remapped = true;
- } else {
- nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in node %d\n",
- nd_size, nid);
- return;
- }
- nd = __va(nd_pa);
+ nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
+ pr_err("Cannot find %zu bytes in node %d\n",
+ nd_size, nid);
+ return;
}
+ nd = __va(nd_pa);
/* report and initialize */
- printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
- nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
+ printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
+ nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
- if (!remapped && tnid != nid)
+ if (tnid != nid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
node_data[nid] = nd;
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6b..73a6d7395bd3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
extern unsigned long highend_pfn, highstart_pfn;
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid. The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function. For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
- void *allocation = node_remap_alloc_vaddr[nid];
-
- size = ALIGN(size, L1_CACHE_BYTES);
-
- if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
- return NULL;
-
- node_remap_alloc_vaddr[nid] += size;
- memset(allocation, 0, size);
-
- return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- * during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
- int node;
-
- for_each_online_node(node) {
- unsigned long start_va, start_pfn, nr_pages, pfn;
-
- start_va = (unsigned long)node_remap_start_vaddr[node];
- start_pfn = node_remap_start_pfn[node];
- nr_pages = (node_remap_end_vaddr[node] -
- node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
- for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
- unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
- pgd_t *pgd = pgd_base + pgd_index(vaddr);
- pud_t *pud = pud_offset(pgd, vaddr);
- pmd_t *pmd = pmd_offset(pud, vaddr);
-
- set_pmd(pmd, pfn_pmd(start_pfn + pfn,
- PAGE_KERNEL_LARGE_EXEC));
-
- printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
- __func__, vaddr, start_pfn + pfn);
- }
- }
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initizlie remap allocator for
- *
- * NUMA nodes may end up without any lowmem. As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted. The amount shouldn't be
- * problematic on machines this feature will be used.
- *
- * Initialization failure isn't fatal. alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long size, pfn;
- u64 node_pa, remap_pa;
- void *remap_va;
-
- /*
- * The acpi/srat node info can show hot-add memroy zones where
- * memory could be added but not currently present.
- */
- printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
- nid, start_pfn, end_pfn);
-
- /* calculate the necessary space aligned to large page size */
- size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
- size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
- size = ALIGN(size, LARGE_PAGE_BYTES);
-
- /* allocate node memory and the lowmem remap area */
- node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
- if (!node_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
- size, nid);
- return;
- }
- memblock_reserve(node_pa, size);
-
- remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
- max_low_pfn << PAGE_SHIFT,
- size, LARGE_PAGE_BYTES);
- if (!remap_pa) {
- pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
- size, nid);
- memblock_free(node_pa, size);
- return;
- }
- memblock_reserve(remap_pa, size);
- remap_va = phys_to_virt(remap_pa);
-
- /* perform actual remap */
- for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
- set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
- (node_pa >> PAGE_SHIFT) + pfn,
- PAGE_KERNEL_LARGE);
-
- /* initialize remap allocator parameters */
- node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
- node_remap_start_vaddr[nid] = remap_va;
- node_remap_end_vaddr[nid] = remap_va + size;
- node_remap_alloc_vaddr[nid] = remap_va;
-
- printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
- nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
void __init initmem_init(void)
{
x86_numa_init();
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05e..ad86ec91e640 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
void __init x86_numa_init(void);
-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt);
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 26b8a8514ee5..48768df2471a 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -55,7 +55,7 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
val |= counter_config->extra;
event &= model->event_mask ? model->event_mask : 0xFF;
val |= event & 0xFF;
- val |= (event & 0x0F00) << 24;
+ val |= (u64)(event & 0x0F00) << 24;
return val;
}
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index d0e6e403b4f6..5dd467bd6121 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -519,3 +519,20 @@ static void sb600_disable_hpet_bar(struct pci_dev *dev)
}
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
+
+/*
+ * Twinhead H12Y needs us to block out a region otherwise we map devices
+ * there and any access kills the box.
+ *
+ * See: https://bugzilla.kernel.org/show_bug.cgi?id=10231
+ *
+ * Match off the LPC and svid/sdid (older kernels lose the bridge subvendor)
+ */
+static void __devinit twinhead_reserve_killing_zone(struct pci_dev *dev)
+{
+ if (dev->subsystem_vendor == 0x14FF && dev->subsystem_device == 0xA003) {
+ pr_info("Reserving memory on Twinhead H12Y\n");
+ request_mem_region(0xFFB00000, 0x100000, "twinhead");
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 92660edaa1e7..1e406371a683 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -50,9 +50,6 @@
#define EFI_DEBUG 1
-int efi_enabled;
-EXPORT_SYMBOL(efi_enabled);
-
struct efi __read_mostly efi = {
.mps = EFI_INVALID_TABLE_ADDR,
.acpi = EFI_INVALID_TABLE_ADDR,
@@ -68,15 +65,29 @@ EXPORT_SYMBOL(efi);
struct efi_memory_map memmap;
-bool efi_64bit;
-static bool efi_native;
-
static struct efi efi_phys __initdata;
static efi_system_table_t efi_systab __initdata;
+static inline bool efi_is_native(void)
+{
+ return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT);
+}
+
+unsigned long x86_efi_facility;
+
+/*
+ * Returns 1 if 'facility' is enabled, 0 otherwise.
+ */
+int efi_enabled(int facility)
+{
+ return test_bit(facility, &x86_efi_facility) != 0;
+}
+EXPORT_SYMBOL(efi_enabled);
+
+static bool disable_runtime = false;
static int __init setup_noefi(char *arg)
{
- efi_enabled = 0;
+ disable_runtime = true;
return 0;
}
early_param("noefi", setup_noefi);
@@ -419,10 +430,22 @@ void __init efi_reserve_boot_services(void)
}
}
-static void __init efi_free_boot_services(void)
+void __init efi_unmap_memmap(void)
+{
+ clear_bit(EFI_MEMMAP, &x86_efi_facility);
+ if (memmap.map) {
+ early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
+ memmap.map = NULL;
+ }
+}
+
+void __init efi_free_boot_services(void)
{
void *p;
+ if (!efi_is_native())
+ return;
+
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
efi_memory_desc_t *md = p;
unsigned long long start = md->phys_addr;
@@ -438,11 +461,13 @@ static void __init efi_free_boot_services(void)
free_bootmem_late(start, size);
}
+
+ efi_unmap_memmap();
}
static int __init efi_systab_init(void *phys)
{
- if (efi_64bit) {
+ if (efi_enabled(EFI_64BIT)) {
efi_system_table_64_t *systab64;
u64 tmp = 0;
@@ -534,7 +559,7 @@ static int __init efi_config_init(u64 tables, int nr_tables)
void *config_tables, *tablep;
int i, sz;
- if (efi_64bit)
+ if (efi_enabled(EFI_64BIT))
sz = sizeof(efi_config_table_64_t);
else
sz = sizeof(efi_config_table_32_t);
@@ -554,7 +579,7 @@ static int __init efi_config_init(u64 tables, int nr_tables)
efi_guid_t guid;
unsigned long table;
- if (efi_64bit) {
+ if (efi_enabled(EFI_64BIT)) {
u64 table64;
guid = ((efi_config_table_64_t *)tablep)->guid;
table64 = ((efi_config_table_64_t *)tablep)->table;
@@ -666,22 +691,19 @@ void __init efi_init(void)
if (boot_params.efi_info.efi_systab_hi ||
boot_params.efi_info.efi_memmap_hi) {
pr_info("Table located above 4GB, disabling EFI.\n");
- efi_enabled = 0;
return;
}
efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
- efi_native = !efi_64bit;
#else
efi_phys.systab = (efi_system_table_t *)
(boot_params.efi_info.efi_systab |
((__u64)boot_params.efi_info.efi_systab_hi<<32));
- efi_native = efi_64bit;
#endif
- if (efi_systab_init(efi_phys.systab)) {
- efi_enabled = 0;
+ if (efi_systab_init(efi_phys.systab))
return;
- }
+
+ set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility);
/*
* Show what we know for posterity
@@ -699,29 +721,31 @@ void __init efi_init(void)
efi.systab->hdr.revision >> 16,
efi.systab->hdr.revision & 0xffff, vendor);
- if (efi_config_init(efi.systab->tables, efi.systab->nr_tables)) {
- efi_enabled = 0;
+ if (efi_config_init(efi.systab->tables, efi.systab->nr_tables))
return;
- }
+
+ set_bit(EFI_CONFIG_TABLES, &x86_efi_facility);
/*
* Note: We currently don't support runtime services on an EFI
* that doesn't match the kernel 32/64-bit mode.
*/
- if (!efi_native)
+ if (!efi_is_native())
pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
- else if (efi_runtime_init()) {
- efi_enabled = 0;
- return;
+ else {
+ if (disable_runtime || efi_runtime_init())
+ return;
+ set_bit(EFI_RUNTIME_SERVICES, &x86_efi_facility);
}
- if (efi_memmap_init()) {
- efi_enabled = 0;
+ if (efi_memmap_init())
return;
- }
+
+ set_bit(EFI_MEMMAP, &x86_efi_facility);
+
#ifdef CONFIG_X86_32
- if (efi_native) {
+ if (efi_is_native()) {
x86_platform.get_wallclock = efi_get_time;
x86_platform.set_wallclock = efi_set_rtc_mmss;
}
@@ -787,8 +811,10 @@ void __init efi_enter_virtual_mode(void)
* non-native EFI
*/
- if (!efi_native)
- goto out;
+ if (!efi_is_native()) {
+ efi_unmap_memmap();
+ return;
+ }
/* Merge contiguous regions of the same type and attribute */
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -878,18 +904,12 @@ void __init efi_enter_virtual_mode(void)
}
/*
- * Thankfully, it does seem that no runtime services other than
- * SetVirtualAddressMap() will touch boot services code, so we can
- * get rid of it all at this point
- */
- efi_free_boot_services();
-
- /*
* Now that EFI is in virtual mode, update the function
* pointers in the runtime service table to the new virtual addresses.
*
* Call EFI services through wrapper functions.
*/
+ efi.runtime_version = efi_systab.hdr.revision;
efi.get_time = virt_efi_get_time;
efi.set_time = virt_efi_set_time;
efi.get_wakeup_time = virt_efi_get_wakeup_time;
@@ -906,9 +926,6 @@ void __init efi_enter_virtual_mode(void)
if (__supported_pte_mask & _PAGE_NX)
runtime_code_page_mkexec();
-out:
- early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
- memmap.map = NULL;
kfree(new_memmap);
}
@@ -935,6 +952,9 @@ u64 efi_mem_attributes(unsigned long phys_addr)
efi_memory_desc_t *md;
void *p;
+ if (!efi_enabled(EFI_MEMMAP))
+ return 0;
+
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
if ((md->phys_addr <= phys_addr) &&
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac3aa54e2654..0fba86da5895 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -38,7 +38,7 @@
#include <asm/cacheflush.h>
#include <asm/fixmap.h>
-static pgd_t save_pgd __initdata;
+static pgd_t *save_pgd __initdata;
static unsigned long efi_flags __initdata;
static void __init early_code_mapping_set_exec(int executable)
@@ -61,12 +61,20 @@ static void __init early_code_mapping_set_exec(int executable)
void __init efi_call_phys_prelog(void)
{
unsigned long vaddress;
+ int pgd;
+ int n_pgds;
early_code_mapping_set_exec(1);
local_irq_save(efi_flags);
- vaddress = (unsigned long)__va(0x0UL);
- save_pgd = *pgd_offset_k(0x0UL);
- set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
+
+ n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
+ save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
+
+ for (pgd = 0; pgd < n_pgds; pgd++) {
+ save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
+ vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
+ set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
+ }
__flush_tlb_all();
}
@@ -75,7 +83,11 @@ void __init efi_call_phys_epilog(void)
/*
* After the lock is released, the original page table is restored.
*/
- set_pgd(pgd_offset_k(0x0UL), save_pgd);
+ int pgd;
+ int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+ for (pgd = 0; pgd < n_pgds; pgd++)
+ set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
+ kfree(save_pgd);
__flush_tlb_all();
local_irq_restore(efi_flags);
early_code_mapping_set_exec(0);
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 74202c1910cd..7d28c885d238 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -129,8 +129,6 @@ static int resume_physical_mapping_init(pgd_t *pgd_base)
}
}
- resume_map_numa_kva(pgd_base);
-
return 0;
}
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index dd29a9ea27c5..fd1f10348130 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -60,8 +60,8 @@
51 common getsockname sys_getsockname
52 common getpeername sys_getpeername
53 common socketpair sys_socketpair
-54 common setsockopt sys_setsockopt
-55 common getsockopt sys_getsockopt
+54 64 setsockopt sys_setsockopt
+55 64 getsockopt sys_getsockopt
56 common clone stub_clone
57 common fork stub_fork
58 common vfork stub_vfork
@@ -351,3 +351,5 @@
538 x32 sendmmsg compat_sys_sendmmsg
539 x32 process_vm_readv compat_sys_process_vm_readv
540 x32 process_vm_writev compat_sys_process_vm_writev
+541 x32 setsockopt compat_sys_setsockopt
+542 x32 getsockopt compat_sys_getsockopt
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 40edfc37a6fd..59100548cbdb 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -942,7 +942,16 @@ static void xen_write_cr4(unsigned long cr4)
native_write_cr4(cr4);
}
-
+#ifdef CONFIG_X86_64
+static inline unsigned long xen_read_cr8(void)
+{
+ return 0;
+}
+static inline void xen_write_cr8(unsigned long val)
+{
+ BUG_ON(val);
+}
+#endif
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
@@ -1111,6 +1120,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4,
+#ifdef CONFIG_X86_64
+ .read_cr8 = xen_read_cr8,
+ .write_cr8 = xen_write_cr8,
+#endif
+
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
@@ -1121,6 +1135,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
+ .read_tscp = native_read_tscp,
+
.iret = xen_iret,
.irq_enable_sysexit = xen_sysexit,
#ifdef CONFIG_X86_64
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 69f5857660ac..5cb8e27d43d2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1203,6 +1203,25 @@ unsigned long xen_read_cr2_direct(void)
return this_cpu_read(xen_vcpu_info.arch.cr2);
}
+void xen_flush_tlb_all(void)
+{
+ struct mmuext_op *op;
+ struct multicall_space mcs;
+
+ trace_xen_mmu_flush_tlb_all(0);
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = MMUEXT_TLB_FLUSH_ALL;
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
static void xen_flush_tlb(void)
{
struct mmuext_op *op;
@@ -2364,7 +2383,7 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
err = 0;
out:
- flush_tlb_all();
+ xen_flush_tlb_all();
return err;
}
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 00a038540c8e..3ace81769478 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -714,9 +714,6 @@ int m2p_add_override(unsigned long mfn, struct page *page,
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
- /* let's use dev_bus_addr to record the old mfn instead */
- kmap_op->dev_bus_addr = page->index;
- page->index = (unsigned long) kmap_op;
}
spin_lock_irqsave(&m2p_override_lock, flags);
list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
@@ -743,7 +740,8 @@ int m2p_add_override(unsigned long mfn, struct page *page,
return 0;
}
EXPORT_SYMBOL_GPL(m2p_add_override);
-int m2p_remove_override(struct page *page, bool clear_pte)
+int m2p_remove_override(struct page *page,
+ struct gnttab_map_grant_ref *kmap_op)
{
unsigned long flags;
unsigned long mfn;
@@ -773,10 +771,8 @@ int m2p_remove_override(struct page *page, bool clear_pte)
WARN_ON(!PagePrivate(page));
ClearPagePrivate(page);
- if (clear_pte) {
- struct gnttab_map_grant_ref *map_op =
- (struct gnttab_map_grant_ref *) page->index;
- set_phys_to_machine(pfn, map_op->dev_bus_addr);
+ set_phys_to_machine(pfn, page->index);
+ if (kmap_op != NULL) {
if (!PageHighMem(page)) {
struct multicall_space mcs;
struct gnttab_unmap_grant_ref *unmap_op;
@@ -788,13 +784,13 @@ int m2p_remove_override(struct page *page, bool clear_pte)
* issued. In this case handle is going to -1 because
* it hasn't been modified yet.
*/
- if (map_op->handle == -1)
+ if (kmap_op->handle == -1)
xen_mc_flush();
/*
- * Now if map_op->handle is negative it means that the
+ * Now if kmap_op->handle is negative it means that the
* hypercall actually returned an error.
*/
- if (map_op->handle == GNTST_general_error) {
+ if (kmap_op->handle == GNTST_general_error) {
printk(KERN_WARNING "m2p_remove_override: "
"pfn %lx mfn %lx, failed to modify kernel mappings",
pfn, mfn);
@@ -804,8 +800,8 @@ int m2p_remove_override(struct page *page, bool clear_pte)
mcs = xen_mc_entry(
sizeof(struct gnttab_unmap_grant_ref));
unmap_op = mcs.args;
- unmap_op->host_addr = map_op->host_addr;
- unmap_op->handle = map_op->handle;
+ unmap_op->host_addr = kmap_op->host_addr;
+ unmap_op->handle = kmap_op->handle;
unmap_op->dev_bus_addr = 0;
MULTI_grant_table_op(mcs.mc,
@@ -816,10 +812,9 @@ int m2p_remove_override(struct page *page, bool clear_pte)
set_pte_at(&init_mm, address, ptep,
pfn_pte(pfn, PAGE_KERNEL));
__flush_tlb_single(address);
- map_op->host_addr = 0;
+ kmap_op->host_addr = 0;
}
- } else
- set_phys_to_machine(pfn, page->index);
+ }
/* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
* somewhere in this domain, even before being added to the
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 1ba8dff26753..017d48a26a02 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -17,6 +17,7 @@
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
+#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -79,9 +80,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
memblock_reserve(start, size);
xen_max_p2m_pfn = PFN_DOWN(start + size);
+ for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
+ continue;
+ WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
+ pfn, mfn);
- for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++)
__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ }
}
static unsigned long __init xen_release_chunk(unsigned long start,
@@ -424,4 +432,7 @@ void __init xen_arch_setup(void)
disable_cpufreq();
WARN_ON(set_pm_idle_to_default());
fiddle_vdso();
+#ifdef CONFIG_NUMA
+ numa_off = 1;
+#endif
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index d69cc6c3f808..67bc7bae746e 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -328,7 +328,6 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
if (per_cpu(lock_spinners, cpu) == xl) {
ADD_STATS(released_slow_kicked, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
- break;
}
}
}
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index b040b0e518ca..7328f71651e1 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -88,11 +88,11 @@ ENTRY(xen_iret)
*/
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax), %eax
- movl __per_cpu_offset(,%eax,4), %eax
- mov xen_vcpu(%eax), %eax
+ movl %ss:TI_cpu(%eax), %eax
+ movl %ss:__per_cpu_offset(,%eax,4), %eax
+ mov %ss:xen_vcpu(%eax), %eax
#else
- movl xen_vcpu, %eax
+ movl %ss:xen_vcpu, %eax
#endif
/* check IF state we're restoring */
@@ -105,11 +105,11 @@ ENTRY(xen_iret)
* resuming the code, so we don't have to be worried about
* being preempted to another CPU.
*/
- setz XEN_vcpu_info_mask(%eax)
+ setz %ss:XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
- cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+ cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)
/*
* If there's something pending, mask events again so we can
@@ -117,7 +117,7 @@ xen_iret_start_crit:
* touch XEN_vcpu_info_mask.
*/
jne 1f
- movb $1, XEN_vcpu_info_mask(%eax)
+ movb $1, %ss:XEN_vcpu_info_mask(%eax)
1: popl %eax