summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile8
-rw-r--r--arch/x86/kernel/acpi/boot.c409
-rw-r--r--arch/x86/kernel/amd_iommu.c962
-rw-r--r--arch/x86/kernel/amd_iommu_init.c875
-rw-r--r--arch/x86/kernel/aperture_64.c309
-rw-r--r--arch/x86/kernel/apic_32.c42
-rw-r--r--arch/x86/kernel/apic_64.c42
-rw-r--r--arch/x86/kernel/apm_32.c25
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c17
-rw-r--r--arch/x86/kernel/cpu/amd.c42
-rw-r--r--arch/x86/kernel/cpu/amd_64.c211
-rw-r--r--arch/x86/kernel/cpu/bugs.c27
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c (renamed from arch/x86/kernel/bugs_64.c)0
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c43
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c44
-rw-r--r--arch/x86/kernel/cpu/intel_64.c103
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c36
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c24
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c90
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c38
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c901
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h3
-rw-r--r--arch/x86/kernel/e820.c (renamed from arch/x86/kernel/e820_64.c)1224
-rw-r--r--arch/x86/kernel/e820_32.c775
-rw-r--r--arch/x86/kernel/early-quirks.c13
-rw-r--r--arch/x86/kernel/efi.c59
-rw-r--r--arch/x86/kernel/efi_64.c9
-rw-r--r--arch/x86/kernel/entry_32.S2
-rw-r--r--arch/x86/kernel/entry_64.S13
-rw-r--r--arch/x86/kernel/genapic_64.c2
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c141
-rw-r--r--arch/x86/kernel/head.c73
-rw-r--r--arch/x86/kernel/head32.c27
-rw-r--r--arch/x86/kernel/head64.c68
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/head_64.S27
-rw-r--r--arch/x86/kernel/hpet.c43
-rw-r--r--arch/x86/kernel/i387.c4
-rw-r--r--arch/x86/kernel/i8259.c (renamed from arch/x86/kernel/i8259_32.c)136
-rw-r--r--arch/x86/kernel/i8259_64.c512
-rw-r--r--arch/x86/kernel/io_apic_32.c599
-rw-r--r--arch/x86/kernel/io_apic_64.c220
-rw-r--r--arch/x86/kernel/ipi.c1
-rw-r--r--arch/x86/kernel/irq_32.c254
-rw-r--r--arch/x86/kernel/irq_64.c28
-rw-r--r--arch/x86/kernel/irqinit_32.c114
-rw-r--r--arch/x86/kernel/irqinit_64.c217
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/machine_kexec_32.c4
-rw-r--r--arch/x86/kernel/microcode.c29
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c1
-rw-r--r--arch/x86/kernel/mpparse.c828
-rw-r--r--arch/x86/kernel/nmi_32.c13
-rw-r--r--arch/x86/kernel/nmi_64.c11
-rw-r--r--arch/x86/kernel/numaq_32.c25
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/pci-dma.c18
-rw-r--r--arch/x86/kernel/pci-gart_64.c83
-rw-r--r--arch/x86/kernel/process.c190
-rw-r--r--arch/x86/kernel/process_32.c61
-rw-r--r--arch/x86/kernel/process_64.c35
-rw-r--r--arch/x86/kernel/ptrace.c4
-rw-r--r--arch/x86/kernel/quirks.c58
-rw-r--r--arch/x86/kernel/reboot.c18
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c4
-rw-r--r--arch/x86/kernel/setup.c26
-rw-r--r--arch/x86/kernel/setup_32.c554
-rw-r--r--arch/x86/kernel/setup_64.c502
-rw-r--r--arch/x86/kernel/smpboot.c24
-rw-r--r--arch/x86/kernel/srat_32.c191
-rw-r--r--arch/x86/kernel/summit_32.c2
-rw-r--r--arch/x86/kernel/sys_i386_32.c64
-rw-r--r--arch/x86/kernel/time_32.c3
-rw-r--r--arch/x86/kernel/time_64.c2
-rw-r--r--arch/x86/kernel/trampoline.c2
-rw-r--r--arch/x86/kernel/traps_64.c3
-rw-r--r--arch/x86/kernel/tsc_32.c12
-rw-r--r--arch/x86/kernel/tsc_64.c2
-rw-r--r--arch/x86/kernel/vmiclock_32.c3
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S7
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S8
-rw-r--r--arch/x86/kernel/vsmp_64.c3
85 files changed, 7270 insertions, 4358 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 77807d4769c9..d1d4ee895270 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
+extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
@@ -18,14 +18,13 @@ CFLAGS_tsc_64.o := $(nostackp)
obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
obj-y += traps_$(BITS).o irq_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o
-obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o
+obj-y += setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o
-obj-y += bootflag.o e820_$(BITS).o
+obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o
-obj-$(CONFIG_X86_64) += bugs_64.o
obj-y += tsc_$(BITS).o io_delay.o rtc.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
@@ -100,6 +99,7 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+ obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 33c5216fd3e1..6516359922ba 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -83,6 +83,8 @@ int acpi_lapic;
int acpi_ioapic;
int acpi_strict;
+static int disable_irq0_through_ioapic __initdata;
+
u8 acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
int acpi_skip_timer_override __initdata;
@@ -338,8 +340,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
#ifdef CONFIG_X86_IO_APIC
-struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-
static int __init
acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
{
@@ -514,8 +514,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
* Make sure all (legacy) PCI IRQs are set as level-triggered.
*/
if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- extern void eisa_set_level_irq(unsigned int irq);
-
if (triggering == ACPI_LEVEL_SENSITIVE)
eisa_set_level_irq(gsi);
}
@@ -860,6 +858,372 @@ static int __init acpi_parse_madt_lapic_entries(void)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS 0
+
+#ifdef CONFIG_X86_ES7000
+extern int es7000_plat;
+#endif
+
+static struct {
+ int apic_id;
+ int gsi_base;
+ int gsi_end;
+ DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+static int mp_find_ioapic(int gsi)
+{
+ int i = 0;
+
+ /* Find the IOAPIC that manages this GSI. */
+ for (i = 0; i < nr_ioapics; i++) {
+ if ((gsi >= mp_ioapic_routing[i].gsi_base)
+ && (gsi <= mp_ioapic_routing[i].gsi_end))
+ return i;
+ }
+
+ printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ return -1;
+}
+
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return io_apic_get_unique_id(nr_ioapics, id);
+ else
+ return id;
+#else
+ int i;
+ DECLARE_BITMAP(used, 256);
+ bitmap_zero(used, 256);
+ for (i = 0; i < nr_ioapics; i++) {
+ struct mp_config_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->mp_apicid, used);
+ }
+ if (!test_bit(id, used))
+ return id;
+ return find_first_zero_bit(used, 256);
+#endif
+}
+
+static int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+ }
+ if (!address) {
+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+ int idx = 0;
+
+ if (bad_ioapic(address))
+ return;
+
+ idx = nr_ioapics;
+
+ mp_ioapics[idx].mp_type = MP_IOAPIC;
+ mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
+ mp_ioapics[idx].mp_apicaddr = address;
+
+ set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+ mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+ mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+#else
+ mp_ioapics[idx].mp_apicver = 0;
+#endif
+ /*
+ * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+ * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+ */
+ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+ mp_ioapic_routing[idx].gsi_base = gsi_base;
+ mp_ioapic_routing[idx].gsi_end = gsi_base +
+ io_apic_get_redir_entries(idx);
+
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+ "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
+ mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+ nr_ioapics++;
+}
+
+static void assign_to_mp_irq(struct mp_config_intsrc *m,
+ struct mp_config_intsrc *mp_irq)
+{
+ memcpy(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
+ struct mp_config_intsrc *m)
+{
+ return memcmp(mp_irq, m, sizeof(struct mp_config_intsrc));
+}
+
+static void save_mp_irq(struct mp_config_intsrc *m)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!mp_irq_cmp(&mp_irqs[i], m))
+ return;
+ }
+
+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+ int ioapic;
+ int pin;
+ struct mp_config_intsrc mp_irq;
+
+ /* Skip the 8254 timer interrupt (IRQ 0) if requested. */
+ if (bus_irq == 0 && disable_irq0_through_ioapic)
+ return;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return;
+ pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ mp_irq.mp_type = MP_INTSRC;
+ mp_irq.mp_irqtype = mp_INT;
+ mp_irq.mp_irqflag = (trigger << 2) | polarity;
+ mp_irq.mp_srcbus = MP_ISA_BUS;
+ mp_irq.mp_srcbusirq = bus_irq; /* IRQ */
+ mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
+ mp_irq.mp_dstirq = pin; /* INTIN# */
+
+ save_mp_irq(&mp_irq);
+}
+
+void __init mp_config_acpi_legacy_irqs(void)
+{
+ int i;
+ int ioapic;
+ unsigned int dstapic;
+ struct mp_config_intsrc mp_irq;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+ /*
+ * Fabricate the legacy ISA bus (bus #31).
+ */
+ mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
+ Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+#ifdef CONFIG_X86_ES7000
+ /*
+ * Older generations of ES7000 have no legacy identity mappings
+ */
+ if (es7000_plat == 1)
+ return;
+#endif
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
+ */
+ ioapic = mp_find_ioapic(0);
+ if (ioapic < 0)
+ return;
+ dstapic = mp_ioapics[ioapic].mp_apicid;
+
+ /*
+ * Use the default configuration for the IRQs 0-15. Unless
+ * overridden by (MADT) interrupt source override entries.
+ */
+ for (i = 0; i < 16; i++) {
+ int idx;
+
+ /* Skip the 8254 timer interrupt (IRQ 0) if requested. */
+ if (i == 0 && disable_irq0_through_ioapic)
+ continue;
+
+ for (idx = 0; idx < mp_irq_entries; idx++) {
+ struct mp_config_intsrc *irq = mp_irqs + idx;
+
+ /* Do we already have a mapping for this ISA IRQ? */
+ if (irq->mp_srcbus == MP_ISA_BUS
+ && irq->mp_srcbusirq == i)
+ break;
+
+ /* Do we already have a mapping for this IOAPIC pin */
+ if (irq->mp_dstapic == dstapic &&
+ irq->mp_dstirq == i)
+ break;
+ }
+
+ if (idx != mp_irq_entries) {
+ printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+ continue; /* IRQ already used */
+ }
+
+ mp_irq.mp_type = MP_INTSRC;
+ mp_irq.mp_irqflag = 0; /* Conforming */
+ mp_irq.mp_srcbus = MP_ISA_BUS;
+ mp_irq.mp_dstapic = dstapic;
+ mp_irq.mp_irqtype = mp_INT;
+ mp_irq.mp_srcbusirq = i; /* Identity mapped */
+ mp_irq.mp_dstirq = i;
+
+ save_mp_irq(&mp_irq);
+ }
+}
+
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+#ifdef CONFIG_X86_32
+#define MAX_GSI_NUM 4096
+#define IRQ_COMPRESSION_START 64
+
+ static int pci_irq = IRQ_COMPRESSION_START;
+ /*
+ * Mapping between Global System Interrupts, which
+ * represent all possible interrupts, and IRQs
+ * assigned to actual devices.
+ */
+ static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return gsi;
+#endif
+
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return gsi;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+ return gsi;
+ }
+
+ ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#ifdef CONFIG_X86_32
+ if (ioapic_renumber_irq)
+ gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
+ * with redundant pin->gsi mappings (but unique PCI devices);
+ * we only program the IOAPIC on the first.
+ */
+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+ printk(KERN_ERR "Invalid reference to IOAPIC pin "
+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ ioapic_pin);
+ return gsi;
+ }
+ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+ Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+ mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
+ return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+ return gsi;
+#endif
+ }
+
+ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
+ /*
+ * For GSI >= 64, use IRQ compression
+ */
+ if ((gsi >= IRQ_COMPRESSION_START)
+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
+ /*
+ * For PCI devices assign IRQs in order, avoiding gaps
+ * due to unused I/O APIC pins.
+ */
+ int irq = gsi;
+ if (gsi < MAX_GSI_NUM) {
+ /*
+ * Retain the VIA chipset work-around (gsi > 15), but
+ * avoid a problem where the 8254 timer (IRQ0) is setup
+ * via an override (so it's not on pin 0 of the ioapic),
+ * and at the same time, the pin 0 interrupt is a PCI
+ * type. The gsi > 15 test could cause these two pins
+ * to be shared as IRQ0, and they are not shareable.
+ * So test for this condition, and if necessary, avoid
+ * the pin collision.
+ */
+ gsi = pci_irq++;
+ /*
+ * Don't assign IRQ used by ACPI SCI
+ */
+ if (gsi == acpi_gbl_FADT.sci_interrupt)
+ gsi = pci_irq++;
+ gsi_to_irq[irq] = gsi;
+ } else {
+ printk(KERN_ERR "GSI %u is too high\n", gsi);
+ return gsi;
+ }
+ }
+#endif
+ io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ return gsi;
+}
+
+int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+ u32 gsi, int triggering, int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+ struct mp_config_intsrc mp_irq;
+ int ioapic;
+
+ if (!acpi_ioapic)
+ return 0;
+
+ /* print the entry should happen on mptable identically */
+ mp_irq.mp_type = MP_INTSRC;
+ mp_irq.mp_irqtype = mp_INT;
+ mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.mp_srcbus = number;
+ mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
+ mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+ save_mp_irq(&mp_irq);
+#endif
+ return 0;
+}
+
/*
* Parse IOAPIC related entries in MADT
* returns 0 on success, < 0 on error
@@ -1061,6 +1425,17 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
}
/*
+ * Don't register any I/O APIC entries for the 8254 timer IRQ.
+ */
+static int __init
+dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d)
+{
+ pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident);
+ disable_irq0_through_ioapic = 1;
+ return 0;
+}
+
+/*
* If your system is blacklisted here, but you find that acpi=force
* works for you, please contact acpi-devel@sourceforge.net
*/
@@ -1227,6 +1602,32 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
},
},
+ /*
+ * HP laptops which use a DSDT reporting as HP/SB400/10000,
+ * which includes some code which overrides all temperature
+ * trip points to 16C if the INTIN2 input of the I/O APIC
+ * is enabled. This input is incorrectly designated the
+ * ISA IRQ 0 via an interrupt source override even though
+ * it is wired to the output of the master 8259A and INTIN0
+ * is not connected at all. Abandon any attempts to route
+ * IRQ 0 through the I/O APIC therefore.
+ */
+ {
+ .callback = dmi_disable_irq0_through_ioapic,
+ .ident = "HP NX6125 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+ },
+ },
+ {
+ .callback = dmi_disable_irq0_through_ioapic,
+ .ident = "HP NX6325 laptop",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+ },
+ },
{}
};
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
new file mode 100644
index 000000000000..f2766d84c7a0
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/gfp.h>
+#include <linux/bitops.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+#include <asm/proto.h>
+#include <asm/gart.h>
+#include <asm/amd_iommu_types.h>
+#include <asm/amd_iommu.h>
+
+#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
+
+#define to_pages(addr, size) \
+ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+static DEFINE_RWLOCK(amd_iommu_devtable_lock);
+
+struct command {
+ u32 data[4];
+};
+
+static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+ struct unity_map_entry *e);
+
+static int iommu_has_npcache(struct amd_iommu *iommu)
+{
+ return iommu->cap & IOMMU_CAP_NPCACHE;
+}
+
+static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+{
+ u32 tail, head;
+ u8 *target;
+
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+ target = (iommu->cmd_buf + tail);
+ memcpy_toio(target, cmd, sizeof(*cmd));
+ tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ if (tail == head)
+ return -ENOMEM;
+ writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+ return 0;
+}
+
+static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&iommu->lock, flags);
+ ret = __iommu_queue_command(iommu, cmd);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ return ret;
+}
+
+static int iommu_completion_wait(struct amd_iommu *iommu)
+{
+ int ret;
+ struct command cmd;
+ volatile u64 ready = 0;
+ unsigned long ready_phys = virt_to_phys(&ready);
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
+ cmd.data[1] = HIGH_U32(ready_phys);
+ cmd.data[2] = 1; /* value written to 'ready' */
+ CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+
+ iommu->need_sync = 0;
+
+ ret = iommu_queue_command(iommu, &cmd);
+
+ if (ret)
+ return ret;
+
+ while (!ready)
+ cpu_relax();
+
+ return 0;
+}
+
+static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
+{
+ struct command cmd;
+
+ BUG_ON(iommu == NULL);
+
+ memset(&cmd, 0, sizeof(cmd));
+ CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
+ cmd.data[0] = devid;
+
+ iommu->need_sync = 1;
+
+ return iommu_queue_command(iommu, &cmd);
+}
+
+static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
+ u64 address, u16 domid, int pde, int s)
+{
+ struct command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ address &= PAGE_MASK;
+ CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
+ cmd.data[1] |= domid;
+ cmd.data[2] = LOW_U32(address);
+ cmd.data[3] = HIGH_U32(address);
+ if (s)
+ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ if (pde)
+ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+
+ iommu->need_sync = 1;
+
+ return iommu_queue_command(iommu, &cmd);
+}
+
+static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
+ u64 address, size_t size)
+{
+ int s = 0;
+ unsigned pages = to_pages(address, size);
+
+ address &= PAGE_MASK;
+
+ if (pages > 1) {
+ /*
+ * If we have to flush more than one page, flush all
+ * TLB entries for this domain
+ */
+ address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ s = 1;
+ }
+
+ iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
+
+ return 0;
+}
+
+static int iommu_map(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long phys_addr,
+ int prot)
+{
+ u64 __pte, *pte, *page;
+
+ bus_addr = PAGE_ALIGN(bus_addr);
+ phys_addr = PAGE_ALIGN(bus_addr);
+
+ /* only support 512GB address spaces for now */
+ if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+ return -EINVAL;
+
+ pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ *pte = IOMMU_L2_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+ if (!IOMMU_PTE_PRESENT(*pte)) {
+ page = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ *pte = IOMMU_L1_PDE(virt_to_phys(page));
+ }
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+
+ if (IOMMU_PTE_PRESENT(*pte))
+ return -EBUSY;
+
+ __pte = phys_addr | IOMMU_PTE_P;
+ if (prot & IOMMU_PROT_IR)
+ __pte |= IOMMU_PTE_IR;
+ if (prot & IOMMU_PROT_IW)
+ __pte |= IOMMU_PTE_IW;
+
+ *pte = __pte;
+
+ return 0;
+}
+
+static int iommu_for_unity_map(struct amd_iommu *iommu,
+ struct unity_map_entry *entry)
+{
+ u16 bdf, i;
+
+ for (i = entry->devid_start; i <= entry->devid_end; ++i) {
+ bdf = amd_iommu_alias_table[i];
+ if (amd_iommu_rlookup_table[bdf] == iommu)
+ return 1;
+ }
+
+ return 0;
+}
+
+static int iommu_init_unity_mappings(struct amd_iommu *iommu)
+{
+ struct unity_map_entry *entry;
+ int ret;
+
+ list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+ if (!iommu_for_unity_map(iommu, entry))
+ continue;
+ ret = dma_ops_unity_map(iommu->default_dom, entry);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+ struct unity_map_entry *e)
+{
+ u64 addr;
+ int ret;
+
+ for (addr = e->address_start; addr < e->address_end;
+ addr += PAGE_SIZE) {
+ ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+ if (ret)
+ return ret;
+ /*
+ * if unity mapping is in aperture range mark the page
+ * as allocated in the aperture
+ */
+ if (addr < dma_dom->aperture_size)
+ __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+ }
+
+ return 0;
+}
+
+static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
+ u16 devid)
+{
+ struct unity_map_entry *e;
+ int ret;
+
+ list_for_each_entry(e, &amd_iommu_unity_map, list) {
+ if (!(devid >= e->devid_start && devid <= e->devid_end))
+ continue;
+ ret = dma_ops_unity_map(dma_dom, e);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static unsigned long dma_mask_to_pages(unsigned long mask)
+{
+ return (mask >> PAGE_SHIFT) +
+ (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
+}
+
+static unsigned long dma_ops_alloc_addresses(struct device *dev,
+ struct dma_ops_domain *dom,
+ unsigned int pages)
+{
+ unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+ unsigned long address;
+ unsigned long size = dom->aperture_size >> PAGE_SHIFT;
+ unsigned long boundary_size;
+
+ boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+ PAGE_SIZE) >> PAGE_SHIFT;
+ limit = limit < size ? limit : size;
+
+ if (dom->next_bit >= limit)
+ dom->next_bit = 0;
+
+ address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
+ 0 , boundary_size, 0);
+ if (address == -1)
+ address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
+ 0, boundary_size, 0);
+
+ if (likely(address != -1)) {
+ dom->next_bit = address + pages;
+ address <<= PAGE_SHIFT;
+ } else
+ address = bad_dma_address;
+
+ WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
+
+ return address;
+}
+
+static void dma_ops_free_addresses(struct dma_ops_domain *dom,
+ unsigned long address,
+ unsigned int pages)
+{
+ address >>= PAGE_SHIFT;
+ iommu_area_free(dom->bitmap, address, pages);
+}
+
+static u16 domain_id_alloc(void)
+{
+ unsigned long flags;
+ int id;
+
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
+ BUG_ON(id == 0);
+ if (id > 0 && id < MAX_DOMAIN_ID)
+ __set_bit(id, amd_iommu_pd_alloc_bitmap);
+ else
+ id = 0;
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ return id;
+}
+
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+ unsigned long start_page,
+ unsigned int pages)
+{
+ unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+
+ if (start_page + pages > last_page)
+ pages = last_page - start_page;
+
+ set_bit_string(dom->bitmap, start_page, pages);
+}
+
+static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+{
+ int i, j;
+ u64 *p1, *p2, *p3;
+
+ p1 = dma_dom->domain.pt_root;
+
+ if (!p1)
+ return;
+
+ for (i = 0; i < 512; ++i) {
+ if (!IOMMU_PTE_PRESENT(p1[i]))
+ continue;
+
+ p2 = IOMMU_PTE_PAGE(p1[i]);
+ for (j = 0; j < 512; ++i) {
+ if (!IOMMU_PTE_PRESENT(p2[j]))
+ continue;
+ p3 = IOMMU_PTE_PAGE(p2[j]);
+ free_page((unsigned long)p3);
+ }
+
+ free_page((unsigned long)p2);
+ }
+
+ free_page((unsigned long)p1);
+}
+
+static void dma_ops_domain_free(struct dma_ops_domain *dom)
+{
+ if (!dom)
+ return;
+
+ dma_ops_free_pagetable(dom);
+
+ kfree(dom->pte_pages);
+
+ kfree(dom->bitmap);
+
+ kfree(dom);
+}
+
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
+ unsigned order)
+{
+ struct dma_ops_domain *dma_dom;
+ unsigned i, num_pte_pages;
+ u64 *l2_pde;
+ u64 address;
+
+ /*
+ * Currently the DMA aperture must be between 32 MB and 1GB in size
+ */
+ if ((order < 25) || (order > 30))
+ return NULL;
+
+ dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
+ if (!dma_dom)
+ return NULL;
+
+ spin_lock_init(&dma_dom->domain.lock);
+
+ dma_dom->domain.id = domain_id_alloc();
+ if (dma_dom->domain.id == 0)
+ goto free_dma_dom;
+ dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+ dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+ dma_dom->domain.priv = dma_dom;
+ if (!dma_dom->domain.pt_root)
+ goto free_dma_dom;
+ dma_dom->aperture_size = (1ULL << order);
+ dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
+ GFP_KERNEL);
+ if (!dma_dom->bitmap)
+ goto free_dma_dom;
+ /*
+ * mark the first page as allocated so we never return 0 as
+ * a valid dma-address. So we can use 0 as error value
+ */
+ dma_dom->bitmap[0] = 1;
+ dma_dom->next_bit = 0;
+
+ if (iommu->exclusion_start &&
+ iommu->exclusion_start < dma_dom->aperture_size) {
+ unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+ int pages = to_pages(iommu->exclusion_start,
+ iommu->exclusion_length);
+ dma_ops_reserve_addresses(dma_dom, startpage, pages);
+ }
+
+ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
+ dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
+ GFP_KERNEL);
+ if (!dma_dom->pte_pages)
+ goto free_dma_dom;
+
+ l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (l2_pde == NULL)
+ goto free_dma_dom;
+
+ dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
+
+ for (i = 0; i < num_pte_pages; ++i) {
+ dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
+ if (!dma_dom->pte_pages[i])
+ goto free_dma_dom;
+ address = virt_to_phys(dma_dom->pte_pages[i]);
+ l2_pde[i] = IOMMU_L1_PDE(address);
+ }
+
+ return dma_dom;
+
+free_dma_dom:
+ dma_ops_domain_free(dma_dom);
+
+ return NULL;
+}
+
+static struct protection_domain *domain_for_device(u16 devid)
+{
+ struct protection_domain *dom;
+ unsigned long flags;
+
+ read_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ dom = amd_iommu_pd_table[devid];
+ read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ return dom;
+}
+
+static void set_device_domain(struct amd_iommu *iommu,
+ struct protection_domain *domain,
+ u16 devid)
+{
+ unsigned long flags;
+
+ u64 pte_root = virt_to_phys(domain->pt_root);
+
+ pte_root |= (domain->mode & 0x07) << 9;
+ pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ amd_iommu_dev_table[devid].data[0] = pte_root;
+ amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+ amd_iommu_dev_table[devid].data[2] = domain->id;
+
+ amd_iommu_pd_table[devid] = domain;
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ iommu_queue_inv_dev_entry(iommu, devid);
+
+ iommu->need_sync = 1;
+}
+
+static int get_device_resources(struct device *dev,
+ struct amd_iommu **iommu,
+ struct protection_domain **domain,
+ u16 *bdf)
+{
+ struct dma_ops_domain *dma_dom;
+ struct pci_dev *pcidev;
+ u16 _bdf;
+
+ BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+
+ pcidev = to_pci_dev(dev);
+ _bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+
+ if (_bdf >= amd_iommu_last_bdf) {
+ *iommu = NULL;
+ *domain = NULL;
+ *bdf = 0xffff;
+ return 0;
+ }
+
+ *bdf = amd_iommu_alias_table[_bdf];
+
+ *iommu = amd_iommu_rlookup_table[*bdf];
+ if (*iommu == NULL)
+ return 0;
+ dma_dom = (*iommu)->default_dom;
+ *domain = domain_for_device(*bdf);
+ if (*domain == NULL) {
+ *domain = &dma_dom->domain;
+ set_device_domain(*iommu, *domain, *bdf);
+ printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+ "device ", (*domain)->id);
+ print_devid(_bdf, 1);
+ }
+
+ return 1;
+}
+
+static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
+ struct dma_ops_domain *dom,
+ unsigned long address,
+ phys_addr_t paddr,
+ int direction)
+{
+ u64 *pte, __pte;
+
+ WARN_ON(address > dom->aperture_size);
+
+ paddr &= PAGE_MASK;
+
+ pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
+
+ if (direction == DMA_TO_DEVICE)
+ __pte |= IOMMU_PTE_IR;
+ else if (direction == DMA_FROM_DEVICE)
+ __pte |= IOMMU_PTE_IW;
+ else if (direction == DMA_BIDIRECTIONAL)
+ __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
+
+ WARN_ON(*pte);
+
+ *pte = __pte;
+
+ return (dma_addr_t)address;
+}
+
+static void dma_ops_domain_unmap(struct amd_iommu *iommu,
+ struct dma_ops_domain *dom,
+ unsigned long address)
+{
+ u64 *pte;
+
+ if (address >= dom->aperture_size)
+ return;
+
+ WARN_ON(address & 0xfffULL || address > dom->aperture_size);
+
+ pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+ pte += IOMMU_PTE_L0_INDEX(address);
+
+ WARN_ON(!*pte);
+
+ *pte = 0ULL;
+}
+
+static dma_addr_t __map_single(struct device *dev,
+ struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ phys_addr_t paddr,
+ size_t size,
+ int dir)
+{
+ dma_addr_t offset = paddr & ~PAGE_MASK;
+ dma_addr_t address, start;
+ unsigned int pages;
+ int i;
+
+ pages = to_pages(paddr, size);
+ paddr &= PAGE_MASK;
+
+ address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+ if (unlikely(address == bad_dma_address))
+ goto out;
+
+ start = address;
+ for (i = 0; i < pages; ++i) {
+ dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+ paddr += PAGE_SIZE;
+ start += PAGE_SIZE;
+ }
+ address += offset;
+
+out:
+ return address;
+}
+
+static void __unmap_single(struct amd_iommu *iommu,
+ struct dma_ops_domain *dma_dom,
+ dma_addr_t dma_addr,
+ size_t size,
+ int dir)
+{
+ dma_addr_t i, start;
+ unsigned int pages;
+
+ if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
+ return;
+
+ pages = to_pages(dma_addr, size);
+ dma_addr &= PAGE_MASK;
+ start = dma_addr;
+
+ for (i = 0; i < pages; ++i) {
+ dma_ops_domain_unmap(iommu, dma_dom, start);
+ start += PAGE_SIZE;
+ }
+
+ dma_ops_free_addresses(dma_dom, dma_addr, pages);
+}
+
+static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
+ size_t size, int dir)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ dma_addr_t addr;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (iommu == NULL || domain == NULL)
+ return (dma_addr_t)paddr;
+
+ spin_lock_irqsave(&domain->lock, flags);
+ addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+ if (addr == bad_dma_address)
+ goto out;
+
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return addr;
+}
+
+static void unmap_single(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int dir)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+
+ if (!get_device_resources(dev, &iommu, &domain, &devid))
+ return;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ __unmap_single(iommu, domain->priv, dma_addr, size, dir);
+
+ iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+
+static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ struct scatterlist *s;
+ int i;
+
+ for_each_sg(sglist, s, nelems, i) {
+ s->dma_address = (dma_addr_t)sg_phys(s);
+ s->dma_length = s->length;
+ }
+
+ return nelems;
+}
+
+static int map_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ int i;
+ struct scatterlist *s;
+ phys_addr_t paddr;
+ int mapped_elems = 0;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain)
+ return map_sg_no_iommu(dev, sglist, nelems, dir);
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ for_each_sg(sglist, s, nelems, i) {
+ paddr = sg_phys(s);
+
+ s->dma_address = __map_single(dev, iommu, domain->priv,
+ paddr, s->length, dir);
+
+ if (s->dma_address) {
+ s->dma_length = s->length;
+ mapped_elems++;
+ } else
+ goto unmap;
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, s->dma_address,
+ s->dma_length);
+ }
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return mapped_elems;
+unmap:
+ for_each_sg(sglist, s, mapped_elems, i) {
+ if (s->dma_address)
+ __unmap_single(iommu, domain->priv, s->dma_address,
+ s->dma_length, dir);
+ s->dma_address = s->dma_length = 0;
+ }
+
+ mapped_elems = 0;
+
+ goto out;
+}
+
+static void unmap_sg(struct device *dev, struct scatterlist *sglist,
+ int nelems, int dir)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ struct scatterlist *s;
+ u16 devid;
+ int i;
+
+ if (!get_device_resources(dev, &iommu, &domain, &devid))
+ return;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ for_each_sg(sglist, s, nelems, i) {
+ __unmap_single(iommu, domain->priv, s->dma_address,
+ s->dma_length, dir);
+ iommu_flush_pages(iommu, domain->id, s->dma_address,
+ s->dma_length);
+ s->dma_address = s->dma_length = 0;
+ }
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+}
+
+static void *alloc_coherent(struct device *dev, size_t size,
+ dma_addr_t *dma_addr, gfp_t flag)
+{
+ unsigned long flags;
+ void *virt_addr;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+ phys_addr_t paddr;
+
+ virt_addr = (void *)__get_free_pages(flag, get_order(size));
+ if (!virt_addr)
+ return 0;
+
+ memset(virt_addr, 0, size);
+ paddr = virt_to_phys(virt_addr);
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain) {
+ *dma_addr = (dma_addr_t)paddr;
+ return virt_addr;
+ }
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+ size, DMA_BIDIRECTIONAL);
+
+ if (*dma_addr == bad_dma_address) {
+ free_pages((unsigned long)virt_addr, get_order(size));
+ virt_addr = NULL;
+ goto out;
+ }
+
+ if (iommu_has_npcache(iommu))
+ iommu_flush_pages(iommu, domain->id, *dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+out:
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+ return virt_addr;
+}
+
+static void free_coherent(struct device *dev, size_t size,
+ void *virt_addr, dma_addr_t dma_addr)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct protection_domain *domain;
+ u16 devid;
+
+ get_device_resources(dev, &iommu, &domain, &devid);
+
+ if (!iommu || !domain)
+ goto free_mem;
+
+ spin_lock_irqsave(&domain->lock, flags);
+
+ __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
+ iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+ if (iommu->need_sync)
+ iommu_completion_wait(iommu);
+
+ spin_unlock_irqrestore(&domain->lock, flags);
+
+free_mem:
+ free_pages((unsigned long)virt_addr, get_order(size));
+}
+
+/*
+ * If the driver core informs the DMA layer if a driver grabs a device
+ * we don't need to preallocate the protection domains anymore.
+ * For now we have to.
+ */
+void prealloc_protection_domains(void)
+{
+ struct pci_dev *dev = NULL;
+ struct dma_ops_domain *dma_dom;
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ u16 devid;
+
+ while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+ devid = (dev->bus->number << 8) | dev->devfn;
+ if (devid >= amd_iommu_last_bdf)
+ continue;
+ devid = amd_iommu_alias_table[devid];
+ if (domain_for_device(devid))
+ continue;
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ continue;
+ dma_dom = dma_ops_domain_alloc(iommu, order);
+ if (!dma_dom)
+ continue;
+ init_unity_mappings_for_device(dma_dom, devid);
+ set_device_domain(iommu, &dma_dom->domain, devid);
+ printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
+ dma_dom->domain.id);
+ print_devid(devid, 1);
+ }
+}
+
+static struct dma_mapping_ops amd_iommu_dma_ops = {
+ .alloc_coherent = alloc_coherent,
+ .free_coherent = free_coherent,
+ .map_single = map_single,
+ .unmap_single = unmap_single,
+ .map_sg = map_sg,
+ .unmap_sg = unmap_sg,
+};
+
+int __init amd_iommu_init_dma_ops(void)
+{
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ int ret;
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+ if (iommu->default_dom == NULL)
+ return -ENOMEM;
+ ret = iommu_init_unity_mappings(iommu);
+ if (ret)
+ goto free_domains;
+ }
+
+ if (amd_iommu_isolate)
+ prealloc_protection_domains();
+
+ iommu_detected = 1;
+ force_iommu = 1;
+ bad_dma_address = 0;
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_aperture_disabled = 1;
+ gart_iommu_aperture = 0;
+#endif
+
+ dma_ops = &amd_iommu_dma_ops;
+
+ return 0;
+
+free_domains:
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ if (iommu->default_dom)
+ dma_ops_domain_free(iommu->default_dom);
+ }
+
+ return ret;
+}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
new file mode 100644
index 000000000000..2a13e430437d
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ * Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/sysdev.h>
+#include <asm/pci-direct.h>
+#include <asm/amd_iommu_types.h>
+#include <asm/amd_iommu.h>
+#include <asm/gart.h>
+
+/*
+ * definitions for the ACPI scanning code
+ */
+#define UPDATE_LAST_BDF(x) do {\
+ if ((x) > amd_iommu_last_bdf) \
+ amd_iommu_last_bdf = (x); \
+ } while (0);
+
+#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
+#define PCI_BUS(x) (((x) >> 8) & 0xff)
+#define IVRS_HEADER_LENGTH 48
+#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
+
+#define ACPI_IVHD_TYPE 0x10
+#define ACPI_IVMD_TYPE_ALL 0x20
+#define ACPI_IVMD_TYPE 0x21
+#define ACPI_IVMD_TYPE_RANGE 0x22
+
+#define IVHD_DEV_ALL 0x01
+#define IVHD_DEV_SELECT 0x02
+#define IVHD_DEV_SELECT_RANGE_START 0x03
+#define IVHD_DEV_RANGE_END 0x04
+#define IVHD_DEV_ALIAS 0x42
+#define IVHD_DEV_ALIAS_RANGE 0x43
+#define IVHD_DEV_EXT_SELECT 0x46
+#define IVHD_DEV_EXT_SELECT_RANGE 0x47
+
+#define IVHD_FLAG_HT_TUN_EN 0x00
+#define IVHD_FLAG_PASSPW_EN 0x01
+#define IVHD_FLAG_RESPASSPW_EN 0x02
+#define IVHD_FLAG_ISOC_EN 0x03
+
+#define IVMD_FLAG_EXCL_RANGE 0x08
+#define IVMD_FLAG_UNITY_MAP 0x01
+
+#define ACPI_DEVFLAG_INITPASS 0x01
+#define ACPI_DEVFLAG_EXTINT 0x02
+#define ACPI_DEVFLAG_NMI 0x04
+#define ACPI_DEVFLAG_SYSMGT1 0x10
+#define ACPI_DEVFLAG_SYSMGT2 0x20
+#define ACPI_DEVFLAG_LINT0 0x40
+#define ACPI_DEVFLAG_LINT1 0x80
+#define ACPI_DEVFLAG_ATSDIS 0x10000000
+
+struct ivhd_header {
+ u8 type;
+ u8 flags;
+ u16 length;
+ u16 devid;
+ u16 cap_ptr;
+ u64 mmio_phys;
+ u16 pci_seg;
+ u16 info;
+ u32 reserved;
+} __attribute__((packed));
+
+struct ivhd_entry {
+ u8 type;
+ u16 devid;
+ u8 flags;
+ u32 ext;
+} __attribute__((packed));
+
+struct ivmd_header {
+ u8 type;
+ u8 flags;
+ u16 length;
+ u16 devid;
+ u16 aux;
+ u64 resv;
+ u64 range_start;
+ u64 range_length;
+} __attribute__((packed));
+
+static int __initdata amd_iommu_detected;
+
+u16 amd_iommu_last_bdf;
+struct list_head amd_iommu_unity_map;
+unsigned amd_iommu_aperture_order = 26;
+int amd_iommu_isolate;
+
+struct list_head amd_iommu_list;
+struct dev_table_entry *amd_iommu_dev_table;
+u16 *amd_iommu_alias_table;
+struct amd_iommu **amd_iommu_rlookup_table;
+struct protection_domain **amd_iommu_pd_table;
+unsigned long *amd_iommu_pd_alloc_bitmap;
+
+static u32 dev_table_size;
+static u32 alias_table_size;
+static u32 rlookup_table_size;
+
+static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+{
+ u64 start = iommu->exclusion_start & PAGE_MASK;
+ u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
+ u64 entry;
+
+ if (!iommu->exclusion_start)
+ return;
+
+ entry = start | MMIO_EXCL_ENABLE_MASK;
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+ &entry, sizeof(entry));
+
+ entry = limit;
+ memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+ &entry, sizeof(entry));
+}
+
+static void __init iommu_set_device_table(struct amd_iommu *iommu)
+{
+ u32 entry;
+
+ BUG_ON(iommu->mmio_base == NULL);
+
+ entry = virt_to_phys(amd_iommu_dev_table);
+ entry |= (dev_table_size >> 12) - 1;
+ memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
+ &entry, sizeof(entry));
+}
+
+static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+{
+ u32 ctrl;
+
+ ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl |= (1 << bit);
+ writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+{
+ u32 ctrl;
+
+ ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+ ctrl &= ~(1 << bit);
+ writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+void __init iommu_enable(struct amd_iommu *iommu)
+{
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
+ print_devid(iommu->devid, 0);
+ printk(" cap 0x%hx\n", iommu->cap_ptr);
+
+ iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
+}
+
+static u8 * __init iommu_map_mmio_space(u64 address)
+{
+ u8 *ret;
+
+ if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+ return NULL;
+
+ ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
+ if (ret != NULL)
+ return ret;
+
+ release_mem_region(address, MMIO_REGION_LENGTH);
+
+ return NULL;
+}
+
+static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
+{
+ if (iommu->mmio_base)
+ iounmap(iommu->mmio_base);
+ release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
+}
+
+static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
+{
+ u32 cap;
+
+ cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
+ UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+
+ return 0;
+}
+
+static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
+{
+ u8 *p = (void *)h, *end = (void *)h;
+ struct ivhd_entry *dev;
+
+ p += sizeof(*h);
+ end += h->length;
+
+ find_last_devid_on_pci(PCI_BUS(h->devid),
+ PCI_SLOT(h->devid),
+ PCI_FUNC(h->devid),
+ h->cap_ptr);
+
+ while (p < end) {
+ dev = (struct ivhd_entry *)p;
+ switch (dev->type) {
+ case IVHD_DEV_SELECT:
+ case IVHD_DEV_RANGE_END:
+ case IVHD_DEV_ALIAS:
+ case IVHD_DEV_EXT_SELECT:
+ UPDATE_LAST_BDF(dev->devid);
+ break;
+ default:
+ break;
+ }
+ p += 0x04 << (*p >> 6);
+ }
+
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+static int __init find_last_devid_acpi(struct acpi_table_header *table)
+{
+ int i;
+ u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
+ struct ivhd_header *h;
+
+ /*
+ * Validate checksum here so we don't need to do it when
+ * we actually parse the table
+ */
+ for (i = 0; i < table->length; ++i)
+ checksum += p[i];
+ if (checksum != 0)
+ /* ACPI table corrupt */
+ return -ENODEV;
+
+ p += IVRS_HEADER_LENGTH;
+
+ end += table->length;
+ while (p < end) {
+ h = (struct ivhd_header *)p;
+ switch (h->type) {
+ case ACPI_IVHD_TYPE:
+ find_last_devid_from_ivhd(h);
+ break;
+ default:
+ break;
+ }
+ p += h->length;
+ }
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
+{
+ u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
+ get_order(CMD_BUFFER_SIZE));
+ u64 entry = 0;
+
+ if (cmd_buf == NULL)
+ return NULL;
+
+ iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+
+ memset(cmd_buf, 0, CMD_BUFFER_SIZE);
+
+ entry = (u64)virt_to_phys(cmd_buf);
+ entry |= MMIO_CMD_SIZE_512;
+ memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
+ &entry, sizeof(entry));
+
+ iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+
+ return cmd_buf;
+}
+
+static void __init free_command_buffer(struct amd_iommu *iommu)
+{
+ if (iommu->cmd_buf)
+ free_pages((unsigned long)iommu->cmd_buf,
+ get_order(CMD_BUFFER_SIZE));
+}
+
+static void set_dev_entry_bit(u16 devid, u8 bit)
+{
+ int i = (bit >> 5) & 0x07;
+ int _bit = bit & 0x1f;
+
+ amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
+}
+
+static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+{
+ if (flags & ACPI_DEVFLAG_INITPASS)
+ set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
+ if (flags & ACPI_DEVFLAG_EXTINT)
+ set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
+ if (flags & ACPI_DEVFLAG_NMI)
+ set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
+ if (flags & ACPI_DEVFLAG_SYSMGT1)
+ set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
+ if (flags & ACPI_DEVFLAG_SYSMGT2)
+ set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
+ if (flags & ACPI_DEVFLAG_LINT0)
+ set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
+ if (flags & ACPI_DEVFLAG_LINT1)
+ set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+}
+
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+ amd_iommu_rlookup_table[devid] = iommu;
+}
+
+static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
+{
+ struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+ if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
+ return;
+
+ if (iommu) {
+ set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
+ iommu->exclusion_start = m->range_start;
+ iommu->exclusion_length = m->range_length;
+ }
+}
+
+static void __init init_iommu_from_pci(struct amd_iommu *iommu)
+{
+ int bus = PCI_BUS(iommu->devid);
+ int dev = PCI_SLOT(iommu->devid);
+ int fn = PCI_FUNC(iommu->devid);
+ int cap_ptr = iommu->cap_ptr;
+ u32 range;
+
+ iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
+
+ range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
+ iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
+ iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+}
+
+static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
+ struct ivhd_header *h)
+{
+ u8 *p = (u8 *)h;
+ u8 *end = p, flags = 0;
+ u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
+ u32 ext_flags = 0;
+ bool alias = 0;
+ struct ivhd_entry *e;
+
+ /*
+ * First set the recommended feature enable bits from ACPI
+ * into the IOMMU control registers
+ */
+ h->flags & IVHD_FLAG_HT_TUN_EN ?
+ iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+ iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+
+ h->flags & IVHD_FLAG_PASSPW_EN ?
+ iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+
+ h->flags & IVHD_FLAG_RESPASSPW_EN ?
+ iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+ iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+
+ h->flags & IVHD_FLAG_ISOC_EN ?
+ iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+ iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+ /*
+ * make IOMMU memory accesses cache coherent
+ */
+ iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+
+ /*
+ * Done. Now parse the device entries
+ */
+ p += sizeof(struct ivhd_header);
+ end += h->length;
+
+ while (p < end) {
+ e = (struct ivhd_entry *)p;
+ switch (e->type) {
+ case IVHD_DEV_ALL:
+ for (dev_i = iommu->first_device;
+ dev_i <= iommu->last_device; ++dev_i)
+ set_dev_entry_from_acpi(dev_i, e->flags, 0);
+ break;
+ case IVHD_DEV_SELECT:
+ devid = e->devid;
+ set_dev_entry_from_acpi(devid, e->flags, 0);
+ break;
+ case IVHD_DEV_SELECT_RANGE_START:
+ devid_start = e->devid;
+ flags = e->flags;
+ ext_flags = 0;
+ alias = 0;
+ break;
+ case IVHD_DEV_ALIAS:
+ devid = e->devid;
+ devid_to = e->ext >> 8;
+ set_dev_entry_from_acpi(devid, e->flags, 0);
+ amd_iommu_alias_table[devid] = devid_to;
+ break;
+ case IVHD_DEV_ALIAS_RANGE:
+ devid_start = e->devid;
+ flags = e->flags;
+ devid_to = e->ext >> 8;
+ ext_flags = 0;
+ alias = 1;
+ break;
+ case IVHD_DEV_EXT_SELECT:
+ devid = e->devid;
+ set_dev_entry_from_acpi(devid, e->flags, e->ext);
+ break;
+ case IVHD_DEV_EXT_SELECT_RANGE:
+ devid_start = e->devid;
+ flags = e->flags;
+ ext_flags = e->ext;
+ alias = 0;
+ break;
+ case IVHD_DEV_RANGE_END:
+ devid = e->devid;
+ for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
+ if (alias)
+ amd_iommu_alias_table[dev_i] = devid_to;
+ set_dev_entry_from_acpi(
+ amd_iommu_alias_table[dev_i],
+ flags, ext_flags);
+ }
+ break;
+ default:
+ break;
+ }
+
+ p += 0x04 << (e->type >> 6);
+ }
+}
+
+static int __init init_iommu_devices(struct amd_iommu *iommu)
+{
+ u16 i;
+
+ for (i = iommu->first_device; i <= iommu->last_device; ++i)
+ set_iommu_for_device(iommu, i);
+
+ return 0;
+}
+
+static void __init free_iommu_one(struct amd_iommu *iommu)
+{
+ free_command_buffer(iommu);
+ iommu_unmap_mmio_space(iommu);
+}
+
+static void __init free_iommu_all(void)
+{
+ struct amd_iommu *iommu, *next;
+
+ list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+ list_del(&iommu->list);
+ free_iommu_one(iommu);
+ kfree(iommu);
+ }
+}
+
+static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
+{
+ spin_lock_init(&iommu->lock);
+ list_add_tail(&iommu->list, &amd_iommu_list);
+
+ /*
+ * Copy data from ACPI table entry to the iommu struct
+ */
+ iommu->devid = h->devid;
+ iommu->cap_ptr = h->cap_ptr;
+ iommu->mmio_phys = h->mmio_phys;
+ iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
+ if (!iommu->mmio_base)
+ return -ENOMEM;
+
+ iommu_set_device_table(iommu);
+ iommu->cmd_buf = alloc_command_buffer(iommu);
+ if (!iommu->cmd_buf)
+ return -ENOMEM;
+
+ init_iommu_from_pci(iommu);
+ init_iommu_from_acpi(iommu, h);
+ init_iommu_devices(iommu);
+
+ return 0;
+}
+
+static int __init init_iommu_all(struct acpi_table_header *table)
+{
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivhd_header *h;
+ struct amd_iommu *iommu;
+ int ret;
+
+ INIT_LIST_HEAD(&amd_iommu_list);
+
+ end += table->length;
+ p += IVRS_HEADER_LENGTH;
+
+ while (p < end) {
+ h = (struct ivhd_header *)p;
+ switch (*p) {
+ case ACPI_IVHD_TYPE:
+ iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
+ if (iommu == NULL)
+ return -ENOMEM;
+ ret = init_iommu_one(iommu, h);
+ if (ret)
+ return ret;
+ break;
+ default:
+ break;
+ }
+ p += h->length;
+
+ }
+ WARN_ON(p != end);
+
+ return 0;
+}
+
+static void __init free_unity_maps(void)
+{
+ struct unity_map_entry *entry, *next;
+
+ list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
+ list_del(&entry->list);
+ kfree(entry);
+ }
+}
+
+static int __init init_exclusion_range(struct ivmd_header *m)
+{
+ int i;
+
+ switch (m->type) {
+ case ACPI_IVMD_TYPE:
+ set_device_exclusion_range(m->devid, m);
+ break;
+ case ACPI_IVMD_TYPE_ALL:
+ for (i = 0; i < amd_iommu_last_bdf; ++i)
+ set_device_exclusion_range(i, m);
+ break;
+ case ACPI_IVMD_TYPE_RANGE:
+ for (i = m->devid; i <= m->aux; ++i)
+ set_device_exclusion_range(i, m);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static int __init init_unity_map_range(struct ivmd_header *m)
+{
+ struct unity_map_entry *e = 0;
+
+ e = kzalloc(sizeof(*e), GFP_KERNEL);
+ if (e == NULL)
+ return -ENOMEM;
+
+ switch (m->type) {
+ default:
+ case ACPI_IVMD_TYPE:
+ e->devid_start = e->devid_end = m->devid;
+ break;
+ case ACPI_IVMD_TYPE_ALL:
+ e->devid_start = 0;
+ e->devid_end = amd_iommu_last_bdf;
+ break;
+ case ACPI_IVMD_TYPE_RANGE:
+ e->devid_start = m->devid;
+ e->devid_end = m->aux;
+ break;
+ }
+ e->address_start = PAGE_ALIGN(m->range_start);
+ e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
+ e->prot = m->flags >> 1;
+
+ list_add_tail(&e->list, &amd_iommu_unity_map);
+
+ return 0;
+}
+
+static int __init init_memory_definitions(struct acpi_table_header *table)
+{
+ u8 *p = (u8 *)table, *end = (u8 *)table;
+ struct ivmd_header *m;
+
+ INIT_LIST_HEAD(&amd_iommu_unity_map);
+
+ end += table->length;
+ p += IVRS_HEADER_LENGTH;
+
+ while (p < end) {
+ m = (struct ivmd_header *)p;
+ if (m->flags & IVMD_FLAG_EXCL_RANGE)
+ init_exclusion_range(m);
+ else if (m->flags & IVMD_FLAG_UNITY_MAP)
+ init_unity_map_range(m);
+
+ p += m->length;
+ }
+
+ return 0;
+}
+
+static void __init enable_iommus(void)
+{
+ struct amd_iommu *iommu;
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ iommu_set_exclusion_range(iommu);
+ iommu_enable(iommu);
+ }
+}
+
+/*
+ * Suspend/Resume support
+ * disable suspend until real resume implemented
+ */
+
+static int amd_iommu_resume(struct sys_device *dev)
+{
+ return 0;
+}
+
+static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return -EINVAL;
+}
+
+static struct sysdev_class amd_iommu_sysdev_class = {
+ .name = "amd_iommu",
+ .suspend = amd_iommu_suspend,
+ .resume = amd_iommu_resume,
+};
+
+static struct sys_device device_amd_iommu = {
+ .id = 0,
+ .cls = &amd_iommu_sysdev_class,
+};
+
+int __init amd_iommu_init(void)
+{
+ int i, ret = 0;
+
+
+ if (no_iommu) {
+ printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+ return 0;
+ }
+
+ if (!amd_iommu_detected)
+ return -ENODEV;
+
+ /*
+ * First parse ACPI tables to find the largest Bus/Dev/Func
+ * we need to handle. Upon this information the shared data
+ * structures for the IOMMUs in the system will be allocated
+ */
+ if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
+ return -ENODEV;
+
+ dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
+ alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
+ rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+
+ ret = -ENOMEM;
+
+ /* Device table - directly used by all IOMMUs */
+ amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(dev_table_size));
+ if (amd_iommu_dev_table == NULL)
+ goto out;
+
+ /*
+ * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
+ * IOMMU see for that device
+ */
+ amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(alias_table_size));
+ if (amd_iommu_alias_table == NULL)
+ goto free;
+
+ /* IOMMU rlookup table - find the IOMMU for a specific device */
+ amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(rlookup_table_size));
+ if (amd_iommu_rlookup_table == NULL)
+ goto free;
+
+ /*
+ * Protection Domain table - maps devices to protection domains
+ * This table has the same size as the rlookup_table
+ */
+ amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(rlookup_table_size));
+ if (amd_iommu_pd_table == NULL)
+ goto free;
+
+ amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(MAX_DOMAIN_ID/8));
+ if (amd_iommu_pd_alloc_bitmap == NULL)
+ goto free;
+
+ /*
+ * memory is allocated now; initialize the device table with all zeroes
+ * and let all alias entries point to itself
+ */
+ memset(amd_iommu_dev_table, 0, dev_table_size);
+ for (i = 0; i < amd_iommu_last_bdf; ++i)
+ amd_iommu_alias_table[i] = i;
+
+ memset(amd_iommu_pd_table, 0, rlookup_table_size);
+ memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
+
+ /*
+ * never allocate domain 0 because its used as the non-allocated and
+ * error value placeholder
+ */
+ amd_iommu_pd_alloc_bitmap[0] = 1;
+
+ /*
+ * now the data structures are allocated and basically initialized
+ * start the real acpi table scan
+ */
+ ret = -ENODEV;
+ if (acpi_table_parse("IVRS", init_iommu_all) != 0)
+ goto free;
+
+ if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
+ goto free;
+
+ ret = amd_iommu_init_dma_ops();
+ if (ret)
+ goto free;
+
+ ret = sysdev_class_register(&amd_iommu_sysdev_class);
+ if (ret)
+ goto free;
+
+ ret = sysdev_register(&device_amd_iommu);
+ if (ret)
+ goto free;
+
+ enable_iommus();
+
+ printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
+ (1 << (amd_iommu_aperture_order-20)));
+
+ printk(KERN_INFO "AMD IOMMU: device isolation ");
+ if (amd_iommu_isolate)
+ printk("enabled\n");
+ else
+ printk("disabled\n");
+
+out:
+ return ret;
+
+free:
+ if (amd_iommu_pd_alloc_bitmap)
+ free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+
+ if (amd_iommu_pd_table)
+ free_pages((unsigned long)amd_iommu_pd_table,
+ get_order(rlookup_table_size));
+
+ if (amd_iommu_rlookup_table)
+ free_pages((unsigned long)amd_iommu_rlookup_table,
+ get_order(rlookup_table_size));
+
+ if (amd_iommu_alias_table)
+ free_pages((unsigned long)amd_iommu_alias_table,
+ get_order(alias_table_size));
+
+ if (amd_iommu_dev_table)
+ free_pages((unsigned long)amd_iommu_dev_table,
+ get_order(dev_table_size));
+
+ free_iommu_all();
+
+ free_unity_maps();
+
+ goto out;
+}
+
+static int __init early_amd_iommu_detect(struct acpi_table_header *table)
+{
+ return 0;
+}
+
+void __init amd_iommu_detect(void)
+{
+ if (swiotlb || no_iommu || iommu_detected)
+ return;
+
+ if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
+ iommu_detected = 1;
+ amd_iommu_detected = 1;
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_aperture_disabled = 1;
+ gart_iommu_aperture = 0;
+#endif
+ }
+}
+
+static int __init parse_amd_iommu_options(char *str)
+{
+ for (; *str; ++str) {
+ if (strcmp(str, "isolate") == 0)
+ amd_iommu_isolate = 1;
+ }
+
+ return 1;
+}
+
+static int __init parse_amd_iommu_size_options(char *str)
+{
+ for (; *str; ++str) {
+ if (strcmp(str, "32M") == 0)
+ amd_iommu_aperture_order = 25;
+ if (strcmp(str, "64M") == 0)
+ amd_iommu_aperture_order = 26;
+ if (strcmp(str, "128M") == 0)
+ amd_iommu_aperture_order = 27;
+ if (strcmp(str, "256M") == 0)
+ amd_iommu_aperture_order = 28;
+ if (strcmp(str, "512M") == 0)
+ amd_iommu_aperture_order = 29;
+ if (strcmp(str, "1G") == 0)
+ amd_iommu_aperture_order = 30;
+ }
+
+ return 1;
+}
+
+__setup("amd_iommu=", parse_amd_iommu_options);
+__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 479926d9e004..600470d464fa 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -35,6 +35,18 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
+struct bus_dev_range {
+ int bus;
+ int dev_base;
+ int dev_limit;
+};
+
+static struct bus_dev_range bus_dev_ranges[] __initdata = {
+ { 0x00, 0x18, 0x20},
+ { 0xff, 0x00, 0x20},
+ { 0xfe, 0x00, 0x20}
+};
+
static struct resource gart_resource = {
.name = "GART",
.flags = IORESOURCE_MEM,
@@ -55,8 +67,9 @@ static u32 __init allocate_aperture(void)
u32 aper_size;
void *p;
- if (fallback_aper_order > 7)
- fallback_aper_order = 7;
+ /* aper_size should <= 1G */
+ if (fallback_aper_order > 5)
+ fallback_aper_order = 5;
aper_size = (32 * 1024 * 1024) << fallback_aper_order;
/*
@@ -65,7 +78,20 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
+ /*
+ * using 512M as goal, in case kexec will load kernel_big
+ * that will do the on position decompress, and could overlap with
+ * that positon with gart that is used.
+ * sequende:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
+ * ==> kernel_small(gart area become e820_reserved)
+ * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
+ * ==> kerne_big (uncompressed size will be big than 64M or 128M)
+ * so don't use 512M below as gart iommu, leave the space for kernel
+ * code for safe
+ */
+ p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
if (!p || __pa(p)+aper_size > 0xffffffff) {
printk(KERN_ERR
"Cannot allocate aperture memory hole (%p,%uK)\n",
@@ -83,69 +109,53 @@ static u32 __init allocate_aperture(void)
return (u32)__pa(p);
}
-static int __init aperture_valid(u64 aper_base, u32 aper_size)
-{
- if (!aper_base)
- return 0;
-
- if (aper_base + aper_size > 0x100000000UL) {
- printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
- return 0;
- }
- if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
- printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
- return 0;
- }
- if (aper_size < 64*1024*1024) {
- printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
- return 0;
- }
-
- return 1;
-}
/* Find a PCI capability */
-static __u32 __init find_cap(int num, int slot, int func, int cap)
+static u32 __init find_cap(int bus, int slot, int func, int cap)
{
int bytes;
u8 pos;
- if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+ if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
PCI_STATUS_CAP_LIST))
return 0;
- pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+ pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
u8 id;
pos &= ~3;
- id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
+ id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
if (id == 0xff)
break;
if (id == cap)
return pos;
- pos = read_pci_config_byte(num, slot, func,
+ pos = read_pci_config_byte(bus, slot, func,
pos+PCI_CAP_LIST_NEXT);
}
return 0;
}
/* Read a standard AGPv3 bridge header */
-static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
{
u32 apsize;
u32 apsizereg;
int nbits;
u32 aper_low, aper_hi;
u64 aper;
+ u32 old_order;
- printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
- apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
+ printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
return 0;
}
+ /* old_order could be the value from NB gart setting */
+ old_order = *order;
+
apsize = apsizereg & 0xfff;
/* Some BIOS use weird encodings not in the AGPv3 table. */
if (apsize & 0xff)
@@ -155,14 +165,26 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
if ((int)*order < 0) /* < 32MB */
*order = 0;
- aper_low = read_pci_config(num, slot, func, 0x10);
- aper_hi = read_pci_config(num, slot, func, 0x14);
+ aper_low = read_pci_config(bus, slot, func, 0x10);
+ aper_hi = read_pci_config(bus, slot, func, 0x14);
aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+ /*
+ * On some sick chips, APSIZE is 0. It means it wants 4G
+ * so let double check that order, and lets trust AMD NB settings:
+ */
+ printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
+ aper, 32 << old_order);
+ if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
+ printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
+ 32 << *order, apsizereg);
+ *order = old_order;
+ }
+
printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
aper, 32 << *order, apsizereg);
- if (!aperture_valid(aper, (32*1024*1024) << *order))
+ if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
return (u32)aper;
}
@@ -180,17 +202,17 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
* the AGP bridges should be always an own bus on the HT hierarchy,
* but do it here for future safety.
*/
-static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
{
- int num, slot, func;
+ int bus, slot, func;
/* Poor man's PCI discovery */
- for (num = 0; num < 256; num++) {
+ for (bus = 0; bus < 256; bus++) {
for (slot = 0; slot < 32; slot++) {
for (func = 0; func < 8; func++) {
u32 class, cap;
u8 type;
- class = read_pci_config(num, slot, func,
+ class = read_pci_config(bus, slot, func,
PCI_CLASS_REVISION);
if (class == 0xffffffff)
break;
@@ -199,17 +221,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
case PCI_CLASS_BRIDGE_HOST:
case PCI_CLASS_BRIDGE_OTHER: /* needed? */
/* AGP bridge? */
- cap = find_cap(num, slot, func,
+ cap = find_cap(bus, slot, func,
PCI_CAP_ID_AGP);
if (!cap)
break;
*valid_agp = 1;
- return read_agp(num, slot, func, cap,
+ return read_agp(bus, slot, func, cap,
order);
}
/* No multi-function device? */
- type = read_pci_config_byte(num, slot, func,
+ type = read_pci_config_byte(bus, slot, func,
PCI_HEADER_TYPE);
if (!(type & 0x80))
break;
@@ -249,36 +271,50 @@ void __init early_gart_iommu_check(void)
* or BIOS forget to put that in reserved.
* try to update e820 to make that region as reserved.
*/
- int fix, num;
+ int i, fix, slot;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base = 0, last_aper_base = 0;
- int aper_enabled = 0, last_aper_enabled = 0;
+ int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
if (!early_pci_allowed())
return;
+ /* This is mostly duplicate of iommu_hole_init */
fix = 0;
- for (num = 24; num < 32; num++) {
- if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
- continue;
-
- ctl = read_pci_config(0, num, 3, 0x90);
- aper_enabled = ctl & 1;
- aper_order = (ctl >> 1) & 7;
- aper_size = (32 * 1024 * 1024) << aper_order;
- aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
- aper_base <<= 25;
-
- if ((last_aper_order && aper_order != last_aper_order) ||
- (last_aper_base && aper_base != last_aper_base) ||
- (last_aper_enabled && aper_enabled != last_aper_enabled)) {
- fix = 1;
- break;
+ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = bus_dev_ranges[i].bus;
+ dev_base = bus_dev_ranges[i].dev_base;
+ dev_limit = bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ aper_enabled = ctl & AMD64_GARTEN;
+ aper_order = (ctl >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ if (last_valid) {
+ if ((aper_order != last_aper_order) ||
+ (aper_base != last_aper_base) ||
+ (aper_enabled != last_aper_enabled)) {
+ fix = 1;
+ break;
+ }
+ }
+
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ last_aper_enabled = aper_enabled;
+ last_valid = 1;
}
- last_aper_order = aper_order;
- last_aper_base = aper_base;
- last_aper_enabled = aper_enabled;
}
if (!fix && !aper_enabled)
@@ -290,32 +326,46 @@ void __init early_gart_iommu_check(void)
if (gart_fix_e820 && !fix && aper_enabled) {
if (e820_any_mapped(aper_base, aper_base + aper_size,
E820_RAM)) {
- /* reserved it, so we can resuse it in second kernel */
+ /* reserve it, so we can reuse it in second kernel */
printk(KERN_INFO "update e820 for GART\n");
- add_memory_region(aper_base, aper_size, E820_RESERVED);
+ e820_add_region(aper_base, aper_size, E820_RESERVED);
update_e820();
}
- return;
}
+ if (!fix)
+ return;
+
/* different nodes have different setting, disable them all at first*/
- for (num = 24; num < 32; num++) {
- if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
- continue;
+ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = bus_dev_ranges[i].bus;
+ dev_base = bus_dev_ranges[i].dev_base;
+ dev_limit = bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
- ctl = read_pci_config(0, num, 3, 0x90);
- ctl &= ~1;
- write_pci_config(0, num, 3, 0x90, ctl);
+ ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+ ctl &= ~AMD64_GARTEN;
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+ }
}
}
+static int __initdata printed_gart_size_msg;
+
void __init gart_iommu_hole_init(void)
{
+ u32 agp_aper_base = 0, agp_aper_order = 0;
u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
u64 aper_base, last_aper_base = 0;
- int fix, num, valid_agp = 0;
- int node;
+ int fix, slot, valid_agp = 0;
+ int i, node;
if (gart_iommu_aperture_disabled || !fix_aperture ||
!early_pci_allowed())
@@ -323,38 +373,63 @@ void __init gart_iommu_hole_init(void)
printk(KERN_INFO "Checking aperture...\n");
+ if (!fallback_aper_force)
+ agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+
fix = 0;
node = 0;
- for (num = 24; num < 32; num++) {
- if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
- continue;
-
- iommu_detected = 1;
- gart_iommu_aperture = 1;
-
- aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
- aper_size = (32 * 1024 * 1024) << aper_order;
- aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
- aper_base <<= 25;
-
- printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
- node, aper_base, aper_size >> 20);
- node++;
-
- if (!aperture_valid(aper_base, aper_size)) {
- fix = 1;
- break;
- }
+ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = bus_dev_ranges[i].bus;
+ dev_base = bus_dev_ranges[i].dev_base;
+ dev_limit = bus_dev_ranges[i].dev_limit;
+
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ iommu_detected = 1;
+ gart_iommu_aperture = 1;
+
+ aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+ aper_base <<= 25;
+
+ printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
+ node, aper_base, aper_size >> 20);
+ node++;
+
+ if (!aperture_valid(aper_base, aper_size, 64<<20)) {
+ if (valid_agp && agp_aper_base &&
+ agp_aper_base == aper_base &&
+ agp_aper_order == aper_order) {
+ /* the same between two setting from NB and agp */
+ if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) {
+ printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
+ printk(KERN_ERR "please increase GART size in your BIOS setup\n");
+ printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ printed_gart_size_msg = 1;
+ }
+ } else {
+ fix = 1;
+ goto out;
+ }
+ }
- if ((last_aper_order && aper_order != last_aper_order) ||
- (last_aper_base && aper_base != last_aper_base)) {
- fix = 1;
- break;
+ if ((last_aper_order && aper_order != last_aper_order) ||
+ (last_aper_base && aper_base != last_aper_base)) {
+ fix = 1;
+ goto out;
+ }
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
}
- last_aper_order = aper_order;
- last_aper_base = aper_base;
}
+out:
if (!fix && !fallback_aper_force) {
if (last_aper_base) {
unsigned long n = (32 * 1024 * 1024) << last_aper_order;
@@ -364,8 +439,10 @@ void __init gart_iommu_hole_init(void)
return;
}
- if (!fallback_aper_force)
- aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
+ if (!fallback_aper_force) {
+ aper_alloc = agp_aper_base;
+ aper_order = agp_aper_order;
+ }
if (aper_alloc) {
/* Got the aperture from the AGP bridge */
@@ -401,16 +478,24 @@ void __init gart_iommu_hole_init(void)
}
/* Fix up the north bridges */
- for (num = 24; num < 32; num++) {
- if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
- continue;
-
- /*
- * Don't enable translation yet. That is done later.
- * Assume this BIOS didn't initialise the GART so
- * just overwrite all previous bits
- */
- write_pci_config(0, num, 3, 0x90, aper_order<<1);
- write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
+ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ int bus;
+ int dev_base, dev_limit;
+
+ bus = bus_dev_ranges[i].bus;
+ dev_base = bus_dev_ranges[i].dev_base;
+ dev_limit = bus_dev_ranges[i].dev_limit;
+ for (slot = dev_base; slot < dev_limit; slot++) {
+ if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ continue;
+
+ /* Don't enable translation yet. That is done later.
+ Assume this BIOS didn't initialise the GART so
+ just overwrite all previous bits */
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+ write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
+ }
}
+
+ set_up_gart_resume(aper_order, aper_alloc);
}
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index f17c1c1bc384..84ce106b33c8 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -61,18 +61,26 @@ static int enable_local_apic __initdata;
/* Local APIC timer verification ok */
static int local_apic_timer_verify_ok;
-/* Disable local APIC timer from the kernel commandline or via dmi quirk
- or using CPU MSR check */
-int local_apic_timer_disabled;
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
+static int local_apic_timer_disabled;
/* Local APIC timer works in C2 */
int local_apic_timer_c2_ok;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
/*
* Debug level, exported for io_apic.c
*/
int apic_verbosity;
+int pic_mode;
+
+/* Have we found an MP table */
+int smp_found_config;
+
static unsigned int calibration_result;
static int lapic_next_event(unsigned long delta,
@@ -1151,9 +1159,6 @@ static int __init detect_init_APIC(void)
if (l & MSR_IA32_APICBASE_ENABLE)
mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
- if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
- nmi_watchdog = NMI_LOCAL_APIC;
-
printk(KERN_INFO "Found and enabled local APIC!\n");
apic_pm_activate();
@@ -1199,7 +1204,7 @@ void __init init_apic_mappings(void)
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
if (!ioapic_phys) {
printk(KERN_ERR
"WARNING: bogus zero IO-APIC "
@@ -1266,6 +1271,10 @@ int __init APIC_init_uniprocessor(void)
setup_local_APIC();
+#ifdef CONFIG_X86_IO_APIC
+ if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
+#endif
+ localise_nmi_watchdog();
end_local_APIC_setup();
#ifdef CONFIG_X86_IO_APIC
if (smp_found_config)
@@ -1348,13 +1357,13 @@ void __init smp_intr_init(void)
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
*/
- set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+ alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
/* IPI for invalidation */
- set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
/* IPI for generic function call */
- set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
}
#endif
@@ -1367,15 +1376,15 @@ void __init apic_intr_init(void)
smp_intr_init();
#endif
/* self generated IPI for local APIC timer */
- set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI vectors for APIC spurious and error interrupts */
- set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
/* thermal monitor LVT interrupt */
#ifdef CONFIG_X86_MCE_P4THERMAL
- set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
#endif
}
@@ -1510,6 +1519,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
*/
cpu = 0;
+ if (apicid > max_physical_apicid)
+ max_physical_apicid = apicid;
+
/*
* Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
* but we need to work other dependencies like SMP_SUSPEND etc
@@ -1517,7 +1529,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
* if (CPU_HOTPLUG_ENABLED || num_processors > 8)
* - Ashok Raj <ashok.raj@intel.com>
*/
- if (num_processors > 8) {
+ if (max_physical_apicid >= 8) {
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
if (!APIC_XAPIC(version)) {
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 4fd21f7d698c..e494809fc508 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -43,7 +43,7 @@
#include <mach_ipi.h>
#include <mach_apic.h>
-int disable_apic_timer __cpuinitdata;
+static int disable_apic_timer __cpuinitdata;
static int apic_calibrate_pmtmr __initdata;
int disable_apic;
@@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
*/
int apic_verbosity;
+/* Have we found an MP table */
+int smp_found_config;
+
static struct resource lapic_resource = {
.name = "Local APIC",
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -419,32 +422,8 @@ void __init setup_boot_APIC_clock(void)
setup_APIC_timer();
}
-/*
- * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
- * C1E flag only in the secondary CPU, so when we detect the wreckage
- * we already have enabled the boot CPU local apic timer. Check, if
- * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
- * set the DUMMY flag again and force the broadcast mode in the
- * clockevents layer.
- */
-static void __cpuinit check_boot_apic_timer_broadcast(void)
-{
- if (!disable_apic_timer ||
- (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
- return;
-
- printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
- lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
-
- local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
- &boot_cpu_physical_apicid);
- local_irq_disable();
-}
-
void __cpuinit setup_secondary_APIC_clock(void)
{
- check_boot_apic_timer_broadcast();
setup_APIC_timer();
}
@@ -872,7 +851,7 @@ static int __init detect_init_APIC(void)
void __init early_init_lapic_mapping(void)
{
- unsigned long apic_phys;
+ unsigned long phys_addr;
/*
* If no local APIC can be found then go out
@@ -881,11 +860,11 @@ void __init early_init_lapic_mapping(void)
if (!smp_found_config)
return;
- apic_phys = mp_lapic_addr;
+ phys_addr = mp_lapic_addr;
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+ set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, apic_phys);
+ APIC_BASE, phys_addr);
/*
* Fetch the APIC ID of the BSP in case we have a
@@ -951,6 +930,8 @@ int __init APIC_init_uniprocessor(void)
if (!skip_ioapic_setup && nr_ioapics)
enable_IO_APIC();
+ if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
+ localise_nmi_watchdog();
end_local_APIC_setup();
if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
@@ -1087,6 +1068,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
*/
cpu = 0;
}
+ if (apicid > max_physical_apicid)
+ max_physical_apicid = apicid;
+
/* are we being called early in kernel startup? */
if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9290e29013..00e6d1370954 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,6 +228,7 @@
#include <linux/suspend.h>
#include <linux/kthread.h>
#include <linux/jiffies.h>
+#include <linux/smp_lock.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -1149,7 +1150,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
as->event_tail = 0;
}
as->events[as->event_head] = event;
- if ((!as->suser) || (!as->writer))
+ if (!as->suser || !as->writer)
continue;
switch (event) {
case APM_SYS_SUSPEND:
@@ -1396,7 +1397,7 @@ static void apm_mainloop(void)
static int check_apm_user(struct apm_user *as, const char *func)
{
- if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
+ if (as == NULL || as->magic != APM_BIOS_MAGIC) {
printk(KERN_ERR "apm: %s passed bad filp\n", func);
return 1;
}
@@ -1459,18 +1460,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
return 0;
}
-static int do_ioctl(struct inode *inode, struct file *filp,
- u_int cmd, u_long arg)
+static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
{
struct apm_user *as;
+ int ret;
as = filp->private_data;
if (check_apm_user(as, "ioctl"))
return -EIO;
- if ((!as->suser) || (!as->writer))
+ if (!as->suser || !as->writer)
return -EPERM;
switch (cmd) {
case APM_IOC_STANDBY:
+ lock_kernel();
if (as->standbys_read > 0) {
as->standbys_read--;
as->standbys_pending--;
@@ -1479,8 +1481,10 @@ static int do_ioctl(struct inode *inode, struct file *filp,
queue_event(APM_USER_STANDBY, as);
if (standbys_pending <= 0)
standby();
+ unlock_kernel();
break;
case APM_IOC_SUSPEND:
+ lock_kernel();
if (as->suspends_read > 0) {
as->suspends_read--;
as->suspends_pending--;
@@ -1488,16 +1492,17 @@ static int do_ioctl(struct inode *inode, struct file *filp,
} else
queue_event(APM_USER_SUSPEND, as);
if (suspends_pending <= 0) {
- return suspend(1);
+ ret = suspend(1);
} else {
as->suspend_wait = 1;
wait_event_interruptible(apm_suspend_waitqueue,
as->suspend_wait == 0);
- return as->suspend_result;
+ ret = as->suspend_result;
}
- break;
+ unlock_kernel();
+ return ret;
default:
- return -EINVAL;
+ return -ENOTTY;
}
return 0;
}
@@ -1860,7 +1865,7 @@ static const struct file_operations apm_bios_fops = {
.owner = THIS_MODULE,
.read = do_read,
.poll = do_poll,
- .ioctl = do_ioctl,
+ .unlocked_ioctl = do_ioctl,
.open = do_open,
.release = do_release,
};
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0c6f8190887..65b1be5fe9ce 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -6,11 +6,15 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o
obj-y += proc.o feature_names.o
obj-$(CONFIG_X86_32) += common.o bugs.o
+obj-$(CONFIG_X86_64) += bugs_64.o
obj-$(CONFIG_X86_32) += amd.o
+obj-$(CONFIG_X86_64) += amd_64.o
obj-$(CONFIG_X86_32) += cyrix.o
obj-$(CONFIG_X86_32) += centaur.o
+obj-$(CONFIG_X86_64) += centaur_64.o
obj-$(CONFIG_X86_32) += transmeta.o
obj-$(CONFIG_X86_32) += intel.o
+obj-$(CONFIG_X86_64) += intel_64.o
obj-$(CONFIG_X86_32) += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c2e1ce33c7cb..84a8220a6072 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -1,9 +1,7 @@
-
/*
* Routines to indentify additional cpu features that are scattered in
* cpuid space.
*/
-
#include <linux/cpu.h>
#include <asm/pat.h>
@@ -53,19 +51,20 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_PAT
void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
{
+ if (!cpu_has_pat)
+ pat_disable("PAT not supported by CPU.");
+
switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- if (c->x86 >= 0xf && c->x86 <= 0x11)
- return;
- break;
case X86_VENDOR_INTEL:
if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
return;
break;
+ case X86_VENDOR_AMD:
+ case X86_VENDOR_CENTAUR:
+ case X86_VENDOR_TRANSMETA:
+ return;
}
- pat_disable(cpu_has_pat ?
- "PAT disabled. Not yet verified on this CPU type." :
- "PAT not supported by CPU.");
+ pat_disable("PAT disabled. Not yet verified on this CPU type.");
}
#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 245866828294..81a07ca65d44 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,43 +24,6 @@
extern void vide(void);
__asm__(".align 4\nvide: ret");
-#ifdef CONFIG_X86_LOCAL_APIC
-#define ENABLE_C1E_MASK 0x18000000
-#define CPUID_PROCESSOR_SIGNATURE 1
-#define CPUID_XFAM 0x0ff00000
-#define CPUID_XFAM_K8 0x00000000
-#define CPUID_XFAM_10H 0x00100000
-#define CPUID_XFAM_11H 0x00200000
-#define CPUID_XMOD 0x000f0000
-#define CPUID_XMOD_REV_F 0x00040000
-
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
- u32 lo, hi;
- u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
- switch (eax & CPUID_XFAM) {
- case CPUID_XFAM_K8:
- if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
- break;
- case CPUID_XFAM_10H:
- case CPUID_XFAM_11H:
- rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
- if (lo & ENABLE_C1E_MASK) {
- if (smp_processor_id() != boot_cpu_physical_apicid)
- printk(KERN_INFO "AMD C1E detected late. "
- " Force timer broadcast.\n");
- return 1;
- }
- break;
- default:
- /* err on the side of caution */
- return 1;
- }
- return 0;
-}
-#endif
-
int force_mwait __cpuinitdata;
static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
@@ -297,11 +260,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
num_cache_leaves = 3;
}
-#ifdef CONFIG_X86_LOCAL_APIC
- if (amd_apic_timer_broken())
- local_apic_timer_disabled = 1;
-#endif
-
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
clear_cpu_cap(c, X86_FEATURE_MCE);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
new file mode 100644
index 000000000000..30b7557c9641
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -0,0 +1,211 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/numa_64.h>
+#include <asm/mmconfig.h>
+#include <asm/cacheflush.h>
+
+#include <mach_apic.h>
+
+#include "cpu.h"
+
+int force_mwait __cpuinitdata;
+
+#ifdef CONFIG_NUMA
+static int __cpuinit nearby_node(int apicid)
+{
+ int i, node;
+
+ for (i = apicid - 1; i >= 0; i--) {
+ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+ node = apicid_to_node[i];
+ if (node != NUMA_NO_NODE && node_online(node))
+ return node;
+ }
+ return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned bits;
+#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
+ int node = 0;
+ unsigned apicid = hard_smp_processor_id();
+#endif
+ bits = c->x86_coreid_bits;
+
+ /* Low order bits define the core id (index of core in socket) */
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
+
+#ifdef CONFIG_NUMA
+ node = c->phys_proc_id;
+ if (apicid_to_node[apicid] != NUMA_NO_NODE)
+ node = apicid_to_node[apicid];
+ if (!node_online(node)) {
+ /* Two possibilities here:
+ - The CPU is missing memory and no node was created.
+ In that case try picking one from a nearby CPU
+ - The APIC IDs differ from the HyperTransport node IDs
+ which the K8 northbridge parsing fills in.
+ Assume they are all increased by a constant offset,
+ but in the same order as the HT nodeids.
+ If that doesn't result in a usable node fall back to the
+ path for the previous case. */
+
+ int ht_nodeid = c->initial_apicid;
+
+ if (ht_nodeid >= 0 &&
+ apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ node = apicid_to_node[ht_nodeid];
+ /* Pick a nearby node */
+ if (!node_online(node))
+ node = nearby_node(apicid);
+ }
+ numa_set_node(cpu, node);
+
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+ unsigned bits, ecx;
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level < 0x80000008)
+ return;
+
+ ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
+
+ /* CPU telling us the core id bits shift? */
+ bits = (ecx >> 12) & 0xF;
+
+ /* Otherwise recompute */
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
+
+ c->x86_coreid_bits = bits;
+
+#endif
+}
+
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+ early_init_amd_mc(c);
+
+ /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+ if (c->x86_power & (1<<8))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+{
+ unsigned level;
+
+#ifdef CONFIG_SMP
+ unsigned long value;
+
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 15) {
+ rdmsrl(MSR_K8_HWCR, value);
+ value |= 1 << 6;
+ wrmsrl(MSR_K8_HWCR, value);
+ }
+#endif
+
+ /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+ clear_cpu_cap(c, 0*32+31);
+
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
+ level >= 0x0f58))
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ if (c->x86 == 0x10 || c->x86 == 0x11)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+ set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+
+ level = get_model_name(c);
+ if (!level) {
+ switch (c->x86) {
+ case 15:
+ /* Should distinguish Models here, but this is only
+ a fallback anyways. */
+ strcpy(c->x86_model_id, "Hammer");
+ break;
+ }
+ }
+ display_cacheinfo(c);
+
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level >= 0x80000008)
+ amd_detect_cmp(c);
+
+ if (c->extended_cpuid_level >= 0x80000006 &&
+ (cpuid_edx(0x80000006) & 0xf000))
+ num_cache_leaves = 4;
+ else
+ num_cache_leaves = 3;
+
+ if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
+ set_cpu_cap(c, X86_FEATURE_K8);
+
+ /* MFENCE stops RDTSC speculation */
+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+
+ if (c->x86 == 0x10)
+ fam10h_check_enable_mmcfg();
+
+ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
+ (tseg >> PMD_SHIFT) <
+ (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+}
+
+static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+ .c_vendor = "AMD",
+ .c_ident = { "AuthenticAMD" },
+ .c_early_init = early_init_amd,
+ .c_init = init_amd,
+};
+
+cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 170d2f5523b2..1b1c56bb338f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -59,8 +59,12 @@ static void __init check_fpu(void)
return;
}
-/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
- /* Test for the divl bug.. */
+ /*
+ * trap_init() enabled FXSR and company _before_ testing for FP
+ * problems here.
+ *
+ * Test for the divl bug..
+ */
__asm__("fninit\n\t"
"fldl %1\n\t"
"fdivl %2\n\t"
@@ -108,10 +112,15 @@ static void __init check_popad(void)
"movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
: "=&a" (res)
: "d" (inp)
- : "ecx", "edi" );
- /* If this fails, it means that any user program may lock the CPU hard. Too bad. */
- if (res != 12345678) printk( "Buggy.\n" );
- else printk( "OK.\n" );
+ : "ecx", "edi");
+ /*
+ * If this fails, it means that any user program may lock the
+ * CPU hard. Too bad.
+ */
+ if (res != 12345678)
+ printk("Buggy.\n");
+ else
+ printk("OK.\n");
#endif
}
@@ -137,7 +146,8 @@ static void __init check_config(void)
* i486+ only features! (WP works in supervisor mode and the
* new "invlpg" and "bswap" instructions)
*/
-#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
+#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
+ defined(CONFIG_X86_BSWAP)
if (boot_cpu_data.x86 == 3)
panic("Kernel requires i486+ for 'invlpg' and other features");
#endif
@@ -170,6 +180,7 @@ void __init check_bugs(void)
check_fpu();
check_hlt();
check_popad();
- init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+ init_utsname()->machine[1] =
+ '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
alternative_instructions();
}
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
index 9a3ed0649d4e..9a3ed0649d4e 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
new file mode 100644
index 000000000000..13526fd5cce1
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -0,0 +1,43 @@
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
+ if (c->x86 == 0x6 && c->x86_model >= 0xf)
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+{
+ /* Cache sizes */
+ unsigned n;
+
+ n = c->extended_cpuid_level;
+ if (n >= 0x80000008) {
+ unsigned eax = cpuid_eax(0x80000008);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+}
+
+static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+ .c_vendor = "Centaur",
+ .c_ident = { "CentaurHauls" },
+ .c_early_init = early_init_centaur,
+ .c_init = init_centaur,
+};
+
+cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 783691b2a738..4d894e8565fe 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,3 +1,6 @@
+#ifndef ARCH_X86_CPU_H
+
+#define ARCH_X86_CPU_H
struct cpu_model_info {
int vendor;
@@ -36,3 +39,5 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
extern int get_model_name(struct cpuinfo_x86 *c);
extern void display_cacheinfo(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index f03e9153618e..965ea52767ac 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -26,9 +26,10 @@
#define NFORCE2_SAFE_DISTANCE 50
/* Delay in ms between FSB changes */
-//#define NFORCE2_DELAY 10
+/* #define NFORCE2_DELAY 10 */
-/* nforce2_chipset:
+/*
+ * nforce2_chipset:
* FSB is changed using the chipset
*/
static struct pci_dev *nforce2_chipset_dev;
@@ -36,13 +37,13 @@ static struct pci_dev *nforce2_chipset_dev;
/* fid:
* multiplier * 10
*/
-static int fid = 0;
+static int fid;
/* min_fsb, max_fsb:
* minimum and maximum FSB (= FSB at boot time)
*/
-static int min_fsb = 0;
-static int max_fsb = 0;
+static int min_fsb;
+static int max_fsb;
MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
@@ -53,7 +54,7 @@ module_param(min_fsb, int, 0444);
MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
MODULE_PARM_DESC(min_fsb,
- "Minimum FSB to use, if not defined: current FSB - 50");
+ "Minimum FSB to use, if not defined: current FSB - 50");
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
@@ -139,7 +140,7 @@ static unsigned int nforce2_fsb_read(int bootfsb)
/* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
- 0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL);
+ 0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL);
if (!nforce2_sub5)
return 0;
@@ -147,13 +148,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
fsb /= 1000000;
/* Check if PLL register is already set */
- pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
+ pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
- if(bootfsb || !temp)
+ if (bootfsb || !temp)
return fsb;
-
+
/* Use PLL register FSB value */
- pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp);
+ pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp);
fsb = nforce2_calc_fsb(temp);
return fsb;
@@ -184,7 +185,7 @@ static int nforce2_set_fsb(unsigned int fsb)
}
/* First write? Then set actual value */
- pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
+ pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
if (!temp) {
pll = nforce2_calc_pll(tfsb);
@@ -210,7 +211,8 @@ static int nforce2_set_fsb(unsigned int fsb)
tfsb--;
/* Calculate the PLL reg. value */
- if ((pll = nforce2_calc_pll(tfsb)) == -1)
+ pll = nforce2_calc_pll(tfsb);
+ if (pll == -1)
return -EINVAL;
nforce2_write_pll(pll);
@@ -249,7 +251,7 @@ static unsigned int nforce2_get(unsigned int cpu)
static int nforce2_target(struct cpufreq_policy *policy,
unsigned int target_freq, unsigned int relation)
{
-// unsigned long flags;
+/* unsigned long flags; */
struct cpufreq_freqs freqs;
unsigned int target_fsb;
@@ -271,17 +273,17 @@ static int nforce2_target(struct cpufreq_policy *policy,
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
/* Disable IRQs */
- //local_irq_save(flags);
+ /* local_irq_save(flags); */
if (nforce2_set_fsb(target_fsb) < 0)
printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
- target_fsb);
+ target_fsb);
else
dprintk("Changed FSB successfully to %d\n",
- target_fsb);
+ target_fsb);
/* Enable IRQs */
- //local_irq_restore(flags);
+ /* local_irq_restore(flags); */
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
@@ -302,8 +304,8 @@ static int nforce2_verify(struct cpufreq_policy *policy)
policy->max = (fsb_pol_max + 1) * fid * 100;
cpufreq_verify_within_limits(policy,
- policy->cpuinfo.min_freq,
- policy->cpuinfo.max_freq);
+ policy->cpuinfo.min_freq,
+ policy->cpuinfo.max_freq);
return 0;
}
@@ -347,7 +349,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
/* Set maximum FSB to FSB at boot time */
max_fsb = nforce2_fsb_read(1);
- if(!max_fsb)
+ if (!max_fsb)
return -EIO;
if (!min_fsb)
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
new file mode 100644
index 000000000000..fcb1cc9d75ca
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -0,0 +1,103 @@
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/topology.h>
+#include <asm/numa_64.h>
+
+#include "cpu.h"
+
+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+{
+ if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+ (c->x86 == 0x6 && c->x86_model >= 0x0e))
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+/*
+ * find out the number of processor cores on the die
+ */
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+ unsigned int eax, t;
+
+ if (c->cpuid_level < 4)
+ return 1;
+
+ cpuid_count(4, 0, &eax, &t, &t, &t);
+
+ if (eax & 0x1f)
+ return ((eax >> 26) + 1);
+ else
+ return 1;
+}
+
+static void __cpuinit srat_detect_node(void)
+{
+#ifdef CONFIG_NUMA
+ unsigned node;
+ int cpu = smp_processor_id();
+ int apicid = hard_smp_processor_id();
+
+ /* Don't do the funky fallback heuristics the AMD version employs
+ for now. */
+ node = apicid_to_node[apicid];
+ if (node == NUMA_NO_NODE || !node_online(node))
+ node = first_node(node_online_map);
+ numa_set_node(cpu, node);
+
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+{
+ /* Cache sizes */
+ unsigned n;
+
+ init_intel_cacheinfo(c);
+ if (c->cpuid_level > 9) {
+ unsigned eax = cpuid_eax(10);
+ /* Check for version and the number of counters */
+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+ }
+
+ if (cpu_has_ds) {
+ unsigned int l1, l2;
+ rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+ if (!(l1 & (1<<11)))
+ set_cpu_cap(c, X86_FEATURE_BTS);
+ if (!(l1 & (1<<12)))
+ set_cpu_cap(c, X86_FEATURE_PEBS);
+ }
+
+
+ if (cpu_has_bts)
+ ds_init_intel(c);
+
+ n = c->extended_cpuid_level;
+ if (n >= 0x80000008) {
+ unsigned eax = cpuid_eax(0x80000008);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+
+ if (c->x86 == 15)
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+ c->x86_max_cores = intel_num_cpu_cores(c);
+
+ srat_detect_node();
+}
+
+static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+ .c_vendor = "Intel",
+ .c_ident = { "GenuineIntel" },
+ .c_early_init = early_init_intel,
+ .c_init = init_intel,
+};
+cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 26d615dcb149..2c8afafa18e8 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -62,6 +62,7 @@ static struct _cache_table cache_table[] __cpuinitdata =
{ 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */
{ 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */
{ 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */
+ { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */
{ 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index e633c9c2b764..f390c9f66351 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -9,23 +9,23 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For AMD Athlon/Duron */
-static void k7_machine_check(struct pt_regs * regs, long error_code)
+static void k7_machine_check(struct pt_regs *regs, long error_code)
{
- int recover=1;
+ int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
- rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+ rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
- recover=0;
+ recover = 0;
printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
@@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code)
}
if (recover&2)
- panic ("CPU context corrupt");
+ panic("CPU context corrupt");
if (recover&1)
- panic ("Unable to continue");
- printk (KERN_EMERG "Attempting to continue.\n");
+ panic("Unable to continue");
+ printk(KERN_EMERG "Attempting to continue.\n");
mcgstl &= ~(1<<2);
- wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+ wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
@@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
machine_check_vector = k7_machine_check;
wmb();
- printk (KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr (MSR_IA32_MCG_CAP, l, h);
+ printk(KERN_INFO "Intel machine check architecture supported.\n");
+ rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
- wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
/* Clear status for MC index 0 separately, we don't touch CTL,
* as some K7 Athlons cause spurious MCEs when its enabled. */
if (boot_cpu_data.x86 == 6) {
- wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+ wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
i = 1;
} else
i = 0;
- for (; i<nr_mce_banks; i++) {
- wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+ for (; i < nr_mce_banks; i++) {
+ wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+ wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
- set_in_cr4 (X86_CR4_MCE);
- printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+ set_in_cr4(X86_CR4_MCE);
+ printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index e07e8c068ae0..501ca1cea27d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -31,7 +31,7 @@
#include <asm/idle.h>
#define MISC_MCELOG_MINOR 227
-#define NR_BANKS 6
+#define NR_SYSFS_BANKS 6
atomic_t mce_entry;
@@ -46,7 +46,7 @@ static int mce_dont_init;
*/
static int tolerant = 1;
static int banks;
-static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = -1;
@@ -209,7 +209,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
barrier();
for (i = 0; i < banks; i++) {
- if (!bank[i])
+ if (i < NR_SYSFS_BANKS && !bank[i])
continue;
m.misc = 0;
@@ -444,9 +444,10 @@ static void mce_init(void *dummy)
rdmsrl(MSR_IA32_MCG_CAP, cap);
banks = cap & 0xff;
- if (banks > NR_BANKS) {
- printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
- banks = NR_BANKS;
+ if (banks > MCE_EXTENDED_BANK) {
+ banks = MCE_EXTENDED_BANK;
+ printk(KERN_INFO "MCE: warning: using only %d banks\n",
+ MCE_EXTENDED_BANK);
}
/* Use accurate RIP reporting if available. */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
@@ -462,7 +463,11 @@ static void mce_init(void *dummy)
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
for (i = 0; i < banks; i++) {
- wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ if (i < NR_SYSFS_BANKS)
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ else
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
+
wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
}
}
@@ -766,7 +771,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
} \
static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
-/* TBD should generate these dynamically based on number of available banks */
+/*
+ * TBD should generate these dynamically based on number of available banks.
+ * Have only 6 contol banks in /sysfs until then.
+ */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index cb03345554a5..eef001ad3bde 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -8,7 +8,7 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -32,12 +32,12 @@ struct intel_mce_extended_msrs {
/* u32 *reserved[]; */
};
-static int mce_num_extended_msrs = 0;
+static int mce_num_extended_msrs;
#ifdef CONFIG_X86_MCE_P4THERMAL
static void unexpected_thermal_interrupt(struct pt_regs *regs)
-{
+{
printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
smp_processor_id());
add_taint(TAINT_MACHINE_CHECK);
@@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
* be some SMM goo which handles it, so we can't even put a handler
* since it might be delivered via SMI already -zwanem.
*/
- rdmsr (MSR_IA32_MISC_ENABLE, l, h);
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
h = apic_read(APIC_LVTTHMR);
if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
@@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
return; /* -EBUSY */
}
- /* check whether a vector already exists, temporarily masked? */
+ /* check whether a vector already exists, temporarily masked? */
if (h & APIC_VECTOR_MASK) {
printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
"installed\n",
@@ -104,18 +104,18 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
apic_write_around(APIC_LVTTHMR, h);
- rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
+ rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+ wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
/* ok we're good to go... */
vendor_thermal_interrupt = intel_thermal_interrupt;
-
- rdmsr (MSR_IA32_MISC_ENABLE, l, h);
- wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
- l = apic_read (APIC_LVTTHMR);
- apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
- printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
+
+ l = apic_read(APIC_LVTTHMR);
+ apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+ printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
@@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
{
u32 h;
- rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
- rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
- rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
- rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
- rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
- rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
- rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
- rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
- rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
- rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
+ rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
+ rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
+ rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
+ rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
+ rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
+ rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
+ rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
+ rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
+ rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
+ rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
}
-static void intel_machine_check(struct pt_regs * regs, long error_code)
+static void intel_machine_check(struct pt_regs *regs, long error_code)
{
- int recover=1;
+ int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
- rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+ rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
- recover=0;
+ recover = 0;
printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
@@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
}
if (recover & 2)
- panic ("CPU context corrupt");
+ panic("CPU context corrupt");
if (recover & 1)
- panic ("Unable to continue");
+ panic("Unable to continue");
printk(KERN_EMERG "Attempting to continue.\n");
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
+ /*
+ * Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
* for errors if the OS could not log the error.
*/
- for (i=0; i<nr_mce_banks; i++) {
+ for (i = 0; i < nr_mce_banks; i++) {
u32 msr;
msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr (msr, low, high);
+ rdmsr(msr, low, high);
if (high&(1<<31)) {
/* Clear it */
wrmsr(msr, 0UL, 0UL);
@@ -214,7 +214,7 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
}
}
mcgstl &= ~(1<<2);
- wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+ wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
@@ -222,30 +222,30 @@ void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
-
+
machine_check_vector = intel_machine_check;
wmb();
- printk (KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr (MSR_IA32_MCG_CAP, l, h);
+ printk(KERN_INFO "Intel machine check architecture supported.\n");
+ rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
- wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
- for (i=0; i<nr_mce_banks; i++) {
- wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+ for (i = 0; i < nr_mce_banks; i++) {
+ wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+ wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
- set_in_cr4 (X86_CR4_MCE);
- printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+ set_in_cr4(X86_CR4_MCE);
+ printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
/* Check for P4/Xeon extended MCE MSRs */
- rdmsr (MSR_IA32_MCG_CAP, l, h);
+ rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<9)) {/* MCG_EXT_P */
mce_num_extended_msrs = (l >> 16) & 0xff;
- printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
+ printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
" available\n",
smp_processor_id(), mce_num_extended_msrs);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 5d241ce94a44..509bd3d9eacd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
static unsigned long smp_changes_mask;
static struct mtrr_state mtrr_state = {};
static int mtrr_state_set;
-static u64 tom2;
+u64 mtrr_tom2;
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "mtrr."
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
}
}
- if (tom2) {
- if (start >= (1ULL<<32) && (end < tom2))
+ if (mtrr_tom2) {
+ if (start >= (1ULL<<32) && (end < mtrr_tom2))
return MTRR_TYPE_WRBACK;
}
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
}
+/* fill the MSR pair relating to a var range */
+void fill_mtrr_var_range(unsigned int index,
+ u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
+{
+ struct mtrr_var_range *vr;
+
+ vr = mtrr_state.var_ranges;
+
+ vr[index].base_lo = base_lo;
+ vr[index].base_hi = base_hi;
+ vr[index].mask_lo = mask_lo;
+ vr[index].mask_hi = mask_hi;
+}
+
static void
get_fixed_ranges(mtrr_type * frs)
{
@@ -213,13 +227,13 @@ void __init get_mtrr_state(void)
mtrr_state.enabled = (lo & 0xc00) >> 10;
if (amd_special_default_mtrr()) {
- unsigned lo, hi;
+ unsigned low, high;
/* TOP_MEM2 */
- rdmsr(MSR_K8_TOP_MEM2, lo, hi);
- tom2 = hi;
- tom2 <<= 32;
- tom2 |= lo;
- tom2 &= 0xffffff8000000ULL;
+ rdmsr(MSR_K8_TOP_MEM2, low, high);
+ mtrr_tom2 = high;
+ mtrr_tom2 <<= 32;
+ mtrr_tom2 |= low;
+ mtrr_tom2 &= 0xffffff800000ULL;
}
if (mtrr_show) {
int high_width;
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void)
else
printk(KERN_INFO "MTRR %u disabled\n", i);
}
- if (tom2) {
+ if (mtrr_tom2) {
printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
- tom2, tom2>>20);
+ mtrr_tom2, mtrr_tom2>>20);
}
}
mtrr_state_set = 1;
@@ -328,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
if (lo != msrwords[0] || hi != msrwords[1]) {
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 == 15 &&
+ (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
k8_enable_fixed_iorrs();
mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6a1e278d9323..105afe12beb0 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -37,6 +37,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
+#include <linux/sort.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
@@ -609,6 +610,787 @@ static struct sysdev_driver mtrr_sysdev_driver = {
.resume = mtrr_restore,
};
+/* should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+struct res_range {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+ unsigned long end)
+{
+ /* out of slots */
+ if (nr_range >= RANGE_NUM)
+ return nr_range;
+
+ range[nr_range].start = start;
+ range[nr_range].end = end;
+
+ nr_range++;
+
+ return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+ unsigned long end)
+{
+ int i;
+
+ /* try to merge it with old one */
+ for (i = 0; i < nr_range; i++) {
+ unsigned long final_start, final_end;
+ unsigned long common_start, common_end;
+
+ if (!range[i].end)
+ continue;
+
+ common_start = max(range[i].start, start);
+ common_end = min(range[i].end, end);
+ if (common_start > common_end + 1)
+ continue;
+
+ final_start = min(range[i].start, start);
+ final_end = max(range[i].end, end);
+
+ range[i].start = final_start;
+ range[i].end = final_end;
+ return nr_range;
+ }
+
+ /* need to add that */
+ return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+ int i, j;
+
+ for (j = 0; j < RANGE_NUM; j++) {
+ if (!range[j].end)
+ continue;
+
+ if (start <= range[j].start && end >= range[j].end) {
+ range[j].start = 0;
+ range[j].end = 0;
+ continue;
+ }
+
+ if (start <= range[j].start && end < range[j].end &&
+ range[j].start < end + 1) {
+ range[j].start = end + 1;
+ continue;
+ }
+
+
+ if (start > range[j].start && end >= range[j].end &&
+ range[j].end > start - 1) {
+ range[j].end = start - 1;
+ continue;
+ }
+
+ if (start > range[j].start && end < range[j].end) {
+ /* find the new spare */
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (range[i].end == 0)
+ break;
+ }
+ if (i < RANGE_NUM) {
+ range[i].end = range[j].end;
+ range[i].start = end + 1;
+ } else {
+ printk(KERN_ERR "run of slot in ranges\n");
+ }
+ range[j].end = start - 1;
+ continue;
+ }
+ }
+}
+
+static int __init cmp_range(const void *x1, const void *x2)
+{
+ const struct res_range *r1 = x1;
+ const struct res_range *r2 = x2;
+ long start1, start2;
+
+ start1 = r1->start;
+ start2 = r2->start;
+
+ return start1 - start2;
+}
+
+struct var_mtrr_range_state {
+ unsigned long base_pfn;
+ unsigned long size_pfn;
+ mtrr_type type;
+};
+
+struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+ unsigned long extra_remove_base,
+ unsigned long extra_remove_size)
+{
+ unsigned long i, base, size;
+ mtrr_type type;
+
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_WRBACK)
+ continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
+ nr_range = add_range_with_merge(range, nr_range, base,
+ base + size - 1);
+ }
+ if (debug_print) {
+ printk(KERN_DEBUG "After WB checking\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* take out UC ranges */
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type != MTRR_TYPE_UNCACHABLE)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ continue;
+ base = range_state[i].base_pfn;
+ subtract_range(range, base, base + size - 1);
+ }
+ if (extra_remove_size)
+ subtract_range(range, extra_remove_base,
+ extra_remove_base + extra_remove_size - 1);
+
+ /* get new range num */
+ nr_range = 0;
+ for (i = 0; i < RANGE_NUM; i++) {
+ if (!range[i].end)
+ continue;
+ nr_range++;
+ }
+ if (debug_print) {
+ printk(KERN_DEBUG "After UC checking\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* sort the ranges */
+ sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+ if (debug_print) {
+ printk(KERN_DEBUG "After sorting\n");
+ for (i = 0; i < nr_range; i++)
+ printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+ range[i].start, range[i].end + 1);
+ }
+
+ /* clear those is not used */
+ for (i = nr_range; i < RANGE_NUM; i++)
+ memset(&range[i], 0, sizeof(range[i]));
+
+ return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+ unsigned long sum;
+ int i;
+
+ sum = 0;
+ for (i = 0; i < nr_range; i++)
+ sum += range[i].end + 1 - range[i].start;
+
+ return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+ CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+ if (enable_mtrr_cleanup != -1)
+ enable_mtrr_cleanup = 0;
+ return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+ if (enable_mtrr_cleanup != -1)
+ enable_mtrr_cleanup = 1;
+ return 0;
+}
+early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+struct var_mtrr_state {
+ unsigned long range_startk;
+ unsigned long range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+ unsigned int reg;
+};
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type, unsigned int address_bits)
+{
+ u32 base_lo, base_hi, mask_lo, mask_hi;
+ u64 base, mask;
+
+ if (!sizek) {
+ fill_mtrr_var_range(reg, 0, 0, 0, 0);
+ return;
+ }
+
+ mask = (1ULL << address_bits) - 1;
+ mask &= ~((((u64)sizek) << 10) - 1);
+
+ base = ((u64)basek) << 10;
+
+ base |= type;
+ mask |= 0x800;
+
+ base_lo = base & ((1ULL<<32) - 1);
+ base_hi = base >> 32;
+
+ mask_lo = mask & ((1ULL<<32) - 1);
+ mask_hi = mask >> 32;
+
+ fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+ unsigned char type)
+{
+ range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+ range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+ range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+ unsigned long basek, sizek;
+ unsigned char type;
+ unsigned int reg;
+
+ for (reg = 0; reg < num_var_ranges; reg++) {
+ basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+ sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+ type = range_state[reg].type;
+
+ set_var_mtrr(reg, basek, sizek, type, address_bits);
+ }
+}
+
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+ unsigned long range_sizek, unsigned char type)
+{
+ if (!range_sizek || (reg >= num_var_ranges))
+ return reg;
+
+ while (range_sizek) {
+ unsigned long max_align, align;
+ unsigned long sizek;
+
+ /* Compute the maximum size I can make a range */
+ if (range_startk)
+ max_align = ffs(range_startk) - 1;
+ else
+ max_align = 32;
+ align = fls(range_sizek) - 1;
+ if (align > max_align)
+ align = max_align;
+
+ sizek = 1 << align;
+ if (debug_print)
+ printk(KERN_DEBUG "Setting variable MTRR %d, "
+ "base: %ldMB, range: %ldMB, type %s\n",
+ reg, range_startk >> 10, sizek >> 10,
+ (type == MTRR_TYPE_UNCACHABLE)?"UC":
+ ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
+ );
+ save_var_mtrr(reg++, range_startk, sizek, type);
+ range_startk += sizek;
+ range_sizek -= sizek;
+ if (reg >= num_var_ranges)
+ break;
+ }
+ return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+ unsigned long sizek)
+{
+ unsigned long hole_basek, hole_sizek;
+ unsigned long second_basek, second_sizek;
+ unsigned long range0_basek, range0_sizek;
+ unsigned long range_basek, range_sizek;
+ unsigned long chunk_sizek;
+ unsigned long gran_sizek;
+
+ hole_basek = 0;
+ hole_sizek = 0;
+ second_basek = 0;
+ second_sizek = 0;
+ chunk_sizek = state->chunk_sizek;
+ gran_sizek = state->gran_sizek;
+
+ /* align with gran size, prevent small block used up MTRRs */
+ range_basek = ALIGN(state->range_startk, gran_sizek);
+ if ((range_basek > basek) && basek)
+ return second_sizek;
+ state->range_sizek -= (range_basek - state->range_startk);
+ range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+ while (range_sizek > state->range_sizek) {
+ range_sizek -= gran_sizek;
+ if (!range_sizek)
+ return 0;
+ }
+ state->range_sizek = range_sizek;
+
+ /* try to append some small hole */
+ range0_basek = state->range_startk;
+ range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+ if (range0_sizek == state->range_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + state->range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ state->range_sizek, MTRR_TYPE_WRBACK);
+ return 0;
+ }
+
+ range0_sizek -= chunk_sizek;
+ if (range0_sizek && sizek) {
+ while (range0_basek + range0_sizek > (basek + sizek)) {
+ range0_sizek -= chunk_sizek;
+ if (!range0_sizek)
+ break;
+ }
+ }
+
+ if (range0_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+ range0_basek<<10,
+ (range0_basek + range0_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range0_basek,
+ range0_sizek, MTRR_TYPE_WRBACK);
+
+ }
+
+ range_basek = range0_basek + range0_sizek;
+ range_sizek = chunk_sizek;
+
+ if (range_basek + range_sizek > basek &&
+ range_basek + range_sizek <= (basek + sizek)) {
+ /* one hole */
+ second_basek = basek;
+ second_sizek = range_basek + range_sizek - basek;
+ }
+
+ /* if last piece, only could one hole near end */
+ if ((second_basek || !basek) &&
+ range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
+ (chunk_sizek >> 1)) {
+ /*
+ * one hole in middle (second_sizek is 0) or at end
+ * (second_sizek is 0 )
+ */
+ hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
+ - second_sizek;
+ hole_basek = range_basek + range_sizek - hole_sizek
+ - second_sizek;
+ } else {
+ /* fallback for big hole, or several holes */
+ range_sizek = state->range_sizek - range0_sizek;
+ second_basek = 0;
+ second_sizek = 0;
+ }
+
+ if (debug_print)
+ printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
+ (range_basek + range_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
+ MTRR_TYPE_WRBACK);
+ if (hole_sizek) {
+ if (debug_print)
+ printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+ hole_basek<<10, (hole_basek + hole_sizek)<<10);
+ state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
+ MTRR_TYPE_UNCACHABLE);
+
+ }
+
+ return second_sizek;
+}
+
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+ unsigned long size_pfn)
+{
+ unsigned long basek, sizek;
+ unsigned long second_sizek = 0;
+
+ if (state->reg >= num_var_ranges)
+ return;
+
+ basek = base_pfn << (PAGE_SHIFT - 10);
+ sizek = size_pfn << (PAGE_SHIFT - 10);
+
+ /* See if I can merge with the last range */
+ if ((basek <= 1024) ||
+ (state->range_startk + state->range_sizek == basek)) {
+ unsigned long endk = basek + sizek;
+ state->range_sizek = endk - state->range_startk;
+ return;
+ }
+ /* Write the range mtrrs */
+ if (state->range_sizek != 0)
+ second_sizek = range_to_mtrr_with_hole(state, basek, sizek);
+
+ /* Allocate an msr */
+ state->range_startk = basek + second_sizek;
+ state->range_sizek = sizek - second_sizek;
+}
+
+/* mininum size of mtrr block that can take hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_chunk_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granity of mtrr of block */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ mtrr_gran_size = memparse(p, &p);
+ return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =
+ CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+ if (arg)
+ nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+ u64 chunk_size, u64 gran_size)
+{
+ struct var_mtrr_state var_state;
+ int i;
+ int num_reg;
+
+ var_state.range_startk = 0;
+ var_state.range_sizek = 0;
+ var_state.reg = 0;
+ var_state.chunk_sizek = chunk_size >> 10;
+ var_state.gran_sizek = gran_size >> 10;
+
+ memset(range_state, 0, sizeof(range_state));
+
+ /* Write the range etc */
+ for (i = 0; i < nr_range; i++)
+ set_var_mtrr_range(&var_state, range[i].start,
+ range[i].end - range[i].start + 1);
+
+ /* Write the last range */
+ if (var_state.range_sizek != 0)
+ range_to_mtrr_with_hole(&var_state, 0, 0);
+
+ num_reg = var_state.reg;
+ /* Clear out the extra MTRR's */
+ while (var_state.reg < num_var_ranges) {
+ save_var_mtrr(var_state.reg, 0, 0, 0);
+ var_state.reg++;
+ }
+
+ return num_reg;
+}
+
+struct mtrr_cleanup_result {
+ unsigned long gran_sizek;
+ unsigned long chunk_sizek;
+ unsigned long lose_cover_sizek;
+ unsigned int num_reg;
+ int bad;
+};
+
+/*
+ * gran_size: 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 4G
+ * so we need (2+13)*6
+ */
+#define NUM_RESULT 90
+#define PSHIFT (PAGE_SHIFT - 10)
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static struct res_range __initdata range_new[RANGE_NUM];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM];
+
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+ unsigned long extra_remove_base, extra_remove_size;
+ unsigned long i, base, size, def, dummy;
+ mtrr_type type;
+ int nr_range, nr_range_new;
+ u64 chunk_size, gran_size;
+ unsigned long range_sums, range_sums_new;
+ int index_good;
+ int num_reg_good;
+
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
+
+ if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+ return 0;
+ rdmsr(MTRRdefType_MSR, def, dummy);
+ def &= 0xff;
+ if (def != MTRR_TYPE_UNCACHABLE)
+ return 0;
+
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
+
+ /* check entries number */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ size = range_state[i].size_pfn;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ num[type]++;
+ }
+
+ /* check if we got UC entries */
+ if (!num[MTRR_TYPE_UNCACHABLE])
+ return 0;
+
+ /* check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ memset(range, 0, sizeof(range));
+ extra_remove_size = 0;
+ if (mtrr_tom2) {
+ extra_remove_base = 1 << (32 - PAGE_SHIFT);
+ extra_remove_size =
+ (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+ }
+ nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+ extra_remove_size);
+ range_sums = sum_ranges(range, nr_range);
+ printk(KERN_INFO "total RAM coverred: %ldM\n",
+ range_sums >> (20 - PAGE_SHIFT));
+
+ if (mtrr_chunk_size && mtrr_gran_size) {
+ int num_reg;
+
+ debug_print = 1;
+ /* convert ranges to var ranges state */
+ num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
+ mtrr_gran_size);
+
+ /* we got new setting in range_state, check it */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ extra_remove_base,
+ extra_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ i = 0;
+ result[i].chunk_sizek = mtrr_chunk_size >> 10;
+ result[i].gran_sizek = mtrr_gran_size >> 10;
+ result[i].num_reg = num_reg;
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek =
+ (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else
+ result[i].lose_cover_sizek =
+ (range_sums - range_sums_new) << PSHIFT;
+
+ printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n",
+ result[i].num_reg, result[i].bad?"-":"",
+ result[i].lose_cover_sizek >> 10);
+ if (!result[i].bad) {
+ set_var_mtrr_all(address_bits);
+ return 1;
+ }
+ printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+ "will find optimal one\n");
+ debug_print = 0;
+ memset(result, 0, sizeof(result[0]));
+ }
+
+ i = 0;
+ memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+ memset(result, 0, sizeof(result));
+ for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
+ for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+ chunk_size <<= 1) {
+ int num_reg;
+
+ if (debug_print)
+ printk(KERN_INFO
+ "\ngran_size: %lldM chunk_size_size: %lldM\n",
+ gran_size >> 20, chunk_size >> 20);
+ if (i >= NUM_RESULT)
+ continue;
+
+ /* convert ranges to var ranges state */
+ num_reg = x86_setup_var_mtrrs(range, nr_range,
+ chunk_size, gran_size);
+
+ /* we got new setting in range_state, check it */
+ memset(range_new, 0, sizeof(range_new));
+ nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+ extra_remove_base, extra_remove_size);
+ range_sums_new = sum_ranges(range_new, nr_range_new);
+
+ result[i].chunk_sizek = chunk_size >> 10;
+ result[i].gran_sizek = gran_size >> 10;
+ result[i].num_reg = num_reg;
+ if (range_sums < range_sums_new) {
+ result[i].lose_cover_sizek =
+ (range_sums_new - range_sums) << PSHIFT;
+ result[i].bad = 1;
+ } else
+ result[i].lose_cover_sizek =
+ (range_sums - range_sums_new) << PSHIFT;
+
+ /* double check it */
+ if (!result[i].bad && !result[i].lose_cover_sizek) {
+ if (nr_range_new != nr_range ||
+ memcmp(range, range_new, sizeof(range)))
+ result[i].bad = 1;
+ }
+
+ if (!result[i].bad && (range_sums - range_sums_new <
+ min_loss_pfn[num_reg])) {
+ min_loss_pfn[num_reg] =
+ range_sums - range_sums_new;
+ }
+ i++;
+ }
+ }
+
+ /* print out all */
+ for (i = 0; i < NUM_RESULT; i++) {
+ printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
+ result[i].num_reg, result[i].bad?"-":"",
+ result[i].lose_cover_sizek >> 10);
+ }
+
+ /* try to find the optimal index */
+ if (nr_mtrr_spare_reg >= num_var_ranges)
+ nr_mtrr_spare_reg = num_var_ranges - 1;
+ num_reg_good = -1;
+ for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+ if (!min_loss_pfn[i]) {
+ num_reg_good = i;
+ break;
+ }
+ }
+
+ index_good = -1;
+ if (num_reg_good != -1) {
+ for (i = 0; i < NUM_RESULT; i++) {
+ if (!result[i].bad &&
+ result[i].num_reg == num_reg_good &&
+ !result[i].lose_cover_sizek) {
+ index_good = i;
+ break;
+ }
+ }
+ }
+
+ if (index_good != -1) {
+ printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+ i = index_good;
+ printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
+ result[i].gran_sizek >> 10,
+ result[i].chunk_sizek >> 10);
+ printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
+ result[i].num_reg,
+ result[i].lose_cover_sizek >> 10);
+ /* convert ranges to var ranges state */
+ chunk_size = result[i].chunk_sizek;
+ chunk_size <<= 10;
+ gran_size = result[i].gran_sizek;
+ gran_size <<= 10;
+ debug_print = 1;
+ x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+ set_var_mtrr_all(address_bits);
+ return 1;
+ }
+
+ printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+ printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+ return 0;
+}
+#else
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+ return 0;
+}
+#endif
+
+static int __initdata changed_by_mtrr_cleanup;
+
static int disable_mtrr_trim;
static int __init disable_mtrr_trim_setup(char *str)
@@ -648,6 +1430,19 @@ int __init amd_special_default_mtrr(void)
return 0;
}
+static u64 __init real_trim_memory(unsigned long start_pfn,
+ unsigned long limit_pfn)
+{
+ u64 trim_start, trim_size;
+ trim_start = start_pfn;
+ trim_start <<= PAGE_SHIFT;
+ trim_size = limit_pfn;
+ trim_size <<= PAGE_SHIFT;
+ trim_size -= trim_start;
+
+ return e820_update_range(trim_start, trim_size, E820_RAM,
+ E820_RESERVED);
+}
/**
* mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
* @end_pfn: ending page frame number
@@ -663,8 +1458,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
{
unsigned long i, base, size, highest_pfn = 0, def, dummy;
mtrr_type type;
- u64 trim_start, trim_size;
+ int nr_range;
+ u64 total_trim_size;
+ /* extra one for all 0 */
+ int num[MTRR_NUM_TYPES + 1];
/*
* Make sure we only trim uncachable memory on machines that
* support the Intel MTRR architecture:
@@ -676,14 +1474,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
if (def != MTRR_TYPE_UNCACHABLE)
return 0;
- if (amd_special_default_mtrr())
- return 0;
+ /* get it and store it aside */
+ memset(range_state, 0, sizeof(range_state));
+ for (i = 0; i < num_var_ranges; i++) {
+ mtrr_if->get(i, &base, &size, &type);
+ range_state[i].base_pfn = base;
+ range_state[i].size_pfn = size;
+ range_state[i].type = type;
+ }
/* Find highest cached pfn */
for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i, &base, &size, &type);
+ type = range_state[i].type;
if (type != MTRR_TYPE_WRBACK)
continue;
+ base = range_state[i].base_pfn;
+ size = range_state[i].size_pfn;
if (highest_pfn < base + size)
highest_pfn = base + size;
}
@@ -698,22 +1504,65 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
return 0;
}
- if (highest_pfn < end_pfn) {
+ /* check entries number */
+ memset(num, 0, sizeof(num));
+ for (i = 0; i < num_var_ranges; i++) {
+ type = range_state[i].type;
+ if (type >= MTRR_NUM_TYPES)
+ continue;
+ size = range_state[i].size_pfn;
+ if (!size)
+ type = MTRR_NUM_TYPES;
+ num[type]++;
+ }
+
+ /* no entry for WB? */
+ if (!num[MTRR_TYPE_WRBACK])
+ return 0;
+
+ /* check if we only had WB and UC */
+ if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+ num_var_ranges - num[MTRR_NUM_TYPES])
+ return 0;
+
+ memset(range, 0, sizeof(range));
+ nr_range = 0;
+ if (mtrr_tom2) {
+ range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+ range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+ if (highest_pfn < range[nr_range].end + 1)
+ highest_pfn = range[nr_range].end + 1;
+ nr_range++;
+ }
+ nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+ total_trim_size = 0;
+ /* check the head */
+ if (range[0].start)
+ total_trim_size += real_trim_memory(0, range[0].start);
+ /* check the holes */
+ for (i = 0; i < nr_range - 1; i++) {
+ if (range[i].end + 1 < range[i+1].start)
+ total_trim_size += real_trim_memory(range[i].end + 1,
+ range[i+1].start);
+ }
+ /* check the top */
+ i = nr_range - 1;
+ if (range[i].end + 1 < end_pfn)
+ total_trim_size += real_trim_memory(range[i].end + 1,
+ end_pfn);
+
+ if (total_trim_size) {
printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
- " all of memory, losing %luMB of RAM.\n",
- (end_pfn - highest_pfn) >> (20 - PAGE_SHIFT));
+ " all of memory, losing %lluMB of RAM.\n",
+ total_trim_size >> 20);
- WARN_ON(1);
+ if (!changed_by_mtrr_cleanup)
+ WARN_ON(1);
printk(KERN_INFO "update e820 for mtrr\n");
- trim_start = highest_pfn;
- trim_start <<= PAGE_SHIFT;
- trim_size = end_pfn;
- trim_size <<= PAGE_SHIFT;
- trim_size -= trim_start;
- update_memory_range(trim_start, trim_size, E820_RAM,
- E820_RESERVED);
update_e820();
+
return 1;
}
@@ -729,18 +1578,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
*/
void __init mtrr_bp_init(void)
{
+ u32 phys_addr;
init_ifs();
+ phys_addr = 32;
+
if (cpu_has_mtrr) {
mtrr_if = &generic_mtrr_ops;
size_or_mask = 0xff000000; /* 36 bits */
size_and_mask = 0x00f00000;
+ phys_addr = 36;
/* This is an AMD specific MSR, but we assume(hope?) that
Intel will implement it to when they extend the address
bus of the Xeon. */
if (cpuid_eax(0x80000000) >= 0x80000008) {
- u32 phys_addr;
phys_addr = cpuid_eax(0x80000008) & 0xff;
/* CPUID workaround for Intel 0F33/0F34 CPU */
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -758,6 +1610,7 @@ void __init mtrr_bp_init(void)
don't support PAE */
size_or_mask = 0xfff00000; /* 32 bits */
size_and_mask = 0;
+ phys_addr = 32;
}
} else {
switch (boot_cpu_data.x86_vendor) {
@@ -791,8 +1644,15 @@ void __init mtrr_bp_init(void)
if (mtrr_if) {
set_num_var_ranges();
init_table();
- if (use_intel())
+ if (use_intel()) {
get_mtrr_state();
+
+ if (mtrr_cleanup(phys_addr)) {
+ changed_by_mtrr_cleanup = 1;
+ mtrr_if->set_all();
+ }
+
+ }
}
}
@@ -829,9 +1689,10 @@ static int __init mtrr_init_finialize(void)
{
if (!mtrr_if)
return 0;
- if (use_intel())
- mtrr_state_warn();
- else {
+ if (use_intel()) {
+ if (!changed_by_mtrr_cleanup)
+ mtrr_state_warn();
+ } else {
/* The CPUs haven't MTRR and seem to not support SMP. They have
* specific drivers, we use a tricky method to support
* suspend/resume for them.
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2cc77eb6fea3..2dc4ec656b23 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt);
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
+void fill_mtrr_var_range(unsigned int index,
+ u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
void get_mtrr_state(void);
extern void set_mtrr_ops(struct mtrr_ops * ops);
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if;
#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
extern unsigned int num_var_ranges;
+extern u64 mtrr_tom2;
void mtrr_state_warn(void);
const char *mtrr_attrib_to_str(int x);
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820.c
index af1eb0789740..7b613d2efb04 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820.c
@@ -17,172 +17,30 @@
#include <linux/kexec.h>
#include <linux/module.h>
#include <linux/mm.h>
-#include <linux/suspend.h>
#include <linux/pfn.h>
+#include <linux/suspend.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/setup.h>
-#include <asm/sections.h>
-#include <asm/kdebug.h>
#include <asm/trampoline.h>
struct e820map e820;
-/*
- * PFN of last memory page.
- */
-unsigned long end_pfn;
-
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_pfn_mapped;
-
-/*
- * Last pfn which the user wants to use.
- */
-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
-
-/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 20
-
-struct early_res {
- unsigned long start, end;
- char name[16];
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
- { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#ifdef CONFIG_X86_TRAMPOLINE
- { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
#endif
- {}
-};
-
-void __init reserve_early(unsigned long start, unsigned long end, char *name)
-{
- int i;
- struct early_res *r;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (end > r->start && start < r->end)
- panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
- start, end - 1, name?name:"", r->start, r->end - 1, r->name);
- }
- if (i >= MAX_EARLY_RES)
- panic("Too many early reservations");
- r = &early_res[i];
- r->start = start;
- r->end = end;
- if (name)
- strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-void __init free_early(unsigned long start, unsigned long end)
-{
- struct early_res *r;
- int i, j;
-
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- r = &early_res[i];
- if (start == r->start && end == r->end)
- break;
- }
- if (i >= MAX_EARLY_RES || !early_res[i].end)
- panic("free_early on not reserved area: %lx-%lx!", start, end);
- for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
- ;
-
- memmove(&early_res[i], &early_res[i + 1],
- (j - 1 - i) * sizeof(struct early_res));
-
- early_res[j - 1].end = 0;
-}
-
-void __init early_res_to_bootmem(unsigned long start, unsigned long end)
-{
- int i;
- unsigned long final_start, final_end;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- final_start = max(start, r->start);
- final_end = min(end, r->end);
- if (final_start >= final_end)
- continue;
- printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
- final_start, final_end - 1, r->name);
- reserve_bootmem_generic(final_start, final_end - final_start,
- BOOTMEM_DEFAULT);
- }
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
-{
- int i;
- unsigned long addr = *addrp, last;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last >= r->start && addr < r->end) {
- *addrp = addr = round_up(r->end, align);
- changed = 1;
- goto again;
- }
- }
- return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
-{
- int i;
- unsigned long addr = *addrp, last;
- unsigned long size = *sizep;
- int changed = 0;
-again:
- last = addr + size;
- for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
- struct early_res *r = &early_res[i];
- if (last > r->start && addr < r->start) {
- size = r->start - addr;
- changed = 1;
- goto again;
- }
- if (last > r->end && addr < r->end) {
- addr = round_up(r->end, align);
- size = last - addr;
- changed = 1;
- goto again;
- }
- if (last <= r->end && addr >= r->start) {
- (*sizep)++;
- return 0;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
/*
* This function checks if any part of the range <start,end> is mapped
* with type.
*/
int
-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
+e820_any_mapped(u64 start, u64 end, unsigned type)
{
int i;
@@ -205,8 +63,7 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
* Note: this function only works correct if the e820 table is sorted and
* not-overlapping, which is the case
*/
-int __init e820_all_mapped(unsigned long start, unsigned long end,
- unsigned type)
+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
{
int i;
@@ -235,214 +92,13 @@ int __init e820_all_mapped(unsigned long start, unsigned long end,
}
/*
- * Find a free area with specified alignment in a specific range.
- */
-unsigned long __init find_e820_area(unsigned long start, unsigned long end,
- unsigned long size, unsigned long align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long addr, last;
- unsigned long ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- while (bad_addr(&addr, size, align) && addr+size <= ei_last)
- ;
- last = addr + size;
- if (last > ei_last)
- continue;
- if (last > end)
- continue;
- return addr;
- }
- return -1UL;
-}
-
-/*
- * Find next free range after *start
- */
-unsigned long __init find_e820_area_size(unsigned long start,
- unsigned long *sizep,
- unsigned long align)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long addr, last;
- unsigned long ei_last;
-
- if (ei->type != E820_RAM)
- continue;
- addr = round_up(ei->addr, align);
- ei_last = ei->addr + ei->size;
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (bad_addr_size(&addr, sizep, align) &&
- addr + *sizep <= ei_last)
- ;
- last = addr + *sizep;
- if (last > ei_last)
- continue;
- return addr;
- }
- return -1UL;
-
-}
-/*
- * Find the highest page frame number we have available
- */
-unsigned long __init e820_end_of_ram(void)
-{
- unsigned long end_pfn;
-
- end_pfn = find_max_pfn_with_active_regions();
-
- if (end_pfn > max_pfn_mapped)
- max_pfn_mapped = end_pfn;
- if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
- max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
- if (end_pfn > end_user_pfn)
- end_pfn = end_user_pfn;
- if (end_pfn > max_pfn_mapped)
- end_pfn = max_pfn_mapped;
-
- printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
- return end_pfn;
-}
-
-/*
- * Mark e820 reserved areas as busy for the resource manager.
- */
-void __init e820_reserve_resources(void)
-{
- int i;
- struct resource *res;
-
- res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
- for (i = 0; i < e820.nr_map; i++) {
- switch (e820.map[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820.map[i].addr;
- res->end = res->start + e820.map[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- insert_resource(&iomem_resource, res);
- res++;
- }
-}
-
-/*
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for software
- * suspend and suspend to RAM.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
- int i;
- unsigned long paddr;
-
- paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (paddr < ei->addr)
- register_nosave_region(PFN_DOWN(paddr),
- PFN_UP(ei->addr));
-
- paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (ei->type != E820_RAM)
- register_nosave_region(PFN_UP(ei->addr),
- PFN_DOWN(paddr));
-
- if (paddr >= (end_pfn << PAGE_SHIFT))
- break;
- }
-}
-
-/*
- * Finds an active region in the address range from start_pfn to end_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-static int __init e820_find_active_region(const struct e820entry *ei,
- unsigned long start_pfn,
- unsigned long end_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
- *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Check if max_pfn_mapped should be updated */
- if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
- max_pfn_mapped = *ei_endpfn;
-
- /* Skip if map is outside the node */
- if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
- *ei_startpfn >= end_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > end_pfn)
- *ei_endpfn = end_pfn;
-
- /* Obey end_user_pfn to save on memmap */
- if (*ei_startpfn >= end_user_pfn)
- return 0;
- if (*ei_endpfn > end_user_pfn)
- *ei_endpfn = end_user_pfn;
-
- return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init
-e820_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- int i;
-
- for (i = 0; i < e820.nr_map; i++)
- if (e820_find_active_region(&e820.map[i],
- start_pfn, end_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
* Add a memory region to the kernel e820 map.
*/
-void __init add_memory_region(unsigned long start, unsigned long size, int type)
+void __init e820_add_region(u64 start, u64 size, int type)
{
int x = e820.nr_map;
- if (x == E820MAX) {
+ if (x == ARRAY_SIZE(e820.map)) {
printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
return;
}
@@ -453,28 +109,7 @@ void __init add_memory_region(unsigned long start, unsigned long size, int type)
e820.nr_map++;
}
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long end_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- if (e820_find_active_region(&e820.map[i],
- start_pfn, end_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
- }
- return end - start - (ram << PAGE_SHIFT);
-}
-
-static void __init e820_print_map(char *who)
+void __init e820_print_map(char *who)
{
int i;
@@ -507,19 +142,75 @@ static void __init e820_print_map(char *who)
* Sanitize the BIOS e820 map.
*
* Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
+ * replaces the original e820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
+ *
+ * The input parameter biosmap points to an array of 'struct
+ * e820entry' which on entry has elements in the range [0, *pnr_map)
+ * valid, and which has space for up to max_nr_map entries.
+ * On return, the resulting sanitized e820 map entries will be in
+ * overwritten in the same location, starting at biosmap.
+ *
+ * The integer pointed to by pnr_map must be valid on entry (the
+ * current number of valid entries located at biosmap) and will
+ * be updated on return, with the new number of valid entries
+ * (something no more than max_nr_map.)
+ *
+ * The return value from sanitize_e820_map() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ * Visually we're performing the following
+ * (1,2,3,4 = memory types)...
+ *
+ * Sample memory map (w/overlaps):
+ * ____22__________________
+ * ______________________4_
+ * ____1111________________
+ * _44_____________________
+ * 11111111________________
+ * ____________________33__
+ * ___________44___________
+ * __________33333_________
+ * ______________22________
+ * ___________________2222_
+ * _________111111111______
+ * _____________________11_
+ * _________________4______
*
+ * Sanitized equivalent (no overlap):
+ * 1_______________________
+ * _44_____________________
+ * ___1____________________
+ * ____22__________________
+ * ______11________________
+ * _________1______________
+ * __________3_____________
+ * ___________44___________
+ * _____________33_________
+ * _______________2________
+ * ________________1_______
+ * _________________4______
+ * ___________________2____
+ * ____________________33__
+ * ______________________4_
*/
-static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
+
+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+ int *pnr_map)
{
struct change_member {
struct e820entry *pbios; /* pointer to original bios entry */
unsigned long long addr; /* address for this change point */
};
- static struct change_member change_point_list[2*E820MAX] __initdata;
- static struct change_member *change_point[2*E820MAX] __initdata;
- static struct e820entry *overlap_list[E820MAX] __initdata;
- static struct e820entry new_bios[E820MAX] __initdata;
+static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+static struct change_member *change_point[2*E820_X_MAX] __initdata;
+static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+static struct e820entry new_bios[E820_X_MAX] __initdata;
struct change_member *change_tmp;
unsigned long current_type, last_type;
unsigned long long last_addr;
@@ -529,48 +220,12 @@ static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
int old_nr, new_nr, chg_nr;
int i;
- /*
- Visually we're performing the following
- (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
-
/* if there's only one memory region, don't bother */
if (*pnr_map < 2)
return -1;
old_nr = *pnr_map;
+ BUG_ON(old_nr > max_nr_map);
/* bail out if we find any unreasonable addresses in bios map */
for (i = 0; i < old_nr; i++)
@@ -682,7 +337,7 @@ static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
* no more space left for new
* bios entries ?
*/
- if (++new_bios_entry >= E820MAX)
+ if (++new_bios_entry >= max_nr_map)
break;
}
if (current_type != 0) {
@@ -704,22 +359,9 @@ static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
return 0;
}
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- */
-static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init __copy_e820_map(struct e820entry *biosmap, int nr_map)
{
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-
- do {
+ while (nr_map) {
u64 start = biosmap->addr;
u64 size = biosmap->size;
u64 end = start + size;
@@ -729,111 +371,37 @@ static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
if (start > end)
return -1;
- add_memory_region(start, size, type);
- } while (biosmap++, --nr_map);
- return 0;
-}
-
-static void early_panic(char *msg)
-{
- early_printk(msg);
- panic(msg);
-}
-
-/* We're not void only for x86 32-bit compat */
-char * __init machine_specific_memory_setup(void)
-{
- char *who = "BIOS-e820";
- /*
- * Try to copy the BIOS-supplied E820-map.
- *
- * Otherwise fake a memory map; one section from 0k->640k,
- * the next section from 1mb->appropriate_mem_k
- */
- sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
- if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
- early_panic("Cannot find a valid memory map");
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- e820_print_map(who);
-
- /* In case someone cares... */
- return who;
-}
+ e820_add_region(start, size, type);
-static int __init parse_memopt(char *p)
-{
- if (!p)
- return -EINVAL;
- end_user_pfn = memparse(p, &p);
- end_user_pfn >>= PAGE_SHIFT;
- return 0;
-}
-early_param("mem", parse_memopt);
-
-static int userdef __initdata;
-
-static int __init parse_memmap_opt(char *p)
-{
- char *oldp;
- unsigned long long start_at, mem_size;
-
- if (!strcmp(p, "exactmap")) {
-#ifdef CONFIG_CRASH_DUMP
- /*
- * If we are doing a crash dump, we still need to know
- * the real mem size before original memory map is
- * reset.
- */
- e820_register_active_regions(0, 0, -1UL);
- saved_max_pfn = e820_end_of_ram();
- remove_all_active_ranges();
-#endif
- max_pfn_mapped = 0;
- e820.nr_map = 0;
- userdef = 1;
- return 0;
+ biosmap++;
+ nr_map--;
}
-
- oldp = p;
- mem_size = memparse(p, &p);
- if (p == oldp)
- return -EINVAL;
-
- userdef = 1;
- if (*p == '@') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*p == '#') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*p == '$') {
- start_at = memparse(p+1, &p);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- end_user_pfn = (mem_size >> PAGE_SHIFT);
- }
- return *p == '\0' ? 0 : -EINVAL;
+ return 0;
}
-early_param("memmap", parse_memmap_opt);
-void __init finish_e820_parsing(void)
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ */
+int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
{
- if (userdef) {
- char nr = e820.nr_map;
-
- if (sanitize_e820_map(e820.map, &nr) < 0)
- early_panic("Invalid user supplied memory map");
- e820.nr_map = nr;
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
- printk(KERN_INFO "user-defined physical RAM map:\n");
- e820_print_map("user");
- }
+ return __copy_e820_map(biosmap, nr_map);
}
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
int i;
+ u64 real_updated_size = 0;
BUG_ON(old_type == new_type);
@@ -843,8 +411,10 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
if (ei->type != old_type)
continue;
/* totally covered? */
- if (ei->addr >= start && ei->size <= size) {
+ if (ei->addr >= start &&
+ (ei->addr + ei->size) <= (start + size)) {
ei->type = new_type;
+ real_updated_size += ei->size;
continue;
}
/* partially covered */
@@ -852,26 +422,25 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
final_end = min(start + size, ei->addr + ei->size);
if (final_start >= final_end)
continue;
- add_memory_region(final_start, final_end - final_start,
+ e820_add_region(final_start, final_end - final_start,
new_type);
+ real_updated_size += final_end - final_start;
}
+ return real_updated_size;
}
void __init update_e820(void)
{
- u8 nr_map;
+ int nr_map;
nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, &nr_map))
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
return;
e820.nr_map = nr_map;
printk(KERN_INFO "modified physical RAM map:\n");
e820_print_map("modified");
}
-unsigned long pci_mem_start = 0xaeedbabe;
-EXPORT_SYMBOL(pci_mem_start);
-
/*
* Search for the biggest gap in the low 32 bits of the e820
* memory space. We pass this space to PCI to assign MMIO resources
@@ -881,7 +450,7 @@ EXPORT_SYMBOL(pci_mem_start);
__init void e820_setup_gap(void)
{
unsigned long gapstart, gapsize, round;
- unsigned long last;
+ unsigned long long last;
int i;
int found = 0;
@@ -910,6 +479,7 @@ __init void e820_setup_gap(void)
last = start;
}
+#ifdef CONFIG_X86_64
if (!found) {
gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
@@ -917,6 +487,7 @@ __init void e820_setup_gap(void)
KERN_ERR "PCI: Unassigned devices with 32bit resource "
"registers may break!\n");
}
+#endif
/*
* See how much we want to round up: start off with
@@ -933,6 +504,586 @@ __init void e820_setup_gap(void)
pci_mem_start, gapstart, gapsize);
}
+/**
+ * Because of the size limitation of struct boot_params, only first
+ * 128 E820 memory entries are passed to kernel via
+ * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
+ * linked list of struct setup_data, which is parsed here.
+ */
+void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
+{
+ u32 map_len;
+ int entries;
+ struct e820entry *extmap;
+
+ entries = sdata->len / sizeof(struct e820entry);
+ map_len = sdata->len + sizeof(struct setup_data);
+ if (map_len > PAGE_SIZE)
+ sdata = early_ioremap(pa_data, map_len);
+ extmap = (struct e820entry *)(sdata->data);
+ __copy_e820_map(extmap, entries);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ if (map_len > PAGE_SIZE)
+ early_iounmap(sdata, map_len);
+ printk(KERN_INFO "extended physical RAM map:\n");
+ e820_print_map("extended");
+}
+
+#if defined(CONFIG_X86_64) || \
+ (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+/**
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+ int i;
+ unsigned long pfn;
+
+ pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (pfn < PFN_UP(ei->addr))
+ register_nosave_region(pfn, PFN_UP(ei->addr));
+
+ pfn = PFN_DOWN(ei->addr + ei->size);
+ if (ei->type != E820_RAM)
+ register_nosave_region(PFN_UP(ei->addr), pfn);
+
+ if (pfn >= limit_pfn)
+ break;
+ }
+}
+#endif
+
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20
+
+struct early_res {
+ u64 start, end;
+ char name[16];
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+ { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
+ /*
+ * But first pinch a few for the stack/trampoline stuff
+ * FIXME: Don't need the extra page at 4K, but need to fix
+ * trampoline before removing it. (see the GDT stuff)
+ */
+ { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
+ /*
+ * Has to be in very low memory so we can execute
+ * real-mode AP code.
+ */
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
+#endif
+ {}
+};
+
+static int __init find_overlapped_early(u64 start, u64 end)
+{
+ int i;
+ struct early_res *r;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (end > r->start && start < r->end)
+ break;
+ }
+
+ return i;
+}
+
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+ int i;
+ struct early_res *r;
+
+ i = find_overlapped_early(start, end);
+ if (i >= MAX_EARLY_RES)
+ panic("Too many early reservations");
+ r = &early_res[i];
+ if (r->end)
+ panic("Overlapping early reservations "
+ "%llx-%llx %s to %llx-%llx %s\n",
+ start, end - 1, name?name:"", r->start,
+ r->end - 1, r->name);
+ r->start = start;
+ r->end = end;
+ if (name)
+ strncpy(r->name, name, sizeof(r->name) - 1);
+}
+
+void __init free_early(u64 start, u64 end)
+{
+ struct early_res *r;
+ int i, j;
+
+ i = find_overlapped_early(start, end);
+ r = &early_res[i];
+ if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
+ panic("free_early on not reserved area: %llx-%llx!",
+ start, end - 1);
+
+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ ;
+
+ memmove(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+}
+
+void __init early_res_to_bootmem(u64 start, u64 end)
+{
+ int i;
+ u64 final_start, final_end;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end)
+ continue;
+ printk(KERN_INFO " early res: %d [%llx-%llx] %s\n", i,
+ final_start, final_end - 1, r->name);
+ reserve_bootmem_generic(final_start, final_end - final_start,
+ BOOTMEM_DEFAULT);
+ }
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+ int i;
+ u64 addr = *addrp;
+ int changed = 0;
+ struct early_res *r;
+again:
+ i = find_overlapped_early(addr, addr + size);
+ r = &early_res[i];
+ if (i < MAX_EARLY_RES && r->end) {
+ *addrp = addr = round_up(r->end, align);
+ changed = 1;
+ goto again;
+ }
+ return changed;
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+ int i;
+ u64 addr = *addrp, last;
+ u64 size = *sizep;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last > r->start && addr < r->start) {
+ size = r->start - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last > r->end && addr < r->end) {
+ addr = round_up(r->end, align);
+ size = last - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last <= r->end && addr >= r->start) {
+ (*sizep)++;
+ return 0;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+ ;
+ last = addr + size;
+ if (last > ei_last)
+ continue;
+ if (last > end)
+ continue;
+ return addr;
+ }
+ return -1ULL;
+}
+
+/*
+ * Find next free range after *start
+ */
+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ u64 addr, last;
+ u64 ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) &&
+ addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ continue;
+ return addr;
+ }
+ return -1UL;
+
+}
+
+/*
+ * pre allocated 4k and reserved it in e820
+ */
+u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+{
+ u64 size = 0;
+ u64 addr;
+ u64 start;
+
+ start = startt;
+ while (size < sizet)
+ start = find_e820_area_size(start, &size, align);
+
+ if (size < sizet)
+ return 0;
+
+ addr = round_down(start + size - sizet, align);
+ e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
+ printk(KERN_INFO "update e820 for early_reserve_e820\n");
+ update_e820();
+
+ return addr;
+}
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT))
+# else
+# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
+#endif
+
+/*
+ * Last pfn which the user wants to use.
+ */
+unsigned long __initdata end_user_pfn = MAX_ARCH_PFN;
+
+/*
+ * Find the highest page frame number we have available
+ */
+unsigned long __init e820_end_of_ram(void)
+{
+ unsigned long last_pfn;
+ unsigned long max_arch_pfn = MAX_ARCH_PFN;
+
+ last_pfn = find_max_pfn_with_active_regions();
+
+ if (last_pfn > max_arch_pfn)
+ last_pfn = max_arch_pfn;
+ if (last_pfn > end_user_pfn)
+ last_pfn = end_user_pfn;
+
+ printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n",
+ last_pfn, max_arch_pfn);
+ return last_pfn;
+}
+
+/*
+ * Finds an active region in the address range from start_pfn to last_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+int __init e820_find_active_region(const struct e820entry *ei,
+ unsigned long start_pfn,
+ unsigned long last_pfn,
+ unsigned long *ei_startpfn,
+ unsigned long *ei_endpfn)
+{
+ u64 align = PAGE_SIZE;
+
+ *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
+ *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
+
+ /* Skip map entries smaller than a page */
+ if (*ei_startpfn >= *ei_endpfn)
+ return 0;
+
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+ *ei_startpfn >= last_pfn)
+ return 0;
+
+ /* Check for overlaps */
+ if (*ei_startpfn < start_pfn)
+ *ei_startpfn = start_pfn;
+ if (*ei_endpfn > last_pfn)
+ *ei_endpfn = last_pfn;
+
+ /* Obey end_user_pfn to save on memmap */
+ if (*ei_startpfn >= end_user_pfn)
+ return 0;
+ if (*ei_endpfn > end_user_pfn)
+ *ei_endpfn = end_user_pfn;
+
+ return 1;
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long last_pfn)
+{
+ unsigned long ei_startpfn;
+ unsigned long ei_endpfn;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++)
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+u64 __init e820_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long last_pfn = end >> PAGE_SHIFT;
+ unsigned long ei_startpfn, ei_endpfn, ram = 0;
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ if (e820_find_active_region(&e820.map[i],
+ start_pfn, last_pfn,
+ &ei_startpfn, &ei_endpfn))
+ ram += ei_endpfn - ei_startpfn;
+ }
+ return end - start - ((u64)ram << PAGE_SHIFT);
+}
+
+static void early_panic(char *msg)
+{
+ early_printk(msg);
+ panic(msg);
+}
+
+/* "mem=nopentium" disables the 4MB page tables. */
+static int __init parse_memopt(char *p)
+{
+ u64 mem_size;
+
+ if (!p)
+ return -EINVAL;
+
+#ifdef CONFIG_X86_32
+ if (!strcmp(p, "nopentium")) {
+ setup_clear_cpu_cap(X86_FEATURE_PSE);
+ return 0;
+ }
+#endif
+
+ mem_size = memparse(p, &p);
+ end_user_pfn = mem_size>>PAGE_SHIFT;
+ return 0;
+}
+early_param("mem", parse_memopt);
+
+static int userdef __initdata;
+
+static int __init parse_memmap_opt(char *p)
+{
+ char *oldp;
+ u64 start_at, mem_size;
+
+ if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+ /*
+ * If we are doing a crash dump, we still need to know
+ * the real mem size before original memory map is
+ * reset.
+ */
+ e820_register_active_regions(0, 0, -1UL);
+ saved_max_pfn = e820_end_of_ram();
+ remove_all_active_ranges();
+#endif
+ e820.nr_map = 0;
+ userdef = 1;
+ return 0;
+ }
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
+
+ userdef = 1;
+ if (*p == '@') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_RAM);
+ } else if (*p == '#') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_ACPI);
+ } else if (*p == '$') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_RESERVED);
+ } else {
+ end_user_pfn = (mem_size >> PAGE_SHIFT);
+ }
+ return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+void __init finish_e820_parsing(void)
+{
+ if (userdef) {
+ int nr = e820.nr_map;
+
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+ early_panic("Invalid user supplied memory map");
+ e820.nr_map = nr;
+
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ e820_print_map("user");
+ }
+}
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(void)
+{
+ int i;
+ struct resource *res;
+
+ res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+ for (i = 0; i < e820.nr_map; i++) {
+ switch (e820.map[i].type) {
+ case E820_RAM: res->name = "System RAM"; break;
+ case E820_ACPI: res->name = "ACPI Tables"; break;
+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
+ default: res->name = "reserved";
+ }
+ res->start = e820.map[i].addr;
+ res->end = res->start + e820.map[i].size - 1;
+#ifndef CONFIG_RESOURCES_64BIT
+ if (res->end > 0x100000000ULL) {
+ res++;
+ continue;
+ }
+#endif
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+ res++;
+ }
+}
+
+char *__init default_machine_specific_memory_setup(void)
+{
+ char *who = "BIOS-e820";
+ int new_nr;
+ /*
+ * Try to copy the BIOS-supplied E820-map.
+ *
+ * Otherwise fake a memory map; one section from 0k->640k,
+ * the next section from 1mb->appropriate_mem_k
+ */
+ new_nr = boot_params.e820_entries;
+ sanitize_e820_map(boot_params.e820_map,
+ ARRAY_SIZE(boot_params.e820_map),
+ &new_nr);
+ boot_params.e820_entries = new_nr;
+ if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) {
+ u64 mem_size;
+
+ /* compare results from other methods and take the greater */
+ if (boot_params.alt_mem_k
+ < boot_params.screen_info.ext_mem_k) {
+ mem_size = boot_params.screen_info.ext_mem_k;
+ who = "BIOS-88";
+ } else {
+ mem_size = boot_params.alt_mem_k;
+ who = "BIOS-e801";
+ }
+
+ e820.nr_map = 0;
+ e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+ e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+ }
+
+ /* In case someone cares... */
+ return who;
+}
+
+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+{
+ return default_machine_specific_memory_setup();
+}
+
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __init __attribute__((weak)) memory_setup(void)
+{
+ return machine_specific_memory_setup();
+}
+
+void __init setup_memory_map(void)
+{
+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ e820_print_map(memory_setup());
+}
+
+#ifdef CONFIG_X86_64
int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
{
int i;
@@ -951,3 +1102,4 @@ int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
max_pfn << PAGE_SHIFT) - *addr;
return i + 1;
}
+#endif
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
deleted file mode 100644
index ed733e7cf4e6..000000000000
--- a/arch/x86/kernel/e820_32.c
+++ /dev/null
@@ -1,775 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-
-struct e820map e820;
-struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-extern int user_defined_memmap;
-
-static struct resource system_rom_resource = {
- .name = "System ROM",
- .start = 0xf0000,
- .end = 0xfffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
- .name = "Extension ROM",
- .start = 0xe0000,
- .end = 0xeffff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
- .name = "Adapter ROM",
- .start = 0xc8000,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
- .name = "Adapter ROM",
- .start = 0,
- .end = 0,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
- .name = "Video ROM",
- .start = 0xc0000,
- .end = 0xc7fff,
- .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
- const unsigned short * const ptr = (const unsigned short *)rom;
- unsigned short sig;
-
- return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
- unsigned char sum, c;
-
- for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
- sum += c;
- return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
- const unsigned char *rom;
- unsigned long start, length, upper;
- unsigned char c;
- int i;
-
- /* video rom */
- upper = adapter_rom_resources[0].start;
- for (start = video_rom_resource.start; start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- video_rom_resource.start = start;
-
- if (probe_kernel_address(rom + 2, c) != 0)
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = c * 512;
-
- /* if checksum okay, trust length byte */
- if (length && romchecksum(rom, length))
- video_rom_resource.end = start + length - 1;
-
- request_resource(&iomem_resource, &video_rom_resource);
- break;
- }
-
- start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
- if (start < upper)
- start = upper;
-
- /* system rom */
- request_resource(&iomem_resource, &system_rom_resource);
- upper = system_rom_resource.start;
-
- /* check for extension rom (ignore length byte!) */
- rom = isa_bus_to_virt(extension_rom_resource.start);
- if (romsignature(rom)) {
- length = extension_rom_resource.end - extension_rom_resource.start + 1;
- if (romchecksum(rom, length)) {
- request_resource(&iomem_resource, &extension_rom_resource);
- upper = extension_rom_resource.start;
- }
- }
-
- /* check for adapter roms on 2k boundaries */
- for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
- rom = isa_bus_to_virt(start);
- if (!romsignature(rom))
- continue;
-
- if (probe_kernel_address(rom + 2, c) != 0)
- continue;
-
- /* 0 < length <= 0x7f * 512, historically */
- length = c * 512;
-
- /* but accept any length that fits if checksum okay */
- if (!length || start + length > upper || !romchecksum(rom, length))
- continue;
-
- adapter_rom_resources[i].start = start;
- adapter_rom_resources[i].end = start + length - 1;
- request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
- start = adapter_rom_resources[i++].end & ~2047UL;
- }
-}
-
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-void __init init_iomem_resources(struct resource *code_resource,
- struct resource *data_resource,
- struct resource *bss_resource)
-{
- int i;
-
- probe_roms();
- for (i = 0; i < e820.nr_map; i++) {
- struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
-#endif
- res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
- switch (e820.map[i].type) {
- case E820_RAM: res->name = "System RAM"; break;
- case E820_ACPI: res->name = "ACPI Tables"; break;
- case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
- default: res->name = "reserved";
- }
- res->start = e820.map[i].addr;
- res->end = res->start + e820.map[i].size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- if (request_resource(&iomem_resource, res)) {
- kfree(res);
- continue;
- }
- if (e820.map[i].type == E820_RAM) {
- /*
- * We don't know which RAM region contains kernel data,
- * so we try it repeatedly and let the resource manager
- * test it.
- */
- request_resource(res, code_resource);
- request_resource(res, data_resource);
- request_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
- if (crashk_res.start != crashk_res.end)
- request_resource(res, &crashk_res);
-#endif
- }
- }
-}
-
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-/**
- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
- int i;
- unsigned long pfn;
-
- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
- for (i = 1; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
-
- if (pfn < PFN_UP(ei->addr))
- register_nosave_region(pfn, PFN_UP(ei->addr));
-
- pfn = PFN_DOWN(ei->addr + ei->size);
- if (ei->type != E820_RAM)
- register_nosave_region(PFN_UP(ei->addr), pfn);
-
- if (pfn >= max_low_pfn)
- break;
- }
-}
-#endif
-
-void __init add_memory_region(unsigned long long start,
- unsigned long long size, int type)
-{
- int x;
-
- x = e820.nr_map;
-
- if (x == E820MAX) {
- printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
- return;
- }
-
- e820.map[x].addr = start;
- e820.map[x].size = size;
- e820.map[x].type = type;
- e820.nr_map++;
-} /* add_memory_region */
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
- struct change_member *change_tmp;
- unsigned long current_type, last_type;
- unsigned long long last_addr;
- int chgidx, still_changing;
- int overlap_entries;
- int new_bios_entry;
- int old_nr, new_nr, chg_nr;
- int i;
-
- /*
- Visually we're performing the following (1,2,3,4 = memory types)...
-
- Sample memory map (w/overlaps):
- ____22__________________
- ______________________4_
- ____1111________________
- _44_____________________
- 11111111________________
- ____________________33__
- ___________44___________
- __________33333_________
- ______________22________
- ___________________2222_
- _________111111111______
- _____________________11_
- _________________4______
-
- Sanitized equivalent (no overlap):
- 1_______________________
- _44_____________________
- ___1____________________
- ____22__________________
- ______11________________
- _________1______________
- __________3_____________
- ___________44___________
- _____________33_________
- _______________2________
- ________________1_______
- _________________4______
- ___________________2____
- ____________________33__
- ______________________4_
- */
- /* if there's only one memory region, don't bother */
- if (*pnr_map < 2) {
- return -1;
- }
-
- old_nr = *pnr_map;
-
- /* bail out if we find any unreasonable addresses in bios map */
- for (i=0; i<old_nr; i++)
- if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
- return -1;
- }
-
- /* create pointers for initial change-point information (for sorting) */
- for (i=0; i < 2*old_nr; i++)
- change_point[i] = &change_point_list[i];
-
- /* record all known change-points (starting and ending addresses),
- omitting those that are for empty memory regions */
- chgidx = 0;
- for (i=0; i < old_nr; i++) {
- if (biosmap[i].size != 0) {
- change_point[chgidx]->addr = biosmap[i].addr;
- change_point[chgidx++]->pbios = &biosmap[i];
- change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
- change_point[chgidx++]->pbios = &biosmap[i];
- }
- }
- chg_nr = chgidx; /* true number of change-points */
-
- /* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i=1; i < chg_nr; i++) {
- /* if <current_addr> > <last_addr>, swap */
- /* or, if current=<start_addr> & last=<end_addr>, swap */
- if ((change_point[i]->addr < change_point[i-1]->addr) ||
- ((change_point[i]->addr == change_point[i-1]->addr) &&
- (change_point[i]->addr == change_point[i]->pbios->addr) &&
- (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
- )
- {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing=1;
- }
- }
- }
-
- /* create a new bios memory map, removing overlaps */
- overlap_entries=0; /* number of entries in the overlap table */
- new_bios_entry=0; /* index for creating new bios map entries */
- last_type = 0; /* start with undefined memory type */
- last_addr = 0; /* start with 0 as last starting address */
- /* loop through change-points, determining affect on the new bios map */
- for (chgidx=0; chgidx < chg_nr; chgidx++)
- {
- /* keep track of all overlapping bios entries */
- if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
- {
- /* add map entry to overlap list (> 1 entry implies an overlap) */
- overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
- }
- else
- {
- /* remove entry from list (order independent, so swap with last) */
- for (i=0; i<overlap_entries; i++)
- {
- if (overlap_list[i] == change_point[chgidx]->pbios)
- overlap_list[i] = overlap_list[overlap_entries-1];
- }
- overlap_entries--;
- }
- /* if there are overlapping entries, decide which "type" to use */
- /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
- current_type = 0;
- for (i=0; i<overlap_entries; i++)
- if (overlap_list[i]->type > current_type)
- current_type = overlap_list[i]->type;
- /* continue building up new bios map based on this information */
- if (current_type != last_type) {
- if (last_type != 0) {
- new_bios[new_bios_entry].size =
- change_point[chgidx]->addr - last_addr;
- /* move forward only if the new size was non-zero */
- if (new_bios[new_bios_entry].size != 0)
- if (++new_bios_entry >= E820MAX)
- break; /* no more space left for new bios entries */
- }
- if (current_type != 0) {
- new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
- new_bios[new_bios_entry].type = current_type;
- last_addr=change_point[chgidx]->addr;
- }
- last_type = current_type;
- }
- }
- new_nr = new_bios_entry; /* retain count for new bios entries */
-
- /* copy new bios mapping into original location */
- memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
- *pnr_map = new_nr;
-
- return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory. If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
- /* Only one memory region (or negative)? Ignore it */
- if (nr_map < 2)
- return -1;
-
- do {
- u64 start = biosmap->addr;
- u64 size = biosmap->size;
- u64 end = start + size;
- u32 type = biosmap->type;
-
- /* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
- return -1;
-
- add_memory_region(start, size, type);
- } while (biosmap++, --nr_map);
-
- return 0;
-}
-
-/*
- * Find the highest page frame number we have available
- */
-void __init propagate_e820_map(void)
-{
- int i;
-
- max_pfn = 0;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
- /* RAM? */
- if (e820.map[i].type != E820_RAM)
- continue;
- start = PFN_UP(e820.map[i].addr);
- end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
- if (start >= end)
- continue;
- if (end > max_pfn)
- max_pfn = end;
- memory_present(0, start, end);
- }
-}
-
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long curr_pfn, last_pfn, size;
- /*
- * Reserve usable low memory
- */
- if (e820.map[i].type != E820_RAM)
- continue;
- /*
- * We are rounding up the start address of usable memory:
- */
- curr_pfn = PFN_UP(e820.map[i].addr);
- if (curr_pfn >= max_low_pfn)
- continue;
- /*
- * ... and at the end of the usable range downwards:
- */
- last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
- if (last_pfn > max_low_pfn)
- last_pfn = max_low_pfn;
-
- /*
- * .. finally, did all the rounding and playing
- * around just make the area go away?
- */
- if (last_pfn <= curr_pfn)
- continue;
-
- size = last_pfn - curr_pfn;
- free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
- }
-}
-
-void __init e820_register_memory(void)
-{
- unsigned long gapstart, gapsize, round;
- unsigned long long last;
- int i;
-
- /*
- * Search for the biggest gap in the low 32 bits of the e820
- * memory space.
- */
- last = 0x100000000ull;
- gapstart = 0x10000000;
- gapsize = 0x400000;
- i = e820.nr_map;
- while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
-
- /*
- * Since "last" is at most 4GB, we know we'll
- * fit in 32 bits if this condition is true
- */
- if (last > end) {
- unsigned long gap = last - end;
-
- if (gap > gapsize) {
- gapsize = gap;
- gapstart = end;
- }
- }
- if (start < last)
- last = start;
- }
-
- /*
- * See how much we want to round up: start off with
- * rounding to the next 1MB area.
- */
- round = 0x100000;
- while ((gapsize >> 4) > round)
- round += round;
- /* Fun with two's complement */
- pci_mem_start = (gapstart + round) & -round;
-
- printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
- pci_mem_start, gapstart, gapsize);
-}
-
-void __init print_memory_map(char *who)
-{
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- printk(" %s: %016Lx - %016Lx ", who,
- e820.map[i].addr,
- e820.map[i].addr + e820.map[i].size);
- switch (e820.map[i].type) {
- case E820_RAM: printk("(usable)\n");
- break;
- case E820_RESERVED:
- printk("(reserved)\n");
- break;
- case E820_ACPI:
- printk("(ACPI data)\n");
- break;
- case E820_NVS:
- printk("(ACPI NVS)\n");
- break;
- default: printk("type %u\n", e820.map[i].type);
- break;
- }
- }
-}
-
-void __init limit_regions(unsigned long long size)
-{
- unsigned long long current_addr;
- int i;
-
- print_memory_map("limit_regions start");
- for (i = 0; i < e820.nr_map; i++) {
- current_addr = e820.map[i].addr + e820.map[i].size;
- if (current_addr < size)
- continue;
-
- if (e820.map[i].type != E820_RAM)
- continue;
-
- if (e820.map[i].addr >= size) {
- /*
- * This region starts past the end of the
- * requested size, skip it completely.
- */
- e820.nr_map = i;
- } else {
- e820.nr_map = i + 1;
- e820.map[i].size -= current_addr - size;
- }
- print_memory_map("limit_regions endfor");
- return;
- }
- print_memory_map("limit_regions endfunc");
-}
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
-{
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- const struct e820entry *ei = &e820.map[i];
- if (type && ei->type != type)
- continue;
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- return 1;
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
- /*
- * This function checks if the entire range <start,end> is mapped with type.
- *
- * Note: this function only works correct if the e820 table is sorted and
- * not-overlapping, which is the case
- */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
- u64 start = s;
- u64 end = e;
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- if (type && ei->type != type)
- continue;
- /* is the region (part) in overlap with the current region ?*/
- if (ei->addr >= end || ei->addr + ei->size <= start)
- continue;
- /* if the region is at the beginning of <start,end> we move
- * start to the end of the region since it's ok until there
- */
- if (ei->addr <= start)
- start = ei->addr + ei->size;
- /* if start is now at or beyond end, we're done, full
- * coverage */
- if (start >= end)
- return 1; /* we're done */
- }
- return 0;
-}
-
-static int __init parse_memmap(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- propagate_e820_map();
- saved_max_pfn = max_pfn;
-#endif
- e820.nr_map = 0;
- user_defined_memmap = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long start_at, mem_size;
-
- mem_size = memparse(arg, &arg);
- if (*arg == '@') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*arg == '#') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*arg == '$') {
- start_at = memparse(arg+1, &arg);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- }
- return 0;
-}
-early_param("memmap", parse_memmap);
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
- unsigned new_type)
-{
- int i;
-
- BUG_ON(old_type == new_type);
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- u64 final_start, final_end;
- if (ei->type != old_type)
- continue;
- /* totally covered? */
- if (ei->addr >= start && ei->size <= size) {
- ei->type = new_type;
- continue;
- }
- /* partially covered */
- final_start = max(start, ei->addr);
- final_end = min(start + size, ei->addr + ei->size);
- if (final_start >= final_end)
- continue;
- add_memory_region(final_start, final_end - final_start,
- new_type);
- }
-}
-void __init update_e820(void)
-{
- u8 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, &nr_map))
- return;
- e820.nr_map = nr_map;
- printk(KERN_INFO "modified physical RAM map:\n");
- print_memory_map("modified");
-}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9f51e1ea9e82..84fd9f2a28ff 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -98,17 +98,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
}
-static void __init ati_bugs(int num, int slot, int func)
-{
-#ifdef CONFIG_X86_IO_APIC
- if (timer_over_8254 == 1) {
- timer_over_8254 = 0;
- printk(KERN_INFO
- "ATI board detected. Disabling timer routing over 8254.\n");
- }
-#endif
-}
-
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -126,8 +115,6 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
{ PCI_VENDOR_ID_VIA, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
- { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
- PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
{}
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 77d424cf68b3..473c89fe5073 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -213,6 +213,48 @@ unsigned long efi_get_time(void)
eft.minute, eft.second);
}
+/*
+ * Tell the kernel about the EFI memory map. This might include
+ * more than the max 128 entries that can fit in the e820 legacy
+ * (zeropage) memory map.
+ */
+
+static void __init add_efi_memmap(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+ int e820_type;
+
+ if (md->attribute & EFI_MEMORY_WB)
+ e820_type = E820_RAM;
+ else
+ e820_type = E820_RESERVED;
+ e820_add_region(start, size, e820_type);
+ }
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
+void __init efi_reserve_early(void)
+{
+ unsigned long pmap;
+
+ pmap = boot_params.efi_info.efi_memmap;
+#ifdef CONFIG_X86_64
+ pmap += (__u64)boot_params.efi_info.efi_memmap_hi << 32;
+#endif
+ memmap.phys_map = (void *)pmap;
+ memmap.nr_map = boot_params.efi_info.efi_memmap_size /
+ boot_params.efi_info.efi_memdesc_size;
+ memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
+ memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
+ reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
+ "EFI memmap");
+}
+
#if EFI_DEBUG
static void __init print_efi_memmap(void)
{
@@ -242,21 +284,11 @@ void __init efi_init(void)
int i = 0;
void *tmp;
-#ifdef CONFIG_X86_32
efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
- memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
-#else
- efi_phys.systab = (efi_system_table_t *)
- (boot_params.efi_info.efi_systab |
- ((__u64)boot_params.efi_info.efi_systab_hi<<32));
- memmap.phys_map = (void *)
- (boot_params.efi_info.efi_memmap |
- ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
+#ifdef CONFIG_X86_64
+ efi_phys.systab = (void *)efi_phys.systab +
+ ((__u64)boot_params.efi_info.efi_systab_hi<<32);
#endif
- memmap.nr_map = boot_params.efi_info.efi_memmap_size /
- boot_params.efi_info.efi_memdesc_size;
- memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
- memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
efi.systab = early_ioremap((unsigned long)efi_phys.systab,
sizeof(efi_system_table_t));
@@ -370,6 +402,7 @@ void __init efi_init(void)
if (memmap.desc_size != sizeof(efi_memory_desc_t))
printk(KERN_WARNING "Kernel-defined memdesc"
"doesn't match the one from EFI!\n");
+ add_efi_memmap();
/* Setup for EFI runtime service */
reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d561dd5f1e62..652c5287215f 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -97,14 +97,7 @@ void __init efi_call_phys_epilog(void)
early_runtime_code_mapping_set_exec(0);
}
-void __init efi_reserve_bootmem(void)
-{
- reserve_bootmem_generic((unsigned long)memmap.phys_map,
- memmap.nr_map * memmap.desc_size,
- BOOTMEM_DEFAULT);
-}
-
-void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
{
static unsigned pages_mapped __initdata;
unsigned i, pages;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c778e4fa55a2..159a1c76d2bd 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -51,7 +51,7 @@
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
-#include "irq_vectors.h"
+#include <asm/irq_vectors.h>
/*
* We use macros for low-level operations which need to be overridden
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..e4c5f951e68d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -420,7 +420,6 @@ END(\label)
PTREGSCALL stub_clone, sys_clone, %r8
PTREGSCALL stub_fork, sys_fork, %rdi
PTREGSCALL stub_vfork, sys_vfork, %rdi
- PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
@@ -926,11 +925,11 @@ error_kernelspace:
iret run with kernel gs again, so don't set the user space flag.
B stepping K8s sometimes report an truncated RIP for IRET
exceptions returning to compat mode. Check for these here too. */
- leaq irq_return(%rip),%rbp
- cmpq %rbp,RIP(%rsp)
+ leaq irq_return(%rip),%rcx
+ cmpq %rcx,RIP(%rsp)
je error_swapgs
- movl %ebp,%ebp /* zero extend */
- cmpq %rbp,RIP(%rsp)
+ movl %ecx,%ecx /* zero extend */
+ cmpq %rcx,RIP(%rsp)
je error_swapgs
cmpq $gs_change,RIP(%rsp)
je error_swapgs
@@ -1120,10 +1119,6 @@ ENTRY(coprocessor_segment_overrun)
zeroentry do_coprocessor_segment_overrun
END(coprocessor_segment_overrun)
-ENTRY(reserved)
- zeroentry do_reserved
-END(reserved)
-
/* runs on exception stack */
ENTRY(double_fault)
XCPT_FRAME
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index cbaaf69bedb2..1fa8be5bd217 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -51,7 +51,7 @@ void __init setup_apic_routing(void)
else
#endif
- if (num_possible_cpus() <= 8)
+ if (max_physical_apicid < 8)
genapic = &apic_flat;
else
genapic = &apic_physflat;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index ebf13908a743..45e84acca8a9 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/threads.h>
@@ -55,37 +55,37 @@ static cpumask_t uv_vector_allocation_domain(int cpu)
int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
{
unsigned long val;
- int nasid;
+ int pnode;
- nasid = uv_apicid_to_nasid(phys_apicid);
+ pnode = uv_apicid_to_pnode(phys_apicid);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
(((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_INIT;
- uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
mdelay(10);
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
(((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
APIC_DM_STARTUP;
- uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
return 0;
}
static void uv_send_IPI_one(int cpu, int vector)
{
unsigned long val, apicid, lapicid;
- int nasid;
+ int pnode;
apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */
lapicid = apicid & 0x3f; /* ZZZ macro needed */
- nasid = uv_apicid_to_nasid(apicid);
+ pnode = uv_apicid_to_pnode(apicid);
val =
(1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
UVH_IPI_INT_APIC_ID_SHFT) |
(vector << UVH_IPI_INT_VECTOR_SHFT);
- uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+ uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
static void uv_send_IPI_mask(cpumask_t mask, int vector)
@@ -159,39 +159,81 @@ struct genapic apic_x2apic_uv_x = {
.phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */
};
-static __cpuinit void set_x2apic_extra_bits(int nasid)
+static __cpuinit void set_x2apic_extra_bits(int pnode)
{
- __get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6);
+ __get_cpu_var(x2apic_extra_bits) = (pnode << 6);
}
/*
* Called on boot cpu.
*/
+static __init int boot_pnode_to_blade(int pnode)
+{
+ int blade;
+
+ for (blade = 0; blade < uv_num_possible_blades(); blade++)
+ if (pnode == uv_blade_info[blade].pnode)
+ return blade;
+ BUG();
+}
+
+struct redir_addr {
+ unsigned long redirect;
+ unsigned long alias;
+};
+
+#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
+
+static __initdata struct redir_addr redir_addrs[] = {
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
+};
+
+static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
+{
+ union uvh_si_alias0_overlay_config_u alias;
+ union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) {
+ alias.v = uv_read_local_mmr(redir_addrs[i].alias);
+ if (alias.s.base == 0) {
+ *size = (1UL << alias.s.m_alias);
+ redirect.v = uv_read_local_mmr(redir_addrs[i].redirect);
+ *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT;
+ return;
+ }
+ }
+ BUG();
+}
+
static __init void uv_system_init(void)
{
union uvh_si_addr_map_config_u m_n_config;
- int bytes, nid, cpu, lcpu, nasid, last_nasid, blade;
- unsigned long mmr_base;
+ union uvh_node_id_u node_id;
+ unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
+ int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+ unsigned long mmr_base, present;
m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
+ m_val = m_n_config.s.m_skt;
+ n_val = m_n_config.s.n_skt;
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
- last_nasid = -1;
- for_each_possible_cpu(cpu) {
- nid = cpu_to_node(cpu);
- nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
- if (nasid != last_nasid)
- uv_possible_blades++;
- last_nasid = nasid;
- }
+ for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
+ uv_possible_blades +=
+ hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
uv_blade_info = alloc_bootmem_pages(bytes);
+ get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
+
bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
uv_node_to_blade = alloc_bootmem_pages(bytes);
memset(uv_node_to_blade, 255, bytes);
@@ -200,43 +242,56 @@ static __init void uv_system_init(void)
uv_cpu_to_blade = alloc_bootmem_pages(bytes);
memset(uv_cpu_to_blade, 255, bytes);
- last_nasid = -1;
- blade = -1;
- lcpu = -1;
- for_each_possible_cpu(cpu) {
- nid = cpu_to_node(cpu);
- nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
- if (nasid != last_nasid) {
- blade++;
- lcpu = -1;
- uv_blade_info[blade].nr_posible_cpus = 0;
+ blade = 0;
+ for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+ present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
+ for (j = 0; j < 64; j++) {
+ if (!test_bit(j, &present))
+ continue;
+ uv_blade_info[blade].pnode = (i * 64 + j);
+ uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
+ blade++;
}
- last_nasid = nasid;
- lcpu++;
+ }
- uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt;
- uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt;
+ node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+ gnode_upper = (((unsigned long)node_id.s.node_id) &
+ ~((1 << n_val) - 1)) << m_val;
+
+ for_each_present_cpu(cpu) {
+ nid = cpu_to_node(cpu);
+ pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
+ blade = boot_pnode_to_blade(pnode);
+ lcpu = uv_blade_info[blade].nr_possible_cpus;
+ uv_blade_info[blade].nr_possible_cpus++;
+
+ uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
+ uv_cpu_hub_info(cpu)->lowmem_remap_top =
+ lowmem_redir_base + lowmem_redir_size;
+ uv_cpu_hub_info(cpu)->m_val = m_val;
+ uv_cpu_hub_info(cpu)->n_val = m_val;
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
- uv_cpu_hub_info(cpu)->local_nasid = nasid;
- uv_cpu_hub_info(cpu)->gnode_upper =
- nasid & ~((1 << uv_hub_info->n_val) - 1);
+ uv_cpu_hub_info(cpu)->pnode = pnode;
+ uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
+ uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
+ uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
- uv_blade_info[blade].nasid = nasid;
- uv_blade_info[blade].nr_posible_cpus++;
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
- printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n",
- cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid);
- printk(KERN_DEBUG "UV lcpu %d, blade %d\n", lcpu, blade);
+ printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, "
+ "lcpu %d, blade %d\n",
+ cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
+ lcpu, blade);
}
}
/*
* Called on each cpu to initialize the per_cpu UV data area.
+ * ZZZ hotplug not supported yet
*/
void __cpuinit uv_cpu_init(void)
{
@@ -246,5 +301,5 @@ void __cpuinit uv_cpu_init(void)
uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
- set_x2apic_extra_bits(uv_hub_info->local_nasid);
+ set_x2apic_extra_bits(uv_hub_info->pnode);
}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
new file mode 100644
index 000000000000..a727c0b9819c
--- /dev/null
+++ b/arch/x86/kernel/head.c
@@ -0,0 +1,73 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+void __init reserve_ebda_region(void)
+{
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
+
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
+
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_early(lowmem, 0x100000, "BIOS reserved");
+}
+
+void __init reserve_setup_data(void)
+{
+ struct setup_data *data;
+ u64 pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
+}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3db059058927..fa1d25dd83e3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,7 +8,34 @@
#include <linux/init.h>
#include <linux/start_kernel.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820.h>
+#include <asm/bios_ebda.h>
+
void __init i386_start_kernel(void)
{
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ /* Reserve INITRD */
+ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+ }
+#endif
+ reserve_early(init_pg_tables_start, init_pg_tables_end,
+ "INIT_PG_TABLE");
+
+ reserve_ebda_region();
+
+ /*
+ * At this point everything still needed from the boot loader
+ * or BIOS or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+
start_kernel();
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 4bcb61cd9fcd..c970929bb15d 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -65,74 +65,6 @@ static void __init copy_bootdata(char *real_mode_data)
}
}
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_early(lowmem, 0x100000, "BIOS reserved");
-}
-
-static void __init reserve_setup_data(void)
-{
- struct setup_data *data;
- unsigned long pa_data;
- char buf[32];
-
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_ioremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
- pa_data = data->next;
- early_iounmap(data, sizeof(*data));
- }
-}
-
void __init x86_64_start_kernel(char * real_mode_data)
{
int i;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f7357cc0162c..b98b338aae1a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -194,6 +194,7 @@ default_entry:
xorl %ebx,%ebx /* %ebx is kept at zero */
movl $pa(pg0), %edi
+ movl %edi, pa(init_pg_tables_start)
movl $pa(swapper_pg_pmd), %edx
movl $PTE_ATTR, %eax
10:
@@ -219,6 +220,8 @@ default_entry:
jb 10b
1:
movl %edi,pa(init_pg_tables_end)
+ shrl $12, %eax
+ movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -228,6 +231,7 @@ default_entry:
page_pde_offset = (__PAGE_OFFSET >> 20);
movl $pa(pg0), %edi
+ movl %edi, pa(init_pg_tables_start)
movl $pa(swapper_pg_dir), %edx
movl $PTE_ATTR, %eax
10:
@@ -249,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
cmpl %ebp,%eax
jb 10b
movl %edi,pa(init_pg_tables_end)
+ shrl $12, %eax
+ movl %eax, pa(max_pfn_mapped)
/* Do early initialization of the fixmap area */
movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b817974ef942..263b9d14753e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -18,6 +18,7 @@
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
+#include <asm/processor-flags.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
@@ -154,9 +155,7 @@ ENTRY(secondary_startup_64)
*/
/* Enable PAE mode and PGE */
- xorq %rax, %rax
- btsq $5, %rax
- btsq $7, %rax
+ movl $(X86_CR4_PAE | X86_CR4_PGE), %eax
movq %rax, %cr4
/* Setup early boot stage 4 level pagetables. */
@@ -184,14 +183,10 @@ ENTRY(secondary_startup_64)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
-#define CR0_PM 1 /* protected mode */
-#define CR0_MP (1<<1)
-#define CR0_ET (1<<4)
-#define CR0_NE (1<<5)
-#define CR0_WP (1<<16)
-#define CR0_AM (1<<18)
-#define CR0_PAGING (1<<31)
- movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
+#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+ X86_CR0_PG)
+ movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
@@ -327,11 +322,11 @@ early_idt_ripmsg:
ENTRY(name)
/* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT) \
- i = 0 ; \
- .rept (COUNT) ; \
- .quad (START) + (i << 21) + (PERM) ; \
- i = i + 1 ; \
+#define PMDS(START, PERM, COUNT) \
+ i = 0 ; \
+ .rept (COUNT) ; \
+ .quad (START) + (i << PMD_SHIFT) + (PERM) ; \
+ i = i + 1 ; \
.endr
/*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 9b5cfcdfc426..ea230ec69057 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -17,7 +17,7 @@
/* FSEC = 10^-15
NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000
+#define FSEC_PER_NSEC 1000000L
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
@@ -206,20 +206,19 @@ static void hpet_enable_legacy_int(void)
static void hpet_legacy_clockevent_register(void)
{
- uint64_t hpet_freq;
-
/* Start HPET legacy interrupts */
hpet_enable_legacy_int();
/*
- * The period is a femto seconds value. We need to calculate the
- * scaled math multiplication factor for nanosecond to hpet tick
- * conversion.
+ * The mult factor is defined as (include/linux/clockchips.h)
+ * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
+ * hpet_period is in units of femtoseconds (per cycle), so
+ * mult/2^shift = cyc/ns = 10^6/hpet_period
+ * mult = (10^6 * 2^shift)/hpet_period
+ * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
*/
- hpet_freq = 1000000000000000ULL;
- do_div(hpet_freq, hpet_period);
- hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
- NSEC_PER_SEC, hpet_clockevent.shift);
+ hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
+ hpet_period, hpet_clockevent.shift);
/* Calculate the min / max delta */
hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
&hpet_clockevent);
@@ -324,7 +323,7 @@ static struct clocksource clocksource_hpet = {
static int hpet_clocksource_register(void)
{
- u64 tmp, start, now;
+ u64 start, now;
cycle_t t1;
/* Start the counter */
@@ -351,21 +350,15 @@ static int hpet_clocksource_register(void)
return -ENODEV;
}
- /* Initialize and register HPET clocksource
- *
- * hpet period is in femto seconds per cycle
- * so we need to convert this to ns/cyc units
- * approximated by mult/2^shift
- *
- * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
- * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
- * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
- * (fsec/cyc << shift)/1000000 = mult
- * (hpet_period << shift)/FSEC_PER_NSEC = mult
+ /*
+ * The definition of mult is (include/linux/clocksource.h)
+ * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
+ * so we first need to convert hpet_period to ns/cyc units:
+ * mult/2^shift = ns/cyc = hpet_period/10^6
+ * mult = (hpet_period * 2^shift)/10^6
+ * mult = (hpet_period << shift)/FSEC_PER_NSEC
*/
- tmp = (u64)hpet_period << HPET_SHIFT;
- do_div(tmp, FSEC_PER_NSEC);
- clocksource_hpet.mult = (u32)tmp;
+ clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
clocksource_register(&clocksource_hpet);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 95e80e5033c3..eb9ddd8efb82 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -162,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
int ret;
if (!cpu_has_fxsr)
- return -EIO;
+ return -ENODEV;
ret = init_fpu(target);
if (ret)
@@ -179,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
int ret;
if (!cpu_has_fxsr)
- return -EIO;
+ return -ENODEV;
ret = init_fpu(target);
if (ret)
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259.c
index fe631967d625..dc92b49d9204 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,8 +1,10 @@
+#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
+#include <linux/timex.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/init.h>
@@ -10,10 +12,12 @@
#include <linux/sysdev.h>
#include <linux/bitops.h>
+#include <asm/acpi.h>
#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/timer.h>
+#include <asm/hw_irq.h>
#include <asm/pgtable.h>
#include <asm/delay.h>
#include <asm/desc.h>
@@ -32,7 +36,7 @@ static int i8259A_auto_eoi;
DEFINE_SPINLOCK(i8259A_lock);
static void mask_and_ack_8259A(unsigned int);
-static struct irq_chip i8259A_chip = {
+struct irq_chip i8259A_chip = {
.name = "XT-PIC",
.mask = disable_8259A_irq,
.disable = disable_8259A_irq,
@@ -125,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq)
int irqmask = 1<<irq;
if (irq < 8) {
- outb(0x0B,PIC_MASTER_CMD); /* ISR register */
+ outb(0x0B, PIC_MASTER_CMD); /* ISR register */
value = inb(PIC_MASTER_CMD) & irqmask;
- outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
+ outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */
return value;
}
- outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
+ outb(0x0B, PIC_SLAVE_CMD); /* ISR register */
value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
- outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
+ outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */
return value;
}
@@ -171,12 +175,14 @@ handle_real_irq:
if (irq & 8) {
inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
outb(cached_slave_mask, PIC_SLAVE_IMR);
- outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
- outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+ /* 'Specific EOI' to slave */
+ outb(0x60+(irq&7), PIC_SLAVE_CMD);
+ /* 'Specific EOI' to master-IRQ2 */
+ outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
} else {
inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
outb(cached_master_mask, PIC_MASTER_IMR);
- outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */
+ outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
}
spin_unlock_irqrestore(&i8259A_lock, flags);
return;
@@ -199,7 +205,8 @@ spurious_8259A_irq:
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+ printk(KERN_DEBUG
+ "spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
atomic_inc(&irq_err_count);
@@ -290,17 +297,28 @@ void init_8259A(int auto_eoi)
* outb_pic - this has to work on a wide range of PC hardware.
*/
outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
- outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */
+
+ /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
+ to 0x20-0x27 on i386 */
+ outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+
+ /* 8259A-1 (the master) has a slave on IR2 */
+ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
+
if (auto_eoi) /* master does Auto EOI */
outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
else /* master expects normal EOI */
outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
- outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
- outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */
- outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
+
+ /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
+ outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+ /* 8259A-2 is a slave on master's IR2 */
+ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
+ /* (slave's support for AEOI in flat mode is to be investigated) */
+ outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
+
if (auto_eoi)
/*
* In AEOI mode we just have to mask the interrupt
@@ -317,93 +335,3 @@ void init_8259A(int auto_eoi)
spin_unlock_irqrestore(&i8259A_lock, flags);
}
-
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
- extern void math_error(void __user *);
- outb(0,0xF0);
- if (ignore_fpu_irq || !boot_cpu_data.hard_math)
- return IRQ_NONE;
- math_error((void __user *)get_irq_regs()->ip);
- return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
- .handler = math_error_irq,
- .mask = CPU_MASK_NONE,
- .name = "fpu",
-};
-
-void __init init_ISA_irqs (void)
-{
- int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- init_bsp_APIC();
-#endif
- init_8259A(0);
-
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- for (i = 0; i < 16; i++) {
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- }
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
- int i;
-
- /* all the set up before the call gates are initialised */
- pre_intr_init_hook();
-
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (i >= NR_IRQS)
- break;
- /* SYSCALL_VECTOR was reserved in trap_init. */
- if (!test_bit(vector, used_vectors))
- set_intr_gate(vector, interrupt[i]);
- }
-
- /* setup after call gates are initialised (usually add in
- * the architecture specific gates)
- */
- intr_init_hook();
-
- /*
- * External FPU? Set up irq13 if so, for
- * original braindamaged IBM FERR coupling.
- */
- if (boot_cpu_data.hard_math && !cpu_has_fpu)
- setup_irq(FPU_IRQ, &fpu_irq);
-
- irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
deleted file mode 100644
index fa57a1568508..000000000000
--- a/arch/x86/kernel/i8259_64.c
+++ /dev/null
@@ -1,512 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/acpi.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define BI(x,y) \
- BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
- BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
- BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
- BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
- BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
- BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
- IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
- IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
- IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
- IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
- IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
- IRQLIST_16(0x2), IRQLIST_16(0x3),
- IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
- IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
- IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- * this file should become arch/i386/kernel/irq.c when the old irq.c
- * moves to arch independent land
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
- .name = "XT-PIC",
- .mask = disable_8259A_irq,
- .disable = disable_8259A_irq,
- .unmask = enable_8259A_irq,
- .mask_ack = mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-unsigned int cached_irq_mask = 0xffff;
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
- unsigned int mask = 1 << irq;
- unsigned long flags;
-
- spin_lock_irqsave(&i8259A_lock, flags);
- cached_irq_mask |= mask;
- if (irq & 8)
- outb(cached_slave_mask, PIC_SLAVE_IMR);
- else
- outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
- unsigned int mask = ~(1 << irq);
- unsigned long flags;
-
- spin_lock_irqsave(&i8259A_lock, flags);
- cached_irq_mask &= mask;
- if (irq & 8)
- outb(cached_slave_mask, PIC_SLAVE_IMR);
- else
- outb(cached_master_mask, PIC_MASTER_IMR);
- spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
- unsigned int mask = 1<<irq;
- unsigned long flags;
- int ret;
-
- spin_lock_irqsave(&i8259A_lock, flags);
- if (irq < 8)
- ret = inb(PIC_MASTER_CMD) & mask;
- else
- ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
- spin_unlock_irqrestore(&i8259A_lock, flags);
-
- return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
- disable_irq_nosync(irq);
- io_apic_irqs &= ~(1<<irq);
- set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
- "XT");
- enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
- int value;
- int irqmask = 1<<irq;
-
- if (irq < 8) {
- outb(0x0B,PIC_MASTER_CMD); /* ISR register */
- value = inb(PIC_MASTER_CMD) & irqmask;
- outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */
- return value;
- }
- outb(0x0B,PIC_SLAVE_CMD); /* ISR register */
- value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
- outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */
- return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
- unsigned int irqmask = 1 << irq;
- unsigned long flags;
-
- spin_lock_irqsave(&i8259A_lock, flags);
- /*
- * Lightweight spurious IRQ detection. We do not want
- * to overdo spurious IRQ handling - it's usually a sign
- * of hardware problems, so we only do the checks we can
- * do without slowing down good hardware unnecessarily.
- *
- * Note that IRQ7 and IRQ15 (the two spurious IRQs
- * usually resulting from the 8259A-1|2 PICs) occur
- * even if the IRQ is masked in the 8259A. Thus we
- * can check spurious 8259A IRQs without doing the
- * quite slow i8259A_irq_real() call for every IRQ.
- * This does not cover 100% of spurious interrupts,
- * but should be enough to warn the user that there
- * is something bad going on ...
- */
- if (cached_irq_mask & irqmask)
- goto spurious_8259A_irq;
- cached_irq_mask |= irqmask;
-
-handle_real_irq:
- if (irq & 8) {
- inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */
- outb(cached_slave_mask, PIC_SLAVE_IMR);
- /* 'Specific EOI' to slave */
- outb(0x60+(irq&7),PIC_SLAVE_CMD);
- /* 'Specific EOI' to master-IRQ2 */
- outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
- } else {
- inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */
- outb(cached_master_mask, PIC_MASTER_IMR);
- /* 'Specific EOI' to master */
- outb(0x60+irq,PIC_MASTER_CMD);
- }
- spin_unlock_irqrestore(&i8259A_lock, flags);
- return;
-
-spurious_8259A_irq:
- /*
- * this is the slow path - should happen rarely.
- */
- if (i8259A_irq_real(irq))
- /*
- * oops, the IRQ _is_ in service according to the
- * 8259A - not spurious, go handle it.
- */
- goto handle_real_irq;
-
- {
- static int spurious_irq_mask;
- /*
- * At this point we can be sure the IRQ is spurious,
- * lets ACK and report it. [once per IRQ]
- */
- if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG
- "spurious 8259A interrupt: IRQ%d.\n", irq);
- spurious_irq_mask |= irqmask;
- }
- atomic_inc(&irq_err_count);
- /*
- * Theoretically we do not have to handle this IRQ,
- * but in Linux this does not cause problems and is
- * simpler for us.
- */
- goto handle_real_irq;
- }
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
- outb(trigger[0], 0x4d0);
- outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
- /* IRQ 0,1,2,8,13 are marked as reserved */
- trigger[0] = inb(0x4d0) & 0xF8;
- trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
- init_8259A(i8259A_auto_eoi);
- restore_ELCR(irq_trigger);
- return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
- save_ELCR(irq_trigger);
- return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
- /* Put the i8259A into a quiescent state that
- * the kernel initialization code can get it
- * out of.
- */
- outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
- return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
- .name = "i8259",
- .suspend = i8259A_suspend,
- .resume = i8259A_resume,
- .shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
- .id = 0,
- .cls = &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
- int error = sysdev_class_register(&i8259_sysdev_class);
- if (!error)
- error = sysdev_register(&device_i8259A);
- return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void init_8259A(int auto_eoi)
-{
- unsigned long flags;
-
- i8259A_auto_eoi = auto_eoi;
-
- spin_lock_irqsave(&i8259A_lock, flags);
-
- outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
-
- /*
- * outb_pic - this has to work on a wide range of PC hardware.
- */
- outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
- outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
- /* 8259A-1 (the master) has a slave on IR2 */
- outb_pic(0x04, PIC_MASTER_IMR);
- if (auto_eoi) /* master does Auto EOI */
- outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
- else /* master expects normal EOI */
- outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
-
- outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */
- /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
- outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
- /* 8259A-2 is a slave on master's IR2 */
- outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
- /* (slave's support for AEOI in flat mode is to be investigated) */
- outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
-
- if (auto_eoi)
- /*
- * In AEOI mode we just have to mask the interrupt
- * when acking.
- */
- i8259A_chip.mask_ack = disable_8259A_irq;
- else
- i8259A_chip.mask_ack = mask_and_ack_8259A;
-
- udelay(100); /* wait for 8259A to initialize */
-
- outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
- outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
-
- spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-
-
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
- .handler = no_action,
- .mask = CPU_MASK_NONE,
- .name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... IRQ0_VECTOR - 1] = -1,
- [IRQ0_VECTOR] = 0,
- [IRQ1_VECTOR] = 1,
- [IRQ2_VECTOR] = 2,
- [IRQ3_VECTOR] = 3,
- [IRQ4_VECTOR] = 4,
- [IRQ5_VECTOR] = 5,
- [IRQ6_VECTOR] = 6,
- [IRQ7_VECTOR] = 7,
- [IRQ8_VECTOR] = 8,
- [IRQ9_VECTOR] = 9,
- [IRQ10_VECTOR] = 10,
- [IRQ11_VECTOR] = 11,
- [IRQ12_VECTOR] = 12,
- [IRQ13_VECTOR] = 13,
- [IRQ14_VECTOR] = 14,
- [IRQ15_VECTOR] = 15,
- [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-void __init init_ISA_irqs (void)
-{
- int i;
-
- init_bsp_APIC();
- init_8259A(0);
-
- for (i = 0; i < NR_IRQS; i++) {
- irq_desc[i].status = IRQ_DISABLED;
- irq_desc[i].action = NULL;
- irq_desc[i].depth = 1;
-
- if (i < 16) {
- /*
- * 16 old-style INTA-cycle interrupts:
- */
- set_irq_chip_and_handler_name(i, &i8259A_chip,
- handle_level_irq, "XT");
- } else {
- /*
- * 'high' PCI IRQs filled in on demand
- */
- irq_desc[i].chip = &no_irq_chip;
- }
- }
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
- int i;
-
- init_ISA_irqs();
- /*
- * Cover the whole vector space, no vector can escape
- * us. (some of these will be overridden and become
- * 'special' SMP interrupts)
- */
- for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
- if (vector != IA32_SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
- }
-
-#ifdef CONFIG_SMP
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPIs for invalidation */
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
- set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
- /* IPI for generic function call */
- set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
- /* Low priority IPI to cleanup after moving an irq */
- set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-#endif
- set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
- set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
- /* self generated IPI for local APIC timer */
- set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- if (!acpi_ioapic)
- setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 4dc8600d9d20..fedb3b113ace 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -58,7 +58,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
static DEFINE_SPINLOCK(ioapic_lock);
static DEFINE_SPINLOCK(vector_lock);
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/*
* Is the SiS APIC rmw bug present ?
@@ -72,15 +72,21 @@ int sis_apic_bug = -1;
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
static int disable_timer_pin_1 __initdata;
/*
@@ -110,7 +116,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -239,7 +245,7 @@ static void __init replace_pin_at_irq(unsigned int irq,
}
}
-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
{
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int pin, reg;
@@ -259,30 +265,32 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign
}
/* mask = 1 */
-static void __mask_IO_APIC_irq (unsigned int irq)
+static void __mask_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00010000, 0);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
}
/* mask = 0 */
-static void __unmask_IO_APIC_irq (unsigned int irq)
+static void __unmask_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0, 0x00010000);
+ __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
}
/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
+ IO_APIC_REDIR_LEVEL_TRIGGER);
}
/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
{
- __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+ __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
+ IO_APIC_REDIR_MASKED);
}
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
@@ -291,7 +299,7 @@ static void mask_IO_APIC_irq (unsigned int irq)
spin_unlock_irqrestore(&ioapic_lock, flags);
}
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
@@ -303,7 +311,7 @@ static void unmask_IO_APIC_irq (unsigned int irq)
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
-
+
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
@@ -315,7 +323,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
ioapic_mask_entry(apic, pin);
}
-static void clear_IO_APIC (void)
+static void clear_IO_APIC(void)
{
int apic, pin;
@@ -332,7 +340,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int apicid_value;
cpumask_t tmp;
-
+
cpus_and(tmp, cpumask, cpu_online_map);
if (cpus_empty(tmp))
tmp = TARGET_CPUS;
@@ -361,7 +369,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
# include <linux/kernel_stat.h> /* kstat */
# include <linux/slab.h> /* kmalloc() */
# include <linux/timer.h>
-
+
#define IRQBALANCE_CHECK_ARCH -999
#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
@@ -373,14 +381,14 @@ static int physical_balance __read_mostly;
static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
static struct irq_cpu_info {
- unsigned long * last_irq;
- unsigned long * irq_delta;
+ unsigned long *last_irq;
+ unsigned long *irq_delta;
unsigned long irq;
} irq_cpu_data[NR_CPUS];
#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq])
+#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
@@ -419,8 +427,8 @@ inside:
if (cpu == -1)
cpu = NR_CPUS-1;
}
- } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
- (search_idle && !IDLE_ENOUGH(cpu,now)));
+ } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
+ (search_idle && !IDLE_ENOUGH(cpu, now)));
return cpu;
}
@@ -430,15 +438,14 @@ static inline void balance_irq(int cpu, int irq)
unsigned long now = jiffies;
cpumask_t allowed_mask;
unsigned int new_cpu;
-
+
if (irqbalance_disabled)
- return;
+ return;
cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
new_cpu = move(cpu, allowed_mask, now, 1);
- if (cpu != new_cpu) {
+ if (cpu != new_cpu)
set_pending_irq(irq, cpumask_of_cpu(new_cpu));
- }
}
static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
@@ -450,14 +457,14 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
if (!irq_desc[j].action)
continue;
/* Is it a significant load ? */
- if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
useful_load_threshold)
continue;
balance_irq(i, j);
}
}
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
@@ -486,22 +493,22 @@ static void do_irq_balance(void)
/* Is this an active IRQ or balancing disabled ? */
if (!irq_desc[j].action || irq_balancing_disabled(j))
continue;
- if ( package_index == i )
- IRQ_DELTA(package_index,j) = 0;
+ if (package_index == i)
+ IRQ_DELTA(package_index, j) = 0;
/* Determine the total count per processor per IRQ */
value_now = (unsigned long) kstat_cpu(i).irqs[j];
/* Determine the activity per processor per IRQ */
- delta = value_now - LAST_CPU_IRQ(i,j);
+ delta = value_now - LAST_CPU_IRQ(i, j);
/* Update last_cpu_irq[][] for the next time */
- LAST_CPU_IRQ(i,j) = value_now;
+ LAST_CPU_IRQ(i, j) = value_now;
/* Ignore IRQs whose rate is less than the clock */
if (delta < useful_load_threshold)
continue;
/* update the load for the processor or package total */
- IRQ_DELTA(package_index,j) += delta;
+ IRQ_DELTA(package_index, j) += delta;
/* Keep track of the higher numbered sibling as well */
if (i != package_index)
@@ -527,7 +534,8 @@ static void do_irq_balance(void)
max_cpu_irq = ULONG_MAX;
tryanothercpu:
- /* Look for heaviest loaded processor.
+ /*
+ * Look for heaviest loaded processor.
* We may come back to get the next heaviest loaded processor.
* Skip processors with trivial loads.
*/
@@ -536,7 +544,7 @@ tryanothercpu:
for_each_online_cpu(i) {
if (i != CPU_TO_PACKAGEINDEX(i))
continue;
- if (max_cpu_irq <= CPU_IRQ(i))
+ if (max_cpu_irq <= CPU_IRQ(i))
continue;
if (tmp_cpu_irq < CPU_IRQ(i)) {
tmp_cpu_irq = CPU_IRQ(i);
@@ -545,8 +553,9 @@ tryanothercpu:
}
if (tmp_loaded == -1) {
- /* In the case of small number of heavy interrupt sources,
- * loading some of the cpus too much. We use Ingo's original
+ /*
+ * In the case of small number of heavy interrupt sources,
+ * loading some of the cpus too much. We use Ingo's original
* approach to rotate them around.
*/
if (!first_attempt && imbalance >= useful_load_threshold) {
@@ -555,13 +564,14 @@ tryanothercpu:
}
goto not_worth_the_effort;
}
-
+
first_attempt = 0; /* heaviest search */
max_cpu_irq = tmp_cpu_irq; /* load */
max_loaded = tmp_loaded; /* processor */
imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
- /* if imbalance is less than approx 10% of max load, then
+
+ /*
+ * if imbalance is less than approx 10% of max load, then
* observe diminishing returns action. - quit
*/
if (imbalance < (max_cpu_irq >> 3))
@@ -577,26 +587,25 @@ tryanotherirq:
/* Is this an active IRQ? */
if (!irq_desc[j].action)
continue;
- if (imbalance <= IRQ_DELTA(max_loaded,j))
+ if (imbalance <= IRQ_DELTA(max_loaded, j))
continue;
/* Try to find the IRQ that is closest to the imbalance
* without going over.
*/
- if (move_this_load < IRQ_DELTA(max_loaded,j)) {
- move_this_load = IRQ_DELTA(max_loaded,j);
+ if (move_this_load < IRQ_DELTA(max_loaded, j)) {
+ move_this_load = IRQ_DELTA(max_loaded, j);
selected_irq = j;
}
}
- if (selected_irq == -1) {
+ if (selected_irq == -1)
goto tryanothercpu;
- }
imbalance = move_this_load;
-
+
/* For physical_balance case, we accumulated both load
* values in the one of the siblings cpu_irq[],
* to use the same code for physical and logical processors
- * as much as possible.
+ * as much as possible.
*
* NOTE: the cpu_irq[] array holds the sum of the load for
* sibling A and sibling B in the slot for the lowest numbered
@@ -625,11 +634,11 @@ tryanotherirq:
/* mark for change destination */
set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
- /* Since we made a change, come back sooner to
+ /* Since we made a change, come back sooner to
* check for more variation.
*/
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
+ balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
goto tryanotherirq;
@@ -640,7 +649,7 @@ not_worth_the_effort:
* upward
*/
balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
- balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
+ balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
return;
}
@@ -679,13 +688,13 @@ static int __init balanced_irq_init(void)
cpumask_t tmp;
cpus_shift_right(tmp, cpu_online_map, 2);
- c = &boot_cpu_data;
+ c = &boot_cpu_data;
/* When not overwritten by the command line ask subarchitecture. */
if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
irqbalance_disabled = NO_BALANCE_IRQ;
if (irqbalance_disabled)
return 0;
-
+
/* disable irqbalance completely if there is only one processor online */
if (num_online_cpus() < 2) {
irqbalance_disabled = 1;
@@ -699,16 +708,14 @@ static int __init balanced_irq_init(void)
physical_balance = 1;
for_each_online_cpu(i) {
- irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
- irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+ irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
printk(KERN_ERR "balanced_irq_init: out of memory");
goto failed;
}
- memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
- memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
}
-
+
printk(KERN_INFO "Starting balanced_irq\n");
if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
return 0;
@@ -801,10 +808,10 @@ static int find_irq_entry(int apic, int pin, int type)
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
@@ -818,13 +825,13 @@ static int __init find_isa_irq_pin(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
}
return -1;
}
@@ -834,17 +841,17 @@ static int __init find_isa_irq_apic(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
- for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
@@ -864,28 +871,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
"slot:%d, pin:%d.\n", bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -900,7 +907,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
/*
- * This function currently is only a helper for the i386 smp boot process where
+ * This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
* so mask in all cases should simply be TARGET_CPUS
*/
@@ -952,7 +959,7 @@ static int EISA_ELCR(unsigned int irq)
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
@@ -969,118 +976,115 @@ static int EISA_ELCR(unsigned int irq)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3) {
+ case 0: /* conforms, ie. bus-type dependent polarity */
{
- case 0: /* conforms, ie. bus-type dependent polarity */
- {
- polarity = test_bit(bus, mp_bus_not_pci)?
- default_ISA_polarity(idx):
- default_PCI_polarity(idx);
- break;
- }
- case 1: /* high active */
- {
- polarity = 0;
- break;
- }
- case 2: /* reserved */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- case 3: /* low active */
- {
- polarity = 1;
- break;
- }
- default: /* invalid */
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
+ polarity = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_polarity(idx):
+ default_PCI_polarity(idx);
+ break;
+ }
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
}
return polarity;
}
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
+ case 0: /* conforms, ie. bus-type dependent */
{
- case 0: /* conforms, ie. bus-type dependent */
- {
- trigger = test_bit(bus, mp_bus_not_pci)?
- default_ISA_trigger(idx):
- default_PCI_trigger(idx);
+ trigger = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_trigger(idx):
+ default_PCI_trigger(idx);
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /* set before the switch */
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
-#endif
+ switch (mp_bus_id_to_type[bus]) {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ /* set before the switch */
break;
}
- case 1: /* edge */
+ case MP_BUS_EISA: /* EISA pin */
{
- trigger = 0;
+ trigger = default_EISA_trigger(idx);
break;
}
- case 2: /* reserved */
+ case MP_BUS_PCI: /* PCI pin */
{
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
+ /* set before the switch */
break;
}
- case 3: /* level */
+ case MP_BUS_MCA: /* MCA pin */
{
- trigger = 1;
+ trigger = default_MCA_trigger(idx);
break;
}
- default: /* invalid */
+ default:
{
printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 0;
+ trigger = 1;
break;
}
}
+#endif
+ break;
+ }
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
return trigger;
}
@@ -1097,16 +1101,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci))
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
else {
/*
* PCI IRQs are mapped in order
@@ -1148,8 +1152,8 @@ static inline int IO_APIC_irq_trigger(int irq)
for (apic = 0; apic < nr_ioapics; apic++) {
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
- idx = find_irq_entry(apic,pin,mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
return irq_trigger(idx);
}
}
@@ -1164,7 +1168,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }
static int __assign_irq_vector(int irq)
{
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
int vector, offset;
BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
@@ -1176,7 +1180,7 @@ static int __assign_irq_vector(int irq)
offset = current_offset;
next:
vector += 8;
- if (vector >= FIRST_SYSTEM_VECTOR) {
+ if (vector >= first_system_vector) {
offset = (offset + 1) % 8;
vector = FIRST_DEVICE_VECTOR + offset;
}
@@ -1237,25 +1241,25 @@ static void __init setup_IO_APIC_irqs(void)
/*
* add it to the IO-APIC irq-routing table:
*/
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest =
+ entry.dest.logical.logical_dest =
cpu_mask_to_apicid(TARGET_CPUS);
- idx = find_irq_entry(apic,pin,mp_INT);
+ idx = find_irq_entry(apic, pin, mp_INT);
if (idx == -1) {
if (first_notcon) {
apic_printk(APIC_VERBOSE, KERN_DEBUG
" IO-APIC (apicid-pin) %d-%d",
- mp_ioapics[apic].mpc_apicid,
+ mp_ioapics[apic].mp_apicid,
pin);
first_notcon = 0;
} else
apic_printk(APIC_VERBOSE, ", %d-%d",
- mp_ioapics[apic].mpc_apicid, pin);
+ mp_ioapics[apic].mp_apicid, pin);
continue;
}
@@ -1289,7 +1293,7 @@ static void __init setup_IO_APIC_irqs(void)
vector = assign_irq_vector(irq);
entry.vector = vector;
ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-
+
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
@@ -1302,25 +1306,21 @@ static void __init setup_IO_APIC_irqs(void)
}
/*
- * Set up the 8259A-master output pin:
+ * Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+ int vector)
{
struct IO_APIC_route_entry entry;
- memset(&entry,0,sizeof(entry));
-
- disable_8259A_irq(0);
-
- /* mask LVT0 */
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ memset(&entry, 0, sizeof(entry));
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
@@ -1329,17 +1329,14 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
/*
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
*/
- irq_desc[0].chip = &ioapic_chip;
- set_irq_handler(0, handle_edge_irq);
+ ioapic_register_intr(0, vector, IOAPIC_EDGE);
/*
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
-
- enable_8259A_irq(0);
}
void __init print_IO_APIC(void)
@@ -1354,10 +1351,10 @@ void __init print_IO_APIC(void)
if (apic_verbosity == APIC_QUIET)
return;
- printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -1376,7 +1373,7 @@ void __init print_IO_APIC(void)
reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1459,7 +1456,7 @@ void __init print_IO_APIC(void)
#if 0
-static void print_APIC_bitfield (int base)
+static void print_APIC_bitfield(int base)
{
unsigned int v;
int i, j;
@@ -1480,7 +1477,7 @@ static void print_APIC_bitfield (int base)
}
}
-void /*__init*/ print_local_APIC(void * dummy)
+void /*__init*/ print_local_APIC(void *dummy)
{
unsigned int v, ver, maxlvt;
@@ -1489,6 +1486,7 @@ void /*__init*/ print_local_APIC(void * dummy)
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
@@ -1563,7 +1561,7 @@ void /*__init*/ print_local_APIC(void * dummy)
printk("\n");
}
-void print_all_local_APICs (void)
+void print_all_local_APICs(void)
{
on_each_cpu(print_local_APIC, NULL, 1, 1);
}
@@ -1586,11 +1584,11 @@ void /*__init*/ print_PIC(void)
v = inb(0xa0) << 8 | inb(0x20);
printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
- outb(0x0b,0xa0);
- outb(0x0b,0x20);
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a,0xa0);
- outb(0x0a,0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
spin_unlock_irqrestore(&i8259A_lock, flags);
@@ -1626,7 +1624,7 @@ static void __init enable_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
- for(apic = 0; apic < nr_ioapics; apic++) {
+ for (apic = 0; apic < nr_ioapics; apic++) {
int pin;
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1716,7 +1714,6 @@ void disable_IO_APIC(void)
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-#ifndef CONFIG_X86_NUMAQ
static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
@@ -1726,6 +1723,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
unsigned char old_id;
unsigned long flags;
+#ifdef CONFIG_X86_NUMAQ
+ if (found_numaq)
+ return;
+#endif
+
/*
* Don't check I/O APIC IDs for xAPIC systems. They have
* no meaning without the serial APIC bus.
@@ -1748,15 +1750,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mpc_apicid;
- if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+ old_id = mp_ioapics[apic].mp_apicid;
+
+ if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
- mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+ mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
}
/*
@@ -1765,9 +1767,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(phys_id_present_map,
- mp_ioapics[apic].mpc_apicid)) {
+ mp_ioapics[apic].mp_apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
- apic, mp_ioapics[apic].mpc_apicid);
+ apic, mp_ioapics[apic].mp_apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
@@ -1776,13 +1778,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
- mp_ioapics[apic].mpc_apicid = i;
+ mp_ioapics[apic].mp_apicid = i;
} else {
physid_mask_t tmp;
- tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+ tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
@@ -1791,21 +1793,21 @@ static void __init setup_ioapic_ids_from_mpc(void)
* We need to adjust the IRQ routing table
* if the ID changed.
*/
- if (old_id != mp_ioapics[apic].mpc_apicid)
+ if (old_id != mp_ioapics[apic].mp_apicid)
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_dstapic == old_id)
- mp_irqs[i].mpc_dstapic
- = mp_ioapics[apic].mpc_apicid;
+ if (mp_irqs[i].mp_dstapic == old_id)
+ mp_irqs[i].mp_dstapic
+ = mp_ioapics[apic].mp_apicid;
/*
* Read the right value from the MPC table and
* write it into the ID register.
- */
+ */
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mpc_apicid);
+ mp_ioapics[apic].mp_apicid);
- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+ reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1816,15 +1818,12 @@ static void __init setup_ioapic_ids_from_mpc(void)
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+ if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
}
}
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
int no_timer_check __initdata;
@@ -2020,7 +2019,7 @@ static void ack_apic(unsigned int irq)
ack_APIC_irq();
}
-static void mask_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -2028,7 +2027,7 @@ static void mask_lapic_irq (unsigned int irq)
apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
}
-static void unmask_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
@@ -2037,7 +2036,7 @@ static void unmask_lapic_irq (unsigned int irq)
}
static struct irq_chip lapic_chip __read_mostly = {
- .name = "local-APIC-edge",
+ .name = "local-APIC",
.mask = mask_lapic_irq,
.unmask = unmask_lapic_irq,
.eoi = ack_apic,
@@ -2046,14 +2045,14 @@ static struct irq_chip lapic_chip __read_mostly = {
static void __init setup_nmi(void)
{
/*
- * Dirty trick to enable the NMI watchdog ...
+ * Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
*
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
- */
+ */
apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
@@ -2129,11 +2128,16 @@ static inline void __init unlock_ExtINT_logic(void)
static inline void __init check_timer(void)
{
int apic1, pin1, apic2, pin2;
+ int no_pin1 = 0;
int vector;
+ unsigned int ver;
unsigned long flags;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+
/*
* get/set the timer IRQ vector:
*/
@@ -2142,17 +2146,17 @@ static inline void __init check_timer(void)
set_intr_gate(vector, interrupt[0]);
/*
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC. Also
+ * timer interrupts need to be acknowledged manually in
+ * the 8259A for the i82489DX when using the NMI
+ * watchdog as that APIC treats NMIs as level-triggered.
+ * The AEOI mode will finish them in the 8259A
+ * automatically.
*/
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- timer_ack = 1;
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
+ timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2162,14 +2166,33 @@ static inline void __init check_timer(void)
printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
vector, apic1, pin1, apic2, pin2);
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
+
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
+ if (no_pin1) {
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, vector);
+ }
unmask_IO_APIC_irq(0);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
@@ -2178,43 +2201,46 @@ static inline void __init check_timer(void)
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
- printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
- "IO-APIC\n");
- }
+ if (!no_pin1)
+ printk(KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
- printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
- if (pin2 != -1) {
+ printk(KERN_INFO "...trying to set up timer (IRQ0) "
+ "through the 8259A ... ");
printk("\n..... (found pin %d) ...", pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
printk("works.\n");
- if (pin1 != -1)
- replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
- else
- add_pin_to_irq(0, apic2, pin2);
+ timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
setup_nmi();
+ enable_8259A_irq(0);
}
goto out;
}
/*
* Cleanup, just in case ...
*/
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ printk(" failed.\n");
}
- printk(" failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = 0;
+ nmi_watchdog = NMI_NONE;
}
+ timer_ack = 0;
printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
- disable_8259A_irq(0);
set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
"fasteoi");
apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
@@ -2224,12 +2250,12 @@ static inline void __init check_timer(void)
printk(" works.\n");
goto out;
}
+ disable_8259A_irq(0);
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
printk(" failed.\n");
printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
- timer_ack = 0;
init_8259A(0);
make_8259A_irq(0);
apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
@@ -2261,7 +2287,7 @@ void __init setup_IO_APIC(void)
int i;
/* Reserve all the system vectors. */
- for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
+ for (i = first_system_vector; i < NR_VECTORS; i++)
set_bit(i, used_vectors);
enable_IO_APIC();
@@ -2286,28 +2312,14 @@ void __init setup_IO_APIC(void)
print_IO_APIC();
}
-static int __init setup_disable_8254_timer(char *s)
-{
- timer_over_8254 = -1;
- return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
- timer_over_8254 = 2;
- return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
/*
* Called after all the initialization is done. If we didnt find any
* APIC bugs then we can allow the modify fast path
*/
-
+
static int __init io_apic_bug_finalize(void)
{
- if(sis_apic_bug == -1)
+ if (sis_apic_bug == -1)
sis_apic_bug = 0;
return 0;
}
@@ -2318,17 +2330,17 @@ struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
int i;
-
+
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
entry[i] = ioapic_read_entry(dev->id, i);
return 0;
@@ -2341,18 +2353,18 @@ static int ioapic_resume(struct sys_device *dev)
unsigned long flags;
union IO_APIC_reg_00 reg_00;
int i;
-
+
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
return 0;
@@ -2366,24 +2378,23 @@ static struct sysdev_class ioapic_sysdev_class = {
static int __init ioapic_init_sysfs(void)
{
- struct sys_device * dev;
+ struct sys_device *dev;
int i, size, error = 0;
error = sysdev_class_register(&ioapic_sysdev_class);
if (error)
return error;
- for (i = 0; i < nr_ioapics; i++ ) {
- size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ for (i = 0; i < nr_ioapics; i++) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
* sizeof(struct IO_APIC_route_entry);
- mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+ mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
if (!mp_ioapic_data[i]) {
printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
continue;
}
- memset(mp_ioapic_data[i], 0, size);
dev = &mp_ioapic_data[i]->dev;
- dev->id = i;
+ dev->id = i;
dev->cls = &ioapic_sysdev_class;
error = sysdev_register(dev);
if (error) {
@@ -2458,7 +2469,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
msg->address_lo =
MSI_ADDR_BASE_LO |
((INT_DEST_MODE == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
+MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
@@ -2469,7 +2480,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
+MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(vector);
}
@@ -2640,12 +2651,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
#endif /* CONFIG_HT_IRQ */
/* --------------------------------------------------------------------------
- ACPI-based IOAPIC Configuration
+ ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
{
union IO_APIC_reg_00 reg_00;
static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -2654,10 +2665,10 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
int i = 0;
/*
- * The P4 platform supports up to 256 APIC IDs on two separate APIC
- * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
* supports up to 16 on one shared APIC bus.
- *
+ *
* TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
* advantage of new APIC bus architecture.
*/
@@ -2676,7 +2687,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
}
/*
- * Every APIC in a system must have a unique ID or we get lots of nice
+ * Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2693,7 +2704,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
"trying %d\n", ioapic, apic_id, i);
apic_id = i;
- }
+ }
tmp = apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
@@ -2720,7 +2731,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
}
-int __init io_apic_get_version (int ioapic)
+int __init io_apic_get_version(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2733,7 +2744,7 @@ int __init io_apic_get_version (int ioapic)
}
-int __init io_apic_get_redir_entries (int ioapic)
+int __init io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -2746,7 +2757,7 @@ int __init io_apic_get_redir_entries (int ioapic)
}
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
{
struct IO_APIC_route_entry entry;
@@ -2762,7 +2773,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
* corresponding device driver registers for this IRQ.
*/
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
@@ -2781,7 +2792,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
- mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+ mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
edge_level, active_high_low);
ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2802,8 +2813,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..2eba4f4c14ba 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -61,7 +61,7 @@ struct irq_cfg {
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
[0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
[1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
[2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
@@ -82,6 +82,10 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
static int assign_irq_vector(int irq, cpumask_t mask);
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
#define __apicdebuginit __init
int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -90,7 +94,7 @@ static int no_timer_check;
static int disable_timer_pin_1 __initdata;
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
/* Where if anywhere is the i8259 connect in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@ -104,15 +108,17 @@ DEFINE_SPINLOCK(vector_lock);
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
/*
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
@@ -140,7 +146,7 @@ struct io_apic {
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
- + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+ + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -183,7 +189,7 @@ static bool io_apic_level_ack_pending(unsigned int irq)
break;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if ((reg >> 14) & 1) {
+ if (reg & IO_APIC_REDIR_REMOTE_IRR) {
spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
@@ -298,7 +304,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
break;
io_apic_write(apic, 0x11 + pin*2, dest);
reg = io_apic_read(apic, 0x10 + pin*2);
- reg &= ~0x000000ff;
+ reg &= ~IO_APIC_REDIR_VECTOR_MASK;
reg |= vector;
io_apic_modify(apic, reg);
if (!entry->next)
@@ -360,16 +366,37 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
entry->pin = pin;
}
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+ int oldapic, int oldpin,
+ int newapic, int newpin)
+{
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (1) {
+ if (entry->apic == oldapic && entry->pin == oldpin) {
+ entry->apic = newapic;
+ entry->pin = newpin;
+ }
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+}
+
#define DO_ACTION(name,R,ACTION, FINAL) \
\
static void name##_IO_APIC_irq (unsigned int irq) \
__DO_ACTION(R, ACTION, FINAL)
-DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
- /* mask = 1 */
-DO_ACTION( __unmask, 0, &= 0xfffeffff, )
- /* mask = 0 */
+/* mask = 1 */
+DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+
+/* mask = 0 */
+DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, )
static void mask_IO_APIC_irq (unsigned int irq)
{
@@ -430,20 +457,6 @@ static int __init disable_timer_pin_setup(char *arg)
}
__setup("disable_timer_pin_1", disable_timer_pin_setup);
-static int __init setup_disable_8254_timer(char *s)
-{
- timer_over_8254 = -1;
- return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
- timer_over_8254 = 2;
- return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
/*
* Find the IRQ entry number of a certain pin.
@@ -453,10 +466,10 @@ static int find_irq_entry(int apic, int pin, int type)
int i;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == type &&
- (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
- mp_irqs[i].mpc_dstirq == pin)
+ if (mp_irqs[i].mp_irqtype == type &&
+ (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
@@ -470,13 +483,13 @@ static int __init find_isa_irq_pin(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
- return mp_irqs[i].mpc_dstirq;
+ return mp_irqs[i].mp_dstirq;
}
return -1;
}
@@ -486,17 +499,17 @@ static int __init find_isa_irq_apic(int irq, int type)
int i;
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
- (mp_irqs[i].mpc_irqtype == type) &&
- (mp_irqs[i].mpc_srcbusirq == irq))
+ (mp_irqs[i].mp_irqtype == type) &&
+ (mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for(apic = 0; apic < nr_ioapics; apic++) {
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
@@ -516,28 +529,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
bus, slot, pin);
- if (mp_bus_id_to_pci_bus[bus] == -1) {
+ if (test_bit(bus, mp_bus_not_pci)) {
apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
- int lbus = mp_irqs[i].mpc_srcbus;
+ int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
- if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
- mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+ mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].mpc_irqtype &&
+ !mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
- (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+ (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
- if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
@@ -565,13 +578,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
static int MPBIOS_polarity(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
- switch (mp_irqs[idx].mpc_irqflag & 3)
+ switch (mp_irqs[idx].mp_irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
if (test_bit(bus, mp_bus_not_pci))
@@ -607,13 +620,13 @@ static int MPBIOS_polarity(int idx)
static int MPBIOS_trigger(int idx)
{
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
- switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
if (test_bit(bus, mp_bus_not_pci))
@@ -660,16 +673,16 @@ static inline int irq_trigger(int idx)
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
- int bus = mp_irqs[idx].mpc_srcbus;
+ int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
- if (mp_irqs[idx].mpc_dstirq != pin)
+ if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci)) {
- irq = mp_irqs[idx].mpc_srcbusirq;
+ irq = mp_irqs[idx].mp_srcbusirq;
} else {
/*
* PCI IRQs are mapped in order
@@ -730,7 +743,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
offset = current_offset;
next:
vector += 8;
- if (vector >= FIRST_SYSTEM_VECTOR) {
+ if (vector >= first_system_vector) {
/* If we run out of vectors on large boxen, must share them. */
offset = (offset + 1) % 8;
vector = FIRST_DEVICE_VECTOR + offset;
@@ -846,7 +859,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
"IRQ %d Mode:%i Active:%i)\n",
- apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+ apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
irq, trigger, polarity);
/*
@@ -887,10 +900,10 @@ static void __init setup_IO_APIC_irqs(void)
idx = find_irq_entry(apic,pin,mp_INT);
if (idx == -1) {
if (first_notcon) {
- apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
first_notcon = 0;
} else
- apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
continue;
}
if (!first_notcon) {
@@ -911,26 +924,21 @@ static void __init setup_IO_APIC_irqs(void)
}
/*
- * Set up the 8259A-master output pin as broadcast to all
- * CPUs.
+ * Set up the timer pin, possibly with the 8259A-master behind.
*/
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+ int vector)
{
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
- disable_8259A_irq(0);
-
- /* mask LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
- entry.mask = 0; /* unmask IRQ now */
+ entry.mask = 1; /* mask IRQ now */
entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
@@ -939,7 +947,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
/*
* The timer IRQ doesn't have to know that behind the
- * scene we have a 8259A-master in AEOI mode ...
+ * scene we may have a 8259A-master in AEOI mode ...
*/
set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
@@ -947,8 +955,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
-
- enable_8259A_irq(0);
}
void __apicdebuginit print_IO_APIC(void)
@@ -965,7 +971,7 @@ void __apicdebuginit print_IO_APIC(void)
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
- mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+ mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
@@ -983,7 +989,7 @@ void __apicdebuginit print_IO_APIC(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
printk("\n");
- printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
@@ -1077,6 +1083,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
@@ -1659,6 +1666,7 @@ static inline void __init check_timer(void)
struct irq_cfg *cfg = irq_cfg + 0;
int apic1, pin1, apic2, pin2;
unsigned long flags;
+ int no_pin1 = 0;
local_irq_save(flags);
@@ -1669,16 +1677,11 @@ static inline void __init check_timer(void)
assign_irq_vector(0, TARGET_CPUS);
/*
- * Subtle, code in do_timer_interrupt() expects an AEOI
- * mode for the 8259A whenever interrupts are routed
- * through I/O APICs. Also IRQ0 has to be enabled in
- * the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * As IRQ0 is to be enabled in the 8259A, the virtual
+ * wire has to be disabled in the local APIC.
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- if (timer_over_8254 > 0)
- enable_8259A_irq(0);
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1688,15 +1691,39 @@ static inline void __init check_timer(void)
apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
cfg->vector, apic1, pin1, apic2, pin2);
+ /*
+ * Some BIOS writers are clueless and report the ExtINTA
+ * I/O APIC input from the cascaded 8259A as the timer
+ * interrupt input. So just in case, if only one pin
+ * was found above, try it both directly and through the
+ * 8259A.
+ */
+ if (pin1 == -1) {
+ pin1 = pin2;
+ apic1 = apic2;
+ no_pin1 = 1;
+ } else if (pin2 == -1) {
+ pin2 = pin1;
+ apic2 = apic1;
+ }
+
+ replace_pin_at_irq(0, 0, 0, apic1, pin1);
+ apic1 = 0;
+ pin1 = 0;
+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
+ if (no_pin1) {
+ add_pin_to_irq(0, apic1, pin1);
+ setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+ }
unmask_IO_APIC_irq(0);
if (!no_timer_check && timer_irq_works()) {
nmi_watchdog_default();
if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
@@ -1705,42 +1732,48 @@ static inline void __init check_timer(void)
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
- apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
- "connected to IO-APIC\n");
- }
+ if (!no_pin1)
+ apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+ "8254 timer not connected to IO-APIC\n");
- apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
- "through the 8259A ... ");
- if (pin2 != -1) {
+ apic_printk(APIC_VERBOSE,KERN_INFO
+ "...trying to set up timer (IRQ0) "
+ "through the 8259A ... ");
apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
apic2, pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
- setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+ replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+ setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+ unmask_IO_APIC_irq(0);
+ enable_8259A_irq(0);
if (timer_irq_works()) {
apic_printk(APIC_VERBOSE," works.\n");
+ timer_through_8259 = 1;
nmi_watchdog_default();
if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
setup_nmi();
+ enable_8259A_irq(0);
}
goto out;
}
/*
* Cleanup, just in case ...
*/
+ disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
+ apic_printk(APIC_VERBOSE," failed.\n");
}
- apic_printk(APIC_VERBOSE," failed.\n");
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = 0;
+ nmi_watchdog = NMI_NONE;
}
apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
- disable_8259A_irq(0);
irq_desc[0].chip = &lapic_irq_type;
apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -1749,6 +1782,7 @@ static inline void __init check_timer(void)
apic_printk(APIC_VERBOSE," works.\n");
goto out;
}
+ disable_8259A_irq(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_VERBOSE," failed.\n");
@@ -1841,8 +1875,8 @@ static int ioapic_resume(struct sys_device *dev)
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
- if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
- reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2242,8 +2276,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
return -1;
for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_irqtype == mp_INT &&
- mp_irqs[i].mpc_srcbusirq == bus_irq)
+ if (mp_irqs[i].mp_irqtype == mp_INT &&
+ mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
@@ -2336,7 +2370,7 @@ void __init ioapic_init_mappings(void)
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ ioapic_phys = mp_ioapics[i].mp_apicaddr;
} else {
ioapic_phys = (unsigned long)
alloc_bootmem_pages(PAGE_SIZE);
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index c0df7b89ca23..9d98cda39ad9 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -8,7 +8,6 @@
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
-#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/module.h>
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 147352df28b9..47a6f6f12478 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
#endif
}
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+/* Debugging check for stack overflow: is there less than 1KB free? */
+static int check_stack_overflow(void)
+{
+ long sp;
+
+ __asm__ __volatile__("andl %%esp,%0" :
+ "=r" (sp) : "0" (THREAD_SIZE - 1));
+
+ return sp < (sizeof(struct thread_info) + STACK_WARN);
+}
+
+static void print_stack_overflow(void)
+{
+ printk(KERN_WARNING "low stack detected by irq handler\n");
+ dump_stack();
+}
+
+#else
+static inline int check_stack_overflow(void) { return 0; }
+static inline void print_stack_overflow(void) { }
+#endif
+
#ifdef CONFIG_4KSTACKS
/*
* per-CPU IRQ handling contexts (thread information and stack)
@@ -59,48 +82,29 @@ union irq_ctx {
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
-#endif
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-unsigned int do_IRQ(struct pt_regs *regs)
-{
- struct pt_regs *old_regs;
- /* high bit used in ret_from_ code */
- int irq = ~regs->orig_ax;
- struct irq_desc *desc = irq_desc + irq;
-#ifdef CONFIG_4KSTACKS
- union irq_ctx *curctx, *irqctx;
- u32 *isp;
-#endif
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__section__(".bss.page_aligned")));
- if (unlikely((unsigned)irq >= NR_IRQS)) {
- printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
- __func__, irq);
- BUG();
- }
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+ __attribute__((__section__(".bss.page_aligned")));
- old_regs = set_irq_regs(regs);
- irq_enter();
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
- /* Debugging check for stack overflow: is there less than 1KB free? */
- {
- long sp;
-
- __asm__ __volatile__("andl %%esp,%0" :
- "=r" (sp) : "0" (THREAD_SIZE - 1));
- if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
- printk("do_IRQ: stack overflow: %ld\n",
- sp - sizeof(struct thread_info));
- dump_stack();
- }
- }
-#endif
+static void call_on_stack(void *func, void *stack)
+{
+ asm volatile("xchgl %%ebx,%%esp \n"
+ "call *%%edi \n"
+ "movl %%ebx,%%esp \n"
+ : "=b" (stack)
+ : "0" (stack),
+ "D"(func)
+ : "memory", "cc", "edx", "ecx", "eax");
+}
-#ifdef CONFIG_4KSTACKS
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
+{
+ union irq_ctx *curctx, *irqctx;
+ u32 *isp, arg1, arg2;
curctx = (union irq_ctx *) current_thread_info();
irqctx = hardirq_ctx[smp_processor_id()];
@@ -111,52 +115,39 @@ unsigned int do_IRQ(struct pt_regs *regs)
* handler) we can't do that and just have to keep using the
* current stack (which is the irq stack already after all)
*/
- if (curctx != irqctx) {
- int arg1, arg2, bx;
-
- /* build the stack frame on the IRQ stack */
- isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
- irqctx->tinfo.task = curctx->tinfo.task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
+ if (unlikely(curctx == irqctx))
+ return 0;
- /*
- * Copy the softirq bits in preempt_count so that the
- * softirq checks work in the hardirq context.
- */
- irqctx->tinfo.preempt_count =
- (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
- (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
-
- asm volatile(
- " xchgl %%ebx,%%esp \n"
- " call *%%edi \n"
- " movl %%ebx,%%esp \n"
- : "=a" (arg1), "=d" (arg2), "=b" (bx)
- : "0" (irq), "1" (desc), "2" (isp),
- "D" (desc->handle_irq)
- : "memory", "cc", "ecx"
- );
- } else
-#endif
- desc->handle_irq(irq, desc);
+ /* build the stack frame on the IRQ stack */
+ isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
+ irqctx->tinfo.task = curctx->tinfo.task;
+ irqctx->tinfo.previous_esp = current_stack_pointer;
- irq_exit();
- set_irq_regs(old_regs);
+ /*
+ * Copy the softirq bits in preempt_count so that the
+ * softirq checks work in the hardirq context.
+ */
+ irqctx->tinfo.preempt_count =
+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+ if (unlikely(overflow))
+ call_on_stack(print_stack_overflow, isp);
+
+ asm volatile("xchgl %%ebx,%%esp \n"
+ "call *%%edi \n"
+ "movl %%ebx,%%esp \n"
+ : "=a" (arg1), "=d" (arg2), "=b" (isp)
+ : "0" (irq), "1" (desc), "2" (isp),
+ "D" (desc->handle_irq)
+ : "memory", "cc", "ecx");
return 1;
}
-#ifdef CONFIG_4KSTACKS
-
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__section__(".bss.page_aligned")));
-
/*
* allocate per-cpu stacks for hardirq and for softirq processing
*/
-void irq_ctx_init(int cpu)
+void __cpuinit irq_ctx_init(int cpu)
{
union irq_ctx *irqctx;
@@ -164,25 +155,25 @@ void irq_ctx_init(int cpu)
return;
irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+ irqctx->tinfo.task = NULL;
+ irqctx->tinfo.exec_domain = NULL;
+ irqctx->tinfo.cpu = cpu;
+ irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
hardirq_ctx[cpu] = irqctx;
irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = 0;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+ irqctx->tinfo.task = NULL;
+ irqctx->tinfo.exec_domain = NULL;
+ irqctx->tinfo.cpu = cpu;
+ irqctx->tinfo.preempt_count = 0;
+ irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
softirq_ctx[cpu] = irqctx;
- printk("CPU %u irqstacks, hard=%p soft=%p\n",
- cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+ printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
+ cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
}
void irq_ctx_exit(int cpu)
@@ -211,25 +202,56 @@ asmlinkage void do_softirq(void)
/* build the stack frame on the softirq stack */
isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
- asm volatile(
- " xchgl %%ebx,%%esp \n"
- " call __do_softirq \n"
- " movl %%ebx,%%esp \n"
- : "=b"(isp)
- : "0"(isp)
- : "memory", "cc", "edx", "ecx", "eax"
- );
+ call_on_stack(__do_softirq, isp);
/*
* Shouldnt happen, we returned above if in_interrupt():
- */
+ */
WARN_ON_ONCE(softirq_count());
}
local_irq_restore(flags);
}
+
+#else
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
#endif
/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int do_IRQ(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+ /* high bit used in ret_from_ code */
+ int overflow, irq = ~regs->orig_ax;
+ struct irq_desc *desc = irq_desc + irq;
+
+ if (unlikely((unsigned)irq >= NR_IRQS)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+ __func__, irq);
+ BUG();
+ }
+
+ old_regs = set_irq_regs(regs);
+ irq_enter();
+
+ overflow = check_stack_overflow();
+
+ if (!execute_on_irq_stack(overflow, desc, irq)) {
+ if (unlikely(overflow))
+ print_stack_overflow();
+ desc->handle_irq(irq, desc);
+ }
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return 1;
+}
+
+/*
* Interrupt statistics:
*/
@@ -313,16 +335,20 @@ skip:
per_cpu(irq_stat,j).irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
+#ifdef CONFIG_X86_MCE
seq_printf(p, "TRM: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).irq_thermal_count);
seq_printf(p, " Thermal event interrupts\n");
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "SPU: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ",
per_cpu(irq_stat,j).irq_spurious_count);
seq_printf(p, " Spurious interrupts\n");
+#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
@@ -331,6 +357,40 @@ skip:
return 0;
}
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+ u64 sum = nmi_count(cpu);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
+#endif
+#ifdef CONFIG_SMP
+ sum += per_cpu(irq_stat, cpu).irq_resched_count;
+ sum += per_cpu(irq_stat, cpu).irq_call_count;
+ sum += per_cpu(irq_stat, cpu).irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += per_cpu(irq_stat, cpu).irq_thermal_count;
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+ sum += per_cpu(irq_stat, cpu).irq_spurious_count;
+#endif
+ return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+ u64 sum = atomic_read(&irq_err_count);
+
+#ifdef CONFIG_X86_IO_APIC
+ sum += atomic_read(&irq_mis_count);
+#endif
+ return sum;
+}
+
#ifdef CONFIG_HOTPLUG_CPU
#include <mach_apic.h>
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3aac15466a91..1f78b238d8d2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -135,6 +135,7 @@ skip:
seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
seq_printf(p, " TLB shootdowns\n");
#endif
+#ifdef CONFIG_X86_MCE
seq_printf(p, "TRM: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
@@ -143,6 +144,7 @@ skip:
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
seq_printf(p, " Threshold APIC interrupts\n");
+#endif
seq_printf(p, "SPU: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
@@ -153,6 +155,32 @@ skip:
}
/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+ u64 sum = cpu_pda(cpu)->__nmi_count;
+
+ sum += cpu_pda(cpu)->apic_timer_irqs;
+#ifdef CONFIG_SMP
+ sum += cpu_pda(cpu)->irq_resched_count;
+ sum += cpu_pda(cpu)->irq_call_count;
+ sum += cpu_pda(cpu)->irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+ sum += cpu_pda(cpu)->irq_thermal_count;
+ sum += cpu_pda(cpu)->irq_threshold_count;
+#endif
+ sum += cpu_pda(cpu)->irq_spurious_count;
+ return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+ return atomic_read(&irq_err_count);
+}
+
+/*
* do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
new file mode 100644
index 000000000000..d66914287ee1
--- /dev/null
+++ b/arch/x86/kernel/irqinit_32.c
@@ -0,0 +1,114 @@
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/i8259.h>
+
+
+
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it is also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+ extern void math_error(void __user *);
+ outb(0,0xF0);
+ if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+ return IRQ_NONE;
+ math_error((void __user *)get_irq_regs()->ip);
+ return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = {
+ .handler = math_error_irq,
+ .mask = CPU_MASK_NONE,
+ .name = "fpu",
+};
+
+void __init init_ISA_irqs (void)
+{
+ int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ init_bsp_APIC();
+#endif
+ init_8259A(0);
+
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ for (i = 0; i < 16; i++) {
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
+ handle_level_irq, "XT");
+ }
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ /* all the set up before the call gates are initialised */
+ pre_intr_init_hook();
+
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (i >= NR_IRQS)
+ break;
+ /* SYSCALL_VECTOR was reserved in trap_init. */
+ if (!test_bit(vector, used_vectors))
+ set_intr_gate(vector, interrupt[i]);
+ }
+
+ /* setup after call gates are initialised (usually add in
+ * the architecture specific gates)
+ */
+ intr_init_hook();
+
+ /*
+ * External FPU? Set up irq13 if so, for
+ * original braindamaged IBM FERR coupling.
+ */
+ if (boot_cpu_data.hard_math && !cpu_has_fpu)
+ setup_irq(FPU_IRQ, &fpu_irq);
+
+ irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
new file mode 100644
index 000000000000..31f49e8f46a7
--- /dev/null
+++ b/arch/x86/kernel/irqinit_64.c
@@ -0,0 +1,217 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/acpi.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+
+/*
+ * Common place to define all x86 IRQ vectors
+ *
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ */
+
+#define IRQ_NAME2(nr) nr##_interrupt(void)
+#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
+
+/*
+ * SMP has a few special interrupts for IPI messages
+ */
+
+#define BUILD_IRQ(nr) \
+ asmlinkage void IRQ_NAME(nr); \
+ asm("\n.p2align\n" \
+ "IRQ" #nr "_interrupt:\n\t" \
+ "push $~(" #nr ") ; " \
+ "jmp common_interrupt");
+
+#define BI(x,y) \
+ BUILD_IRQ(x##y)
+
+#define BUILD_16_IRQS(x) \
+ BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+ BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+ BI(x,c) BI(x,d) BI(x,e) BI(x,f)
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+ BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
+BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
+BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
+BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
+
+#undef BUILD_16_IRQS
+#undef BI
+
+
+#define IRQ(x,y) \
+ IRQ##x##y##_interrupt
+
+#define IRQLIST_16(x) \
+ IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+ IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
+
+/* for the irq vectors */
+static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
+ IRQLIST_16(0x2), IRQLIST_16(0x3),
+ IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
+ IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
+ IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
+};
+
+#undef IRQ
+#undef IRQLIST_16
+
+
+
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+
+static struct irqaction irq2 = {
+ .handler = no_action,
+ .mask = CPU_MASK_NONE,
+ .name = "cascade",
+};
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+ [0 ... IRQ0_VECTOR - 1] = -1,
+ [IRQ0_VECTOR] = 0,
+ [IRQ1_VECTOR] = 1,
+ [IRQ2_VECTOR] = 2,
+ [IRQ3_VECTOR] = 3,
+ [IRQ4_VECTOR] = 4,
+ [IRQ5_VECTOR] = 5,
+ [IRQ6_VECTOR] = 6,
+ [IRQ7_VECTOR] = 7,
+ [IRQ8_VECTOR] = 8,
+ [IRQ9_VECTOR] = 9,
+ [IRQ10_VECTOR] = 10,
+ [IRQ11_VECTOR] = 11,
+ [IRQ12_VECTOR] = 12,
+ [IRQ13_VECTOR] = 13,
+ [IRQ14_VECTOR] = 14,
+ [IRQ15_VECTOR] = 15,
+ [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
+static void __init init_ISA_irqs (void)
+{
+ int i;
+
+ init_bsp_APIC();
+ init_8259A(0);
+
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc[i].status = IRQ_DISABLED;
+ irq_desc[i].action = NULL;
+ irq_desc[i].depth = 1;
+
+ if (i < 16) {
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ set_irq_chip_and_handler_name(i, &i8259A_chip,
+ handle_level_irq, "XT");
+ } else {
+ /*
+ * 'high' PCI IRQs filled in on demand
+ */
+ irq_desc[i].chip = &no_irq_chip;
+ }
+ }
+}
+
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+ int i;
+
+ init_ISA_irqs();
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (vector != IA32_SYSCALL_VECTOR)
+ set_intr_gate(vector, interrupt[i]);
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+ * IPI, driven by wakeup.
+ */
+ alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+ /* IPIs for invalidation */
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+ alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+ /* IPI for generic function call */
+ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+ /* Low priority IPI to cleanup after moving an irq */
+ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
+ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+ alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+
+ /* self generated IPI for local APIC timer */
+ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+ /* IPI vectors for APIC spurious and error interrupts */
+ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+ if (!acpi_ioapic)
+ setup_irq(2, &irq2);
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0224c3637c73..21f2bae98c15 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,9 +20,9 @@
#include <asm/mmu_context.h>
#ifdef CONFIG_SMP
-static void flush_ldt(void *null)
+static void flush_ldt(void *current_mm)
{
- if (current->active_mm)
+ if (current->active_mm == current_mm)
load_LDT(&current->active_mm->context);
}
#endif
@@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
load_LDT(pc);
mask = cpumask_of_cpu(smp_processor_id());
if (!cpus_equal(current->mm->cpu_vm_mask, mask))
- smp_call_function(flush_ldt, NULL, 1, 1);
+ smp_call_function(flush_ldt, current->mm, 1, 1);
preempt_enable();
#else
load_LDT(pc);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d0b234c9fc31..f4960171bc66 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -39,7 +39,7 @@ static void set_idt(void *newidt, __u16 limit)
curidt.address = (unsigned long)newidt;
load_idt(&curidt);
-};
+}
static void set_gdt(void *newgdt, __u16 limit)
@@ -51,7 +51,7 @@ static void set_gdt(void *newgdt, __u16 limit)
curgdt.address = (unsigned long)newgdt;
load_gdt(&curgdt);
-};
+}
static void load_segments(void)
{
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 69729e38b78a..9758fea87c5b 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -5,13 +5,14 @@
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
+ * belonging to IA-32 family - PentiumPro, Pentium II,
* Pentium III, Xeon, Pentium 4, etc.
*
- * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual,
- * Order Number 245472 or free download from:
- *
- * http://developer.intel.com/design/pentium4/manuals/245472.htm
+ * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ * Software Developer's Manual
+ * Order Number 253668 or free download from:
+ *
+ * http://developer.intel.com/design/pentium4/manuals/253668.htm
*
* For more information, go to http://www.urbanmyth.org/microcode
*
@@ -58,12 +59,12 @@
* nature of implementation.
* 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
* Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
* Jun Nakajima <jun.nakajima@intel.com>
* Support for the microcode updates in the new format.
* 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
* Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
+ * because we no longer hold a copy of applied microcode
* in kernel memory.
* 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
* Fix sigmatch() macro to handle old CPUs with pf == 0.
@@ -320,11 +321,11 @@ static void apply_microcode(int cpu)
return;
/* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
+ spin_lock_irqsave(&microcode_update_lock, flags);
/* write microcode via MSR 0x79 */
wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) uci->mc->bits,
+ (unsigned long) uci->mc->bits,
(unsigned long) uci->mc->bits >> 16 >> 16);
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
@@ -341,7 +342,7 @@ static void apply_microcode(int cpu)
return;
}
printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %08x \n",
+ "0x%x to 0x%x, date = %08x \n",
cpu_num, uci->rev, val[1], uci->mc->hdr.date);
uci->rev = val[1];
}
@@ -534,7 +535,7 @@ static int cpu_request_microcode(int cpu)
c->x86, c->x86_model, c->x86_mask);
error = request_firmware(&firmware, name, &microcode_pdev->dev);
if (error) {
- pr_debug("microcode: ucode data file %s load failed\n", name);
+ pr_debug("microcode: data file %s load failed\n", name);
return error;
}
buf = firmware->data;
@@ -805,6 +806,9 @@ static int __init microcode_init (void)
{
int error;
+ printk(KERN_INFO
+ "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+
error = microcode_dev_init();
if (error)
return error;
@@ -825,9 +829,6 @@ static int __init microcode_init (void)
}
register_hotcpu_notifier(&mc_cpu_notifier);
-
- printk(KERN_INFO
- "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
return 0;
}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index edc5fbfe85c0..fdfdc550b366 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -12,6 +12,7 @@
#include <asm/io.h>
#include <asm/msr.h>
#include <asm/acpi.h>
+#include <asm/mmconfig.h>
#include "../pci/pci.h"
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 4901ae3f742c..8b6b1e05c306 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -25,6 +25,8 @@
#include <asm/proto.h>
#include <asm/acpi.h>
#include <asm/bios_ebda.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
#include <mach_apic.h>
#ifdef CONFIG_X86_32
@@ -32,28 +34,6 @@
#include <mach_mpparse.h>
#endif
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-
-static int mp_current_pci_id;
-
-int pic_mode;
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
/*
* Checksum an MP configuration block.
*/
@@ -69,15 +49,73 @@ static int __init mpf_checksum(unsigned char *mp, int len)
}
#ifdef CONFIG_X86_NUMAQ
+int found_numaq;
/*
* Have to match translation table entries to main table entries by counter
* hence the mpc_record variable .... can't see a less disgusting way of
* doing this ....
*/
+struct mpc_config_translation {
+ unsigned char mpc_type;
+ unsigned char trans_len;
+ unsigned char trans_type;
+ unsigned char trans_quad;
+ unsigned char trans_global;
+ unsigned char trans_local;
+ unsigned short trans_reserved;
+};
+
static int mpc_record;
static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
__cpuinitdata;
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+ return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+
+static inline int mpc_apic_id(struct mpc_config_processor *m,
+ struct mpc_config_translation *translation_record)
+{
+ int quad = translation_record->trans_quad;
+ int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+ printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+ m->mpc_apicid,
+ (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+ (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+ m->mpc_apicver, quad, logical_apicid);
+ return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
+ struct mpc_config_translation *translation)
+{
+ int quad = translation->trans_quad;
+ int local = translation->trans_local;
+
+ mp_bus_id_to_node[m->mpc_busid] = quad;
+ mp_bus_id_to_local[m->mpc_busid] = local;
+ printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+ m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+static void mpc_oem_pci_bus(struct mpc_config_bus *m,
+ struct mpc_config_translation *translation)
+{
+ int quad = translation->trans_quad;
+ int local = translation->trans_local;
+
+ quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
#endif
static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
@@ -90,7 +128,10 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
return;
}
#ifdef CONFIG_X86_NUMAQ
- apicid = mpc_apic_id(m, translation_table[mpc_record]);
+ if (found_numaq)
+ apicid = mpc_apic_id(m, translation_table[mpc_record]);
+ else
+ apicid = m->mpc_apicid;
#else
apicid = m->mpc_apicid;
#endif
@@ -103,17 +144,18 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
generic_processor_info(apicid, m->mpc_apicver);
}
+#ifdef CONFIG_X86_IO_APIC
static void __init MP_bus_info(struct mpc_config_bus *m)
{
char str[7];
-
memcpy(str, m->mpc_bustype, 6);
str[6] = 0;
#ifdef CONFIG_X86_NUMAQ
- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+ if (found_numaq)
+ mpc_oem_bus_info(m, str, translation_table[mpc_record]);
#else
- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+ printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
#endif
#if MAX_MP_BUSSES < 256
@@ -132,11 +174,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
#ifdef CONFIG_X86_NUMAQ
- mpc_oem_pci_bus(m, translation_table[mpc_record]);
+ if (found_numaq)
+ mpc_oem_pci_bus(m, translation_table[mpc_record]);
#endif
clear_bit(m->mpc_busid, mp_bus_not_pci);
- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
- mp_current_pci_id++;
#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -147,6 +188,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
} else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
+#endif
#ifdef CONFIG_X86_IO_APIC
@@ -176,18 +218,89 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
if (bad_ioapic(m->mpc_apicaddr))
return;
- mp_ioapics[nr_ioapics] = *m;
+ mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
+ mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
+ mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
+ mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
+ mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
nr_ioapics++;
}
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
{
- mp_irqs[mp_irq_entries] = *m;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+ printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+}
+
+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+{
+ printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
+ (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
+ mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+}
+
+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
+ struct mp_config_intsrc *mp_irq)
+{
+ mp_irq->mp_dstapic = m->mpc_dstapic;
+ mp_irq->mp_type = m->mpc_type;
+ mp_irq->mp_irqtype = m->mpc_irqtype;
+ mp_irq->mp_irqflag = m->mpc_irqflag;
+ mp_irq->mp_srcbus = m->mpc_srcbus;
+ mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
+ mp_irq->mp_dstirq = m->mpc_dstirq;
+}
+
+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+ struct mpc_config_intsrc *m)
+{
+ m->mpc_dstapic = mp_irq->mp_dstapic;
+ m->mpc_type = mp_irq->mp_type;
+ m->mpc_irqtype = mp_irq->mp_irqtype;
+ m->mpc_irqflag = mp_irq->mp_irqflag;
+ m->mpc_srcbus = mp_irq->mp_srcbus;
+ m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
+ m->mpc_dstirq = mp_irq->mp_dstirq;
+}
+
+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+ struct mpc_config_intsrc *m)
+{
+ if (mp_irq->mp_dstapic != m->mpc_dstapic)
+ return 1;
+ if (mp_irq->mp_type != m->mpc_type)
+ return 2;
+ if (mp_irq->mp_irqtype != m->mpc_irqtype)
+ return 3;
+ if (mp_irq->mp_irqflag != m->mpc_irqflag)
+ return 4;
+ if (mp_irq->mp_srcbus != m->mpc_srcbus)
+ return 5;
+ if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+ return 6;
+ if (mp_irq->mp_dstirq != m->mpc_dstirq)
+ return 7;
+
+ return 0;
+}
+
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+{
+ int i;
+
+ print_MP_intsrc_info(m);
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+ return;
+ }
+
+ assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
@@ -196,7 +309,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
{
- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+ printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -266,11 +379,14 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
}
}
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
char *productid)
{
if (strncmp(oem, "IBM NUMA", 8))
- printk("Warning! May not be a NUMA-Q system!\n");
+ printk("Warning! Not a NUMA-Q system!\n");
+ else
+ found_numaq = 1;
+
if (mpc->mpc_oemptr)
smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
mpc->mpc_oemsize);
@@ -281,12 +397,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
* Read/parse the MPC
*/
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
+ char *str)
{
- char str[16];
- char oem[10];
- int count = sizeof(*mpc);
- unsigned char *mpt = ((unsigned char *)mpc) + count;
if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -309,19 +422,42 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
}
memcpy(oem, mpc->mpc_oem, 8);
oem[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+ printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
memcpy(str, mpc->mpc_productid, 12);
str[12] = 0;
- printk("Product ID: %s ", str);
-#ifdef CONFIG_X86_32
- mps_oem_check(mpc, oem, str);
-#endif
- printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+ printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
+ return 1;
+}
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+ char str[16];
+ char oem[10];
+
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
+
+#ifdef CONFIG_X86_32
+ /*
+ * need to make sure summit and es7000's mps_oem_check is safe to be
+ * called early via genericarch 's mps_oem_check
+ */
+ if (early) {
+#ifdef CONFIG_X86_NUMAQ
+ numaq_mps_oem_check(mpc, oem, str);
+#endif
+ } else
+ mps_oem_check(mpc, oem, str);
+#endif
+
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
mp_lapic_addr = mpc->mpc_lapic;
@@ -352,7 +488,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
{
struct mpc_config_bus *m =
(struct mpc_config_bus *)mpt;
+#ifdef CONFIG_X86_IO_APIC
MP_bus_info(m);
+#endif
mpt += sizeof(*m);
count += sizeof(*m);
break;
@@ -402,6 +540,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
++mpc_record;
#endif
}
+
+#ifdef CONFIG_X86_GENERICARCH
+ generic_bigsmp_probe();
+#endif
+
setup_apic_routing();
if (!num_processors)
printk(KERN_ERR "MPTABLE: no processors registered!\n");
@@ -427,7 +570,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
intsrc.mpc_type = MP_INTSRC;
intsrc.mpc_irqflag = 0; /* conforming */
intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+ intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
intsrc.mpc_irqtype = mp_INT;
@@ -488,40 +631,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
MP_intsrc_info(&intsrc);
}
-#endif
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+static void construct_ioapic_table(int mpc_default_type)
{
- struct mpc_config_processor processor;
- struct mpc_config_bus bus;
-#ifdef CONFIG_X86_IO_APIC
struct mpc_config_ioapic ioapic;
-#endif
- struct mpc_config_lintsrc lintsrc;
- int linttypes[2] = { mp_ExtINT, mp_NMI };
- int i;
-
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /*
- * 2 CPUs, numbered 0 & 1.
- */
- processor.mpc_type = MP_PROCESSOR;
- /* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
- for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
- MP_processor_info(&processor);
- }
+ struct mpc_config_bus bus;
bus.mpc_type = MP_BUS;
bus.mpc_busid = 0;
@@ -550,7 +664,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
MP_bus_info(&bus);
}
-#ifdef CONFIG_X86_IO_APIC
ioapic.mpc_type = MP_IOAPIC;
ioapic.mpc_apicid = 2;
ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -562,7 +675,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
* We set up most of the low 16 IO-APIC pins according to MPS rules.
*/
construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void construct_ioapic_table(int mpc_default_type) { }
#endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+ struct mpc_config_processor processor;
+ struct mpc_config_lintsrc lintsrc;
+ int linttypes[2] = { mp_ExtINT, mp_NMI };
+ int i;
+
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /*
+ * 2 CPUs, numbered 0 & 1.
+ */
+ processor.mpc_type = MP_PROCESSOR;
+ /* Either an integrated APIC or a discrete 82489DX. */
+ processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_cpuflag = CPU_ENABLED;
+ processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+ processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_reserved[0] = 0;
+ processor.mpc_reserved[1] = 0;
+ for (i = 0; i < 2; i++) {
+ processor.mpc_apicid = i;
+ MP_processor_info(&processor);
+ }
+
+ construct_ioapic_table(mpc_default_type);
+
lintsrc.mpc_type = MP_LINTSRC;
lintsrc.mpc_irqflag = 0; /* conforming */
lintsrc.mpc_srcbusid = 0;
@@ -600,7 +748,7 @@ static void __init __get_smp_config(unsigned early)
printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
mpf->mpf_specification);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
if (mpf->mpf_feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
@@ -632,7 +780,9 @@ static void __init __get_smp_config(unsigned early)
* override the defaults.
*/
if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
+#endif
printk(KERN_ERR
"BIOS bug, MP table errors detected!...\n");
printk(KERN_ERR "... disabling SMP support. "
@@ -689,7 +839,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned int *bp = phys_to_virt(base);
struct intel_mp_floating *mpf;
- Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+ printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
@@ -699,15 +849,21 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
!mpf_checksum((unsigned char *)bp, 16) &&
((mpf->mpf_specification == 1)
|| (mpf->mpf_specification == 4))) {
-
+#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 1;
+#endif
mpf_found = mpf;
-#ifdef CONFIG_X86_32
+
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
mpf, virt_to_phys(mpf));
- reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+
+ if (!reserve)
+ return 1;
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
BOOTMEM_DEFAULT);
if (mpf->mpf_physptr) {
+ unsigned long size = PAGE_SIZE;
+#ifdef CONFIG_X86_32
/*
* We cannot access to MPC table to compute
* table size yet, as only few megabytes from
@@ -717,25 +873,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
* PAGE_SIZE from mpg->mpf_physptr yields BUG()
* in reserve_bootmem.
*/
- unsigned long size = PAGE_SIZE;
unsigned long end = max_low_pfn * PAGE_SIZE;
if (mpf->mpf_physptr + size > end)
size = end - mpf->mpf_physptr;
- reserve_bootmem(mpf->mpf_physptr, size,
+#endif
+ reserve_bootmem_generic(mpf->mpf_physptr, size,
BOOTMEM_DEFAULT);
}
-#else
- if (!reserve)
- return 1;
-
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
- BOOTMEM_DEFAULT);
- if (mpf->mpf_physptr)
- reserve_bootmem_generic(mpf->mpf_physptr,
- PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
- return 1;
+ return 1;
}
bp += 4;
length -= 16;
@@ -791,298 +937,294 @@ void __init find_smp_config(void)
__find_smp_config(1);
}
-/* --------------------------------------------------------------------------
- ACPI-based MP Configuration
- -------------------------------------------------------------------------- */
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
-/*
- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
- */
-int es7000_plat;
+static int __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+{
+ int i;
-#ifdef CONFIG_ACPI
+ if (m->mpc_irqtype != mp_INT)
+ return 0;
-#ifdef CONFIG_X86_IO_APIC
+ if (m->mpc_irqflag != 0x0f)
+ return 0;
-#define MP_ISA_BUS 0
+ /* not legacy */
-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (mp_irqs[i].mp_irqtype != mp_INT)
+ continue;
-static int mp_find_ioapic(int gsi)
-{
- int i = 0;
+ if (mp_irqs[i].mp_irqflag != 0x0f)
+ continue;
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_base)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
+ if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+ continue;
+ if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+ continue;
+ if (irq_used[i]) {
+ /* already claimed */
+ return -2;
+ }
+ irq_used[i] = 1;
+ return i;
}
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+ /* not found */
return -1;
}
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return io_apic_get_unique_id(nr_ioapics, id);
- else
- return id;
-#else
- int i;
- DECLARE_BITMAP(used, 256);
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_config_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->mpc_apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
+#define SPARE_SLOT_NUM 20
+
+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
#endif
-}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+static int __init replace_intsrc_all(struct mp_config_table *mpc,
+ unsigned long mpc_new_phys,
+ unsigned long mpc_new_length)
{
- int idx = 0;
-
- if (bad_ioapic(address))
- return;
+#ifdef CONFIG_X86_IO_APIC
+ int i;
+ int nr_m_spare = 0;
+#endif
- idx = nr_ioapics;
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
- mp_ioapics[idx].mpc_type = MP_IOAPIC;
- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mpc_apicaddr = address;
+ printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
+ while (count < mpc->mpc_length) {
+ switch (*mpt) {
+ case MP_PROCESSOR:
+ {
+ struct mpc_config_processor *m =
+ (struct mpc_config_processor *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_BUS:
+ {
+ struct mpc_config_bus *m =
+ (struct mpc_config_bus *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ case MP_IOAPIC:
+ {
+ mpt += sizeof(struct mpc_config_ioapic);
+ count += sizeof(struct mpc_config_ioapic);
+ break;
+ }
+ case MP_INTSRC:
+ {
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-#else
- mp_ioapics[idx].mpc_apicver = 0;
+ printk(KERN_INFO "OLD ");
+ print_MP_intsrc_info(m);
+ i = get_MP_intsrc_index(m);
+ if (i > 0) {
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ printk(KERN_INFO "NEW ");
+ print_mp_irq_info(&mp_irqs[i]);
+ } else if (!i) {
+ /* legacy, do nothing */
+ } else if (nr_m_spare < SPARE_SLOT_NUM) {
+ /*
+ * not found (-1), or duplicated (-2)
+ * are invalid entries,
+ * we need to use the slot later
+ */
+ m_spare[nr_m_spare] = m;
+ nr_m_spare++;
+ }
#endif
- /*
- * Build basic GSI lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
- */
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
- mp_ioapic_routing[idx].gsi_base = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
+ mpt += sizeof(struct mpc_config_intsrc);
+ count += sizeof(struct mpc_config_intsrc);
+ break;
+ }
+ case MP_LINTSRC:
+ {
+ struct mpc_config_lintsrc *m =
+ (struct mpc_config_lintsrc *)mpt;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
+ break;
+ }
+ default:
+ /* wrong mptable */
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+ printk(KERN_ERR "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->mpc_length, 1);
+ goto out;
+ }
+ }
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
- mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+#ifdef CONFIG_X86_IO_APIC
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (irq_used[i])
+ continue;
- nr_ioapics++;
-}
+ if (mp_irqs[i].mp_irqtype != mp_INT)
+ continue;
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- struct mpc_config_intsrc intsrc;
- int ioapic = -1;
- int pin = -1;
+ if (mp_irqs[i].mp_irqflag != 0x0f)
+ continue;
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_irqflag = (trigger << 2) | polarity;
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
- intsrc.mpc_dstirq = pin; /* INTIN# */
+ if (nr_m_spare > 0) {
+ printk(KERN_INFO "*NEW* found ");
+ nr_m_spare--;
+ assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+ m_spare[nr_m_spare] = NULL;
+ } else {
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
+ count += sizeof(struct mpc_config_intsrc);
+ if (!mpc_new_phys) {
+ printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
+ } else {
+ if (count <= mpc_new_length)
+ printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
+ else {
+ printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
+ goto out;
+ }
+ }
+ assign_to_mpc_intsrc(&mp_irqs[i], m);
+ mpc->mpc_length = count;
+ mpt += sizeof(struct mpc_config_intsrc);
+ }
+ print_mp_irq_info(&mp_irqs[i]);
+ }
+#endif
+out:
+ /* update checksum */
+ mpc->mpc_checksum = 0;
+ mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
+ mpc->mpc_length);
- MP_intsrc_info(&intsrc);
+ return 0;
}
-void __init mp_config_acpi_legacy_irqs(void)
-{
- struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
+static int __initdata enable_update_mptable;
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
- /*
- * Fabricate the legacy ISA bus (bus #31).
- */
- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-#endif
- set_bit(MP_ISA_BUS, mp_bus_not_pci);
- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+static int __init update_mptable_setup(char *str)
+{
+ enable_update_mptable = 1;
+ return 0;
+}
+early_param("update_mptable", update_mptable_setup);
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+ enable_update_mptable = 1;
+ alloc_mptable = 1;
+ if (!p)
+ return 0;
+ mpc_new_length = memparse(p, &p);
+ return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* Conforming */
- intsrc.mpc_srcbus = MP_ISA_BUS;
-#ifdef CONFIG_X86_IO_APIC
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+void __init early_reserve_e820_mpc_new(void)
+{
+ if (enable_update_mptable && alloc_mptable) {
+ u64 startt = 0;
+#ifdef CONFIG_X86_TRAMPOLINE
+ startt = TRAMPOLINE_BASE;
#endif
- /*
- * Use the default configuration for the IRQs 0-15. Unless
- * overridden by (MADT) interrupt source override entries.
- */
- for (i = 0; i < 16; i++) {
- int idx;
-
- for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mpc_config_intsrc *irq = mp_irqs + idx;
-
- /* Do we already have a mapping for this ISA IRQ? */
- if (irq->mpc_srcbus == MP_ISA_BUS
- && irq->mpc_srcbusirq == i)
- break;
-
- /* Do we already have a mapping for this IOAPIC pin */
- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
- (irq->mpc_dstirq == i))
- break;
- }
-
- if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
- continue; /* IRQ already used */
- }
-
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_srcbusirq = i; /* Identity mapped */
- intsrc.mpc_dstirq = i;
-
- MP_intsrc_info(&intsrc);
+ mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
}
}
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int __init update_mp_table(void)
{
- int ioapic;
- int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM 4096
-#define IRQ_COMPRESSION_START 64
+ char str[16];
+ char oem[10];
+ struct intel_mp_floating *mpf;
+ struct mp_config_table *mpc;
+ struct mp_config_table *mpc_new;
+
+ if (!enable_update_mptable)
+ return 0;
+
+ mpf = mpf_found;
+ if (!mpf)
+ return 0;
- static int pci_irq = IRQ_COMPRESSION_START;
/*
- * Mapping between Global System Interrupts, which
- * represent all possible interrupts, and IRQs
- * assigned to actual devices.
+ * Now see if we need to go further.
*/
- static int gsi_to_irq[MAX_GSI_NUM];
-#else
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-#endif
+ if (mpf->mpf_feature1 != 0)
+ return 0;
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
+ if (!mpf->mpf_physptr)
+ return 0;
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
+ mpc = phys_to_virt(mpf->mpf_physptr);
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+ if (!smp_check_mpc(mpc, oem, str))
+ return 0;
-#ifdef CONFIG_X86_32
- if (ioapic_renumber_irq)
- gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
+ printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
+ printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
- ioapic_pin);
- return gsi;
+ if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+ mpc_new_phys = 0;
+ printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
}
- if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
- return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
- return gsi;
-#endif
+
+ if (!mpc_new_phys) {
+ unsigned char old, new;
+ /* check if we can change the postion */
+ mpc->mpc_checksum = 0;
+ old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ mpc->mpc_checksum = 0xff;
+ new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+ if (old == new) {
+ printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+ return 0;
+ }
+ printk(KERN_INFO "use in-positon replacing\n");
+ } else {
+ mpf->mpf_physptr = mpc_new_phys;
+ mpc_new = phys_to_virt(mpc_new_phys);
+ memcpy(mpc_new, mpc, mpc->mpc_length);
+ mpc = mpc_new;
+ /* check if we can modify that */
+ if (mpc_new_phys - mpf->mpf_physptr) {
+ struct intel_mp_floating *mpf_new;
+ /* steal 16 bytes from [0, 1k) */
+ printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+ mpf_new = phys_to_virt(0x400 - 16);
+ memcpy(mpf_new, mpf, 16);
+ mpf = mpf_new;
+ mpf->mpf_physptr = mpc_new_phys;
+ }
+ mpf->mpf_checksum = 0;
+ mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
+ printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
}
- set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
/*
- * For GSI >= 64, use IRQ compression
+ * only replace the one with mp_INT and
+ * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW,
+ * already in mp_irqs , stored by ... and mp_config_acpi_gsi,
+ * may need pci=routeirq for all coverage
*/
- if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
- /*
- * For PCI devices assign IRQs in order, avoiding gaps
- * due to unused I/O APIC pins.
- */
- int irq = gsi;
- if (gsi < MAX_GSI_NUM) {
- /*
- * Retain the VIA chipset work-around (gsi > 15), but
- * avoid a problem where the 8254 timer (IRQ0) is setup
- * via an override (so it's not on pin 0 of the ioapic),
- * and at the same time, the pin 0 interrupt is a PCI
- * type. The gsi > 15 test could cause these two pins
- * to be shared as IRQ0, and they are not shareable.
- * So test for this condition, and if necessary, avoid
- * the pin collision.
- */
- gsi = pci_irq++;
- /*
- * Don't assign IRQ used by ACPI SCI
- */
- if (gsi == acpi_gbl_FADT.sci_interrupt)
- gsi = pci_irq++;
- gsi_to_irq[irq] = gsi;
- } else {
- printk(KERN_ERR "GSI %u is too high\n", gsi);
- return gsi;
- }
- }
-#endif
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
+ replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+ return 0;
}
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+late_initcall(update_mp_table);
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 84160f74eeb0..6580dae46277 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -24,8 +24,11 @@
#include <linux/kdebug.h>
#include <linux/slab.h>
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
#include <asm/smp.h>
#include <asm/nmi.h>
+#include <asm/timer.h>
#include "mach_traps.h"
@@ -81,7 +84,7 @@ int __init check_nmi_watchdog(void)
prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
if (!prev_nmi_count)
- return -1;
+ goto error;
printk(KERN_INFO "Testing NMI watchdog ... ");
@@ -118,7 +121,7 @@ int __init check_nmi_watchdog(void)
if (!atomic_read(&nmi_active)) {
kfree(prev_nmi_count);
atomic_set(&nmi_active, -1);
- return -1;
+ goto error;
}
printk("OK.\n");
@@ -129,6 +132,12 @@ int __init check_nmi_watchdog(void)
kfree(prev_nmi_count);
return 0;
+error:
+ if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
+ disable_8259A_irq(0);
+ timer_ack = 0;
+
+ return -1;
}
static int __init setup_nmi_watchdog(char *str)
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 2861b9408ac9..d62f3b66b529 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -21,6 +21,8 @@
#include <linux/cpumask.h>
#include <linux/kdebug.h>
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
#include <asm/smp.h>
#include <asm/nmi.h>
#include <asm/proto.h>
@@ -90,7 +92,7 @@ int __init check_nmi_watchdog(void)
prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
if (!prev_nmi_count)
- return -1;
+ goto error;
printk(KERN_INFO "Testing NMI watchdog ... ");
@@ -121,7 +123,7 @@ int __init check_nmi_watchdog(void)
if (!atomic_read(&nmi_active)) {
kfree(prev_nmi_count);
atomic_set(&nmi_active, -1);
- return -1;
+ goto error;
}
printk("OK.\n");
@@ -132,6 +134,11 @@ int __init check_nmi_watchdog(void)
kfree(prev_nmi_count);
return 0;
+error:
+ if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
+ disable_8259A_irq(0);
+
+ return -1;
}
static int __init setup_nmi_watchdog(char *str)
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index e65281b1634b..f0f1de1c4a1d 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,6 +31,8 @@
#include <asm/numaq.h>
#include <asm/topology.h>
#include <asm/processor.h>
+#include <asm/mpspec.h>
+#include <asm/e820.h>
#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
@@ -58,6 +60,8 @@ static void __init smp_dump_qct(void)
node_end_pfn[node] = MB_TO_PAGES(
eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
+ e820_register_active_regions(node, node_start_pfn[node],
+ node_end_pfn[node]);
memory_present(node,
node_start_pfn[node], node_end_pfn[node]);
node_remap_size[node] = node_memmap_size_bytes(node,
@@ -67,13 +71,24 @@ static void __init smp_dump_qct(void)
}
}
-/*
- * Unlike Summit, we don't really care to let the NUMA-Q
- * fall back to flat mode. Don't compile for NUMA-Q
- * unless you really need it!
- */
+static __init void early_check_numaq(void)
+{
+ /*
+ * Find possible boot-time SMP configuration:
+ */
+ early_find_smp_config();
+ /*
+ * get boot-time SMP configuration:
+ */
+ if (smp_found_config)
+ early_get_smp_config();
+}
+
int __init get_memcfg_numaq(void)
{
+ early_check_numaq();
+ if (!found_numaq)
+ return 0;
smp_dump_qct();
return 1;
}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 74f0c5ea2a03..f1ab0f727007 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -380,6 +380,9 @@ struct pv_mmu_ops pv_mmu_ops = {
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
+ .ptep_modify_prot_start = __ptep_modify_prot_start,
+ .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = kmap_atomic,
#endif
@@ -403,6 +406,7 @@ struct pv_mmu_ops pv_mmu_ops = {
#endif /* PAGETABLE_LEVELS >= 3 */
.pte_val = native_pte_val,
+ .pte_flags = native_pte_val,
.pgd_val = native_pgd_val,
.make_pte = native_make_pte,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index dc00a1331ace..cb0bdf440715 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -7,6 +7,7 @@
#include <asm/dma.h>
#include <asm/gart.h>
#include <asm/calgary.h>
+#include <asm/amd_iommu.h>
int forbid_dac __read_mostly;
EXPORT_SYMBOL(forbid_dac);
@@ -77,10 +78,14 @@ void __init dma32_reserve_bootmem(void)
if (end_pfn <= MAX_DMA32_PFN)
return;
+ /*
+ * check aperture_64.c allocate_aperture() for reason about
+ * using 512M as goal
+ */
align = 64ULL<<20;
size = round_up(dma32_bootmem_size, align);
dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
- __pa(MAX_DMA_ADDRESS));
+ 512ULL<<20);
if (dma32_bootmem_ptr)
dma32_bootmem_size = size;
else
@@ -88,7 +93,6 @@ void __init dma32_reserve_bootmem(void)
}
static void __init dma32_free_bootmem(void)
{
- int node;
if (end_pfn <= MAX_DMA32_PFN)
return;
@@ -96,9 +100,7 @@ static void __init dma32_free_bootmem(void)
if (!dma32_bootmem_ptr)
return;
- for_each_online_node(node)
- free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
- dma32_bootmem_size);
+ free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
dma32_bootmem_ptr = NULL;
dma32_bootmem_size = 0;
@@ -122,6 +124,8 @@ void __init pci_iommu_alloc(void)
detect_intel_iommu();
+ amd_iommu_detect();
+
#ifdef CONFIG_SWIOTLB
pci_swiotlb_init();
#endif
@@ -357,7 +361,7 @@ int dma_supported(struct device *dev, u64 mask)
EXPORT_SYMBOL(dma_supported);
/* Allocate DMA memory on node near device */
-noinline struct page *
+static noinline struct page *
dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
int node;
@@ -502,6 +506,8 @@ static int __init pci_iommu_init(void)
intel_iommu_init();
+ amd_iommu_init();
+
#ifdef CONFIG_GART_IOMMU
gart_iommu_init();
#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index aa8ec928caa8..021f3c684a62 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -104,7 +104,6 @@ static unsigned long alloc_iommu(struct device *dev, int size)
size, base_index, boundary_size, 0);
}
if (offset != -1) {
- set_bit_string(iommu_gart_bitmap, offset, size);
next_bit = offset+size;
if (next_bit >= iommu_pages) {
next_bit = 0;
@@ -534,8 +533,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
unsigned aper_size = 0, aper_base_32, aper_order;
u64 aper_base;
- pci_read_config_dword(dev, 0x94, &aper_base_32);
- pci_read_config_dword(dev, 0x90, &aper_order);
+ pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
aper_order = (aper_order >> 1) & 7;
aper_base = aper_base_32 & 0x7fff;
@@ -549,14 +548,63 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
return aper_base;
}
+static void enable_gart_translations(void)
+{
+ int i;
+
+ for (i = 0; i < num_k8_northbridges; i++) {
+ struct pci_dev *dev = k8_northbridges[i];
+
+ enable_gart_translation(dev, __pa(agp_gatt_table));
+ }
+}
+
+/*
+ * If fix_up_north_bridges is set, the north bridges have to be fixed up on
+ * resume in the same way as they are handled in gart_iommu_hole_init().
+ */
+static bool fix_up_north_bridges;
+static u32 aperture_order;
+static u32 aperture_alloc;
+
+void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
+{
+ fix_up_north_bridges = true;
+ aperture_order = aper_order;
+ aperture_alloc = aper_alloc;
+}
+
static int gart_resume(struct sys_device *dev)
{
+ printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
+
+ if (fix_up_north_bridges) {
+ int i;
+
+ printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
+
+ for (i = 0; i < num_k8_northbridges; i++) {
+ struct pci_dev *dev = k8_northbridges[i];
+
+ /*
+ * Don't enable translations just yet. That is the next
+ * step. Restore the pre-suspend aperture settings.
+ */
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
+ aperture_order << 1);
+ pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
+ aperture_alloc >> 25);
+ }
+ }
+
+ enable_gart_translations();
+
return 0;
}
static int gart_suspend(struct sys_device *dev, pm_message_t state)
{
- return -EINVAL;
+ return 0;
}
static struct sysdev_class gart_sysdev_class = {
@@ -614,27 +662,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
memset(gatt, 0, gatt_size);
agp_gatt_table = gatt;
- for (i = 0; i < num_k8_northbridges; i++) {
- u32 gatt_reg;
- u32 ctl;
-
- dev = k8_northbridges[i];
- gatt_reg = __pa(gatt) >> 12;
- gatt_reg <<= 4;
- pci_write_config_dword(dev, 0x98, gatt_reg);
- pci_read_config_dword(dev, 0x90, &ctl);
-
- ctl |= 1;
- ctl &= ~((1<<4) | (1<<5));
-
- pci_write_config_dword(dev, 0x90, ctl);
- }
+ enable_gart_translations();
error = sysdev_class_register(&gart_sysdev_class);
if (!error)
error = sysdev_register(&device_gart);
if (error)
panic("Could not register gart_sysdev -- would corrupt data on next suspend");
+
flush_gart();
printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
@@ -677,11 +712,11 @@ void gart_iommu_shutdown(void)
u32 ctl;
dev = k8_northbridges[i];
- pci_read_config_dword(dev, 0x90, &ctl);
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
- ctl &= ~1;
+ ctl &= ~GARTEN;
- pci_write_config_dword(dev, 0x90, ctl);
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
}
}
@@ -788,10 +823,10 @@ void __init gart_iommu_init(void)
wbinvd();
/*
- * Try to workaround a bug (thanks to BenH)
+ * Try to workaround a bug (thanks to BenH):
* Set unmapped entries to a scratch page instead of 0.
* Any prefetches that hit unmapped entries won't get an bus abort
- * then.
+ * then. (P2P bridge may be prefetching on DMA reads).
*/
scratch = get_zeroed_page(GFP_KERNEL);
if (!scratch)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba370dc8685b..4061d63aabe7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -6,6 +6,7 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
+#include <linux/clockchips.h>
struct kmem_cache *task_xstate_cachep;
@@ -45,6 +46,76 @@ void arch_task_cache_init(void)
SLAB_PANIC, NULL);
}
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
+#ifdef CONFIG_X86_32
+/*
+ * This halt magic was a workaround for ancient floppy DMA
+ * wreckage. It should be safe to remove.
+ */
+static int hlt_counter;
+void disable_hlt(void)
+{
+ hlt_counter++;
+}
+EXPORT_SYMBOL(disable_hlt);
+
+void enable_hlt(void)
+{
+ hlt_counter--;
+}
+EXPORT_SYMBOL(enable_hlt);
+
+static inline int hlt_use_halt(void)
+{
+ return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+}
+#else
+static inline int hlt_use_halt(void)
+{
+ return 1;
+}
+#endif
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void default_idle(void)
+{
+ if (hlt_use_halt()) {
+ current_thread_info()->status &= ~TS_POLLING;
+ /*
+ * TS_POLLING-cleared state must be visible before we
+ * test NEED_RESCHED:
+ */
+ smp_mb();
+
+ if (!need_resched())
+ safe_halt(); /* enables interrupts racelessly */
+ else
+ local_irq_enable();
+ current_thread_info()->status |= TS_POLLING;
+ } else {
+ local_irq_enable();
+ /* loop is done by the caller */
+ cpu_relax();
+ }
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
static void do_nothing(void *unused)
{
}
@@ -122,44 +193,129 @@ static void poll_idle(void)
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
+
+#define MWAIT_INFO 0x05
+#define MWAIT_ECX_EXTENDED_INFO 0x01
+#define MWAIT_EDX_C1 0xf0
+
static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
+ u32 eax, ebx, ecx, edx;
+
if (force_mwait)
return 1;
- if (c->x86_vendor == X86_VENDOR_AMD) {
- switch(c->x86) {
- case 0x10:
- case 0x11:
- return 0;
- }
- }
+ if (c->cpuid_level < MWAIT_INFO)
+ return 0;
+
+ cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+ /* Check, whether EDX has extended info about MWAIT */
+ if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+ return 1;
+
+ /*
+ * edx enumeratios MONITOR/MWAIT extensions. Check, whether
+ * C1 supports MWAIT
+ */
+ return (edx & MWAIT_EDX_C1);
+}
+
+/*
+ * Check for AMD CPUs, which have potentially C1E support
+ */
+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor != X86_VENDOR_AMD)
+ return 0;
+
+ if (c->x86 < 0x0F)
+ return 0;
+
+ /* Family 0x0f models < rev F do not have C1E */
+ if (c->x86 == 0x0f && c->x86_model < 0x40)
+ return 0;
+
return 1;
}
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+/*
+ * C1E aware idle routine. We check for C1E active in the interrupt
+ * pending message MSR. If we detect C1E, then we handle it the same
+ * way as C3 power states (local apic timer and TSC stop)
+ */
+static void c1e_idle(void)
{
- static int selected;
+ static cpumask_t c1e_mask = CPU_MASK_NONE;
+ static int c1e_detected;
- if (selected)
+ if (need_resched())
return;
+
+ if (!c1e_detected) {
+ u32 lo, hi;
+
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+ c1e_detected = 1;
+ mark_tsc_unstable("TSC halt in C1E");
+ printk(KERN_INFO "System has C1E enabled\n");
+ }
+ }
+
+ if (c1e_detected) {
+ int cpu = smp_processor_id();
+
+ if (!cpu_isset(cpu, c1e_mask)) {
+ cpu_set(cpu, c1e_mask);
+ /*
+ * Force broadcast so ACPI can not interfere. Needs
+ * to run with interrupts enabled as it uses
+ * smp_function_call.
+ */
+ local_irq_enable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+ &cpu);
+ printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+ cpu);
+ local_irq_disable();
+ }
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+
+ default_idle();
+
+ /*
+ * The switch back from broadcast mode needs to be
+ * called with interrupts disabled.
+ */
+ local_irq_disable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ local_irq_enable();
+ } else
+ default_idle();
+}
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_X86_SMP
if (pm_idle == poll_idle && smp_num_siblings > 1) {
printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
" performance may degrade.\n");
}
#endif
+ if (pm_idle)
+ return;
+
if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
/*
- * Skip, if setup has overridden idle.
* One CPU supports mwait => All CPUs supports mwait
*/
- if (!pm_idle) {
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
- selected = 1;
+ printk(KERN_INFO "using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ } else if (check_c1e_idle(c)) {
+ printk(KERN_INFO "using C1E aware idle routine\n");
+ pm_idle = c1e_idle;
+ } else
+ pm_idle = default_idle;
}
static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index e2db9ac5c61c..c2a11d77b1b5 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,11 +58,6 @@
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-static int hlt_counter;
-
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
@@ -77,55 +72,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
return ((unsigned long *)tsk->thread.sp)[3];
}
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
-void disable_hlt(void)
-{
- hlt_counter++;
-}
-
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
- hlt_counter--;
-}
-
-EXPORT_SYMBOL(enable_hlt);
-
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
- if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
-
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
- } else {
- local_irq_enable();
- /* loop is done by the caller */
- cpu_relax();
- }
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
-#endif
-
#ifdef CONFIG_HOTPLUG_CPU
#include <asm/nmi.h>
/* We don't actually take CPU down, just spin without interrupts. */
@@ -168,24 +114,19 @@ void cpu_idle(void)
while (1) {
tick_nohz_stop_sched_tick();
while (!need_resched()) {
- void (*idle)(void);
check_pgt_cache();
rmb();
- idle = pm_idle;
if (rcu_pending(cpu))
rcu_check_callbacks(cpu, 0);
- if (!idle)
- idle = default_idle;
-
if (cpu_is_offline(cpu))
play_dead();
local_irq_disable();
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
- idle();
+ pm_idle();
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c6eb5c91e5f6..290183e9731a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -56,15 +56,6 @@ asmlinkage extern void ret_from_fork(void);
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
@@ -94,25 +85,6 @@ void exit_idle(void)
__exit_idle();
}
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- if (!need_resched())
- safe_halt(); /* enables interrupts racelessly */
- else
- local_irq_enable();
- current_thread_info()->status |= TS_POLLING;
-}
-
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
@@ -150,12 +122,9 @@ void cpu_idle(void)
while (1) {
tick_nohz_stop_sched_tick();
while (!need_resched()) {
- void (*idle)(void);
rmb();
- idle = pm_idle;
- if (!idle)
- idle = default_idle;
+
if (cpu_is_offline(smp_processor_id()))
play_dead();
/*
@@ -165,7 +134,7 @@ void cpu_idle(void)
*/
local_irq_disable();
enter_idle();
- idle();
+ pm_idle();
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7835f282936..77040b6070e1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -943,13 +943,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
return copy_regset_to_user(child, &user_x86_32_view,
REGSET_XFP,
0, sizeof(struct user_fxsr_struct),
- datap);
+ datap) ? -EIO : 0;
case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
return copy_regset_from_user(child, &user_x86_32_view,
REGSET_XFP,
0, sizeof(struct user_fxsr_struct),
- datap);
+ datap) ? -EIO : 0;
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d89a648fe710..79bdcd11c66e 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -65,6 +65,7 @@ static enum {
ICH_FORCE_HPET_RESUME,
VT8237_FORCE_HPET_RESUME,
NVIDIA_FORCE_HPET_RESUME,
+ ATI_FORCE_HPET_RESUME,
} force_hpet_resume_type;
static void __iomem *rcba_base;
@@ -158,6 +159,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
+ ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
ich_force_enable_hpet);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
@@ -174,6 +177,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
static struct pci_dev *cached_dev;
+static void hpet_print_force_info(void)
+{
+ printk(KERN_INFO "HPET not enabled in BIOS. "
+ "You might try hpet=force boot option\n");
+}
+
static void old_ich_force_hpet_resume(void)
{
u32 val;
@@ -253,6 +262,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
{
if (hpet_force_user)
old_ich_force_enable_hpet(dev);
+ else
+ hpet_print_force_info();
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
@@ -290,8 +301,13 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
{
u32 uninitialized_var(val);
- if (!hpet_force_user || hpet_address || force_hpet_address)
+ if (hpet_address || force_hpet_address)
+ return;
+
+ if (!hpet_force_user) {
+ hpet_print_force_info();
return;
+ }
pci_read_config_dword(dev, 0x68, &val);
/*
@@ -330,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
vt8237_force_enable_hpet);
+static void ati_force_hpet_resume(void)
+{
+ pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
+ printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void ati_force_enable_hpet(struct pci_dev *dev)
+{
+ u32 uninitialized_var(val);
+
+ if (hpet_address || force_hpet_address)
+ return;
+
+ if (!hpet_force_user) {
+ hpet_print_force_info();
+ return;
+ }
+
+ pci_write_config_dword(dev, 0x14, 0xfed00000);
+ pci_read_config_dword(dev, 0x14, &val);
+ force_hpet_address = val;
+ force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
+ dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+ force_hpet_address);
+ cached_dev = dev;
+ return;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+ ati_force_enable_hpet);
+
/*
* Undocumented chipset feature taken from LinuxBIOS.
*/
@@ -343,8 +389,13 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
{
u32 uninitialized_var(val);
- if (!hpet_force_user || hpet_address || force_hpet_address)
+ if (hpet_address || force_hpet_address)
+ return;
+
+ if (!hpet_force_user) {
+ hpet_print_force_info();
return;
+ }
pci_write_config_dword(dev, 0x44, 0xfed00001);
pci_read_config_dword(dev, 0x44, &val);
@@ -397,6 +448,9 @@ void force_hpet_resume(void)
case NVIDIA_FORCE_HPET_RESUME:
nvidia_force_hpet_resume();
return;
+ case ATI_FORCE_HPET_RESUME:
+ ati_force_hpet_resume();
+ return;
default:
break;
}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f6be7d5f82f8..f8a62160e151 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -27,7 +27,7 @@
void (*pm_power_off)(void);
EXPORT_SYMBOL(pm_power_off);
-static long no_idt[3];
+static const struct desc_ptr no_idt = {};
static int reboot_mode;
enum reboot_type reboot_type = BOOT_KBD;
int reboot_force;
@@ -201,15 +201,15 @@ core_initcall(reboot_init);
controller to pulse the CPU reset line, which is more thorough, but
doesn't work with at least one type of 486 motherboard. It is easy
to stop this code working; hence the copious comments. */
-static unsigned long long
+static const unsigned long long
real_mode_gdt_entries [3] =
{
0x0000000000000000ULL, /* Null descriptor */
- 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
- 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
+ 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
+ 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
};
-static struct desc_ptr
+static const struct desc_ptr
real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
real_mode_idt = { 0x3ff, 0 };
@@ -231,7 +231,7 @@ real_mode_idt = { 0x3ff, 0 };
More could be done here to set up the registers as if a CPU reset had
occurred; hopefully real BIOSs don't assume much. */
-static unsigned char real_mode_switch [] =
+static const unsigned char real_mode_switch [] =
{
0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */
0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */
@@ -245,7 +245,7 @@ static unsigned char real_mode_switch [] =
0x24, 0x10, /* f: andb $0x10,al */
0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
};
-static unsigned char jump_to_bios [] =
+static const unsigned char jump_to_bios [] =
{
0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
};
@@ -255,7 +255,7 @@ static unsigned char jump_to_bios [] =
* specified by the code and length parameters.
* We assume that length will aways be less that 100!
*/
-void machine_real_restart(unsigned char *code, int length)
+void machine_real_restart(const unsigned char *code, int length)
{
local_irq_disable();
@@ -368,7 +368,7 @@ static void native_machine_emergency_restart(void)
}
case BOOT_TRIPLE:
- load_idt((const struct desc_ptr *)&no_idt);
+ load_idt(&no_idt);
__asm__ __volatile__("int3");
reboot_type = BOOT_KBD;
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index dec0b5ec25c2..61a837743fe5 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -49,7 +49,7 @@ struct device_fixup {
void (*reboot_fixup)(struct pci_dev *);
};
-static struct device_fixup fixups_table[] = {
+static const struct device_fixup fixups_table[] = {
{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
@@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = {
*/
void mach_reboot_fixups(void)
{
- struct device_fixup *cur;
+ const struct device_fixup *cur;
struct pci_dev *dev;
int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d4eaa4eb481d..ebb0a2bcdc08 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -17,6 +17,7 @@ unsigned int num_processors;
unsigned disabled_cpus __cpuinitdata;
/* Processor that is doing the boot up */
unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int max_physical_apicid;
EXPORT_SYMBOL(boot_cpu_physical_apicid);
/* Bitmask of physically existing CPUs */
@@ -206,6 +207,31 @@ void __init setup_per_cpu_areas(void)
#endif
+void __init parse_setup_data(void)
+{
+ struct setup_data *data;
+ u64 pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ case SETUP_E820_EXT:
+ parse_e820_ext(data, pa_data);
+ break;
+ default:
+ break;
+ }
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+ free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+}
+
#ifdef X86_64_NUMA
/*
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index ccd5f5cdbbe6..a9b19ad24edb 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -59,6 +59,7 @@
#include <asm/setup.h>
#include <asm/arch_hooks.h>
#include <asm/sections.h>
+#include <asm/dmi.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
@@ -67,10 +68,13 @@
#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
+#include <asm/efi.h>
+#include <asm/bugs.h>
/* This value is set up by the early boot code to point to the value
immediately after the boot time page tables. It contains a *physical*
address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
unsigned long init_pg_tables_end __initdata = ~0UL;
/*
@@ -182,6 +186,12 @@ int bootloader_type;
static unsigned int highmem_pages = -1;
/*
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
+/*
* Setup options
*/
struct screen_info screen_info;
@@ -237,42 +247,6 @@ static inline void copy_edd(void)
}
#endif
-int __initdata user_defined_memmap;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem= [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp(arg, "nopentium") == 0) {
- setup_clear_cpu_cap(X86_FEATURE_PSE);
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long mem_size;
-
- mem_size = memparse(arg, &arg);
- limit_regions(mem_size);
- user_defined_memmap = 1;
- }
- return 0;
-}
-early_param("mem", parse_mem);
-
#ifdef CONFIG_PROC_VMCORE
/* elfcorehdr= specifies the location of elf core header
* stored by the crashed kernel.
@@ -395,56 +369,6 @@ unsigned long __init find_max_low_pfn(void)
return max_low_pfn;
}
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
- unsigned int lowmem, ebda_addr;
-
- /* To determine the position of the EBDA and the */
- /* end of conventional memory, we need to look at */
- /* the BIOS data area. In a paravirtual environment */
- /* that area is absent. We'll just have to assume */
- /* that the paravirt case can handle memory setup */
- /* correctly, without our help. */
- if (paravirt_enabled())
- return;
-
- /* end of low (conventional) memory */
- lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
- lowmem <<= 10;
-
- /* start of EBDA area */
- ebda_addr = get_bios_ebda();
-
- /* Fixup: bios puts an EBDA in the top 64K segment */
- /* of conventional memory, but does not adjust lowmem. */
- if ((lowmem - ebda_addr) <= 0x10000)
- lowmem = ebda_addr;
-
- /* Fixup: bios does not report an EBDA at all. */
- /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
- if ((ebda_addr == 0) && (lowmem >= 0x9f000))
- lowmem = 0x9f000;
-
- /* Paranoia: should never happen, but... */
- if ((lowmem == 0) || (lowmem >= 0x100000))
- lowmem = 0x9f000;
-
- /* reserve all memory between lowmem and the 1MB mark */
- reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
-}
-
#ifndef CONFIG_NEED_MULTIPLE_NODES
static void __init setup_bootmem_allocator(void);
static unsigned long __init setup_memory(void)
@@ -462,11 +386,13 @@ static unsigned long __init setup_memory(void)
if (max_pfn > max_low_pfn) {
highstart_pfn = max_low_pfn;
}
+ memory_present(0, 0, highend_pfn);
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
+ memory_present(0, 0, max_low_pfn);
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
@@ -488,11 +414,12 @@ static void __init zone_sizes_init(void)
max_zone_pfns[ZONE_DMA] =
virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+ remove_all_active_ranges();
#ifdef CONFIG_HIGHMEM
max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
- add_active_range(0, 0, highend_pfn);
+ e820_register_active_regions(0, 0, highend_pfn);
#else
- add_active_range(0, 0, max_low_pfn);
+ e820_register_active_regions(0, 0, max_low_pfn);
#endif
free_area_init_nodes(max_zone_pfns);
@@ -526,25 +453,28 @@ static void __init reserve_crashkernel(void)
ret = parse_crashkernel(boot_command_line, total_mem,
&crash_size, &crash_base);
if (ret == 0 && crash_size > 0) {
- if (crash_base > 0) {
- printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
- "for crashkernel (System RAM: %ldMB)\n",
- (unsigned long)(crash_size >> 20),
- (unsigned long)(crash_base >> 20),
- (unsigned long)(total_mem >> 20));
-
- if (reserve_bootmem(crash_base, crash_size,
- BOOTMEM_EXCLUSIVE) < 0) {
- printk(KERN_INFO "crashkernel reservation "
- "failed - memory is in use\n");
- return;
- }
-
- crashk_res.start = crash_base;
- crashk_res.end = crash_base + crash_size - 1;
- } else
+ if (crash_base <= 0) {
printk(KERN_INFO "crashkernel reservation failed - "
"you have to specify a base address\n");
+ return;
+ }
+
+ if (reserve_bootmem_generic(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE) < 0) {
+ printk(KERN_INFO "crashkernel reservation failed - "
+ "memory is in use\n");
+ return;
+ }
+
+ printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+ "for crashkernel (System RAM: %ldMB)\n",
+ (unsigned long)(crash_size >> 20),
+ (unsigned long)(crash_base >> 20),
+ (unsigned long)(total_mem >> 20));
+
+ crashk_res.start = crash_base;
+ crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
}
}
#else
@@ -558,44 +488,57 @@ static bool do_relocate_initrd = false;
static void __init reserve_initrd(void)
{
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- unsigned long ramdisk_here;
-
- initrd_start = 0;
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 ramdisk_end = ramdisk_image + ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 ramdisk_here;
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
return; /* No initrd provided by bootloader */
- if (ramdisk_end < ramdisk_image) {
- printk(KERN_ERR "initrd wraps around end of memory, "
- "disabling initrd\n");
- return;
- }
+ initrd_start = 0;
+
if (ramdisk_size >= end_of_lowmem/2) {
+ free_early(ramdisk_image, ramdisk_end);
printk(KERN_ERR "initrd too large to handle, "
"disabling initrd\n");
return;
}
+
+ printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image,
+ ramdisk_end);
+
+
if (ramdisk_end <= end_of_lowmem) {
/* All in lowmem, easy case */
- reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
+ /*
+ * don't need to reserve again, already reserved early
+ * in i386_start_kernel
+ */
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start+ramdisk_size;
return;
}
/* We need to move the initrd down into lowmem */
- ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
+ ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+ end_of_lowmem, ramdisk_size,
+ PAGE_SIZE);
+
+ if (ramdisk_here == -1ULL)
+ panic("Cannot find place for new RAMDISK of size %lld\n",
+ ramdisk_size);
/* Note: this includes all the lowmem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
+ reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+ "NEW RAMDISK");
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
+ printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+ ramdisk_here, ramdisk_here + ramdisk_size);
do_relocate_initrd = true;
}
@@ -604,10 +547,10 @@ static void __init reserve_initrd(void)
static void __init relocate_initrd(void)
{
- unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
- unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
- unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
- unsigned long ramdisk_here;
+ u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+ u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+ u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+ u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
@@ -624,6 +567,10 @@ static void __init relocate_initrd(void)
p = (char *)__va(ramdisk_image);
memcpy(q, p, clen);
q += clen;
+ /* need to free these low pages...*/
+ printk(KERN_INFO "Freeing old partial RAMDISK %08llx-%08llx\n",
+ ramdisk_image, ramdisk_image + clen - 1);
+ free_bootmem(ramdisk_image, clen);
ramdisk_image += clen;
ramdisk_size -= clen;
}
@@ -642,66 +589,47 @@ static void __init relocate_initrd(void)
ramdisk_image += clen;
ramdisk_size -= clen;
}
+ /* high pages is not converted by early_res_to_bootmem */
+ ramdisk_image = boot_params.hdr.ramdisk_image;
+ ramdisk_size = boot_params.hdr.ramdisk_size;
+ printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n",
+ ramdisk_image, ramdisk_image + ramdisk_size - 1,
+ ramdisk_here, ramdisk_here + ramdisk_size - 1);
+
+ /* need to free that, otherwise init highmem will reserve it again */
+ free_early(ramdisk_image, ramdisk_image+ramdisk_size);
}
#endif /* CONFIG_BLK_DEV_INITRD */
void __init setup_bootmem_allocator(void)
{
- unsigned long bootmap_size;
+ int i;
+ unsigned long bootmap_size, bootmap;
/*
* Initialize the boot-time allocator (with low memory only):
*/
- bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
-
- register_bootmem_low_pages(max_low_pfn);
-
- /*
- * Reserve the bootmem bitmap itself as well. We do this in two
- * steps (first step was init_bootmem()) because this catches
- * the (very unlikely) case of us accidentally initializing the
- * bootmem allocator with an invalid RAM area.
- */
- reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
- bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
- BOOTMEM_DEFAULT);
-
- /*
- * reserve physical page 0 - it's a special BIOS page on many boxes,
- * enabling clean reboots, SMP operation, laptop functions.
- */
- reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
-
- /* reserve EBDA region */
- reserve_ebda_region();
-
-#ifdef CONFIG_SMP
- /*
- * But first pinch a few for the stack/trampoline stuff
- * FIXME: Don't need the extra page at 4K, but need to fix
- * trampoline before removing it. (see the GDT stuff)
- */
- reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
-#ifdef CONFIG_ACPI_SLEEP
- /*
- * Reserve low memory region for sleep support.
- */
- acpi_reserve_bootmem();
-#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#endif
+ bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+ bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+ max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
+ if (bootmap == -1L)
+ panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+ reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
#ifdef CONFIG_BLK_DEV_INITRD
reserve_initrd();
#endif
- numa_kva_reserve();
- reserve_crashkernel();
+ bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
+ printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
+ max_pfn_mapped<<PAGE_SHIFT);
+ printk(KERN_INFO " low ram: %08lx - %08lx\n",
+ min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+ printk(KERN_INFO " bootmap %08lx - %08lx\n",
+ bootmap, bootmap + bootmap_size);
+ for_each_online_node(i)
+ free_bootmem_with_active_regions(i, max_low_pfn);
+ early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
- reserve_ibft_region();
}
/*
@@ -731,11 +659,7 @@ static void set_mca_bus(int x)
static void set_mca_bus(int x) { }
#endif
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
- return machine_specific_memory_setup();
-}
+static void probe_roms(void);
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
@@ -746,17 +670,21 @@ char * __init __attribute__((weak)) memory_setup(void)
*/
void __init setup_arch(char **cmdline_p)
{
+ int i;
unsigned long max_low_pfn;
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
pre_setup_arch_hook();
early_cpu_init();
early_ioremap_init();
+ reserve_setup_data();
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL32", 4))
+ "EL32", 4)) {
efi_enabled = 1;
+ efi_reserve_early();
+ }
#endif
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -780,8 +708,7 @@ void __init setup_arch(char **cmdline_p)
#endif
ARCH_SETUP
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- print_memory_map(memory_setup());
+ setup_memory_map();
copy_edd();
@@ -799,12 +726,18 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = virt_to_phys(&__bss_start);
bss_resource.end = virt_to_phys(&__bss_stop)-1;
+ parse_setup_data();
+
parse_early_param();
- if (user_defined_memmap) {
- printk(KERN_INFO "user-defined physical RAM map:\n");
- print_memory_map("user");
- }
+ finish_e820_parsing();
+
+ probe_roms();
+
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
@@ -812,14 +745,67 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled)
efi_init();
+ if (ppro_with_ram_bug()) {
+ e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+ E820_RESERVED);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ printk(KERN_INFO "fixed physical RAM map:\n");
+ e820_print_map("bad_ppro");
+ }
+
+ e820_register_active_regions(0, 0, -1UL);
+ /*
+ * partially used pages are not usable - thus
+ * we are rounding upwards:
+ */
+ max_pfn = e820_end_of_ram();
+
+ /* preallocate 4k for mptable mpc */
+ early_reserve_e820_mpc_new();
/* update e820 for memory not covered by WB MTRRs */
- propagate_e820_map();
mtrr_bp_init();
- if (mtrr_trim_uncached_memory(max_pfn))
- propagate_e820_map();
+ if (mtrr_trim_uncached_memory(max_pfn)) {
+ remove_all_active_ranges();
+ e820_register_active_regions(0, 0, -1UL);
+ max_pfn = e820_end_of_ram();
+ }
+
+ dmi_scan_machine();
+
+ io_delay_init();
+
+#ifdef CONFIG_ACPI
+ /*
+ * Parse the ACPI tables for possible boot-time SMP configuration.
+ */
+ acpi_boot_table_init();
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * Parse SRAT to discover nodes.
+ */
+ acpi_numa_init();
+#endif
max_low_pfn = setup_memory();
+#ifdef CONFIG_ACPI_SLEEP
+ /*
+ * Reserve low memory region for sleep support.
+ */
+ acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+ reserve_crashkernel();
+
+ reserve_ibft_region();
+
#ifdef CONFIG_KVM_CLOCK
kvmclock_init();
#endif
@@ -843,9 +829,6 @@ void __init setup_arch(char **cmdline_p)
* not to exceed the 8Mb limit.
*/
-#ifdef CONFIG_SMP
- smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
paging_init();
/*
@@ -857,10 +840,6 @@ void __init setup_arch(char **cmdline_p)
init_ohci1394_dma_on_all_controllers();
#endif
- remapped_pgdat_init();
- sparse_init();
- zone_sizes_init();
-
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
@@ -869,42 +848,41 @@ void __init setup_arch(char **cmdline_p)
relocate_initrd();
#endif
- paravirt_post_allocator_init();
-
- dmi_scan_machine();
+ remapped_pgdat_init();
+ sparse_init();
+ zone_sizes_init();
- io_delay_init();
+ paravirt_post_allocator_init();
#ifdef CONFIG_X86_GENERICARCH
generic_apic_probe();
#endif
-#ifdef CONFIG_ACPI
- /*
- * Parse the ACPI tables for possible boot-time SMP configuration.
- */
- acpi_boot_table_init();
-#endif
-
early_quirks();
#ifdef CONFIG_ACPI
acpi_boot_init();
-
+#endif
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+ if (smp_found_config)
+ get_smp_config();
+#endif
#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
if (def_to_bigsmp)
printk(KERN_WARNING "More than 8 CPUs detected and "
"CONFIG_X86_PC cannot handle it.\nUse "
"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
#endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
- if (smp_found_config)
- get_smp_config();
-#endif
- e820_register_memory();
- e820_mark_nosave_regions();
+ e820_reserve_resources();
+ e820_mark_nosave_regions(max_low_pfn);
+
+ request_resource(&iomem_resource, &video_ram_resource);
+ /* request I/O space for devices used on all i[345]86 PCs */
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+ request_resource(&ioport_resource, &standard_io_resources[i]);
+
+ e820_setup_gap();
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
@@ -916,25 +894,147 @@ void __init setup_arch(char **cmdline_p)
#endif
}
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .end = 0xfffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .end = 0xeffff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+ .name = "Adapter ROM",
+ .start = 0xc8000,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+ .name = "Adapter ROM",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .end = 0xc7fff,
+ .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
{
+ const unsigned short * const ptr = (const unsigned short *)rom;
+ unsigned short sig;
+
+ return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+ unsigned char sum, c;
+
+ for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+ sum += c;
+ return !length && !sum;
+}
+
+static void __init probe_roms(void)
+{
+ const unsigned char *rom;
+ unsigned long start, length, upper;
+ unsigned char c;
int i;
- printk(KERN_INFO "Setting up standard PCI resources\n");
- init_iomem_resources(&code_resource, &data_resource, &bss_resource);
+ /* video rom */
+ upper = adapter_rom_resources[0].start;
+ for (start = video_rom_resource.start; start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
- request_resource(&iomem_resource, &video_ram_resource);
+ video_rom_resource.start = start;
- /* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
- request_resource(&ioport_resource, &standard_io_resources[i]);
- return 0;
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+
+ request_resource(&iomem_resource, &video_rom_resource);
+ break;
+ }
+
+ start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ request_resource(&iomem_resource, &system_rom_resource);
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt(extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ if (romchecksum(rom, length)) {
+ request_resource(&iomem_resource, &extension_rom_resource);
+ upper = extension_rom_resource.start;
+ }
+ }
+
+ /* check for adapter roms on 2k boundaries */
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ if (probe_kernel_address(rom + 2, c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ adapter_rom_resources[i].start = start;
+ adapter_rom_resources[i].end = start + length - 1;
+ request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+ start = adapter_rom_resources[i++].end & ~2047UL;
+ }
}
-subsys_initcall(request_standard_resources);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 4a666cdccb68..16ef53ab538a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -56,6 +56,7 @@
#include <asm/desc.h>
#include <video/edid.h>
#include <asm/e820.h>
+#include <asm/mpspec.h>
#include <asm/dma.h>
#include <asm/gart.h>
#include <asm/mpspec.h>
@@ -71,6 +72,7 @@
#include <asm/topology.h>
#include <asm/trampoline.h>
#include <asm/pat.h>
+#include <asm/mmconfig.h>
#include <mach_apic.h>
#ifdef CONFIG_PARAVIRT
@@ -79,6 +81,8 @@
#define ARCH_SETUP
#endif
+#include "cpu/cpu.h"
+
/*
* Machine setup..
*/
@@ -95,8 +99,6 @@ int bootloader_type;
unsigned long saved_video_mode;
-int force_mwait __cpuinitdata;
-
/*
* Early DMI memory
*/
@@ -118,7 +120,7 @@ EXPORT_SYMBOL_GPL(edid_info);
extern int root_mountflags;
-char __initdata command_line[COMMAND_LINE_SIZE];
+static char __initdata command_line[COMMAND_LINE_SIZE];
static struct resource standard_io_resources[] = {
{ .name = "dma1", .start = 0x00, .end = 0x1f,
@@ -164,6 +166,7 @@ static struct resource bss_resource = {
.flags = IORESOURCE_RAM,
};
+static void __init early_cpu_init(void);
static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
#ifdef CONFIG_PROC_VMCORE
@@ -265,46 +268,6 @@ static inline void __init reserve_crashkernel(void)
{}
#endif
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) __init memory_setup(void)
-{
- machine_specific_memory_setup();
-}
-
-static void __init parse_setup_data(void)
-{
- struct setup_data *data;
- unsigned long pa_data;
-
- if (boot_params.hdr.version < 0x0209)
- return;
- pa_data = boot_params.hdr.setup_data;
- while (pa_data) {
- data = early_ioremap(pa_data, PAGE_SIZE);
- switch (data->type) {
- default:
- break;
- }
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
- free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
- pa_data = data->next;
- early_iounmap(data, PAGE_SIZE);
- }
-}
-
-#ifdef CONFIG_PCI_MMCONFIG
-extern void __cpuinit fam10h_check_enable_mmcfg(void);
-extern void __init check_enable_amd_mmconf_dmi(void);
-#else
-void __cpuinit fam10h_check_enable_mmcfg(void)
-{
-}
-void __init check_enable_amd_mmconf_dmi(void)
-{
-}
-#endif
-
/*
* setup_arch - architecture-specific boot-time initializations
*
@@ -329,13 +292,15 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL64", 4))
+ "EL64", 4)) {
efi_enabled = 1;
+ efi_reserve_early();
+ }
#endif
ARCH_SETUP
- memory_setup();
+ setup_memory_map();
copy_edd();
if (!boot_params.hdr.root_flags)
@@ -352,6 +317,7 @@ void __init setup_arch(char **cmdline_p)
bss_resource.start = virt_to_phys(&__bss_start);
bss_resource.end = virt_to_phys(&__bss_stop)-1;
+ early_cpu_init();
early_identify_cpu(&boot_cpu_data);
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
@@ -381,9 +347,13 @@ void __init setup_arch(char **cmdline_p)
* we are rounding upwards:
*/
end_pfn = e820_end_of_ram();
+
+ /* pre allocte 4k for mptable mpc */
+ early_reserve_e820_mpc_new();
/* update e820 for memory not covered by WB MTRRs */
mtrr_bp_init();
if (mtrr_trim_uncached_memory(end_pfn)) {
+ remove_all_active_ranges();
e820_register_active_regions(0, 0, -1UL);
end_pfn = e820_end_of_ram();
}
@@ -392,7 +362,7 @@ void __init setup_arch(char **cmdline_p)
check_efer();
- max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
+ max_pfn_mapped = init_memory_mapping(0, (end_pfn << PAGE_SHIFT));
if (efi_enabled)
efi_init();
@@ -444,13 +414,12 @@ void __init setup_arch(char **cmdline_p)
acpi_reserve_bootmem();
#endif
- if (efi_enabled)
- efi_reserve_bootmem();
-
+#ifdef CONFIG_X86_MPPARSE
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
+#endif
#ifdef CONFIG_BLK_DEV_INITRD
if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -493,11 +462,13 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
+#ifdef CONFIG_X86_MPPARSE
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
get_smp_config();
+#endif
init_apic_mappings();
ioapic_init_mappings();
@@ -507,7 +478,7 @@ void __init setup_arch(char **cmdline_p)
* We trust e820 completely. No explicit ROM probing in memory.
*/
e820_reserve_resources();
- e820_mark_nosave_regions();
+ e820_mark_nosave_regions(end_pfn);
/* request I/O space for devices used on all i[345]86 PCs */
for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
@@ -528,7 +499,20 @@ void __init setup_arch(char **cmdline_p)
check_enable_amd_mmconf_dmi();
}
-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+ display_cacheinfo(c);
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+ .c_init = default_init,
+ .c_vendor = "Unknown",
+};
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
@@ -544,7 +528,7 @@ static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
}
-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, eax, ebx, ecx, edx;
@@ -576,228 +560,6 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
}
}
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
- int i, node;
-
- for (i = apicid - 1; i >= 0; i--) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
- node = apicid_to_node[i];
- if (node != NUMA_NO_NODE && node_online(node))
- return node;
- }
- return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned bits;
-#ifdef CONFIG_NUMA
- int cpu = smp_processor_id();
- int node = 0;
- unsigned apicid = hard_smp_processor_id();
-#endif
- bits = c->x86_coreid_bits;
-
- /* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
- /* Convert the initial APIC ID into the socket ID */
- c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
- node = c->phys_proc_id;
- if (apicid_to_node[apicid] != NUMA_NO_NODE)
- node = apicid_to_node[apicid];
- if (!node_online(node)) {
- /* Two possibilities here:
- - The CPU is missing memory and no node was created.
- In that case try picking one from a nearby CPU
- - The APIC IDs differ from the HyperTransport node IDs
- which the K8 northbridge parsing fills in.
- Assume they are all increased by a constant offset,
- but in the same order as the HT nodeids.
- If that doesn't result in a usable node fall back to the
- path for the previous case. */
-
- int ht_nodeid = c->initial_apicid;
-
- if (ht_nodeid >= 0 &&
- apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
- node = apicid_to_node[ht_nodeid];
- /* Pick a nearby node */
- if (!node_online(node))
- node = nearby_node(apicid);
- }
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
- unsigned bits, ecx;
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level < 0x80000008)
- return;
-
- ecx = cpuid_ecx(0x80000008);
-
- c->x86_max_cores = (ecx & 0xff) + 1;
-
- /* CPU telling us the core id bits shift? */
- bits = (ecx >> 12) & 0xF;
-
- /* Otherwise recompute */
- if (bits == 0) {
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- }
-
- c->x86_coreid_bits = bits;
-
-#endif
-}
-
-#define ENABLE_C1E_MASK 0x18000000
-#define CPUID_PROCESSOR_SIGNATURE 1
-#define CPUID_XFAM 0x0ff00000
-#define CPUID_XFAM_K8 0x00000000
-#define CPUID_XFAM_10H 0x00100000
-#define CPUID_XFAM_11H 0x00200000
-#define CPUID_XMOD 0x000f0000
-#define CPUID_XMOD_REV_F 0x00040000
-
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
- u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-
- switch (eax & CPUID_XFAM) {
- case CPUID_XFAM_K8:
- if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
- break;
- case CPUID_XFAM_10H:
- case CPUID_XFAM_11H:
- rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
- if (lo & ENABLE_C1E_MASK)
- return 1;
- break;
- default:
- /* err on the side of caution */
- return 1;
- }
- return 0;
-}
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
- early_init_amd_mc(c);
-
- /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
- if (c->x86_power & (1<<8))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
- unsigned level;
-
-#ifdef CONFIG_SMP
- unsigned long value;
-
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 15) {
- rdmsrl(MSR_K8_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K8_HWCR, value);
- }
-#endif
-
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_cpu_cap(c, 0*32+31);
-
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- level = cpuid_eax(1);
- if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
- level >= 0x0f58))
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- if (c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- level = get_model_name(c);
- if (!level) {
- switch (c->x86) {
- case 15:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
- display_cacheinfo(c);
-
- /* Multi core CPU? */
- if (c->extended_cpuid_level >= 0x80000008)
- amd_detect_cmp(c);
-
- if (c->extended_cpuid_level >= 0x80000006 &&
- (cpuid_edx(0x80000006) & 0xf000))
- num_cache_leaves = 4;
- else
- num_cache_leaves = 3;
-
- if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
- set_cpu_cap(c, X86_FEATURE_K8);
-
- /* MFENCE stops RDTSC speculation */
- set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
- if (c->x86 == 0x10)
- fam10h_check_enable_mmcfg();
-
- if (amd_apic_timer_broken())
- disable_apic_timer = 1;
-
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
- (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
-}
-
void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
@@ -848,135 +610,59 @@ out:
#endif
}
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
- unsigned int eax, t;
-
- if (c->cpuid_level < 4)
- return 1;
-
- cpuid_count(4, 0, &eax, &t, &t, &t);
-
- if (eax & 0x1f)
- return ((eax >> 26) + 1);
- else
- return 1;
-}
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
- unsigned node;
- int cpu = smp_processor_id();
- int apicid = hard_smp_processor_id();
-
- /* Don't do the funky fallback heuristics the AMD version employs
- for now. */
- node = apicid_to_node[apicid];
- if (node == NUMA_NO_NODE || !node_online(node))
- node = first_node(node_online_map);
- numa_set_node(cpu, node);
-
- printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
- /* Cache sizes */
- unsigned n;
-
- init_intel_cacheinfo(c);
- if (c->cpuid_level > 9) {
- unsigned eax = cpuid_eax(10);
- /* Check for version and the number of counters */
- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
- set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
- }
-
- if (cpu_has_ds) {
- unsigned int l1, l2;
- rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
- set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
- set_cpu_cap(c, X86_FEATURE_PEBS);
+ char *v = c->x86_vendor_id;
+ int i;
+ static int printed;
+
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ if (cpu_devs[i]) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+ c->x86_vendor = i;
+ this_cpu = cpu_devs[i];
+ return;
+ }
+ }
}
-
-
- if (cpu_has_bts)
- ds_init_intel(c);
-
- n = c->extended_cpuid_level;
- if (n >= 0x80000008) {
- unsigned eax = cpuid_eax(0x80000008);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- /* CPUID workaround for Intel 0F34 CPU */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 0xF && c->x86_model == 0x3 &&
- c->x86_mask == 0x4)
- c->x86_phys_bits = 36;
+ if (!printed) {
+ printed++;
+ printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+ printk(KERN_ERR "CPU: Your system may be unstable.\n");
}
-
- if (c->x86 == 15)
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
- c->x86_max_cores = intel_num_cpu_cores(c);
-
- srat_detect_node();
-}
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
- if (c->x86 == 0x6 && c->x86_model >= 0xf)
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ c->x86_vendor = X86_VENDOR_UNKNOWN;
}
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+static void __init early_cpu_support_print(void)
{
- /* Cache sizes */
- unsigned n;
+ int i,j;
+ struct cpu_dev *cpu_devx;
- n = c->extended_cpuid_level;
- if (n >= 0x80000008) {
- unsigned eax = cpuid_eax(0x80000008);
- c->x86_virt_bits = (eax >> 8) & 0xff;
- c->x86_phys_bits = eax & 0xff;
- }
-
- if (c->x86 == 0x6 && c->x86_model >= 0xf) {
- c->x86_cache_alignment = c->x86_clflush_size * 2;
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ printk("KERNEL supported cpus:\n");
+ for (i = 0; i < X86_VENDOR_NUM; i++) {
+ cpu_devx = cpu_devs[i];
+ if (!cpu_devx)
+ continue;
+ for (j = 0; j < 2; j++) {
+ if (!cpu_devx->c_ident[j])
+ continue;
+ printk(" %s %s\n", cpu_devx->c_vendor,
+ cpu_devx->c_ident[j]);
+ }
}
- set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
}
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+static void __init early_cpu_init(void)
{
- char *v = c->x86_vendor_id;
+ struct cpu_vendor_dev *cvdev;
- if (!strcmp(v, "AuthenticAMD"))
- c->x86_vendor = X86_VENDOR_AMD;
- else if (!strcmp(v, "GenuineIntel"))
- c->x86_vendor = X86_VENDOR_INTEL;
- else if (!strcmp(v, "CentaurHauls"))
- c->x86_vendor = X86_VENDOR_CENTAUR;
- else
- c->x86_vendor = X86_VENDOR_UNKNOWN;
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cvdev++)
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+ early_cpu_support_print();
}
/* Do some early cpuid on the boot CPU to get some parameter that are
@@ -1057,17 +743,9 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- early_init_amd(c);
- break;
- case X86_VENDOR_INTEL:
- early_init_intel(c);
- break;
- case X86_VENDOR_CENTAUR:
- early_init_centaur(c);
- break;
- }
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
validate_pat_support(c);
}
@@ -1095,24 +773,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
* At the end of this section, c->x86_capability better
* indicate the features this CPU genuinely supports!
*/
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- init_amd(c);
- break;
-
- case X86_VENDOR_INTEL:
- init_intel(c);
- break;
-
- case X86_VENDOR_CENTAUR:
- init_centaur(c);
- break;
-
- case X86_VENDOR_UNKNOWN:
- default:
- display_cacheinfo(c);
- break;
- }
+ if (this_cpu->c_init)
+ this_cpu->c_init(c);
detect_ht(c);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bc1e1257e515..ae0a7a200421 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -59,7 +59,6 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
-#include <asm/nmi.h>
#include <asm/vmi.h>
#include <asm/genapic.h>
#include <linux/mc146818rtc.h>
@@ -539,23 +538,6 @@ cpumask_t cpu_coregroup_map(int cpu)
return c->llc_shared_map;
}
-#ifdef CONFIG_X86_32
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
-{
- trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
- /*
- * Has to be in very low memory so we can execute
- * real-mode AP code.
- */
- if (__pa(trampoline_base) >= 0x9F000)
- BUG();
-}
-#endif
-
static void impress_friends(void)
{
int cpu;
@@ -1174,9 +1156,11 @@ static int __init smp_sanity_check(unsigned max_cpus)
* If SMP should be disabled, then really disable it!
*/
if (!max_cpus) {
- printk(KERN_INFO "SMP mode deactivated,"
- "forcing use of dummy APIC emulation.\n");
+ printk(KERN_INFO "SMP mode deactivated.\n");
smpboot_clear_io_apic();
+
+ localise_nmi_watchdog();
+
#ifdef CONFIG_X86_32
connect_bsp_APIC();
#endif
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 70e4a374b4e8..5978023b799b 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -31,6 +31,7 @@
#include <asm/srat.h>
#include <asm/topology.h>
#include <asm/smp.h>
+#include <asm/e820.h>
/*
* proximity macros and definitions
@@ -41,7 +42,7 @@
#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
/* bitmap length; _PXM is at most 255 */
#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
-static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
+static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
#define MAX_CHUNKS_PER_NODE 3
#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
@@ -52,16 +53,37 @@ struct node_memory_chunk_s {
u8 nid; // which cnode contains this chunk?
u8 bank; // which mem bank on this node
};
-static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
+static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
-static int num_memory_chunks; /* total number of memory chunks */
+static int __initdata num_memory_chunks; /* total number of memory chunks */
static u8 __initdata apicid_to_pxm[MAX_APICID];
+int numa_off __initdata;
+int acpi_numa __initdata;
+
+static __init void bad_srat(void)
+{
+ printk(KERN_ERR "SRAT: SRAT not used.\n");
+ acpi_numa = -1;
+ num_memory_chunks = 0;
+}
+
+static __init inline int srat_disabled(void)
+{
+ return numa_off || acpi_numa < 0;
+}
+
/* Identify CPU proximity domains */
-static void __init parse_cpu_affinity_structure(char *p)
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
{
- struct acpi_srat_cpu_affinity *cpu_affinity =
- (struct acpi_srat_cpu_affinity *) p;
+ if (srat_disabled())
+ return;
+ if (cpu_affinity->header.length !=
+ sizeof(struct acpi_srat_cpu_affinity)) {
+ bad_srat();
+ return;
+ }
if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
return; /* empty entry */
@@ -79,14 +101,21 @@ static void __init parse_cpu_affinity_structure(char *p)
* Identify memory proximity domains and hot-remove capabilities.
* Fill node memory chunk list structure.
*/
-static void __init parse_memory_affinity_structure (char *sratp)
+void __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
{
unsigned long long paddr, size;
unsigned long start_pfn, end_pfn;
u8 pxm;
struct node_memory_chunk_s *p, *q, *pend;
- struct acpi_srat_mem_affinity *memory_affinity =
- (struct acpi_srat_mem_affinity *) sratp;
+
+ if (srat_disabled())
+ return;
+ if (memory_affinity->header.length !=
+ sizeof(struct acpi_srat_mem_affinity)) {
+ bad_srat();
+ return;
+ }
if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
return; /* empty entry */
@@ -134,6 +163,14 @@ static void __init parse_memory_affinity_structure (char *sratp)
"enabled and removable" : "enabled" ) );
}
+/* Callback for SLIT parsing */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+}
+
+void acpi_numa_arch_fixup(void)
+{
+}
/*
* The SRAT table always lists ascending addresses, so can always
* assume that the first "start" address that you see is the real
@@ -166,39 +203,13 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
node_end_pfn[nid] = memory_chunk->end_pfn;
}
-/* Parse the ACPI Static Resource Affinity Table */
-static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
+int __init get_memcfg_from_srat(void)
{
- u8 *start, *end, *p;
int i, j, nid;
- start = (u8 *)(&(sratp->reserved) + 1); /* skip header */
- p = start;
- end = (u8 *)sratp + sratp->header.length;
-
- memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
- memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
- num_memory_chunks = 0;
- while (p < end) {
- switch (*p) {
- case ACPI_SRAT_TYPE_CPU_AFFINITY:
- parse_cpu_affinity_structure(p);
- break;
- case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
- parse_memory_affinity_structure(p);
- break;
- default:
- printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
- break;
- }
- p += p[1];
- if (p[1] == 0) {
- printk("acpi20_parse_srat: Entry length value is zero;"
- " can't parse any further!\n");
- break;
- }
- }
+ if (srat_disabled())
+ goto out_fail;
if (num_memory_chunks == 0) {
printk("could not finy any ACPI SRAT memory areas.\n");
@@ -244,115 +255,19 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
node_read_chunk(chunk->nid, chunk);
- add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
+ e820_register_active_regions(chunk->nid, chunk->start_pfn,
+ min(chunk->end_pfn, max_pfn));
}
-
+
for_each_online_node(nid) {
unsigned long start = node_start_pfn[nid];
- unsigned long end = node_end_pfn[nid];
+ unsigned long end = min(node_end_pfn[nid], max_pfn);
memory_present(nid, start, end);
node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
}
return 1;
out_fail:
- return 0;
-}
-
-struct acpi_static_rsdt {
- struct acpi_table_rsdt table;
- u32 padding[7]; /* Allow for 7 more table entries */
-};
-
-int __init get_memcfg_from_srat(void)
-{
- struct acpi_table_header *header = NULL;
- struct acpi_table_rsdp *rsdp = NULL;
- struct acpi_table_rsdt *rsdt = NULL;
- acpi_native_uint rsdp_address = 0;
- struct acpi_static_rsdt saved_rsdt;
- int tables = 0;
- int i = 0;
-
- rsdp_address = acpi_os_get_root_pointer();
- if (!rsdp_address) {
- printk("%s: System description tables not found\n",
- __func__);
- goto out_err;
- }
-
- printk("%s: assigning address to rsdp\n", __func__);
- rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
- if (!rsdp) {
- printk("%s: Didn't find ACPI root!\n", __func__);
- goto out_err;
- }
-
- printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
- rsdp->oem_id);
-
- if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
- printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__);
- goto out_err;
- }
-
- rsdt = (struct acpi_table_rsdt *)
- early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
-
- if (!rsdt) {
- printk(KERN_WARNING
- "%s: ACPI: Invalid root system description tables (RSDT)\n",
- __func__);
- goto out_err;
- }
-
- header = &rsdt->header;
-
- if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
- printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
- goto out_err;
- }
-
- /*
- * The number of tables is computed by taking the
- * size of all entries (header size minus total
- * size of RSDT) divided by the size of each entry
- * (4-byte table pointers).
- */
- tables = (header->length - sizeof(struct acpi_table_header)) / 4;
-
- if (!tables)
- goto out_err;
-
- memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
-
- if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
- printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
- saved_rsdt.table.header.length);
- goto out_err;
- }
-
- printk("Begin SRAT table scan....\n");
-
- for (i = 0; i < tables; i++) {
- /* Map in header, then map in full table length. */
- header = (struct acpi_table_header *)
- early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
- if (!header)
- break;
- header = (struct acpi_table_header *)
- early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
- if (!header)
- break;
-
- if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
- continue;
-
- /* we've found the srat table. don't need to look at any more tables */
- return acpi20_parse_srat((struct acpi_table_srat *)header);
- }
-out_err:
- remove_all_active_ranges();
printk("failed to get NUMA memory information from SRAT table\n");
return 0;
}
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index ae751094eba9..d67ce5f044ba 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata;
static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
+#ifndef CONFIG_X86_NUMAQ
static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
+#endif
static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
{
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d2ab52cc1d6b..7066cb855a60 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -19,8 +19,8 @@
#include <linux/utsname.h>
#include <linux/ipc.h>
-#include <asm/uaccess.h>
-#include <asm/unistd.h>
+#include <linux/uaccess.h>
+#include <linux/unistd.h>
asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
@@ -103,7 +103,7 @@ asmlinkage int old_select(struct sel_arg_struct __user *arg)
*
* This is really horribly ugly.
*/
-asmlinkage int sys_ipc (uint call, int first, int second,
+asmlinkage int sys_ipc(uint call, int first, int second,
int third, void __user *ptr, long fifth)
{
int version, ret;
@@ -113,24 +113,24 @@ asmlinkage int sys_ipc (uint call, int first, int second,
switch (call) {
case SEMOP:
- return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
+ return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
case SEMTIMEDOP:
return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
(const struct timespec __user *)fifth);
case SEMGET:
- return sys_semget (first, second, third);
+ return sys_semget(first, second, third);
case SEMCTL: {
union semun fourth;
if (!ptr)
return -EINVAL;
if (get_user(fourth.__pad, (void __user * __user *) ptr))
return -EFAULT;
- return sys_semctl (first, second, third, fourth);
+ return sys_semctl(first, second, third, fourth);
}
case MSGSND:
- return sys_msgsnd (first, (struct msgbuf __user *) ptr,
+ return sys_msgsnd(first, (struct msgbuf __user *) ptr,
second, third);
case MSGRCV:
switch (version) {
@@ -138,45 +138,45 @@ asmlinkage int sys_ipc (uint call, int first, int second,
struct ipc_kludge tmp;
if (!ptr)
return -EINVAL;
-
+
if (copy_from_user(&tmp,
- (struct ipc_kludge __user *) ptr,
- sizeof (tmp)))
+ (struct ipc_kludge __user *) ptr,
+ sizeof(tmp)))
return -EFAULT;
- return sys_msgrcv (first, tmp.msgp, second,
+ return sys_msgrcv(first, tmp.msgp, second,
tmp.msgtyp, third);
}
default:
- return sys_msgrcv (first,
+ return sys_msgrcv(first,
(struct msgbuf __user *) ptr,
second, fifth, third);
}
case MSGGET:
- return sys_msgget ((key_t) first, second);
+ return sys_msgget((key_t) first, second);
case MSGCTL:
- return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
+ return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
case SHMAT:
switch (version) {
default: {
ulong raddr;
- ret = do_shmat (first, (char __user *) ptr, second, &raddr);
+ ret = do_shmat(first, (char __user *) ptr, second, &raddr);
if (ret)
return ret;
- return put_user (raddr, (ulong __user *) third);
+ return put_user(raddr, (ulong __user *) third);
}
case 1: /* iBCS2 emulator entry point */
if (!segment_eq(get_fs(), get_ds()))
return -EINVAL;
/* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
- return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
+ return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
}
- case SHMDT:
- return sys_shmdt ((char __user *)ptr);
+ case SHMDT:
+ return sys_shmdt((char __user *)ptr);
case SHMGET:
- return sys_shmget (first, second, third);
+ return sys_shmget(first, second, third);
case SHMCTL:
- return sys_shmctl (first, second,
+ return sys_shmctl(first, second,
(struct shmid_ds __user *) ptr);
default:
return -ENOSYS;
@@ -186,28 +186,28 @@ asmlinkage int sys_ipc (uint call, int first, int second,
/*
* Old cruft
*/
-asmlinkage int sys_uname(struct old_utsname __user * name)
+asmlinkage int sys_uname(struct old_utsname __user *name)
{
int err;
if (!name)
return -EFAULT;
down_read(&uts_sem);
- err = copy_to_user(name, utsname(), sizeof (*name));
+ err = copy_to_user(name, utsname(), sizeof(*name));
up_read(&uts_sem);
- return err?-EFAULT:0;
+ return err? -EFAULT:0;
}
-asmlinkage int sys_olduname(struct oldold_utsname __user * name)
+asmlinkage int sys_olduname(struct oldold_utsname __user *name)
{
int error;
if (!name)
return -EFAULT;
- if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+ if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
return -EFAULT;
-
- down_read(&uts_sem);
-
+
+ down_read(&uts_sem);
+
error = __copy_to_user(&name->sysname, &utsname()->sysname,
__OLD_UTS_LEN);
error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
@@ -223,9 +223,9 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name)
error |= __copy_to_user(&name->machine, &utsname()->machine,
__OLD_UTS_LEN);
error |= __put_user(0, name->machine + __OLD_UTS_LEN);
-
+
up_read(&uts_sem);
-
+
error = error ? -EFAULT : 0;
return error;
@@ -241,6 +241,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
long __res;
asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
: "=a" (__res)
- : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
+ : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
return __res;
}
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 2ff21f398934..5f29f12da50c 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -84,8 +84,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
if (timer_ack) {
/*
* Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to reset the IRR bit for do_slow_gettimeoffset().
- * This will also deassert NMI lines for the watchdog if run
+ * manually to deassert NMI lines for the watchdog if run
* on an 82489DX-based system.
*/
spin_lock(&i8259A_lock);
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index c737849e2ef7..39ae8511a137 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -123,6 +123,8 @@ void __init time_init(void)
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
cpu_khz = calculate_cpu_khz();
+ lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ;
+
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index abbf199adebb..1106fac6024d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,7 +2,7 @@
#include <asm/trampoline.h>
-/* ready for x86_64, no harm for x86, since it will overwrite after alloc */
+/* ready for x86_64 and x86 */
unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
/*
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76ea97c4..ec6d3b2130c4 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -71,7 +71,6 @@ asmlinkage void general_protection(void);
asmlinkage void page_fault(void);
asmlinkage void coprocessor_error(void);
asmlinkage void simd_coprocessor_error(void);
-asmlinkage void reserved(void);
asmlinkage void alignment_check(void);
asmlinkage void machine_check(void);
asmlinkage void spurious_interrupt_bug(void);
@@ -702,12 +701,10 @@ DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
DO_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_ERROR( 5, SIGSEGV, "bounds", bounds)
DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, SIGSEGV, "reserved", reserved)
/* Runs on IST stack */
asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 65b70637ad97..6240922e497c 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -1,6 +1,7 @@
#include <linux/sched.h>
#include <linux/clocksource.h>
#include <linux/workqueue.h>
+#include <linux/delay.h>
#include <linux/cpufreq.h>
#include <linux/jiffies.h>
#include <linux/init.h>
@@ -286,7 +287,6 @@ core_initcall(cpufreq_tsc);
/* clock source code */
-static unsigned long current_tsc_khz;
static struct clocksource clocksource_tsc;
/*
@@ -404,6 +404,7 @@ static inline void check_geode_tsc_reliable(void) { }
void __init tsc_init(void)
{
int cpu;
+ u64 lpj;
if (!cpu_has_tsc || tsc_disabled > 0)
return;
@@ -416,6 +417,10 @@ void __init tsc_init(void)
return;
}
+ lpj = ((u64)tsc_khz * 1000);
+ do_div(lpj, HZ);
+ lpj_fine = lpj;
+
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
@@ -439,9 +444,8 @@ void __init tsc_init(void)
unsynchronized_tsc();
check_geode_tsc_reliable();
- current_tsc_khz = tsc_khz;
- clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
- clocksource_tsc.shift);
+ clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
+ clocksource_tsc.shift);
/* lower the rating if we already know its unstable: */
if (check_tsc_unstable()) {
clocksource_tsc.rating = 0;
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 1784b8077a12..9898fb01edfd 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -242,7 +242,7 @@ void __init tsc_calibrate(void)
if (hpet) {
printk(KERN_INFO "TSC calibrated against HPET\n");
if (hpet2 < hpet1)
- hpet2 += 0x100000000;
+ hpet2 += 0x100000000UL;
hpet2 -= hpet1;
tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000;
} else {
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index a2b030780aa9..ba7d19e102b1 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -33,8 +33,7 @@
#include <asm/apic.h>
#include <asm/timer.h>
#include <asm/i8253.h>
-
-#include <irq_vectors.h>
+#include <asm/irq_vectors.h>
#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index ce5ed083a1e9..2674f5796275 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -60,13 +60,6 @@ SECTIONS
BUG_TABLE :text
- . = ALIGN(4);
- .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
- __tracedata_start = .;
- *(.tracedata)
- __tracedata_end = .;
- }
-
RODATA
/* writeable */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fad3674b06a5..fd246e22fe6b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -53,13 +53,6 @@ SECTIONS
RODATA
- . = ALIGN(4);
- .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
- __tracedata_start = .;
- *(.tracedata)
- __tracedata_end = .;
- }
-
. = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -177,6 +170,7 @@ SECTIONS
*(.con_initcall.init)
}
__con_initcall_end = .;
+ . = ALIGN(16);
__x86cpuvendor_start = .;
.x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
*(.x86cpuvendor.init)
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index ba8c0b75ab0a..0c029e8959c7 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,9 +15,12 @@
#include <linux/init.h>
#include <linux/pci_ids.h>
#include <linux/pci_regs.h>
+
+#include <asm/apic.h>
#include <asm/pci-direct.h>
#include <asm/io.h>
#include <asm/paravirt.h>
+#include <asm/setup.h>
#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
/*