85 files changed, 7270 insertions, 4358 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 77807d4769c9..d1d4ee895270 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y                := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
+extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
@@ -18,14 +18,13 @@ CFLAGS_tsc_64.o		:= $(nostackp)
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y			+= traps_$(BITS).o irq_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
-obj-y			+= setup_$(BITS).o i8259_$(BITS).o setup.o
+obj-y			+= setup_$(BITS).o i8259.o irqinit_$(BITS).o setup.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o setup64.o
-obj-y			+= bootflag.o e820_$(BITS).o
+obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o
-obj-$(CONFIG_X86_64)	+= bugs_64.o
 obj-y			+= tsc_$(BITS).o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
@@ -100,6 +99,7 @@ ifeq ($(CONFIG_X86_64),y)
 
         obj-$(CONFIG_GART_IOMMU)	+= pci-gart_64.o aperture_64.o
         obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o
+        obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o amd_iommu.o
         obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb_64.o
 
         obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 33c5216fd3e1..6516359922ba 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -83,6 +83,8 @@ int acpi_lapic;
 int acpi_ioapic;
 int acpi_strict;
 
+static int disable_irq0_through_ioapic __initdata;
+
 u8 acpi_sci_flags __initdata;
 int acpi_sci_override_gsi __initdata;
 int acpi_skip_timer_override __initdata;
@@ -338,8 +340,6 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
 
 #ifdef CONFIG_X86_IO_APIC
 
-struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
@@ -514,8 +514,6 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
 	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
-		extern void eisa_set_level_irq(unsigned int irq);
-
 		if (triggering == ACPI_LEVEL_SENSITIVE)
 			eisa_set_level_irq(gsi);
 	}
@@ -860,6 +858,372 @@ static int __init acpi_parse_madt_lapic_entries(void)
 #endif				/* CONFIG_X86_LOCAL_APIC */
 
 #ifdef	CONFIG_X86_IO_APIC
+#define MP_ISA_BUS		0
+
+#ifdef CONFIG_X86_ES7000
+extern int es7000_plat;
+#endif
+/* Per-IOAPIC GSI routing state, filled in by mp_register_ioapic(). */
+static struct {
+	int apic_id;	/* copy of mp_ioapics[idx].mp_apicid */
+	int gsi_base;	/* lowest GSI matched by mp_find_ioapic() */
+	int gsi_end;	/* gsi_base + io_apic_get_redir_entries(idx) */
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+/* Return the index of the IOAPIC whose GSI range contains @gsi, or -1. */
+static int mp_find_ioapic(int gsi)
+{
+	int idx;
+
+	for (idx = 0; idx < nr_ioapics; idx++) {
+		if (gsi < mp_ioapic_routing[idx].gsi_base)
+			continue;
+		if (gsi <= mp_ioapic_routing[idx].gsi_end)
+			return idx;
+	}
+
+	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+	return -1;
+}
+/* Choose the APIC ID to record for a new IOAPIC that reports ID @id. */
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return io_apic_get_unique_id(nr_ioapics, id);
+	else
+		return id;
+#else
+	int i;
+	DECLARE_BITMAP(used, 256);
+	bitmap_zero(used, 256);
+	for (i = 0; i < nr_ioapics; i++) {
+		struct mp_config_ioapic *ia = &mp_ioapics[i];
+		__set_bit(ia->mp_apicid, used);
+	}
+	if (!test_bit(id, used))
+		return id;
+	return find_first_zero_bit(used, 256);	/* @id taken: lowest free ID */
+#endif
+}
+/* Return 1 if @address is zero; panics once MAX_IO_APICS are registered. */
+static int bad_ioapic(unsigned long address)
+{
+	if (nr_ioapics >= MAX_IO_APICS) {
+		printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+		       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+		panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+	}
+	if (!address) {
+		printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+		       " found in table, skipping!\n");
+		return 1;
+	}
+	return 0;
+}
+/* Record one IOAPIC (@id, MMIO @address, first GSI @gsi_base) at nr_ioapics. */
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+{
+	int idx = 0;
+
+	if (bad_ioapic(address))
+		return;
+
+	idx = nr_ioapics;
+
+	mp_ioapics[idx].mp_type = MP_IOAPIC;
+	mp_ioapics[idx].mp_flags = MPC_APIC_USABLE;
+	mp_ioapics[idx].mp_apicaddr = address;
+
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+	mp_ioapics[idx].mp_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
+	mp_ioapics[idx].mp_apicver = io_apic_get_version(idx);
+#else
+	mp_ioapics[idx].mp_apicver = 0;
+#endif
+	/*
+	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+	 */
+	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mp_apicid;
+	mp_ioapic_routing[idx].gsi_base = gsi_base;
+	mp_ioapic_routing[idx].gsi_end = gsi_base +
+	    io_apic_get_redir_entries(idx);
+
+	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+	       "GSI %d-%d\n", idx, mp_ioapics[idx].mp_apicid,
+	       mp_ioapics[idx].mp_apicver, mp_ioapics[idx].mp_apicaddr,
+	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+	nr_ioapics++;	/* publish the slot only after it is fully filled in */
+}
+/* Copy the interrupt source entry @m into the slot @mp_irq, byte for byte. */
+static void assign_to_mp_irq(struct mp_config_intsrc *m,
+				    struct mp_config_intsrc *mp_irq)
+{
+	memcpy(mp_irq, m, sizeof(*mp_irq));
+}
+/* Bytewise comparison of two interrupt source entries; 0 means identical. */
+static int mp_irq_cmp(struct mp_config_intsrc *mp_irq,
+				struct mp_config_intsrc *m)
+{
+	return memcmp(mp_irq, m, sizeof(*m));
+}
+/* Append @m to mp_irqs[] unless an identical entry is already stored. */
+static void save_mp_irq(struct mp_config_intsrc *m)
+{
+	int i;
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (!mp_irq_cmp(&mp_irqs[i], m))
+			return;
+	}
+
+	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
+	if (++mp_irq_entries == MAX_IRQ_SOURCES)	/* count reached the limit */
+		panic("Max # of irq sources exceeded!!\n");
+}
+/* Save an interrupt source entry routing ISA IRQ @bus_irq to @gsi's pin. */
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+{
+	int ioapic;
+	int pin;
+	struct mp_config_intsrc mp_irq;
+
+	/* Skip the 8254 timer interrupt (IRQ 0) if requested.  */
+	if (bus_irq == 0 && disable_irq0_through_ioapic)
+		return;
+
+	/*
+	 * Convert 'gsi' to 'ioapic.pin'.
+	 */
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)
+		return;
+	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+	/*
+	 * TBD: This check is for faulty timer entries, where the override
+	 *      erroneously sets the trigger to level, resulting in a HUGE
+	 *      increase of timer interrupts!
+	 */
+	if ((bus_irq == 0) && (trigger == 3))
+		trigger = 1;
+
+	mp_irq.mp_type = MP_INTSRC;
+	mp_irq.mp_irqtype = mp_INT;
+	mp_irq.mp_irqflag = (trigger << 2) | polarity;
+	mp_irq.mp_srcbus = MP_ISA_BUS;
+	mp_irq.mp_srcbusirq = bus_irq;	/* IRQ */
+	mp_irq.mp_dstapic = mp_ioapics[ioapic].mp_apicid; /* APIC ID */
+	mp_irq.mp_dstirq = pin;	/* INTIN# */
+
+	save_mp_irq(&mp_irq);
+}
+/* Add identity entries for ISA IRQs 0-15 that no saved entry covers yet. */
+void __init mp_config_acpi_legacy_irqs(void)
+{
+	int i;
+	int ioapic;
+	unsigned int dstapic;
+	struct mp_config_intsrc mp_irq;
+
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+	/*
+	 * Fabricate the legacy ISA bus (bus #MP_ISA_BUS).
+	 */
+	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+	set_bit(MP_ISA_BUS, mp_bus_not_pci);
+	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+#ifdef CONFIG_X86_ES7000
+	/*
+	 * Older generations of ES7000 have no legacy identity mappings
+	 */
+	if (es7000_plat == 1)
+		return;
+#endif
+
+	/*
+	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
+	 */
+	ioapic = mp_find_ioapic(0);
+	if (ioapic < 0)
+		return;
+	dstapic = mp_ioapics[ioapic].mp_apicid;
+
+	/*
+	 * Use the default configuration for the IRQs 0-15.  Unless
+	 * overridden by (MADT) interrupt source override entries.
+	 */
+	for (i = 0; i < 16; i++) {
+		int idx;
+
+		/* Skip the 8254 timer interrupt (IRQ 0) if requested.  */
+		if (i == 0 && disable_irq0_through_ioapic)
+			continue;
+
+		for (idx = 0; idx < mp_irq_entries; idx++) {
+			struct mp_config_intsrc *irq = mp_irqs + idx;
+
+			/* Do we already have a mapping for this ISA IRQ? */
+			if (irq->mp_srcbus == MP_ISA_BUS
+			    && irq->mp_srcbusirq == i)
+				break;
+
+			/* Do we already have a mapping for this IOAPIC pin */
+			if (irq->mp_dstapic == dstapic &&
+			    irq->mp_dstirq == i)
+				break;
+		}
+
+		if (idx != mp_irq_entries) {
+			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+			continue;	/* IRQ already used */
+		}
+
+		mp_irq.mp_type = MP_INTSRC;
+		mp_irq.mp_irqflag = 0;	/* Conforming */
+		mp_irq.mp_srcbus = MP_ISA_BUS;
+		mp_irq.mp_dstapic = dstapic;
+		mp_irq.mp_irqtype = mp_INT;
+		mp_irq.mp_srcbusirq = i; /* Identity mapped */
+		mp_irq.mp_dstirq = i;
+
+		save_mp_irq(&mp_irq);
+	}
+}
+/* Route @gsi via its IOAPIC pin, once per pin; returns the number to use. */
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+	int ioapic;
+	int ioapic_pin;
+#ifdef CONFIG_X86_32
+#define MAX_GSI_NUM	4096
+#define IRQ_COMPRESSION_START	64
+
+	static int pci_irq = IRQ_COMPRESSION_START;
+	/*
+	 * Mapping between Global System Interrupts, which
+	 * represent all possible interrupts, and IRQs
+	 * assigned to actual devices.
+	 */
+	static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+		return gsi;
+#endif
+
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (acpi_gbl_FADT.sci_interrupt == gsi)
+		return gsi;
+
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0) {
+		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+		return gsi;
+	}
+
+	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+#ifdef CONFIG_X86_32
+	if (ioapic_renumber_irq)
+		gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
+		printk(KERN_ERR "Invalid reference to IOAPIC pin "
+		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+		       ioapic_pin);
+		return gsi;
+	}
+	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
+		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+		return gsi;
+#endif
+	}
+
+	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
+	/*
+	 * For GSI >= 64, use IRQ compression
+	 */
+	if ((gsi >= IRQ_COMPRESSION_START)
+	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
+		/*
+		 * For PCI devices assign IRQs in order, avoiding gaps
+		 * due to unused I/O APIC pins.
+		 */
+		int irq = gsi;	/* original GSI: index into gsi_to_irq[] */
+		if (gsi < MAX_GSI_NUM) {
+			/*
+			 * Retain the VIA chipset work-around (gsi > 15), but
+			 * avoid a problem where the 8254 timer (IRQ0) is setup
+			 * via an override (so it's not on pin 0 of the ioapic),
+			 * and at the same time, the pin 0 interrupt is a PCI
+			 * type.  The gsi > 15 test could cause these two pins
+			 * to be shared as IRQ0, and they are not shareable.
+			 * So test for this condition, and if necessary, avoid
+			 * the pin collision.
+			 */
+			gsi = pci_irq++;
+			/*
+			 * Don't assign IRQ used by ACPI SCI
+			 */
+			if (gsi == acpi_gbl_FADT.sci_interrupt)
+				gsi = pci_irq++;
+			gsi_to_irq[irq] = gsi;
+		} else {
+			printk(KERN_ERR "GSI %u is too high\n", gsi);
+			return gsi;
+		}
+	}
+#endif
+	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	return gsi;
+}
+/* Save an MP-table style PCI interrupt source entry for @gsi; returns 0. */
+int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
+			u32 gsi, int triggering, int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+	struct mp_config_intsrc mp_irq;
+	int ioapic;
+
+	if (!acpi_ioapic)
+		return 0;
+	ioapic = mp_find_ioapic(gsi);
+	if (ioapic < 0)	/* -1 would index mp_ioapic_routing[] out of bounds */
+		return 0;
+	/* print the entry should happen on mptable identically */
+	mp_irq.mp_type = MP_INTSRC;
+	mp_irq.mp_irqtype = mp_INT;
+	mp_irq.mp_irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	mp_irq.mp_srcbus = number;
+	mp_irq.mp_srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	mp_irq.mp_dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.mp_dstirq = gsi - mp_ioapic_routing[ioapic].gsi_base;
+	save_mp_irq(&mp_irq);
+#endif
+	return 0;
+}
+
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
@@ -1061,6 +1425,17 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
 }
 
 /*
+ * Don't register any I/O APIC entries for the 8254 timer IRQ.
+ */
+static int __init
+dmi_disable_irq0_through_ioapic(const struct dmi_system_id *d)
+{
+	pr_notice("%s detected: disabling IRQ 0 through I/O APIC\n", d->ident);
+	disable_irq0_through_ioapic = 1;	/* read when IRQ 0 entries are built */
+	return 0;
+}
+
+/*
  * If your system is blacklisted here, but you find that acpi=force
  * works for you, please contact acpi-devel@sourceforge.net
  */
@@ -1227,6 +1602,32 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
 		     },
 	 },
+	/*
+	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
+	 * which includes some code which overrides all temperature
+	 * trip points to 16C if the INTIN2 input of the I/O APIC
+	 * is enabled.  This input is incorrectly designated the
+	 * ISA IRQ 0 via an interrupt source override even though
+	 * it is wired to the output of the master 8259A and INTIN0
+	 * is not connected at all.  Abandon any attempts to route
+	 * IRQ 0 through the I/O APIC therefore.
+	 */
+	{
+	 .callback = dmi_disable_irq0_through_ioapic,
+	 .ident = "HP NX6125 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"),
+		     },
+	 },
+	{
+	 .callback = dmi_disable_irq0_through_ioapic,
+	 .ident = "HP NX6325 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
+		     },
+	 },
 	{}
 };
 
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
new file mode 100644
index 000000000000..f2766d84c7a0
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/gfp.h>
+#include <linux/bitops.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+#include <asm/proto.h>
+#include <asm/gart.h>
+#include <asm/amd_iommu_types.h>
+#include <asm/amd_iommu.h>
+/* OR command type @t, shifted left by 28, into the second command word. */
+#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
+/* Pages covered by @size bytes starting at @addr's offset within its page. */
+#define to_pages(addr, size) \
+	 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+
+static DEFINE_RWLOCK(amd_iommu_devtable_lock);
+/* One command as copied into the IOMMU command buffer: four 32-bit words. */
+struct command {
+	u32 data[4];
+};
+
+static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+			     struct unity_map_entry *e);
+/* Nonzero when IOMMU_CAP_NPCACHE is set in the IOMMU's capability word. */
+static int iommu_has_npcache(struct amd_iommu *iommu)
+{
+	return iommu->cap & IOMMU_CAP_NPCACHE;
+}
+/* Copy @cmd to the command buffer tail and advance the tail; caller locks. */
+static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+{
+	u32 tail, head;
+	u8 *target;
+
+	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+	target = (iommu->cmd_buf + tail);
+	memcpy_toio(target, cmd, sizeof(*cmd));
+	tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
+	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+	if (tail == head)	/* new tail would meet head: tail register untouched */
+		return -ENOMEM;
+	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+
+	return 0;
+}
+/* Locked wrapper: queue @cmd while holding iommu->lock via irqsave. */
+static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+{
+	unsigned long irq_state;
+	int status;
+
+	spin_lock_irqsave(&iommu->lock, irq_state);
+	status = __iommu_queue_command(iommu, cmd);
+	spin_unlock_irqrestore(&iommu->lock, irq_state);
+
+	return status;
+}
+/* Queue a CMD_COMPL_WAIT command, then spin until 'ready' is nonzero. */
+static int iommu_completion_wait(struct amd_iommu *iommu)
+{
+	int ret;
+	struct command cmd;
+	volatile u64 ready = 0;
+	unsigned long ready_phys = virt_to_phys(&ready);
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
+	cmd.data[1] = HIGH_U32(ready_phys);
+	cmd.data[2] = 1; /* value written to 'ready' */
+	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+
+	iommu->need_sync = 0;	/* cleared before queueing, even if that fails */
+
+	ret = iommu_queue_command(iommu, &cmd);
+
+	if (ret)
+		return ret;
+
+	while (!ready)
+		cpu_relax();
+
+	return 0;
+}
+/* Queue a CMD_INV_DEV_ENTRY command for @devid and set need_sync. */
+static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
+{
+	struct command cmd;
+
+	BUG_ON(iommu == NULL);
+
+	memset(&cmd, 0, sizeof(cmd));
+	CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
+	cmd.data[0] = devid;
+
+	iommu->need_sync = 1;
+
+	return iommu_queue_command(iommu, &cmd);
+}
+/* Queue CMD_INV_IOMMU_PAGES for @domid at @address; sets need_sync. */
+static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
+		u64 address, u16 domid, int pde, int s)
+{
+	struct command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	address &= PAGE_MASK;
+	CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
+	cmd.data[1] |= domid;
+	cmd.data[2] = LOW_U32(address);
+	cmd.data[3] = HIGH_U32(address);
+	if (s)	/* nonzero @s sets the size bit in word 2 */
+		cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+	if (pde)	/* nonzero @pde sets the PDE bit in word 2 */
+		cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+
+	iommu->need_sync = 1;
+
+	return iommu_queue_command(iommu, &cmd);
+}
+/* Flush one page, or the whole domain when the range spans several pages. */
+static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
+		u64 address, size_t size)
+{
+	int s = 0;
+	unsigned pages = to_pages(address, size);
+
+	address &= PAGE_MASK;
+
+	if (pages > 1) {
+		/*
+		 * If we have to flush more than one page, flush all
+		 * TLB entries for this domain
+		 */
+		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+		s = 1;
+	}
+
+	iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
+
+	return 0;	/* the queueing result is not propagated */
+}
+/* Map one page in @dom: bus address @bus_addr -> physical @phys_addr. */
+static int iommu_map(struct protection_domain *dom,
+		     unsigned long bus_addr,
+		     unsigned long phys_addr,
+		     int prot)
+{
+	u64 __pte, *pte, *page;
+
+	bus_addr  = PAGE_ALIGN(bus_addr);
+	phys_addr = PAGE_ALIGN(phys_addr);	/* align the target, not bus_addr */
+
+	/* only support 512GB address spaces for now */
+	if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
+		return -EINVAL;
+
+	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte)) {	/* allocate the missing next level */
+		page = (u64 *)get_zeroed_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+		*pte = IOMMU_L2_PDE(virt_to_phys(page));
+	}
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+	if (!IOMMU_PTE_PRESENT(*pte)) {	/* allocate the missing next level */
+		page = (u64 *)get_zeroed_page(GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+		*pte = IOMMU_L1_PDE(virt_to_phys(page));
+	}
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+
+	if (IOMMU_PTE_PRESENT(*pte))
+		return -EBUSY;	/* an entry for this bus address already exists */
+
+	__pte = phys_addr | IOMMU_PTE_P;
+	if (prot & IOMMU_PROT_IR)
+		__pte |= IOMMU_PTE_IR;
+	if (prot & IOMMU_PROT_IW)
+		__pte |= IOMMU_PTE_IW;
+
+	*pte = __pte;
+
+	return 0;
+}
+/* Return 1 if any device ID in @entry, after aliasing, maps to @iommu. */
+static int iommu_for_unity_map(struct amd_iommu *iommu,
+			       struct unity_map_entry *entry)
+{
+	u16 bdf, i;	/* NOTE(review): 'i' wraps if devid_end is 0xffff - confirm */
+
+	for (i = entry->devid_start; i <= entry->devid_end; ++i) {
+		bdf = amd_iommu_alias_table[i];
+		if (amd_iommu_rlookup_table[bdf] == iommu)
+			return 1;
+	}
+
+	return 0;
+}
+/* Install in @iommu's default domain every unity map that concerns it. */
+static int iommu_init_unity_mappings(struct amd_iommu *iommu)
+{
+	struct unity_map_entry *entry;
+	int ret;
+
+	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+		if (!iommu_for_unity_map(iommu, entry))
+			continue;
+		ret = dma_ops_unity_map(iommu->default_dom, entry);
+		if (ret)	/* stop at the first failing entry */
+			return ret;
+	}
+
+	return 0;
+}
+/* Identity-map [address_start, address_end) of @e into @dma_dom by page. */
+static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
+			     struct unity_map_entry *e)
+{
+	u64 addr;
+	int ret;
+
+	for (addr = e->address_start; addr < e->address_end;
+	     addr += PAGE_SIZE) {
+		ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+		if (ret)
+			return ret;
+		/*
+		 * if unity mapping is in aperture range mark the page
+		 * as allocated in the aperture
+		 */
+		if (addr < dma_dom->aperture_size)
+			__set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+	}
+
+	return 0;
+}
+/* Apply every unity map entry whose device ID range covers @devid. */
+static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
+					  u16 devid)
+{
+	struct unity_map_entry *entry;
+	int err;
+
+	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
+		if (devid < entry->devid_start || devid > entry->devid_end)
+			continue;
+		err = dma_ops_unity_map(dma_dom, entry);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+/* Convert a DMA mask into a page count, rounding a partial page up. */
+static unsigned long dma_mask_to_pages(unsigned long mask)
+{
+	return (mask >> PAGE_SHIFT) +
+		(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
+}
+/* Reserve @pages aperture pages; returns bad_dma_address on failure. */
+static unsigned long dma_ops_alloc_addresses(struct device *dev,
+					     struct dma_ops_domain *dom,
+					     unsigned int pages)
+{
+	unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+	unsigned long address;
+	unsigned long size = dom->aperture_size >> PAGE_SHIFT;
+	unsigned long boundary_size;
+
+	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+			PAGE_SIZE) >> PAGE_SHIFT;
+	limit = limit < size ? limit : size;
+
+	if (dom->next_bit >= limit)
+		dom->next_bit = 0;
+
+	address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
+			0 , boundary_size, 0);
+	if (address == -1)	/* retry the search from the start of the bitmap */
+		address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
+				0, boundary_size, 0);
+
+	if (likely(address != -1)) {
+		dom->next_bit = address + pages;
+		address <<= PAGE_SHIFT;
+	} else
+		address = bad_dma_address;
+
+	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
+
+	return address;
+}
+/* Release @pages aperture pages starting at bus address @address. */
+static void dma_ops_free_addresses(struct dma_ops_domain *dom,
+				   unsigned long address,
+				   unsigned int pages)
+{
+	/* the allocation bitmap is indexed by page number, not by address */
+	iommu_area_free(dom->bitmap, address >> PAGE_SHIFT, pages);
+}
+/* Claim the lowest free protection domain ID; returns 0 if none is left. */
+static u16 domain_id_alloc(void)
+{
+	unsigned long flags;
+	int id;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
+	BUG_ON(id == 0);	/* bit 0 is expected to be set already */
+	if (id > 0 && id < MAX_DOMAIN_ID)
+		__set_bit(id, amd_iommu_pd_alloc_bitmap);
+	else
+		id = 0;
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+	return id;
+}
+/* Mark @pages pages from @start_page as used, clipped to the aperture. */
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+				      unsigned long start_page,
+				      unsigned int pages)
+{
+	unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+
+	if (start_page + pages > last_page)
+		pages = last_page - start_page;
+
+	set_bit_string(dom->bitmap, start_page, pages);
+}
+/* Free all three levels of @dma_dom's page table, if one was allocated. */
+static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+{
+	int i, j;
+	u64 *p1, *p2, *p3;
+
+	p1 = dma_dom->domain.pt_root;
+
+	if (!p1)
+		return;
+
+	for (i = 0; i < 512; ++i) {
+		if (!IOMMU_PTE_PRESENT(p1[i]))
+			continue;
+
+		p2 = IOMMU_PTE_PAGE(p1[i]);
+		for (j = 0; j < 512; ++j) {	/* advance j, not the outer i */
+			if (!IOMMU_PTE_PRESENT(p2[j]))
+				continue;
+			p3 = IOMMU_PTE_PAGE(p2[j]);
+			free_page((unsigned long)p3);
+		}
+
+		free_page((unsigned long)p2);
+	}
+
+	free_page((unsigned long)p1);
+}
+/* Free a dma_ops domain: page table, pte_pages array, bitmap, the struct. */
+static void dma_ops_domain_free(struct dma_ops_domain *dom)
+{
+	if (!dom)
+		return;
+
+	dma_ops_free_pagetable(dom);
+
+	kfree(dom->pte_pages);
+
+	kfree(dom->bitmap);
+
+	kfree(dom);
+}
+/* Build a dma_ops domain with a 2^@order byte aperture; NULL on failure. */
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
+						   unsigned order)
+{
+	struct dma_ops_domain *dma_dom;
+	unsigned i, num_pte_pages;
+	u64 *l2_pde;
+	u64 address;
+
+	/*
+	 * Currently the DMA aperture must be between 32 MB and 1GB in size
+	 */
+	if ((order < 25) || (order > 30))
+		return NULL;
+
+	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
+	if (!dma_dom)
+		return NULL;
+
+	spin_lock_init(&dma_dom->domain.lock);
+
+	dma_dom->domain.id = domain_id_alloc();
+	if (dma_dom->domain.id == 0)
+		goto free_dma_dom;
+	dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
+	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+	dma_dom->domain.priv = dma_dom;
+	if (!dma_dom->domain.pt_root)
+		goto free_dma_dom;
+	dma_dom->aperture_size = (1ULL << order);
+	dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
+				  GFP_KERNEL);
+	if (!dma_dom->bitmap)
+		goto free_dma_dom;
+	/*
+	 * mark the first page as allocated so we never return 0 as
+	 * a valid dma-address. So we can use 0 as error value
+	 */
+	dma_dom->bitmap[0] = 1;
+	dma_dom->next_bit = 0;
+
+	if (iommu->exclusion_start &&
+	    iommu->exclusion_start < dma_dom->aperture_size) {
+		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+		int pages = to_pages(iommu->exclusion_start,
+				iommu->exclusion_length);
+		dma_ops_reserve_addresses(dma_dom, startpage, pages);
+	}
+
+	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
+	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
+			GFP_KERNEL);
+	if (!dma_dom->pte_pages)
+		goto free_dma_dom;
+
+	l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
+	if (l2_pde == NULL)
+		goto free_dma_dom;
+
+	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
+
+	for (i = 0; i < num_pte_pages; ++i) {
+		dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
+		if (!dma_dom->pte_pages[i])
+			goto free_dma_dom;
+		address = virt_to_phys(dma_dom->pte_pages[i]);
+		l2_pde[i] = IOMMU_L1_PDE(address);
+	}
+
+	return dma_dom;
+
+free_dma_dom:
+	dma_ops_domain_free(dma_dom);	/* releases whatever was allocated so far */
+
+	return NULL;
+}
+/* Read @devid's slot in amd_iommu_pd_table under the devtable read lock. */
+static struct protection_domain *domain_for_device(u16 devid)
+{
+	struct protection_domain *pd;
+	unsigned long irq_state;
+
+	read_lock_irqsave(&amd_iommu_devtable_lock, irq_state);
+	pd = amd_iommu_pd_table[devid];
+	read_unlock_irqrestore(&amd_iommu_devtable_lock, irq_state);
+
+	return pd;
+}
+/* Point @devid's device table entry at @domain and queue an invalidation. */
+static void set_device_domain(struct amd_iommu *iommu,
+			      struct protection_domain *domain,
+			      u16 devid)
+{
+	unsigned long flags;
+
+	u64 pte_root = virt_to_phys(domain->pt_root);
+
+	pte_root |= (domain->mode & 0x07) << 9;
+	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+
+	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+	amd_iommu_dev_table[devid].data[0] = pte_root;
+	amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+	amd_iommu_dev_table[devid].data[2] = domain->id;
+
+	amd_iommu_pd_table[devid] = domain;
+	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+	iommu_queue_inv_dev_entry(iommu, devid);
+
+	iommu->need_sync = 1;	/* already set by iommu_queue_inv_dev_entry() */
+}
+/* Look up IOMMU, domain and aliased ID for @dev; 1 if an IOMMU handles it. */
+static int get_device_resources(struct device *dev,
+				struct amd_iommu **iommu,
+				struct protection_domain **domain,
+				u16 *bdf)
+{
+	struct dma_ops_domain *dma_dom;
+	struct pci_dev *pcidev;
+	u16 _bdf;
+
+	BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+
+	pcidev = to_pci_dev(dev);
+	_bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+
+	if (_bdf >= amd_iommu_last_bdf) { /* NOTE(review): excludes last_bdf? */
+		*iommu = NULL;
+		*domain = NULL;
+		*bdf = 0xffff;
+		return 0;
+	}
+
+	*bdf = amd_iommu_alias_table[_bdf];
+
+	*iommu = amd_iommu_rlookup_table[*bdf];
+	if (*iommu == NULL)	/* *domain is left unwritten on this path */
+		return 0;
+	dma_dom = (*iommu)->default_dom;
+	*domain = domain_for_device(*bdf);
+	if (*domain == NULL) {	/* first use: attach the IOMMU's default domain */
+		*domain = &dma_dom->domain;
+		set_device_domain(*iommu, *domain, *bdf);
+		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+				"device ", (*domain)->id);
+		print_devid(_bdf, 1);
+	}
+
+	return 1;
+}
+/* Write the PTE mapping aperture @address to @paddr; returns @address. */
+static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
+				     struct dma_ops_domain *dom,
+				     unsigned long address,
+				     phys_addr_t paddr,
+				     int direction)
+{
+	u64 *pte, __pte;
+
+	WARN_ON(address > dom->aperture_size);
+
+	paddr &= PAGE_MASK;
+
+	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	pte += IOMMU_PTE_L0_INDEX(address);
+
+	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
+
+	if (direction == DMA_TO_DEVICE)
+		__pte |= IOMMU_PTE_IR;
+	else if (direction == DMA_FROM_DEVICE)
+		__pte |= IOMMU_PTE_IW;
+	else if (direction == DMA_BIDIRECTIONAL)
+		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
+
+	WARN_ON(*pte);	/* the slot is expected to be empty before mapping */
+
+	*pte = __pte;
+
+	return (dma_addr_t)address;
+}
+/* Clear the PTE for aperture @address; out-of-aperture input is ignored. */
+static void dma_ops_domain_unmap(struct amd_iommu *iommu,
+				 struct dma_ops_domain *dom,
+				 unsigned long address)
+{
+	u64 *pte;
+
+	if (address >= dom->aperture_size)
+		return;
+
+	WARN_ON(address & 0xfffULL || address > dom->aperture_size);
+
+	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	pte += IOMMU_PTE_L0_INDEX(address);
+
+	WARN_ON(!*pte);	/* warn when unmapping a slot that was never mapped */
+
+	*pte = 0ULL;
+}
+/* Map [paddr, paddr + size) into the aperture; map_single() holds the lock. */
+static dma_addr_t __map_single(struct device *dev,
+			       struct amd_iommu *iommu,
+			       struct dma_ops_domain *dma_dom,
+			       phys_addr_t paddr,
+			       size_t size,
+			       int dir)
+{
+	dma_addr_t offset = paddr & ~PAGE_MASK;
+	dma_addr_t address, start;
+	unsigned int pages;
+	int i;
+
+	pages = to_pages(paddr, size);
+	paddr &= PAGE_MASK;
+
+	address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+	if (unlikely(address == bad_dma_address))
+		goto out;
+
+	start = address;
+	for (i = 0; i < pages; ++i) {
+		dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+		paddr += PAGE_SIZE;
+		start += PAGE_SIZE;
+	}
+	address += offset;	/* keep the caller's offset within the first page */
+
+out:
+	return address;
+}
+/* Undo __map_single(): clear the PTEs and release the aperture pages. */
+static void __unmap_single(struct amd_iommu *iommu,
+			   struct dma_ops_domain *dma_dom,
+			   dma_addr_t dma_addr,
+			   size_t size,
+			   int dir)
+{
+	dma_addr_t i, start;
+	unsigned int pages;
+
+	if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
+		return;
+
+	pages = to_pages(dma_addr, size);
+	dma_addr &= PAGE_MASK;
+	start = dma_addr;
+
+	for (i = 0; i < pages; ++i) {
+		dma_ops_domain_unmap(iommu, dma_dom, start);
+		start += PAGE_SIZE;
+	}
+
+	dma_ops_free_addresses(dma_dom, dma_addr, pages);
+}
+/* Map one buffer for @dev; returns @paddr itself if no IOMMU handles it. */
+static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
+			     size_t size, int dir)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	u16 devid;
+	dma_addr_t addr;
+
+	get_device_resources(dev, &iommu, &domain, &devid);
+
+	if (iommu == NULL || domain == NULL)
+		return (dma_addr_t)paddr;
+
+	spin_lock_irqsave(&domain->lock, flags);
+	addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+	if (addr == bad_dma_address)
+		goto out;
+
+	if (iommu_has_npcache(iommu))
+		iommu_flush_pages(iommu, domain->id, addr, size);
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+out:
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return addr;
+}
+/* Tear down a map_single() mapping and flush the IOMMU pages for it. */
+static void unmap_single(struct device *dev, dma_addr_t dma_addr,
+			 size_t size, int dir)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	u16 devid;
+
+	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		return;
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	__unmap_single(iommu, domain->priv, dma_addr, size, dir);
+
+	iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+	spin_unlock_irqrestore(&domain->lock, flags);
+}
+/* No-IOMMU fallback: each element's DMA address is its physical address. */
+static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
+			   int nelems, int dir)
+{
+	struct scatterlist *sg;
+	int idx;
+
+	for_each_sg(sglist, sg, nelems, idx) {
+		sg->dma_length = sg->length;
+		sg->dma_address = (dma_addr_t)sg_phys(sg);
+	}
+
+	return nelems;
+}
+/* Map each scatterlist element; returns the count mapped, 0 on failure. */
+static int map_sg(struct device *dev, struct scatterlist *sglist,
+		  int nelems, int dir)
+{
+	unsigned long flags;
+	struct amd_iommu *iommu;
+	struct protection_domain *domain;
+	u16 devid;
+	int i;
+	struct scatterlist *s;
+	phys_addr_t paddr;
+	int mapped_elems = 0;
+
+	get_device_resources(dev, &iommu, &domain, &devid);
+
+	if (!iommu || !domain)
+		return map_sg_no_iommu(dev, sglist, nelems, dir);
+
+	spin_lock_irqsave(&domain->lock, flags);
+
+	for_each_sg(sglist, s, nelems, i) {
+		paddr = sg_phys(s);
+
+		s->dma_address = __map_single(dev, iommu, domain->priv,
+					      paddr, s->length, dir);
+
+		if (s->dma_address) { /* NOTE(review): not bad_dma_address? */
+			s->dma_length = s->length;
+			mapped_elems++;
+		} else
+			goto unmap;
+		if (iommu_has_npcache(iommu))
+			iommu_flush_pages(iommu, domain->id, s->dma_address,
+					  s->dma_length);
+	}
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+
+out:
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return mapped_elems;
+unmap:
+	for_each_sg(sglist, s, mapped_elems, i) {	/* roll back what was mapped */
+		if (s->dma_address)
+			__unmap_single(iommu, domain->priv, s->dma_address,
+				       s->dma_length, dir);
+		s->dma_address = s->dma_length = 0;
+	}
+
+	mapped_elems = 0;
+
+	goto out;
+}
+
+/* dma_mapping_ops->unmap_sg: unmap, flush and clear every list element. */
+static void unmap_sg(struct device *dev, struct scatterlist *sglist,
+		     int nelems, int dir)
+{
+	struct protection_domain *domain;
+	struct amd_iommu *iommu;
+	struct scatterlist *sg;
+	unsigned long irq_flags;
+	u16 devid;
+	int idx;
+
+	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		return;
+
+	spin_lock_irqsave(&domain->lock, irq_flags);
+	for_each_sg(sglist, sg, nelems, idx) {
+		__unmap_single(iommu, domain->priv, sg->dma_address,
+			       sg->dma_length, dir);
+		iommu_flush_pages(iommu, domain->id, sg->dma_address,
+				  sg->dma_length);
+		sg->dma_length = 0;
+		sg->dma_address = 0;
+	}
+
+	if (iommu->need_sync)
+		iommu_completion_wait(iommu);
+	spin_unlock_irqrestore(&domain->lock, irq_flags);
+}
+
+/*
+ * dma_mapping_ops->alloc_coherent: allocate zeroed pages and map them
+ * DMA_BIDIRECTIONAL for dev. Stores the DMA address in *dma_addr and returns
+ * the kernel virtual address, or NULL on allocation or mapping failure.
+ * Without an IOMMU/domain for dev the physical address is used directly.
+ */
+static void *alloc_coherent(struct device *dev, size_t size,
+			    dma_addr_t *dma_addr, gfp_t flag)
+{
+	struct protection_domain *domain;
+	struct amd_iommu *iommu;
+	unsigned long irq_flags;
+	phys_addr_t paddr;
+	void *virt_addr;
+	u16 devid;
+
+	virt_addr = (void *)__get_free_pages(flag, get_order(size));
+	if (virt_addr == NULL)
+		return NULL;
+
+	memset(virt_addr, 0, size);
+	paddr = virt_to_phys(virt_addr);
+
+	get_device_resources(dev, &iommu, &domain, &devid);
+	if (iommu == NULL || domain == NULL) {
+		*dma_addr = (dma_addr_t)paddr;
+		return virt_addr;
+	}
+
+	spin_lock_irqsave(&domain->lock, irq_flags);
+	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
+				 size, DMA_BIDIRECTIONAL);
+	if (*dma_addr != bad_dma_address) {
+		if (iommu_has_npcache(iommu))
+			iommu_flush_pages(iommu, domain->id, *dma_addr, size);
+		if (iommu->need_sync)
+			iommu_completion_wait(iommu);
+	} else {
+		/* mapping failed: give the pages back */
+		free_pages((unsigned long)virt_addr, get_order(size));
+		virt_addr = NULL;
+	}
+	spin_unlock_irqrestore(&domain->lock, irq_flags);
+
+	return virt_addr;
+}
+
+/*
+ * dma_mapping_ops->free_coherent: counterpart of alloc_coherent - drop the
+ * IOMMU mapping (if dev has an IOMMU and a domain) and free the pages.
+ */
+static void free_coherent(struct device *dev, size_t size,
+			  void *virt_addr, dma_addr_t dma_addr)
+{
+	struct protection_domain *domain;
+	struct amd_iommu *iommu;
+	unsigned long irq_flags;
+	u16 devid;
+
+	get_device_resources(dev, &iommu, &domain, &devid);
+	if (iommu && domain) {
+		spin_lock_irqsave(&domain->lock, irq_flags);
+		__unmap_single(iommu, domain->priv, dma_addr, size,
+			       DMA_BIDIRECTIONAL);
+		iommu_flush_pages(iommu, domain->id, dma_addr, size);
+
+		if (iommu->need_sync)
+			iommu_completion_wait(iommu);
+		spin_unlock_irqrestore(&domain->lock, irq_flags);
+	}
+
+	free_pages((unsigned long)virt_addr, get_order(size));
+}
+
+/*
+ * Preallocate a dma_ops domain for each PCI device behind an IOMMU; needed
+ * until the driver core tells the DMA layer when a driver grabs a device.
+ */
+void prealloc_protection_domains(void)
+{
+	int order = amd_iommu_aperture_order;
+	struct dma_ops_domain *dma_dom;
+	struct pci_dev *pdev = NULL;
+	struct amd_iommu *iommu;
+	u16 bdf;
+
+	while ((pdev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL) {
+		bdf = (pdev->bus->number << 8) | pdev->devfn;
+		/* NOTE(review): also skips devid == last_bdf - intended? */
+		if (bdf >= amd_iommu_last_bdf)
+			continue;
+		bdf = amd_iommu_alias_table[bdf];
+		if (domain_for_device(bdf))
+			continue;
+		iommu = amd_iommu_rlookup_table[bdf];
+		if (iommu == NULL)
+			continue;
+		dma_dom = dma_ops_domain_alloc(iommu, order);
+		if (dma_dom == NULL)
+			continue;
+		init_unity_mappings_for_device(dma_dom, bdf);
+		set_device_domain(iommu, &dma_dom->domain, bdf);
+		printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
+		       dma_dom->domain.id);
+		print_devid(bdf, 1);
+	}
+}
+
+static struct dma_mapping_ops amd_iommu_dma_ops = {
+	.alloc_coherent = alloc_coherent, /* installed as dma_ops by */
+	.free_coherent = free_coherent,	  /* amd_iommu_init_dma_ops() */
+	.map_single = map_single,
+	.unmap_single = unmap_single,
+	.map_sg = map_sg,
+	.unmap_sg = unmap_sg,
+};
+
+/*
+ * Set up a default dma_ops domain per IOMMU and install amd_iommu_dma_ops.
+ * Returns 0 on success; on failure all default domains are freed again.
+ */
+int __init amd_iommu_init_dma_ops(void)
+{
+	struct amd_iommu *iommu;
+	int order = amd_iommu_aperture_order;
+	int ret;
+
+	list_for_each_entry(iommu, &amd_iommu_list, list) {
+		ret = -ENOMEM;
+		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+		if (iommu->default_dom == NULL)
+			goto free_domains;
+		ret = iommu_init_unity_mappings(iommu);
+		if (ret)
+			goto free_domains;
+	}
+
+	if (amd_iommu_isolate)
+		prealloc_protection_domains();
+	iommu_detected = 1;
+	force_iommu = 1;
+	bad_dma_address = 0;
+#ifdef CONFIG_GART_IOMMU
+	gart_iommu_aperture_disabled = 1;
+	gart_iommu_aperture = 0;
+#endif
+	dma_ops = &amd_iommu_dma_ops;
+	return 0;
+
+free_domains:
+	list_for_each_entry(iommu, &amd_iommu_list, list) {
+		if (iommu->default_dom)
+			dma_ops_domain_free(iommu->default_dom);
+	}
+	return ret;
+}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
new file mode 100644
index 000000000000..2a13e430437d
--- /dev/null
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
+ * Author: Joerg Roedel <joerg.roedel@amd.com>
+ *         Leo Duran <leo.duran@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/sysdev.h>
+#include <asm/pci-direct.h>
+#include <asm/amd_iommu_types.h>
+#include <asm/amd_iommu.h>
+#include <asm/gart.h>
+
+/*
+ * definitions for the ACPI scanning code (amd_iommu_last_bdf is inclusive)
+ */
+#define UPDATE_LAST_BDF(x) do {\
+	if ((x) > amd_iommu_last_bdf) \
+		amd_iommu_last_bdf = (x); \
+	} while (0)
+#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
+#define PCI_BUS(x) (((x) >> 8) & 0xff)
+#define IVRS_HEADER_LENGTH 48
+#define TBL_SIZE(x) \
+	(1 << (PAGE_SHIFT + get_order((amd_iommu_last_bdf + 1) * (x))))
+
+#define ACPI_IVHD_TYPE                  0x10
+#define ACPI_IVMD_TYPE_ALL              0x20
+#define ACPI_IVMD_TYPE                  0x21
+#define ACPI_IVMD_TYPE_RANGE            0x22
+
+#define IVHD_DEV_ALL                    0x01
+#define IVHD_DEV_SELECT                 0x02
+#define IVHD_DEV_SELECT_RANGE_START     0x03
+#define IVHD_DEV_RANGE_END              0x04
+#define IVHD_DEV_ALIAS                  0x42
+#define IVHD_DEV_ALIAS_RANGE            0x43
+#define IVHD_DEV_EXT_SELECT             0x46
+#define IVHD_DEV_EXT_SELECT_RANGE       0x47
+
+#define IVHD_FLAG_HT_TUN_EN             0x00	/* bit numbers in the */
+#define IVHD_FLAG_PASSPW_EN             0x01	/* IVHD flags byte,   */
+#define IVHD_FLAG_RESPASSPW_EN          0x02	/* not bit masks      */
+#define IVHD_FLAG_ISOC_EN               0x03
+
+#define IVMD_FLAG_EXCL_RANGE            0x08
+#define IVMD_FLAG_UNITY_MAP             0x01
+
+#define ACPI_DEVFLAG_INITPASS           0x01
+#define ACPI_DEVFLAG_EXTINT             0x02
+#define ACPI_DEVFLAG_NMI                0x04
+#define ACPI_DEVFLAG_SYSMGT1            0x10
+#define ACPI_DEVFLAG_SYSMGT2            0x20
+#define ACPI_DEVFLAG_LINT0              0x40
+#define ACPI_DEVFLAG_LINT1              0x80
+#define ACPI_DEVFLAG_ATSDIS             0x10000000
+
+struct ivhd_header {
+	u8 type;		/* ACPI_IVHD_TYPE */
+	u8 flags;		/* feature enable flags, see IVHD_FLAG_* */
+	u16 length;		/* size of the whole block in bytes */
+	u16 devid;		/* device id of the IOMMU itself */
+	u16 cap_ptr;		/* offset of the IOMMU capability in PCI config space */
+	u64 mmio_phys;		/* physical address of the MMIO registers */
+	u16 pci_seg;
+	u16 info;
+	u32 reserved;
+} __attribute__((packed));
+
+struct ivhd_entry {
+	u8 type;	/* IVHD_DEV_*; the top two bits give the entry length */
+	u16 devid;
+	u8 flags;	/* ACPI_DEVFLAG_* */
+	u32 ext;	/* alias entries: target device id starts at bit 8 */
+} __attribute__((packed));
+
+struct ivmd_header {
+	u8 type;		/* ACPI_IVMD_TYPE, _ALL or _RANGE */
+	u8 flags;		/* IVMD_FLAG_*; flags >> 1 becomes the unity map prot */
+	u16 length;		/* size of the whole block in bytes */
+	u16 devid;		/* device, or first device of a range */
+	u16 aux;		/* last device for ACPI_IVMD_TYPE_RANGE */
+	u64 resv;
+	u64 range_start;
+	u64 range_length;
+} __attribute__((packed));
+
+static int __initdata amd_iommu_detected;	/* set by amd_iommu_detect() */
+
+u16 amd_iommu_last_bdf;			/* largest device id found */
+LIST_HEAD(amd_iommu_unity_map);		/* static init: walked on error paths */
+unsigned amd_iommu_aperture_order = 26;	/* 2^26 bytes = 64M by default */
+int amd_iommu_isolate;			/* set by "amd_iommu=isolate" */
+
+LIST_HEAD(amd_iommu_list);		/* static init: walked on error paths */
+struct dev_table_entry *amd_iommu_dev_table;	/* shared by all IOMMUs */
+u16 *amd_iommu_alias_table;		/* devid -> devid the IOMMU sees */
+struct amd_iommu **amd_iommu_rlookup_table;	/* devid -> IOMMU */
+struct protection_domain **amd_iommu_pd_table;	/* devid -> domain */
+unsigned long *amd_iommu_pd_alloc_bitmap;	/* domain ids in use */
+
+static u32 dev_table_size;		/* table sizes in bytes */
+static u32 alias_table_size;
+static u32 rlookup_table_size;
+
+/* Program and enable the exclusion range, if this IOMMU has one. */
+static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+{
+	u64 base, limit;
+
+	if (iommu->exclusion_start == 0)
+		return;
+
+	base  = iommu->exclusion_start & PAGE_MASK;
+	limit = (base + iommu->exclusion_length) & PAGE_MASK;
+	base |= MMIO_EXCL_ENABLE_MASK;
+
+	memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
+		    &base, sizeof(base));
+	memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
+		    &limit, sizeof(limit));
+}
+
+/* Program the device table base address and size into the IOMMU. */
+static void __init iommu_set_device_table(struct amd_iommu *iommu)
+{
+	u64 entry;	/* 64 bit: a u32 cuts off addresses above 4GB */
+
+	BUG_ON(iommu->mmio_base == NULL);
+	entry = virt_to_phys(amd_iommu_dev_table);
+	entry |= (dev_table_size >> 12) - 1;
+	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
+			&entry, sizeof(entry));
+}
+
+/* Set bit number 'bit' in the IOMMU control register. */
+static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+{
+	u32 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+
+	ctrl |= 1 << bit;
+	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+/* Clear bit number 'bit' in the IOMMU control register. */
+static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
+{
+	u32 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+
+	ctrl &= ~(1 << bit);
+	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
+}
+
+/* Log the IOMMU's device id and capability offset, then set IOMMU_EN. */
+void __init iommu_enable(struct amd_iommu *iommu)
+{
+	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
+	print_devid(iommu->devid, 0);
+	printk(" cap 0x%hx\n", iommu->cap_ptr);
+	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
+}
+
+/* Reserve and ioremap the MMIO region; NULL (nothing held) on failure. */
+static u8 * __init iommu_map_mmio_space(u64 address)
+{
+	u8 *ret;
+
+	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+		return NULL;
+
+	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
+	if (ret == NULL)
+		release_mem_region(address, MMIO_REGION_LENGTH);
+	return ret;
+}
+
+/* Undo iommu_map_mmio_space(); a region that was never mapped is not ours. */
+static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
+{
+	if (!iommu->mmio_base)
+		return;
+	iounmap(iommu->mmio_base);
+	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
+}
+
+/* Account for the last device id in the IOMMU's PCI range register. */
+static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
+{
+	u32 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
+
+	UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)));
+
+	return 0;
+}
+
+/*
+ * Scan one IVHD block for the highest device id it refers to and feed it
+ * to UPDATE_LAST_BDF: the last device of the IOMMU's PCI range plus the
+ * devid of every select, alias, extended select and range end entry.
+ * Always returns 0.
+ */
+static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
+{
+	u8 *pos = (u8 *)h + sizeof(*h);
+	u8 *end = (u8 *)h + h->length;
+	struct ivhd_entry *entry;
+
+	find_last_devid_on_pci(PCI_BUS(h->devid), PCI_SLOT(h->devid),
+			       PCI_FUNC(h->devid), h->cap_ptr);
+
+	while (pos < end) {
+		entry = (struct ivhd_entry *)pos;
+		if (entry->type == IVHD_DEV_SELECT ||
+		    entry->type == IVHD_DEV_RANGE_END ||
+		    entry->type == IVHD_DEV_ALIAS ||
+		    entry->type == IVHD_DEV_EXT_SELECT) {
+			UPDATE_LAST_BDF(entry->devid);
+		}
+
+		/* entry length: 4 << (top two bits of the type) bytes */
+		pos += 0x04 << (entry->type >> 6);
+	}
+
+	WARN_ON(pos != end);
+
+	return 0;
+}
+
+/*
+ * First pass over the IVRS table (acpi_table_parse handler): verify the
+ * table checksum, then let every IVHD block update amd_iommu_last_bdf.
+ * Blocks of any other type are skipped.
+ * Returns -ENODEV for a bad checksum, 0 otherwise.
+ */
+static int __init find_last_devid_acpi(struct acpi_table_header *table)
+{
+	u8 *start = (u8 *)table, *end = start + table->length;
+	u8 *pos = start + IVRS_HEADER_LENGTH;
+	struct ivhd_header *h;
+	u8 sum = 0;
+	int i;
+
+	/*
+	 * Validate the checksum here so that the later passes over the
+	 * table do not have to: all bytes must add up to zero.
+	 */
+	for (i = 0; i < table->length; ++i)
+		sum += start[i];
+	if (sum != 0)
+		return -ENODEV;		/* ACPI table corrupt */
+
+	while (pos < end) {
+		h = (struct ivhd_header *)pos;
+		if (h->type == ACPI_IVHD_TYPE)
+			find_last_devid_from_ivhd(h);
+		pos += h->length;
+	}
+
+	WARN_ON(pos != end);
+
+	return 0;
+}
+
+/*
+ * Allocate and zero the command buffer of an IOMMU, write its physical
+ * address and size to the IOMMU and set CONTROL_CMDBUF_EN.
+ * Returns the buffer (its size is stored in iommu->cmd_buf_size) or NULL.
+ */
+static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
+{
+	u64 reg;
+	u8 *buf;
+
+	buf = (u8 *)__get_free_pages(GFP_KERNEL, get_order(CMD_BUFFER_SIZE));
+	if (!buf)
+		return NULL;
+	memset(buf, 0, CMD_BUFFER_SIZE);
+	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
+
+	reg = (u64)virt_to_phys(buf) | MMIO_CMD_SIZE_512;
+	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, &reg, sizeof(reg));
+	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
+
+	return buf;
+}
+
+static void __init free_command_buffer(struct amd_iommu *iommu)
+{
+	if (iommu->cmd_buf)	/* may never have been allocated */
+		free_pages((unsigned long)iommu->cmd_buf,
+				get_order(CMD_BUFFER_SIZE));
+}
+
+/* Set bit number 'bit' of devid's device table entry (32 bits per word). */
+static void set_dev_entry_bit(u16 devid, u8 bit)
+{
+	int word = (bit >> 5) & 0x07, shift = bit & 0x1f;
+
+	amd_iommu_dev_table[devid].data[word] |= (1 << shift);
+}
+
+static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+{	/* maps ACPI_DEVFLAG_* to device table bits; ext_flags is not used */
+	if (flags & ACPI_DEVFLAG_INITPASS)
+		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
+	if (flags & ACPI_DEVFLAG_EXTINT)
+		set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
+	if (flags & ACPI_DEVFLAG_NMI)
+		set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
+	if (flags & ACPI_DEVFLAG_SYSMGT1)
+		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
+	if (flags & ACPI_DEVFLAG_SYSMGT2)
+		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
+	if (flags & ACPI_DEVFLAG_LINT0)
+		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
+	if (flags & ACPI_DEVFLAG_LINT1)
+		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
+}
+
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+	amd_iommu_rlookup_table[devid] = iommu;	/* devid -> IOMMU lookup */
+}
+
+/* Flag 'devid' for exclusion and store the range of 'm' in its IOMMU. */
+static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
+{
+	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
+
+	if (!(m->flags & IVMD_FLAG_EXCL_RANGE) || !iommu)
+		return;
+
+	/* flag 'devid', not m->devid: callers walk whole device ranges */
+	set_dev_entry_bit(devid, DEV_ENTRY_EX);
+	iommu->exclusion_start = m->range_start;
+	iommu->exclusion_length = m->range_length;
+}
+
+/*
+ * Read capability header and first/last device id from PCI config space.
+ */
+static void __init init_iommu_from_pci(struct amd_iommu *iommu)
+{
+	int bus = PCI_BUS(iommu->devid), slot = PCI_SLOT(iommu->devid);
+	int fn = PCI_FUNC(iommu->devid), ptr = iommu->cap_ptr;
+	u32 range;
+
+	iommu->cap = read_pci_config(bus, slot, fn, ptr + MMIO_CAP_HDR_OFFSET);
+	range = read_pci_config(bus, slot, fn, ptr + MMIO_RANGE_OFFSET);
+	iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
+	iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+}
+
+/*
+ * Apply one IVHD block to an IOMMU: feature bits go to the control
+ * register, device entries to the device table and the alias table.
+ */
+static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
+					struct ivhd_header *h)
+{
+	u8 *p = (u8 *)h;
+	u8 *end = p, flags = 0;
+	u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
+	u32 ext_flags = 0;
+	bool alias = false;
+	struct ivhd_entry *e;
+
+	/*
+	 * First set the recommended feature enable bits from ACPI
+	 * into the IOMMU control registers. The IVHD_FLAG_* values are
+	 * bit numbers and have to be turned into masks for the test.
+	 */
+	h->flags & (1 << IVHD_FLAG_HT_TUN_EN) ?
+		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
+		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
+	h->flags & (1 << IVHD_FLAG_PASSPW_EN) ?
+		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
+	h->flags & (1 << IVHD_FLAG_RESPASSPW_EN) ?
+		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
+		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
+	h->flags & (1 << IVHD_FLAG_ISOC_EN) ?
+		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
+		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
+
+	/* make IOMMU memory accesses cache coherent */
+	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
+
+	/* Done. Now parse the device entries */
+	p += sizeof(struct ivhd_header);
+	end += h->length;
+
+	while (p < end) {
+		e = (struct ivhd_entry *)p;
+		switch (e->type) {
+		case IVHD_DEV_ALL:
+			for (dev_i = iommu->first_device;
+					dev_i <= iommu->last_device; ++dev_i)
+				set_dev_entry_from_acpi(dev_i, e->flags, 0);
+			break;
+		case IVHD_DEV_SELECT:
+			devid = e->devid;
+			set_dev_entry_from_acpi(devid, e->flags, 0);
+			break;
+		case IVHD_DEV_SELECT_RANGE_START:
+			devid_start = e->devid;
+			flags = e->flags;
+			ext_flags = 0;
+			alias = 0;
+			break;
+		case IVHD_DEV_ALIAS:
+			devid = e->devid;
+			devid_to = e->ext >> 8;
+			set_dev_entry_from_acpi(devid, e->flags, 0);
+			amd_iommu_alias_table[devid] = devid_to;
+			break;
+		case IVHD_DEV_ALIAS_RANGE:
+			devid_start = e->devid;
+			flags = e->flags;
+			devid_to = e->ext >> 8;
+			ext_flags = 0;
+			alias = 1;
+			break;
+		case IVHD_DEV_EXT_SELECT:
+			devid = e->devid;
+			set_dev_entry_from_acpi(devid, e->flags, e->ext);
+			break;
+		case IVHD_DEV_EXT_SELECT_RANGE:
+			devid_start = e->devid;
+			flags = e->flags;
+			ext_flags = e->ext;
+			alias = 0;
+			break;
+		case IVHD_DEV_RANGE_END:
+			/* closes the range opened by the last range entry */
+			devid = e->devid;
+			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
+				if (alias)
+					amd_iommu_alias_table[dev_i] = devid_to;
+				set_dev_entry_from_acpi(
+						amd_iommu_alias_table[dev_i],
+						flags, ext_flags);
+			}
+			break;
+		default:
+			break;
+		}
+
+		/* entry length: 4 << (top two bits of the type) bytes */
+		p += 0x04 << (e->type >> 6);
+	}
+}
+
+/* Enter this IOMMU into the rlookup table for every device in its range. */
+static int __init init_iommu_devices(struct amd_iommu *iommu)
+{
+	u16 devid;
+
+	for (devid = iommu->first_device; devid <= iommu->last_device; ++devid)
+		set_iommu_for_device(iommu, devid);
+	return 0;
+}
+
+static void __init free_iommu_one(struct amd_iommu *iommu)
+{
+	free_command_buffer(iommu);	/* no-op without a command buffer */
+	iommu_unmap_mmio_space(iommu);
+}
+
+static void __init free_iommu_all(void)
+{
+	struct amd_iommu *iommu, *next;
+	/* _safe variant: each entry is unlinked and freed while iterating */
+	list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+		list_del(&iommu->list);
+		free_iommu_one(iommu);
+		kfree(iommu);
+	}
+}
+
+/*
+ * Set up one IOMMU from its IVHD block after queueing it on amd_iommu_list.
+ */
+static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
+{
+	spin_lock_init(&iommu->lock);
+	list_add_tail(&iommu->list, &amd_iommu_list);
+
+	/* Copy data from ACPI table entry to the iommu struct */
+	iommu->devid = h->devid;
+	iommu->cap_ptr = h->cap_ptr;
+	iommu->mmio_phys = h->mmio_phys;
+	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
+	if (!iommu->mmio_base)
+		return -ENOMEM;
+
+	iommu_set_device_table(iommu);
+	iommu->cmd_buf = alloc_command_buffer(iommu);
+	if (!iommu->cmd_buf)
+		return -ENOMEM;
+
+	init_iommu_from_pci(iommu);
+	init_iommu_from_acpi(iommu, h);
+	init_iommu_devices(iommu);
+	return 0;
+}
+
+/*
+ * Second pass over the IVRS table (acpi_table_parse handler): allocate and
+ * initialize a struct amd_iommu for every IVHD block. IOMMUs queued before
+ * an error stay on amd_iommu_list.
+ * Returns 0, -ENOMEM or the error of init_iommu_one().
+ */
+static int __init init_iommu_all(struct acpi_table_header *table)
+{
+	u8 *pos = (u8 *)table + IVRS_HEADER_LENGTH;
+	u8 *end = (u8 *)table + table->length;
+	struct amd_iommu *iommu;
+	struct ivhd_header *h;
+	int ret;
+
+	INIT_LIST_HEAD(&amd_iommu_list);
+
+	for (; pos < end; pos += h->length) {
+		h = (struct ivhd_header *)pos;
+		if (*pos != ACPI_IVHD_TYPE)
+			continue;
+
+		iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+		if (!iommu)
+			return -ENOMEM;
+		ret = init_iommu_one(iommu, h);
+		if (ret)
+			return ret;
+	}
+
+	WARN_ON(pos != end);
+
+	return 0;
+}
+
+static void __init free_unity_maps(void)
+{
+	struct unity_map_entry *entry, *next;
+	/* _safe variant: each entry is unlinked and freed while iterating */
+	list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/* Apply an exclusion range entry to one, all or a range of devices. */
+static int __init init_exclusion_range(struct ivmd_header *m)
+{
+	int i;
+
+	switch (m->type) {
+	case ACPI_IVMD_TYPE:
+		set_device_exclusion_range(m->devid, m);
+		break;
+	case ACPI_IVMD_TYPE_ALL:
+		for (i = 0; i <= amd_iommu_last_bdf; ++i) /* last bdf included */
+			set_device_exclusion_range(i, m);
+		break;
+	case ACPI_IVMD_TYPE_RANGE:
+		for (i = m->devid; i <= m->aux; ++i)
+			set_device_exclusion_range(i, m);
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
+/* Queue an IVMD unity mapping on amd_iommu_unity_map; 0 or -ENOMEM. */
+static int __init init_unity_map_range(struct ivmd_header *m)
+{
+	struct unity_map_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);
+
+	if (e == NULL)
+		return -ENOMEM;
+
+	switch (m->type) {
+	default:
+		kfree(e);	/* not an IVMD block, e.g. an IVHD one */
+		return 0;
+	case ACPI_IVMD_TYPE:
+		e->devid_start = e->devid_end = m->devid;
+		break;
+	case ACPI_IVMD_TYPE_ALL:
+		e->devid_start = 0;
+		e->devid_end = amd_iommu_last_bdf;
+		break;
+	case ACPI_IVMD_TYPE_RANGE:
+		e->devid_start = m->devid;
+		e->devid_end = m->aux;
+		break;
+	}
+	e->address_start = PAGE_ALIGN(m->range_start);
+	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
+	e->prot = m->flags >> 1;
+	list_add_tail(&e->list, &amd_iommu_unity_map);
+	return 0;
+}
+
+/*
+ * Third pass over the IVRS table (acpi_table_parse handler): collect the
+ * exclusion ranges and unity mappings. Every block is inspected through
+ * the ivmd_header layout; results of the helpers are not checked.
+ */
+static int __init init_memory_definitions(struct acpi_table_header *table)
+{
+	u8 *pos = (u8 *)table + IVRS_HEADER_LENGTH;
+	u8 *end = (u8 *)table + table->length;
+	struct ivmd_header *m;
+
+	INIT_LIST_HEAD(&amd_iommu_unity_map);
+
+	for (; pos < end; pos += m->length) {
+		m = (struct ivmd_header *)pos;
+		if (m->flags & IVMD_FLAG_EXCL_RANGE)
+			init_exclusion_range(m);
+		else if (m->flags & IVMD_FLAG_UNITY_MAP)
+			init_unity_map_range(m);
+	}
+	return 0;
+}
+
+static void __init enable_iommus(void)
+{
+	struct amd_iommu *iommu;
+	/* the exclusion range is programmed before the IOMMU is enabled */
+	list_for_each_entry(iommu, &amd_iommu_list, list) {
+		iommu_set_exclusion_range(iommu);
+		iommu_enable(iommu);
+	}
+}
+
+/*
+ * Suspend/Resume support: suspending is refused with -EINVAL until a
+ * real resume is implemented; the resume callback is an empty stub.
+ */
+
+static int amd_iommu_resume(struct sys_device *dev)
+{
+	return 0;
+}
+
+static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return -EINVAL;
+}
+
+static struct sysdev_class amd_iommu_sysdev_class = {
+	.name = "amd_iommu",		/* registered in amd_iommu_init() */
+	.suspend = amd_iommu_suspend,
+	.resume = amd_iommu_resume,
+};
+
+static struct sys_device device_amd_iommu = {
+	.id = 0,			/* registered in amd_iommu_init() */
+	.cls = &amd_iommu_sysdev_class,
+};
+
+/*
+ * Driver entry point: allocate the shared tables, parse the IVRS table,
+ * install the DMA ops and enable all IOMMUs. Returns 0 or a negative errno.
+ */
+int __init amd_iommu_init(void)
+{
+	int i, ret = 0;
+
+	if (no_iommu) {
+		printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n");
+		return 0;
+	}
+
+	if (!amd_iommu_detected)
+		return -ENODEV;
+
+	/*
+	 * First parse ACPI tables to find the largest Bus/Dev/Func
+	 * we need to handle. Upon this information the shared data
+	 * structures for the IOMMUs in the system will be allocated
+	 */
+	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
+		return -ENODEV;
+
+	dev_table_size     = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
+	alias_table_size   = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
+	rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+
+	ret = -ENOMEM;
+
+	/* Device table - directly used by all IOMMUs */
+	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+				      get_order(dev_table_size));
+	if (amd_iommu_dev_table == NULL)
+		goto out;
+
+	/*
+	 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
+	 * IOMMU see for that device
+	 */
+	amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
+			get_order(alias_table_size));
+	if (amd_iommu_alias_table == NULL)
+		goto free;
+
+	/* IOMMU rlookup table - find the IOMMU for a specific device */
+	amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL,
+			get_order(rlookup_table_size));
+	if (amd_iommu_rlookup_table == NULL)
+		goto free;
+
+	/*
+	 * Protection Domain table - maps devices to protection domains
+	 * This table has the same size as the rlookup_table
+	 */
+	amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+				     get_order(rlookup_table_size));
+	if (amd_iommu_pd_table == NULL)
+		goto free;
+
+	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+					    get_order(MAX_DOMAIN_ID/8));
+	if (amd_iommu_pd_alloc_bitmap == NULL)
+		goto free;
+
+	/*
+	 * memory is allocated now; initialize the device table with all zeroes
+	 * and let all alias entries point to itself
+	 */
+	memset(amd_iommu_dev_table, 0, dev_table_size);
+	for (i = 0; i <= amd_iommu_last_bdf; ++i)	/* last bdf included */
+		amd_iommu_alias_table[i] = i;
+
+	/* devices without an IOMMU or domain have to read back as NULL */
+	memset(amd_iommu_rlookup_table, 0, rlookup_table_size);
+	memset(amd_iommu_pd_table, 0, rlookup_table_size);
+	memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
+
+	/*
+	 * never allocate domain 0 because its used as the non-allocated and
+	 * error value placeholder
+	 */
+	amd_iommu_pd_alloc_bitmap[0] = 1;
+
+	/*
+	 * now the data structures are allocated and basically initialized
+	 * start the real acpi table scan
+	 */
+	ret = -ENODEV;
+	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
+		goto free;
+
+	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
+		goto free;
+
+	ret = amd_iommu_init_dma_ops();
+	if (ret)
+		goto free;
+
+	ret = sysdev_class_register(&amd_iommu_sysdev_class);
+	if (ret)
+		goto free;
+
+	ret = sysdev_register(&device_amd_iommu);
+	if (ret)
+		goto free;
+
+	enable_iommus();
+
+	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
+			(1 << (amd_iommu_aperture_order-20)));
+
+	printk(KERN_INFO "AMD IOMMU: device isolation ");
+	if (amd_iommu_isolate)
+		printk("enabled\n");
+	else
+		printk("disabled\n");
+
+out:
+	return ret;
+
+free:
+	if (amd_iommu_pd_alloc_bitmap)
+		free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
+				get_order(MAX_DOMAIN_ID/8));
+	if (amd_iommu_pd_table)
+		free_pages((unsigned long)amd_iommu_pd_table,
+				get_order(rlookup_table_size));
+	if (amd_iommu_rlookup_table)
+		free_pages((unsigned long)amd_iommu_rlookup_table,
+				get_order(rlookup_table_size));
+	if (amd_iommu_alias_table)
+		free_pages((unsigned long)amd_iommu_alias_table,
+				get_order(alias_table_size));
+	if (amd_iommu_dev_table)
+		free_pages((unsigned long)amd_iommu_dev_table,
+				get_order(dev_table_size));
+	free_iommu_all();
+	free_unity_maps();
+
+	goto out;
+}
+
+static int __init early_amd_iommu_detect(struct acpi_table_header *table)
+{
+	return 0;	/* caller only looks at acpi_table_parse()'s result */
+}
+
+/* Mark an AMD IOMMU as detected when parsing "IVRS" succeeds (returns 0). */
+void __init amd_iommu_detect(void)
+{
+	if (swiotlb || no_iommu || iommu_detected)
+		return;
+	if (acpi_table_parse("IVRS", early_amd_iommu_detect) != 0)
+		return;
+	iommu_detected = 1;
+	amd_iommu_detected = 1;
+#ifdef CONFIG_GART_IOMMU
+	gart_iommu_aperture_disabled = 1;
+	gart_iommu_aperture = 0;
+#endif
+}
+
+/* "amd_iommu=": turn on device isolation if the option ends in "isolate". */
+static int __init parse_amd_iommu_options(char *str)
+{
+	for (; *str != '\0'; ++str)
+		if (!strcmp(str, "isolate"))
+			amd_iommu_isolate = 1;
+
+	return 1;
+}
+
+/* "amd_iommu_size=": map a trailing size name (32M..1G) to an order. */
+static int __init parse_amd_iommu_size_options(char *str)
+{
+	for (; *str != '\0'; ++str) {
+		if (!strcmp(str, "32M"))
+			amd_iommu_aperture_order = 25;
+		else if (!strcmp(str, "64M"))
+			amd_iommu_aperture_order = 26;
+		else if (!strcmp(str, "128M"))
+			amd_iommu_aperture_order = 27;
+		else if (!strcmp(str, "256M"))
+			amd_iommu_aperture_order = 28;
+		else if (!strcmp(str, "512M"))
+			amd_iommu_aperture_order = 29;
+		else if (!strcmp(str, "1G"))
+			amd_iommu_aperture_order = 30;
+	}
+	return 1;
+}
+
+__setup("amd_iommu=", parse_amd_iommu_options);
+__setup("amd_iommu_size=", parse_amd_iommu_size_options);
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 479926d9e004..600470d464fa 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -35,6 +35,18 @@ int fallback_aper_force __initdata;
 
 int fix_aperture __initdata = 1;
 
+struct bus_dev_range {
+	int bus;
+	int dev_base;	/* presumably the first device number - TODO confirm */
+	int dev_limit;	/* presumably the device number limit - TODO confirm */
+};
+
+static struct bus_dev_range bus_dev_ranges[] __initdata = {
+	{ 0x00, 0x18, 0x20},	/* { bus, dev_base, dev_limit } */
+	{ 0xff, 0x00, 0x20},
+	{ 0xfe, 0x00, 0x20}
+};
+
 static struct resource gart_resource = {
 	.name	= "GART",
 	.flags	= IORESOURCE_MEM,
@@ -55,8 +67,9 @@ static u32 __init allocate_aperture(void)
 	u32 aper_size;
 	void *p;
 
-	if (fallback_aper_order > 7)
-		fallback_aper_order = 7;
+	/* aper_size should <= 1G */
+	if (fallback_aper_order > 5)
+		fallback_aper_order = 5;
 	aper_size = (32 * 1024 * 1024) << fallback_aper_order;
 
 	/*
@@ -65,7 +78,20 @@ static u32 __init allocate_aperture(void)
 	 * memory. Unfortunately we cannot move it up because that would
 	 * make the IOMMU useless.
 	 */
-	p = __alloc_bootmem_nopanic(aper_size, aper_size, 0);
+	/*
+	 * Use 512M as the allocation goal: a kexec'ed kernel_big
+	 * decompresses in place and could otherwise overlap with
+	 * a GART aperture that is still in use.
+	 * Sequence:
+	 * kernel_small
+	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
+	 * ==> kernel_small (gart area becomes e820_reserved)
+	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
+	 * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
+	 * So don't place the gart iommu below 512M; leave that space to the
+	 * kernel code to be safe.
+	 */
+	p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
 	if (!p || __pa(p)+aper_size > 0xffffffff) {
 		printk(KERN_ERR
 			"Cannot allocate aperture memory hole (%p,%uK)\n",
@@ -83,69 +109,53 @@ static u32 __init allocate_aperture(void)
 	return (u32)__pa(p);
 }
 
-static int __init aperture_valid(u64 aper_base, u32 aper_size)
-{
-	if (!aper_base)
-		return 0;
-
-	if (aper_base + aper_size > 0x100000000UL) {
-		printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n");
-		return 0;
-	}
-	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
-		printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n");
-		return 0;
-	}
-	if (aper_size < 64*1024*1024) {
-		printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20);
-		return 0;
-	}
-
-	return 1;
-}
 
 /* Find a PCI capability */
-static __u32 __init find_cap(int num, int slot, int func, int cap)
+static u32 __init find_cap(int bus, int slot, int func, int cap)
 {
 	int bytes;
 	u8 pos;
 
-	if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+	if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) &
 						PCI_STATUS_CAP_LIST))
 		return 0;
 
-	pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+	pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST);
 	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
 		u8 id;
 
 		pos &= ~3;
-		id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
+		id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID);
 		if (id == 0xff)
 			break;
 		if (id == cap)
 			return pos;
-		pos = read_pci_config_byte(num, slot, func,
+		pos = read_pci_config_byte(bus, slot, func,
 						pos+PCI_CAP_LIST_NEXT);
 	}
 	return 0;
 }
 
 /* Read a standard AGPv3 bridge header */
-static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
 {
 	u32 apsize;
 	u32 apsizereg;
 	int nbits;
 	u32 aper_low, aper_hi;
 	u64 aper;
+	u32 old_order;
 
-	printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func);
-	apsizereg = read_pci_config_16(num, slot, func, cap + 0x14);
+	printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+	apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
 	if (apsizereg == 0xffffffff) {
 		printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
 		return 0;
 	}
 
+	/* old_order could be the value from NB gart setting */
+	old_order = *order;
+
 	apsize = apsizereg & 0xfff;
 	/* Some BIOS use weird encodings not in the AGPv3 table. */
 	if (apsize & 0xff)
@@ -155,14 +165,26 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
 	if ((int)*order < 0) /* < 32MB */
 		*order = 0;
 
-	aper_low = read_pci_config(num, slot, func, 0x10);
-	aper_hi = read_pci_config(num, slot, func, 0x14);
+	aper_low = read_pci_config(bus, slot, func, 0x10);
+	aper_hi = read_pci_config(bus, slot, func, 0x14);
 	aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
 
+	/*
+	 * On some sick chips, APSIZE is 0. It means it wants 4G,
+	 * so let's double check that order, and let's trust AMD NB settings:
+	 */
+	printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
+			aper, 32 << old_order);
+	if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
+		printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
+				32 << *order, apsizereg);
+		*order = old_order;
+	}
+
 	printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
 			aper, 32 << *order, apsizereg);
 
-	if (!aperture_valid(aper, (32*1024*1024) << *order))
+	if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
 		return 0;
 	return (u32)aper;
 }
@@ -180,17 +202,17 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
  * the AGP bridges should be always an own bus on the HT hierarchy,
  * but do it here for future safety.
  */
-static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
 {
-	int num, slot, func;
+	int bus, slot, func;
 
 	/* Poor man's PCI discovery */
-	for (num = 0; num < 256; num++) {
+	for (bus = 0; bus < 256; bus++) {
 		for (slot = 0; slot < 32; slot++) {
 			for (func = 0; func < 8; func++) {
 				u32 class, cap;
 				u8 type;
-				class = read_pci_config(num, slot, func,
+				class = read_pci_config(bus, slot, func,
 							PCI_CLASS_REVISION);
 				if (class == 0xffffffff)
 					break;
@@ -199,17 +221,17 @@ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
 				case PCI_CLASS_BRIDGE_HOST:
 				case PCI_CLASS_BRIDGE_OTHER: /* needed? */
 					/* AGP bridge? */
-					cap = find_cap(num, slot, func,
+					cap = find_cap(bus, slot, func,
 							PCI_CAP_ID_AGP);
 					if (!cap)
 						break;
 					*valid_agp = 1;
-					return read_agp(num, slot, func, cap,
+					return read_agp(bus, slot, func, cap,
 							order);
 				}
 
 				/* No multi-function device? */
-				type = read_pci_config_byte(num, slot, func,
+				type = read_pci_config_byte(bus, slot, func,
 							       PCI_HEADER_TYPE);
 				if (!(type & 0x80))
 					break;
@@ -249,36 +271,50 @@ void __init early_gart_iommu_check(void)
 	 * or BIOS forget to put that in reserved.
 	 * try to update e820 to make that region as reserved.
 	 */
-	int fix, num;
+	int i, fix, slot;
 	u32 ctl;
 	u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
 	u64 aper_base = 0, last_aper_base = 0;
-	int aper_enabled = 0, last_aper_enabled = 0;
+	int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
 
 	if (!early_pci_allowed())
 		return;
 
+	/* This is mostly a duplicate of gart_iommu_hole_init() */
 	fix = 0;
-	for (num = 24; num < 32; num++) {
-		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-			continue;
-
-		ctl = read_pci_config(0, num, 3, 0x90);
-		aper_enabled = ctl & 1;
-		aper_order = (ctl >> 1) & 7;
-		aper_size = (32 * 1024 * 1024) << aper_order;
-		aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
-		aper_base <<= 25;
-
-		if ((last_aper_order && aper_order != last_aper_order) ||
-		    (last_aper_base && aper_base != last_aper_base) ||
-		    (last_aper_enabled && aper_enabled != last_aper_enabled)) {
-			fix = 1;
-			break;
+	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = bus_dev_ranges[i].bus;
+		dev_base = bus_dev_ranges[i].dev_base;
+		dev_limit = bus_dev_ranges[i].dev_limit;
+
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
+
+			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+			aper_enabled = ctl & AMD64_GARTEN;
+			aper_order = (ctl >> 1) & 7;
+			aper_size = (32 * 1024 * 1024) << aper_order;
+			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+			aper_base <<= 25;
+
+			if (last_valid) {
+				if ((aper_order != last_aper_order) ||
+				    (aper_base != last_aper_base) ||
+				    (aper_enabled != last_aper_enabled)) {
+					fix = 1;
+					break;
+				}
+			}
+
+			last_aper_order = aper_order;
+			last_aper_base = aper_base;
+			last_aper_enabled = aper_enabled;
+			last_valid = 1;
 		}
-		last_aper_order = aper_order;
-		last_aper_base = aper_base;
-		last_aper_enabled = aper_enabled;
 	}
 
 	if (!fix && !aper_enabled)
@@ -290,32 +326,46 @@ void __init early_gart_iommu_check(void)
 	if (gart_fix_e820 && !fix && aper_enabled) {
 		if (e820_any_mapped(aper_base, aper_base + aper_size,
 				    E820_RAM)) {
-			/* reserved it, so we can resuse it in second kernel */
+			/* reserve it, so we can reuse it in second kernel */
 			printk(KERN_INFO "update e820 for GART\n");
-			add_memory_region(aper_base, aper_size, E820_RESERVED);
+			e820_add_region(aper_base, aper_size, E820_RESERVED);
 			update_e820();
 		}
-		return;
 	}
 
+	if (!fix)
+		return;
+
 	/* different nodes have different setting, disable them all at first*/
-	for (num = 24; num < 32; num++) {
-		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-			continue;
+	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = bus_dev_ranges[i].bus;
+		dev_base = bus_dev_ranges[i].dev_base;
+		dev_limit = bus_dev_ranges[i].dev_limit;
+
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
 
-		ctl = read_pci_config(0, num, 3, 0x90);
-		ctl &= ~1;
-		write_pci_config(0, num, 3, 0x90, ctl);
+			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+			ctl &= ~AMD64_GARTEN;
+			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
+		}
 	}
 
 }
 
+static int __initdata printed_gart_size_msg;
+
 void __init gart_iommu_hole_init(void)
 {
+	u32 agp_aper_base = 0, agp_aper_order = 0;
 	u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
 	u64 aper_base, last_aper_base = 0;
-	int fix, num, valid_agp = 0;
-	int node;
+	int fix, slot, valid_agp = 0;
+	int i, node;
 
 	if (gart_iommu_aperture_disabled || !fix_aperture ||
 	    !early_pci_allowed())
@@ -323,38 +373,63 @@ void __init gart_iommu_hole_init(void)
 
 	printk(KERN_INFO  "Checking aperture...\n");
 
+	if (!fallback_aper_force)
+		agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+
 	fix = 0;
 	node = 0;
-	for (num = 24; num < 32; num++) {
-		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-			continue;
-
-		iommu_detected = 1;
-		gart_iommu_aperture = 1;
-
-		aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
-		aper_size = (32 * 1024 * 1024) << aper_order;
-		aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
-		aper_base <<= 25;
-
-		printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
-				node, aper_base, aper_size >> 20);
-		node++;
-
-		if (!aperture_valid(aper_base, aper_size)) {
-			fix = 1;
-			break;
-		}
+	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = bus_dev_ranges[i].bus;
+		dev_base = bus_dev_ranges[i].dev_base;
+		dev_limit = bus_dev_ranges[i].dev_limit;
+
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
+
+			iommu_detected = 1;
+			gart_iommu_aperture = 1;
+
+			aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7;
+			aper_size = (32 * 1024 * 1024) << aper_order;
+			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+			aper_base <<= 25;
+
+			printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
+					node, aper_base, aper_size >> 20);
+			node++;
+
+			if (!aperture_valid(aper_base, aper_size, 64<<20)) {
+				if (valid_agp && agp_aper_base &&
+				    agp_aper_base == aper_base &&
+				    agp_aper_order == aper_order) {
+					/* the NB and AGP settings are the same */
+					if (!no_iommu && end_pfn > MAX_DMA32_PFN && !printed_gart_size_msg) {
+						printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
+						printk(KERN_ERR "please increase GART size in your BIOS setup\n");
+						printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+						printed_gart_size_msg = 1;
+					}
+				} else {
+					fix = 1;
+					goto out;
+				}
+			}
 
-		if ((last_aper_order && aper_order != last_aper_order) ||
-		    (last_aper_base && aper_base != last_aper_base)) {
-			fix = 1;
-			break;
+			if ((last_aper_order && aper_order != last_aper_order) ||
+			    (last_aper_base && aper_base != last_aper_base)) {
+				fix = 1;
+				goto out;
+			}
+			last_aper_order = aper_order;
+			last_aper_base = aper_base;
 		}
-		last_aper_order = aper_order;
-		last_aper_base = aper_base;
 	}
 
+out:
 	if (!fix && !fallback_aper_force) {
 		if (last_aper_base) {
 			unsigned long n = (32 * 1024 * 1024) << last_aper_order;
@@ -364,8 +439,10 @@ void __init gart_iommu_hole_init(void)
 		return;
 	}
 
-	if (!fallback_aper_force)
-		aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
+	if (!fallback_aper_force) {
+		aper_alloc = agp_aper_base;
+		aper_order = agp_aper_order;
+	}
 
 	if (aper_alloc) {
 		/* Got the aperture from the AGP bridge */
@@ -401,16 +478,24 @@ void __init gart_iommu_hole_init(void)
 	}
 
 	/* Fix up the north bridges */
-	for (num = 24; num < 32; num++) {
-		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
-			continue;
-
-		/*
-		 * Don't enable translation yet. That is done later.
-		 * Assume this BIOS didn't initialise the GART so
-		 * just overwrite all previous bits
-		 */
-		write_pci_config(0, num, 3, 0x90, aper_order<<1);
-		write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
+	for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = bus_dev_ranges[i].bus;
+		dev_base = bus_dev_ranges[i].dev_base;
+		dev_limit = bus_dev_ranges[i].dev_limit;
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
+
+			/* Don't enable translation yet. That is done later.
+			   Assume this BIOS didn't initialise the GART so
+			   just overwrite all previous bits */
+			write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
+			write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
+		}
 	}
+
+	set_up_gart_resume(aper_order, aper_alloc);
 }
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index f17c1c1bc384..84ce106b33c8 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -61,18 +61,26 @@ static int enable_local_apic __initdata;
 
 /* Local APIC timer verification ok */
 static int local_apic_timer_verify_ok;
-/* Disable local APIC timer from the kernel commandline or via dmi quirk
-   or using CPU MSR check */
-int local_apic_timer_disabled;
+/* Disable local APIC timer from the kernel commandline or via dmi quirk */
+static int local_apic_timer_disabled;
 /* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
 
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
 /*
  * Debug level, exported for io_apic.c
  */
 int apic_verbosity;
 
+int pic_mode;
+
+/* Have we found an MP table */
+int smp_found_config;
+
 static unsigned int calibration_result;
 
 static int lapic_next_event(unsigned long delta,
@@ -1151,9 +1159,6 @@ static int __init detect_init_APIC(void)
 	if (l & MSR_IA32_APICBASE_ENABLE)
 		mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
 
-	if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED)
-		nmi_watchdog = NMI_LOCAL_APIC;
-
 	printk(KERN_INFO "Found and enabled local APIC!\n");
 
 	apic_pm_activate();
@@ -1199,7 +1204,7 @@ void __init init_apic_mappings(void)
 
 		for (i = 0; i < nr_ioapics; i++) {
 			if (smp_found_config) {
-				ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+				ioapic_phys = mp_ioapics[i].mp_apicaddr;
 				if (!ioapic_phys) {
 					printk(KERN_ERR
 					       "WARNING: bogus zero IO-APIC "
@@ -1266,6 +1271,10 @@ int __init APIC_init_uniprocessor(void)
 
 	setup_local_APIC();
 
+#ifdef CONFIG_X86_IO_APIC
+	if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
+#endif
+		localise_nmi_watchdog();
 	end_local_APIC_setup();
 #ifdef CONFIG_X86_IO_APIC
 	if (smp_found_config)
@@ -1348,13 +1357,13 @@ void __init smp_intr_init(void)
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
 	 * IPI, driven by wakeup.
 	 */
-	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
 	/* IPI for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
 
 	/* IPI for generic function call */
-	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 }
 #endif
 
@@ -1367,15 +1376,15 @@ void __init apic_intr_init(void)
 	smp_intr_init();
 #endif
 	/* self generated IPI for local APIC timer */
-	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
 	/* IPI vectors for APIC spurious and error interrupts */
-	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
 	/* thermal monitor LVT interrupt */
 #ifdef CONFIG_X86_MCE_P4THERMAL
-	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
 }
 
@@ -1510,6 +1519,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		 */
 		cpu = 0;
 
+	if (apicid > max_physical_apicid)
+		max_physical_apicid = apicid;
+
 	/*
 	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
 	 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1517,7 +1529,7 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
 	 *       - Ashok Raj <ashok.raj@intel.com>
 	 */
-	if (num_processors > 8) {
+	if (max_physical_apicid >= 8) {
 		switch (boot_cpu_data.x86_vendor) {
 		case X86_VENDOR_INTEL:
 			if (!APIC_XAPIC(version)) {
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 4fd21f7d698c..e494809fc508 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -43,7 +43,7 @@
 #include <mach_ipi.h>
 #include <mach_apic.h>
 
-int disable_apic_timer __cpuinitdata;
+static int disable_apic_timer __cpuinitdata;
 static int apic_calibrate_pmtmr __initdata;
 int disable_apic;
 
@@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
  */
 int apic_verbosity;
 
+/* Have we found an MP table */
+int smp_found_config;
+
 static struct resource lapic_resource = {
 	.name = "Local APIC",
 	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
@@ -419,32 +422,8 @@ void __init setup_boot_APIC_clock(void)
 	setup_APIC_timer();
 }
 
-/*
- * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the
- * C1E flag only in the secondary CPU, so when we detect the wreckage
- * we already have enabled the boot CPU local apic timer. Check, if
- * disable_apic_timer is set and the DUMMY flag is cleared. If yes,
- * set the DUMMY flag again and force the broadcast mode in the
- * clockevents layer.
- */
-static void __cpuinit check_boot_apic_timer_broadcast(void)
-{
-	if (!disable_apic_timer ||
-	    (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
-		return;
-
-	printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n");
-	lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
-
-	local_irq_enable();
-	clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
-			   &boot_cpu_physical_apicid);
-	local_irq_disable();
-}
-
 void __cpuinit setup_secondary_APIC_clock(void)
 {
-	check_boot_apic_timer_broadcast();
 	setup_APIC_timer();
 }
 
@@ -872,7 +851,7 @@ static int __init detect_init_APIC(void)
 
 void __init early_init_lapic_mapping(void)
 {
-	unsigned long apic_phys;
+	unsigned long phys_addr;
 
 	/*
 	 * If no local APIC can be found then go out
@@ -881,11 +860,11 @@ void __init early_init_lapic_mapping(void)
 	if (!smp_found_config)
 		return;
 
-	apic_phys = mp_lapic_addr;
+	phys_addr = mp_lapic_addr;
 
-	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+	set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
 	apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
-				 APIC_BASE, apic_phys);
+		    APIC_BASE, phys_addr);
 
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
@@ -951,6 +930,8 @@ int __init APIC_init_uniprocessor(void)
 	if (!skip_ioapic_setup && nr_ioapics)
 		enable_IO_APIC();
 
+	if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
+		localise_nmi_watchdog();
 	end_local_APIC_setup();
 
 	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
@@ -1087,6 +1068,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		 */
 		cpu = 0;
 	}
+	if (apicid > max_physical_apicid)
+		max_physical_apicid = apicid;
+
 	/* are we being called early in kernel startup? */
 	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
 		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9290e29013..00e6d1370954 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,6 +228,7 @@
 #include <linux/suspend.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>
+#include <linux/smp_lock.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -1149,7 +1150,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
 				as->event_tail = 0;
 		}
 		as->events[as->event_head] = event;
-		if ((!as->suser) || (!as->writer))
+		if (!as->suser || !as->writer)
 			continue;
 		switch (event) {
 		case APM_SYS_SUSPEND:
@@ -1396,7 +1397,7 @@ static void apm_mainloop(void)
 
 static int check_apm_user(struct apm_user *as, const char *func)
 {
-	if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) {
+	if (as == NULL || as->magic != APM_BIOS_MAGIC) {
 		printk(KERN_ERR "apm: %s passed bad filp\n", func);
 		return 1;
 	}
@@ -1459,18 +1460,19 @@ static unsigned int do_poll(struct file *fp, poll_table *wait)
 	return 0;
 }
 
-static int do_ioctl(struct inode *inode, struct file *filp,
-		    u_int cmd, u_long arg)
+static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
 {
 	struct apm_user *as;
+	int ret;
 
 	as = filp->private_data;
 	if (check_apm_user(as, "ioctl"))
 		return -EIO;
-	if ((!as->suser) || (!as->writer))
+	if (!as->suser || !as->writer)
 		return -EPERM;
 	switch (cmd) {
 	case APM_IOC_STANDBY:
+		lock_kernel();
 		if (as->standbys_read > 0) {
 			as->standbys_read--;
 			as->standbys_pending--;
@@ -1479,8 +1481,10 @@ static int do_ioctl(struct inode *inode, struct file *filp,
 			queue_event(APM_USER_STANDBY, as);
 		if (standbys_pending <= 0)
 			standby();
+		unlock_kernel();
 		break;
 	case APM_IOC_SUSPEND:
+		lock_kernel();
 		if (as->suspends_read > 0) {
 			as->suspends_read--;
 			as->suspends_pending--;
@@ -1488,16 +1492,17 @@ static int do_ioctl(struct inode *inode, struct file *filp,
 		} else
 			queue_event(APM_USER_SUSPEND, as);
 		if (suspends_pending <= 0) {
-			return suspend(1);
+			ret = suspend(1);
 		} else {
 			as->suspend_wait = 1;
 			wait_event_interruptible(apm_suspend_waitqueue,
 					as->suspend_wait == 0);
-			return as->suspend_result;
+			ret = as->suspend_result;
 		}
-		break;
+		unlock_kernel();
+		return ret;
 	default:
-		return -EINVAL;
+		return -ENOTTY;
 	}
 	return 0;
 }
@@ -1860,7 +1865,7 @@ static const struct file_operations apm_bios_fops = {
 	.owner		= THIS_MODULE,
 	.read		= do_read,
 	.poll		= do_poll,
-	.ioctl		= do_ioctl,
+	.unlocked_ioctl	= do_ioctl,
 	.open		= do_open,
 	.release	= do_release,
 };
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0c6f8190887..65b1be5fe9ce 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -6,11 +6,15 @@ obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o feature_names.o
 
 obj-$(CONFIG_X86_32)	+= common.o bugs.o
+obj-$(CONFIG_X86_64)	+= bugs_64.o
 obj-$(CONFIG_X86_32)	+= amd.o
+obj-$(CONFIG_X86_64)	+= amd_64.o
 obj-$(CONFIG_X86_32)	+= cyrix.o
 obj-$(CONFIG_X86_32)	+= centaur.o
+obj-$(CONFIG_X86_64)	+= centaur_64.o
 obj-$(CONFIG_X86_32)	+= transmeta.o
 obj-$(CONFIG_X86_32)	+= intel.o
+obj-$(CONFIG_X86_64)	+= intel_64.o
 obj-$(CONFIG_X86_32)	+= umc.o
 
 obj-$(CONFIG_X86_MCE)	+= mcheck/
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c2e1ce33c7cb..84a8220a6072 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -1,9 +1,7 @@
-
 /*
  *	Routines to indentify additional cpu features that are scattered in
  *	cpuid space.
  */
-
 #include <linux/cpu.h>
 
 #include <asm/pat.h>
@@ -53,19 +51,20 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_PAT
 void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 {
+	if (!cpu_has_pat)
+		pat_disable("PAT not supported by CPU.");
+
 	switch (c->x86_vendor) {
-	case X86_VENDOR_AMD:
-		if (c->x86 >= 0xf && c->x86 <= 0x11)
-			return;
-		break;
 	case X86_VENDOR_INTEL:
 		if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
 			return;
 		break;
+	case X86_VENDOR_AMD:
+	case X86_VENDOR_CENTAUR:
+	case X86_VENDOR_TRANSMETA:
+		return;
 	}
 
-	pat_disable(cpu_has_pat ?
-		    "PAT disabled. Not yet verified on this CPU type." :
-		    "PAT not supported by CPU.");
+	pat_disable("PAT disabled. Not yet verified on this CPU type.");
 }
 #endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 245866828294..81a07ca65d44 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -24,43 +24,6 @@
 extern void vide(void);
 __asm__(".align 4\nvide: ret");
 
-#ifdef CONFIG_X86_LOCAL_APIC
-#define ENABLE_C1E_MASK         0x18000000
-#define CPUID_PROCESSOR_SIGNATURE       1
-#define CPUID_XFAM              0x0ff00000
-#define CPUID_XFAM_K8           0x00000000
-#define CPUID_XFAM_10H          0x00100000
-#define CPUID_XFAM_11H          0x00200000
-#define CPUID_XMOD              0x000f0000
-#define CPUID_XMOD_REV_F        0x00040000
-
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
-	u32 lo, hi;
-	u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-	switch (eax & CPUID_XFAM) {
-	case CPUID_XFAM_K8:
-		if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
-			break;
-	case CPUID_XFAM_10H:
-	case CPUID_XFAM_11H:
-		rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
-		if (lo & ENABLE_C1E_MASK) {
-			if (smp_processor_id() != boot_cpu_physical_apicid)
-				printk(KERN_INFO "AMD C1E detected late. "
-				       "	Force timer broadcast.\n");
-			return 1;
-		}
-		break;
-	default:
-		/* err on the side of caution */
-		return 1;
-	}
-	return 0;
-}
-#endif
-
 int force_mwait __cpuinitdata;
 
 static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
@@ -297,11 +260,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 			num_cache_leaves = 3;
 	}
 
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (amd_apic_timer_broken())
-		local_apic_timer_disabled = 1;
-#endif
-
 	/* K6s reports MCEs but don't actually have all the MSRs */
 	if (c->x86 < 6)
 		clear_cpu_cap(c, X86_FEATURE_MCE);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
new file mode 100644
index 000000000000..30b7557c9641
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd_64.c
@@ -0,0 +1,211 @@
+#include <linux/init.h>
+#include <linux/mm.h>
+
+#include <asm/numa_64.h>
+#include <asm/mmconfig.h>
+#include <asm/cacheflush.h>
+
+#include <mach_apic.h>
+
+#include "cpu.h"
+
+int force_mwait __cpuinitdata;
+
+#ifdef CONFIG_NUMA
+static int __cpuinit nearby_node(int apicid)
+{
+	int i, node;
+
+	for (i = apicid - 1; i >= 0; i--) {
+		node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+		node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned bits;
+#ifdef CONFIG_NUMA
+	int cpu = smp_processor_id();
+	int node = 0;
+	unsigned apicid = hard_smp_processor_id();
+#endif
+	bits = c->x86_coreid_bits;
+
+	/* Low order bits define the core id (index of core in socket) */
+	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+	/* Convert the initial APIC ID into the socket ID */
+	c->phys_proc_id = c->initial_apicid >> bits;
+
+#ifdef CONFIG_NUMA
+	node = c->phys_proc_id;
+	if (apicid_to_node[apicid] != NUMA_NO_NODE)
+		node = apicid_to_node[apicid];
+	if (!node_online(node)) {
+		/* Two possibilities here:
+		   - The CPU is missing memory and no node was created.
+		   In that case try picking one from a nearby CPU
+		   - The APIC IDs differ from the HyperTransport node IDs
+		   which the K8 northbridge parsing fills in.
+		   Assume they are all increased by a constant offset,
+		   but in the same order as the HT nodeids.
+		   If that doesn't result in a usable node fall back to the
+		   path for the previous case.  */
+
+		int ht_nodeid = c->initial_apicid;
+
+		if (ht_nodeid >= 0 &&
+		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+			node = apicid_to_node[ht_nodeid];
+		/* Pick a nearby node */
+		if (!node_online(node))
+			node = nearby_node(apicid);
+	}
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned bits, ecx;
+
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level < 0x80000008)
+		return;
+
+	ecx = cpuid_ecx(0x80000008);
+
+	c->x86_max_cores = (ecx & 0xff) + 1;
+
+	/* CPU telling us the core id bits shift? */
+	bits = (ecx >> 12) & 0xF;
+
+	/* Otherwise recompute */
+	if (bits == 0) {
+		while ((1 << bits) < c->x86_max_cores)
+			bits++;
+	}
+
+	c->x86_coreid_bits = bits;
+
+#endif
+}
+
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+	early_init_amd_mc(c);
+
+	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+	if (c->x86_power & (1<<8))
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+{
+	unsigned level;
+
+#ifdef CONFIG_SMP
+	unsigned long value;
+
+	/*
+	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
+	 * bit 6 of msr C001_0015
+	 *
+	 * Errata 63 for SH-B3 steppings
+	 * Errata 122 for all steppings (F+ have it disabled by default)
+	 */
+	if (c->x86 == 15) {
+		rdmsrl(MSR_K8_HWCR, value);
+		value |= 1 << 6;
+		wrmsrl(MSR_K8_HWCR, value);
+	}
+#endif
+
+	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
+	clear_cpu_cap(c, 0*32+31);
+
+	/* On C+ stepping K8 rep microcode works well for copy/memset */
+	level = cpuid_eax(1);
+	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
+			     level >= 0x0f58))
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	if (c->x86 == 0x10 || c->x86 == 0x11)
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+	/* Enable workaround for FXSAVE leak */
+	if (c->x86 >= 6)
+		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
+
+	level = get_model_name(c);
+	if (!level) {
+		switch (c->x86) {
+		case 15:
+			/* Should distinguish Models here, but this is only
+			   a fallback anyways. */
+			strcpy(c->x86_model_id, "Hammer");
+			break;
+		}
+	}
+	display_cacheinfo(c);
+
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level >= 0x80000008)
+		amd_detect_cmp(c);
+
+	if (c->extended_cpuid_level >= 0x80000006 &&
+		(cpuid_edx(0x80000006) & 0xf000))
+		num_cache_leaves = 4;
+	else
+		num_cache_leaves = 3;
+
+	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
+		set_cpu_cap(c, X86_FEATURE_K8);
+
+	/* MFENCE stops RDTSC speculation */
+	set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+
+	if (c->x86 == 0x10)
+		fam10h_check_enable_mmcfg();
+
+	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+		unsigned long long tseg;
+
+		/*
+		 * Split up direct mapping around the TSEG SMM area.
+		 * Don't do it for gbpages because there seems very little
+		 * benefit in doing so.
+		 */
+		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
+		    (tseg >> PMD_SHIFT) <
+			(max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+			set_memory_4k((unsigned long)__va(tseg), 1);
+	}
+}
+
+static struct cpu_dev amd_cpu_dev __cpuinitdata = {
+	.c_vendor	= "AMD",
+	.c_ident	= { "AuthenticAMD" },
+	.c_early_init   = early_init_amd,
+	.c_init		= init_amd,
+};
+
+cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 170d2f5523b2..1b1c56bb338f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -59,8 +59,12 @@ static void __init check_fpu(void)
 		return;
 	}
 
-/* trap_init() enabled FXSR and company _before_ testing for FP problems here. */
-	/* Test for the divl bug.. */
+	/*
+	 * trap_init() enabled FXSR and company _before_ testing for FP
+	 * problems here.
+	 *
+	 * Test for the divl bug..
+	 */
 	__asm__("fninit\n\t"
 		"fldl %1\n\t"
 		"fdivl %2\n\t"
@@ -108,10 +112,15 @@ static void __init check_popad(void)
 	  "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
 	  : "=&a" (res)
 	  : "d" (inp)
-	  : "ecx", "edi" );
-	/* If this fails, it means that any user program may lock the CPU hard. Too bad. */
-	if (res != 12345678) printk( "Buggy.\n" );
-		        else printk( "OK.\n" );
+	  : "ecx", "edi");
+	/*
+	 * If this fails, it means that any user program may lock the
+	 * CPU hard. Too bad.
+	 */
+	if (res != 12345678)
+		printk("Buggy.\n");
+	else
+		printk("OK.\n");
 #endif
 }
 
@@ -137,7 +146,8 @@ static void __init check_config(void)
  * i486+ only features! (WP works in supervisor mode and the
  * new "invlpg" and "bswap" instructions)
  */
-#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_BSWAP)
+#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
+	defined(CONFIG_X86_BSWAP)
 	if (boot_cpu_data.x86 == 3)
 		panic("Kernel requires i486+ for 'invlpg' and other features");
 #endif
@@ -170,6 +180,7 @@ void __init check_bugs(void)
 	check_fpu();
 	check_hlt();
 	check_popad();
-	init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
+	init_utsname()->machine[1] =
+		'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
 	alternative_instructions();
 }
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
index 9a3ed0649d4e..9a3ed0649d4e 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/cpu/bugs_64.c
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
new file mode 100644
index 000000000000..13526fd5cce1
--- /dev/null
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -0,0 +1,43 @@
+#include <linux/init.h>
+#include <linux/smp.h>
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+#include "cpu.h"
+
+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
+	if (c->x86 == 0x6 && c->x86_model >= 0xf)
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+{
+	/* Cache sizes */
+	unsigned n;
+
+	n = c->extended_cpuid_level;
+	if (n >= 0x80000008) {
+		unsigned eax = cpuid_eax(0x80000008);
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+
+	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	}
+	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+}
+
+static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
+	.c_vendor	= "Centaur",
+	.c_ident	= { "CentaurHauls" },
+	.c_early_init	= early_init_centaur,
+	.c_init		= init_centaur,
+};
+
+cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 783691b2a738..4d894e8565fe 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,3 +1,6 @@
+#ifndef ARCH_X86_CPU_H
+
+#define ARCH_X86_CPU_H
 
 struct cpu_model_info {
 	int vendor;
@@ -36,3 +39,5 @@ extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
 
 extern int get_model_name(struct cpuinfo_x86 *c);
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
+
+#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
index f03e9153618e..965ea52767ac 100644
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
@@ -26,9 +26,10 @@
 #define NFORCE2_SAFE_DISTANCE 50
 
 /* Delay in ms between FSB changes */
-//#define NFORCE2_DELAY 10
+/* #define NFORCE2_DELAY 10 */
 
-/* nforce2_chipset:
+/*
+ * nforce2_chipset:
  * FSB is changed using the chipset
  */
 static struct pci_dev *nforce2_chipset_dev;
@@ -36,13 +37,13 @@ static struct pci_dev *nforce2_chipset_dev;
 /* fid:
  * multiplier * 10
  */
-static int fid = 0;
+static int fid;
 
 /* min_fsb, max_fsb:
  * minimum and maximum FSB (= FSB at boot time)
  */
-static int min_fsb = 0;
-static int max_fsb = 0;
+static int min_fsb;
+static int max_fsb;
 
 MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
 MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
@@ -53,7 +54,7 @@ module_param(min_fsb, int, 0444);
 
 MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
 MODULE_PARM_DESC(min_fsb,
-                 "Minimum FSB to use, if not defined: current FSB - 50");
+		"Minimum FSB to use, if not defined: current FSB - 50");
 
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "cpufreq-nforce2", msg)
 
@@ -139,7 +140,7 @@ static unsigned int nforce2_fsb_read(int bootfsb)
 
 	/* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
 	nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
-						0x01EF,PCI_ANY_ID,PCI_ANY_ID,NULL);
+						0x01EF, PCI_ANY_ID, PCI_ANY_ID, NULL);
 	if (!nforce2_sub5)
 		return 0;
 
@@ -147,13 +148,13 @@ static unsigned int nforce2_fsb_read(int bootfsb)
 	fsb /= 1000000;
 
 	/* Check if PLL register is already set */
-	pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
+	pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
 
-	if(bootfsb || !temp)
+	if (bootfsb || !temp)
 		return fsb;
-		
+
 	/* Use PLL register FSB value */
-	pci_read_config_dword(nforce2_chipset_dev,NFORCE2_PLLREG, &temp);
+	pci_read_config_dword(nforce2_chipset_dev, NFORCE2_PLLREG, &temp);
 	fsb = nforce2_calc_fsb(temp);
 
 	return fsb;
@@ -184,7 +185,7 @@ static int nforce2_set_fsb(unsigned int fsb)
 	}
 
 	/* First write? Then set actual value */
-	pci_read_config_byte(nforce2_chipset_dev,NFORCE2_PLLENABLE, (u8 *)&temp);
+	pci_read_config_byte(nforce2_chipset_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
 	if (!temp) {
 		pll = nforce2_calc_pll(tfsb);
 
@@ -210,7 +211,8 @@ static int nforce2_set_fsb(unsigned int fsb)
 			tfsb--;
 
 		/* Calculate the PLL reg. value */
-		if ((pll = nforce2_calc_pll(tfsb)) == -1)
+		pll = nforce2_calc_pll(tfsb);
+		if (pll == -1)
 			return -EINVAL;
 
 		nforce2_write_pll(pll);
@@ -249,7 +251,7 @@ static unsigned int nforce2_get(unsigned int cpu)
 static int nforce2_target(struct cpufreq_policy *policy,
 			  unsigned int target_freq, unsigned int relation)
 {
-//        unsigned long         flags;
+/*        unsigned long         flags; */
 	struct cpufreq_freqs freqs;
 	unsigned int target_fsb;
 
@@ -271,17 +273,17 @@ static int nforce2_target(struct cpufreq_policy *policy,
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 
 	/* Disable IRQs */
-	//local_irq_save(flags);
+	/* local_irq_save(flags); */
 
 	if (nforce2_set_fsb(target_fsb) < 0)
 		printk(KERN_ERR "cpufreq: Changing FSB to %d failed\n",
-                       target_fsb);
+			target_fsb);
 	else
 		dprintk("Changed FSB successfully to %d\n",
-                       target_fsb);
+			target_fsb);
 
 	/* Enable IRQs */
-	//local_irq_restore(flags);
+	/* local_irq_restore(flags); */
 
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 
@@ -302,8 +304,8 @@ static int nforce2_verify(struct cpufreq_policy *policy)
 		policy->max = (fsb_pol_max + 1) * fid * 100;
 
 	cpufreq_verify_within_limits(policy,
-                                     policy->cpuinfo.min_freq,
-                                     policy->cpuinfo.max_freq);
+				     policy->cpuinfo.min_freq,
+				     policy->cpuinfo.max_freq);
 	return 0;
 }
 
@@ -347,7 +349,7 @@ static int nforce2_cpu_init(struct cpufreq_policy *policy)
 	/* Set maximum FSB to FSB at boot time */
 	max_fsb = nforce2_fsb_read(1);
 
-	if(!max_fsb)
+	if (!max_fsb)
 		return -EIO;
 
 	if (!min_fsb)
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
new file mode 100644
index 000000000000..fcb1cc9d75ca
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_64.c
@@ -0,0 +1,103 @@
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <asm/processor.h>
+#include <asm/ptrace.h>
+#include <asm/topology.h>
+#include <asm/numa_64.h>
+
+#include "cpu.h"
+
+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+{
+	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
+	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+}
+
+/*
+ * find out the number of processor cores on the die
+ */
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
+{
+	unsigned int eax, t;
+
+	if (c->cpuid_level < 4)
+		return 1;
+
+	cpuid_count(4, 0, &eax, &t, &t, &t);
+
+	if (eax & 0x1f)
+		return ((eax >> 26) + 1);
+	else
+		return 1;
+}
+
+static void __cpuinit srat_detect_node(void)
+{
+#ifdef CONFIG_NUMA
+	unsigned node;
+	int cpu = smp_processor_id();
+	int apicid = hard_smp_processor_id();
+
+	/* Don't do the funky fallback heuristics the AMD version employs
+	   for now. */
+	node = apicid_to_node[apicid];
+	if (node == NUMA_NO_NODE || !node_online(node))
+		node = first_node(node_online_map);
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+{
+	/* Cache sizes */
+	unsigned n;
+
+	init_intel_cacheinfo(c);
+	if (c->cpuid_level > 9) {
+		unsigned eax = cpuid_eax(10);
+		/* Check for version and the number of counters */
+		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+	}
+
+	if (cpu_has_ds) {
+		unsigned int l1, l2;
+		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<11)))
+			set_cpu_cap(c, X86_FEATURE_BTS);
+		if (!(l1 & (1<<12)))
+			set_cpu_cap(c, X86_FEATURE_PEBS);
+	}
+
+
+	if (cpu_has_bts)
+		ds_init_intel(c);
+
+	n = c->extended_cpuid_level;
+	if (n >= 0x80000008) {
+		unsigned eax = cpuid_eax(0x80000008);
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
+	}
+
+	if (c->x86 == 15)
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+	if (c->x86 == 6)
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+	c->x86_max_cores = intel_num_cpu_cores(c);
+
+	srat_detect_node();
+}
+
+static struct cpu_dev intel_cpu_dev __cpuinitdata = {
+	.c_vendor	= "Intel",
+	.c_ident	= { "GenuineIntel" },
+	.c_early_init   = early_init_intel,
+	.c_init		= init_intel,
+};
+cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
+
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 26d615dcb149..2c8afafa18e8 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -62,6 +62,7 @@ static struct _cache_table cache_table[] __cpuinitdata =
 	{ 0x4b, LVL_3,      8192 },	/* 16-way set assoc, 64 byte line size */
 	{ 0x4c, LVL_3,     12288 },	/* 12-way set assoc, 64 byte line size */
 	{ 0x4d, LVL_3,     16384 },	/* 16-way set assoc, 64 byte line size */
+	{ 0x4e, LVL_2,      6144 },	/* 24-way set assoc, 64 byte line size */
 	{ 0x60, LVL_1_DATA, 16 },	/* 8-way set assoc, sectored cache, 64 byte line size */
 	{ 0x66, LVL_1_DATA, 8 },	/* 4-way set assoc, sectored cache, 64 byte line size */
 	{ 0x67, LVL_1_DATA, 16 },	/* 4-way set assoc, sectored cache, 64 byte line size */
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index e633c9c2b764..f390c9f66351 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -9,23 +9,23 @@
 #include <linux/interrupt.h>
 #include <linux/smp.h>
 
-#include <asm/processor.h> 
+#include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
 
 /* Machine Check Handler For AMD Athlon/Duron */
-static void k7_machine_check(struct pt_regs * regs, long error_code)
+static void k7_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover=1;
+	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
 	int i;
 
-	rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 	if (mcgstl & (1<<0))	/* Recoverable ? */
-		recover=0;
+		recover = 0;
 
 	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
 		smp_processor_id(), mcgsth, mcgstl);
@@ -60,12 +60,12 @@ static void k7_machine_check(struct pt_regs * regs, long error_code)
 	}
 
 	if (recover&2)
-		panic ("CPU context corrupt");
+		panic("CPU context corrupt");
 	if (recover&1)
-		panic ("Unable to continue");
-	printk (KERN_EMERG "Attempting to continue.\n");
+		panic("Unable to continue");
+	printk(KERN_EMERG "Attempting to continue.\n");
 	mcgstl &= ~(1<<2);
-	wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
 
@@ -81,25 +81,25 @@ void amd_mcheck_init(struct cpuinfo_x86 *c)
 	machine_check_vector = k7_machine_check;
 	wmb();
 
-	printk (KERN_INFO "Intel machine check architecture supported.\n");
-	rdmsr (MSR_IA32_MCG_CAP, l, h);
+	printk(KERN_INFO "Intel machine check architecture supported.\n");
+	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
-		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 	nr_mce_banks = l & 0xff;
 
 	/* Clear status for MC index 0 separately, we don't touch CTL,
 	 * as some K7 Athlons cause spurious MCEs when its enabled. */
 	if (boot_cpu_data.x86 == 6) {
-		wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0);
+		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
 		i = 1;
 	} else
 		i = 0;
-	for (; i<nr_mce_banks; i++) {
-		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
-		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+	for (; i < nr_mce_banks; i++) {
+		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
 	}
 
-	set_in_cr4 (X86_CR4_MCE);
-	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+	set_in_cr4(X86_CR4_MCE);
+	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
 		smp_processor_id());
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index e07e8c068ae0..501ca1cea27d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -31,7 +31,7 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_BANKS 6
+#define NR_SYSFS_BANKS 6
 
 atomic_t mce_entry;
 
@@ -46,7 +46,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -209,7 +209,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
-		if (!bank[i])
+		if (i < NR_SYSFS_BANKS && !bank[i])
 			continue;
 
 		m.misc = 0;
@@ -444,9 +444,10 @@ static void mce_init(void *dummy)
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	banks = cap & 0xff;
-	if (banks > NR_BANKS) {
-		printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
-		banks = NR_BANKS;
+	if (banks > MCE_EXTENDED_BANK) {
+		banks = MCE_EXTENDED_BANK;
+		printk(KERN_INFO "MCE: warning: using only %d banks\n",
+		       MCE_EXTENDED_BANK);
 	}
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
@@ -462,7 +463,11 @@ static void mce_init(void *dummy)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+		if (i < NR_SYSFS_BANKS)
+			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+		else
+			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
+
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
@@ -766,7 +771,10 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/* TBD should generate these dynamically based on number of available banks */
+/*
+ * TBD should generate these dynamically based on number of available banks.
+ * Have only 6 control banks in sysfs until then.
+ */
 ACCESSOR(bank0ctl,bank[0],mce_restart())
 ACCESSOR(bank1ctl,bank[1],mce_restart())
 ACCESSOR(bank2ctl,bank[2],mce_restart())
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index cb03345554a5..eef001ad3bde 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -8,7 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/smp.h>
 
-#include <asm/processor.h> 
+#include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/msr.h>
 #include <asm/apic.h>
@@ -32,12 +32,12 @@ struct intel_mce_extended_msrs {
 	/* u32 *reserved[]; */
 };
 
-static int mce_num_extended_msrs = 0;
+static int mce_num_extended_msrs;
 
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
 static void unexpected_thermal_interrupt(struct pt_regs *regs)
-{	
+{
 	printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n",
 			smp_processor_id());
 	add_taint(TAINT_MACHINE_CHECK);
@@ -83,7 +83,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	 * be some SMM goo which handles it, so we can't even put a handler
 	 * since it might be delivered via SMI already -zwanem.
 	 */
-	rdmsr (MSR_IA32_MISC_ENABLE, l, h);
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 	h = apic_read(APIC_LVTTHMR);
 	if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
 		printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",
@@ -91,7 +91,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 		return; /* -EBUSY */
 	}
 
-	/* check whether a vector already exists, temporarily masked? */	
+	/* check whether a vector already exists, temporarily masked? */
 	if (h & APIC_VECTOR_MASK) {
 		printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already "
 				"installed\n",
@@ -104,18 +104,18 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
 	apic_write_around(APIC_LVTTHMR, h);
 
-	rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
-	wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
+	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
 
 	/* ok we're good to go... */
 	vendor_thermal_interrupt = intel_thermal_interrupt;
-	
-	rdmsr (MSR_IA32_MISC_ENABLE, l, h);
-	wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
 
-	l = apic_read (APIC_LVTTHMR);
-	apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
-	printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+	wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
+
+	l = apic_read(APIC_LVTTHMR);
+	apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
 
 	/* enable thermal throttle processing */
 	atomic_set(&therm_throt_en, 1);
@@ -129,28 +129,28 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
 {
 	u32 h;
 
-	rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
-	rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
-	rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
-	rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
-	rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
-	rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
-	rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
-	rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
-	rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
-	rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
+	rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
+	rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
+	rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
+	rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
+	rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
+	rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
+	rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
+	rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
+	rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
+	rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
 }
 
-static void intel_machine_check(struct pt_regs * regs, long error_code)
+static void intel_machine_check(struct pt_regs *regs, long error_code)
 {
-	int recover=1;
+	int recover = 1;
 	u32 alow, ahigh, high, low;
 	u32 mcgstl, mcgsth;
 	int i;
 
-	rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 	if (mcgstl & (1<<0))	/* Recoverable ? */
-		recover=0;
+		recover = 0;
 
 	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
 		smp_processor_id(), mcgsth, mcgstl);
@@ -191,20 +191,20 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
 	}
 
 	if (recover & 2)
-		panic ("CPU context corrupt");
+		panic("CPU context corrupt");
 	if (recover & 1)
-		panic ("Unable to continue");
+		panic("Unable to continue");
 
 	printk(KERN_EMERG "Attempting to continue.\n");
-	/* 
-	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
+	/*
+	 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 	 * recoverable/continuable.This will allow BIOS to look at the MSRs
 	 * for errors if the OS could not log the error.
 	 */
-	for (i=0; i<nr_mce_banks; i++) {
+	for (i = 0; i < nr_mce_banks; i++) {
 		u32 msr;
 		msr = MSR_IA32_MC0_STATUS+i*4;
-		rdmsr (msr, low, high);
+		rdmsr(msr, low, high);
 		if (high&(1<<31)) {
 			/* Clear it */
 			wrmsr(msr, 0UL, 0UL);
@@ -214,7 +214,7 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
 		}
 	}
 	mcgstl &= ~(1<<2);
-	wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
 }
 
 
@@ -222,30 +222,30 @@ void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
-	
+
 	machine_check_vector = intel_machine_check;
 	wmb();
 
-	printk (KERN_INFO "Intel machine check architecture supported.\n");
-	rdmsr (MSR_IA32_MCG_CAP, l, h);
+	printk(KERN_INFO "Intel machine check architecture supported.\n");
+	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
-		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 	nr_mce_banks = l & 0xff;
 
-	for (i=0; i<nr_mce_banks; i++) {
-		wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
-		wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+	for (i = 0; i < nr_mce_banks; i++) {
+		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
 	}
 
-	set_in_cr4 (X86_CR4_MCE);
-	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+	set_in_cr4(X86_CR4_MCE);
+	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
 		smp_processor_id());
 
 	/* Check for P4/Xeon extended MCE MSRs */
-	rdmsr (MSR_IA32_MCG_CAP, l, h);
+	rdmsr(MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<9))	{/* MCG_EXT_P */
 		mce_num_extended_msrs = (l >> 16) & 0xff;
-		printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
+		printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
 				" available\n",
 			smp_processor_id(), mce_num_extended_msrs);
 
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 5d241ce94a44..509bd3d9eacd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -37,7 +37,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
 static unsigned long smp_changes_mask;
 static struct mtrr_state mtrr_state = {};
 static int mtrr_state_set;
-static u64 tom2;
+u64 mtrr_tom2;
 
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "mtrr."
@@ -139,8 +139,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
 		}
 	}
 
-	if (tom2) {
-		if (start >= (1ULL<<32) && (end < tom2))
+	if (mtrr_tom2) {
+		if (start >= (1ULL<<32) && (end < mtrr_tom2))
 			return MTRR_TYPE_WRBACK;
 	}
 
@@ -158,6 +158,20 @@ get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
 	rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi);
 }
 
+/*  fill the MSR pair relating to a var range  */
+void fill_mtrr_var_range(unsigned int index,
+		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi)
+{
+	struct mtrr_var_range *vr;
+
+	vr = mtrr_state.var_ranges;
+
+	vr[index].base_lo = base_lo;
+	vr[index].base_hi = base_hi;
+	vr[index].mask_lo = mask_lo;
+	vr[index].mask_hi = mask_hi;
+}
+
 static void
 get_fixed_ranges(mtrr_type * frs)
 {
@@ -213,13 +227,13 @@ void __init get_mtrr_state(void)
 	mtrr_state.enabled = (lo & 0xc00) >> 10;
 
 	if (amd_special_default_mtrr()) {
-		unsigned lo, hi;
+		unsigned low, high;
 		/* TOP_MEM2 */
-		rdmsr(MSR_K8_TOP_MEM2, lo, hi);
-		tom2 = hi;
-		tom2 <<= 32;
-		tom2 |= lo;
-		tom2 &= 0xffffff8000000ULL;
+		rdmsr(MSR_K8_TOP_MEM2, low, high);
+		mtrr_tom2 = high;
+		mtrr_tom2 <<= 32;
+		mtrr_tom2 |= low;
+		mtrr_tom2 &= 0xffffff800000ULL;
 	}
 	if (mtrr_show) {
 		int high_width;
@@ -251,9 +265,9 @@ void __init get_mtrr_state(void)
 			else
 				printk(KERN_INFO "MTRR %u disabled\n", i);
 		}
-		if (tom2) {
+		if (mtrr_tom2) {
 			printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
-					  tom2, tom2>>20);
+					  mtrr_tom2, mtrr_tom2>>20);
 		}
 	}
 	mtrr_state_set = 1;
@@ -328,7 +342,7 @@ static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords)
 
 	if (lo != msrwords[0] || hi != msrwords[1]) {
 		if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		    boot_cpu_data.x86 == 15 &&
+		    (boot_cpu_data.x86 >= 0x0f && boot_cpu_data.x86 <= 0x11) &&
 		    ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK))
 			k8_enable_fixed_iorrs();
 		mtrr_wrmsr(msr, msrwords[0], msrwords[1]);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6a1e278d9323..105afe12beb0 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -37,6 +37,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/sort.h>
 
 #include <asm/e820.h>
 #include <asm/mtrr.h>
@@ -609,6 +610,787 @@ static struct sysdev_driver mtrr_sysdev_driver = {
 	.resume		= mtrr_restore,
 };
 
+/* should be related to MTRR_VAR_RANGES nums */
+#define RANGE_NUM 256
+
+struct res_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int __init
+add_range(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	/* out of slots */
+	if (nr_range >= RANGE_NUM)
+		return nr_range;
+
+	range[nr_range].start = start;
+	range[nr_range].end = end;
+
+	nr_range++;
+
+	return nr_range;
+}
+
+static int __init
+add_range_with_merge(struct res_range *range, int nr_range, unsigned long start,
+			      unsigned long end)
+{
+	int i;
+
+	/* try to merge it with old one */
+	for (i = 0; i < nr_range; i++) {
+		unsigned long final_start, final_end;
+		unsigned long common_start, common_end;
+
+		if (!range[i].end)
+			continue;
+
+		common_start = max(range[i].start, start);
+		common_end = min(range[i].end, end);
+		if (common_start > common_end + 1)
+			continue;
+
+		final_start = min(range[i].start, start);
+		final_end = max(range[i].end, end);
+
+		range[i].start = final_start;
+		range[i].end =  final_end;
+		return nr_range;
+	}
+
+	/* need to add that */
+	return add_range(range, nr_range, start, end);
+}
+
+static void __init
+subtract_range(struct res_range *range, unsigned long start, unsigned long end)
+{
+	int i, j;
+
+	for (j = 0; j < RANGE_NUM; j++) {
+		if (!range[j].end)
+			continue;
+
+		if (start <= range[j].start && end >= range[j].end) {
+			range[j].start = 0;
+			range[j].end = 0;
+			continue;
+		}
+
+		if (start <= range[j].start && end < range[j].end &&
+		    range[j].start < end + 1) {
+			range[j].start = end + 1;
+			continue;
+		}
+
+
+		if (start > range[j].start && end >= range[j].end &&
+		    range[j].end > start - 1) {
+			range[j].end = start - 1;
+			continue;
+		}
+
+		if (start > range[j].start && end < range[j].end) {
+			/* find the new spare */
+			for (i = 0; i < RANGE_NUM; i++) {
+				if (range[i].end == 0)
+					break;
+			}
+			if (i < RANGE_NUM) {
+				range[i].end = range[j].end;
+				range[i].start = end + 1;
+			} else {
+				printk(KERN_ERR "run of slot in ranges\n");
+			}
+			range[j].end = start - 1;
+			continue;
+		}
+	}
+}
+
+/* sort() comparator: orders res_range entries by ascending start. */
+static int __init cmp_range(const void *x1, const void *x2)
+{
+	const struct res_range *r1 = x1;
+	const struct res_range *r2 = x2;
+
+	/* Don't subtract: the long difference may not fit the int return. */
+	if (r1->start < r2->start)
+		return -1;
+	return r1->start > r2->start;
+}
+
+struct var_mtrr_range_state {
+	unsigned long base_pfn;
+	unsigned long size_pfn;
+	mtrr_type type;
+};
+
+struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static int __initdata debug_print;
+
+static int __init
+x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
+		       unsigned long extra_remove_base,
+		       unsigned long extra_remove_size)
+{
+	unsigned long i, base, size;
+	mtrr_type type;
+
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_WRBACK)
+			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
+		nr_range = add_range_with_merge(range, nr_range, base,
+						base + size - 1);
+	}
+	if (debug_print) {
+		printk(KERN_DEBUG "After WB checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* take out UC ranges */
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type != MTRR_TYPE_UNCACHABLE)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			continue;
+		base = range_state[i].base_pfn;
+		subtract_range(range, base, base + size - 1);
+	}
+	if (extra_remove_size)
+		subtract_range(range, extra_remove_base,
+				 extra_remove_base + extra_remove_size  - 1);
+
+	/* get new range num */
+	nr_range = 0;
+	for (i = 0; i < RANGE_NUM; i++) {
+		if (!range[i].end)
+			continue;
+		nr_range++;
+	}
+	if  (debug_print) {
+		printk(KERN_DEBUG "After UC checking\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+	if  (debug_print) {
+		printk(KERN_DEBUG "After sorting\n");
+		for (i = 0; i < nr_range; i++)
+			printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
+				 range[i].start, range[i].end + 1);
+	}
+
+	/* clear those is not used */
+	for (i = nr_range; i < RANGE_NUM; i++)
+		memset(&range[i], 0, sizeof(range[i]));
+
+	return nr_range;
+}
+
+static struct res_range __initdata range[RANGE_NUM];
+
+#ifdef CONFIG_MTRR_SANITIZER
+
+static unsigned long __init sum_ranges(struct res_range *range, int nr_range)
+{
+	unsigned long sum;
+	int i;
+
+	sum = 0;
+	for (i = 0; i < nr_range; i++)
+		sum += range[i].end + 1 - range[i].start;
+
+	return sum;
+}
+
+static int enable_mtrr_cleanup __initdata =
+	CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT;
+
+static int __init disable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 0;
+	return 0;
+}
+early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
+/* "enable_mtrr_cleanup" boot option: set the flag unless it is -1. */
+static int __init enable_mtrr_cleanup_setup(char *str)
+{
+	if (enable_mtrr_cleanup != -1)
+		enable_mtrr_cleanup = 1;
+	return 0;
+}
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+struct var_mtrr_state {
+	unsigned long	range_startk;
+	unsigned long	range_sizek;
+	unsigned long	chunk_sizek;
+	unsigned long	gran_sizek;
+	unsigned int	reg;
+};
+
+static void __init
+set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type, unsigned int address_bits)
+{
+	u32 base_lo, base_hi, mask_lo, mask_hi;
+	u64 base, mask;
+
+	if (!sizek) {
+		fill_mtrr_var_range(reg, 0, 0, 0, 0);
+		return;
+	}
+
+	mask = (1ULL << address_bits) - 1;
+	mask &= ~((((u64)sizek) << 10) - 1);
+
+	base  = ((u64)basek) << 10;
+
+	base |= type;
+	mask |= 0x800;
+
+	base_lo = base & ((1ULL<<32) - 1);
+	base_hi = base >> 32;
+
+	mask_lo = mask & ((1ULL<<32) - 1);
+	mask_hi = mask >> 32;
+
+	fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi);
+}
+
+static void __init
+save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek,
+		unsigned char type)
+{
+	range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10);
+	range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10);
+	range_state[reg].type = type;
+}
+
+static void __init
+set_var_mtrr_all(unsigned int address_bits)
+{
+	unsigned long basek, sizek;
+	unsigned char type;
+	unsigned int reg;
+
+	for (reg = 0; reg < num_var_ranges; reg++) {
+		basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10);
+		sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10);
+		type = range_state[reg].type;
+
+		set_var_mtrr(reg, basek, sizek, type, address_bits);
+	}
+}
+
+/* Split a KB range into power-of-two chunks; returns the next unused reg. */
+static unsigned int __init
+range_to_mtrr(unsigned int reg, unsigned long range_startk,
+	      unsigned long range_sizek, unsigned char type)
+{
+	if (!range_sizek || (reg >= num_var_ranges))
+		return reg;
+
+	while (range_sizek) {
+		unsigned long max_align, align, sizek;
+
+		/* Compute the maximum size I can make a range */
+		if (range_startk)
+			max_align = ffs(range_startk) - 1;
+		else
+			max_align = 32;
+		align = fls(range_sizek) - 1;
+		if (align > max_align)
+			align = max_align;
+
+		/* 1UL: a plain int 1 overflows when align reaches 31 */
+		sizek = 1UL << align;
+		if (debug_print)
+			printk(KERN_DEBUG "Setting variable MTRR %d, "
+				"base: %ldMB, range: %ldMB, type %s\n",
+				reg, range_startk >> 10, sizek >> 10,
+				(type == MTRR_TYPE_UNCACHABLE)?"UC":
+				    ((type == MTRR_TYPE_WRBACK)?"WB":"Other"));
+		save_var_mtrr(reg++, range_startk, sizek, type);
+		range_startk += sizek;
+		range_sizek -= sizek;
+		if (reg >= num_var_ranges)
+			break;
+	}
+	return reg;
+}
+
+static unsigned __init
+range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
+			unsigned long sizek)
+{
+	unsigned long hole_basek, hole_sizek;	/* UC range cut out of the WB one */
+	unsigned long second_basek, second_sizek; /* head of the next range */
+	unsigned long range0_basek, range0_sizek; /* leading chunk multiple */
+	unsigned long range_basek, range_sizek;
+	unsigned long chunk_sizek;
+	unsigned long gran_sizek;
+
+	hole_basek = 0;
+	hole_sizek = 0;
+	second_basek = 0;
+	second_sizek = 0;
+	chunk_sizek = state->chunk_sizek;
+	gran_sizek = state->gran_sizek;
+
+	/* align with gran size, so that small blocks do not use up MTRRs */
+	range_basek = ALIGN(state->range_startk, gran_sizek);
+	if ((range_basek > basek) && basek)
+		return second_sizek;	/* still 0 here */
+	state->range_sizek -= (range_basek - state->range_startk);
+	range_sizek = ALIGN(state->range_sizek, gran_sizek);
+
+	while (range_sizek > state->range_sizek) {
+		range_sizek -= gran_sizek;
+		if (!range_sizek)
+			return 0;
+	}
+	state->range_sizek = range_sizek;
+
+	/* try to append some small hole */
+	range0_basek = state->range_startk;
+	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+	if (range0_sizek == state->range_sizek) {
+		/* already a whole number of chunks: no hole is needed */
+		if (debug_print)
+			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + state->range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				state->range_sizek, MTRR_TYPE_WRBACK);
+		return 0;
+	}
+
+	range0_sizek -= chunk_sizek;
+	if (range0_sizek && sizek) {
+	    while (range0_basek + range0_sizek > (basek + sizek)) {
+		range0_sizek -= chunk_sizek;
+		if (!range0_sizek)
+			break;
+	    }
+	}
+
+	if (range0_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "range0: %016lx - %016lx\n",
+				range0_basek<<10,
+				(range0_basek + range0_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range0_basek,
+				range0_sizek, MTRR_TYPE_WRBACK);
+
+	}
+
+	range_basek = range0_basek + range0_sizek;
+	range_sizek = chunk_sizek;
+
+	if (range_basek + range_sizek > basek &&
+	    range_basek + range_sizek <= (basek + sizek)) {
+		/* one hole: the chunk ends inside the next range */
+		second_basek = basek;
+		second_sizek = range_basek + range_sizek - basek;
+	}
+
+	/* if this is the last piece, there can only be one hole near the end */
+	if ((second_basek || !basek) &&
+	    range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
+	    (chunk_sizek >> 1)) {
+		/*
+		 * one hole in middle (second_sizek is non-zero) or at end
+		 * (second_sizek is 0)
+		 */
+		hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
+				 - second_sizek;
+		hole_basek = range_basek + range_sizek - hole_sizek
+				 - second_sizek;
+	} else {
+		/* fallback for big hole, or several holes */
+		range_sizek = state->range_sizek - range0_sizek;
+		second_basek = 0;
+		second_sizek = 0;
+	}
+
+	if (debug_print)
+		printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
+			 (range_basek + range_sizek)<<10);
+	state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
+					 MTRR_TYPE_WRBACK);
+	if (hole_sizek) {
+		if (debug_print)
+			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
+				 hole_basek<<10, (hole_basek + hole_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
+						 MTRR_TYPE_UNCACHABLE);
+
+	}
+
+	return second_sizek;	/* KiB of the next range already covered */
+}
+
+/* Add one range (in pfns) to *state, emitting the pending range if needed. */
+static void __init
+set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
+		   unsigned long size_pfn)
+{
+	unsigned long start_kb = base_pfn << (PAGE_SHIFT - 10);
+	unsigned long len_kb = size_pfn << (PAGE_SHIFT - 10);
+	unsigned long taken_kb = 0;
+
+	/* every register is already in use */
+	if (state->reg >= num_var_ranges)
+		return;
+
+	/* Low (<= 1M) or directly after the pending range: just extend it */
+	if (start_kb <= 1024 ||
+	    state->range_startk + state->range_sizek == start_kb) {
+		state->range_sizek = start_kb + len_kb - state->range_startk;
+		return;
+	}
+
+	/* Emit the pending range; it may absorb the head of the new one */
+	if (state->range_sizek)
+		taken_kb = range_to_mtrr_with_hole(state, start_kb, len_kb);
+
+	/* What is left of the new range becomes the pending range */
+	state->range_startk = start_kb + taken_kb;
+	state->range_sizek = len_kb - taken_kb;
+}
+
+/* minimum size of an mtrr block that can take a hole */
+static u64 mtrr_chunk_size __initdata = (256ULL<<20);	/* 256M default */
+
+static int __init parse_mtrr_chunk_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_chunk_size = memparse(p, &p);	/* end pointer is not used */
+	return 0;
+}
+early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt);
+
+/* granularity of an mtrr block; stays 0 unless "mtrr_gran_size" is parsed */
+static u64 mtrr_gran_size __initdata;
+
+static int __init parse_mtrr_gran_size_opt(char *p)
+{
+	if (!p)
+		return -EINVAL;
+	mtrr_gran_size = memparse(p, &p);	/* end pointer is not used */
+	return 0;
+}
+early_param("mtrr_gran_size", parse_mtrr_gran_size_opt);
+
+static int nr_mtrr_spare_reg __initdata =	/* variable MTRRs to keep free */
+				 CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT;
+
+static int __init parse_mtrr_spare_reg(char *arg)
+{
+	if (arg)
+		nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0);
+	return 0;	/* a NULL arg is not an error: the value stays as is */
+}
+
+early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
+
+/*
+ * Rebuild range_state[] for the given ranges and chunk/granularity sizes
+ * (bytes).  Returns how many entries were used; the rest are cleared.
+ */
+static int __init
+x86_setup_var_mtrrs(struct res_range *range, int nr_range,
+		    u64 chunk_size, u64 gran_size)
+{
+	struct var_mtrr_state st;
+	int n, used;
+
+	memset(range_state, 0, sizeof(range_state));
+	st.reg = 0;
+	st.range_startk = 0;
+	st.range_sizek = 0;
+	st.chunk_sizek = chunk_size >> 10;
+	st.gran_sizek = gran_size >> 10;
+
+	/* Feed in every range; ->end is inclusive, hence the + 1 */
+	for (n = 0; n < nr_range; n++)
+		set_var_mtrr_range(&st, range[n].start,
+				   range[n].end - range[n].start + 1);
+
+	/* Flush the range that is still pending */
+	if (st.range_sizek)
+		range_to_mtrr_with_hole(&st, 0, 0);
+
+	/* Zero the entries behind the ones just filled in */
+	used = st.reg;
+	for (; st.reg < num_var_ranges; st.reg++)
+		save_var_mtrr(st.reg, 0, 0, 0);
+
+	return used;
+}
+
+struct mtrr_cleanup_result {
+	unsigned long gran_sizek;	/* granularity tried, in KiB */
+	unsigned long chunk_sizek;	/* chunk size tried, in KiB */
+	unsigned long lose_cover_sizek;	/* coverage difference, in KiB */
+	unsigned int num_reg;		/* registers this layout uses */
+	int bad;			/* layout must not be used */
+};
+
+/*
+ * gran_size: 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 4G
+ * so we need (2+13)*6 = 90 entries
+ */
+#define NUM_RESULT	90
+#define PSHIFT		(PAGE_SHIFT - 10)	/* page count -> KiB */
+
+static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
+static struct res_range __initdata range_new[RANGE_NUM];
+static unsigned long __initdata min_loss_pfn[RANGE_NUM]; /* by num_reg */
+
+/*
+ * For a WB+UC-only variable MTRR set (default type UC), look for a gran/chunk
+ * size whose layout covers the same RAM; returns 1 if one was set, else 0.
+ */
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+	unsigned long extra_remove_base, extra_remove_size;
+	unsigned long i, base, size, def, dummy, range_sums, range_sums_new;
+	mtrr_type type;
+	int nr_range, nr_range_new, index_good, num_reg_good;
+	u64 chunk_size, gran_size;
+
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
+
+	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
+		return 0;
+	rdmsr(MTRRdefType_MSR, def, dummy);
+	def &= 0xff;
+	if (def != MTRR_TYPE_UNCACHABLE)
+		return 0;
+
+	/* read the current variable MTRRs and keep a copy */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
+
+	/* count entries per type; empty ones go to the extra slot */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		size = range_state[i].size_pfn;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* check if we got UC entries */
+	if (!num[MTRR_TYPE_UNCACHABLE])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	/* always set the base: it is passed on even when mtrr_tom2 is 0 */
+	extra_remove_base = 1 << (32 - PAGE_SHIFT);
+	extra_remove_size = 0;
+	if (mtrr_tom2)
+		extra_remove_size =
+			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
+	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
+					  extra_remove_size);
+	range_sums = sum_ranges(range, nr_range);
+	printk(KERN_INFO "total RAM coverred: %ldM\n",
+	       range_sums >> (20 - PAGE_SHIFT));
+
+	if (mtrr_chunk_size && mtrr_gran_size) {
+		int num_reg;
+
+		debug_print = 1;
+		/* convert ranges to var ranges state */
+		num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
+					      mtrr_gran_size);
+
+		/* we got new setting in range_state, check it */
+		memset(range_new, 0, sizeof(range_new));
+		nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+						      extra_remove_base,
+						      extra_remove_size);
+		range_sums_new = sum_ranges(range_new, nr_range_new);
+
+		i = 0;
+		result[i].chunk_sizek = mtrr_chunk_size >> 10;
+		result[i].gran_sizek = mtrr_gran_size >> 10;
+		result[i].num_reg = num_reg;
+		if (range_sums < range_sums_new) {
+			result[i].lose_cover_sizek =
+				(range_sums_new - range_sums) << PSHIFT;
+			result[i].bad = 1;
+		} else
+			result[i].lose_cover_sizek =
+				(range_sums - range_sums_new) << PSHIFT;
+
+		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+			 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
+			 result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ldM \n",
+			 result[i].num_reg, result[i].bad?"-":"",
+			 result[i].lose_cover_sizek >> 10);
+		if (!result[i].bad) {
+			set_var_mtrr_all(address_bits);
+			return 1;
+		}
+		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
+		       "will find optimal one\n");
+		debug_print = 0;
+	}
+
+	i = 0;
+	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
+	memset(result, 0, sizeof(result));
+	for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
+		for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+		     chunk_size <<= 1) {
+			int num_reg;
+
+			if (debug_print)
+				printk(KERN_INFO
+			       "\ngran_size: %lldM   chunk_size_size: %lldM\n",
+				       gran_size >> 20, chunk_size >> 20);
+			if (i >= NUM_RESULT)
+				continue;
+
+			/* convert ranges to var ranges state */
+			num_reg = x86_setup_var_mtrrs(range, nr_range,
+							 chunk_size, gran_size);
+
+			/* we got new setting in range_state, check it */
+			memset(range_new, 0, sizeof(range_new));
+			nr_range_new = x86_get_mtrr_mem_range(range_new, 0,
+					 extra_remove_base, extra_remove_size);
+			range_sums_new = sum_ranges(range_new, nr_range_new);
+
+			result[i].chunk_sizek = chunk_size >> 10;
+			result[i].gran_sizek = gran_size >> 10;
+			result[i].num_reg = num_reg;
+			if (range_sums < range_sums_new) {
+				result[i].lose_cover_sizek =
+					(range_sums_new - range_sums) << PSHIFT;
+				result[i].bad = 1;
+			} else
+				result[i].lose_cover_sizek =
+					(range_sums - range_sums_new) << PSHIFT;
+
+			/* double check it */
+			if (!result[i].bad && !result[i].lose_cover_sizek) {
+				if (nr_range_new != nr_range ||
+					memcmp(range, range_new, sizeof(range)))
+						result[i].bad = 1;
+			}
+
+			if (!result[i].bad && (range_sums - range_sums_new <
+					       min_loss_pfn[num_reg])) {
+				min_loss_pfn[num_reg] =
+					range_sums - range_sums_new;
+			}
+			i++;
+		}
+	}
+
+	/* print out all */
+	for (i = 0; i < NUM_RESULT; i++) {
+		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
+		       result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
+		       result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
+		       result[i].num_reg, result[i].bad?"-":"",
+		       result[i].lose_cover_sizek >> 10);
+	}
+
+	/* try to find the optimal index */
+	if (nr_mtrr_spare_reg >= num_var_ranges)
+		nr_mtrr_spare_reg = num_var_ranges - 1;
+	num_reg_good = -1;
+	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
+		if (!min_loss_pfn[i]) {
+			num_reg_good = i;
+			break;
+		}
+	}
+
+	index_good = -1;
+	if (num_reg_good != -1) {
+		for (i = 0; i < NUM_RESULT; i++) {
+			if (!result[i].bad &&
+			    result[i].num_reg == num_reg_good &&
+			    !result[i].lose_cover_sizek) {
+				index_good = i;
+				break;
+			}
+		}
+	}
+
+	if (index_good != -1) {
+		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
+		i = index_good;
+		printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
+				result[i].gran_sizek >> 10,
+				result[i].chunk_sizek >> 10);
+		printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
+				result[i].num_reg,
+				result[i].lose_cover_sizek >> 10);
+		/* convert ranges to var ranges state */
+		chunk_size = result[i].chunk_sizek;
+		chunk_size <<= 10;
+		gran_size = result[i].gran_sizek;
+		gran_size <<= 10;
+		debug_print = 1;
+		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+		set_var_mtrr_all(address_bits);
+		return 1;
+	}
+
+	printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n");
+	printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n");
+
+	return 0;
+}
+#else
+static int __init mtrr_cleanup(unsigned address_bits)
+{
+	return 0;	/* stub for the #else branch: nothing was changed */
+}
+#endif
+
+static int __initdata changed_by_mtrr_cleanup;
+
 static int disable_mtrr_trim;
 
 static int __init disable_mtrr_trim_setup(char *str)
@@ -648,6 +1430,19 @@ int __init amd_special_default_mtrr(void)
 	return 0;
 }
 
+/*
+ * Ask e820 to retype E820_RAM in pfns [start_pfn, limit_pfn) as reserved.
+ */
+static u64 __init real_trim_memory(unsigned long start_pfn,
+				   unsigned long limit_pfn)
+{
+	/* widen to u64 before shifting so the byte address cannot wrap */
+	u64 first_byte = (u64)start_pfn << PAGE_SHIFT;
+	u64 end_byte = (u64)limit_pfn << PAGE_SHIFT;
+
+	return e820_update_range(first_byte, end_byte - first_byte,
+				 E820_RAM, E820_RESERVED);
+}
 /**
  * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
  * @end_pfn: ending page frame number
@@ -663,8 +1458,11 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 {
 	unsigned long i, base, size, highest_pfn = 0, def, dummy;
 	mtrr_type type;
-	u64 trim_start, trim_size;
+	int nr_range;
+	u64 total_trim_size;
 
+	/* extra one for all 0 */
+	int num[MTRR_NUM_TYPES + 1];
 	/*
 	 * Make sure we only trim uncachable memory on machines that
 	 * support the Intel MTRR architecture:
@@ -676,14 +1474,22 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
 
-	if (amd_special_default_mtrr())
-		return 0;
+	/* get it and store it aside */
+	memset(range_state, 0, sizeof(range_state));
+	for (i = 0; i < num_var_ranges; i++) {
+		mtrr_if->get(i, &base, &size, &type);
+		range_state[i].base_pfn = base;
+		range_state[i].size_pfn = size;
+		range_state[i].type = type;
+	}
 
 	/* Find highest cached pfn */
 	for (i = 0; i < num_var_ranges; i++) {
-		mtrr_if->get(i, &base, &size, &type);
+		type = range_state[i].type;
 		if (type != MTRR_TYPE_WRBACK)
 			continue;
+		base = range_state[i].base_pfn;
+		size = range_state[i].size_pfn;
 		if (highest_pfn < base + size)
 			highest_pfn = base + size;
 	}
@@ -698,22 +1504,65 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 		return 0;
 	}
 
-	if (highest_pfn < end_pfn) {
+	/* check entries number */
+	memset(num, 0, sizeof(num));
+	for (i = 0; i < num_var_ranges; i++) {
+		type = range_state[i].type;
+		if (type >= MTRR_NUM_TYPES)
+			continue;
+		size = range_state[i].size_pfn;
+		if (!size)
+			type = MTRR_NUM_TYPES;
+		num[type]++;
+	}
+
+	/* no entry for WB? */
+	if (!num[MTRR_TYPE_WRBACK])
+		return 0;
+
+	/* check if we only had WB and UC */
+	if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] !=
+		num_var_ranges - num[MTRR_NUM_TYPES])
+		return 0;
+
+	memset(range, 0, sizeof(range));
+	nr_range = 0;
+	if (mtrr_tom2) {
+		range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
+		range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1;
+		if (highest_pfn < range[nr_range].end + 1)
+			highest_pfn = range[nr_range].end + 1;
+		nr_range++;
+	}
+	nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
+
+	total_trim_size = 0;
+	/* check the head */
+	if (range[0].start)
+		total_trim_size += real_trim_memory(0, range[0].start);
+	/* check the holes */
+	for (i = 0; i < nr_range - 1; i++) {
+		if (range[i].end + 1 < range[i+1].start)
+			total_trim_size += real_trim_memory(range[i].end + 1,
+							    range[i+1].start);
+	}
+	/* check the top */
+	i = nr_range - 1;
+	if (range[i].end + 1 < end_pfn)
+		total_trim_size += real_trim_memory(range[i].end + 1,
+							 end_pfn);
+
+	if (total_trim_size) {
 		printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover"
-			" all of memory, losing %luMB of RAM.\n",
-			(end_pfn - highest_pfn) >> (20 - PAGE_SHIFT));
+			" all of memory, losing %lluMB of RAM.\n",
+			total_trim_size >> 20);
 
-		WARN_ON(1);
+		if (!changed_by_mtrr_cleanup)
+			WARN_ON(1);
 
 		printk(KERN_INFO "update e820 for mtrr\n");
-		trim_start = highest_pfn;
-		trim_start <<= PAGE_SHIFT;
-		trim_size = end_pfn;
-		trim_size <<= PAGE_SHIFT;
-		trim_size -= trim_start;
-		update_memory_range(trim_start, trim_size, E820_RAM,
-					E820_RESERVED);
 		update_e820();
+
 		return 1;
 	}
 
@@ -729,18 +1578,21 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
  */
 void __init mtrr_bp_init(void)
 {
+	u32 phys_addr;
 	init_ifs();
 
+	phys_addr = 32;
+
 	if (cpu_has_mtrr) {
 		mtrr_if = &generic_mtrr_ops;
 		size_or_mask = 0xff000000;	/* 36 bits */
 		size_and_mask = 0x00f00000;
+		phys_addr = 36;
 
 		/* This is an AMD specific MSR, but we assume(hope?) that
 		   Intel will implement it to when they extend the address
 		   bus of the Xeon. */
 		if (cpuid_eax(0x80000000) >= 0x80000008) {
-			u32 phys_addr;
 			phys_addr = cpuid_eax(0x80000008) & 0xff;
 			/* CPUID workaround for Intel 0F33/0F34 CPU */
 			if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
@@ -758,6 +1610,7 @@ void __init mtrr_bp_init(void)
 			   don't support PAE */
 			size_or_mask = 0xfff00000;	/* 32 bits */
 			size_and_mask = 0;
+			phys_addr = 32;
 		}
 	} else {
 		switch (boot_cpu_data.x86_vendor) {
@@ -791,8 +1644,15 @@ void __init mtrr_bp_init(void)
 	if (mtrr_if) {
 		set_num_var_ranges();
 		init_table();
-		if (use_intel())
+		if (use_intel()) {
 			get_mtrr_state();
+
+			if (mtrr_cleanup(phys_addr)) {
+				changed_by_mtrr_cleanup = 1;
+				mtrr_if->set_all();
+			}
+
+		}
 	}
 }
 
@@ -829,9 +1689,10 @@ static int __init mtrr_init_finialize(void)
 {
 	if (!mtrr_if)
 		return 0;
-	if (use_intel())
-		mtrr_state_warn();
-	else {
+	if (use_intel()) {
+		if (!changed_by_mtrr_cleanup)
+			mtrr_state_warn();
+	} else {
 		/* The CPUs haven't MTRR and seem to not support SMP. They have
 		 * specific drivers, we use a tricky method to support
 		 * suspend/resume for them.
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2cc77eb6fea3..2dc4ec656b23 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -81,6 +81,8 @@ void set_mtrr_done(struct set_mtrr_context *ctxt);
 void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
 void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
 
+void fill_mtrr_var_range(unsigned int index,
+		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
 void get_mtrr_state(void);
 
 extern void set_mtrr_ops(struct mtrr_ops * ops);
@@ -92,6 +94,7 @@ extern struct mtrr_ops * mtrr_if;
 #define use_intel()	(mtrr_if && mtrr_if->use_intel_if == 1)
 
 extern unsigned int num_var_ranges;
+extern u64 mtrr_tom2;
 
 void mtrr_state_warn(void);
 const char *mtrr_attrib_to_str(int x);
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820.c
index af1eb0789740..7b613d2efb04 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820.c
@@ -17,172 +17,30 @@
 #include <linux/kexec.h>
 #include <linux/module.h>
 #include <linux/mm.h>
-#include <linux/suspend.h>
 #include <linux/pfn.h>
+#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/page.h>
 #include <asm/e820.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
-#include <asm/sections.h>
-#include <asm/kdebug.h>
 #include <asm/trampoline.h>
 
 struct e820map e820;
 
-/*
- * PFN of last memory page.
- */
-unsigned long end_pfn;
-
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_pfn_mapped;
-
-/*
- * Last pfn which the user wants to use.
- */
-static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
-
-/*
- * Early reserved memory areas.
- */
-#define MAX_EARLY_RES 20
-
-struct early_res {
-	unsigned long start, end;
-	char name[16];
-};
-static struct early_res early_res[MAX_EARLY_RES] __initdata = {
-	{ 0, PAGE_SIZE, "BIOS data page" },			/* BIOS data page */
-#ifdef CONFIG_X86_TRAMPOLINE
-	{ TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0xaeedbabe;
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
 #endif
-	{}
-};
-
-void __init reserve_early(unsigned long start, unsigned long end, char *name)
-{
-	int i;
-	struct early_res *r;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		r = &early_res[i];
-		if (end > r->start && start < r->end)
-			panic("Overlapping early reservations %lx-%lx %s to %lx-%lx %s\n",
-			      start, end - 1, name?name:"", r->start, r->end - 1, r->name);
-	}
-	if (i >= MAX_EARLY_RES)
-		panic("Too many early reservations");
-	r = &early_res[i];
-	r->start = start;
-	r->end = end;
-	if (name)
-		strncpy(r->name, name, sizeof(r->name) - 1);
-}
-
-void __init free_early(unsigned long start, unsigned long end)
-{
-	struct early_res *r;
-	int i, j;
-
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		r = &early_res[i];
-		if (start == r->start && end == r->end)
-			break;
-	}
-	if (i >= MAX_EARLY_RES || !early_res[i].end)
-		panic("free_early on not reserved area: %lx-%lx!", start, end);
 
-	for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
-		;
-
-	memmove(&early_res[i], &early_res[i + 1],
-	       (j - 1 - i) * sizeof(struct early_res));
-
-	early_res[j - 1].end = 0;
-}
-
-void __init early_res_to_bootmem(unsigned long start, unsigned long end)
-{
-	int i;
-	unsigned long final_start, final_end;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		struct early_res *r = &early_res[i];
-		final_start = max(start, r->start);
-		final_end = min(end, r->end);
-		if (final_start >= final_end)
-			continue;
-		printk(KERN_INFO "  early res: %d [%lx-%lx] %s\n", i,
-			final_start, final_end - 1, r->name);
-		reserve_bootmem_generic(final_start, final_end - final_start,
-				BOOTMEM_DEFAULT);
-	}
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
-{
-	int i;
-	unsigned long addr = *addrp, last;
-	int changed = 0;
-again:
-	last = addr + size;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		struct early_res *r = &early_res[i];
-		if (last >= r->start && addr < r->end) {
-			*addrp = addr = round_up(r->end, align);
-			changed = 1;
-			goto again;
-		}
-	}
-	return changed;
-}
-
-/* Check for already reserved areas */
-static inline int __init
-bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
-{
-	int i;
-	unsigned long addr = *addrp, last;
-	unsigned long size = *sizep;
-	int changed = 0;
-again:
-	last = addr + size;
-	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
-		struct early_res *r = &early_res[i];
-		if (last > r->start && addr < r->start) {
-			size = r->start - addr;
-			changed = 1;
-			goto again;
-		}
-		if (last > r->end && addr < r->end) {
-			addr = round_up(r->end, align);
-			size = last - addr;
-			changed = 1;
-			goto again;
-		}
-		if (last <= r->end && addr >= r->start) {
-			(*sizep)++;
-			return 0;
-		}
-	}
-	if (changed) {
-		*addrp = addr;
-		*sizep = size;
-	}
-	return changed;
-}
 /*
  * This function checks if any part of the range <start,end> is mapped
  * with type.
  */
 int
-e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
+e820_any_mapped(u64 start, u64 end, unsigned type)
 {
 	int i;
 
@@ -205,8 +63,7 @@ EXPORT_SYMBOL_GPL(e820_any_mapped);
  * Note: this function only works correct if the e820 table is sorted and
  * not-overlapping, which is the case
  */
-int __init e820_all_mapped(unsigned long start, unsigned long end,
-			   unsigned type)
+int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 {
 	int i;
 
@@ -235,214 +92,13 @@ int __init e820_all_mapped(unsigned long start, unsigned long end,
 }
 
 /*
- * Find a free area with specified alignment in a specific range.
- */
-unsigned long __init find_e820_area(unsigned long start, unsigned long end,
-				    unsigned long size, unsigned long align)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long addr, last;
-		unsigned long ei_last;
-
-		if (ei->type != E820_RAM)
-			continue;
-		addr = round_up(ei->addr, align);
-		ei_last = ei->addr + ei->size;
-		if (addr < start)
-			addr = round_up(start, align);
-		if (addr >= ei_last)
-			continue;
-		while (bad_addr(&addr, size, align) && addr+size <= ei_last)
-			;
-		last = addr + size;
-		if (last > ei_last)
-			continue;
-		if (last > end)
-			continue;
-		return addr;
-	}
-	return -1UL;
-}
-
-/*
- * Find next free range after *start
- */
-unsigned long __init find_e820_area_size(unsigned long start,
-					 unsigned long *sizep,
-					 unsigned long align)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		unsigned long addr, last;
-		unsigned long ei_last;
-
-		if (ei->type != E820_RAM)
-			continue;
-		addr = round_up(ei->addr, align);
-		ei_last = ei->addr + ei->size;
-		if (addr < start)
-			addr = round_up(start, align);
-		if (addr >= ei_last)
-			continue;
-		*sizep = ei_last - addr;
-		while (bad_addr_size(&addr, sizep, align) &&
-			addr + *sizep <= ei_last)
-			;
-		last = addr + *sizep;
-		if (last > ei_last)
-			continue;
-		return addr;
-	}
-	return -1UL;
-
-}
-/*
- * Find the highest page frame number we have available
- */
-unsigned long __init e820_end_of_ram(void)
-{
-	unsigned long end_pfn;
-
-	end_pfn = find_max_pfn_with_active_regions();
-
-	if (end_pfn > max_pfn_mapped)
-		max_pfn_mapped = end_pfn;
-	if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
-		max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
-	if (end_pfn > end_user_pfn)
-		end_pfn = end_user_pfn;
-	if (end_pfn > max_pfn_mapped)
-		end_pfn = max_pfn_mapped;
-
-	printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
-	return end_pfn;
-}
-
-/*
- * Mark e820 reserved areas as busy for the resource manager.
- */
-void __init e820_reserve_resources(void)
-{
-	int i;
-	struct resource *res;
-
-	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
-	for (i = 0; i < e820.nr_map; i++) {
-		switch (e820.map[i].type) {
-		case E820_RAM:	res->name = "System RAM"; break;
-		case E820_ACPI:	res->name = "ACPI Tables"; break;
-		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
-		default:	res->name = "reserved";
-		}
-		res->start = e820.map[i].addr;
-		res->end = res->start + e820.map[i].size - 1;
-		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		insert_resource(&iomem_resource, res);
-		res++;
-	}
-}
-
-/*
- * Find the ranges of physical addresses that do not correspond to
- * e820 RAM areas and mark the corresponding pages as nosave for software
- * suspend and suspend to RAM.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
-	int i;
-	unsigned long paddr;
-
-	paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
-	for (i = 1; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-
-		if (paddr < ei->addr)
-			register_nosave_region(PFN_DOWN(paddr),
-						PFN_UP(ei->addr));
-
-		paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
-		if (ei->type != E820_RAM)
-			register_nosave_region(PFN_UP(ei->addr),
-						PFN_DOWN(paddr));
-
-		if (paddr >= (end_pfn << PAGE_SHIFT))
-			break;
-	}
-}
-
-/*
- * Finds an active region in the address range from start_pfn to end_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
- */
-static int __init e820_find_active_region(const struct e820entry *ei,
-					  unsigned long start_pfn,
-					  unsigned long end_pfn,
-					  unsigned long *ei_startpfn,
-					  unsigned long *ei_endpfn)
-{
-	*ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
-	*ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* Skip map entries smaller than a page */
-	if (*ei_startpfn >= *ei_endpfn)
-		return 0;
-
-	/* Check if max_pfn_mapped should be updated */
-	if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
-		max_pfn_mapped = *ei_endpfn;
-
-	/* Skip if map is outside the node */
-	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
-				    *ei_startpfn >= end_pfn)
-		return 0;
-
-	/* Check for overlaps */
-	if (*ei_startpfn < start_pfn)
-		*ei_startpfn = start_pfn;
-	if (*ei_endpfn > end_pfn)
-		*ei_endpfn = end_pfn;
-
-	/* Obey end_user_pfn to save on memmap */
-	if (*ei_startpfn >= end_user_pfn)
-		return 0;
-	if (*ei_endpfn > end_user_pfn)
-		*ei_endpfn = end_user_pfn;
-
-	return 1;
-}
-
-/* Walk the e820 map and register active regions within a node */
-void __init
-e820_register_active_regions(int nid, unsigned long start_pfn,
-							unsigned long end_pfn)
-{
-	unsigned long ei_startpfn;
-	unsigned long ei_endpfn;
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++)
-		if (e820_find_active_region(&e820.map[i],
-					    start_pfn, end_pfn,
-					    &ei_startpfn, &ei_endpfn))
-			add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
  * Add a memory region to the kernel e820 map.
  */
-void __init add_memory_region(unsigned long start, unsigned long size, int type)
+void __init e820_add_region(u64 start, u64 size, int type)
 {
 	int x = e820.nr_map;
 
-	if (x == E820MAX) {
+	if (x == ARRAY_SIZE(e820.map)) {
 		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
 		return;
 	}
@@ -453,28 +109,7 @@ void __init add_memory_region(unsigned long start, unsigned long size, int type)
 	e820.nr_map++;
 }
 
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
-{
-	unsigned long start_pfn = start >> PAGE_SHIFT;
-	unsigned long end_pfn = end >> PAGE_SHIFT;
-	unsigned long ei_startpfn, ei_endpfn, ram = 0;
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		if (e820_find_active_region(&e820.map[i],
-					    start_pfn, end_pfn,
-					    &ei_startpfn, &ei_endpfn))
-			ram += ei_endpfn - ei_startpfn;
-	}
-	return end - start - (ram << PAGE_SHIFT);
-}
-
-static void __init e820_print_map(char *who)
+void __init e820_print_map(char *who)
 {
 	int i;
 
@@ -507,19 +142,75 @@ static void __init e820_print_map(char *who)
  * Sanitize the BIOS e820 map.
  *
  * Some e820 responses include overlapping entries. The following
- * replaces the original e820 map with a new one, removing overlaps.
+ * replaces the original e820 map with a new one, removing overlaps,
+ * and resolving conflicting memory types in favor of highest
+ * numbered type.
+ *
+ * The input parameter biosmap points to an array of 'struct
+ * e820entry' which on entry has elements in the range [0, *pnr_map)
+ * valid, and which has space for up to max_nr_map entries.
+ * On return, the resulting sanitized e820 map entries will be in
+ * overwritten in the same location, starting at biosmap.
+ *
+ * The integer pointed to by pnr_map must be valid on entry (the
+ * current number of valid entries located at biosmap) and will
+ * be updated on return, with the new number of valid entries
+ * (something no more than max_nr_map.)
+ *
+ * The return value from sanitize_e820_map() is zero if it
+ * successfully 'sanitized' the map entries passed in, and is -1
+ * if it did nothing, which can happen if either of (1) it was
+ * only passed one map entry, or (2) any of the input map entries
+ * were invalid (start + size < start, meaning that the size was
+ * so big the described memory range wrapped around through zero.)
+ *
+ *	Visually we're performing the following
+ *	(1,2,3,4 = memory types)...
+ *
+ *	Sample memory map (w/overlaps):
+ *	   ____22__________________
+ *	   ______________________4_
+ *	   ____1111________________
+ *	   _44_____________________
+ *	   11111111________________
+ *	   ____________________33__
+ *	   ___________44___________
+ *	   __________33333_________
+ *	   ______________22________
+ *	   ___________________2222_
+ *	   _________111111111______
+ *	   _____________________11_
+ *	   _________________4______
  *
+ *	Sanitized equivalent (no overlap):
+ *	   1_______________________
+ *	   _44_____________________
+ *	   ___1____________________
+ *	   ____22__________________
+ *	   ______11________________
+ *	   _________1______________
+ *	   __________3_____________
+ *	   ___________44___________
+ *	   _____________33_________
+ *	   _______________2________
+ *	   ________________1_______
+ *	   _________________4______
+ *	   ___________________2____
+ *	   ____________________33__
+ *	   ______________________4_
  */
-static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
+
+int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
+				int *pnr_map)
 {
 	struct change_member {
 		struct e820entry *pbios; /* pointer to original bios entry */
 		unsigned long long addr; /* address for this change point */
 	};
-	static struct change_member change_point_list[2*E820MAX] __initdata;
-	static struct change_member *change_point[2*E820MAX] __initdata;
-	static struct e820entry *overlap_list[E820MAX] __initdata;
-	static struct e820entry new_bios[E820MAX] __initdata;
+static struct change_member change_point_list[2*E820_X_MAX] __initdata;
+static struct change_member *change_point[2*E820_X_MAX] __initdata;
+static struct e820entry *overlap_list[E820_X_MAX] __initdata;
+static struct e820entry new_bios[E820_X_MAX] __initdata;
 	struct change_member *change_tmp;
 	unsigned long current_type, last_type;
 	unsigned long long last_addr;
@@ -529,48 +220,12 @@ static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
 	int old_nr, new_nr, chg_nr;
 	int i;
 
-	/*
-		Visually we're performing the following
-		(1,2,3,4 = memory types)...
-
-		Sample memory map (w/overlaps):
-		   ____22__________________
-		   ______________________4_
-		   ____1111________________
-		   _44_____________________
-		   11111111________________
-		   ____________________33__
-		   ___________44___________
-		   __________33333_________
-		   ______________22________
-		   ___________________2222_
-		   _________111111111______
-		   _____________________11_
-		   _________________4______
-
-		Sanitized equivalent (no overlap):
-		   1_______________________
-		   _44_____________________
-		   ___1____________________
-		   ____22__________________
-		   ______11________________
-		   _________1______________
-		   __________3_____________
-		   ___________44___________
-		   _____________33_________
-		   _______________2________
-		   ________________1_______
-		   _________________4______
-		   ___________________2____
-		   ____________________33__
-		   ______________________4_
-	*/
-
 	/* if there's only one memory region, don't bother */
 	if (*pnr_map < 2)
 		return -1;
 
 	old_nr = *pnr_map;
+	BUG_ON(old_nr > max_nr_map);
 
 	/* bail out if we find any unreasonable addresses in bios map */
 	for (i = 0; i < old_nr; i++)
@@ -682,7 +337,7 @@ static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
 					 * no more space left for new
 					 * bios entries ?
 					 */
-					if (++new_bios_entry >= E820MAX)
+					if (++new_bios_entry >= max_nr_map)
 						break;
 			}
 			if (current_type != 0)	{
@@ -704,22 +359,9 @@ static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map)
 	return 0;
 }
 
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory.  If we aren't, we'll fake a memory map.
- */
-static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
+static int __init __copy_e820_map(struct e820entry *biosmap, int nr_map)
 {
-	/* Only one memory region (or negative)? Ignore it */
-	if (nr_map < 2)
-		return -1;
-
-	do {
+	while (nr_map) {
 		u64 start = biosmap->addr;
 		u64 size = biosmap->size;
 		u64 end = start + size;
@@ -729,111 +371,37 @@ static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
 		if (start > end)
 			return -1;
 
-		add_memory_region(start, size, type);
-	} while (biosmap++, --nr_map);
-	return 0;
-}
-
-static void early_panic(char *msg)
-{
-	early_printk(msg);
-	panic(msg);
-}
-
-/* We're not void only for x86 32-bit compat */
-char * __init machine_specific_memory_setup(void)
-{
-	char *who = "BIOS-e820";
-	/*
-	 * Try to copy the BIOS-supplied E820-map.
-	 *
-	 * Otherwise fake a memory map; one section from 0k->640k,
-	 * the next section from 1mb->appropriate_mem_k
-	 */
-	sanitize_e820_map(boot_params.e820_map, &boot_params.e820_entries);
-	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0)
-		early_panic("Cannot find a valid memory map");
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	e820_print_map(who);
-
-	/* In case someone cares... */
-	return who;
-}
+		e820_add_region(start, size, type);
 
-static int __init parse_memopt(char *p)
-{
-	if (!p)
-		return -EINVAL;
-	end_user_pfn = memparse(p, &p);
-	end_user_pfn >>= PAGE_SHIFT;
-	return 0;
-}
-early_param("mem", parse_memopt);
-
-static int userdef __initdata;
-
-static int __init parse_memmap_opt(char *p)
-{
-	char *oldp;
-	unsigned long long start_at, mem_size;
-
-	if (!strcmp(p, "exactmap")) {
-#ifdef CONFIG_CRASH_DUMP
-		/*
-		 * If we are doing a crash dump, we still need to know
-		 * the real mem size before original memory map is
-		 * reset.
-		 */
-		e820_register_active_regions(0, 0, -1UL);
-		saved_max_pfn = e820_end_of_ram();
-		remove_all_active_ranges();
-#endif
-		max_pfn_mapped = 0;
-		e820.nr_map = 0;
-		userdef = 1;
-		return 0;
+		biosmap++;
+		nr_map--;
 	}
-
-	oldp = p;
-	mem_size = memparse(p, &p);
-	if (p == oldp)
-		return -EINVAL;
-
-	userdef = 1;
-	if (*p == '@') {
-		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_RAM);
-	} else if (*p == '#') {
-		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_ACPI);
-	} else if (*p == '$') {
-		start_at = memparse(p+1, &p);
-		add_memory_region(start_at, mem_size, E820_RESERVED);
-	} else {
-		end_user_pfn = (mem_size >> PAGE_SHIFT);
-	}
-	return *p == '\0' ? 0 : -EINVAL;
+	return 0;
 }
-early_param("memmap", parse_memmap_opt);
 
-void __init finish_e820_parsing(void)
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory.  If we aren't, we'll fake a memory map.
+ */
+int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
 {
-	if (userdef) {
-		char nr = e820.nr_map;
-
-		if (sanitize_e820_map(e820.map, &nr) < 0)
-			early_panic("Invalid user supplied memory map");
-		e820.nr_map = nr;
+	/* Only one memory region (or negative)? Ignore it */
+	if (nr_map < 2)
+		return -1;
 
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		e820_print_map("user");
-	}
+	return __copy_e820_map(biosmap, nr_map);
 }
 
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
+u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 				unsigned new_type)
 {
 	int i;
+	u64 real_updated_size = 0;
 
 	BUG_ON(old_type == new_type);
 
@@ -843,8 +411,10 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
 		if (ei->type != old_type)
 			continue;
 		/* totally covered? */
-		if (ei->addr >= start && ei->size <= size) {
+		if (ei->addr >= start &&
+		    (ei->addr + ei->size) <= (start + size)) {
 			ei->type = new_type;
+			real_updated_size += ei->size;
 			continue;
 		}
 		/* partially covered */
@@ -852,26 +422,25 @@ void __init update_memory_range(u64 start, u64 size, unsigned old_type,
 		final_end = min(start + size, ei->addr + ei->size);
 		if (final_start >= final_end)
 			continue;
-		add_memory_region(final_start, final_end - final_start,
+		e820_add_region(final_start, final_end - final_start,
 					 new_type);
+		real_updated_size += final_end - final_start;
 	}
+	return real_updated_size;
 }
 
 void __init update_e820(void)
 {
-	u8 nr_map;
+	int nr_map;
 
 	nr_map = e820.nr_map;
-	if (sanitize_e820_map(e820.map, &nr_map))
+	if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
 		return;
 	e820.nr_map = nr_map;
 	printk(KERN_INFO "modified physical RAM map:\n");
 	e820_print_map("modified");
 }
 
-unsigned long pci_mem_start = 0xaeedbabe;
-EXPORT_SYMBOL(pci_mem_start);
-
 /*
  * Search for the biggest gap in the low 32 bits of the e820
  * memory space.  We pass this space to PCI to assign MMIO resources
@@ -881,7 +450,7 @@ EXPORT_SYMBOL(pci_mem_start);
 __init void e820_setup_gap(void)
 {
 	unsigned long gapstart, gapsize, round;
-	unsigned long last;
+	unsigned long long last;
 	int i;
 	int found = 0;
 
@@ -910,6 +479,7 @@ __init void e820_setup_gap(void)
 			last = start;
 	}
 
+#ifdef CONFIG_X86_64
 	if (!found) {
 		gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024;
 		printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit "
@@ -917,6 +487,7 @@ __init void e820_setup_gap(void)
 		       KERN_ERR "PCI: Unassigned devices with 32bit resource "
 		       "registers may break!\n");
 	}
+#endif
 
 	/*
 	 * See how much we want to round up: start off with
@@ -933,6 +504,586 @@ __init void e820_setup_gap(void)
 	       pci_mem_start, gapstart, gapsize);
 }
 
+/**
+ * Because of the size limitation of struct boot_params, only first
+ * 128 E820 memory entries are passed to kernel via
+ * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
+ * linked list of struct setup_data, which is parsed here.
+ */
+void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data)
+{
+	u32 map_len;
+	int entries;
+	struct e820entry *extmap;
+
+	entries = sdata->len / sizeof(struct e820entry);
+	map_len = sdata->len + sizeof(struct setup_data);
+	if (map_len > PAGE_SIZE)
+		sdata = early_ioremap(pa_data, map_len);
+	extmap = (struct e820entry *)(sdata->data);
+	__copy_e820_map(extmap, entries);
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+	if (map_len > PAGE_SIZE)
+		early_iounmap(sdata, map_len);
+	printk(KERN_INFO "extended physical RAM map:\n");
+	e820_print_map("extended");
+}
+
+#if defined(CONFIG_X86_64) || \
+	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+/**
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+	int i;
+	unsigned long pfn;
+
+	pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+	for (i = 1; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (pfn < PFN_UP(ei->addr))
+			register_nosave_region(pfn, PFN_UP(ei->addr));
+
+		pfn = PFN_DOWN(ei->addr + ei->size);
+		if (ei->type != E820_RAM)
+			register_nosave_region(PFN_UP(ei->addr), pfn);
+
+		if (pfn >= limit_pfn)
+			break;
+	}
+}
+#endif
+
+/*
+ * Early reserved memory areas.
+ */
+#define MAX_EARLY_RES 20
+
+struct early_res {
+	u64 start, end;
+	char name[16];
+};
+static struct early_res early_res[MAX_EARLY_RES] __initdata = {
+	{ 0, PAGE_SIZE, "BIOS data page" },	/* BIOS data page */
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_TRAMPOLINE)
+	{ TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
+#endif
+#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
+	/*
+	 * But first pinch a few for the stack/trampoline stuff
+	 * FIXME: Don't need the extra page at 4K, but need to fix
+	 * trampoline before removing it. (see the GDT stuff)
+	 */
+	{ PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE" },
+	/*
+	 * Has to be in very low memory so we can execute
+	 * real-mode AP code.
+	 */
+	{ TRAMPOLINE_BASE, TRAMPOLINE_BASE + PAGE_SIZE, "TRAMPOLINE" },
+#endif
+	{}
+};
+
+static int __init find_overlapped_early(u64 start, u64 end)
+{
+	int i;
+	struct early_res *r;
+
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		r = &early_res[i];
+		if (end > r->start && start < r->end)
+			break;
+	}
+
+	return i;
+}
+
+void __init reserve_early(u64 start, u64 end, char *name)
+{
+	int i;
+	struct early_res *r;
+
+	i = find_overlapped_early(start, end);
+	if (i >= MAX_EARLY_RES)
+		panic("Too many early reservations");
+	r = &early_res[i];
+	if (r->end)
+		panic("Overlapping early reservations "
+		      "%llx-%llx %s to %llx-%llx %s\n",
+		      start, end - 1, name?name:"", r->start,
+		      r->end - 1, r->name);
+	r->start = start;
+	r->end = end;
+	if (name)
+		strncpy(r->name, name, sizeof(r->name) - 1);
+}
+
+void __init free_early(u64 start, u64 end)
+{
+	struct early_res *r;
+	int i, j;
+
+	i = find_overlapped_early(start, end);
+	r = &early_res[i];
+	if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
+		panic("free_early on not reserved area: %llx-%llx!",
+			 start, end - 1);
+
+	for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+		;
+
+	memmove(&early_res[i], &early_res[i + 1],
+	       (j - 1 - i) * sizeof(struct early_res));
+
+	early_res[j - 1].end = 0;
+}
+
+void __init early_res_to_bootmem(u64 start, u64 end)
+{
+	int i;
+	u64 final_start, final_end;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		struct early_res *r = &early_res[i];
+		final_start = max(start, r->start);
+		final_end = min(end, r->end);
+		if (final_start >= final_end)
+			continue;
+		printk(KERN_INFO "  early res: %d [%llx-%llx] %s\n", i,
+			final_start, final_end - 1, r->name);
+		reserve_bootmem_generic(final_start, final_end - final_start,
+				BOOTMEM_DEFAULT);
+	}
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
+{
+	int i;
+	u64 addr = *addrp;
+	int changed = 0;
+	struct early_res *r;
+again:
+	i = find_overlapped_early(addr, addr + size);
+	r = &early_res[i];
+	if (i < MAX_EARLY_RES && r->end) {
+		*addrp = addr = round_up(r->end, align);
+		changed = 1;
+		goto again;
+	}
+	return changed;
+}
+
+/* Check for already reserved areas */
+static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
+{
+	int i;
+	u64 addr = *addrp, last;
+	u64 size = *sizep;
+	int changed = 0;
+again:
+	last = addr + size;
+	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+		struct early_res *r = &early_res[i];
+		if (last > r->start && addr < r->start) {
+			size = r->start - addr;
+			changed = 1;
+			goto again;
+		}
+		if (last > r->end && addr < r->end) {
+			addr = round_up(r->end, align);
+			size = last - addr;
+			changed = 1;
+			goto again;
+		}
+		if (last <= r->end && addr >= r->start) {
+			(*sizep)++;
+			return 0;
+		}
+	}
+	if (changed) {
+		*addrp = addr;
+		*sizep = size;
+	}
+	return changed;
+}
+
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 addr, last;
+		u64 ei_last;
+
+		if (ei->type != E820_RAM)
+			continue;
+		addr = round_up(ei->addr, align);
+		ei_last = ei->addr + ei->size;
+		if (addr < start)
+			addr = round_up(start, align);
+		if (addr >= ei_last)
+			continue;
+		while (bad_addr(&addr, size, align) && addr+size <= ei_last)
+			;
+		last = addr + size;
+		if (last > ei_last)
+			continue;
+		if (last > end)
+			continue;
+		return addr;
+	}
+	return -1ULL;
+}
+
+/*
+ * Find the next free range at or after start: returns its address and
+ * stores its length in *sizep, or returns -1ULL if there is none.
+ */
+u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		u64 addr, last;
+		u64 ei_last;
+
+		if (ei->type != E820_RAM)
+			continue;
+		addr = round_up(ei->addr, align);
+		ei_last = ei->addr + ei->size;
+		if (addr < start)
+			addr = round_up(start, align);
+		if (addr >= ei_last)
+			continue;
+		*sizep = ei_last - addr;
+		while (bad_addr_size(&addr, sizep, align) &&
+			addr + *sizep <= ei_last)
+			;
+		last = addr + *sizep;
+		if (last > ei_last)
+			continue;
+		return addr;
+	}
+	return -1ULL;
+}
+
+/*
+ * Reserve sizet bytes at the top of a free RAM range; returns 0 on failure
+ */
+u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+{
+	u64 size = 0;
+	u64 addr;
+	u64 start;
+
+	for (start = startt; ; start += size) {
+		start = find_e820_area_size(start, &size, align);
+		if (!(start + 1))	/* -1ULL: no free range is left */
+			return 0;
+		if (size >= sizet)
+			break;
+	}
+
+	addr = round_down(start + size - sizet, align);
+	e820_update_range(addr, sizet, E820_RAM, E820_RESERVED);
+	printk(KERN_INFO "update e820 for early_reserve_e820\n");
+	update_e820();
+	return addr;
+}
+
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_PAE
+#  define MAX_ARCH_PFN		(1ULL<<(36-PAGE_SHIFT))
+# else
+#  define MAX_ARCH_PFN		(1ULL<<(32-PAGE_SHIFT))
+# endif
+#else /* CONFIG_X86_32 */
+# define MAX_ARCH_PFN		(MAXMEM>>PAGE_SHIFT)
+#endif
+
+/*
+ * Last pfn which the user wants to use.
+ */
+unsigned long __initdata end_user_pfn = MAX_ARCH_PFN;
+
+/*
+ * Find the highest page frame number we have available
+ */
+unsigned long __init e820_end_of_ram(void)
+{
+	unsigned long last_pfn;
+	unsigned long max_arch_pfn = MAX_ARCH_PFN;
+
+	last_pfn = find_max_pfn_with_active_regions();
+
+	if (last_pfn > max_arch_pfn)
+		last_pfn = max_arch_pfn;
+	if (last_pfn > end_user_pfn)
+		last_pfn = end_user_pfn;
+
+	printk(KERN_INFO "last_pfn = %lu max_arch_pfn = %lu\n",
+			 last_pfn, max_arch_pfn);
+	return last_pfn;
+}
+
+/*
+ * Finds an active region in the address range from start_pfn to last_pfn and
+ * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
+ */
+int __init e820_find_active_region(const struct e820entry *ei,
+				  unsigned long start_pfn,
+				  unsigned long last_pfn,
+				  unsigned long *ei_startpfn,
+				  unsigned long *ei_endpfn)
+{
+	u64 align = PAGE_SIZE;
+
+	*ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
+	*ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
+
+	/* Skip map entries smaller than a page */
+	if (*ei_startpfn >= *ei_endpfn)
+		return 0;
+
+	/* Skip if map is outside the node */
+	if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
+				    *ei_startpfn >= last_pfn)
+		return 0;
+
+	/* Check for overlaps */
+	if (*ei_startpfn < start_pfn)
+		*ei_startpfn = start_pfn;
+	if (*ei_endpfn > last_pfn)
+		*ei_endpfn = last_pfn;
+
+	/* Obey end_user_pfn to save on memmap */
+	if (*ei_startpfn >= end_user_pfn)
+		return 0;
+	if (*ei_endpfn > end_user_pfn)
+		*ei_endpfn = end_user_pfn;
+
+	return 1;
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init e820_register_active_regions(int nid, unsigned long start_pfn,
+					 unsigned long last_pfn)
+{
+	unsigned long ei_startpfn;
+	unsigned long ei_endpfn;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++)
+		if (e820_find_active_region(&e820.map[i],
+					    start_pfn, last_pfn,
+					    &ei_startpfn, &ei_endpfn))
+			add_active_range(nid, ei_startpfn, ei_endpfn);
+}
+
+/*
+ * Find the hole size (in bytes) in the memory range.
+ * @start: starting address of the memory range to scan
+ * @end: ending address of the memory range to scan
+ */
+u64 __init e820_hole_size(u64 start, u64 end)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long last_pfn = end >> PAGE_SHIFT;
+	unsigned long ei_startpfn, ei_endpfn, ram = 0;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		if (e820_find_active_region(&e820.map[i],
+					    start_pfn, last_pfn,
+					    &ei_startpfn, &ei_endpfn))
+			ram += ei_endpfn - ei_startpfn;
+	}
+	return end - start - ((u64)ram << PAGE_SHIFT);
+}
+
+static void early_panic(char *msg)
+{
+	early_printk("%s", msg);	/* msg is text, never a format string */
+	panic("%s", msg);
+}
+
+/* "mem=nopentium" disables the 4MB page tables. */
+static int __init parse_memopt(char *p)
+{
+	u64 mem_size;
+
+	if (!p)
+		return -EINVAL;
+
+#ifdef CONFIG_X86_32
+	if (!strcmp(p, "nopentium")) {
+		setup_clear_cpu_cap(X86_FEATURE_PSE);
+		return 0;
+	}
+#endif
+
+	mem_size = memparse(p, &p);
+	end_user_pfn = mem_size>>PAGE_SHIFT;
+	return 0;
+}
+early_param("mem", parse_memopt);
+
+static int userdef __initdata;
+
+/* "memmap=exactmap" or "memmap=size[@|#|$]start" (RAM, ACPI, reserved) */
+static int __init parse_memmap_opt(char *p)
+{
+	char *oldp = p;
+	u64 start_at, mem_size;
+
+	/* like parse_memopt(): reject a missing argument before strcmp() */
+	if (!p)
+		return -EINVAL;
+
+	if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+		/* crash dump: record real mem size before the map is reset */
+		e820_register_active_regions(0, 0, -1UL);
+		saved_max_pfn = e820_end_of_ram();
+		remove_all_active_ranges();
+#endif
+		e820.nr_map = 0;
+		userdef = 1;
+		return 0;
+	}
+
+	mem_size = memparse(p, &p);
+	if (p == oldp)
+		return -EINVAL;
+
+	userdef = 1;
+	if (*p == '@') {
+		start_at = memparse(p+1, &p);
+		e820_add_region(start_at, mem_size, E820_RAM);
+	} else if (*p == '#') {
+		start_at = memparse(p+1, &p);
+		e820_add_region(start_at, mem_size, E820_ACPI);
+	} else if (*p == '$') {
+		start_at = memparse(p+1, &p);
+		e820_add_region(start_at, mem_size, E820_RESERVED);
+	} else {
+		end_user_pfn = (mem_size >> PAGE_SHIFT);
+	}
+	return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+void __init finish_e820_parsing(void)
+{
+	if (userdef) {
+		int nr = e820.nr_map;
+
+		if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+			early_panic("Invalid user supplied memory map");
+		e820.nr_map = nr;
+
+		printk(KERN_INFO "user-defined physical RAM map:\n");
+		e820_print_map("user");
+	}
+}
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(void)
+{
+	int i;
+	struct resource *res;
+	u64 end;
+
+	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+	for (i = 0; i < e820.nr_map; i++, res++) {
+		switch (e820.map[i].type) {
+		case E820_RAM:	res->name = "System RAM"; break;
+		case E820_ACPI:	res->name = "ACPI Tables"; break;
+		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
+		default:	res->name = "reserved";
+		}
+		/* check the u64 end: res->end may be narrower than 64 bits */
+		end = e820.map[i].addr + e820.map[i].size - 1;
+#ifndef CONFIG_RESOURCES_64BIT
+		if (end > 0x100000000ULL)
+			continue;
+#endif
+		res->start = e820.map[i].addr;
+		res->end = end;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+		insert_resource(&iomem_resource, res);
+	}
+}
+
+char *__init default_machine_specific_memory_setup(void)
+{
+	char *who = "BIOS-e820";
+	int new_nr;
+	/*
+	 * Try to copy the BIOS-supplied E820-map.
+	 *
+	 * Otherwise fake a memory map; one section from 0k->640k,
+	 * the next section from 1mb->appropriate_mem_k
+	 */
+	new_nr = boot_params.e820_entries;
+	sanitize_e820_map(boot_params.e820_map,
+			ARRAY_SIZE(boot_params.e820_map),
+			&new_nr);
+	boot_params.e820_entries = new_nr;
+	if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) {
+		u64 mem_size;
+
+		/* compare results from other methods and take the greater */
+		if (boot_params.alt_mem_k
+		    < boot_params.screen_info.ext_mem_k) {
+			mem_size = boot_params.screen_info.ext_mem_k;
+			who = "BIOS-88";
+		} else {
+			mem_size = boot_params.alt_mem_k;
+			who = "BIOS-e801";
+		}
+
+		e820.nr_map = 0;
+		e820_add_region(0, LOWMEMSIZE(), E820_RAM);
+		e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+	}
+
+	/* In case someone cares... */
+	return who;
+}
+
+char *__init __attribute__((weak)) machine_specific_memory_setup(void)
+{
+	return default_machine_specific_memory_setup();
+}
+
+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+char * __init __attribute__((weak)) memory_setup(void)
+{
+	return machine_specific_memory_setup();
+}
+
+void __init setup_memory_map(void)
+{
+	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+	e820_print_map(memory_setup());
+}
+
+#ifdef CONFIG_X86_64
 int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
 {
 	int i;
@@ -951,3 +1102,4 @@ int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
 		max_pfn << PAGE_SHIFT) - *addr;
 	return i + 1;
 }
+#endif
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
deleted file mode 100644
index ed733e7cf4e6..000000000000
--- a/arch/x86/kernel/e820_32.c
+++ /dev/null
@@ -1,775 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/ioport.h>
-#include <linux/string.h>
-#include <linux/kexec.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pfn.h>
-#include <linux/uaccess.h>
-#include <linux/suspend.h>
-
-#include <asm/pgtable.h>
-#include <asm/page.h>
-#include <asm/e820.h>
-#include <asm/setup.h>
-
-struct e820map e820;
-struct change_member {
-	struct e820entry *pbios; /* pointer to original bios entry */
-	unsigned long long addr; /* address for this change point */
-};
-static struct change_member change_point_list[2*E820MAX] __initdata;
-static struct change_member *change_point[2*E820MAX] __initdata;
-static struct e820entry *overlap_list[E820MAX] __initdata;
-static struct e820entry new_bios[E820MAX] __initdata;
-/* For PCI or other memory-mapped resources */
-unsigned long pci_mem_start = 0x10000000;
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
-extern int user_defined_memmap;
-
-static struct resource system_rom_resource = {
-	.name	= "System ROM",
-	.start	= 0xf0000,
-	.end	= 0xfffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource extension_rom_resource = {
-	.name	= "Extension ROM",
-	.start	= 0xe0000,
-	.end	= 0xeffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-static struct resource adapter_rom_resources[] = { {
-	.name 	= "Adapter ROM",
-	.start	= 0xc8000,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-}, {
-	.name 	= "Adapter ROM",
-	.start	= 0,
-	.end	= 0,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-} };
-
-static struct resource video_rom_resource = {
-	.name 	= "Video ROM",
-	.start	= 0xc0000,
-	.end	= 0xc7fff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
-};
-
-#define ROMSIGNATURE 0xaa55
-
-static int __init romsignature(const unsigned char *rom)
-{
-	const unsigned short * const ptr = (const unsigned short *)rom;
-	unsigned short sig;
-
-	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
-}
-
-static int __init romchecksum(const unsigned char *rom, unsigned long length)
-{
-	unsigned char sum, c;
-
-	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
-		sum += c;
-	return !length && !sum;
-}
-
-static void __init probe_roms(void)
-{
-	const unsigned char *rom;
-	unsigned long start, length, upper;
-	unsigned char c;
-	int i;
-
-	/* video rom */
-	upper = adapter_rom_resources[0].start;
-	for (start = video_rom_resource.start; start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		video_rom_resource.start = start;
-
-		if (probe_kernel_address(rom + 2, c) != 0)
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = c * 512;
-
-		/* if checksum okay, trust length byte */
-		if (length && romchecksum(rom, length))
-			video_rom_resource.end = start + length - 1;
-
-		request_resource(&iomem_resource, &video_rom_resource);
-		break;
-	}
-
-	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
-	if (start < upper)
-		start = upper;
-
-	/* system rom */
-	request_resource(&iomem_resource, &system_rom_resource);
-	upper = system_rom_resource.start;
-
-	/* check for extension rom (ignore length byte!) */
-	rom = isa_bus_to_virt(extension_rom_resource.start);
-	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
-		if (romchecksum(rom, length)) {
-			request_resource(&iomem_resource, &extension_rom_resource);
-			upper = extension_rom_resource.start;
-		}
-	}
-
-	/* check for adapter roms on 2k boundaries */
-	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
-		rom = isa_bus_to_virt(start);
-		if (!romsignature(rom))
-			continue;
-
-		if (probe_kernel_address(rom + 2, c) != 0)
-			continue;
-
-		/* 0 < length <= 0x7f * 512, historically */
-		length = c * 512;
-
-		/* but accept any length that fits if checksum okay */
-		if (!length || start + length > upper || !romchecksum(rom, length))
-			continue;
-
-		adapter_rom_resources[i].start = start;
-		adapter_rom_resources[i].end = start + length - 1;
-		request_resource(&iomem_resource, &adapter_rom_resources[i]);
-
-		start = adapter_rom_resources[i++].end & ~2047UL;
-	}
-}
-
-/*
- * Request address space for all standard RAM and ROM resources
- * and also for regions reported as reserved by the e820.
- */
-void __init init_iomem_resources(struct resource *code_resource,
-		struct resource *data_resource,
-		struct resource *bss_resource)
-{
-	int i;
-
-	probe_roms();
-	for (i = 0; i < e820.nr_map; i++) {
-		struct resource *res;
-#ifndef CONFIG_RESOURCES_64BIT
-		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
-			continue;
-#endif
-		res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
-		switch (e820.map[i].type) {
-		case E820_RAM:	res->name = "System RAM"; break;
-		case E820_ACPI:	res->name = "ACPI Tables"; break;
-		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
-		default:	res->name = "reserved";
-		}
-		res->start = e820.map[i].addr;
-		res->end = res->start + e820.map[i].size - 1;
-		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		if (request_resource(&iomem_resource, res)) {
-			kfree(res);
-			continue;
-		}
-		if (e820.map[i].type == E820_RAM) {
-			/*
-			 *  We don't know which RAM region contains kernel data,
-			 *  so we try it repeatedly and let the resource manager
-			 *  test it.
-			 */
-			request_resource(res, code_resource);
-			request_resource(res, data_resource);
-			request_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
-			if (crashk_res.start != crashk_res.end)
-				request_resource(res, &crashk_res);
-#endif
-		}
-	}
-}
-
-#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION)
-/**
- * e820_mark_nosave_regions - Find the ranges of physical addresses that do not
- * correspond to e820 RAM areas and mark the corresponding pages as nosave for
- * hibernation.
- *
- * This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
- */
-void __init e820_mark_nosave_regions(void)
-{
-	int i;
-	unsigned long pfn;
-
-	pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
-	for (i = 1; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-
-		if (pfn < PFN_UP(ei->addr))
-			register_nosave_region(pfn, PFN_UP(ei->addr));
-
-		pfn = PFN_DOWN(ei->addr + ei->size);
-		if (ei->type != E820_RAM)
-			register_nosave_region(PFN_UP(ei->addr), pfn);
-
-		if (pfn >= max_low_pfn)
-			break;
-	}
-}
-#endif
-
-void __init add_memory_region(unsigned long long start,
-			      unsigned long long size, int type)
-{
-	int x;
-
-	x = e820.nr_map;
-
-	if (x == E820MAX) {
-		printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
-		return;
-	}
-
-	e820.map[x].addr = start;
-	e820.map[x].size = size;
-	e820.map[x].type = type;
-	e820.nr_map++;
-} /* add_memory_region */
-
-/*
- * Sanitize the BIOS e820 map.
- *
- * Some e820 responses include overlapping entries.  The following
- * replaces the original e820 map with a new one, removing overlaps.
- *
- */
-int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
-{
-	struct change_member *change_tmp;
-	unsigned long current_type, last_type;
-	unsigned long long last_addr;
-	int chgidx, still_changing;
-	int overlap_entries;
-	int new_bios_entry;
-	int old_nr, new_nr, chg_nr;
-	int i;
-
-	/*
-		Visually we're performing the following (1,2,3,4 = memory types)...
-
-		Sample memory map (w/overlaps):
-		   ____22__________________
-		   ______________________4_
-		   ____1111________________
-		   _44_____________________
-		   11111111________________
-		   ____________________33__
-		   ___________44___________
-		   __________33333_________
-		   ______________22________
-		   ___________________2222_
-		   _________111111111______
-		   _____________________11_
-		   _________________4______
-
-		Sanitized equivalent (no overlap):
-		   1_______________________
-		   _44_____________________
-		   ___1____________________
-		   ____22__________________
-		   ______11________________
-		   _________1______________
-		   __________3_____________
-		   ___________44___________
-		   _____________33_________
-		   _______________2________
-		   ________________1_______
-		   _________________4______
-		   ___________________2____
-		   ____________________33__
-		   ______________________4_
-	*/
-	/* if there's only one memory region, don't bother */
-	if (*pnr_map < 2) {
-		return -1;
-	}
-
-	old_nr = *pnr_map;
-
-	/* bail out if we find any unreasonable addresses in bios map */
-	for (i=0; i<old_nr; i++)
-		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
-			return -1;
-		}
-
-	/* create pointers for initial change-point information (for sorting) */
-	for (i=0; i < 2*old_nr; i++)
-		change_point[i] = &change_point_list[i];
-
-	/* record all known change-points (starting and ending addresses),
-	   omitting those that are for empty memory regions */
-	chgidx = 0;
-	for (i=0; i < old_nr; i++)	{
-		if (biosmap[i].size != 0) {
-			change_point[chgidx]->addr = biosmap[i].addr;
-			change_point[chgidx++]->pbios = &biosmap[i];
-			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
-			change_point[chgidx++]->pbios = &biosmap[i];
-		}
-	}
-	chg_nr = chgidx;    	/* true number of change-points */
-
-	/* sort change-point list by memory addresses (low -> high) */
-	still_changing = 1;
-	while (still_changing)	{
-		still_changing = 0;
-		for (i=1; i < chg_nr; i++)  {
-			/* if <current_addr> > <last_addr>, swap */
-			/* or, if current=<start_addr> & last=<end_addr>, swap */
-			if ((change_point[i]->addr < change_point[i-1]->addr) ||
-				((change_point[i]->addr == change_point[i-1]->addr) &&
-				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
-				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
-			   )
-			{
-				change_tmp = change_point[i];
-				change_point[i] = change_point[i-1];
-				change_point[i-1] = change_tmp;
-				still_changing=1;
-			}
-		}
-	}
-
-	/* create a new bios memory map, removing overlaps */
-	overlap_entries=0;	 /* number of entries in the overlap table */
-	new_bios_entry=0;	 /* index for creating new bios map entries */
-	last_type = 0;		 /* start with undefined memory type */
-	last_addr = 0;		 /* start with 0 as last starting address */
-	/* loop through change-points, determining affect on the new bios map */
-	for (chgidx=0; chgidx < chg_nr; chgidx++)
-	{
-		/* keep track of all overlapping bios entries */
-		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
-		{
-			/* add map entry to overlap list (> 1 entry implies an overlap) */
-			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
-		}
-		else
-		{
-			/* remove entry from list (order independent, so swap with last) */
-			for (i=0; i<overlap_entries; i++)
-			{
-				if (overlap_list[i] == change_point[chgidx]->pbios)
-					overlap_list[i] = overlap_list[overlap_entries-1];
-			}
-			overlap_entries--;
-		}
-		/* if there are overlapping entries, decide which "type" to use */
-		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
-		current_type = 0;
-		for (i=0; i<overlap_entries; i++)
-			if (overlap_list[i]->type > current_type)
-				current_type = overlap_list[i]->type;
-		/* continue building up new bios map based on this information */
-		if (current_type != last_type)	{
-			if (last_type != 0)	 {
-				new_bios[new_bios_entry].size =
-					change_point[chgidx]->addr - last_addr;
-				/* move forward only if the new size was non-zero */
-				if (new_bios[new_bios_entry].size != 0)
-					if (++new_bios_entry >= E820MAX)
-						break; 	/* no more space left for new bios entries */
-			}
-			if (current_type != 0)	{
-				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
-				new_bios[new_bios_entry].type = current_type;
-				last_addr=change_point[chgidx]->addr;
-			}
-			last_type = current_type;
-		}
-	}
-	new_nr = new_bios_entry;   /* retain count for new bios entries */
-
-	/* copy new bios mapping into original location */
-	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
-	*pnr_map = new_nr;
-
-	return 0;
-}
-
-/*
- * Copy the BIOS e820 map into a safe place.
- *
- * Sanity-check it while we're at it..
- *
- * If we're lucky and live on a modern system, the setup code
- * will have given us a memory map that we can use to properly
- * set up memory.  If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
- */
-int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
-{
-	/* Only one memory region (or negative)? Ignore it */
-	if (nr_map < 2)
-		return -1;
-
-	do {
-		u64 start = biosmap->addr;
-		u64 size = biosmap->size;
-		u64 end = start + size;
-		u32 type = biosmap->type;
-
-		/* Overflow in 64 bits? Ignore the memory map. */
-		if (start > end)
-			return -1;
-
-		add_memory_region(start, size, type);
-	} while (biosmap++, --nr_map);
-
-	return 0;
-}
-
-/*
- * Find the highest page frame number we have available
- */
-void __init propagate_e820_map(void)
-{
-	int i;
-
-	max_pfn = 0;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long start, end;
-		/* RAM? */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		start = PFN_UP(e820.map[i].addr);
-		end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-		if (start >= end)
-			continue;
-		if (end > max_pfn)
-			max_pfn = end;
-		memory_present(0, start, end);
-	}
-}
-
-/*
- * Register fully available low RAM pages with the bootmem allocator.
- */
-void __init register_bootmem_low_pages(unsigned long max_low_pfn)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		unsigned long curr_pfn, last_pfn, size;
-		/*
-		 * Reserve usable low memory
-		 */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		/*
-		 * We are rounding up the start address of usable memory:
-		 */
-		curr_pfn = PFN_UP(e820.map[i].addr);
-		if (curr_pfn >= max_low_pfn)
-			continue;
-		/*
-		 * ... and at the end of the usable range downwards:
-		 */
-		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
-
-		if (last_pfn > max_low_pfn)
-			last_pfn = max_low_pfn;
-
-		/*
-		 * .. finally, did all the rounding and playing
-		 * around just make the area go away?
-		 */
-		if (last_pfn <= curr_pfn)
-			continue;
-
-		size = last_pfn - curr_pfn;
-		free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
-	}
-}
-
-void __init e820_register_memory(void)
-{
-	unsigned long gapstart, gapsize, round;
-	unsigned long long last;
-	int i;
-
-	/*
-	 * Search for the biggest gap in the low 32 bits of the e820
-	 * memory space.
-	 */
-	last = 0x100000000ull;
-	gapstart = 0x10000000;
-	gapsize = 0x400000;
-	i = e820.nr_map;
-	while (--i >= 0) {
-		unsigned long long start = e820.map[i].addr;
-		unsigned long long end = start + e820.map[i].size;
-
-		/*
-		 * Since "last" is at most 4GB, we know we'll
-		 * fit in 32 bits if this condition is true
-		 */
-		if (last > end) {
-			unsigned long gap = last - end;
-
-			if (gap > gapsize) {
-				gapsize = gap;
-				gapstart = end;
-			}
-		}
-		if (start < last)
-			last = start;
-	}
-
-	/*
-	 * See how much we want to round up: start off with
-	 * rounding to the next 1MB area.
-	 */
-	round = 0x100000;
-	while ((gapsize >> 4) > round)
-		round += round;
-	/* Fun with two's complement */
-	pci_mem_start = (gapstart + round) & -round;
-
-	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
-		pci_mem_start, gapstart, gapsize);
-}
-
-void __init print_memory_map(char *who)
-{
-	int i;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		printk(" %s: %016Lx - %016Lx ", who,
-			e820.map[i].addr,
-			e820.map[i].addr + e820.map[i].size);
-		switch (e820.map[i].type) {
-		case E820_RAM:	printk("(usable)\n");
-				break;
-		case E820_RESERVED:
-				printk("(reserved)\n");
-				break;
-		case E820_ACPI:
-				printk("(ACPI data)\n");
-				break;
-		case E820_NVS:
-				printk("(ACPI NVS)\n");
-				break;
-		default:	printk("type %u\n", e820.map[i].type);
-				break;
-		}
-	}
-}
-
-void __init limit_regions(unsigned long long size)
-{
-	unsigned long long current_addr;
-	int i;
-
-	print_memory_map("limit_regions start");
-	for (i = 0; i < e820.nr_map; i++) {
-		current_addr = e820.map[i].addr + e820.map[i].size;
-		if (current_addr < size)
-			continue;
-
-		if (e820.map[i].type != E820_RAM)
-			continue;
-
-		if (e820.map[i].addr >= size) {
-			/*
-			 * This region starts past the end of the
-			 * requested size, skip it completely.
-			 */
-			e820.nr_map = i;
-		} else {
-			e820.nr_map = i + 1;
-			e820.map[i].size -= current_addr - size;
-		}
-		print_memory_map("limit_regions endfor");
-		return;
-	}
-	print_memory_map("limit_regions endfunc");
-}
-
-/*
- * This function checks if any part of the range <start,end> is mapped
- * with type.
- */
-int
-e820_any_mapped(u64 start, u64 end, unsigned type)
-{
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		const struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(e820_any_mapped);
-
- /*
-  * This function checks if the entire range <start,end> is mapped with type.
-  *
-  * Note: this function only works correct if the e820 table is sorted and
-  * not-overlapping, which is the case
-  */
-int __init
-e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
-{
-	u64 start = s;
-	u64 end = e;
-	int i;
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		if (type && ei->type != type)
-			continue;
-		/* is the region (part) in overlap with the current region ?*/
-		if (ei->addr >= end || ei->addr + ei->size <= start)
-			continue;
-		/* if the region is at the beginning of <start,end> we move
-		 * start to the end of the region since it's ok until there
-		 */
-		if (ei->addr <= start)
-			start = ei->addr + ei->size;
-		/* if start is now at or beyond end, we're done, full
-		 * coverage */
-		if (start >= end)
-			return 1; /* we're done */
-	}
-	return 0;
-}
-
-static int __init parse_memmap(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp(arg, "exactmap") == 0) {
-#ifdef CONFIG_CRASH_DUMP
-		/* If we are doing a crash dump, we
-		 * still need to know the real mem
-		 * size before original memory map is
-		 * reset.
-		 */
-		propagate_e820_map();
-		saved_max_pfn = max_pfn;
-#endif
-		e820.nr_map = 0;
-		user_defined_memmap = 1;
-	} else {
-		/* If the user specifies memory size, we
-		 * limit the BIOS-provided memory map to
-		 * that size. exactmap can be used to specify
-		 * the exact map. mem=number can be used to
-		 * trim the existing memory map.
-		 */
-		unsigned long long start_at, mem_size;
-
-		mem_size = memparse(arg, &arg);
-		if (*arg == '@') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_RAM);
-		} else if (*arg == '#') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_ACPI);
-		} else if (*arg == '$') {
-			start_at = memparse(arg+1, &arg);
-			add_memory_region(start_at, mem_size, E820_RESERVED);
-		} else {
-			limit_regions(mem_size);
-			user_defined_memmap = 1;
-		}
-	}
-	return 0;
-}
-early_param("memmap", parse_memmap);
-void __init update_memory_range(u64 start, u64 size, unsigned old_type,
-				unsigned new_type)
-{
-	int i;
-
-	BUG_ON(old_type == new_type);
-
-	for (i = 0; i < e820.nr_map; i++) {
-		struct e820entry *ei = &e820.map[i];
-		u64 final_start, final_end;
-		if (ei->type != old_type)
-			continue;
-		/* totally covered? */
-		if (ei->addr >= start && ei->size <= size) {
-			ei->type = new_type;
-			continue;
-		}
-		/* partially covered */
-		final_start = max(start, ei->addr);
-		final_end = min(start + size, ei->addr + ei->size);
-		if (final_start >= final_end)
-			continue;
-		add_memory_region(final_start, final_end - final_start,
-					 new_type);
-	}
-}
-void __init update_e820(void)
-{
-	u8 nr_map;
-
-	nr_map = e820.nr_map;
-	if (sanitize_e820_map(e820.map, &nr_map))
-		return;
-	e820.nr_map = nr_map;
-	printk(KERN_INFO "modified physical RAM map:\n");
-	print_memory_map("modified");
-}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 9f51e1ea9e82..84fd9f2a28ff 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -98,17 +98,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
 
 }
 
-static void __init ati_bugs(int num, int slot, int func)
-{
-#ifdef CONFIG_X86_IO_APIC
-	if (timer_over_8254 == 1) {
-		timer_over_8254 = 0;
-		printk(KERN_INFO
-		"ATI board detected. Disabling timer routing over 8254.\n");
-	}
-#endif
-}
-
 #define QFLAG_APPLY_ONCE 	0x1
 #define QFLAG_APPLIED		0x2
 #define QFLAG_DONE		(QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -126,8 +115,6 @@ static struct chipset early_qrk[] __initdata = {
 	  PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs },
 	{ PCI_VENDOR_ID_VIA, PCI_ANY_ID,
 	  PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
-	{ PCI_VENDOR_ID_ATI, PCI_ANY_ID,
-	  PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs },
 	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
 	  PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
 	{}
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 77d424cf68b3..473c89fe5073 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -213,6 +213,48 @@ unsigned long efi_get_time(void)
 		      eft.minute, eft.second);
 }
 
+/*
+ * Tell the kernel about the EFI memory map.  This might include
+ * more than the max 128 entries that can fit in the e820 legacy
+ * (zeropage) memory map.
+ */
+
+static void __init add_efi_memmap(void)
+{
+	void *cursor = memmap.map;
+
+	/* Step by desc_size: it may differ from sizeof(efi_memory_desc_t). */
+	while (cursor < memmap.map_end) {
+		efi_memory_desc_t *desc = cursor;
+		unsigned long long base = desc->phys_addr;
+		unsigned long long bytes = desc->num_pages << EFI_PAGE_SHIFT;
+		int type = (desc->attribute & EFI_MEMORY_WB) ?
+				E820_RAM : E820_RESERVED;
+
+		e820_add_region(base, bytes, type);
+		cursor += memmap.desc_size;
+	}
+	/* Let sanitize_e820_map() tidy the table after the bulk insert. */
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+}
+
+void __init efi_reserve_early(void)
+{
+	/* Physical address of the EFI memory map, taken from boot_params. */
+	unsigned long map_phys = boot_params.efi_info.efi_memmap;
+
+#ifdef CONFIG_X86_64
+	map_phys += (__u64)boot_params.efi_info.efi_memmap_hi << 32;
+#endif
+	memmap.phys_map = (void *)map_phys;
+	memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
+	memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
+	memmap.nr_map = boot_params.efi_info.efi_memmap_size /
+			boot_params.efi_info.efi_memdesc_size;
+	reserve_early(map_phys, map_phys + memmap.nr_map * memmap.desc_size,
+		      "EFI memmap");
+}
+
 #if EFI_DEBUG
 static void __init print_efi_memmap(void)
 {
@@ -242,21 +284,11 @@ void __init efi_init(void)
 	int i = 0;
 	void *tmp;
 
-#ifdef CONFIG_X86_32
 	efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
-	memmap.phys_map = (void *)boot_params.efi_info.efi_memmap;
-#else
-	efi_phys.systab = (efi_system_table_t *)
-		(boot_params.efi_info.efi_systab |
-		 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
-	memmap.phys_map = (void *)
-		(boot_params.efi_info.efi_memmap |
-		 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
+#ifdef CONFIG_X86_64
+	efi_phys.systab = (void *)efi_phys.systab +
+		((__u64)boot_params.efi_info.efi_systab_hi<<32);
 #endif
-	memmap.nr_map = boot_params.efi_info.efi_memmap_size /
-		boot_params.efi_info.efi_memdesc_size;
-	memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
-	memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
 
 	efi.systab = early_ioremap((unsigned long)efi_phys.systab,
 				   sizeof(efi_system_table_t));
@@ -370,6 +402,7 @@ void __init efi_init(void)
 	if (memmap.desc_size != sizeof(efi_memory_desc_t))
 		printk(KERN_WARNING "Kernel-defined memdesc"
 		       "doesn't match the one from EFI!\n");
+	add_efi_memmap();
 
 	/* Setup for EFI runtime service */
 	reboot_type = BOOT_EFI;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d561dd5f1e62..652c5287215f 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -97,14 +97,7 @@ void __init efi_call_phys_epilog(void)
 	early_runtime_code_mapping_set_exec(0);
 }
 
-void __init efi_reserve_bootmem(void)
-{
-	reserve_bootmem_generic((unsigned long)memmap.phys_map,
-				memmap.nr_map * memmap.desc_size,
-				BOOTMEM_DEFAULT);
-}
-
-void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
 {
 	static unsigned pages_mapped __initdata;
 	unsigned i, pages;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c778e4fa55a2..159a1c76d2bd 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -51,7 +51,7 @@
 #include <asm/percpu.h>
 #include <asm/dwarf2.h>
 #include <asm/processor-flags.h>
-#include "irq_vectors.h"
+#include <asm/irq_vectors.h>
 
 /*
  * We use macros for low-level operations which need to be overridden
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..e4c5f951e68d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -420,7 +420,6 @@ END(\label)
 	PTREGSCALL stub_clone, sys_clone, %r8
 	PTREGSCALL stub_fork, sys_fork, %rdi
 	PTREGSCALL stub_vfork, sys_vfork, %rdi
-	PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
 	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
 	PTREGSCALL stub_iopl, sys_iopl, %rsi
 
@@ -926,11 +925,11 @@ error_kernelspace:
 	   iret run with kernel gs again, so don't set the user space flag.
 	   B stepping K8s sometimes report an truncated RIP for IRET 
 	   exceptions returning to compat mode. Check for these here too. */
-	leaq irq_return(%rip),%rbp
-	cmpq %rbp,RIP(%rsp) 
+	leaq irq_return(%rip),%rcx
+	cmpq %rcx,RIP(%rsp)
 	je   error_swapgs
-	movl %ebp,%ebp	/* zero extend */
-	cmpq %rbp,RIP(%rsp) 
+	movl %ecx,%ecx	/* zero extend */
+	cmpq %rcx,RIP(%rsp)
 	je   error_swapgs
 	cmpq $gs_change,RIP(%rsp)
         je   error_swapgs
@@ -1120,10 +1119,6 @@ ENTRY(coprocessor_segment_overrun)
 	zeroentry do_coprocessor_segment_overrun
 END(coprocessor_segment_overrun)
 
-ENTRY(reserved)
-	zeroentry do_reserved
-END(reserved)
-
 	/* runs on exception stack */
 ENTRY(double_fault)
 	XCPT_FRAME
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index cbaaf69bedb2..1fa8be5bd217 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -51,7 +51,7 @@ void __init setup_apic_routing(void)
 	else
 #endif
 
-	if (num_possible_cpus() <= 8)
+	if (max_physical_apicid < 8)
 		genapic = &apic_flat;
 	else
 		genapic = &apic_physflat;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index ebf13908a743..45e84acca8a9 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -5,7 +5,7 @@
  *
  * SGI UV APIC functions (note: not an Intel compatible APIC)
  *
- * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
  */
 
 #include <linux/threads.h>
@@ -55,37 +55,37 @@ static cpumask_t uv_vector_allocation_domain(int cpu)
 int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
 {
 	unsigned long val;
-	int nasid;
+	int pnode;
 
-	nasid = uv_apicid_to_nasid(phys_apicid);
+	pnode = uv_apicid_to_pnode(phys_apicid);
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
 	    (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 	    APIC_DM_INIT;
-	uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 	mdelay(10);
 
 	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
 	    (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
 	    (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
 	    APIC_DM_STARTUP;
-	uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 	return 0;
 }
 
 static void uv_send_IPI_one(int cpu, int vector)
 {
 	unsigned long val, apicid, lapicid;
-	int nasid;
+	int pnode;
 
 	apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */
 	lapicid = apicid & 0x3f;		/* ZZZ macro needed */
-	nasid = uv_apicid_to_nasid(apicid);
+	pnode = uv_apicid_to_pnode(apicid);
 	val =
 	    (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
 					      UVH_IPI_INT_APIC_ID_SHFT) |
 	    (vector << UVH_IPI_INT_VECTOR_SHFT);
-	uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 }
 
 static void uv_send_IPI_mask(cpumask_t mask, int vector)
@@ -159,39 +159,81 @@ struct genapic apic_x2apic_uv_x = {
 	.phys_pkg_id = phys_pkg_id,	/* Fixme ZZZ */
 };
 
-static __cpuinit void set_x2apic_extra_bits(int nasid)
+static __cpuinit void set_x2apic_extra_bits(int pnode)
 {
-	__get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6);
+	__get_cpu_var(x2apic_extra_bits) = (pnode << 6);
 }
 
 /*
  * Called on boot cpu.
  */
+static __init int boot_pnode_to_blade(int pnode)
+{
+	int idx = 0;
+
+	for (; idx < uv_num_possible_blades(); idx++)
+		if (uv_blade_info[idx].pnode == pnode)
+			return idx;
+	BUG();	/* no entry in uv_blade_info[] carries this pnode */
+}
+
+struct redir_addr {
+	unsigned long redirect;	/* a ..._REDIRECT_CONFIG_n_MMR value */
+	unsigned long alias;	/* a UVH_SI_ALIASn_OVERLAY_CONFIG value */
+};
+
+#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
+
+static __initdata struct redir_addr redir_addrs[] = {
+	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
+	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
+	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
+};
+
+static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
+{
+	union uvh_rh_gam_alias210_redirect_config_2_mmr_u redir_cfg;
+	union uvh_si_alias0_overlay_config_u overlay;
+	int n;
+
+	for (n = 0; n < ARRAY_SIZE(redir_addrs); n++) {
+		overlay.v = uv_read_local_mmr(redir_addrs[n].alias);
+		if (overlay.s.base != 0)
+			continue;	/* only an alias based at 0 qualifies */
+		*size = (1UL << overlay.s.m_alias);
+		redir_cfg.v = uv_read_local_mmr(redir_addrs[n].redirect);
+		*base = (unsigned long)redir_cfg.s.dest_base << DEST_SHIFT;
+		return;
+	}
+	BUG();	/* no alias overlay has a zero base */
+}
+
 static __init void uv_system_init(void)
 {
 	union uvh_si_addr_map_config_u m_n_config;
-	int bytes, nid, cpu, lcpu, nasid, last_nasid, blade;
-	unsigned long mmr_base;
+	union uvh_node_id_u node_id;
+	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
+	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+	unsigned long mmr_base, present;
 
 	m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
+	m_val = m_n_config.s.m_skt;
+	n_val = m_n_config.s.n_skt;
 	mmr_base =
 	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
 	    ~UV_MMR_ENABLE;
 	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
 
-	last_nasid = -1;
-	for_each_possible_cpu(cpu) {
-		nid = cpu_to_node(cpu);
-		nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
-		if (nasid != last_nasid)
-			uv_possible_blades++;
-		last_nasid = nasid;
-	}
+	for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
+		uv_possible_blades +=
+		  hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
 	printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
 
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
 	uv_blade_info = alloc_bootmem_pages(bytes);
 
+	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
+
 	bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
 	uv_node_to_blade = alloc_bootmem_pages(bytes);
 	memset(uv_node_to_blade, 255, bytes);
@@ -200,43 +242,56 @@ static __init void uv_system_init(void)
 	uv_cpu_to_blade = alloc_bootmem_pages(bytes);
 	memset(uv_cpu_to_blade, 255, bytes);
 
-	last_nasid = -1;
-	blade = -1;
-	lcpu = -1;
-	for_each_possible_cpu(cpu) {
-		nid = cpu_to_node(cpu);
-		nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
-		if (nasid != last_nasid) {
-			blade++;
-			lcpu = -1;
-			uv_blade_info[blade].nr_posible_cpus = 0;
+	blade = 0;
+	for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) {
+		present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8);
+		for (j = 0; j < 64; j++) {
+			if (!test_bit(j, &present))
+				continue;
+			uv_blade_info[blade].pnode = (i * 64 + j);
+			uv_blade_info[blade].nr_possible_cpus = 0;
 			uv_blade_info[blade].nr_online_cpus = 0;
+			blade++;
 		}
-		last_nasid = nasid;
-		lcpu++;
+	}
 
-		uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt;
-		uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt;
+	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+	gnode_upper = (((unsigned long)node_id.s.node_id) &
+		       ~((1 << n_val) - 1)) << m_val;
+
+	for_each_present_cpu(cpu) {
+		nid = cpu_to_node(cpu);
+		pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
+		blade = boot_pnode_to_blade(pnode);
+		lcpu = uv_blade_info[blade].nr_possible_cpus;
+		uv_blade_info[blade].nr_possible_cpus++;
+
+		uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
+		uv_cpu_hub_info(cpu)->lowmem_remap_top =
+					lowmem_redir_base + lowmem_redir_size;
+		uv_cpu_hub_info(cpu)->m_val = m_val;
+		uv_cpu_hub_info(cpu)->n_val = n_val;
 		uv_cpu_hub_info(cpu)->numa_blade_id = blade;
 		uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
-		uv_cpu_hub_info(cpu)->local_nasid = nasid;
-		uv_cpu_hub_info(cpu)->gnode_upper =
-		    nasid & ~((1 << uv_hub_info->n_val) - 1);
+		uv_cpu_hub_info(cpu)->pnode = pnode;
+		uv_cpu_hub_info(cpu)->pnode_mask = (1 << n_val) - 1;
+		uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
+		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
 		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
 		uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
-		uv_blade_info[blade].nasid = nasid;
-		uv_blade_info[blade].nr_posible_cpus++;
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
 
-		printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n",
-		       cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid);
-		printk(KERN_DEBUG "UV   lcpu %d, blade %d\n", lcpu, blade);
+		printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, pnode %d, nid %d, "
+			"lcpu %d, blade %d\n",
+			cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
+			lcpu, blade);
 	}
 }
 
 /*
  * Called on each cpu to initialize the per_cpu UV data area.
+ * ZZZ: hotplug is not supported yet.
  */
 void __cpuinit uv_cpu_init(void)
 {
@@ -246,5 +301,5 @@ void __cpuinit uv_cpu_init(void)
 	uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
 
 	if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
-		set_x2apic_extra_bits(uv_hub_info->local_nasid);
+		set_x2apic_extra_bits(uv_hub_info->pnode);
 }
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
new file mode 100644
index 000000000000..a727c0b9819c
--- /dev/null
+++ b/arch/x86/kernel/head.c
@@ -0,0 +1,73 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <asm/setup.h>
+#include <asm/bios_ebda.h>
+
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+void __init reserve_ebda_region(void)
+{
+	unsigned int lowmem, ebda_addr;
+
+	/* To determine the position of the EBDA and the */
+	/* end of conventional memory, we need to look at */
+	/* the BIOS data area. In a paravirtual environment */
+	/* that area is absent. We'll just have to assume */
+	/* that the paravirt case can handle memory setup */
+	/* correctly, without our help. */
+	if (paravirt_enabled())
+		return;
+
+	/* end of low (conventional) memory */
+	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+	lowmem <<= 10;
+
+	/* start of EBDA area */
+	ebda_addr = get_bios_ebda();
+
+	/* Fixup: bios puts an EBDA in the top 64K segment */
+	/* of conventional memory, but does not adjust lowmem. */
+	if ((lowmem - ebda_addr) <= 0x10000)
+		lowmem = ebda_addr;
+
+	/* Fixup: bios does not report an EBDA at all. */
+	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+		lowmem = 0x9f000;
+
+	/* Paranoia: should never happen, but... */
+	if ((lowmem == 0) || (lowmem >= 0x100000))
+		lowmem = 0x9f000;
+
+	/* reserve all memory between lowmem and the 1MB mark */
+	reserve_early(lowmem, 0x100000, "BIOS reserved");
+}
+
+void __init reserve_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+	char buf[32];	/* region name handed to reserve_early() */
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {	/* walk the list until a zero link */
+		data = early_ioremap(pa_data, sizeof(*data));
+		snprintf(buf, sizeof(buf), "setup data %x", data->type);
+		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+		pa_data = data->next;	/* fetch the link before unmapping */
+		early_iounmap(data, sizeof(*data));
+	}
+}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3db059058927..fa1d25dd83e3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,7 +8,34 @@
 #include <linux/init.h>
 #include <linux/start_kernel.h>
 
+#include <asm/setup.h>
+#include <asm/sections.h>
+#include <asm/e820.h>
+#include <asm/bios_ebda.h>
+
 void __init i386_start_kernel(void)
 {
+	reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	/* Reserve INITRD */
+	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
+		u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+		u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+		u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
+	}
+#endif
+	reserve_early(init_pg_tables_start, init_pg_tables_end,
+			"INIT_PG_TABLE");
+
+	reserve_ebda_region();
+
+	/*
+	 * At this point everything still needed from the boot loader
+	 * or BIOS or kernel text should be early reserved or marked not
+	 * RAM in e820. All other memory is free game.
+	 */
+
 	start_kernel();
 }
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 4bcb61cd9fcd..c970929bb15d 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -65,74 +65,6 @@ static void __init copy_bootdata(char *real_mode_data)
 	}
 }
 
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-	unsigned int lowmem, ebda_addr;
-
-	/* To determine the position of the EBDA and the */
-	/* end of conventional memory, we need to look at */
-	/* the BIOS data area. In a paravirtual environment */
-	/* that area is absent. We'll just have to assume */
-	/* that the paravirt case can handle memory setup */
-	/* correctly, without our help. */
-	if (paravirt_enabled())
-		return;
-
-	/* end of low (conventional) memory */
-	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
-	lowmem <<= 10;
-
-	/* start of EBDA area */
-	ebda_addr = get_bios_ebda();
-
-	/* Fixup: bios puts an EBDA in the top 64K segment */
-	/* of conventional memory, but does not adjust lowmem. */
-	if ((lowmem - ebda_addr) <= 0x10000)
-		lowmem = ebda_addr;
-
-	/* Fixup: bios does not report an EBDA at all. */
-	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
-	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
-		lowmem = 0x9f000;
-
-	/* Paranoia: should never happen, but... */
-	if ((lowmem == 0) || (lowmem >= 0x100000))
-		lowmem = 0x9f000;
-
-	/* reserve all memory between lowmem and the 1MB mark */
-	reserve_early(lowmem, 0x100000, "BIOS reserved");
-}
-
-static void __init reserve_setup_data(void)
-{
-	struct setup_data *data;
-	unsigned long pa_data;
-	char buf[32];
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, sizeof(*data));
-		sprintf(buf, "setup data %x", data->type);
-		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
-		pa_data = data->next;
-		early_iounmap(data, sizeof(*data));
-	}
-}
-
 void __init x86_64_start_kernel(char * real_mode_data)
 {
 	int i;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f7357cc0162c..b98b338aae1a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -194,6 +194,7 @@ default_entry:
 	xorl %ebx,%ebx				/* %ebx is kept at zero */
 
 	movl $pa(pg0), %edi
+	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_pmd), %edx
 	movl $PTE_ATTR, %eax
 10:
@@ -219,6 +220,8 @@ default_entry:
 	jb 10b
 1:
 	movl %edi,pa(init_pg_tables_end)
+	shrl $12, %eax
+	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
 	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
@@ -228,6 +231,7 @@ default_entry:
 page_pde_offset = (__PAGE_OFFSET >> 20);
 
 	movl $pa(pg0), %edi
+	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_dir), %edx
 	movl $PTE_ATTR, %eax
 10:
@@ -249,6 +253,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	cmpl %ebp,%eax
 	jb 10b
 	movl %edi,pa(init_pg_tables_end)
+	shrl $12, %eax
+	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
 	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b817974ef942..263b9d14753e 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -18,6 +18,7 @@
 #include <asm/page.h>
 #include <asm/msr.h>
 #include <asm/cache.h>
+#include <asm/processor-flags.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/asm-offsets.h>
@@ -154,9 +155,7 @@ ENTRY(secondary_startup_64)
 	 */
 
 	/* Enable PAE mode and PGE */
-	xorq	%rax, %rax
-	btsq	$5, %rax
-	btsq	$7, %rax
+	movl	$(X86_CR4_PAE | X86_CR4_PGE), %eax
 	movq	%rax, %cr4
 
 	/* Setup early boot stage 4 level pagetables. */
@@ -184,14 +183,10 @@ ENTRY(secondary_startup_64)
 1:	wrmsr				/* Make changes effective */
 
 	/* Setup cr0 */
-#define CR0_PM				1		/* protected mode */
-#define CR0_MP				(1<<1)
-#define CR0_ET				(1<<4)
-#define CR0_NE				(1<<5)
-#define CR0_WP				(1<<16)
-#define CR0_AM				(1<<18)
-#define CR0_PAGING 			(1<<31)
-	movl $CR0_PM|CR0_MP|CR0_ET|CR0_NE|CR0_WP|CR0_AM|CR0_PAGING,%eax
+#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+			 X86_CR0_PG)
+	movl	$CR0_STATE, %eax
 	/* Make changes effective */
 	movq	%rax, %cr0
 
@@ -327,11 +322,11 @@ early_idt_ripmsg:
 ENTRY(name)
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
-#define PMDS(START, PERM, COUNT)		\
-	i = 0 ;					\
-	.rept (COUNT) ;				\
-	.quad	(START) + (i << 21) + (PERM) ;	\
-	i = i + 1 ;				\
+#define PMDS(START, PERM, COUNT)			\
+	i = 0 ;						\
+	.rept (COUNT) ;					\
+	.quad	(START) + (i << PMD_SHIFT) + (PERM) ;	\
+	i = i + 1 ;					\
 	.endr
 
 	/*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 9b5cfcdfc426..ea230ec69057 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -17,7 +17,7 @@
 
 /* FSEC = 10^-15
    NSEC = 10^-9 */
-#define FSEC_PER_NSEC	1000000
+#define FSEC_PER_NSEC	1000000L
 
 /*
  * HPET address is set in acpi/boot.c, when an ACPI entry exists
@@ -206,20 +206,19 @@ static void hpet_enable_legacy_int(void)
 
 static void hpet_legacy_clockevent_register(void)
 {
-	uint64_t hpet_freq;
-
 	/* Start HPET legacy interrupts */
 	hpet_enable_legacy_int();
 
 	/*
-	 * The period is a femto seconds value. We need to calculate the
-	 * scaled math multiplication factor for nanosecond to hpet tick
-	 * conversion.
+	 * The mult factor is defined as (include/linux/clockchips.h)
+	 *  mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
+	 * hpet_period is in units of femtoseconds (per cycle), so
+	 *  mult/2^shift = cyc/ns = 10^6/hpet_period
+	 *  mult = (10^6 * 2^shift)/hpet_period
+	 *  mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
 	 */
-	hpet_freq = 1000000000000000ULL;
-	do_div(hpet_freq, hpet_period);
-	hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
-				      NSEC_PER_SEC, hpet_clockevent.shift);
+	hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
+				      hpet_period, hpet_clockevent.shift);
 	/* Calculate the min / max delta */
 	hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
 							   &hpet_clockevent);
@@ -324,7 +323,7 @@ static struct clocksource clocksource_hpet = {
 
 static int hpet_clocksource_register(void)
 {
-	u64 tmp, start, now;
+	u64 start, now;
 	cycle_t t1;
 
 	/* Start the counter */
@@ -351,21 +350,15 @@ static int hpet_clocksource_register(void)
 		return -ENODEV;
 	}
 
-	/* Initialize and register HPET clocksource
-	 *
-	 * hpet period is in femto seconds per cycle
-	 * so we need to convert this to ns/cyc units
-	 * approximated by mult/2^shift
-	 *
-	 *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
-	 *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
-	 *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
-	 *  (fsec/cyc << shift)/1000000 = mult
-	 *  (hpet_period << shift)/FSEC_PER_NSEC = mult
+	/*
+	 * The definition of mult is (include/linux/clocksource.h)
+	 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
+	 * so we first need to convert hpet_period to ns/cyc units:
+	 *  mult/2^shift = ns/cyc = hpet_period/10^6
+	 *  mult = (hpet_period * 2^shift)/10^6
+	 *  mult = (hpet_period << shift)/FSEC_PER_NSEC
 	 */
-	tmp = (u64)hpet_period << HPET_SHIFT;
-	do_div(tmp, FSEC_PER_NSEC);
-	clocksource_hpet.mult = (u32)tmp;
+	clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
 
 	clocksource_register(&clocksource_hpet);
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 95e80e5033c3..eb9ddd8efb82 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -162,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
 	int ret;
 
 	if (!cpu_has_fxsr)
-		return -EIO;
+		return -ENODEV;
 
 	ret = init_fpu(target);
 	if (ret)
@@ -179,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	int ret;
 
 	if (!cpu_has_fxsr)
-		return -EIO;
+		return -ENODEV;
 
 	ret = init_fpu(target);
 	if (ret)
diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259.c
index fe631967d625..dc92b49d9204 100644
--- a/arch/x86/kernel/i8259_32.c
+++ b/arch/x86/kernel/i8259.c
@@ -1,8 +1,10 @@
+#include <linux/linkage.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
+#include <linux/timex.h>
 #include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/init.h>
@@ -10,10 +12,12 @@
 #include <linux/sysdev.h>
 #include <linux/bitops.h>
 
+#include <asm/acpi.h>
 #include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/io.h>
 #include <asm/timer.h>
+#include <asm/hw_irq.h>
 #include <asm/pgtable.h>
 #include <asm/delay.h>
 #include <asm/desc.h>
@@ -32,7 +36,7 @@ static int i8259A_auto_eoi;
 DEFINE_SPINLOCK(i8259A_lock);
 static void mask_and_ack_8259A(unsigned int);
 
-static struct irq_chip i8259A_chip = {
+struct irq_chip i8259A_chip = {
 	.name		= "XT-PIC",
 	.mask		= disable_8259A_irq,
 	.disable	= disable_8259A_irq,
@@ -125,14 +129,14 @@ static inline int i8259A_irq_real(unsigned int irq)
 	int irqmask = 1<<irq;
 
 	if (irq < 8) {
-		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
+		outb(0x0B, PIC_MASTER_CMD);	/* ISR register */
 		value = inb(PIC_MASTER_CMD) & irqmask;
-		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
+		outb(0x0A, PIC_MASTER_CMD);	/* back to the IRR register */
 		return value;
 	}
-	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
+	outb(0x0B, PIC_SLAVE_CMD);	/* ISR register */
 	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
+	outb(0x0A, PIC_SLAVE_CMD);	/* back to the IRR register */
 	return value;
 }
 
@@ -171,12 +175,14 @@ handle_real_irq:
 	if (irq & 8) {
 		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
 		outb(cached_slave_mask, PIC_SLAVE_IMR);
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */
+		/* 'Specific EOI' to slave */
+		outb(0x60+(irq&7), PIC_SLAVE_CMD);
+		/* 'Specific EOI' to master-IRQ2 */
+		outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD);
 	} else {
 		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
 		outb(cached_master_mask, PIC_MASTER_IMR);
-		outb(0x60+irq,PIC_MASTER_CMD);	/* 'Specific EOI to master */
+		outb(0x60+irq, PIC_MASTER_CMD);	/* 'Specific EOI' to master */
 	}
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 	return;
@@ -199,7 +205,8 @@ spurious_8259A_irq:
 		 * lets ACK and report it. [once per IRQ]
 		 */
 		if (!(spurious_irq_mask & irqmask)) {
-			printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+			printk(KERN_DEBUG
+			       "spurious 8259A interrupt: IRQ%d.\n", irq);
 			spurious_irq_mask |= irqmask;
 		}
 		atomic_inc(&irq_err_count);
@@ -290,17 +297,28 @@ void init_8259A(int auto_eoi)
 	 * outb_pic - this has to work on a wide range of PC hardware.
 	 */
 	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
-	outb_pic(0x20 + 0, PIC_MASTER_IMR);	/* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
-	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);	/* 8259A-1 (the master) has a slave on IR2 */
+
+	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
+	                       to 0x20-0x27 on i386 */
+	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+
+	/* 8259A-1 (the master) has a slave on IR2 */
+	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
+
 	if (auto_eoi)	/* master does Auto EOI */
 		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
 	else		/* master expects normal EOI */
 		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
 
 	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
-	outb_pic(0x20 + 8, PIC_SLAVE_IMR);	/* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
-	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);	/* 8259A-2 is a slave on master's IR2 */
-	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */
+
+	/* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
+	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+	/* 8259A-2 is a slave on master's IR2 */
+	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
+	/* (slave's support for AEOI in flat mode is to be investigated) */
+	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
+
 	if (auto_eoi)
 		/*
 		 * In AEOI mode we just have to mask the interrupt
@@ -317,93 +335,3 @@ void init_8259A(int auto_eoi)
 
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 }
-
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
- 
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
-	extern void math_error(void __user *);
-	outb(0,0xF0);
-	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
-		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
-	return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
-	.handler = math_error_irq,
-	.mask = CPU_MASK_NONE,
-	.name = "fpu",
-};
-
-void __init init_ISA_irqs (void)
-{
-	int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	init_bsp_APIC();
-#endif
-	init_8259A(0);
-
-	/*
-	 * 16 old-style INTA-cycle interrupts:
-	 */
-	for (i = 0; i < 16; i++) {
-		set_irq_chip_and_handler_name(i, &i8259A_chip,
-					      handle_level_irq, "XT");
-	}
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-	/* all the set up before the call gates are initialised */
-	pre_intr_init_hook();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (i >= NR_IRQS)
-			break;
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (!test_bit(vector, used_vectors))
-			set_intr_gate(vector, interrupt[i]);
-	}
-
-	/* setup after call gates are initialised (usually add in
-	 * the architecture specific gates)
-	 */
-	intr_init_hook();
-
-	/*
-	 * External FPU? Set up irq13 if so, for
-	 * original braindamaged IBM FERR coupling.
-	 */
-	if (boot_cpu_data.hard_math && !cpu_has_fpu)
-		setup_irq(FPU_IRQ, &fpu_irq);
-
-	irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c
deleted file mode 100644
index fa57a1568508..000000000000
--- a/arch/x86/kernel/i8259_64.c
+++ /dev/null
@@ -1,512 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-
-#include <asm/acpi.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/delay.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define BI(x,y) \
-	BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
-	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
-	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
-	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
-	BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-				      BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
-	IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
-	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
-	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
-	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
-	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
-					  IRQLIST_16(0x2), IRQLIST_16(0x3),
-	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
-	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
-	IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-/*
- * This is the 'legacy' 8259A Programmable Interrupt Controller,
- * present in the majority of PC/AT boxes.
- * plus some generic x86 specific things if generic specifics makes
- * any sense at all.
- * this file should become arch/i386/kernel/irq.c when the old irq.c
- * moves to arch independent land
- */
-
-static int i8259A_auto_eoi;
-DEFINE_SPINLOCK(i8259A_lock);
-static void mask_and_ack_8259A(unsigned int);
-
-static struct irq_chip i8259A_chip = {
-	.name		= "XT-PIC",
-	.mask		= disable_8259A_irq,
-	.disable	= disable_8259A_irq,
-	.unmask		= enable_8259A_irq,
-	.mask_ack	= mask_and_ack_8259A,
-};
-
-/*
- * 8259A PIC functions to handle ISA devices:
- */
-
-/*
- * This contains the irq mask for both 8259A irq controllers,
- */
-unsigned int cached_irq_mask = 0xffff;
-
-/*
- * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
- * boards the timer interrupt is not really connected to any IO-APIC pin,
- * it's fed to the master 8259A's IR0 line only.
- *
- * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
- * this 'mixed mode' IRQ handling costs nothing because it's only used
- * at IRQ setup time.
- */
-unsigned long io_apic_irqs;
-
-void disable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask |= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-void enable_8259A_irq(unsigned int irq)
-{
-	unsigned int mask = ~(1 << irq);
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	cached_irq_mask &= mask;
-	if (irq & 8)
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-	else
-		outb(cached_master_mask, PIC_MASTER_IMR);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-int i8259A_irq_pending(unsigned int irq)
-{
-	unsigned int mask = 1<<irq;
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	if (irq < 8)
-		ret = inb(PIC_MASTER_CMD) & mask;
-	else
-		ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-
-	return ret;
-}
-
-void make_8259A_irq(unsigned int irq)
-{
-	disable_irq_nosync(irq);
-	io_apic_irqs &= ~(1<<irq);
-	set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
-				      "XT");
-	enable_irq(irq);
-}
-
-/*
- * This function assumes to be called rarely. Switching between
- * 8259A registers is slow.
- * This has to be protected by the irq controller spinlock
- * before being called.
- */
-static inline int i8259A_irq_real(unsigned int irq)
-{
-	int value;
-	int irqmask = 1<<irq;
-
-	if (irq < 8) {
-		outb(0x0B,PIC_MASTER_CMD);	/* ISR register */
-		value = inb(PIC_MASTER_CMD) & irqmask;
-		outb(0x0A,PIC_MASTER_CMD);	/* back to the IRR register */
-		return value;
-	}
-	outb(0x0B,PIC_SLAVE_CMD);	/* ISR register */
-	value = inb(PIC_SLAVE_CMD) & (irqmask >> 8);
-	outb(0x0A,PIC_SLAVE_CMD);	/* back to the IRR register */
-	return value;
-}
-
-/*
- * Careful! The 8259A is a fragile beast, it pretty
- * much _has_ to be done exactly like this (mask it
- * first, _then_ send the EOI, and the order of EOI
- * to the two 8259s is important!
- */
-static void mask_and_ack_8259A(unsigned int irq)
-{
-	unsigned int irqmask = 1 << irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-	/*
-	 * Lightweight spurious IRQ detection. We do not want
-	 * to overdo spurious IRQ handling - it's usually a sign
-	 * of hardware problems, so we only do the checks we can
-	 * do without slowing down good hardware unnecessarily.
-	 *
-	 * Note that IRQ7 and IRQ15 (the two spurious IRQs
-	 * usually resulting from the 8259A-1|2 PICs) occur
-	 * even if the IRQ is masked in the 8259A. Thus we
-	 * can check spurious 8259A IRQs without doing the
-	 * quite slow i8259A_irq_real() call for every IRQ.
-	 * This does not cover 100% of spurious interrupts,
-	 * but should be enough to warn the user that there
-	 * is something bad going on ...
-	 */
-	if (cached_irq_mask & irqmask)
-		goto spurious_8259A_irq;
-	cached_irq_mask |= irqmask;
-
-handle_real_irq:
-	if (irq & 8) {
-		inb(PIC_SLAVE_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_slave_mask, PIC_SLAVE_IMR);
-		/* 'Specific EOI' to slave */
-		outb(0x60+(irq&7),PIC_SLAVE_CMD);
-		 /* 'Specific EOI' to master-IRQ2 */
-		outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD);
-	} else {
-		inb(PIC_MASTER_IMR);	/* DUMMY - (do we need this?) */
-		outb(cached_master_mask, PIC_MASTER_IMR);
-		/* 'Specific EOI' to master */
-		outb(0x60+irq,PIC_MASTER_CMD);
-	}
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-	return;
-
-spurious_8259A_irq:
-	/*
-	 * this is the slow path - should happen rarely.
-	 */
-	if (i8259A_irq_real(irq))
-		/*
-		 * oops, the IRQ _is_ in service according to the
-		 * 8259A - not spurious, go handle it.
-		 */
-		goto handle_real_irq;
-
-	{
-		static int spurious_irq_mask;
-		/*
-		 * At this point we can be sure the IRQ is spurious,
-		 * lets ACK and report it. [once per IRQ]
-		 */
-		if (!(spurious_irq_mask & irqmask)) {
-			printk(KERN_DEBUG
-			       "spurious 8259A interrupt: IRQ%d.\n", irq);
-			spurious_irq_mask |= irqmask;
-		}
-		atomic_inc(&irq_err_count);
-		/*
-		 * Theoretically we do not have to handle this IRQ,
-		 * but in Linux this does not cause problems and is
-		 * simpler for us.
-		 */
-		goto handle_real_irq;
-	}
-}
-
-static char irq_trigger[2];
-/**
- * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
- */
-static void restore_ELCR(char *trigger)
-{
-	outb(trigger[0], 0x4d0);
-	outb(trigger[1], 0x4d1);
-}
-
-static void save_ELCR(char *trigger)
-{
-	/* IRQ 0,1,2,8,13 are marked as reserved */
-	trigger[0] = inb(0x4d0) & 0xF8;
-	trigger[1] = inb(0x4d1) & 0xDE;
-}
-
-static int i8259A_resume(struct sys_device *dev)
-{
-	init_8259A(i8259A_auto_eoi);
-	restore_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
-{
-	save_ELCR(irq_trigger);
-	return 0;
-}
-
-static int i8259A_shutdown(struct sys_device *dev)
-{
-	/* Put the i8259A into a quiescent state that
-	 * the kernel initialization code can get it
-	 * out of.
-	 */
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */
-	return 0;
-}
-
-static struct sysdev_class i8259_sysdev_class = {
-	.name = "i8259",
-	.suspend = i8259A_suspend,
-	.resume = i8259A_resume,
-	.shutdown = i8259A_shutdown,
-};
-
-static struct sys_device device_i8259A = {
-	.id	= 0,
-	.cls	= &i8259_sysdev_class,
-};
-
-static int __init i8259A_init_sysfs(void)
-{
-	int error = sysdev_class_register(&i8259_sysdev_class);
-	if (!error)
-		error = sysdev_register(&device_i8259A);
-	return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
-void init_8259A(int auto_eoi)
-{
-	unsigned long flags;
-
-	i8259A_auto_eoi = auto_eoi;
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-
-	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
-	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
-
-	/*
-	 * outb_pic - this has to work on a wide range of PC hardware.
-	 */
-	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
-	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
-	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
-	/* 8259A-1 (the master) has a slave on IR2 */
-	outb_pic(0x04, PIC_MASTER_IMR);
-	if (auto_eoi)	/* master does Auto EOI */
-		outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR);
-	else		/* master expects normal EOI */
-		outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR);
-
-	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
-	/* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */
-	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
-	/* 8259A-2 is a slave on master's IR2 */
-	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
-	/* (slave's support for AEOI in flat mode is to be investigated) */
-	outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR);
-
-	if (auto_eoi)
-		/*
-		 * In AEOI mode we just have to mask the interrupt
-		 * when acking.
-		 */
-		i8259A_chip.mask_ack = disable_8259A_irq;
-	else
-		i8259A_chip.mask_ack = mask_and_ack_8259A;
-
-	udelay(100);		/* wait for 8259A to initialize */
-
-	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
-	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
-
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-}
-
-
-
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
-	.handler = no_action,
-	.mask = CPU_MASK_NONE,
-	.name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... IRQ0_VECTOR - 1] = -1,
-	[IRQ0_VECTOR] = 0,
-	[IRQ1_VECTOR] = 1,
-	[IRQ2_VECTOR] = 2,
-	[IRQ3_VECTOR] = 3,
-	[IRQ4_VECTOR] = 4,
-	[IRQ5_VECTOR] = 5,
-	[IRQ6_VECTOR] = 6,
-	[IRQ7_VECTOR] = 7,
-	[IRQ8_VECTOR] = 8,
-	[IRQ9_VECTOR] = 9,
-	[IRQ10_VECTOR] = 10,
-	[IRQ11_VECTOR] = 11,
-	[IRQ12_VECTOR] = 12,
-	[IRQ13_VECTOR] = 13,
-	[IRQ14_VECTOR] = 14,
-	[IRQ15_VECTOR] = 15,
-	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-void __init init_ISA_irqs (void)
-{
-	int i;
-
-	init_bsp_APIC();
-	init_8259A(0);
-
-	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].action = NULL;
-		irq_desc[i].depth = 1;
-
-		if (i < 16) {
-			/*
-			 * 16 old-style INTA-cycle interrupts:
-			 */
-			set_irq_chip_and_handler_name(i, &i8259A_chip,
-						      handle_level_irq, "XT");
-		} else {
-			/*
-			 * 'high' PCI IRQs filled in on demand
-			 */
-			irq_desc[i].chip = &no_irq_chip;
-		}
-	}
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-	init_ISA_irqs();
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
-			set_intr_gate(vector, interrupt[i]);
-	}
-
-#ifdef CONFIG_SMP
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPIs for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-	/* IPI for generic function call */
-	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-#endif
-	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-	set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
-	/* self generated IPI for local APIC timer */
-	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-	if (!acpi_ioapic)
-		setup_irq(2, &irq2);
-}
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 4dc8600d9d20..fedb3b113ace 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -58,7 +58,7 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 static DEFINE_SPINLOCK(ioapic_lock);
 static DEFINE_SPINLOCK(vector_lock);
 
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
 
 /*
  *	Is the SiS APIC rmw bug present ?
@@ -72,15 +72,21 @@ int sis_apic_bug = -1;
 int nr_ioapic_registers[MAX_IO_APICS];
 
 /* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
 static int disable_timer_pin_1 __initdata;
 
 /*
@@ -110,7 +116,7 @@ struct io_apic {
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 {
 	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+		+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
 }
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -239,7 +245,7 @@ static void __init replace_pin_at_irq(unsigned int irq,
 	}
 }
 
-static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
 {
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int pin, reg;
@@ -259,30 +265,32 @@ static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsign
 }
 
 /* mask = 1 */
-static void __mask_IO_APIC_irq (unsigned int irq)
+static void __mask_IO_APIC_irq(unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0x00010000, 0);
+	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
 }
 
 /* mask = 0 */
-static void __unmask_IO_APIC_irq (unsigned int irq)
+static void __unmask_IO_APIC_irq(unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0, 0x00010000);
+	__modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
 }
 
 /* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
+				IO_APIC_REDIR_LEVEL_TRIGGER);
 }
 
 /* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
 {
-	__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
+				IO_APIC_REDIR_MASKED);
 }
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq(unsigned int irq)
 {
 	unsigned long flags;
 
@@ -291,7 +299,7 @@ static void mask_IO_APIC_irq (unsigned int irq)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq(unsigned int irq)
 {
 	unsigned long flags;
 
@@ -303,7 +311,7 @@ static void unmask_IO_APIC_irq (unsigned int irq)
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
-	
+
 	/* Check delivery_mode to be sure we're not clearing an SMI pin */
 	entry = ioapic_read_entry(apic, pin);
 	if (entry.delivery_mode == dest_SMI)
@@ -315,7 +323,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 	ioapic_mask_entry(apic, pin);
 }
 
-static void clear_IO_APIC (void)
+static void clear_IO_APIC(void)
 {
 	int apic, pin;
 
@@ -332,7 +340,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int apicid_value;
 	cpumask_t tmp;
-	
+
 	cpus_and(tmp, cpumask, cpu_online_map);
 	if (cpus_empty(tmp))
 		tmp = TARGET_CPUS;
@@ -361,7 +369,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 # include <linux/kernel_stat.h>	/* kstat */
 # include <linux/slab.h>		/* kmalloc() */
 # include <linux/timer.h>
- 
+
 #define IRQBALANCE_CHECK_ARCH -999
 #define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
 #define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
@@ -373,14 +381,14 @@ static int physical_balance __read_mostly;
 static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
 
 static struct irq_cpu_info {
-	unsigned long * last_irq;
-	unsigned long * irq_delta;
+	unsigned long *last_irq;
+	unsigned long *irq_delta;
 	unsigned long irq;
 } irq_cpu_data[NR_CPUS];
 
 #define CPU_IRQ(cpu)		(irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu,irq) 	(irq_cpu_data[cpu].irq_delta[irq])
+#define LAST_CPU_IRQ(cpu, irq)   (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu, irq) 	(irq_cpu_data[cpu].irq_delta[irq])
 
 #define IDLE_ENOUGH(cpu,now) \
 	(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
@@ -419,8 +427,8 @@ inside:
 			if (cpu == -1)
 				cpu = NR_CPUS-1;
 		}
-	} while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
-			(search_idle && !IDLE_ENOUGH(cpu,now)));
+	} while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
+			(search_idle && !IDLE_ENOUGH(cpu, now)));
 
 	return cpu;
 }
@@ -430,15 +438,14 @@ static inline void balance_irq(int cpu, int irq)
 	unsigned long now = jiffies;
 	cpumask_t allowed_mask;
 	unsigned int new_cpu;
-		
+
 	if (irqbalance_disabled)
-		return; 
+		return;
 
 	cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
 	new_cpu = move(cpu, allowed_mask, now, 1);
-	if (cpu != new_cpu) {
+	if (cpu != new_cpu)
 		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
-	}
 }
 
 static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
@@ -450,14 +457,14 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
 			if (!irq_desc[j].action)
 				continue;
 			/* Is it a significant load ?  */
-			if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+			if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
 						useful_load_threshold)
 				continue;
 			balance_irq(i, j);
 		}
 	}
 	balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-		balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);	
+		balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
 	return;
 }
 
@@ -486,22 +493,22 @@ static void do_irq_balance(void)
 			/* Is this an active IRQ or balancing disabled ? */
 			if (!irq_desc[j].action || irq_balancing_disabled(j))
 				continue;
-			if ( package_index == i )
-				IRQ_DELTA(package_index,j) = 0;
+			if (package_index == i)
+				IRQ_DELTA(package_index, j) = 0;
 			/* Determine the total count per processor per IRQ */
 			value_now = (unsigned long) kstat_cpu(i).irqs[j];
 
 			/* Determine the activity per processor per IRQ */
-			delta = value_now - LAST_CPU_IRQ(i,j);
+			delta = value_now - LAST_CPU_IRQ(i, j);
 
 			/* Update last_cpu_irq[][] for the next time */
-			LAST_CPU_IRQ(i,j) = value_now;
+			LAST_CPU_IRQ(i, j) = value_now;
 
 			/* Ignore IRQs whose rate is less than the clock */
 			if (delta < useful_load_threshold)
 				continue;
 			/* update the load for the processor or package total */
-			IRQ_DELTA(package_index,j) += delta;
+			IRQ_DELTA(package_index, j) += delta;
 
 			/* Keep track of the higher numbered sibling as well */
 			if (i != package_index)
@@ -527,7 +534,8 @@ static void do_irq_balance(void)
 	max_cpu_irq = ULONG_MAX;
 
 tryanothercpu:
-	/* Look for heaviest loaded processor.
+	/*
+	 * Look for heaviest loaded processor.
 	 * We may come back to get the next heaviest loaded processor.
 	 * Skip processors with trivial loads.
 	 */
@@ -536,7 +544,7 @@ tryanothercpu:
 	for_each_online_cpu(i) {
 		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
-		if (max_cpu_irq <= CPU_IRQ(i)) 
+		if (max_cpu_irq <= CPU_IRQ(i))
 			continue;
 		if (tmp_cpu_irq < CPU_IRQ(i)) {
 			tmp_cpu_irq = CPU_IRQ(i);
@@ -545,8 +553,9 @@ tryanothercpu:
 	}
 
 	if (tmp_loaded == -1) {
- 	 /* In the case of small number of heavy interrupt sources, 
-	  * loading some of the cpus too much. We use Ingo's original 
+	 /*
+	  * With only a small number of heavy interrupt sources, some of
+	  * the cpus can end up loaded too much. We use Ingo's original
 	  * approach to rotate them around.
 	  */
 		if (!first_attempt && imbalance >= useful_load_threshold) {
@@ -555,13 +564,14 @@ tryanothercpu:
 		}
 		goto not_worth_the_effort;
 	}
-	
+
 	first_attempt = 0;		/* heaviest search */
 	max_cpu_irq = tmp_cpu_irq;	/* load */
 	max_loaded = tmp_loaded;	/* processor */
 	imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-	
-	/* if imbalance is less than approx 10% of max load, then
+
+	/*
+	 * if imbalance is less than approx 10% of max load, then
 	 * observe diminishing returns action. - quit
 	 */
 	if (imbalance < (max_cpu_irq >> 3))
@@ -577,26 +587,25 @@ tryanotherirq:
 		/* Is this an active IRQ? */
 		if (!irq_desc[j].action)
 			continue;
-		if (imbalance <= IRQ_DELTA(max_loaded,j))
+		if (imbalance <= IRQ_DELTA(max_loaded, j))
 			continue;
 		/* Try to find the IRQ that is closest to the imbalance
 		 * without going over.
 		 */
-		if (move_this_load < IRQ_DELTA(max_loaded,j)) {
-			move_this_load = IRQ_DELTA(max_loaded,j);
+		if (move_this_load < IRQ_DELTA(max_loaded, j)) {
+			move_this_load = IRQ_DELTA(max_loaded, j);
 			selected_irq = j;
 		}
 	}
-	if (selected_irq == -1) {
+	if (selected_irq == -1)
 		goto tryanothercpu;
-	}
 
 	imbalance = move_this_load;
-	
+
 	/* For physical_balance case, we accumulated both load
 	 * values in the one of the siblings cpu_irq[],
 	 * to use the same code for physical and logical processors
-	 * as much as possible. 
+	 * as much as possible.
 	 *
 	 * NOTE: the cpu_irq[] array holds the sum of the load for
 	 * sibling A and sibling B in the slot for the lowest numbered
@@ -625,11 +634,11 @@ tryanotherirq:
 		/* mark for change destination */
 		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
 
-		/* Since we made a change, come back sooner to 
+		/* Since we made a change, come back sooner to
 		 * check for more variation.
 		 */
 		balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-			balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);	
+			balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
 		return;
 	}
 	goto tryanotherirq;
@@ -640,7 +649,7 @@ not_worth_the_effort:
 	 * upward
 	 */
 	balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
-		balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);	
+		balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
 	return;
 }
 
@@ -679,13 +688,13 @@ static int __init balanced_irq_init(void)
 	cpumask_t tmp;
 
 	cpus_shift_right(tmp, cpu_online_map, 2);
-        c = &boot_cpu_data;
+	c = &boot_cpu_data;
 	/* When not overwritten by the command line ask subarchitecture. */
 	if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
 		irqbalance_disabled = NO_BALANCE_IRQ;
 	if (irqbalance_disabled)
 		return 0;
-	
+
 	 /* disable irqbalance completely if there is only one processor online */
 	if (num_online_cpus() < 2) {
 		irqbalance_disabled = 1;
@@ -699,16 +708,14 @@ static int __init balanced_irq_init(void)
 		physical_balance = 1;
 
 	for_each_online_cpu(i) {
-		irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
-		irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+		irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+		irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
 		if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
 			printk(KERN_ERR "balanced_irq_init: out of memory");
 			goto failed;
 		}
-		memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS);
-		memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS);
 	}
-	
+
 	printk(KERN_INFO "Starting balanced_irq\n");
 	if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
 		return 0;
@@ -801,10 +808,10 @@ static int find_irq_entry(int apic, int pin, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == type &&
-		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
-		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-		    mp_irqs[i].mpc_dstirq == pin)
+		if (mp_irqs[i].mp_irqtype == type &&
+		    (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+		    mp_irqs[i].mp_dstirq == pin)
 			return i;
 
 	return -1;
@@ -818,13 +825,13 @@ static int __init find_isa_irq_pin(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 
-			return mp_irqs[i].mpc_dstirq;
+			return mp_irqs[i].mp_dstirq;
 	}
 	return -1;
 }
@@ -834,17 +841,17 @@ static int __init find_isa_irq_apic(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 			break;
 	}
 	if (i < mp_irq_entries) {
 		int apic;
-		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+		for (apic = 0; apic < nr_ioapics; apic++) {
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
 				return apic;
 		}
 	}
@@ -864,28 +871,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
 		"slot:%d, pin:%d.\n", bus, slot, pin);
-	if (mp_bus_id_to_pci_bus[bus] == -1) {
+	if (test_bit(bus, mp_bus_not_pci)) {
 		printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
 		return -1;
 	}
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+			    mp_irqs[i].mp_dstapic == MP_APIC_ALL)
 				break;
 
 		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].mpc_irqtype &&
+		    !mp_irqs[i].mp_irqtype &&
 		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+		    (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
 
 			if (!(apic || IO_APIC_IRQ(irq)))
 				continue;
 
-			if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+			if (pin == (mp_irqs[i].mp_srcbusirq & 3))
 				return irq;
 			/*
 			 * Use the first all-but-pin matching entry as a
@@ -900,7 +907,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
 
 /*
- * This function currently is only a helper for the i386 smp boot process where 
+ * This function currently is only a helper for the i386 smp boot process where
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
@@ -952,7 +959,7 @@ static int EISA_ELCR(unsigned int irq)
  * EISA conforming in the MP table, that means its trigger type must
  * be read in from the ELCR */
 
-#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
 #define default_EISA_polarity(idx)	default_ISA_polarity(idx)
 
 /* PCI interrupts are always polarity one level triggered,
@@ -969,118 +976,115 @@ static int EISA_ELCR(unsigned int irq)
 
 static int MPBIOS_polarity(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].mpc_irqflag & 3)
+	switch (mp_irqs[idx].mp_irqflag & 3) {
+	case 0: /* conforms: polarity follows the bus type's default */
 	{
-		case 0: /* conforms, ie. bus-type dependent polarity */
-		{
-			polarity = test_bit(bus, mp_bus_not_pci)?
-				default_ISA_polarity(idx):
-				default_PCI_polarity(idx);
-			break;
-		}
-		case 1: /* high active */
-		{
-			polarity = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			printk(KERN_WARNING "broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
-		case 3: /* low active */
-		{
-			polarity = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			printk(KERN_WARNING "broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
+		polarity = test_bit(bus, mp_bus_not_pci)?
+			default_ISA_polarity(idx):
+			default_PCI_polarity(idx);
+		break;
+	}
+	case 1: /* high active */
+	{
+		polarity = 0;
+		break;
+	}
+	case 2: /* reserved encoding: warn, assume low active */
+	{
+		printk(KERN_WARNING "broken BIOS!!\n");
+		polarity = 1;
+		break;
+	}
+	case 3: /* low active */
+	{
+		polarity = 1;
+		break;
+	}
+	default: /* not reached: (irqflag & 3) is always 0..3 */
+	{
+		printk(KERN_WARNING "broken BIOS!!\n");
+		polarity = 1;
+		break;
+	}
 	}
 	return polarity;
 }
 
 static int MPBIOS_trigger(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int trigger;
 
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+	switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
+	case 0: /* conforms: trigger follows the bus type's default */
 	{
-		case 0: /* conforms, ie. bus-type dependent */
-		{
-			trigger = test_bit(bus, mp_bus_not_pci)?
-					default_ISA_trigger(idx):
-					default_PCI_trigger(idx);
+		trigger = test_bit(bus, mp_bus_not_pci)?
+				default_ISA_trigger(idx):
+				default_PCI_trigger(idx);
 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-			switch (mp_bus_id_to_type[bus])
-			{
-				case MP_BUS_ISA: /* ISA pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				case MP_BUS_EISA: /* EISA pin */
-				{
-					trigger = default_EISA_trigger(idx);
-					break;
-				}
-				case MP_BUS_PCI: /* PCI pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				case MP_BUS_MCA: /* MCA pin */
-				{
-					trigger = default_MCA_trigger(idx);
-					break;
-				}
-				default:
-				{
-					printk(KERN_WARNING "broken BIOS!!\n");
-					trigger = 1;
-					break;
-				}
-			}
-#endif
+		switch (mp_bus_id_to_type[bus]) {
+		case MP_BUS_ISA: /* ISA pin */
+		{
+			/* keep the bus-type default chosen above */
 			break;
 		}
-		case 1: /* edge */
+		case MP_BUS_EISA: /* EISA pin */
 		{
-			trigger = 0;
+			trigger = default_EISA_trigger(idx);
 			break;
 		}
-		case 2: /* reserved */
+		case MP_BUS_PCI: /* PCI pin */
 		{
-			printk(KERN_WARNING "broken BIOS!!\n");
-			trigger = 1;
+			/* keep the bus-type default chosen above */
 			break;
 		}
-		case 3: /* level */
+		case MP_BUS_MCA: /* MCA pin */
 		{
-			trigger = 1;
+			trigger = default_MCA_trigger(idx);
 			break;
 		}
-		default: /* invalid */
+		default: /* unknown bus type: warn, assume level */
 		{
 			printk(KERN_WARNING "broken BIOS!!\n");
-			trigger = 0;
+			trigger = 1;
 			break;
 		}
 	}
+#endif
+		break;
+	}
+	case 1: /* edge */
+	{
+		trigger = 0;
+		break;
+	}
+	case 2: /* reserved encoding: warn, assume level */
+	{
+		printk(KERN_WARNING "broken BIOS!!\n");
+		trigger = 1;
+		break;
+	}
+	case 3: /* level */
+	{
+		trigger = 1;
+		break;
+	}
+	default: /* not reached: ((irqflag >> 2) & 3) is always 0..3 */
+	{
+		printk(KERN_WARNING "broken BIOS!!\n");
+		trigger = 0;
+		break;
+	}
+	}
 	return trigger;
 }
 
@@ -1097,16 +1101,16 @@ static inline int irq_trigger(int idx)
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 
 	/*
 	 * Debugging check, we are in big trouble if this message pops up!
 	 */
-	if (mp_irqs[idx].mpc_dstirq != pin)
+	if (mp_irqs[idx].mp_dstirq != pin)
 		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
 
 	if (test_bit(bus, mp_bus_not_pci))
-		irq = mp_irqs[idx].mpc_srcbusirq;
+		irq = mp_irqs[idx].mp_srcbusirq;
 	else {
 		/*
 		 * PCI IRQs are mapped in order
@@ -1148,8 +1152,8 @@ static inline int IO_APIC_irq_trigger(int irq)
 
 	for (apic = 0; apic < nr_ioapics; apic++) {
 		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-			idx = find_irq_entry(apic,pin,mp_INT);
-			if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+			idx = find_irq_entry(apic, pin, mp_INT);
+			if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
 				return irq_trigger(idx);
 		}
 	}
@@ -1164,7 +1168,7 @@ static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }
 
 static int __assign_irq_vector(int irq)
 {
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+	static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
 	int vector, offset;
 
 	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
@@ -1176,7 +1180,7 @@ static int __assign_irq_vector(int irq)
 	offset = current_offset;
 next:
 	vector += 8;
-	if (vector >= FIRST_SYSTEM_VECTOR) {
+	if (vector >= first_system_vector) {
 		offset = (offset + 1) % 8;
 		vector = FIRST_DEVICE_VECTOR + offset;
 	}
@@ -1237,25 +1241,25 @@ static void __init setup_IO_APIC_irqs(void)
 		/*
 		 * add it to the IO-APIC irq-routing table:
 		 */
-		memset(&entry,0,sizeof(entry));
+		memset(&entry, 0, sizeof(entry));
 
 		entry.delivery_mode = INT_DELIVERY_MODE;
 		entry.dest_mode = INT_DEST_MODE;
 		entry.mask = 0;				/* enable IRQ */
-		entry.dest.logical.logical_dest = 
+		entry.dest.logical.logical_dest =
 					cpu_mask_to_apicid(TARGET_CPUS);
 
-		idx = find_irq_entry(apic,pin,mp_INT);
+		idx = find_irq_entry(apic, pin, mp_INT);
 		if (idx == -1) {
 			if (first_notcon) {
 				apic_printk(APIC_VERBOSE, KERN_DEBUG
 						" IO-APIC (apicid-pin) %d-%d",
-						mp_ioapics[apic].mpc_apicid,
+						mp_ioapics[apic].mp_apicid,
 						pin);
 				first_notcon = 0;
 			} else
 				apic_printk(APIC_VERBOSE, ", %d-%d",
-					mp_ioapics[apic].mpc_apicid, pin);
+					mp_ioapics[apic].mp_apicid, pin);
 			continue;
 		}
 
@@ -1289,7 +1293,7 @@ static void __init setup_IO_APIC_irqs(void)
 			vector = assign_irq_vector(irq);
 			entry.vector = vector;
 			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-		
+
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
 		}
@@ -1302,25 +1306,21 @@ static void __init setup_IO_APIC_irqs(void)
 }
 
 /*
- * Set up the 8259A-master output pin:
+ * Set up the timer pin, possibly with the 8259A-master behind it.
  */
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+					int vector)
 {
 	struct IO_APIC_route_entry entry;
 
-	memset(&entry,0,sizeof(entry));
-
-	disable_8259A_irq(0);
-
-	/* mask LVT0 */
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+	memset(&entry, 0, sizeof(entry));
 
 	/*
 	 * We use logical delivery to get the timer IRQ
 	 * to the first CPU.
 	 */
 	entry.dest_mode = INT_DEST_MODE;
-	entry.mask = 0;					/* unmask IRQ now */
+	entry.mask = 1;					/* mask IRQ now */
 	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.polarity = 0;
@@ -1329,17 +1329,14 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 
 	/*
 	 * The timer IRQ doesn't have to know that behind the
-	 * scene we have a 8259A-master in AEOI mode ...
+	 * scene we may have an 8259A-master in AEOI mode ...
 	 */
-	irq_desc[0].chip = &ioapic_chip;
-	set_irq_handler(0, handle_edge_irq);
+	ioapic_register_intr(0, vector, IOAPIC_EDGE);
 
 	/*
 	 * Add it to the IO-APIC irq-routing table:
 	 */
 	ioapic_write_entry(apic, pin, entry);
-
-	enable_8259A_irq(0);
 }
 
 void __init print_IO_APIC(void)
@@ -1354,10 +1351,10 @@ void __init print_IO_APIC(void)
 	if (apic_verbosity == APIC_QUIET)
 		return;
 
- 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+		       mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
 
 	/*
 	 * We are a bit conservative about what we expect.  We have to
@@ -1376,7 +1373,7 @@ void __init print_IO_APIC(void)
 		reg_03.raw = io_apic_read(apic, 3);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1459,7 +1456,7 @@ void __init print_IO_APIC(void)
 
 #if 0
 
-static void print_APIC_bitfield (int base)
+static void print_APIC_bitfield(int base)
 {
 	unsigned int v;
 	int i, j;
@@ -1480,7 +1477,7 @@ static void print_APIC_bitfield (int base)
 	}
 }
 
-void /*__init*/ print_local_APIC(void * dummy)
+void /*__init*/ print_local_APIC(void *dummy)
 {
 	unsigned int v, ver, maxlvt;
 
@@ -1489,6 +1486,7 @@ void /*__init*/ print_local_APIC(void * dummy)
 
 	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
 		smp_processor_id(), hard_smp_processor_id());
+	v = apic_read(APIC_ID);
 	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v,
 			GET_APIC_ID(read_apic_id()));
 	v = apic_read(APIC_LVR);
@@ -1563,7 +1561,7 @@ void /*__init*/ print_local_APIC(void * dummy)
 	printk("\n");
 }
 
-void print_all_local_APICs (void)
+void print_all_local_APICs(void)
 {
 	on_each_cpu(print_local_APIC, NULL, 1, 1);
 }
@@ -1586,11 +1584,11 @@ void /*__init*/ print_PIC(void)
 	v = inb(0xa0) << 8 | inb(0x20);
 	printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
 
-	outb(0x0b,0xa0);
-	outb(0x0b,0x20);
+	outb(0x0b, 0xa0);
+	outb(0x0b, 0x20);
 	v = inb(0xa0) << 8 | inb(0x20);
-	outb(0x0a,0xa0);
-	outb(0x0a,0x20);
+	outb(0x0a, 0xa0);
+	outb(0x0a, 0x20);
 
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 
@@ -1626,7 +1624,7 @@ static void __init enable_IO_APIC(void)
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
 	}
-	for(apic = 0; apic < nr_ioapics; apic++) {
+	for (apic = 0; apic < nr_ioapics; apic++) {
 		int pin;
 		/* See if any of the pins is in ExtINT mode */
 		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
@@ -1716,7 +1714,6 @@ void disable_IO_APIC(void)
  * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
  */
 
-#ifndef CONFIG_X86_NUMAQ
 static void __init setup_ioapic_ids_from_mpc(void)
 {
 	union IO_APIC_reg_00 reg_00;
@@ -1726,6 +1723,11 @@ static void __init setup_ioapic_ids_from_mpc(void)
 	unsigned char old_id;
 	unsigned long flags;
 
+#ifdef CONFIG_X86_NUMAQ
+	if (found_numaq)
+		return;
+#endif
+
 	/*
 	 * Don't check I/O APIC IDs for xAPIC systems.  They have
 	 * no meaning without the serial APIC bus.
@@ -1748,15 +1750,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		reg_00.raw = io_apic_read(apic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
-		
-		old_id = mp_ioapics[apic].mpc_apicid;
 
-		if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+		old_id = mp_ioapics[apic].mp_apicid;
+
+		if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
-				apic, mp_ioapics[apic].mpc_apicid);
+				apic, mp_ioapics[apic].mp_apicid);
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				reg_00.bits.ID);
-			mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+			mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
 		}
 
 		/*
@@ -1765,9 +1767,9 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		 * 'stuck on smp_invalidate_needed IPI wait' messages.
 		 */
 		if (check_apicid_used(phys_id_present_map,
-					mp_ioapics[apic].mpc_apicid)) {
+					mp_ioapics[apic].mp_apicid)) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
-				apic, mp_ioapics[apic].mpc_apicid);
+				apic, mp_ioapics[apic].mp_apicid);
 			for (i = 0; i < get_physical_broadcast(); i++)
 				if (!physid_isset(i, phys_id_present_map))
 					break;
@@ -1776,13 +1778,13 @@ static void __init setup_ioapic_ids_from_mpc(void)
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				i);
 			physid_set(i, phys_id_present_map);
-			mp_ioapics[apic].mpc_apicid = i;
+			mp_ioapics[apic].mp_apicid = i;
 		} else {
 			physid_mask_t tmp;
-			tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+			tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
 			apic_printk(APIC_VERBOSE, "Setting %d in the "
 					"phys_id_present_map\n",
-					mp_ioapics[apic].mpc_apicid);
+					mp_ioapics[apic].mp_apicid);
 			physids_or(phys_id_present_map, phys_id_present_map, tmp);
 		}
 
@@ -1791,21 +1793,21 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		 * We need to adjust the IRQ routing table
 		 * if the ID changed.
 		 */
-		if (old_id != mp_ioapics[apic].mpc_apicid)
+		if (old_id != mp_ioapics[apic].mp_apicid)
 			for (i = 0; i < mp_irq_entries; i++)
-				if (mp_irqs[i].mpc_dstapic == old_id)
-					mp_irqs[i].mpc_dstapic
-						= mp_ioapics[apic].mpc_apicid;
+				if (mp_irqs[i].mp_dstapic == old_id)
+					mp_irqs[i].mp_dstapic
+						= mp_ioapics[apic].mp_apicid;
 
 		/*
 		 * Read the right value from the MPC table and
 		 * write it into the ID register.
-	 	 */
+		 */
 		apic_printk(APIC_VERBOSE, KERN_INFO
 			"...changing IO-APIC physical APIC ID to %d ...",
-			mp_ioapics[apic].mpc_apicid);
+			mp_ioapics[apic].mp_apicid);
 
-		reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+		reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0, reg_00.raw);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1816,15 +1818,12 @@ static void __init setup_ioapic_ids_from_mpc(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		reg_00.raw = io_apic_read(apic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
-		if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+		if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
 			printk("could not set ID!\n");
 		else
 			apic_printk(APIC_VERBOSE, " ok.\n");
 	}
 }
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
 
 int no_timer_check __initdata;
 
@@ -2020,7 +2019,7 @@ static void ack_apic(unsigned int irq)
 	ack_APIC_irq();
 }
 
-static void mask_lapic_irq (unsigned int irq)
+static void mask_lapic_irq(unsigned int irq)
 {
 	unsigned long v;
 
@@ -2028,7 +2027,7 @@ static void mask_lapic_irq (unsigned int irq)
 	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void unmask_lapic_irq (unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
 {
 	unsigned long v;
 
@@ -2037,7 +2036,7 @@ static void unmask_lapic_irq (unsigned int irq)
 }
 
 static struct irq_chip lapic_chip __read_mostly = {
-	.name		= "local-APIC-edge",
+	.name		= "local-APIC",
 	.mask		= mask_lapic_irq,
 	.unmask		= unmask_lapic_irq,
 	.eoi		= ack_apic,
@@ -2046,14 +2045,14 @@ static struct irq_chip lapic_chip __read_mostly = {
 static void __init setup_nmi(void)
 {
 	/*
- 	 * Dirty trick to enable the NMI watchdog ...
+	 * Dirty trick to enable the NMI watchdog ...
 	 * We put the 8259A master into AEOI mode and
 	 * unmask on all local APICs LVT0 as NMI.
 	 *
 	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
 	 * is from Maciej W. Rozycki - so we do not have to EOI from
 	 * the NMI handler or the timer interrupt.
-	 */ 
+	 */
 	apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
 
 	enable_NMI_through_LVT0();
@@ -2129,11 +2128,16 @@ static inline void __init unlock_ExtINT_logic(void)
 static inline void __init check_timer(void)
 {
 	int apic1, pin1, apic2, pin2;
+	int no_pin1 = 0;
 	int vector;
+	unsigned int ver;
 	unsigned long flags;
 
 	local_irq_save(flags);
 
+	ver = apic_read(APIC_LVR);
+	ver = GET_APIC_VERSION(ver);
+
 	/*
 	 * get/set the timer IRQ vector:
 	 */
@@ -2142,17 +2146,17 @@ static inline void __init check_timer(void)
 	set_intr_gate(vector, interrupt[0]);
 
 	/*
-	 * Subtle, code in do_timer_interrupt() expects an AEOI
-	 * mode for the 8259A whenever interrupts are routed
-	 * through I/O APICs.  Also IRQ0 has to be enabled in
-	 * the 8259A which implies the virtual wire has to be
-	 * disabled in the local APIC.
+	 * As IRQ0 is to be enabled in the 8259A, the virtual
+	 * wire has to be disabled in the local APIC.  Also
+	 * timer interrupts need to be acknowledged manually in
+	 * the 8259A for the i82489DX when using the NMI
+	 * watchdog as that APIC treats NMIs as level-triggered.
+	 * The AEOI mode will finish them in the 8259A
+	 * automatically.
 	 */
 	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
-	timer_ack = 1;
-	if (timer_over_8254 > 0)
-		enable_8259A_irq(0);
+	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
 
 	pin1  = find_isa_irq_pin(0, mp_INT);
 	apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2162,14 +2166,33 @@ static inline void __init check_timer(void)
 	printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		vector, apic1, pin1, apic2, pin2);
 
+	/*
+	 * Some BIOS writers are clueless and report the ExtINTA
+	 * I/O APIC input from the cascaded 8259A as the timer
+	 * interrupt input.  So just in case, if only one pin
+	 * was found above, try it both directly and through the
+	 * 8259A.
+	 */
+	if (pin1 == -1) {
+		pin1 = pin2;
+		apic1 = apic2;
+		no_pin1 = 1;
+	} else if (pin2 == -1) {
+		pin2 = pin1;
+		apic2 = apic1;
+	}
+
 	if (pin1 != -1) {
 		/*
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
+		if (no_pin1) {
+			add_pin_to_irq(0, apic1, pin1);
+			setup_timer_IRQ0_pin(apic1, pin1, vector);
+		}
 		unmask_IO_APIC_irq(0);
 		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
-				disable_8259A_irq(0);
 				setup_nmi();
 				enable_8259A_irq(0);
 			}
@@ -2178,43 +2201,46 @@ static inline void __init check_timer(void)
 			goto out;
 		}
 		clear_IO_APIC_pin(apic1, pin1);
-		printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to "
-				"IO-APIC\n");
-	}
+		if (!no_pin1)
+			printk(KERN_ERR "..MP-BIOS bug: "
+			       "8254 timer not connected to IO-APIC\n");
 
-	printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
-	if (pin2 != -1) {
+		printk(KERN_INFO "...trying to set up timer (IRQ0) "
+		       "through the 8259A ... ");
 		printk("\n..... (found pin %d) ...", pin2);
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		setup_ExtINT_IRQ0_pin(apic2, pin2, vector);
+		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		setup_timer_IRQ0_pin(apic2, pin2, vector);
+		unmask_IO_APIC_irq(0);
+		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			printk("works.\n");
-			if (pin1 != -1)
-				replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
-			else
-				add_pin_to_irq(0, apic2, pin2);
+			timer_through_8259 = 1;
 			if (nmi_watchdog == NMI_IO_APIC) {
+				disable_8259A_irq(0);
 				setup_nmi();
+				enable_8259A_irq(0);
 			}
 			goto out;
 		}
 		/*
 		 * Cleanup, just in case ...
 		 */
+		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
+		printk(" failed.\n");
 	}
-	printk(" failed.\n");
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-		nmi_watchdog = 0;
+		nmi_watchdog = NMI_NONE;
 	}
+	timer_ack = 0;
 
 	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
-	disable_8259A_irq(0);
 	set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
 				      "fasteoi");
 	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
@@ -2224,12 +2250,12 @@ static inline void __init check_timer(void)
 		printk(" works.\n");
 		goto out;
 	}
+	disable_8259A_irq(0);
 	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
 	printk(" failed.\n");
 
 	printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
 
-	timer_ack = 0;
 	init_8259A(0);
 	make_8259A_irq(0);
 	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
@@ -2261,7 +2287,7 @@ void __init setup_IO_APIC(void)
 	int i;
 
 	/* Reserve all the system vectors. */
-	for (i = FIRST_SYSTEM_VECTOR; i < NR_VECTORS; i++)
+	for (i = first_system_vector; i < NR_VECTORS; i++)
 		set_bit(i, used_vectors);
 
 	enable_IO_APIC();
@@ -2286,28 +2312,14 @@ void __init setup_IO_APIC(void)
 		print_IO_APIC();
 }
 
-static int __init setup_disable_8254_timer(char *s)
-{
-	timer_over_8254 = -1;
-	return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
-	timer_over_8254 = 2;
-	return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
 /*
  *	Called after all the initialization is done. If we didnt find any
  *	APIC bugs then we can allow the modify fast path
  */
- 
+
 static int __init io_apic_bug_finalize(void)
 {
-	if(sis_apic_bug == -1)
+	if (sis_apic_bug == -1)
 		sis_apic_bug = 0;
 	return 0;
 }
@@ -2318,17 +2330,17 @@ struct sysfs_ioapic_data {
 	struct sys_device dev;
 	struct IO_APIC_route_entry entry[0];
 };
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
 
 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	struct IO_APIC_route_entry *entry;
 	struct sysfs_ioapic_data *data;
 	int i;
-	
+
 	data = container_of(dev, struct sysfs_ioapic_data, dev);
 	entry = data->entry;
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
 		entry[i] = ioapic_read_entry(dev->id, i);
 
 	return 0;
@@ -2341,18 +2353,18 @@ static int ioapic_resume(struct sys_device *dev)
 	unsigned long flags;
 	union IO_APIC_reg_00 reg_00;
 	int i;
-	
+
 	data = container_of(dev, struct sysfs_ioapic_data, dev);
 	entry = data->entry;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	reg_00.raw = io_apic_read(dev->id, 0);
-	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+	if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+		reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
 		ioapic_write_entry(dev->id, i, entry[i]);
 
 	return 0;
@@ -2366,24 +2378,23 @@ static struct sysdev_class ioapic_sysdev_class = {
 
 static int __init ioapic_init_sysfs(void)
 {
-	struct sys_device * dev;
+	struct sys_device *dev;
 	int i, size, error = 0;
 
 	error = sysdev_class_register(&ioapic_sysdev_class);
 	if (error)
 		return error;
 
-	for (i = 0; i < nr_ioapics; i++ ) {
-		size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
+	for (i = 0; i < nr_ioapics; i++) {
+		size = sizeof(struct sys_device) + nr_ioapic_registers[i]
 			* sizeof(struct IO_APIC_route_entry);
-		mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+		mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
 		if (!mp_ioapic_data[i]) {
 			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
 			continue;
 		}
-		memset(mp_ioapic_data[i], 0, size);
 		dev = &mp_ioapic_data[i]->dev;
-		dev->id = i; 
+		dev->id = i;
 		dev->cls = &ioapic_sysdev_class;
 		error = sysdev_register(dev);
 		if (error) {
@@ -2458,7 +2469,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 		msg->address_lo =
 			MSI_ADDR_BASE_LO |
 			((INT_DEST_MODE == 0) ?
-				MSI_ADDR_DEST_MODE_PHYSICAL:
+				MSI_ADDR_DEST_MODE_PHYSICAL:
 				MSI_ADDR_DEST_MODE_LOGICAL) |
 			((INT_DELIVERY_MODE != dest_LowestPrio) ?
 				MSI_ADDR_REDIRECTION_CPU:
@@ -2469,7 +2480,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 			MSI_DATA_TRIGGER_EDGE |
 			MSI_DATA_LEVEL_ASSERT |
 			((INT_DELIVERY_MODE != dest_LowestPrio) ?
-				MSI_DATA_DELIVERY_FIXED:
+				MSI_DATA_DELIVERY_FIXED:
 				MSI_DATA_DELIVERY_LOWPRI) |
 			MSI_DATA_VECTOR(vector);
 	}
@@ -2640,12 +2651,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 #endif /* CONFIG_HT_IRQ */
 
 /* --------------------------------------------------------------------------
-                          ACPI-based IOAPIC Configuration
+			ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
 
 #ifdef CONFIG_ACPI
 
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
 {
 	union IO_APIC_reg_00 reg_00;
 	static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -2654,10 +2665,10 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
 	int i = 0;
 
 	/*
-	 * The P4 platform supports up to 256 APIC IDs on two separate APIC 
-	 * buses (one for LAPICs, one for IOAPICs), where predecessors only 
+	 * The P4 platform supports up to 256 APIC IDs on two separate APIC
+	 * buses (one for LAPICs, one for IOAPICs), where predecessors only
 	 * supports up to 16 on one shared APIC bus.
-	 * 
+	 *
 	 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
 	 *      advantage of new APIC bus architecture.
 	 */
@@ -2676,7 +2687,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
 	}
 
 	/*
-	 * Every APIC in a system must have a unique ID or we get lots of nice 
+	 * Every APIC in a system must have a unique ID or we get lots of nice
 	 * 'stuck on smp_invalidate_needed IPI wait' messages.
 	 */
 	if (check_apicid_used(apic_id_map, apic_id)) {
@@ -2693,7 +2704,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
 			"trying %d\n", ioapic, apic_id, i);
 
 		apic_id = i;
-	} 
+	}
 
 	tmp = apicid_to_cpu_present(apic_id);
 	physids_or(apic_id_map, apic_id_map, tmp);
@@ -2720,7 +2731,7 @@ int __init io_apic_get_unique_id (int ioapic, int apic_id)
 }
 
 
-int __init io_apic_get_version (int ioapic)
+int __init io_apic_get_version(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
@@ -2733,7 +2744,7 @@ int __init io_apic_get_version (int ioapic)
 }
 
 
-int __init io_apic_get_redir_entries (int ioapic)
+int __init io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
@@ -2746,7 +2757,7 @@ int __init io_apic_get_redir_entries (int ioapic)
 }
 
 
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
 {
 	struct IO_APIC_route_entry entry;
 
@@ -2762,7 +2773,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	 * corresponding device driver registers for this IRQ.
 	 */
 
-	memset(&entry,0,sizeof(entry));
+	memset(&entry, 0, sizeof(entry));
 
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.dest_mode = INT_DEST_MODE;
@@ -2781,7 +2792,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 
 	apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
 		"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
-		mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+		mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
 		edge_level, active_high_low);
 
 	ioapic_register_intr(irq, entry.vector, edge_level);
@@ -2802,8 +2813,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 		return -1;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == mp_INT &&
-		    mp_irqs[i].mpc_srcbusirq == bus_irq)
+		if (mp_irqs[i].mp_irqtype == mp_INT &&
+		    mp_irqs[i].mp_srcbusirq == bus_irq)
 			break;
 	if (i >= mp_irq_entries)
 		return -1;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8dfcc529..2eba4f4c14ba 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -61,7 +61,7 @@ struct irq_cfg {
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
 	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
 	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
 	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
@@ -82,6 +82,10 @@ struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
 
 static int assign_irq_vector(int irq, cpumask_t mask);
 
+int first_system_vector = 0xfe;
+
+char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
+
 #define __apicdebuginit  __init
 
 int sis_apic_bug; /* not actually supported, dummy for compile */
@@ -90,7 +94,7 @@ static int no_timer_check;
 
 static int disable_timer_pin_1 __initdata;
 
-int timer_over_8254 __initdata = 1;
+int timer_through_8259 __initdata;
 
 /* Where if anywhere is the i8259 connect in external int mode */
 static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@ -104,15 +108,17 @@ DEFINE_SPINLOCK(vector_lock);
 int nr_ioapic_registers[MAX_IO_APICS];
 
 /* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
 int nr_ioapics;
 
 /* MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+
 /*
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
@@ -140,7 +146,7 @@ struct io_apic {
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 {
 	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-		+ (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
+		+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
 }
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
@@ -183,7 +189,7 @@ static bool io_apic_level_ack_pending(unsigned int irq)
 			break;
 		reg = io_apic_read(entry->apic, 0x10 + pin*2);
 		/* Is the remote IRR bit set? */
-		if ((reg >> 14) & 1) {
+		if (reg & IO_APIC_REDIR_REMOTE_IRR) {
 			spin_unlock_irqrestore(&ioapic_lock, flags);
 			return true;
 		}
@@ -298,7 +304,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
 			break;
 		io_apic_write(apic, 0x11 + pin*2, dest);
 		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~0x000000ff;
+		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
 		reg |= vector;
 		io_apic_modify(apic, reg);
 		if (!entry->next)
@@ -360,16 +366,37 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	entry->pin = pin;
 }
 
+/*
+ * Reroute an IRQ: repoint each (oldapic, oldpin) entry in its pin list.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+				      int oldapic, int oldpin,
+				      int newapic, int newpin)
+{
+	struct irq_pin_list *entry = irq_2_pin + irq;
+
+	while (1) {
+		if (entry->apic == oldapic && entry->pin == oldpin) {
+			entry->apic = newapic;
+			entry->pin = newpin;
+		}
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+}
+
 
 #define DO_ACTION(name,R,ACTION, FINAL)					\
 									\
 	static void name##_IO_APIC_irq (unsigned int irq)		\
 	__DO_ACTION(R, ACTION, FINAL)
 
-DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
-						/* mask = 1 */
-DO_ACTION( __unmask,           0, &= 0xfffeffff, )
-						/* mask = 0 */
+/* mask = 1 */
+DO_ACTION(__mask,	0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+
+/* mask = 0 */
+DO_ACTION(__unmask,	0, &= ~IO_APIC_REDIR_MASKED, )
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
@@ -430,20 +457,6 @@ static int __init disable_timer_pin_setup(char *arg)
 }
 __setup("disable_timer_pin_1", disable_timer_pin_setup);
 
-static int __init setup_disable_8254_timer(char *s)
-{
-	timer_over_8254 = -1;
-	return 1;
-}
-static int __init setup_enable_8254_timer(char *s)
-{
-	timer_over_8254 = 2;
-	return 1;
-}
-
-__setup("disable_8254_timer", setup_disable_8254_timer);
-__setup("enable_8254_timer", setup_enable_8254_timer);
-
 
 /*
  * Find the IRQ entry number of a certain pin.
@@ -453,10 +466,10 @@ static int find_irq_entry(int apic, int pin, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == type &&
-		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
-		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
-		    mp_irqs[i].mpc_dstirq == pin)
+		if (mp_irqs[i].mp_irqtype == type &&
+		    (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
+		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
+		    mp_irqs[i].mp_dstirq == pin)
 			return i;
 
 	return -1;
@@ -470,13 +483,13 @@ static int __init find_isa_irq_pin(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 
-			return mp_irqs[i].mpc_dstirq;
+			return mp_irqs[i].mp_dstirq;
 	}
 	return -1;
 }
@@ -486,17 +499,17 @@ static int __init find_isa_irq_apic(int irq, int type)
 	int i;
 
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mpc_irqtype == type) &&
-		    (mp_irqs[i].mpc_srcbusirq == irq))
+		    (mp_irqs[i].mp_irqtype == type) &&
+		    (mp_irqs[i].mp_srcbusirq == irq))
 			break;
 	}
 	if (i < mp_irq_entries) {
 		int apic;
 		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
 				return apic;
 		}
 	}
@@ -516,28 +529,28 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
 		bus, slot, pin);
-	if (mp_bus_id_to_pci_bus[bus] == -1) {
+	if (test_bit(bus, mp_bus_not_pci)) {
 		apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
 		return -1;
 	}
 	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mpc_srcbus;
+		int lbus = mp_irqs[i].mp_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
+			    mp_irqs[i].mp_dstapic == MP_APIC_ALL)
 				break;
 
 		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].mpc_irqtype &&
+		    !mp_irqs[i].mp_irqtype &&
 		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+		    (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq);
 
 			if (!(apic || IO_APIC_IRQ(irq)))
 				continue;
 
-			if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+			if (pin == (mp_irqs[i].mp_srcbusirq & 3))
 				return irq;
 			/*
 			 * Use the first all-but-pin matching entry as a
@@ -565,13 +578,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 
 static int MPBIOS_polarity(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].mpc_irqflag & 3)
+	switch (mp_irqs[idx].mp_irqflag & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent polarity */
 			if (test_bit(bus, mp_bus_not_pci))
@@ -607,13 +620,13 @@ static int MPBIOS_polarity(int idx)
 
 static int MPBIOS_trigger(int idx)
 {
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 	int trigger;
 
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+	switch ((mp_irqs[idx].mp_irqflag>>2) & 3)
 	{
 		case 0: /* conforms, ie. bus-type dependent */
 			if (test_bit(bus, mp_bus_not_pci))
@@ -660,16 +673,16 @@ static inline int irq_trigger(int idx)
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
-	int bus = mp_irqs[idx].mpc_srcbus;
+	int bus = mp_irqs[idx].mp_srcbus;
 
 	/*
 	 * Debugging check, we are in big trouble if this message pops up!
 	 */
-	if (mp_irqs[idx].mpc_dstirq != pin)
+	if (mp_irqs[idx].mp_dstirq != pin)
 		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
 
 	if (test_bit(bus, mp_bus_not_pci)) {
-		irq = mp_irqs[idx].mpc_srcbusirq;
+		irq = mp_irqs[idx].mp_srcbusirq;
 	} else {
 		/*
 		 * PCI IRQs are mapped in order
@@ -730,7 +743,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 		offset = current_offset;
 next:
 		vector += 8;
-		if (vector >= FIRST_SYSTEM_VECTOR) {
+		if (vector >= first_system_vector) {
 			/* If we run out of vectors on large boxen, must share them. */
 			offset = (offset + 1) % 8;
 			vector = FIRST_DEVICE_VECTOR + offset;
@@ -846,7 +859,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 	apic_printk(APIC_VERBOSE,KERN_DEBUG
 		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
 		    "IRQ %d Mode:%i Active:%i)\n",
-		    apic, mp_ioapics[apic].mpc_apicid, pin, cfg->vector,
+		    apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
 		    irq, trigger, polarity);
 
 	/*
@@ -887,10 +900,10 @@ static void __init setup_IO_APIC_irqs(void)
 		idx = find_irq_entry(apic,pin,mp_INT);
 		if (idx == -1) {
 			if (first_notcon) {
-				apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+				apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
 				first_notcon = 0;
 			} else
-				apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+				apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
 			continue;
 		}
 		if (!first_notcon) {
@@ -911,26 +924,21 @@ static void __init setup_IO_APIC_irqs(void)
 }
 
 /*
- * Set up the 8259A-master output pin as broadcast to all
- * CPUs.
+ * Set up the timer pin, possibly with the 8259A-master behind.
  */
-static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
+static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
+					int vector)
 {
 	struct IO_APIC_route_entry entry;
 
 	memset(&entry, 0, sizeof(entry));
 
-	disable_8259A_irq(0);
-
-	/* mask LVT0 */
-	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-
 	/*
 	 * We use logical delivery to get the timer IRQ
 	 * to the first CPU.
 	 */
 	entry.dest_mode = INT_DEST_MODE;
-	entry.mask = 0;					/* unmask IRQ now */
+	entry.mask = 1;					/* mask IRQ now */
 	entry.dest = cpu_mask_to_apicid(TARGET_CPUS);
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.polarity = 0;
@@ -939,7 +947,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 
 	/*
 	 * The timer IRQ doesn't have to know that behind the
-	 * scene we have a 8259A-master in AEOI mode ...
+	 * scene we may have a 8259A-master in AEOI mode ...
 	 */
 	set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
 
@@ -947,8 +955,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
 	 * Add it to the IO-APIC irq-routing table:
 	 */
 	ioapic_write_entry(apic, pin, entry);
-
-	enable_8259A_irq(0);
 }
 
 void __apicdebuginit print_IO_APIC(void)
@@ -965,7 +971,7 @@ void __apicdebuginit print_IO_APIC(void)
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+		       mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
 
 	/*
 	 * We are a bit conservative about what we expect.  We have to
@@ -983,7 +989,7 @@ void __apicdebuginit print_IO_APIC(void)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	printk("\n");
-	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 
@@ -1077,6 +1083,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
 
 	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
 		smp_processor_id(), hard_smp_processor_id());
+	v = apic_read(APIC_ID);
 	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
 	v = apic_read(APIC_LVR);
 	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
@@ -1659,6 +1666,7 @@ static inline void __init check_timer(void)
 	struct irq_cfg *cfg = irq_cfg + 0;
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
+	int no_pin1 = 0;
 
 	local_irq_save(flags);
 
@@ -1669,16 +1677,11 @@ static inline void __init check_timer(void)
 	assign_irq_vector(0, TARGET_CPUS);
 
 	/*
-	 * Subtle, code in do_timer_interrupt() expects an AEOI
-	 * mode for the 8259A whenever interrupts are routed
-	 * through I/O APICs.  Also IRQ0 has to be enabled in
-	 * the 8259A which implies the virtual wire has to be
-	 * disabled in the local APIC.
+	 * As IRQ0 is to be enabled in the 8259A, the virtual
+	 * wire has to be disabled in the local APIC.
 	 */
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
-	if (timer_over_8254 > 0)
-		enable_8259A_irq(0);
 
 	pin1  = find_isa_irq_pin(0, mp_INT);
 	apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1688,15 +1691,34 @@ static inline void __init check_timer(void)
 	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
 		cfg->vector, apic1, pin1, apic2, pin2);
 
+	/*
+	 * Some BIOS writers are clueless and report the ExtINTA
+	 * I/O APIC input from the cascaded 8259A as the timer
+	 * interrupt input.  So just in case, if only one pin
+	 * was found above, try it both directly and through the
+	 * 8259A.
+	 */
+	if (pin1 == -1) {
+		pin1 = pin2;
+		apic1 = apic2;
+		no_pin1 = 1;
+	} else if (pin2 == -1) {
+		pin2 = pin1;
+		apic2 = apic1;
+	}
+
 	if (pin1 != -1) {
 		/*
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
+		if (no_pin1) {
+			add_pin_to_irq(0, apic1, pin1);
+			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+		}
 		unmask_IO_APIC_irq(0);
 		if (!no_timer_check && timer_irq_works()) {
 			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
-				disable_8259A_irq(0);
 				setup_nmi();
 				enable_8259A_irq(0);
 			}
@@ -1705,42 +1732,48 @@ static inline void __init check_timer(void)
 			goto out;
 		}
 		clear_IO_APIC_pin(apic1, pin1);
-		apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not "
-				"connected to IO-APIC\n");
-	}
+		if (!no_pin1)
+			apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+				    "8254 timer not connected to IO-APIC\n");
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) "
-				"through the 8259A ... ");
-	if (pin2 != -1) {
+		apic_printk(APIC_VERBOSE,KERN_INFO
+			"...trying to set up timer (IRQ0) "
+			"through the 8259A ... ");
 		apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
 			apic2, pin2);
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector);
+		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+		unmask_IO_APIC_irq(0);
+		enable_8259A_irq(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_VERBOSE," works.\n");
+			timer_through_8259 = 1;
 			nmi_watchdog_default();
 			if (nmi_watchdog == NMI_IO_APIC) {
+				disable_8259A_irq(0);
 				setup_nmi();
+				enable_8259A_irq(0);
 			}
 			goto out;
 		}
 		/*
 		 * Cleanup, just in case ...
 		 */
+		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
+		apic_printk(APIC_VERBOSE," failed.\n");
 	}
-	apic_printk(APIC_VERBOSE," failed.\n");
 
 	if (nmi_watchdog == NMI_IO_APIC) {
 		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-		nmi_watchdog = 0;
+		nmi_watchdog = NMI_NONE;
 	}
 
 	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
 
-	disable_8259A_irq(0);
 	irq_desc[0].chip = &lapic_irq_type;
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
@@ -1749,6 +1782,7 @@ static inline void __init check_timer(void)
 		apic_printk(APIC_VERBOSE," works.\n");
 		goto out;
 	}
+	disable_8259A_irq(0);
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
 	apic_printk(APIC_VERBOSE," failed.\n");
 
@@ -1841,8 +1875,8 @@ static int ioapic_resume(struct sys_device *dev)
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	reg_00.raw = io_apic_read(dev->id, 0);
-	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
-		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+	if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
+		reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
 		io_apic_write(dev->id, 0, reg_00.raw);
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2242,8 +2276,8 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 		return -1;
 
 	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mpc_irqtype == mp_INT &&
-		    mp_irqs[i].mpc_srcbusirq == bus_irq)
+		if (mp_irqs[i].mp_irqtype == mp_INT &&
+		    mp_irqs[i].mp_srcbusirq == bus_irq)
 			break;
 	if (i >= mp_irq_entries)
 		return -1;
@@ -2336,7 +2370,7 @@ void __init ioapic_init_mappings(void)
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
-			ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+			ioapic_phys = mp_ioapics[i].mp_apicaddr;
 		} else {
 			ioapic_phys = (unsigned long)
 				alloc_bootmem_pages(PAGE_SIZE);
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index c0df7b89ca23..9d98cda39ad9 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -8,7 +8,6 @@
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/cache.h>
-#include <linux/interrupt.h>
 #include <linux/cpu.h>
 #include <linux/module.h>
 
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 147352df28b9..47a6f6f12478 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -48,6 +48,29 @@ void ack_bad_irq(unsigned int irq)
 #endif
 }
 
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+/* Debugging check for stack overflow: is there less than 1KB free? */
+static int check_stack_overflow(void)
+{
+	long sp;
+
+	__asm__ __volatile__("andl %%esp,%0" :
+			     "=r" (sp) : "0" (THREAD_SIZE - 1));
+
+	return sp < (sizeof(struct thread_info) + STACK_WARN);
+}
+
+static void print_stack_overflow(void)
+{
+	printk(KERN_WARNING "low stack detected by irq handler\n");
+	dump_stack();
+}
+
+#else
+static inline int check_stack_overflow(void) { return 0; }
+static inline void print_stack_overflow(void) { }
+#endif
+
 #ifdef CONFIG_4KSTACKS
 /*
  * per-CPU IRQ handling contexts (thread information and stack)
@@ -59,48 +82,29 @@ union irq_ctx {
 
 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
-#endif
 
-/*
- * do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- */
-unsigned int do_IRQ(struct pt_regs *regs)
-{	
-	struct pt_regs *old_regs;
-	/* high bit used in ret_from_ code */
-	int irq = ~regs->orig_ax;
-	struct irq_desc *desc = irq_desc + irq;
-#ifdef CONFIG_4KSTACKS
-	union irq_ctx *curctx, *irqctx;
-	u32 *isp;
-#endif
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+		__attribute__((__section__(".bss.page_aligned")));
 
-	if (unlikely((unsigned)irq >= NR_IRQS)) {
-		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
-					__func__, irq);
-		BUG();
-	}
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+		__attribute__((__section__(".bss.page_aligned")));
 
-	old_regs = set_irq_regs(regs);
-	irq_enter();
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
-	/* Debugging check for stack overflow: is there less than 1KB free? */
-	{
-		long sp;
-
-		__asm__ __volatile__("andl %%esp,%0" :
-					"=r" (sp) : "0" (THREAD_SIZE - 1));
-		if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) {
-			printk("do_IRQ: stack overflow: %ld\n",
-				sp - sizeof(struct thread_info));
-			dump_stack();
-		}
-	}
-#endif
+static void call_on_stack(void *func, void *stack)
+{
+	asm volatile("xchgl	%%ebx,%%esp	\n"
+		     "call	*%%edi		\n"
+		     "movl	%%ebx,%%esp	\n"
+		     : "=b" (stack)
+		     : "0" (stack),
+		       "D"(func)
+		     : "memory", "cc", "edx", "ecx", "eax");
+}
 
-#ifdef CONFIG_4KSTACKS
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
+{
+	union irq_ctx *curctx, *irqctx;
+	u32 *isp, arg1, arg2;
 
 	curctx = (union irq_ctx *) current_thread_info();
 	irqctx = hardirq_ctx[smp_processor_id()];
@@ -111,52 +115,39 @@ unsigned int do_IRQ(struct pt_regs *regs)
 	 * handler) we can't do that and just have to keep using the
 	 * current stack (which is the irq stack already after all)
 	 */
-	if (curctx != irqctx) {
-		int arg1, arg2, bx;
-
-		/* build the stack frame on the IRQ stack */
-		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
-		irqctx->tinfo.task = curctx->tinfo.task;
-		irqctx->tinfo.previous_esp = current_stack_pointer;
+	if (unlikely(curctx == irqctx))
+		return 0;
 
-		/*
-		 * Copy the softirq bits in preempt_count so that the
-		 * softirq checks work in the hardirq context.
-		 */
-		irqctx->tinfo.preempt_count =
-			(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
-			(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
-
-		asm volatile(
-			"       xchgl  %%ebx,%%esp    \n"
-			"       call   *%%edi         \n"
-			"       movl   %%ebx,%%esp    \n"
-			: "=a" (arg1), "=d" (arg2), "=b" (bx)
-			:  "0" (irq),   "1" (desc),  "2" (isp),
-			   "D" (desc->handle_irq)
-			: "memory", "cc", "ecx"
-		);
-	} else
-#endif
-		desc->handle_irq(irq, desc);
+	/* build the stack frame on the IRQ stack */
+	isp = (u32 *) ((char*)irqctx + sizeof(*irqctx));
+	irqctx->tinfo.task = curctx->tinfo.task;
+	irqctx->tinfo.previous_esp = current_stack_pointer;
 
-	irq_exit();
-	set_irq_regs(old_regs);
+	/*
+	 * Copy the softirq bits in preempt_count so that the
+	 * softirq checks work in the hardirq context.
+	 */
+	irqctx->tinfo.preempt_count =
+		(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+		(curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
+	if (unlikely(overflow))
+		call_on_stack(print_stack_overflow, isp);
+
+	asm volatile("xchgl	%%ebx,%%esp	\n"
+		     "call	*%%edi		\n"
+		     "movl	%%ebx,%%esp	\n"
+		     : "=a" (arg1), "=d" (arg2), "=b" (isp)
+		     :  "0" (irq),   "1" (desc),  "2" (isp),
+			"D" (desc->handle_irq)
+		     : "memory", "cc", "ecx");
 	return 1;
 }
 
-#ifdef CONFIG_4KSTACKS
-
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
-
 /*
  * allocate per-cpu stacks for hardirq and for softirq processing
  */
-void irq_ctx_init(int cpu)
+void __cpuinit irq_ctx_init(int cpu)
 {
 	union irq_ctx *irqctx;
 
@@ -164,25 +155,25 @@ void irq_ctx_init(int cpu)
 		return;
 
 	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
-	irqctx->tinfo.task              = NULL;
-	irqctx->tinfo.exec_domain       = NULL;
-	irqctx->tinfo.cpu               = cpu;
-	irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
-	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+	irqctx->tinfo.task		= NULL;
+	irqctx->tinfo.exec_domain	= NULL;
+	irqctx->tinfo.cpu		= cpu;
+	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
+	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
 	hardirq_ctx[cpu] = irqctx;
 
 	irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
-	irqctx->tinfo.task              = NULL;
-	irqctx->tinfo.exec_domain       = NULL;
-	irqctx->tinfo.cpu               = cpu;
-	irqctx->tinfo.preempt_count     = 0;
-	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+	irqctx->tinfo.task		= NULL;
+	irqctx->tinfo.exec_domain	= NULL;
+	irqctx->tinfo.cpu		= cpu;
+	irqctx->tinfo.preempt_count	= 0;
+	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
 	softirq_ctx[cpu] = irqctx;
 
-	printk("CPU %u irqstacks, hard=%p soft=%p\n",
-		cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
+	       cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
 }
 
 void irq_ctx_exit(int cpu)
@@ -211,25 +202,56 @@ asmlinkage void do_softirq(void)
 		/* build the stack frame on the softirq stack */
 		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
 
-		asm volatile(
-			"       xchgl   %%ebx,%%esp     \n"
-			"       call    __do_softirq    \n"
-			"       movl    %%ebx,%%esp     \n"
-			: "=b"(isp)
-			: "0"(isp)
-			: "memory", "cc", "edx", "ecx", "eax"
-		);
+		call_on_stack(__do_softirq, isp);
 		/*
 		 * Shouldnt happen, we returned above if in_interrupt():
-	 	 */
+		 */
 		WARN_ON_ONCE(softirq_count());
 	}
 
 	local_irq_restore(flags);
 }
+
+#else
+static inline int
+execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
 #endif
 
 /*
+ * do_IRQ handles all normal device IRQs (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int do_IRQ(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs;
+	/* high bit used in ret_from_ code */
+	int overflow, irq = ~regs->orig_ax;
+	struct irq_desc *desc = irq_desc + irq;
+
+	if (unlikely((unsigned)irq >= NR_IRQS)) {
+		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+					__func__, irq);
+		BUG();
+	}
+
+	old_regs = set_irq_regs(regs);
+	irq_enter();
+
+	overflow = check_stack_overflow();
+
+	if (!execute_on_irq_stack(overflow, desc, irq)) {
+		if (unlikely(overflow))
+			print_stack_overflow();
+		desc->handle_irq(irq, desc);
+	}
+
+	irq_exit();
+	set_irq_regs(old_regs);
+	return 1;
+}
+
+/*
  * Interrupt statistics:
  */
 
@@ -313,16 +335,20 @@ skip:
 				per_cpu(irq_stat,j).irq_tlb_count);
 		seq_printf(p, "  TLB shootdowns\n");
 #endif
+#ifdef CONFIG_X86_MCE
 		seq_printf(p, "TRM: ");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ",
 				per_cpu(irq_stat,j).irq_thermal_count);
 		seq_printf(p, "  Thermal event interrupts\n");
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
 		seq_printf(p, "SPU: ");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ",
 				per_cpu(irq_stat,j).irq_spurious_count);
 		seq_printf(p, "  Spurious interrupts\n");
+#endif
 		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
 		seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
@@ -331,6 +357,40 @@ skip:
 	return 0;
 }
 
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+	u64 sum = nmi_count(cpu);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
+#endif
+#ifdef CONFIG_SMP
+	sum += per_cpu(irq_stat, cpu).irq_resched_count;
+	sum += per_cpu(irq_stat, cpu).irq_call_count;
+	sum += per_cpu(irq_stat, cpu).irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+	sum += per_cpu(irq_stat, cpu).irq_thermal_count;
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	sum += per_cpu(irq_stat, cpu).irq_spurious_count;
+#endif
+	return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+	u64 sum = atomic_read(&irq_err_count);
+
+#ifdef CONFIG_X86_IO_APIC
+	sum += atomic_read(&irq_mis_count);
+#endif
+	return sum;
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 #include <mach_apic.h>
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 3aac15466a91..1f78b238d8d2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -135,6 +135,7 @@ skip:
 			seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
 		seq_printf(p, "  TLB shootdowns\n");
 #endif
+#ifdef CONFIG_X86_MCE
 		seq_printf(p, "TRM: ");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
@@ -143,6 +144,7 @@ skip:
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
 		seq_printf(p, "  Threshold APIC interrupts\n");
+#endif
 		seq_printf(p, "SPU: ");
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
@@ -153,6 +155,32 @@ skip:
 }
 
 /*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+	u64 sum = cpu_pda(cpu)->__nmi_count;
+
+	sum += cpu_pda(cpu)->apic_timer_irqs;
+#ifdef CONFIG_SMP
+	sum += cpu_pda(cpu)->irq_resched_count;
+	sum += cpu_pda(cpu)->irq_call_count;
+	sum += cpu_pda(cpu)->irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+	sum += cpu_pda(cpu)->irq_thermal_count;
+	sum += cpu_pda(cpu)->irq_threshold_count;
+#endif
+	sum += cpu_pda(cpu)->irq_spurious_count;
+	return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+	return atomic_read(&irq_err_count);
+}
+
+/*
  * do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
  * handlers).
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
new file mode 100644
index 000000000000..d66914287ee1
--- /dev/null
+++ b/arch/x86/kernel/irqinit_32.c
@@ -0,0 +1,114 @@
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/timer.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/arch_hooks.h>
+#include <asm/i8259.h>
+
+
+
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (i.e. as explained in the Intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+ 
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+	extern void math_error(void __user *);
+	outb(0,0xF0);
+	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+		return IRQ_NONE;
+	math_error((void __user *)get_irq_regs()->ip);
+	return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = {
+	.handler = math_error_irq,
+	.mask = CPU_MASK_NONE,
+	.name = "fpu",
+};
+
+void __init init_ISA_irqs (void)
+{
+	int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	/*
+	 * 16 old-style INTA-cycle interrupts:
+	 */
+	for (i = 0; i < 16; i++) {
+		set_irq_chip_and_handler_name(i, &i8259A_chip,
+					      handle_level_irq, "XT");
+	}
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	/* all the set up before the call gates are initialised */
+	pre_intr_init_hook();
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (i >= NR_IRQS)
+			break;
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (!test_bit(vector, used_vectors))
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+	/* setup after call gates are initialised (usually add in
+	 * the architecture specific gates)
+	 */
+	intr_init_hook();
+
+	/*
+	 * External FPU? Set up irq13 if so, for
+	 * original braindamaged IBM FERR coupling.
+	 */
+	if (boot_cpu_data.hard_math && !cpu_has_fpu)
+		setup_irq(FPU_IRQ, &fpu_irq);
+
+	irq_ctx_init(smp_processor_id());
+}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
new file mode 100644
index 000000000000..31f49e8f46a7
--- /dev/null
+++ b/arch/x86/kernel/irqinit_64.c
@@ -0,0 +1,217 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/acpi.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+
+/*
+ * Common place to define all x86 IRQ vectors
+ *
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ */
+
+#define IRQ_NAME2(nr) nr##_interrupt(void)
+#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
+
+/*
+ *	SMP has a few special interrupts for IPI messages
+ */
+
+#define BUILD_IRQ(nr)				\
+	asmlinkage void IRQ_NAME(nr);		\
+	asm("\n.p2align\n"			\
+	    "IRQ" #nr "_interrupt:\n\t"		\
+	    "push $~(" #nr ") ; "		\
+	    "jmp common_interrupt");
+
+#define BI(x,y) \
+	BUILD_IRQ(x##y)
+
+#define BUILD_16_IRQS(x) \
+	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+	BI(x,c) BI(x,d) BI(x,e) BI(x,f)
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+				      BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
+BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
+BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
+BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
+
+#undef BUILD_16_IRQS
+#undef BI
+
+
+#define IRQ(x,y) \
+	IRQ##x##y##_interrupt
+
+#define IRQLIST_16(x) \
+	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
+
+/* for the irq vectors */
+static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
+					  IRQLIST_16(0x2), IRQLIST_16(0x3),
+	IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
+	IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
+	IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
+};
+
+#undef IRQ
+#undef IRQLIST_16
+
+
+
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+
+static struct irqaction irq2 = {
+	.handler = no_action,
+	.mask = CPU_MASK_NONE,
+	.name = "cascade",
+};
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+	[0 ... IRQ0_VECTOR - 1] = -1,
+	[IRQ0_VECTOR] = 0,
+	[IRQ1_VECTOR] = 1,
+	[IRQ2_VECTOR] = 2,
+	[IRQ3_VECTOR] = 3,
+	[IRQ4_VECTOR] = 4,
+	[IRQ5_VECTOR] = 5,
+	[IRQ6_VECTOR] = 6,
+	[IRQ7_VECTOR] = 7,
+	[IRQ8_VECTOR] = 8,
+	[IRQ9_VECTOR] = 9,
+	[IRQ10_VECTOR] = 10,
+	[IRQ11_VECTOR] = 11,
+	[IRQ12_VECTOR] = 12,
+	[IRQ13_VECTOR] = 13,
+	[IRQ14_VECTOR] = 14,
+	[IRQ15_VECTOR] = 15,
+	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
+static void __init init_ISA_irqs (void)
+{
+	int i;
+
+	init_bsp_APIC();
+	init_8259A(0);
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = NULL;
+		irq_desc[i].depth = 1;
+
+		if (i < 16) {
+			/*
+			 * 16 old-style INTA-cycle interrupts:
+			 */
+			set_irq_chip_and_handler_name(i, &i8259A_chip,
+						      handle_level_irq, "XT");
+		} else {
+			/*
+			 * 'high' PCI IRQs filled in on demand
+			 */
+			irq_desc[i].chip = &no_irq_chip;
+		}
+	}
+}
+
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	init_ISA_irqs();
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (vector != IA32_SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+#ifdef CONFIG_SMP
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPIs for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+	/* IPI for generic function call */
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	/* Low priority IPI to cleanup after moving an irq */
+	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+
+	/* self generated IPI for local APIC timer */
+	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+	/* IPI vectors for APIC spurious and error interrupts */
+	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+	if (!acpi_ioapic)
+		setup_irq(2, &irq2);
+}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 0224c3637c73..21f2bae98c15 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -20,9 +20,9 @@
 #include <asm/mmu_context.h>
 
 #ifdef CONFIG_SMP
-static void flush_ldt(void *null)
+static void flush_ldt(void *current_mm)
 {
-	if (current->active_mm)
+	if (current->active_mm == current_mm)
 		load_LDT(&current->active_mm->context);
 }
 #endif
@@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 		load_LDT(pc);
 		mask = cpumask_of_cpu(smp_processor_id());
 		if (!cpus_equal(current->mm->cpu_vm_mask, mask))
-			smp_call_function(flush_ldt, NULL, 1, 1);
+			smp_call_function(flush_ldt, current->mm, 1, 1);
 		preempt_enable();
 #else
 		load_LDT(pc);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index d0b234c9fc31..f4960171bc66 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -39,7 +39,7 @@ static void set_idt(void *newidt, __u16 limit)
 	curidt.address = (unsigned long)newidt;
 
 	load_idt(&curidt);
-};
+}
 
 
 static void set_gdt(void *newgdt, __u16 limit)
@@ -51,7 +51,7 @@ static void set_gdt(void *newgdt, __u16 limit)
 	curgdt.address = (unsigned long)newgdt;
 
 	load_gdt(&curgdt);
-};
+}
 
 static void load_segments(void)
 {
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index 69729e38b78a..9758fea87c5b 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -5,13 +5,14 @@
  *		      2006	Shaohua Li <shaohua.li@intel.com>
  *
  *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II, 
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
  *	Pentium III, Xeon, Pentium 4, etc.
  *
- *	Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
- *	Order Number 245472 or free download from:
- *		
- *	http://developer.intel.com/design/pentium4/manuals/245472.htm
+ *	Reference: Section 8.11 of Volume 3a, IA-32 Intel(R) Architecture
+ *	Software Developer's Manual
+ *	Order Number 253668 or free download from:
+ *
+ *	http://developer.intel.com/design/pentium4/manuals/253668.htm
  *
  *	For more information, go to http://www.urbanmyth.org/microcode
  *
@@ -58,12 +59,12 @@
  *		nature of implementation.
  *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
  *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, 
+ *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
  *		Jun Nakajima <jun.nakajima@intel.com>
  *		Support for the microcode updates in the new format.
  *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
  *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode 
+ *		because we no longer hold a copy of applied microcode
  *		in kernel memory.
  *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
  *		Fix sigmatch() macro to handle old CPUs with pf == 0.
@@ -320,11 +321,11 @@ static void apply_microcode(int cpu)
 		return;
 
 	/* serialize access to the physical write to MSR 0x79 */
-	spin_lock_irqsave(&microcode_update_lock, flags);          
+	spin_lock_irqsave(&microcode_update_lock, flags);
 
 	/* write microcode via MSR 0x79 */
 	wrmsr(MSR_IA32_UCODE_WRITE,
-		(unsigned long) uci->mc->bits, 
+		(unsigned long) uci->mc->bits,
 		(unsigned long) uci->mc->bits >> 16 >> 16);
 	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
 
@@ -341,7 +342,7 @@ static void apply_microcode(int cpu)
 		return;
 	}
 	printk(KERN_INFO "microcode: CPU%d updated from revision "
-	       "0x%x to 0x%x, date = %08x \n", 
+	       "0x%x to 0x%x, date = %08x \n",
 	       cpu_num, uci->rev, val[1], uci->mc->hdr.date);
 	uci->rev = val[1];
 }
@@ -534,7 +535,7 @@ static int cpu_request_microcode(int cpu)
 		c->x86, c->x86_model, c->x86_mask);
 	error = request_firmware(&firmware, name, &microcode_pdev->dev);
 	if (error) {
-		pr_debug("microcode: ucode data file %s load failed\n", name);
+		pr_debug("microcode: data file %s load failed\n", name);
 		return error;
 	}
 	buf = firmware->data;
@@ -805,6 +806,9 @@ static int __init microcode_init (void)
 {
 	int error;
 
+	printk(KERN_INFO
+		"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
+
 	error = microcode_dev_init();
 	if (error)
 		return error;
@@ -825,9 +829,6 @@ static int __init microcode_init (void)
 	}
 
 	register_hotcpu_notifier(&mc_cpu_notifier);
-
-	printk(KERN_INFO 
-		"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
 	return 0;
 }
 
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index edc5fbfe85c0..fdfdc550b366 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -12,6 +12,7 @@
 #include <asm/io.h>
 #include <asm/msr.h>
 #include <asm/acpi.h>
+#include <asm/mmconfig.h>
 
 #include "../pci/pci.h"
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 4901ae3f742c..8b6b1e05c306 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -25,6 +25,8 @@
 #include <asm/proto.h>
 #include <asm/acpi.h>
 #include <asm/bios_ebda.h>
+#include <asm/e820.h>
+#include <asm/trampoline.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_X86_32
@@ -32,28 +34,6 @@
 #include <mach_mpparse.h>
 #endif
 
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-
-static int mp_current_pci_id;
-
-int pic_mode;
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
 /*
  * Checksum an MP configuration block.
  */
@@ -69,15 +49,73 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 }
 
 #ifdef CONFIG_X86_NUMAQ
+int found_numaq;
 /*
  * Have to match translation table entries to main table entries by counter
  * hence the mpc_record variable .... can't see a less disgusting way of
  * doing this ....
  */
+struct mpc_config_translation {
+	unsigned char mpc_type;
+	unsigned char trans_len;
+	unsigned char trans_type;
+	unsigned char trans_quad;
+	unsigned char trans_global;
+	unsigned char trans_local;
+	unsigned short trans_reserved;
+};
+
 
 static int mpc_record;
 static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
     __cpuinitdata;
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+
+static inline int mpc_apic_id(struct mpc_config_processor *m,
+			struct mpc_config_translation *translation_record)
+{
+	int quad = translation_record->trans_quad;
+	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+	printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+	       m->mpc_apicid,
+	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+	       m->mpc_apicver, quad, logical_apicid);
+	return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
+	struct mpc_config_translation *translation)
+{
+	int quad = translation->trans_quad;
+	int local = translation->trans_local;
+
+	mp_bus_id_to_node[m->mpc_busid] = quad;
+	mp_bus_id_to_local[m->mpc_busid] = local;
+	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+	       m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+static void mpc_oem_pci_bus(struct mpc_config_bus *m,
+	struct mpc_config_translation *translation)
+{
+	int quad = translation->trans_quad;
+	int local = translation->trans_local;
+
+	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
 #endif
 
 static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
@@ -90,7 +128,10 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 		return;
 	}
 #ifdef CONFIG_X86_NUMAQ
-	apicid = mpc_apic_id(m, translation_table[mpc_record]);
+	if (found_numaq)
+		apicid = mpc_apic_id(m, translation_table[mpc_record]);
+	else
+		apicid = m->mpc_apicid;
 #else
 	apicid = m->mpc_apicid;
 #endif
@@ -103,17 +144,18 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 	generic_processor_info(apicid, m->mpc_apicver);
 }
 
+#ifdef CONFIG_X86_IO_APIC
 static void __init MP_bus_info(struct mpc_config_bus *m)
 {
 	char str[7];
-
 	memcpy(str, m->mpc_bustype, 6);
 	str[6] = 0;
 
 #ifdef CONFIG_X86_NUMAQ
-	mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+	if (found_numaq)
+		mpc_oem_bus_info(m, str, translation_table[mpc_record]);
 #else
-	Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+	printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
 #endif
 
 #if MAX_MP_BUSSES < 256
@@ -132,11 +174,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 #endif
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
 #ifdef CONFIG_X86_NUMAQ
-		mpc_oem_pci_bus(m, translation_table[mpc_record]);
+		if (found_numaq)
+			mpc_oem_pci_bus(m, translation_table[mpc_record]);
 #endif
 		clear_bit(m->mpc_busid, mp_bus_not_pci);
-		mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
-		mp_current_pci_id++;
 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
 	} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
@@ -147,6 +188,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 	} else
 		printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
 }
+#endif
 
 #ifdef CONFIG_X86_IO_APIC
 
@@ -176,18 +218,89 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
 	if (bad_ioapic(m->mpc_apicaddr))
 		return;
 
-	mp_ioapics[nr_ioapics] = *m;
+	mp_ioapics[nr_ioapics].mp_apicaddr = m->mpc_apicaddr;
+	mp_ioapics[nr_ioapics].mp_apicid = m->mpc_apicid;
+	mp_ioapics[nr_ioapics].mp_type = m->mpc_type;
+	mp_ioapics[nr_ioapics].mp_apicver = m->mpc_apicver;
+	mp_ioapics[nr_ioapics].mp_flags = m->mpc_flags;
 	nr_ioapics++;
 }
 
-static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
 {
-	mp_irqs[mp_irq_entries] = *m;
-	Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+	printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 		m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+}
+
+static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
+{
+	printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
+		mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
+		(mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
+		mp_irq->mp_srcbusirq, mp_irq->mp_dstapic, mp_irq->mp_dstirq);
+}
+
+static void __init assign_to_mp_irq(struct mpc_config_intsrc *m,
+				    struct mp_config_intsrc *mp_irq)
+{
+	mp_irq->mp_dstapic = m->mpc_dstapic;
+	mp_irq->mp_type = m->mpc_type;
+	mp_irq->mp_irqtype = m->mpc_irqtype;
+	mp_irq->mp_irqflag = m->mpc_irqflag;
+	mp_irq->mp_srcbus = m->mpc_srcbus;
+	mp_irq->mp_srcbusirq = m->mpc_srcbusirq;
+	mp_irq->mp_dstirq = m->mpc_dstirq;
+}
+
+static void __init assign_to_mpc_intsrc(struct mp_config_intsrc *mp_irq,
+					struct mpc_config_intsrc *m)
+{
+	m->mpc_dstapic = mp_irq->mp_dstapic;
+	m->mpc_type = mp_irq->mp_type;
+	m->mpc_irqtype = mp_irq->mp_irqtype;
+	m->mpc_irqflag = mp_irq->mp_irqflag;
+	m->mpc_srcbus = mp_irq->mp_srcbus;
+	m->mpc_srcbusirq = mp_irq->mp_srcbusirq;
+	m->mpc_dstirq = mp_irq->mp_dstirq;
+}
+
+static int __init mp_irq_mpc_intsrc_cmp(struct mp_config_intsrc *mp_irq,
+					struct mpc_config_intsrc *m)
+{
+	if (mp_irq->mp_dstapic != m->mpc_dstapic)
+		return 1;
+	if (mp_irq->mp_type != m->mpc_type)
+		return 2;
+	if (mp_irq->mp_irqtype != m->mpc_irqtype)
+		return 3;
+	if (mp_irq->mp_irqflag != m->mpc_irqflag)
+		return 4;
+	if (mp_irq->mp_srcbus != m->mpc_srcbus)
+		return 5;
+	if (mp_irq->mp_srcbusirq != m->mpc_srcbusirq)
+		return 6;
+	if (mp_irq->mp_dstirq != m->mpc_dstirq)
+		return 7;
+
+	return 0;
+}
+
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
+{
+	int i;
+
+	print_MP_intsrc_info(m);
+
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
+			return;
+	}
+
+	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
 	if (++mp_irq_entries == MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
@@ -196,7 +309,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
 
 static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
 {
-	Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+	printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
@@ -266,11 +379,14 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
 	}
 }
 
-static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
 				 char *productid)
 {
 	if (strncmp(oem, "IBM NUMA", 8))
-		printk("Warning!  May not be a NUMA-Q system!\n");
+		printk("Warning!  Not a NUMA-Q system!\n");
+	else
+		found_numaq = 1;
+
 	if (mpc->mpc_oemptr)
 		smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
 				 mpc->mpc_oemsize);
@@ -281,12 +397,9 @@ static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
  * Read/parse the MPC
  */
 
-static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+static int __init smp_check_mpc(struct mp_config_table *mpc, char *oem,
+				char *str)
 {
-	char str[16];
-	char oem[10];
-	int count = sizeof(*mpc);
-	unsigned char *mpt = ((unsigned char *)mpc) + count;
 
 	if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
 		printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
@@ -309,19 +422,42 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 	}
 	memcpy(oem, mpc->mpc_oem, 8);
 	oem[8] = 0;
-	printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
+	printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
 
 	memcpy(str, mpc->mpc_productid, 12);
 	str[12] = 0;
-	printk("Product ID: %s ", str);
 
-#ifdef CONFIG_X86_32
-	mps_oem_check(mpc, oem, str);
-#endif
-	printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
+	printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
 
 	printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
 
+	return 1;
+}
+
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
+{
+	char str[16];
+	char oem[10];
+
+	int count = sizeof(*mpc);
+	unsigned char *mpt = ((unsigned char *)mpc) + count;
+
+	if (!smp_check_mpc(mpc, oem, str))
+		return 0;
+
+#ifdef CONFIG_X86_32
+	/*
+	 * We need to make sure that summit's and es7000's mps_oem_check is
+	 * safe to be called early via genericarch's mps_oem_check.
+	 */
+	if (early) {
+#ifdef CONFIG_X86_NUMAQ
+		numaq_mps_oem_check(mpc, oem, str);
+#endif
+	} else
+		mps_oem_check(mpc, oem, str);
+#endif
+
 	/* save the local APIC address, it might be non-default */
 	if (!acpi_lapic)
 		mp_lapic_addr = mpc->mpc_lapic;
@@ -352,7 +488,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 			{
 				struct mpc_config_bus *m =
 				    (struct mpc_config_bus *)mpt;
+#ifdef CONFIG_X86_IO_APIC
 				MP_bus_info(m);
+#endif
 				mpt += sizeof(*m);
 				count += sizeof(*m);
 				break;
@@ -402,6 +540,11 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 		++mpc_record;
 #endif
 	}
+
+#ifdef CONFIG_X86_GENERICARCH
+       generic_bigsmp_probe();
+#endif
+
 	setup_apic_routing();
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
@@ -427,7 +570,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 	intsrc.mpc_type = MP_INTSRC;
 	intsrc.mpc_irqflag = 0;	/* conforming */
 	intsrc.mpc_srcbus = 0;
-	intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+	intsrc.mpc_dstapic = mp_ioapics[0].mp_apicid;
 
 	intsrc.mpc_irqtype = mp_INT;
 
@@ -488,40 +631,11 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 	MP_intsrc_info(&intsrc);
 }
 
-#endif
 
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+static void construct_ioapic_table(int mpc_default_type)
 {
-	struct mpc_config_processor processor;
-	struct mpc_config_bus bus;
-#ifdef CONFIG_X86_IO_APIC
 	struct mpc_config_ioapic ioapic;
-#endif
-	struct mpc_config_lintsrc lintsrc;
-	int linttypes[2] = { mp_ExtINT, mp_NMI };
-	int i;
-
-	/*
-	 * local APIC has default address
-	 */
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-	/*
-	 * 2 CPUs, numbered 0 & 1.
-	 */
-	processor.mpc_type = MP_PROCESSOR;
-	/* Either an integrated APIC or a discrete 82489DX. */
-	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
-	processor.mpc_cpuflag = CPU_ENABLED;
-	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
-	    (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
-	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
-	processor.mpc_reserved[0] = 0;
-	processor.mpc_reserved[1] = 0;
-	for (i = 0; i < 2; i++) {
-		processor.mpc_apicid = i;
-		MP_processor_info(&processor);
-	}
+	struct mpc_config_bus bus;
 
 	bus.mpc_type = MP_BUS;
 	bus.mpc_busid = 0;
@@ -550,7 +664,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 		MP_bus_info(&bus);
 	}
 
-#ifdef CONFIG_X86_IO_APIC
 	ioapic.mpc_type = MP_IOAPIC;
 	ioapic.mpc_apicid = 2;
 	ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -562,7 +675,42 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 	 * We set up most of the low 16 IO-APIC pins according to MPS rules.
 	 */
 	construct_default_ioirq_mptable(mpc_default_type);
+}
+#else
+static inline void construct_ioapic_table(int mpc_default_type) { }
 #endif
+
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+	struct mpc_config_processor processor;
+	struct mpc_config_lintsrc lintsrc;
+	int linttypes[2] = { mp_ExtINT, mp_NMI };
+	int i;
+
+	/*
+	 * local APIC has default address
+	 */
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+	/*
+	 * 2 CPUs, numbered 0 & 1.
+	 */
+	processor.mpc_type = MP_PROCESSOR;
+	/* Either an integrated APIC or a discrete 82489DX. */
+	processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+	processor.mpc_cpuflag = CPU_ENABLED;
+	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+	    (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+	processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+	processor.mpc_reserved[0] = 0;
+	processor.mpc_reserved[1] = 0;
+	for (i = 0; i < 2; i++) {
+		processor.mpc_apicid = i;
+		MP_processor_info(&processor);
+	}
+
+	construct_ioapic_table(mpc_default_type);
+
 	lintsrc.mpc_type = MP_LINTSRC;
 	lintsrc.mpc_irqflag = 0;	/* conforming */
 	lintsrc.mpc_srcbusid = 0;
@@ -600,7 +748,7 @@ static void __init __get_smp_config(unsigned early)
 
 	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
 	       mpf->mpf_specification);
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
 	if (mpf->mpf_feature2 & (1 << 7)) {
 		printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
 		pic_mode = 1;
@@ -632,7 +780,9 @@ static void __init __get_smp_config(unsigned early)
 		 * override the defaults.
 		 */
 		if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
+#ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 0;
+#endif
 			printk(KERN_ERR
 			       "BIOS bug, MP table errors detected!...\n");
 			printk(KERN_ERR "... disabling SMP support. "
@@ -689,7 +839,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	unsigned int *bp = phys_to_virt(base);
 	struct intel_mp_floating *mpf;
 
-	Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+	printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
@@ -699,15 +849,21 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 		    !mpf_checksum((unsigned char *)bp, 16) &&
 		    ((mpf->mpf_specification == 1)
 		     || (mpf->mpf_specification == 4))) {
-
+#ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 1;
+#endif
 			mpf_found = mpf;
-#ifdef CONFIG_X86_32
+
 			printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
 			       mpf, virt_to_phys(mpf));
-			reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
+
+			if (!reserve)
+				return 1;
+			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
 					BOOTMEM_DEFAULT);
 			if (mpf->mpf_physptr) {
+				unsigned long size = PAGE_SIZE;
+#ifdef CONFIG_X86_32
 				/*
 				 * We cannot access to MPC table to compute
 				 * table size yet, as only few megabytes from
@@ -717,25 +873,15 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 				 * PAGE_SIZE from mpg->mpf_physptr yields BUG()
 				 * in reserve_bootmem.
 				 */
-				unsigned long size = PAGE_SIZE;
 				unsigned long end = max_low_pfn * PAGE_SIZE;
 				if (mpf->mpf_physptr + size > end)
 					size = end - mpf->mpf_physptr;
-				reserve_bootmem(mpf->mpf_physptr, size,
+#endif
+				reserve_bootmem_generic(mpf->mpf_physptr, size,
 						BOOTMEM_DEFAULT);
 			}
 
-#else
-			if (!reserve)
-				return 1;
-
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
-				BOOTMEM_DEFAULT);
-			if (mpf->mpf_physptr)
-				reserve_bootmem_generic(mpf->mpf_physptr,
-					PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
-		return 1;
+			return 1;
 		}
 		bp += 4;
 		length -= 16;
@@ -791,298 +937,294 @@ void __init find_smp_config(void)
 	__find_smp_config(1);
 }
 
-/* --------------------------------------------------------------------------
-                            ACPI-based MP Configuration
-   -------------------------------------------------------------------------- */
+#ifdef CONFIG_X86_IO_APIC
+static u8 __initdata irq_used[MAX_IRQ_SOURCES];
 
-/*
- * Keep this outside and initialized to 0, for !CONFIG_ACPI builds:
- */
-int es7000_plat;
+static int  __init get_MP_intsrc_index(struct mpc_config_intsrc *m)
+{
+	int i;
 
-#ifdef CONFIG_ACPI
+	if (m->mpc_irqtype != mp_INT)
+		return 0;
 
-#ifdef	CONFIG_X86_IO_APIC
+	if (m->mpc_irqflag != 0x0f)
+		return 0;
 
-#define MP_ISA_BUS		0
+	/* not legacy */
 
-extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (mp_irqs[i].mp_irqtype != mp_INT)
+			continue;
 
-static int mp_find_ioapic(int gsi)
-{
-	int i = 0;
+		if (mp_irqs[i].mp_irqflag != 0x0f)
+			continue;
 
-	/* Find the IOAPIC that manages this GSI. */
-	for (i = 0; i < nr_ioapics; i++) {
-		if ((gsi >= mp_ioapic_routing[i].gsi_base)
-		    && (gsi <= mp_ioapic_routing[i].gsi_end))
-			return i;
+		if (mp_irqs[i].mp_srcbus != m->mpc_srcbus)
+			continue;
+		if (mp_irqs[i].mp_srcbusirq != m->mpc_srcbusirq)
+			continue;
+		if (irq_used[i]) {
+			/* already claimed */
+			return -2;
+		}
+		irq_used[i] = 1;
+		return i;
 	}
 
-	printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+	/* not found */
 	return -1;
 }
 
-static u8 __init uniq_ioapic_id(u8 id)
-{
-#ifdef CONFIG_X86_32
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
-	    !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-		return io_apic_get_unique_id(nr_ioapics, id);
-	else
-		return id;
-#else
-	int i;
-	DECLARE_BITMAP(used, 256);
-	bitmap_zero(used, 256);
-	for (i = 0; i < nr_ioapics; i++) {
-		struct mpc_config_ioapic *ia = &mp_ioapics[i];
-		__set_bit(ia->mpc_apicid, used);
-	}
-	if (!test_bit(id, used))
-		return id;
-	return find_first_zero_bit(used, 256);
+#define SPARE_SLOT_NUM 20
+
+static struct mpc_config_intsrc __initdata *m_spare[SPARE_SLOT_NUM];
 #endif
-}
 
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+static int  __init replace_intsrc_all(struct mp_config_table *mpc,
+					unsigned long mpc_new_phys,
+					unsigned long mpc_new_length)
 {
-	int idx = 0;
-
-	if (bad_ioapic(address))
-		return;
+#ifdef CONFIG_X86_IO_APIC
+	int i;
+	int nr_m_spare = 0;
+#endif
 
-	idx = nr_ioapics;
+	int count = sizeof(*mpc);
+	unsigned char *mpt = ((unsigned char *)mpc) + count;
 
-	mp_ioapics[idx].mpc_type = MP_IOAPIC;
-	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
-	mp_ioapics[idx].mpc_apicaddr = address;
+	printk(KERN_INFO "mpc_length %x\n", mpc->mpc_length);
+	while (count < mpc->mpc_length) {
+		switch (*mpt) {
+		case MP_PROCESSOR:
+			{
+				struct mpc_config_processor *m =
+				    (struct mpc_config_processor *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		case MP_BUS:
+			{
+				struct mpc_config_bus *m =
+				    (struct mpc_config_bus *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		case MP_IOAPIC:
+			{
+				mpt += sizeof(struct mpc_config_ioapic);
+				count += sizeof(struct mpc_config_ioapic);
+				break;
+			}
+		case MP_INTSRC:
+			{
+#ifdef CONFIG_X86_IO_APIC
+				struct mpc_config_intsrc *m =
+				    (struct mpc_config_intsrc *)mpt;
 
-	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
-	mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-#else
-	mp_ioapics[idx].mpc_apicver = 0;
+				printk(KERN_INFO "OLD ");
+				print_MP_intsrc_info(m);
+				i = get_MP_intsrc_index(m);
+				if (i > 0) {
+					assign_to_mpc_intsrc(&mp_irqs[i], m);
+					printk(KERN_INFO "NEW ");
+					print_mp_irq_info(&mp_irqs[i]);
+				} else if (!i) {
+					/* legacy, do nothing */
+				} else if (nr_m_spare < SPARE_SLOT_NUM) {
+					/*
+					 * Not found (-1) or duplicated (-2):
+					 * these are invalid entries, so
+					 * remember the slot for reuse later.
+					 */
+					m_spare[nr_m_spare] = m;
+					nr_m_spare++;
+				}
 #endif
-	/*
-	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
-	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
-	 */
-	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
-	mp_ioapic_routing[idx].gsi_base = gsi_base;
-	mp_ioapic_routing[idx].gsi_end = gsi_base +
-	    io_apic_get_redir_entries(idx);
+				mpt += sizeof(struct mpc_config_intsrc);
+				count += sizeof(struct mpc_config_intsrc);
+				break;
+			}
+		case MP_LINTSRC:
+			{
+				struct mpc_config_lintsrc *m =
+				    (struct mpc_config_lintsrc *)mpt;
+				mpt += sizeof(*m);
+				count += sizeof(*m);
+				break;
+			}
+		default:
+			/* wrong mptable */
+			printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+			printk(KERN_ERR "type %x\n", *mpt);
+			print_hex_dump(KERN_ERR, "  ", DUMP_PREFIX_ADDRESS, 16,
+					1, mpc, mpc->mpc_length, 1);
+			goto out;
+		}
+	}
 
-	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-	       "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
-	       mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
-	       mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+#ifdef CONFIG_X86_IO_APIC
+	for (i = 0; i < mp_irq_entries; i++) {
+		if (irq_used[i])
+			continue;
 
-	nr_ioapics++;
-}
+		if (mp_irqs[i].mp_irqtype != mp_INT)
+			continue;
 
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
-	struct mpc_config_intsrc intsrc;
-	int ioapic = -1;
-	int pin = -1;
+		if (mp_irqs[i].mp_irqflag != 0x0f)
+			continue;
 
-	/*
-	 * Convert 'gsi' to 'ioapic.pin'.
-	 */
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0)
-		return;
-	pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
-
-	/*
-	 * TBD: This check is for faulty timer entries, where the override
-	 *      erroneously sets the trigger to level, resulting in a HUGE
-	 *      increase of timer interrupts!
-	 */
-	if ((bus_irq == 0) && (trigger == 3))
-		trigger = 1;
-
-	intsrc.mpc_type = MP_INTSRC;
-	intsrc.mpc_irqtype = mp_INT;
-	intsrc.mpc_irqflag = (trigger << 2) | polarity;
-	intsrc.mpc_srcbus = MP_ISA_BUS;
-	intsrc.mpc_srcbusirq = bus_irq;	/* IRQ */
-	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;	/* APIC ID */
-	intsrc.mpc_dstirq = pin;	/* INTIN# */
+		if (nr_m_spare > 0) {
+			printk(KERN_INFO "*NEW* found ");
+			nr_m_spare--;
+			assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+			m_spare[nr_m_spare] = NULL;
+		} else {
+			struct mpc_config_intsrc *m =
+			    (struct mpc_config_intsrc *)mpt;
+			count += sizeof(struct mpc_config_intsrc);
+			if (!mpc_new_phys) {
+				printk(KERN_INFO "No spare slots, try to append...take your risk, new mpc_length %x\n", count);
+			} else {
+				if (count <= mpc_new_length)
+					printk(KERN_INFO "No spare slots, try to append..., new mpc_length %x\n", count);
+				else {
+					printk(KERN_ERR "mpc_new_length %lx is too small\n", mpc_new_length);
+					goto out;
+				}
+			}
+			assign_to_mpc_intsrc(&mp_irqs[i], m);
+			mpc->mpc_length = count;
+			mpt += sizeof(struct mpc_config_intsrc);
+		}
+		print_mp_irq_info(&mp_irqs[i]);
+	}
+#endif
+out:
+	/* update checksum */
+	mpc->mpc_checksum = 0;
+	mpc->mpc_checksum -= mpf_checksum((unsigned char *)mpc,
+					   mpc->mpc_length);
 
-	MP_intsrc_info(&intsrc);
+	return 0;
 }
 
-void __init mp_config_acpi_legacy_irqs(void)
-{
-	struct mpc_config_intsrc intsrc;
-	int i = 0;
-	int ioapic = -1;
+static int __initdata enable_update_mptable;
 
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-	/*
-	 * Fabricate the legacy ISA bus (bus #31).
-	 */
-	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
-#endif
-	set_bit(MP_ISA_BUS, mp_bus_not_pci);
-	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+static int __init update_mptable_setup(char *str)
+{
+	enable_update_mptable = 1;
+	return 0;
+}
+early_param("update_mptable", update_mptable_setup);
 
-	/*
-	 * Older generations of ES7000 have no legacy identity mappings
-	 */
-	if (es7000_plat == 1)
-		return;
+static unsigned long __initdata mpc_new_phys;
+static unsigned long mpc_new_length __initdata = 4096;
 
-	/*
-	 * Locate the IOAPIC that manages the ISA IRQs (0-15).
-	 */
-	ioapic = mp_find_ioapic(0);
-	if (ioapic < 0)
-		return;
+/* alloc_mptable or alloc_mptable=4k */
+static int __initdata alloc_mptable;
+static int __init parse_alloc_mptable_opt(char *p)
+{
+	enable_update_mptable = 1;
+	alloc_mptable = 1;
+	if (!p)
+		return 0;
+	mpc_new_length = memparse(p, &p);
+	return 0;
+}
+early_param("alloc_mptable", parse_alloc_mptable_opt);
 
-	intsrc.mpc_type = MP_INTSRC;
-	intsrc.mpc_irqflag = 0;	/* Conforming */
-	intsrc.mpc_srcbus = MP_ISA_BUS;
-#ifdef CONFIG_X86_IO_APIC
-	intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+void __init early_reserve_e820_mpc_new(void)
+{
+	if (enable_update_mptable && alloc_mptable) {
+		u64 startt = 0;
+#ifdef CONFIG_X86_TRAMPOLINE
+		startt = TRAMPOLINE_BASE;
 #endif
-	/*
-	 * Use the default configuration for the IRQs 0-15.  Unless
-	 * overridden by (MADT) interrupt source override entries.
-	 */
-	for (i = 0; i < 16; i++) {
-		int idx;
-
-		for (idx = 0; idx < mp_irq_entries; idx++) {
-			struct mpc_config_intsrc *irq = mp_irqs + idx;
-
-			/* Do we already have a mapping for this ISA IRQ? */
-			if (irq->mpc_srcbus == MP_ISA_BUS
-			    && irq->mpc_srcbusirq == i)
-				break;
-
-			/* Do we already have a mapping for this IOAPIC pin */
-			if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
-			    (irq->mpc_dstirq == i))
-				break;
-		}
-
-		if (idx != mp_irq_entries) {
-			printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
-			continue;	/* IRQ already used */
-		}
-
-		intsrc.mpc_irqtype = mp_INT;
-		intsrc.mpc_srcbusirq = i;	/* Identity mapped */
-		intsrc.mpc_dstirq = i;
-
-		MP_intsrc_info(&intsrc);
+		mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
 	}
 }
 
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int __init update_mp_table(void)
 {
-	int ioapic;
-	int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM	4096
-#define IRQ_COMPRESSION_START	64
+	char str[16];
+	char oem[10];
+	struct intel_mp_floating *mpf;
+	struct mp_config_table *mpc;
+	struct mp_config_table *mpc_new;
+
+	if (!enable_update_mptable)
+		return 0;
+
+	mpf = mpf_found;
+	if (!mpf)
+		return 0;
 
-	static int pci_irq = IRQ_COMPRESSION_START;
 	/*
-	 * Mapping between Global System Interrupts, which
-	 * represent all possible interrupts, and IRQs
-	 * assigned to actual devices.
+	 * Now see if we need to go further.
 	 */
-	static int gsi_to_irq[MAX_GSI_NUM];
-#else
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return gsi;
-#endif
+	if (mpf->mpf_feature1 != 0)
+		return 0;
 
-	/* Don't set up the ACPI SCI because it's already set up */
-	if (acpi_gbl_FADT.sci_interrupt == gsi)
-		return gsi;
+	if (!mpf->mpf_physptr)
+		return 0;
 
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0) {
-		printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
-		return gsi;
-	}
+	mpc = phys_to_virt(mpf->mpf_physptr);
 
-	ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+	if (!smp_check_mpc(mpc, oem, str))
+		return 0;
 
-#ifdef CONFIG_X86_32
-	if (ioapic_renumber_irq)
-		gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
+	printk(KERN_INFO "mpf: %lx\n", virt_to_phys(mpf));
+	printk(KERN_INFO "mpf_physptr: %x\n", mpf->mpf_physptr);
 
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
-		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
-		       ioapic_pin);
-		return gsi;
+	if (mpc_new_phys && mpc->mpc_length > mpc_new_length) {
+		mpc_new_phys = 0;
+		printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
+			 mpc_new_length);
 	}
-	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
-		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
-		return gsi;
-#endif
+
+	if (!mpc_new_phys) {
+		unsigned char old, new;
+		/* check if we can change the position */
+		mpc->mpc_checksum = 0;
+		old = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+		mpc->mpc_checksum = 0xff;
+		new = mpf_checksum((unsigned char *)mpc, mpc->mpc_length);
+		if (old == new) {
+			printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+			return 0;
+		}
+		printk(KERN_INFO "use in-positon replacing\n");
+	} else {
+		mpf->mpf_physptr = mpc_new_phys;
+		mpc_new = phys_to_virt(mpc_new_phys);
+		memcpy(mpc_new, mpc, mpc->mpc_length);
+		mpc = mpc_new;
+		/* check if we can modify that */
+		if (mpc_new_phys - mpf->mpf_physptr) {
+			struct intel_mp_floating *mpf_new;
+			/* steal 16 bytes from [0, 1k) */
+			printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+			mpf_new = phys_to_virt(0x400 - 16);
+			memcpy(mpf_new, mpf, 16);
+			mpf = mpf_new;
+			mpf->mpf_physptr = mpc_new_phys;
+		}
+		mpf->mpf_checksum = 0;
+		mpf->mpf_checksum -= mpf_checksum((unsigned char *)mpf, 16);
+		printk(KERN_INFO "mpf_physptr new: %x\n", mpf->mpf_physptr);
 	}
 
-	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
 	/*
-	 * For GSI >= 64, use IRQ compression
+	 * Only replace the entries with mp_INT and
+	 *	 MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW
+	 * that are already in mp_irqs, stored by ... and mp_config_acpi_gsi;
+	 * pci=routeirq may be needed for full coverage.
 	 */
-	if ((gsi >= IRQ_COMPRESSION_START)
-	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
-		/*
-		 * For PCI devices assign IRQs in order, avoiding gaps
-		 * due to unused I/O APIC pins.
-		 */
-		int irq = gsi;
-		if (gsi < MAX_GSI_NUM) {
-			/*
-			 * Retain the VIA chipset work-around (gsi > 15), but
-			 * avoid a problem where the 8254 timer (IRQ0) is setup
-			 * via an override (so it's not on pin 0 of the ioapic),
-			 * and at the same time, the pin 0 interrupt is a PCI
-			 * type.  The gsi > 15 test could cause these two pins
-			 * to be shared as IRQ0, and they are not shareable.
-			 * So test for this condition, and if necessary, avoid
-			 * the pin collision.
-			 */
-			gsi = pci_irq++;
-			/*
-			 * Don't assign IRQ used by ACPI SCI
-			 */
-			if (gsi == acpi_gbl_FADT.sci_interrupt)
-				gsi = pci_irq++;
-			gsi_to_irq[irq] = gsi;
-		} else {
-			printk(KERN_ERR "GSI %u is too high\n", gsi);
-			return gsi;
-		}
-	}
-#endif
-	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-	return gsi;
+	replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length);
+
+	return 0;
 }
 
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+late_initcall(update_mp_table);
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 84160f74eeb0..6580dae46277 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -24,8 +24,11 @@
 #include <linux/kdebug.h>
 #include <linux/slab.h>
 
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
 #include <asm/smp.h>
 #include <asm/nmi.h>
+#include <asm/timer.h>
 
 #include "mach_traps.h"
 
@@ -81,7 +84,7 @@ int __init check_nmi_watchdog(void)
 
 	prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
 	if (!prev_nmi_count)
-		return -1;
+		goto error;
 
 	printk(KERN_INFO "Testing NMI watchdog ... ");
 
@@ -118,7 +121,7 @@ int __init check_nmi_watchdog(void)
 	if (!atomic_read(&nmi_active)) {
 		kfree(prev_nmi_count);
 		atomic_set(&nmi_active, -1);
-		return -1;
+		goto error;
 	}
 	printk("OK.\n");
 
@@ -129,6 +132,12 @@ int __init check_nmi_watchdog(void)
 
 	kfree(prev_nmi_count);
 	return 0;
+error:
+	if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
+		disable_8259A_irq(0);
+	timer_ack = 0;
+
+	return -1;
 }
 
 static int __init setup_nmi_watchdog(char *str)
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 2861b9408ac9..d62f3b66b529 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -21,6 +21,8 @@
 #include <linux/cpumask.h>
 #include <linux/kdebug.h>
 
+#include <asm/i8259.h>
+#include <asm/io_apic.h>
 #include <asm/smp.h>
 #include <asm/nmi.h>
 #include <asm/proto.h>
@@ -90,7 +92,7 @@ int __init check_nmi_watchdog(void)
 
 	prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
 	if (!prev_nmi_count)
-		return -1;
+		goto error;
 
 	printk(KERN_INFO "Testing NMI watchdog ... ");
 
@@ -121,7 +123,7 @@ int __init check_nmi_watchdog(void)
 	if (!atomic_read(&nmi_active)) {
 		kfree(prev_nmi_count);
 		atomic_set(&nmi_active, -1);
-		return -1;
+		goto error;
 	}
 	printk("OK.\n");
 
@@ -132,6 +134,11 @@ int __init check_nmi_watchdog(void)
 
 	kfree(prev_nmi_count);
 	return 0;
+error:
+	if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259)
+		disable_8259A_irq(0);
+
+	return -1;
 }
 
 static int __init setup_nmi_watchdog(char *str)
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index e65281b1634b..f0f1de1c4a1d 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,6 +31,8 @@
 #include <asm/numaq.h>
 #include <asm/topology.h>
 #include <asm/processor.h>
+#include <asm/mpspec.h>
+#include <asm/e820.h>
 
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
@@ -58,6 +60,8 @@ static void __init smp_dump_qct(void)
 			node_end_pfn[node] = MB_TO_PAGES(
 				eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
 
+			e820_register_active_regions(node, node_start_pfn[node],
+							node_end_pfn[node]);
 			memory_present(node,
 				node_start_pfn[node], node_end_pfn[node]);
 			node_remap_size[node] = node_memmap_size_bytes(node,
@@ -67,13 +71,24 @@ static void __init smp_dump_qct(void)
 	}
 }
 
-/*
- * Unlike Summit, we don't really care to let the NUMA-Q
- * fall back to flat mode.  Don't compile for NUMA-Q
- * unless you really need it!
- */
+static __init void early_check_numaq(void)
+{
+	/*
+	 * Find possible boot-time SMP configuration:
+	 */
+	early_find_smp_config();
+	/*
+	 * get boot-time SMP configuration:
+	 */
+	if (smp_found_config)
+		early_get_smp_config();
+}
+
 int __init get_memcfg_numaq(void)
 {
+	early_check_numaq();
+	if (!found_numaq)
+		return 0;
 	smp_dump_qct();
 	return 1;
 }
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 74f0c5ea2a03..f1ab0f727007 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -380,6 +380,9 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.pte_update = paravirt_nop,
 	.pte_update_defer = paravirt_nop,
 
+	.ptep_modify_prot_start = __ptep_modify_prot_start,
+	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
 #ifdef CONFIG_HIGHPTE
 	.kmap_atomic_pte = kmap_atomic,
 #endif
@@ -403,6 +406,7 @@ struct pv_mmu_ops pv_mmu_ops = {
 #endif /* PAGETABLE_LEVELS >= 3 */
 
 	.pte_val = native_pte_val,
+	.pte_flags = native_pte_val,
 	.pgd_val = native_pgd_val,
 
 	.make_pte = native_make_pte,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index dc00a1331ace..cb0bdf440715 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -7,6 +7,7 @@
 #include <asm/dma.h>
 #include <asm/gart.h>
 #include <asm/calgary.h>
+#include <asm/amd_iommu.h>
 
 int forbid_dac __read_mostly;
 EXPORT_SYMBOL(forbid_dac);
@@ -77,10 +78,14 @@ void __init dma32_reserve_bootmem(void)
 	if (end_pfn <= MAX_DMA32_PFN)
 		return;
 
+	/*
+	 * see allocate_aperture() in aperture_64.c for the reason why
+	 * 512M is used as the goal
+	 */
 	align = 64ULL<<20;
 	size = round_up(dma32_bootmem_size, align);
 	dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
-				 __pa(MAX_DMA_ADDRESS));
+				 512ULL<<20);
 	if (dma32_bootmem_ptr)
 		dma32_bootmem_size = size;
 	else
@@ -88,7 +93,6 @@ void __init dma32_reserve_bootmem(void)
 }
 static void __init dma32_free_bootmem(void)
 {
-	int node;
 
 	if (end_pfn <= MAX_DMA32_PFN)
 		return;
@@ -96,9 +100,7 @@ static void __init dma32_free_bootmem(void)
 	if (!dma32_bootmem_ptr)
 		return;
 
-	for_each_online_node(node)
-		free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
-				  dma32_bootmem_size);
+	free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
 
 	dma32_bootmem_ptr = NULL;
 	dma32_bootmem_size = 0;
@@ -122,6 +124,8 @@ void __init pci_iommu_alloc(void)
 
 	detect_intel_iommu();
 
+	amd_iommu_detect();
+
 #ifdef CONFIG_SWIOTLB
 	pci_swiotlb_init();
 #endif
@@ -357,7 +361,7 @@ int dma_supported(struct device *dev, u64 mask)
 EXPORT_SYMBOL(dma_supported);
 
 /* Allocate DMA memory on node near device */
-noinline struct page *
+static noinline struct page *
 dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
 {
 	int node;
@@ -502,6 +506,8 @@ static int __init pci_iommu_init(void)
 
 	intel_iommu_init();
 
+	amd_iommu_init();
+
 #ifdef CONFIG_GART_IOMMU
 	gart_iommu_init();
 #endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index aa8ec928caa8..021f3c684a62 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -104,7 +104,6 @@ static unsigned long alloc_iommu(struct device *dev, int size)
 					  size, base_index, boundary_size, 0);
 	}
 	if (offset != -1) {
-		set_bit_string(iommu_gart_bitmap, offset, size);
 		next_bit = offset+size;
 		if (next_bit >= iommu_pages) {
 			next_bit = 0;
@@ -534,8 +533,8 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
 	unsigned aper_size = 0, aper_base_32, aper_order;
 	u64 aper_base;
 
-	pci_read_config_dword(dev, 0x94, &aper_base_32);
-	pci_read_config_dword(dev, 0x90, &aper_order);
+	pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32);
+	pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order);
 	aper_order = (aper_order >> 1) & 7;
 
 	aper_base = aper_base_32 & 0x7fff;
@@ -549,14 +548,63 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
 	return aper_base;
 }
 
+static void enable_gart_translations(void)
+{
+	int i;
+
+	for (i = 0; i < num_k8_northbridges; i++) {
+		struct pci_dev *dev = k8_northbridges[i];
+
+		enable_gart_translation(dev, __pa(agp_gatt_table));
+	}
+}
+
+/*
+ * If fix_up_north_bridges is set, the north bridges have to be fixed up on
+ * resume in the same way as they are handled in gart_iommu_hole_init().
+ */
+static bool fix_up_north_bridges;
+static u32 aperture_order;
+static u32 aperture_alloc;
+
+void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
+{
+	fix_up_north_bridges = true;
+	aperture_order = aper_order;
+	aperture_alloc = aper_alloc;
+}
+
 static int gart_resume(struct sys_device *dev)
 {
+	printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n");
+
+	if (fix_up_north_bridges) {
+		int i;
+
+		printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n");
+
+		for (i = 0; i < num_k8_northbridges; i++) {
+			struct pci_dev *dev = k8_northbridges[i];
+
+			/*
+			 * Don't enable translations just yet.  That is the next
+			 * step.  Restore the pre-suspend aperture settings.
+			 */
+			pci_write_config_dword(dev, AMD64_GARTAPERTURECTL,
+						aperture_order << 1);
+			pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
+						aperture_alloc >> 25);
+		}
+	}
+
+	enable_gart_translations();
+
 	return 0;
 }
 
 static int gart_suspend(struct sys_device *dev, pm_message_t state)
 {
-	return -EINVAL;
+	return 0;
 }
 
 static struct sysdev_class gart_sysdev_class = {
@@ -614,27 +662,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	memset(gatt, 0, gatt_size);
 	agp_gatt_table = gatt;
 
-	for (i = 0; i < num_k8_northbridges; i++) {
-		u32 gatt_reg;
-		u32 ctl;
-
-		dev = k8_northbridges[i];
-		gatt_reg = __pa(gatt) >> 12;
-		gatt_reg <<= 4;
-		pci_write_config_dword(dev, 0x98, gatt_reg);
-		pci_read_config_dword(dev, 0x90, &ctl);
-
-		ctl |= 1;
-		ctl &= ~((1<<4) | (1<<5));
-
-		pci_write_config_dword(dev, 0x90, ctl);
-	}
+	enable_gart_translations();
 
 	error = sysdev_class_register(&gart_sysdev_class);
 	if (!error)
 		error = sysdev_register(&device_gart);
 	if (error)
 		panic("Could not register gart_sysdev -- would corrupt data on next suspend");
+
 	flush_gart();
 
 	printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
@@ -677,11 +712,11 @@ void gart_iommu_shutdown(void)
 		u32 ctl;
 
 		dev = k8_northbridges[i];
-		pci_read_config_dword(dev, 0x90, &ctl);
+		pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
 
-		ctl &= ~1;
+		ctl &= ~GARTEN;
 
-		pci_write_config_dword(dev, 0x90, ctl);
+		pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
 	}
 }
 
@@ -788,10 +823,10 @@ void __init gart_iommu_init(void)
 	wbinvd();
 
 	/*
-	 * Try to workaround a bug (thanks to BenH)
+	 * Try to work around a bug (thanks to BenH):
 	 * Set unmapped entries to a scratch page instead of 0.
 	 * Any prefetches that hit unmapped entries won't get an bus abort
-	 * then.
+	 * then. (P2P bridge may be prefetching on DMA reads).
 	 */
 	scratch = get_zeroed_page(GFP_KERNEL);
 	if (!scratch)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba370dc8685b..4061d63aabe7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/pm.h>
+#include <linux/clockchips.h>
 
 struct kmem_cache *task_xstate_cachep;
 
@@ -45,6 +46,76 @@ void arch_task_cache_init(void)
 				  SLAB_PANIC, NULL);
 }
 
+/*
+ * Idle related variables and functions
+ */
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
+
+#ifdef CONFIG_X86_32
+/*
+ * This halt magic was a workaround for ancient floppy DMA
+ * wreckage. It should be safe to remove.
+ */
+static int hlt_counter;
+void disable_hlt(void)
+{
+	hlt_counter++;
+}
+EXPORT_SYMBOL(disable_hlt);
+
+void enable_hlt(void)
+{
+	hlt_counter--;
+}
+EXPORT_SYMBOL(enable_hlt);
+
+static inline int hlt_use_halt(void)
+{
+	return (!hlt_counter && boot_cpu_data.hlt_works_ok);
+}
+#else
+static inline int hlt_use_halt(void)
+{
+	return 1;
+}
+#endif
+
+/*
+ * We use this if we don't have any better
+ * idle routine..
+ */
+void default_idle(void)
+{
+	if (hlt_use_halt()) {
+		current_thread_info()->status &= ~TS_POLLING;
+		/*
+		 * TS_POLLING-cleared state must be visible before we
+		 * test NEED_RESCHED:
+		 */
+		smp_mb();
+
+		if (!need_resched())
+			safe_halt();	/* enables interrupts racelessly */
+		else
+			local_irq_enable();
+		current_thread_info()->status |= TS_POLLING;
+	} else {
+		local_irq_enable();
+		/* loop is done by the caller */
+		cpu_relax();
+	}
+}
+#ifdef CONFIG_APM_MODULE
+EXPORT_SYMBOL(default_idle);
+#endif
+
 static void do_nothing(void *unused)
 {
 }
@@ -122,44 +193,129 @@ static void poll_idle(void)
  *
  * idle=mwait overrides this decision and forces the usage of mwait.
  */
+
+#define MWAIT_INFO			0x05
+#define MWAIT_ECX_EXTENDED_INFO		0x01
+#define MWAIT_EDX_C1			0xf0
+
 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
 {
+	u32 eax, ebx, ecx, edx;
+
 	if (force_mwait)
 		return 1;
 
-	if (c->x86_vendor == X86_VENDOR_AMD) {
-		switch(c->x86) {
-		case 0x10:
-		case 0x11:
-			return 0;
-		}
-	}
+	if (c->cpuid_level < MWAIT_INFO)
+		return 0;
+
+	cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx);
+	/* Check whether EDX has extended info about MWAIT */
+	if (!(ecx & MWAIT_ECX_EXTENDED_INFO))
+		return 1;
+
+	/*
+	 * EDX enumerates MONITOR/MWAIT extensions. Check whether
+	 * C1 supports MWAIT
+	 */
+	return (edx & MWAIT_EDX_C1);
+}
+
+/*
+ * Check for AMD CPUs which potentially have C1E support
+ */
+static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
+{
+	if (c->x86_vendor != X86_VENDOR_AMD)
+		return 0;
+
+	if (c->x86 < 0x0F)
+		return 0;
+
+	/* Family 0x0f models < rev F do not have C1E */
+	if (c->x86 == 0x0f && c->x86_model < 0x40)
+		return 0;
+
 	return 1;
 }
 
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+/*
+ * C1E aware idle routine. We check for C1E active in the interrupt
+ * pending message MSR. If we detect C1E, then we handle it the same
+ * way as C3 power states (local apic timer and TSC stop)
+ */
+static void c1e_idle(void)
 {
-	static int selected;
+	static cpumask_t c1e_mask = CPU_MASK_NONE;
+	static int c1e_detected;
 
-	if (selected)
+	if (need_resched())
 		return;
+
+	if (!c1e_detected) {
+		u32 lo, hi;
+
+		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
+			c1e_detected = 1;
+			mark_tsc_unstable("TSC halt in C1E");
+			printk(KERN_INFO "System has C1E enabled\n");
+		}
+	}
+
+	if (c1e_detected) {
+		int cpu = smp_processor_id();
+
+		if (!cpu_isset(cpu, c1e_mask)) {
+			cpu_set(cpu, c1e_mask);
+			/*
+			 * Force broadcast so ACPI cannot interfere. Needs
+			 * to run with interrupts enabled as it uses
+			 * smp_call_function.
+			 */
+			local_irq_enable();
+			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+					   &cpu);
+			printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
+			       cpu);
+			local_irq_disable();
+		}
+		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+
+		default_idle();
+
+		/*
+		 * The switch back from broadcast mode needs to be
+		 * called with interrupts disabled.
+		 */
+		 local_irq_disable();
+		 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+		 local_irq_enable();
+	} else
+		default_idle();
+}
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
 #ifdef CONFIG_X86_SMP
 	if (pm_idle == poll_idle && smp_num_siblings > 1) {
 		printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
 			" performance may degrade.\n");
 	}
 #endif
+	if (pm_idle)
+		return;
+
 	if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
 		/*
-		 * Skip, if setup has overridden idle.
 		 * One CPU supports mwait => All CPUs supports mwait
 		 */
-		if (!pm_idle) {
-			printk(KERN_INFO "using mwait in idle threads.\n");
-			pm_idle = mwait_idle;
-		}
-	}
-	selected = 1;
+		printk(KERN_INFO "using mwait in idle threads.\n");
+		pm_idle = mwait_idle;
+	} else if (check_c1e_idle(c)) {
+		printk(KERN_INFO "using C1E aware idle routine\n");
+		pm_idle = c1e_idle;
+	} else
+		pm_idle = default_idle;
 }
 
 static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index e2db9ac5c61c..c2a11d77b1b5 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -58,11 +58,6 @@
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
-static int hlt_counter;
-
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 
@@ -77,55 +72,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 	return ((unsigned long *)tsk->thread.sp)[3];
 }
 
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
-void disable_hlt(void)
-{
-	hlt_counter++;
-}
-
-EXPORT_SYMBOL(disable_hlt);
-
-void enable_hlt(void)
-{
-	hlt_counter--;
-}
-
-EXPORT_SYMBOL(enable_hlt);
-
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
-	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
-		current_thread_info()->status &= ~TS_POLLING;
-		/*
-		 * TS_POLLING-cleared state must be visible before we
-		 * test NEED_RESCHED:
-		 */
-		smp_mb();
-
-		if (!need_resched())
-			safe_halt();	/* enables interrupts racelessly */
-		else
-			local_irq_enable();
-		current_thread_info()->status |= TS_POLLING;
-	} else {
-		local_irq_enable();
-		/* loop is done by the caller */
-		cpu_relax();
-	}
-}
-#ifdef CONFIG_APM_MODULE
-EXPORT_SYMBOL(default_idle);
-#endif
-
 #ifdef CONFIG_HOTPLUG_CPU
 #include <asm/nmi.h>
 /* We don't actually take CPU down, just spin without interrupts. */
@@ -168,24 +114,19 @@ void cpu_idle(void)
 	while (1) {
 		tick_nohz_stop_sched_tick();
 		while (!need_resched()) {
-			void (*idle)(void);
 
 			check_pgt_cache();
 			rmb();
-			idle = pm_idle;
 
 			if (rcu_pending(cpu))
 				rcu_check_callbacks(cpu, 0);
 
-			if (!idle)
-				idle = default_idle;
-
 			if (cpu_is_offline(cpu))
 				play_dead();
 
 			local_irq_disable();
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
-			idle();
+			pm_idle();
 		}
 		tick_nohz_restart_sched_tick();
 		preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c6eb5c91e5f6..290183e9731a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -56,15 +56,6 @@ asmlinkage extern void ret_from_fork(void);
 
 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
 
-unsigned long boot_option_idle_override = 0;
-EXPORT_SYMBOL(boot_option_idle_override);
-
-/*
- * Powermanagement idle function, if any..
- */
-void (*pm_idle)(void);
-EXPORT_SYMBOL(pm_idle);
-
 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
 
 void idle_notifier_register(struct notifier_block *n)
@@ -94,25 +85,6 @@ void exit_idle(void)
 	__exit_idle();
 }
 
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
-{
-	current_thread_info()->status &= ~TS_POLLING;
-	/*
-	 * TS_POLLING-cleared state must be visible before we
-	 * test NEED_RESCHED:
-	 */
-	smp_mb();
-	if (!need_resched())
-		safe_halt();	/* enables interrupts racelessly */
-	else
-		local_irq_enable();
-	current_thread_info()->status |= TS_POLLING;
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 DECLARE_PER_CPU(int, cpu_state);
 
@@ -150,12 +122,9 @@ void cpu_idle(void)
 	while (1) {
 		tick_nohz_stop_sched_tick();
 		while (!need_resched()) {
-			void (*idle)(void);
 
 			rmb();
-			idle = pm_idle;
-			if (!idle)
-				idle = default_idle;
+
 			if (cpu_is_offline(smp_processor_id()))
 				play_dead();
 			/*
@@ -165,7 +134,7 @@ void cpu_idle(void)
 			 */
 			local_irq_disable();
 			enter_idle();
-			idle();
+			pm_idle();
 			/* In many cases the interrupt that ended idle
 			   has already called exit_idle. But some idle
 			   loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7835f282936..77040b6070e1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -943,13 +943,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		return copy_regset_to_user(child, &user_x86_32_view,
 					   REGSET_XFP,
 					   0, sizeof(struct user_fxsr_struct),
-					   datap);
+					   datap) ? -EIO : 0;
 
 	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
 		return copy_regset_from_user(child, &user_x86_32_view,
 					     REGSET_XFP,
 					     0, sizeof(struct user_fxsr_struct),
-					     datap);
+					     datap) ? -EIO : 0;
 #endif
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d89a648fe710..79bdcd11c66e 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -65,6 +65,7 @@ static enum {
 	ICH_FORCE_HPET_RESUME,
 	VT8237_FORCE_HPET_RESUME,
 	NVIDIA_FORCE_HPET_RESUME,
+	ATI_FORCE_HPET_RESUME,
 } force_hpet_resume_type;
 
 static void __iomem *rcba_base;
@@ -158,6 +159,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0,
 			 ich_force_enable_hpet);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0,
+			 ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1,
 			 ich_force_enable_hpet);
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0,
@@ -174,6 +177,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7,
 
 static struct pci_dev *cached_dev;
 
+static void hpet_print_force_info(void)
+{
+	printk(KERN_INFO "HPET not enabled in BIOS. "
+	       "You might try hpet=force boot option\n");
+}
+
 static void old_ich_force_hpet_resume(void)
 {
 	u32 val;
@@ -253,6 +262,8 @@ static void old_ich_force_enable_hpet_user(struct pci_dev *dev)
 {
 	if (hpet_force_user)
 		old_ich_force_enable_hpet(dev);
+	else
+		hpet_print_force_info();
 }
 
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0,
@@ -290,8 +301,13 @@ static void vt8237_force_enable_hpet(struct pci_dev *dev)
 {
 	u32 uninitialized_var(val);
 
-	if (!hpet_force_user || hpet_address || force_hpet_address)
+	if (hpet_address || force_hpet_address)
+		return;
+
+	if (!hpet_force_user) {
+		hpet_print_force_info();
 		return;
+	}
 
 	pci_read_config_dword(dev, 0x68, &val);
 	/*
@@ -330,6 +346,36 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
 			 vt8237_force_enable_hpet);
 
+static void ati_force_hpet_resume(void)
+{
+	pci_write_config_dword(cached_dev, 0x14, 0xfed00000);
+	printk(KERN_DEBUG "Force enabled HPET at resume\n");
+}
+
+static void ati_force_enable_hpet(struct pci_dev *dev)
+{
+	u32 uninitialized_var(val);
+
+	if (hpet_address || force_hpet_address)
+		return;
+
+	if (!hpet_force_user) {
+		hpet_print_force_info();
+		return;
+	}
+
+	pci_write_config_dword(dev, 0x14, 0xfed00000);
+	pci_read_config_dword(dev, 0x14, &val);
+	force_hpet_address = val;
+	force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
+	dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
+		   force_hpet_address);
+	cached_dev = dev;
+	return;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+			 ati_force_enable_hpet);
+
 /*
  * Undocumented chipset feature taken from LinuxBIOS.
  */
@@ -343,8 +389,13 @@ static void nvidia_force_enable_hpet(struct pci_dev *dev)
 {
 	u32 uninitialized_var(val);
 
-	if (!hpet_force_user || hpet_address || force_hpet_address)
+	if (hpet_address || force_hpet_address)
+		return;
+
+	if (!hpet_force_user) {
+		hpet_print_force_info();
 		return;
+	}
 
 	pci_write_config_dword(dev, 0x44, 0xfed00001);
 	pci_read_config_dword(dev, 0x44, &val);
@@ -397,6 +448,9 @@ void force_hpet_resume(void)
 	case NVIDIA_FORCE_HPET_RESUME:
 		nvidia_force_hpet_resume();
 		return;
+	case ATI_FORCE_HPET_RESUME:
+		ati_force_hpet_resume();
+		return;
 	default:
 		break;
 	}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f6be7d5f82f8..f8a62160e151 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -27,7 +27,7 @@
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
 
-static long no_idt[3];
+static const struct desc_ptr no_idt = {};
 static int reboot_mode;
 enum reboot_type reboot_type = BOOT_KBD;
 int reboot_force;
@@ -201,15 +201,15 @@ core_initcall(reboot_init);
    controller to pulse the CPU reset line, which is more thorough, but
    doesn't work with at least one type of 486 motherboard.  It is easy
    to stop this code working; hence the copious comments. */
-static unsigned long long
+static const unsigned long long
 real_mode_gdt_entries [3] =
 {
 	0x0000000000000000ULL,	/* Null descriptor */
-	0x00009a000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
-	0x000092000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
+	0x00009b000000ffffULL,	/* 16-bit real-mode 64k code at 0x00000000 */
+	0x000093000100ffffULL	/* 16-bit real-mode 64k data at 0x00000100 */
 };
 
-static struct desc_ptr
+static const struct desc_ptr
 real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
 real_mode_idt = { 0x3ff, 0 };
 
@@ -231,7 +231,7 @@ real_mode_idt = { 0x3ff, 0 };
 
    More could be done here to set up the registers as if a CPU reset had
    occurred; hopefully real BIOSs don't assume much. */
-static unsigned char real_mode_switch [] =
+static const unsigned char real_mode_switch [] =
 {
 	0x66, 0x0f, 0x20, 0xc0,			/*    movl  %cr0,%eax        */
 	0x66, 0x83, 0xe0, 0x11,			/*    andl  $0x00000011,%eax */
@@ -245,7 +245,7 @@ static unsigned char real_mode_switch [] =
 	0x24, 0x10,				/* f: andb  $0x10,al         */
 	0x66, 0x0f, 0x22, 0xc0			/*    movl  %eax,%cr0        */
 };
-static unsigned char jump_to_bios [] =
+static const unsigned char jump_to_bios [] =
 {
 	0xea, 0x00, 0x00, 0xff, 0xff		/*    ljmp  $0xffff,$0x0000  */
 };
@@ -255,7 +255,7 @@ static unsigned char jump_to_bios [] =
  * specified by the code and length parameters.
  * We assume that length will aways be less that 100!
  */
-void machine_real_restart(unsigned char *code, int length)
+void machine_real_restart(const unsigned char *code, int length)
 {
 	local_irq_disable();
 
@@ -368,7 +368,7 @@ static void native_machine_emergency_restart(void)
 			}
 
 		case BOOT_TRIPLE:
-			load_idt((const struct desc_ptr *)&no_idt);
+			load_idt(&no_idt);
 			__asm__ __volatile__("int3");
 
 			reboot_type = BOOT_KBD;
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index dec0b5ec25c2..61a837743fe5 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -49,7 +49,7 @@ struct device_fixup {
 	void (*reboot_fixup)(struct pci_dev *);
 };
 
-static struct device_fixup fixups_table[] = {
+static const struct device_fixup fixups_table[] = {
 { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
 { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
@@ -64,7 +64,7 @@ static struct device_fixup fixups_table[] = {
  */
 void mach_reboot_fixups(void)
 {
-	struct device_fixup *cur;
+	const struct device_fixup *cur;
 	struct pci_dev *dev;
 	int i;
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d4eaa4eb481d..ebb0a2bcdc08 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -17,6 +17,7 @@ unsigned int num_processors;
 unsigned disabled_cpus __cpuinitdata;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int max_physical_apicid;
 EXPORT_SYMBOL(boot_cpu_physical_apicid);
 
 /* Bitmask of physically existing CPUs */
@@ -206,6 +207,31 @@ void __init setup_per_cpu_areas(void)
 
 #endif
 
+void __init parse_setup_data(void)
+{
+	struct setup_data *data;
+	u64 pa_data;
+
+	if (boot_params.hdr.version < 0x0209)
+		return;
+	pa_data = boot_params.hdr.setup_data;
+	while (pa_data) {
+		data = early_ioremap(pa_data, PAGE_SIZE);
+		switch (data->type) {
+		case SETUP_E820_EXT:
+			parse_e820_ext(data, pa_data);
+			break;
+		default:
+			break;
+		}
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+		free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+		pa_data = data->next;
+		early_iounmap(data, PAGE_SIZE);
+	}
+}
+
 #ifdef X86_64_NUMA
 
 /*
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index ccd5f5cdbbe6..a9b19ad24edb 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -59,6 +59,7 @@
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
 #include <asm/sections.h>
+#include <asm/dmi.h>
 #include <asm/io_apic.h>
 #include <asm/ist.h>
 #include <asm/io.h>
@@ -67,10 +68,13 @@
 #include <asm/bios_ebda.h>
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
+#include <asm/efi.h>
+#include <asm/bugs.h>
 
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
    address, and must not be in the .bss segment! */
+unsigned long init_pg_tables_start __initdata = ~0UL;
 unsigned long init_pg_tables_end __initdata = ~0UL;
 
 /*
@@ -182,6 +186,12 @@ int bootloader_type;
 static unsigned int highmem_pages = -1;
 
 /*
+ * Early DMI memory
+ */
+int dmi_alloc_index;
+char dmi_alloc_data[DMI_MAX_DATA];
+
+/*
  * Setup options
  */
 struct screen_info screen_info;
@@ -237,42 +247,6 @@ static inline void copy_edd(void)
 }
 #endif
 
-int __initdata user_defined_memmap;
-
-/*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem=  [also see Documentation/i386/boot.txt]
- */
-static int __init parse_mem(char *arg)
-{
-	if (!arg)
-		return -EINVAL;
-
-	if (strcmp(arg, "nopentium") == 0) {
-		setup_clear_cpu_cap(X86_FEATURE_PSE);
-	} else {
-		/* If the user specifies memory size, we
-		 * limit the BIOS-provided memory map to
-		 * that size. exactmap can be used to specify
-		 * the exact map. mem=number can be used to
-		 * trim the existing memory map.
-		 */
-		unsigned long long mem_size;
-
-		mem_size = memparse(arg, &arg);
-		limit_regions(mem_size);
-		user_defined_memmap = 1;
-	}
-	return 0;
-}
-early_param("mem", parse_mem);
-
 #ifdef CONFIG_PROC_VMCORE
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel.
@@ -395,56 +369,6 @@ unsigned long __init find_max_low_pfn(void)
 	return max_low_pfn;
 }
 
-#define BIOS_LOWMEM_KILOBYTES 0x413
-
-/*
- * The BIOS places the EBDA/XBDA at the top of conventional
- * memory, and usually decreases the reported amount of
- * conventional memory (int 0x12) too. This also contains a
- * workaround for Dell systems that neglect to reserve EBDA.
- * The same workaround also avoids a problem with the AMD768MPX
- * chipset: reserve a page before VGA to prevent PCI prefetch
- * into it (errata #56). Usually the page is reserved anyways,
- * unless you have no PS/2 mouse plugged in.
- */
-static void __init reserve_ebda_region(void)
-{
-	unsigned int lowmem, ebda_addr;
-
-	/* To determine the position of the EBDA and the */
-	/* end of conventional memory, we need to look at */
-	/* the BIOS data area. In a paravirtual environment */
-	/* that area is absent. We'll just have to assume */
-	/* that the paravirt case can handle memory setup */
-	/* correctly, without our help. */
-	if (paravirt_enabled())
-		return;
-
-	/* end of low (conventional) memory */
-	lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
-	lowmem <<= 10;
-
-	/* start of EBDA area */
-	ebda_addr = get_bios_ebda();
-
-	/* Fixup: bios puts an EBDA in the top 64K segment */
-	/* of conventional memory, but does not adjust lowmem. */
-	if ((lowmem - ebda_addr) <= 0x10000)
-		lowmem = ebda_addr;
-
-	/* Fixup: bios does not report an EBDA at all. */
-	/* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
-	if ((ebda_addr == 0) && (lowmem >= 0x9f000))
-		lowmem = 0x9f000;
-
-	/* Paranoia: should never happen, but... */
-	if ((lowmem == 0) || (lowmem >= 0x100000))
-		lowmem = 0x9f000;
-
-	/* reserve all memory between lowmem and the 1MB mark */
-	reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
-}
-
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static void __init setup_bootmem_allocator(void);
 static unsigned long __init setup_memory(void)
@@ -462,11 +386,13 @@ static unsigned long __init setup_memory(void)
 	if (max_pfn > max_low_pfn) {
 		highstart_pfn = max_low_pfn;
 	}
+	memory_present(0, 0, highend_pfn);
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
 	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
+	memory_present(0, 0, max_low_pfn);
 	num_physpages = max_low_pfn;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
@@ -488,11 +414,12 @@ static void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_DMA] =
 		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+	remove_all_active_ranges();
 #ifdef CONFIG_HIGHMEM
 	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-	add_active_range(0, 0, highend_pfn);
+	e820_register_active_regions(0, 0, highend_pfn);
 #else
-	add_active_range(0, 0, max_low_pfn);
+	e820_register_active_regions(0, 0, max_low_pfn);
 #endif
 
 	free_area_init_nodes(max_zone_pfns);
@@ -526,25 +453,28 @@ static void __init reserve_crashkernel(void)
 	ret = parse_crashkernel(boot_command_line, total_mem,
 			&crash_size, &crash_base);
 	if (ret == 0 && crash_size > 0) {
-		if (crash_base > 0) {
-			printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
-					"for crashkernel (System RAM: %ldMB)\n",
-					(unsigned long)(crash_size >> 20),
-					(unsigned long)(crash_base >> 20),
-					(unsigned long)(total_mem >> 20));
-
-			if (reserve_bootmem(crash_base, crash_size,
-					BOOTMEM_EXCLUSIVE) < 0) {
-				printk(KERN_INFO "crashkernel reservation "
-					"failed - memory is in use\n");
-				return;
-			}
-
-			crashk_res.start = crash_base;
-			crashk_res.end   = crash_base + crash_size - 1;
-		} else
+		if (crash_base <= 0) {
 			printk(KERN_INFO "crashkernel reservation failed - "
 					"you have to specify a base address\n");
+			return;
+		}
+
+		if (reserve_bootmem_generic(crash_base, crash_size,
+					BOOTMEM_EXCLUSIVE) < 0) {
+			printk(KERN_INFO "crashkernel reservation failed - "
+					"memory is in use\n");
+			return;
+		}
+
+		printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
+				"for crashkernel (System RAM: %ldMB)\n",
+				(unsigned long)(crash_size >> 20),
+				(unsigned long)(crash_base >> 20),
+				(unsigned long)(total_mem >> 20));
+
+		crashk_res.start = crash_base;
+		crashk_res.end   = crash_base + crash_size - 1;
+		insert_resource(&iomem_resource, &crashk_res);
 	}
 }
 #else
@@ -558,44 +488,57 @@ static bool do_relocate_initrd = false;
 
 static void __init reserve_initrd(void)
 {
-	unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-	unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-	unsigned long ramdisk_end   = ramdisk_image + ramdisk_size;
-	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-	unsigned long ramdisk_here;
-
-	initrd_start = 0;
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 ramdisk_end   = ramdisk_image + ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	u64 ramdisk_here;
 
 	if (!boot_params.hdr.type_of_loader ||
 	    !ramdisk_image || !ramdisk_size)
 		return;		/* No initrd provided by bootloader */
 
-	if (ramdisk_end < ramdisk_image) {
-		printk(KERN_ERR "initrd wraps around end of memory, "
-		       "disabling initrd\n");
-		return;
-	}
+	initrd_start = 0;
+
 	if (ramdisk_size >= end_of_lowmem/2) {
+		free_early(ramdisk_image, ramdisk_end);
 		printk(KERN_ERR "initrd too large to handle, "
 		       "disabling initrd\n");
 		return;
 	}
+
+	printk(KERN_INFO "old RAMDISK: %08llx - %08llx\n", ramdisk_image,
+			ramdisk_end);
+
+
 	if (ramdisk_end <= end_of_lowmem) {
 		/* All in lowmem, easy case */
-		reserve_bootmem(ramdisk_image, ramdisk_size, BOOTMEM_DEFAULT);
+		/*
+		 * don't need to reserve again, already reserved early
+		 * in i386_start_kernel
+		 */
 		initrd_start = ramdisk_image + PAGE_OFFSET;
 		initrd_end = initrd_start+ramdisk_size;
 		return;
 	}
 
 	/* We need to move the initrd down into lowmem */
-	ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK;
+	ramdisk_here = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 end_of_lowmem, ramdisk_size,
+				 PAGE_SIZE);
+
+	if (ramdisk_here == -1ULL)
+		panic("Cannot find place for new RAMDISK of size %lld\n",
+			 ramdisk_size);
 
 	/* Note: this includes all the lowmem currently occupied by
 	   the initrd, we rely on that fact to keep the data intact. */
-	reserve_bootmem(ramdisk_here, ramdisk_size, BOOTMEM_DEFAULT);
+	reserve_early(ramdisk_here, ramdisk_here + ramdisk_size,
+			 "NEW RAMDISK");
 	initrd_start = ramdisk_here + PAGE_OFFSET;
 	initrd_end   = initrd_start + ramdisk_size;
+	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
+			 ramdisk_here, ramdisk_here + ramdisk_size);
 
 	do_relocate_initrd = true;
 }
@@ -604,10 +547,10 @@ static void __init reserve_initrd(void)
 
 static void __init relocate_initrd(void)
 {
-	unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-	unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-	unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT;
-	unsigned long ramdisk_here;
+	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+	u64 end_of_lowmem = max_low_pfn << PAGE_SHIFT;
+	u64 ramdisk_here;
 	unsigned long slop, clen, mapaddr;
 	char *p, *q;
 
@@ -624,6 +567,10 @@ static void __init relocate_initrd(void)
 		p = (char *)__va(ramdisk_image);
 		memcpy(q, p, clen);
 		q += clen;
+		/* need to free these low pages...*/
+		printk(KERN_INFO "Freeing old partial RAMDISK %08llx-%08llx\n",
+			 ramdisk_image, ramdisk_image + clen - 1);
+		free_bootmem(ramdisk_image, clen);
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
 	}
@@ -642,66 +589,47 @@ static void __init relocate_initrd(void)
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
 	}
+	/* high pages are not converted by early_res_to_bootmem */
+	ramdisk_image = boot_params.hdr.ramdisk_image;
+	ramdisk_size  = boot_params.hdr.ramdisk_size;
+	printk(KERN_INFO "Copied RAMDISK from %016llx - %016llx to %08llx - %08llx\n",
+		ramdisk_image, ramdisk_image + ramdisk_size - 1,
+		ramdisk_here, ramdisk_here + ramdisk_size - 1);
+
+	/* need to free that, otherwise init highmem will reserve it again */
+	free_early(ramdisk_image, ramdisk_image+ramdisk_size);
 }
 
 #endif /* CONFIG_BLK_DEV_INITRD */
 
 void __init setup_bootmem_allocator(void)
 {
-	unsigned long bootmap_size;
+	int i;
+	unsigned long bootmap_size, bootmap;
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
 	 */
-	bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
-
-	register_bootmem_low_pages(max_low_pfn);
-
-	/*
-	 * Reserve the bootmem bitmap itself as well. We do this in two
-	 * steps (first step was init_bootmem()) because this catches
-	 * the (very unlikely) case of us accidentally initializing the
-	 * bootmem allocator with an invalid RAM area.
-	 */
-	reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
-			 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text),
-			 BOOTMEM_DEFAULT);
-
-	/*
-	 * reserve physical page 0 - it's a special BIOS page on many boxes,
-	 * enabling clean reboots, SMP operation, laptop functions.
-	 */
-	reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
-
-	/* reserve EBDA region */
-	reserve_ebda_region();
-
-#ifdef CONFIG_SMP
-	/*
-	 * But first pinch a few for the stack/trampoline stuff
-	 * FIXME: Don't need the extra page at 4K, but need to fix
-	 * trampoline before removing it. (see the GDT stuff)
-	 */
-	reserve_bootmem(PAGE_SIZE, PAGE_SIZE, BOOTMEM_DEFAULT);
-#endif
-#ifdef CONFIG_ACPI_SLEEP
-	/*
-	 * Reserve low memory region for sleep support.
-	 */
-	acpi_reserve_bootmem();
-#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
-	/*
-	 * Find and reserve possible boot-time SMP configuration:
-	 */
-	find_smp_config();
-#endif
+	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
+	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+				 PAGE_SIZE);
+	if (bootmap == -1L)
+		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 #ifdef CONFIG_BLK_DEV_INITRD
 	reserve_initrd();
 #endif
-	numa_kva_reserve();
-	reserve_crashkernel();
+	bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, max_low_pfn);
+	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
+		 max_pfn_mapped<<PAGE_SHIFT);
+	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
+		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
+	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
+		 bootmap, bootmap + bootmap_size);
+	for_each_online_node(i)
+		free_bootmem_with_active_regions(i, max_low_pfn);
+	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
 
-	reserve_ibft_region();
 }
 
 /*
@@ -731,11 +659,7 @@ static void set_mca_bus(int x)
 static void set_mca_bus(int x) { }
 #endif
 
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-char * __init __attribute__((weak)) memory_setup(void)
-{
-	return machine_specific_memory_setup();
-}
+static void probe_roms(void);
 
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
@@ -746,17 +670,21 @@ char * __init __attribute__((weak)) memory_setup(void)
  */
 void __init setup_arch(char **cmdline_p)
 {
+	int i;
 	unsigned long max_low_pfn;
 
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	pre_setup_arch_hook();
 	early_cpu_init();
 	early_ioremap_init();
+	reserve_setup_data();
 
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL32", 4))
+		     "EL32", 4)) {
 		efi_enabled = 1;
+		efi_reserve_early();
+	}
 #endif
 
 	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -780,8 +708,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	ARCH_SETUP
 
-	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
-	print_memory_map(memory_setup());
+	setup_memory_map();
 
 	copy_edd();
 
@@ -799,12 +726,18 @@ void __init setup_arch(char **cmdline_p)
 	bss_resource.start = virt_to_phys(&__bss_start);
 	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
+	parse_setup_data();
+
 	parse_early_param();
 
-	if (user_defined_memmap) {
-		printk(KERN_INFO "user-defined physical RAM map:\n");
-		print_memory_map("user");
-	}
+	finish_e820_parsing();
+
+	probe_roms();
+
+	/* after parse_early_param, so that it can be debugged */
+	insert_resource(&iomem_resource, &code_resource);
+	insert_resource(&iomem_resource, &data_resource);
+	insert_resource(&iomem_resource, &bss_resource);
 
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
@@ -812,14 +745,67 @@ void __init setup_arch(char **cmdline_p)
 	if (efi_enabled)
 		efi_init();
 
+	if (ppro_with_ram_bug()) {
+		e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
+				  E820_RESERVED);
+		sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+		printk(KERN_INFO "fixed physical RAM map:\n");
+		e820_print_map("bad_ppro");
+	}
+
+	e820_register_active_regions(0, 0, -1UL);
+	/*
+	 * partially used pages are not usable - thus
+	 * we are rounding upwards:
+	 */
+	max_pfn = e820_end_of_ram();
+
+	/* preallocate 4k for mptable mpc */
+	early_reserve_e820_mpc_new();
 	/* update e820 for memory not covered by WB MTRRs */
-	propagate_e820_map();
 	mtrr_bp_init();
-	if (mtrr_trim_uncached_memory(max_pfn))
-		propagate_e820_map();
+	if (mtrr_trim_uncached_memory(max_pfn)) {
+		remove_all_active_ranges();
+		e820_register_active_regions(0, 0, -1UL);
+		max_pfn = e820_end_of_ram();
+	}
+
+	dmi_scan_machine();
+
+	io_delay_init();
+
+#ifdef CONFIG_ACPI
+	/*
+	 * Parse the ACPI tables for possible boot-time SMP configuration.
+	 */
+	acpi_boot_table_init();
+#endif
+
+#ifdef CONFIG_ACPI_NUMA
+        /*
+         * Parse SRAT to discover nodes.
+         */
+        acpi_numa_init();
+#endif
 
 	max_low_pfn = setup_memory();
 
+#ifdef CONFIG_ACPI_SLEEP
+	/*
+	 * Reserve low memory region for sleep support.
+	 */
+	acpi_reserve_bootmem();
+#endif
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+	/*
+	 * Find and reserve possible boot-time SMP configuration:
+	 */
+	find_smp_config();
+#endif
+	reserve_crashkernel();
+
+	reserve_ibft_region();
+
 #ifdef CONFIG_KVM_CLOCK
 	kvmclock_init();
 #endif
@@ -843,9 +829,6 @@ void __init setup_arch(char **cmdline_p)
 	 * not to exceed the 8Mb limit.
 	 */
 
-#ifdef CONFIG_SMP
-	smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
-#endif
 	paging_init();
 
 	/*
@@ -857,10 +840,6 @@ void __init setup_arch(char **cmdline_p)
 		init_ohci1394_dma_on_all_controllers();
 #endif
 
-	remapped_pgdat_init();
-	sparse_init();
-	zone_sizes_init();
-
 	/*
 	 * NOTE: at this point the bootmem allocator is fully available.
 	 */
@@ -869,42 +848,41 @@ void __init setup_arch(char **cmdline_p)
 	relocate_initrd();
 #endif
 
-	paravirt_post_allocator_init();
-
-	dmi_scan_machine();
+	remapped_pgdat_init();
+	sparse_init();
+	zone_sizes_init();
 
-	io_delay_init();
+	paravirt_post_allocator_init();
 
 #ifdef CONFIG_X86_GENERICARCH
 	generic_apic_probe();
 #endif
 
-#ifdef CONFIG_ACPI
-	/*
-	 * Parse the ACPI tables for possible boot-time SMP configuration.
-	 */
-	acpi_boot_table_init();
-#endif
-
 	early_quirks();
 
 #ifdef CONFIG_ACPI
 	acpi_boot_init();
-
+#endif
+#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS)
+	if (smp_found_config)
+		get_smp_config();
+#endif
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
 	if (def_to_bigsmp)
 		printk(KERN_WARNING "More than 8 CPUs detected and "
 			"CONFIG_X86_PC cannot handle it.\nUse "
 			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
 #endif
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	if (smp_found_config)
-		get_smp_config();
-#endif
 
-	e820_register_memory();
-	e820_mark_nosave_regions();
+	e820_reserve_resources();
+	e820_mark_nosave_regions(max_low_pfn);
+
+	request_resource(&iomem_resource, &video_ram_resource);
+	/* request I/O space for devices used on all i[345]86 PCs */
+	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
+		request_resource(&ioport_resource, &standard_io_resources[i]);
+
+	e820_setup_gap();
 
 #ifdef CONFIG_VT
 #if defined(CONFIG_VGA_CONSOLE)
@@ -916,25 +894,147 @@ void __init setup_arch(char **cmdline_p)
 #endif
 }
 
-/*
- * Request address space for all standard resources
- *
- * This is called just before pcibios_init(), which is also a
- * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
- */
-static int __init request_standard_resources(void)
+static struct resource system_rom_resource = {
+	.name	= "System ROM",
+	.start	= 0xf0000,
+	.end	= 0xfffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+	.name	= "Extension ROM",
+	.start	= 0xe0000,
+	.end	= 0xeffff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource adapter_rom_resources[] = { {
+	.name 	= "Adapter ROM",
+	.start	= 0xc8000,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+	.name 	= "Adapter ROM",
+	.start	= 0,
+	.end	= 0,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+static struct resource video_rom_resource = {
+	.name 	= "Video ROM",
+	.start	= 0xc0000,
+	.end	= 0xc7fff,
+	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+#define ROMSIGNATURE 0xaa55
+
+static int __init romsignature(const unsigned char *rom)
 {
+	const unsigned short * const ptr = (const unsigned short *)rom;
+	unsigned short sig;
+
+	return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE;
+}
+
+static int __init romchecksum(const unsigned char *rom, unsigned long length)
+{
+	unsigned char sum, c;
+
+	for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--)
+		sum += c;
+	return !length && !sum;
+}
+
+static void __init probe_roms(void)
+{
+	const unsigned char *rom;
+	unsigned long start, length, upper;
+	unsigned char c;
 	int i;
 
-	printk(KERN_INFO "Setting up standard PCI resources\n");
-	init_iomem_resources(&code_resource, &data_resource, &bss_resource);
+	/* video rom */
+	upper = adapter_rom_resources[0].start;
+	for (start = video_rom_resource.start; start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
 
-	request_resource(&iomem_resource, &video_ram_resource);
+		video_rom_resource.start = start;
 
-	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
-		request_resource(&ioport_resource, &standard_io_resources[i]);
-	return 0;
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = c * 512;
+
+		/* if checksum okay, trust length byte */
+		if (length && romchecksum(rom, length))
+			video_rom_resource.end = start + length - 1;
+
+		request_resource(&iomem_resource, &video_rom_resource);
+		break;
+	}
+
+	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+	if (start < upper)
+		start = upper;
+
+	/* system rom */
+	request_resource(&iomem_resource, &system_rom_resource);
+	upper = system_rom_resource.start;
+
+	/* check for extension rom (ignore length byte!) */
+	rom = isa_bus_to_virt(extension_rom_resource.start);
+	if (romsignature(rom)) {
+		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		if (romchecksum(rom, length)) {
+			request_resource(&iomem_resource, &extension_rom_resource);
+			upper = extension_rom_resource.start;
+		}
+	}
+
+	/* check for adapter roms on 2k boundaries */
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
+		rom = isa_bus_to_virt(start);
+		if (!romsignature(rom))
+			continue;
+
+		if (probe_kernel_address(rom + 2, c) != 0)
+			continue;
+
+		/* 0 < length <= 0x7f * 512, historically */
+		length = c * 512;
+
+		/* but accept any length that fits if checksum okay */
+		if (!length || start + length > upper || !romchecksum(rom, length))
+			continue;
+
+		adapter_rom_resources[i].start = start;
+		adapter_rom_resources[i].end = start + length - 1;
+		request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+		start = adapter_rom_resources[i++].end & ~2047UL;
+	}
 }
 
-subsys_initcall(request_standard_resources);
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index 4a666cdccb68..16ef53ab538a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -56,6 +56,7 @@
 #include <asm/desc.h>
 #include <video/edid.h>
 #include <asm/e820.h>
+#include <asm/mpspec.h>
 #include <asm/dma.h>
 #include <asm/gart.h>
 #include <asm/mpspec.h>
@@ -71,6 +72,7 @@
 #include <asm/topology.h>
 #include <asm/trampoline.h>
 #include <asm/pat.h>
+#include <asm/mmconfig.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_PARAVIRT
@@ -79,6 +81,8 @@
 #define ARCH_SETUP
 #endif
 
+#include "cpu/cpu.h"
+
 /*
  * Machine setup..
  */
@@ -95,8 +99,6 @@ int bootloader_type;
 
 unsigned long saved_video_mode;
 
-int force_mwait __cpuinitdata;
-
 /*
  * Early DMI memory
  */
@@ -118,7 +120,7 @@ EXPORT_SYMBOL_GPL(edid_info);
 
 extern int root_mountflags;
 
-char __initdata command_line[COMMAND_LINE_SIZE];
+static char __initdata command_line[COMMAND_LINE_SIZE];
 
 static struct resource standard_io_resources[] = {
 	{ .name = "dma1", .start = 0x00, .end = 0x1f,
@@ -164,6 +166,7 @@ static struct resource bss_resource = {
 	.flags = IORESOURCE_RAM,
 };
 
+static void __init early_cpu_init(void);
 static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
 
 #ifdef CONFIG_PROC_VMCORE
@@ -265,46 +268,6 @@ static inline void __init reserve_crashkernel(void)
 {}
 #endif
 
-/* Overridden in paravirt.c if CONFIG_PARAVIRT */
-void __attribute__((weak)) __init memory_setup(void)
-{
-       machine_specific_memory_setup();
-}
-
-static void __init parse_setup_data(void)
-{
-	struct setup_data *data;
-	unsigned long pa_data;
-
-	if (boot_params.hdr.version < 0x0209)
-		return;
-	pa_data = boot_params.hdr.setup_data;
-	while (pa_data) {
-		data = early_ioremap(pa_data, PAGE_SIZE);
-		switch (data->type) {
-		default:
-			break;
-		}
-#ifndef CONFIG_DEBUG_BOOT_PARAMS
-		free_early(pa_data, pa_data+sizeof(*data)+data->len);
-#endif
-		pa_data = data->next;
-		early_iounmap(data, PAGE_SIZE);
-	}
-}
-
-#ifdef CONFIG_PCI_MMCONFIG
-extern void __cpuinit fam10h_check_enable_mmcfg(void);
-extern void __init check_enable_amd_mmconf_dmi(void);
-#else
-void __cpuinit fam10h_check_enable_mmcfg(void)
-{
-}
-void __init check_enable_amd_mmconf_dmi(void)
-{
-}
-#endif
-
 /*
  * setup_arch - architecture-specific boot-time initializations
  *
@@ -329,13 +292,15 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #ifdef CONFIG_EFI
 	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-		     "EL64", 4))
+		     "EL64", 4)) {
 		efi_enabled = 1;
+		efi_reserve_early();
+	}
 #endif
 
 	ARCH_SETUP
 
-	memory_setup();
+	setup_memory_map();
 	copy_edd();
 
 	if (!boot_params.hdr.root_flags)
@@ -352,6 +317,7 @@ void __init setup_arch(char **cmdline_p)
 	bss_resource.start = virt_to_phys(&__bss_start);
 	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
+	early_cpu_init();
 	early_identify_cpu(&boot_cpu_data);
 
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
@@ -381,9 +347,13 @@ void __init setup_arch(char **cmdline_p)
 	 * we are rounding upwards:
 	 */
 	end_pfn = e820_end_of_ram();
+
+	/* preallocate 4k for mptable mpc */
+	early_reserve_e820_mpc_new();
 	/* update e820 for memory not covered by WB MTRRs */
 	mtrr_bp_init();
 	if (mtrr_trim_uncached_memory(end_pfn)) {
+		remove_all_active_ranges();
 		e820_register_active_regions(0, 0, -1UL);
 		end_pfn = e820_end_of_ram();
 	}
@@ -392,7 +362,7 @@ void __init setup_arch(char **cmdline_p)
 
 	check_efer();
 
-	max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
+	max_pfn_mapped = init_memory_mapping(0, (end_pfn << PAGE_SHIFT));
 	if (efi_enabled)
 		efi_init();
 
@@ -444,13 +414,12 @@ void __init setup_arch(char **cmdline_p)
        acpi_reserve_bootmem();
 #endif
 
-	if (efi_enabled)
-		efi_reserve_bootmem();
-
+#ifdef CONFIG_X86_MPPARSE
        /*
 	* Find and reserve possible boot-time SMP configuration:
 	*/
 	find_smp_config();
+#endif
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
 		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -493,11 +462,13 @@ void __init setup_arch(char **cmdline_p)
 
 	init_cpu_to_node();
 
+#ifdef CONFIG_X86_MPPARSE
 	/*
 	 * get boot-time SMP configuration:
 	 */
 	if (smp_found_config)
 		get_smp_config();
+#endif
 	init_apic_mappings();
 	ioapic_init_mappings();
 
@@ -507,7 +478,7 @@ void __init setup_arch(char **cmdline_p)
 	 * We trust e820 completely. No explicit ROM probing in memory.
 	 */
 	e820_reserve_resources();
-	e820_mark_nosave_regions();
+	e820_mark_nosave_regions(end_pfn);
 
 	/* request I/O space for devices used on all i[345]86 PCs */
 	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
@@ -528,7 +499,20 @@ void __init setup_arch(char **cmdline_p)
 	check_enable_amd_mmconf_dmi();
 }
 
-static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
+{
+	display_cacheinfo(c);
+}
+
+static struct cpu_dev __cpuinitdata default_cpu = {
+	.c_init	= default_init,
+	.c_vendor = "Unknown",
+};
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
+
+int __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
 
@@ -544,7 +528,7 @@ static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
 }
 
 
-static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
+void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 {
 	unsigned int n, dummy, eax, ebx, ecx, edx;
 
@@ -576,228 +560,6 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 	}
 }
 
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
-	int i, node;
-
-	for (i = apicid - 1; i >= 0; i--) {
-		node = apicid_to_node[i];
-		if (node != NUMA_NO_NODE && node_online(node))
-			return node;
-	}
-	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
-		node = apicid_to_node[i];
-		if (node != NUMA_NO_NODE && node_online(node))
-			return node;
-	}
-	return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits;
-#ifdef CONFIG_NUMA
-	int cpu = smp_processor_id();
-	int node = 0;
-	unsigned apicid = hard_smp_processor_id();
-#endif
-	bits = c->x86_coreid_bits;
-
-	/* Low order bits define the core id (index of core in socket) */
-	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
-	/* Convert the initial APIC ID into the socket ID */
-	c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
-	node = c->phys_proc_id;
-	if (apicid_to_node[apicid] != NUMA_NO_NODE)
-		node = apicid_to_node[apicid];
-	if (!node_online(node)) {
-		/* Two possibilities here:
-		   - The CPU is missing memory and no node was created.
-		   In that case try picking one from a nearby CPU
-		   - The APIC IDs differ from the HyperTransport node IDs
-		   which the K8 northbridge parsing fills in.
-		   Assume they are all increased by a constant offset,
-		   but in the same order as the HT nodeids.
-		   If that doesn't result in a usable node fall back to the
-		   path for the previous case.  */
-
-		int ht_nodeid = c->initial_apicid;
-
-		if (ht_nodeid >= 0 &&
-		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-			node = apicid_to_node[ht_nodeid];
-		/* Pick a nearby node */
-		if (!node_online(node))
-			node = nearby_node(apicid);
-	}
-	numa_set_node(cpu, node);
-
-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits, ecx;
-
-	/* Multi core CPU? */
-	if (c->extended_cpuid_level < 0x80000008)
-		return;
-
-	ecx = cpuid_ecx(0x80000008);
-
-	c->x86_max_cores = (ecx & 0xff) + 1;
-
-	/* CPU telling us the core id bits shift? */
-	bits = (ecx >> 12) & 0xF;
-
-	/* Otherwise recompute */
-	if (bits == 0) {
-		while ((1 << bits) < c->x86_max_cores)
-			bits++;
-	}
-
-	c->x86_coreid_bits = bits;
-
-#endif
-}
-
-#define ENABLE_C1E_MASK		0x18000000
-#define CPUID_PROCESSOR_SIGNATURE	1
-#define CPUID_XFAM		0x0ff00000
-#define CPUID_XFAM_K8		0x00000000
-#define CPUID_XFAM_10H		0x00100000
-#define CPUID_XFAM_11H		0x00200000
-#define CPUID_XMOD		0x000f0000
-#define CPUID_XMOD_REV_F	0x00040000
-
-/* AMD systems with C1E don't have a working lAPIC timer. Check for that. */
-static __cpuinit int amd_apic_timer_broken(void)
-{
-	u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-
-	switch (eax & CPUID_XFAM) {
-	case CPUID_XFAM_K8:
-		if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F)
-			break;
-	case CPUID_XFAM_10H:
-	case CPUID_XFAM_11H:
-		rdmsr(MSR_K8_ENABLE_C1E, lo, hi);
-		if (lo & ENABLE_C1E_MASK)
-			return 1;
-		break;
-	default:
-		/* err on the side of caution */
-		return 1;
-	}
-	return 0;
-}
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
-	early_init_amd_mc(c);
-
- 	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
-	if (c->x86_power & (1<<8))
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
-	unsigned level;
-
-#ifdef CONFIG_SMP
-	unsigned long value;
-
-	/*
-	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
-	 * bit 6 of msr C001_0015
-	 *
-	 * Errata 63 for SH-B3 steppings
-	 * Errata 122 for all steppings (F+ have it disabled by default)
-	 */
-	if (c->x86 == 15) {
-		rdmsrl(MSR_K8_HWCR, value);
-		value |= 1 << 6;
-		wrmsrl(MSR_K8_HWCR, value);
-	}
-#endif
-
-	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
-	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-	clear_cpu_cap(c, 0*32+31);
-
-	/* On C+ stepping K8 rep microcode works well for copy/memset */
-	level = cpuid_eax(1);
-	if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) ||
-			     level >= 0x0f58))
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	if (c->x86 == 0x10 || c->x86 == 0x11)
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
-	/* Enable workaround for FXSAVE leak */
-	if (c->x86 >= 6)
-		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
-	level = get_model_name(c);
-	if (!level) {
-		switch (c->x86) {
-		case 15:
-			/* Should distinguish Models here, but this is only
-			   a fallback anyways. */
-			strcpy(c->x86_model_id, "Hammer");
-			break;
-		}
-	}
-	display_cacheinfo(c);
-
-	/* Multi core CPU? */
-	if (c->extended_cpuid_level >= 0x80000008)
-		amd_detect_cmp(c);
-
-	if (c->extended_cpuid_level >= 0x80000006 &&
-		(cpuid_edx(0x80000006) & 0xf000))
-		num_cache_leaves = 4;
-	else
-		num_cache_leaves = 3;
-
-	if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11)
-		set_cpu_cap(c, X86_FEATURE_K8);
-
-	/* MFENCE stops RDTSC speculation */
-	set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
-	if (c->x86 == 0x10)
-		fam10h_check_enable_mmcfg();
-
-	if (amd_apic_timer_broken())
-		disable_apic_timer = 1;
-
-	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
-		unsigned long long tseg;
-
-		/*
-		 * Split up direct mapping around the TSEG SMM area.
-		 * Don't do it for gbpages because there seems very little
-		 * benefit in doing so.
-		 */
-		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
-		(tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
-			set_memory_4k((unsigned long)__va(tseg), 1);
-	}
-}
-
 void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
@@ -848,135 +610,59 @@ out:
 #endif
 }
 
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
-	unsigned int eax, t;
-
-	if (c->cpuid_level < 4)
-		return 1;
-
-	cpuid_count(4, 0, &eax, &t, &t, &t);
-
-	if (eax & 0x1f)
-		return ((eax >> 26) + 1);
-	else
-		return 1;
-}
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
-	unsigned node;
-	int cpu = smp_processor_id();
-	int apicid = hard_smp_processor_id();
-
-	/* Don't do the funky fallback heuristics the AMD version employs
-	   for now. */
-	node = apicid_to_node[apicid];
-	if (node == NUMA_NO_NODE || !node_online(node))
-		node = first_node(node_online_map);
-	numa_set_node(cpu, node);
-
-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
-	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
-	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
-	/* Cache sizes */
-	unsigned n;
-
-	init_intel_cacheinfo(c);
-	if (c->cpuid_level > 9) {
-		unsigned eax = cpuid_eax(10);
-		/* Check for version and the number of counters */
-		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
-			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
-	}
-
-	if (cpu_has_ds) {
-		unsigned int l1, l2;
-		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
-		if (!(l1 & (1<<11)))
-			set_cpu_cap(c, X86_FEATURE_BTS);
-		if (!(l1 & (1<<12)))
-			set_cpu_cap(c, X86_FEATURE_PEBS);
+	char *v = c->x86_vendor_id;
+	int i;
+	static int printed;
+
+	for (i = 0; i < X86_VENDOR_NUM; i++) {
+		if (cpu_devs[i]) {
+			if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+			    (cpu_devs[i]->c_ident[1] &&
+			    !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+				c->x86_vendor = i;
+				this_cpu = cpu_devs[i];
+				return;
+			}
+		}
 	}
-
-
-	if (cpu_has_bts)
-		ds_init_intel(c);
-
-	n = c->extended_cpuid_level;
-	if (n >= 0x80000008) {
-		unsigned eax = cpuid_eax(0x80000008);
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-		/* CPUID workaround for Intel 0F34 CPU */
-		if (c->x86_vendor == X86_VENDOR_INTEL &&
-		    c->x86 == 0xF && c->x86_model == 0x3 &&
-		    c->x86_mask == 0x4)
-			c->x86_phys_bits = 36;
+	if (!printed) {
+		printed++;
+		printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+		printk(KERN_ERR "CPU: Your system may be unstable.\n");
 	}
-
-	if (c->x86 == 15)
-		c->x86_cache_alignment = c->x86_clflush_size * 2;
-	if (c->x86 == 6)
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-	c->x86_max_cores = intel_num_cpu_cores(c);
-
-	srat_detect_node();
-}
-
-static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
-{
-	if (c->x86 == 0x6 && c->x86_model >= 0xf)
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+	c->x86_vendor = X86_VENDOR_UNKNOWN;
 }
 
-static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+static void __init early_cpu_support_print(void)
 {
-	/* Cache sizes */
-	unsigned n;
+	int i,j;
+	struct cpu_dev *cpu_devx;
 
-	n = c->extended_cpuid_level;
-	if (n >= 0x80000008) {
-		unsigned eax = cpuid_eax(0x80000008);
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-	}
-
-	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
-		c->x86_cache_alignment = c->x86_clflush_size * 2;
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+	printk("KERNEL supported cpus:\n");
+	for (i = 0; i < X86_VENDOR_NUM; i++) {
+		cpu_devx = cpu_devs[i];
+		if (!cpu_devx)
+			continue;
+		for (j = 0; j < 2; j++) {
+			if (!cpu_devx->c_ident[j])
+				continue;
+			printk("  %s %s\n", cpu_devx->c_vendor,
+				cpu_devx->c_ident[j]);
+		}
 	}
-	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
 }
 
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+static void __init early_cpu_init(void)
 {
-	char *v = c->x86_vendor_id;
+        struct cpu_vendor_dev *cvdev;
 
-	if (!strcmp(v, "AuthenticAMD"))
-		c->x86_vendor = X86_VENDOR_AMD;
-	else if (!strcmp(v, "GenuineIntel"))
-		c->x86_vendor = X86_VENDOR_INTEL;
-	else if (!strcmp(v, "CentaurHauls"))
-		c->x86_vendor = X86_VENDOR_CENTAUR;
-	else
-		c->x86_vendor = X86_VENDOR_UNKNOWN;
+        for (cvdev = __x86cpuvendor_start ;
+             cvdev < __x86cpuvendor_end   ;
+             cvdev++)
+                cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+	early_cpu_support_print();
 }
 
 /* Do some early cpuid on the boot CPU to get some parameter that are
@@ -1057,17 +743,9 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 	if (c->extended_cpuid_level >= 0x80000007)
 		c->x86_power = cpuid_edx(0x80000007);
 
-	switch (c->x86_vendor) {
-	case X86_VENDOR_AMD:
-		early_init_amd(c);
-		break;
-	case X86_VENDOR_INTEL:
-		early_init_intel(c);
-		break;
-	case X86_VENDOR_CENTAUR:
-		early_init_centaur(c);
-		break;
-	}
+	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+	    cpu_devs[c->x86_vendor]->c_early_init)
+		cpu_devs[c->x86_vendor]->c_early_init(c);
 
 	validate_pat_support(c);
 }
@@ -1095,24 +773,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	 * At the end of this section, c->x86_capability better
 	 * indicate the features this CPU genuinely supports!
 	 */
-	switch (c->x86_vendor) {
-	case X86_VENDOR_AMD:
-		init_amd(c);
-		break;
-
-	case X86_VENDOR_INTEL:
-		init_intel(c);
-		break;
-
-	case X86_VENDOR_CENTAUR:
-		init_centaur(c);
-		break;
-
-	case X86_VENDOR_UNKNOWN:
-	default:
-		display_cacheinfo(c);
-		break;
-	}
+	if (this_cpu->c_init)
+		this_cpu->c_init(c);
 
 	detect_ht(c);
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bc1e1257e515..ae0a7a200421 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -59,7 +59,6 @@
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
-#include <asm/nmi.h>
 #include <asm/vmi.h>
 #include <asm/genapic.h>
 #include <linux/mc146818rtc.h>
@@ -539,23 +538,6 @@ cpumask_t cpu_coregroup_map(int cpu)
 		return c->llc_shared_map;
 }
 
-#ifdef CONFIG_X86_32
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
-{
-	trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
-	/*
-	 * Has to be in very low memory so we can execute
-	 * real-mode AP code.
-	 */
-	if (__pa(trampoline_base) >= 0x9F000)
-		BUG();
-}
-#endif
-
 static void impress_friends(void)
 {
 	int cpu;
@@ -1174,9 +1156,11 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	 * If SMP should be disabled, then really disable it!
 	 */
 	if (!max_cpus) {
-		printk(KERN_INFO "SMP mode deactivated,"
-				 "forcing use of dummy APIC emulation.\n");
+		printk(KERN_INFO "SMP mode deactivated.\n");
 		smpboot_clear_io_apic();
+
+		localise_nmi_watchdog();
+
 #ifdef CONFIG_X86_32
 		connect_bsp_APIC();
 #endif
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index 70e4a374b4e8..5978023b799b 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -31,6 +31,7 @@
 #include <asm/srat.h>
 #include <asm/topology.h>
 #include <asm/smp.h>
+#include <asm/e820.h>
 
 /*
  * proximity macros and definitions
@@ -41,7 +42,7 @@
 #define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
 /* bitmap length; _PXM is at most 255 */
 #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) 
-static u8 pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
+static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN];	/* bitmap of proximity domains */
 
 #define MAX_CHUNKS_PER_NODE	3
 #define MAXCHUNKS		(MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
@@ -52,16 +53,37 @@ struct node_memory_chunk_s {
 	u8	nid;		// which cnode contains this chunk?
 	u8	bank;		// which mem bank on this node
 };
-static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
+static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
 
-static int num_memory_chunks;		/* total number of memory chunks */
+static int __initdata num_memory_chunks; /* total number of memory chunks */
 static u8 __initdata apicid_to_pxm[MAX_APICID];
 
+int numa_off __initdata;
+int acpi_numa __initdata;
+
+static __init void bad_srat(void)
+{
+	printk(KERN_ERR "SRAT: SRAT not used.\n");
+	acpi_numa = -1;		/* srat_disabled() tests acpi_numa < 0 */
+	num_memory_chunks = 0;	/* reset the recorded chunk count */
+}
+
+static __init inline int srat_disabled(void)
+{
+	return acpi_numa < 0 || numa_off;	/* bad_srat() sets acpi_numa = -1 */
+}
+
 /* Identify CPU proximity domains */
-static void __init parse_cpu_affinity_structure(char *p)
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
 {
-	struct acpi_srat_cpu_affinity *cpu_affinity =
-				(struct acpi_srat_cpu_affinity *) p;
+	if (srat_disabled())
+		return;
+	if (cpu_affinity->header.length !=
+	     sizeof(struct acpi_srat_cpu_affinity)) {
+		bad_srat();
+		return;
+	}
 
 	if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 		return;		/* empty entry */
@@ -79,14 +101,21 @@ static void __init parse_cpu_affinity_structure(char *p)
  * Identify memory proximity domains and hot-remove capabilities.
  * Fill node memory chunk list structure.
  */
-static void __init parse_memory_affinity_structure (char *sratp)
+void __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
 {
 	unsigned long long paddr, size;
 	unsigned long start_pfn, end_pfn;
 	u8 pxm;
 	struct node_memory_chunk_s *p, *q, *pend;
-	struct acpi_srat_mem_affinity *memory_affinity =
-			(struct acpi_srat_mem_affinity *) sratp;
+
+	if (srat_disabled())
+		return;
+	if (memory_affinity->header.length !=
+	     sizeof(struct acpi_srat_mem_affinity)) {
+		bad_srat();
+		return;
+	}
 
 	if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
 		return;		/* empty entry */
@@ -134,6 +163,14 @@ static void __init parse_memory_affinity_structure (char *sratp)
 		 "enabled and removable" : "enabled" ) );
 }
 
+/* Callback for SLIT parsing; the body is empty, so @slit is ignored here. */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+}
+
+void acpi_numa_arch_fixup(void)	/* empty stub: does nothing in this file */
+{
+}
 /*
  * The SRAT table always lists ascending addresses, so can always
  * assume that the first "start" address that you see is the real
@@ -166,39 +203,13 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
 		node_end_pfn[nid] = memory_chunk->end_pfn;
 }
 
-/* Parse the ACPI Static Resource Affinity Table */
-static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
+int __init get_memcfg_from_srat(void)
 {
-	u8 *start, *end, *p;
 	int i, j, nid;
 
-	start = (u8 *)(&(sratp->reserved) + 1);	/* skip header */
-	p = start;
-	end = (u8 *)sratp + sratp->header.length;
-
-	memset(pxm_bitmap, 0, sizeof(pxm_bitmap));	/* init proximity domain bitmap */
-	memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
 
-	num_memory_chunks = 0;
-	while (p < end) {
-		switch (*p) {
-		case ACPI_SRAT_TYPE_CPU_AFFINITY:
-			parse_cpu_affinity_structure(p);
-			break;
-		case ACPI_SRAT_TYPE_MEMORY_AFFINITY:
-			parse_memory_affinity_structure(p);
-			break;
-		default:
-			printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]);
-			break;
-		}
-		p += p[1];
-		if (p[1] == 0) {
-			printk("acpi20_parse_srat: Entry length value is zero;"
-				" can't parse any further!\n");
-			break;
-		}
-	}
+	if (srat_disabled())
+		goto out_fail;
 
 	if (num_memory_chunks == 0) {
 		printk("could not finy any ACPI SRAT memory areas.\n");
@@ -244,115 +255,19 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
 		printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
 		       j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
 		node_read_chunk(chunk->nid, chunk);
-		add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
+		e820_register_active_regions(chunk->nid, chunk->start_pfn,
+					     min(chunk->end_pfn, max_pfn));
 	}
- 
+
 	for_each_online_node(nid) {
 		unsigned long start = node_start_pfn[nid];
-		unsigned long end = node_end_pfn[nid];
+		unsigned long end = min(node_end_pfn[nid], max_pfn);
 
 		memory_present(nid, start, end);
 		node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
 	}
 	return 1;
 out_fail:
-	return 0;
-}
-
-struct acpi_static_rsdt {
-	struct acpi_table_rsdt table;
-	u32 padding[7]; /* Allow for 7 more table entries */
-};
-
-int __init get_memcfg_from_srat(void)
-{
-	struct acpi_table_header *header = NULL;
-	struct acpi_table_rsdp *rsdp = NULL;
-	struct acpi_table_rsdt *rsdt = NULL;
-	acpi_native_uint rsdp_address = 0;
-	struct acpi_static_rsdt saved_rsdt;
-	int tables = 0;
-	int i = 0;
-
-	rsdp_address = acpi_os_get_root_pointer();
-	if (!rsdp_address) {
-		printk("%s: System description tables not found\n",
-		       __func__);
-		goto out_err;
-	}
-
-	printk("%s: assigning address to rsdp\n", __func__);
-	rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
-	if (!rsdp) {
-		printk("%s: Didn't find ACPI root!\n", __func__);
-		goto out_err;
-	}
-
-	printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision,
-		rsdp->oem_id);
-
-	if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
-		printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__);
-		goto out_err;
-	}
-
-	rsdt = (struct acpi_table_rsdt *)
-	    early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt));
-
-	if (!rsdt) {
-		printk(KERN_WARNING
-		       "%s: ACPI: Invalid root system description tables (RSDT)\n",
-		       __func__);
-		goto out_err;
-	}
-
-	header = &rsdt->header;
-
-	if (strncmp(header->signature, ACPI_SIG_RSDT, strlen(ACPI_SIG_RSDT))) {
-		printk(KERN_WARNING "ACPI: RSDT signature incorrect\n");
-		goto out_err;
-	}
-
-	/* 
-	 * The number of tables is computed by taking the 
-	 * size of all entries (header size minus total 
-	 * size of RSDT) divided by the size of each entry
-	 * (4-byte table pointers).
-	 */
-	tables = (header->length - sizeof(struct acpi_table_header)) / 4;
-
-	if (!tables)
-		goto out_err;
-
-	memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt));
-
-	if (saved_rsdt.table.header.length > sizeof(saved_rsdt)) {
-		printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n",
-		       saved_rsdt.table.header.length);
-		goto out_err;
-	}
-
-	printk("Begin SRAT table scan....\n");
-
-	for (i = 0; i < tables; i++) {
-		/* Map in header, then map in full table length. */
-		header = (struct acpi_table_header *)
-			early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header));
-		if (!header)
-			break;
-		header = (struct acpi_table_header *)
-			early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length);
-		if (!header)
-			break;
-
-		if (strncmp((char *) &header->signature, ACPI_SIG_SRAT, 4))
-			continue;
-
-		/* we've found the srat table. don't need to look at any more tables */
-		return acpi20_parse_srat((struct acpi_table_srat *)header);
-	}
-out_err:
-	remove_all_active_ranges();
 	printk("failed to get NUMA memory information from SRAT table\n");
 	return 0;
 }
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index ae751094eba9..d67ce5f044ba 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -36,7 +36,9 @@ static struct rio_table_hdr *rio_table_hdr __initdata;
 static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
 static struct rio_detail    *rio_devs[MAX_NUMNODES*4] __initdata;
 
+#ifndef CONFIG_X86_NUMAQ
 static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
+#endif
 
 static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
 {
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d2ab52cc1d6b..7066cb855a60 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -19,8 +19,8 @@
 #include <linux/utsname.h>
 #include <linux/ipc.h>
 
-#include <asm/uaccess.h>
-#include <asm/unistd.h>
+#include <linux/uaccess.h>
+#include <linux/unistd.h>
 
 asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
 			  unsigned long prot, unsigned long flags,
@@ -103,7 +103,7 @@ asmlinkage int old_select(struct sel_arg_struct __user *arg)
  *
  * This is really horribly ugly.
  */
-asmlinkage int sys_ipc (uint call, int first, int second,
+asmlinkage int sys_ipc(uint call, int first, int second,
 			int third, void __user *ptr, long fifth)
 {
 	int version, ret;
@@ -113,24 +113,24 @@ asmlinkage int sys_ipc (uint call, int first, int second,
 
 	switch (call) {
 	case SEMOP:
-		return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL);
+		return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
 	case SEMTIMEDOP:
 		return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
 					(const struct timespec __user *)fifth);
 
 	case SEMGET:
-		return sys_semget (first, second, third);
+		return sys_semget(first, second, third);
 	case SEMCTL: {
 		union semun fourth;
 		if (!ptr)
 			return -EINVAL;
 		if (get_user(fourth.__pad, (void __user * __user *) ptr))
 			return -EFAULT;
-		return sys_semctl (first, second, third, fourth);
+		return sys_semctl(first, second, third, fourth);
 	}
 
 	case MSGSND:
-		return sys_msgsnd (first, (struct msgbuf __user *) ptr, 
+		return sys_msgsnd(first, (struct msgbuf __user *) ptr,
 				   second, third);
 	case MSGRCV:
 		switch (version) {
@@ -138,45 +138,45 @@ asmlinkage int sys_ipc (uint call, int first, int second,
 			struct ipc_kludge tmp;
 			if (!ptr)
 				return -EINVAL;
-			
+
 			if (copy_from_user(&tmp,
-					   (struct ipc_kludge __user *) ptr, 
-					   sizeof (tmp)))
+					   (struct ipc_kludge __user *) ptr,
+					   sizeof(tmp)))
 				return -EFAULT;
-			return sys_msgrcv (first, tmp.msgp, second,
+			return sys_msgrcv(first, tmp.msgp, second,
 					   tmp.msgtyp, third);
 		}
 		default:
-			return sys_msgrcv (first,
+			return sys_msgrcv(first,
 					   (struct msgbuf __user *) ptr,
 					   second, fifth, third);
 		}
 	case MSGGET:
-		return sys_msgget ((key_t) first, second);
+		return sys_msgget((key_t) first, second);
 	case MSGCTL:
-		return sys_msgctl (first, second, (struct msqid_ds __user *) ptr);
+		return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
 
 	case SHMAT:
 		switch (version) {
 		default: {
 			ulong raddr;
-			ret = do_shmat (first, (char __user *) ptr, second, &raddr);
+			ret = do_shmat(first, (char __user *) ptr, second, &raddr);
 			if (ret)
 				return ret;
-			return put_user (raddr, (ulong __user *) third);
+			return put_user(raddr, (ulong __user *) third);
 		}
 		case 1:	/* iBCS2 emulator entry point */
 			if (!segment_eq(get_fs(), get_ds()))
 				return -EINVAL;
 			/* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
-			return do_shmat (first, (char __user *) ptr, second, (ulong *) third);
+			return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
 		}
-	case SHMDT: 
-		return sys_shmdt ((char __user *)ptr);
+	case SHMDT:
+		return sys_shmdt((char __user *)ptr);
 	case SHMGET:
-		return sys_shmget (first, second, third);
+		return sys_shmget(first, second, third);
 	case SHMCTL:
-		return sys_shmctl (first, second,
+		return sys_shmctl(first, second,
 				   (struct shmid_ds __user *) ptr);
 	default:
 		return -ENOSYS;
@@ -186,28 +186,28 @@ asmlinkage int sys_ipc (uint call, int first, int second,
 /*
  * Old cruft
  */
-asmlinkage int sys_uname(struct old_utsname __user * name)
+asmlinkage int sys_uname(struct old_utsname __user *name)
 {
 	int err;
 	if (!name)
 		return -EFAULT;
 	down_read(&uts_sem);
-	err = copy_to_user(name, utsname(), sizeof (*name));
+	err = copy_to_user(name, utsname(), sizeof(*name));
 	up_read(&uts_sem);
-	return err?-EFAULT:0;
+	return err ? -EFAULT : 0;	/* nonzero copy_to_user() result => -EFAULT */
 }
 
-asmlinkage int sys_olduname(struct oldold_utsname __user * name)
+asmlinkage int sys_olduname(struct oldold_utsname __user *name)
 {
 	int error;
 
 	if (!name)
 		return -EFAULT;
-	if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+	if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
 		return -EFAULT;
-  
-  	down_read(&uts_sem);
-	
+
+	down_read(&uts_sem);
+
 	error = __copy_to_user(&name->sysname, &utsname()->sysname,
 			       __OLD_UTS_LEN);
 	error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
@@ -223,9 +223,9 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name)
 	error |= __copy_to_user(&name->machine, &utsname()->machine,
 				__OLD_UTS_LEN);
 	error |= __put_user(0, name->machine + __OLD_UTS_LEN);
-	
+
 	up_read(&uts_sem);
-	
+
 	error = error ? -EFAULT : 0;
 
 	return error;
@@ -241,6 +241,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[])
 	long __res;
 	asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
 	: "=a" (__res)
-	: "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory");
+	: "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory");
 	return __res;
 }
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 2ff21f398934..5f29f12da50c 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -84,8 +84,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
 	if (timer_ack) {
 		/*
 		 * Subtle, when I/O APICs are used we have to ack timer IRQ
-		 * manually to reset the IRR bit for do_slow_gettimeoffset().
-		 * This will also deassert NMI lines for the watchdog if run
+		 * manually to deassert NMI lines for the watchdog if run
 		 * on an 82489DX-based system.
 		 */
 		spin_lock(&i8259A_lock);
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index c737849e2ef7..39ae8511a137 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -123,6 +123,8 @@ void __init time_init(void)
 		(boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
 		cpu_khz = calculate_cpu_khz();
 
+	lpj_fine = ((unsigned long)tsc_khz * 1000)/HZ;
+
 	if (unsynchronized_tsc())
 		mark_tsc_unstable("TSCs unsynchronized");
 
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index abbf199adebb..1106fac6024d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -2,7 +2,7 @@
 
 #include <asm/trampoline.h>
 
-/* ready for x86_64, no harm for x86, since it will overwrite after alloc */
+/* ready for x86_64 and x86 */
 unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
 
 /*
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76ea97c4..ec6d3b2130c4 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -71,7 +71,6 @@ asmlinkage void general_protection(void);
 asmlinkage void page_fault(void);
 asmlinkage void coprocessor_error(void);
 asmlinkage void simd_coprocessor_error(void);
-asmlinkage void reserved(void);
 asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);
@@ -702,12 +701,10 @@ DO_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, regs->ip)
 DO_ERROR( 4, SIGSEGV, "overflow", overflow)
 DO_ERROR( 5, SIGSEGV, "bounds", bounds)
 DO_ERROR_INFO( 6, SIGILL,  "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR( 7, SIGSEGV, "device not available", device_not_available)
 DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
 DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-DO_ERROR(18, SIGSEGV, "reserved", reserved)
 
 /* Runs on IST stack */
 asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 65b70637ad97..6240922e497c 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -1,6 +1,7 @@
 #include <linux/sched.h>
 #include <linux/clocksource.h>
 #include <linux/workqueue.h>
+#include <linux/delay.h>
 #include <linux/cpufreq.h>
 #include <linux/jiffies.h>
 #include <linux/init.h>
@@ -286,7 +287,6 @@ core_initcall(cpufreq_tsc);
 
 /* clock source code */
 
-static unsigned long current_tsc_khz;
 static struct clocksource clocksource_tsc;
 
 /*
@@ -404,6 +404,7 @@ static inline void check_geode_tsc_reliable(void) { }
 void __init tsc_init(void)
 {
 	int cpu;
+	u64 lpj;
 
 	if (!cpu_has_tsc || tsc_disabled > 0)
 		return;
@@ -416,6 +417,10 @@ void __init tsc_init(void)
 		return;
 	}
 
+	lpj = ((u64)tsc_khz * 1000);
+	do_div(lpj, HZ);
+	lpj_fine = lpj;
+
 	/* now allow native_sched_clock() to use rdtsc */
 	tsc_disabled = 0;
 
@@ -439,9 +444,8 @@ void __init tsc_init(void)
 
 	unsynchronized_tsc();
 	check_geode_tsc_reliable();
-	current_tsc_khz = tsc_khz;
-	clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
-							clocksource_tsc.shift);
+	clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
+						    clocksource_tsc.shift);
 	/* lower the rating if we already know its unstable: */
 	if (check_tsc_unstable()) {
 		clocksource_tsc.rating = 0;
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index 1784b8077a12..9898fb01edfd 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -242,7 +242,7 @@ void __init tsc_calibrate(void)
 	if (hpet) {
 		printk(KERN_INFO "TSC calibrated against HPET\n");
 		if (hpet2 < hpet1)
-			hpet2 += 0x100000000;
+			hpet2 += 0x100000000UL;
 		hpet2 -= hpet1;
 		tsc1 = (hpet2 * hpet_readl(HPET_PERIOD)) / 1000000;
 	} else {
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index a2b030780aa9..ba7d19e102b1 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -33,8 +33,7 @@
 #include <asm/apic.h>
 #include <asm/timer.h>
 #include <asm/i8253.h>
-
-#include <irq_vectors.h>
+#include <asm/irq_vectors.h>
 
 #define VMI_ONESHOT  (VMI_ALARM_IS_ONESHOT  | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
 #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index ce5ed083a1e9..2674f5796275 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -60,13 +60,6 @@ SECTIONS
 
   BUG_TABLE :text
 
-  . = ALIGN(4);
-  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
-  	__tracedata_start = .;
-	*(.tracedata)
-  	__tracedata_end = .;
-  }
-
   RODATA
 
   /* writeable */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fad3674b06a5..fd246e22fe6b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -53,13 +53,6 @@ SECTIONS
 
   RODATA
 
-  . = ALIGN(4);
-  .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
-  	__tracedata_start = .;
-	*(.tracedata)
-  	__tracedata_end = .;
-  }
-
   . = ALIGN(PAGE_SIZE);		/* Align data segment to page size boundary */
 				/* Data */
   .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -177,6 +170,7 @@ SECTIONS
 	*(.con_initcall.init)
   }
   __con_initcall_end = .;
+  . = ALIGN(16);
   __x86cpuvendor_start = .;
   .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
 	*(.x86cpuvendor.init)
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index ba8c0b75ab0a..0c029e8959c7 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,9 +15,12 @@
 #include <linux/init.h>
 #include <linux/pci_ids.h>
 #include <linux/pci_regs.h>
+
+#include <asm/apic.h>
 #include <asm/pci-direct.h>
 #include <asm/io.h>
 #include <asm/paravirt.h>
+#include <asm/setup.h>
 
 #if defined CONFIG_PCI && defined CONFIG_PARAVIRT
 /*