From ff741906ad3cf4b8ca1a958acb013a97a6381ca2 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Fri, 11 Nov 2005 14:32:40 -0800
Subject: [IA64] support for cpu0 removal

Here is the BSP removal support for IA64. It's pretty much the same thing that
was released a while back, but has your feedback incorporated.

- Removed CONFIG_BSP_REMOVE_WORKAROUND and associated cmdline param
- Fixed compile issue with sn2/zx1 due to an undefined fix_b0_for_bsp
- some formatting nits (whitespace etc)

This has been tested on tiger and long back by alex on hp systems as well.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/Kconfig                 |  19 +++++++
 arch/ia64/configs/tiger_defconfig |   2 +
 arch/ia64/kernel/acpi.c           |  12 ++--
 arch/ia64/kernel/iosapic.c        |   6 ++
 arch/ia64/kernel/irq.c            |  13 ++++-
 arch/ia64/kernel/mca.c            |   6 +-
 arch/ia64/kernel/perfmon.c        |   5 +-
 arch/ia64/kernel/smpboot.c        | 114 ++++++++++++++++++++++++++++++++++++--
 arch/ia64/kernel/time.c           |   9 ++-
 arch/ia64/mm/contig.c             |   4 +-
 arch/ia64/mm/discontig.c          |   9 ++-
 include/asm-ia64/mca.h            |   2 +
 12 files changed, 183 insertions(+), 18 deletions(-)

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 199eeaf0f4e3..5e0f58e37c59 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -272,6 +272,25 @@ config SCHED_SMT
 	  Intel IA64 chips with MultiThreading at a cost of slightly increased
 	  overhead in some places. If unsure say N here.
 
+config PERMIT_BSP_REMOVE
+	bool "Support removal of Bootstrap Processor"
+	depends on HOTPLUG_CPU
+	default n
+	---help---
+	Say Y here if your platform SAL supports removal of the BSP with HOTPLUG_CPU
+	support.
+
+config FORCE_CPEI_RETARGET
+	bool "Force assumption that CPEI can be re-targetted"
+	depends on PERMIT_BSP_REMOVE
+	default n
+	---help---
+	Say Y if you need to force the assumption that CPEI can be re-targetted to
+	any cpu in the system. This hint is available via ACPI 3.0 specifications.
+	Tiger4 systems are capable of re-directing CPEI to any CPU other than BSP.
+	This option is useful to enable this feature on older BIOSes as well.
+	You can also enable this by using boot command line option force_cpei=1.
+
 config PREEMPT
 	bool "Preemptible Kernel"
         help
diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig
index b1e8f09e9fd5..aed034d33976 100644
--- a/arch/ia64/configs/tiger_defconfig
+++ b/arch/ia64/configs/tiger_defconfig
@@ -114,6 +114,8 @@ CONFIG_FORCE_MAX_ZONEORDER=17
 CONFIG_SMP=y
 CONFIG_NR_CPUS=4
 CONFIG_HOTPLUG_CPU=y
+CONFIG_PERMIT_BSP_REMOVE=y
+CONFIG_FORCE_CPEI_RETARGET=y
 # CONFIG_SCHED_SMT is not set
 # CONFIG_PREEMPT is not set
 CONFIG_SELECT_MEMORY_MODEL=y
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 9ad94ddf6687..fe1d90b0c6ea 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -287,16 +287,20 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header,
 unsigned int can_cpei_retarget(void)
 {
 	extern int cpe_vector;
+	extern unsigned int force_cpei_retarget;
 
 	/*
 	 * Only if CPEI is supported and the override flag
 	 * is present, otherwise return that its re-targettable
 	 * if we are in polling mode.
 	 */
-	if (cpe_vector > 0 && !acpi_cpei_override)
-		return 0;
-	else
-		return 1;
+	if (cpe_vector > 0) {
+		if (acpi_cpei_override || force_cpei_retarget)
+			return 1;
+		else
+			return 0;
+	}
+	return 1;
 }
 
 unsigned int is_cpu_cpei_target(unsigned int cpu)
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 574084f343fa..37ac742da8ed 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -631,6 +631,7 @@ get_target_cpu (unsigned int gsi, int vector)
 {
 #ifdef CONFIG_SMP
 	static int cpu = -1;
+	extern int cpe_vector;
 
 	/*
 	 * In case of vector shared by multiple RTEs, all RTEs that
@@ -653,6 +654,11 @@ get_target_cpu (unsigned int gsi, int vector)
 	if (!cpu_online(smp_processor_id()))
 		return cpu_physical_id(smp_processor_id());
 
+#ifdef CONFIG_ACPI
+		if (cpe_vector > 0 && vector == IA64_CPEP_VECTOR)
+			return get_cpei_target_cpu();
+#endif
+
 #ifdef CONFIG_NUMA
 	{
 		int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0;
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index d33244c32759..5ce908ef9c95 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -163,8 +163,19 @@ void fixup_irqs(void)
 {
 	unsigned int irq;
 	extern void ia64_process_pending_intr(void);
+	extern void ia64_disable_timer(void);
+	extern volatile int time_keeper_id;
+
+	ia64_disable_timer();
+
+	/*
+	 * Find a new timesync master
+	 */
+	if (smp_processor_id() == time_keeper_id) {
+		time_keeper_id = first_cpu(cpu_online_map);
+		printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id);
+	}
 
-	ia64_set_itv(1<<16);
 	/*
 	 * Phase 1: Locate irq's bound to this cpu and
 	 * relocate them for cpu removal.
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 355af15287c7..967571b466a2 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -289,6 +289,7 @@ ia64_mca_log_sal_error_record(int sal_info_type)
 #ifdef CONFIG_ACPI
 
 int cpe_vector = -1;
+int ia64_cpe_irq = -1;
 
 static irqreturn_t
 ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
@@ -1444,11 +1445,13 @@ void __devinit
 ia64_mca_cpu_init(void *cpu_data)
 {
 	void *pal_vaddr;
+	static int first_time = 1;
 
-	if (smp_processor_id() == 0) {
+	if (first_time) {
 		void *mca_data;
 		int cpu;
 
+		first_time = 0;
 		mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu)
 					 * NR_CPUS + KERNEL_STACK_SIZE);
 		mca_data = (void *)(((unsigned long)mca_data +
@@ -1704,6 +1707,7 @@ ia64_mca_late_init(void)
 					desc = irq_descp(irq);
 					desc->status |= IRQ_PER_CPU;
 					setup_irq(irq, &mca_cpe_irqaction);
+					ia64_cpe_irq = irq;
 				}
 			ia64_mca_register_cpev(cpe_vector);
 			IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__);
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 410d4804fa6e..18c51c37a9a3 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -6718,6 +6718,7 @@ __initcall(pfm_init);
 void
 pfm_init_percpu (void)
 {
+	static int first_time=1;
 	/*
 	 * make sure no measurement is active
 	 * (may inherit programmed PMCs from EFI).
@@ -6730,8 +6731,10 @@ pfm_init_percpu (void)
 	 */
 	pfm_unfreeze_pmu();
 
-	if (smp_processor_id() == 0)
+	if (first_time) {
 		register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction);
+		first_time=0;
+	}
 
 	ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR);
 	ia64_srlz_d();
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 8f44e7d2df66..e9d37bf67d69 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -70,6 +70,12 @@
 #endif
 
 #ifdef CONFIG_HOTPLUG_CPU
+#ifdef CONFIG_PERMIT_BSP_REMOVE
+#define bsp_remove_ok	1
+#else
+#define bsp_remove_ok	0
+#endif
+
 /*
  * Store all idle threads, this can be reused instead of creating
  * a new thread. Also avoids complicated thread destroy functionality
@@ -104,7 +110,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0];
 /*
  * ITC synchronization related stuff:
  */
-#define MASTER	0
+#define MASTER	(0)
 #define SLAVE	(SMP_CACHE_BYTES/8)
 
 #define NUM_ROUNDS	64	/* magic value */
@@ -151,6 +157,27 @@ char __initdata no_int_routing;
 
 unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */
 
+#ifdef CONFIG_FORCE_CPEI_RETARGET
+#define CPEI_OVERRIDE_DEFAULT	(1)
+#else
+#define CPEI_OVERRIDE_DEFAULT	(0)
+#endif
+
+unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT;
+
+static int __init
+cmdl_force_cpei(char *str)
+{
+	int value=0;
+
+	get_option (&str, &value);
+	force_cpei_retarget = value;
+
+	return 1;
+}
+
+__setup("force_cpei=", cmdl_force_cpei);
+
 static int __init
 nointroute (char *str)
 {
@@ -161,6 +188,27 @@ nointroute (char *str)
 
 __setup("nointroute", nointroute);
 
+static void fix_b0_for_bsp(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+	int cpuid;
+	static int fix_bsp_b0 = 1;
+
+	cpuid = smp_processor_id();
+
+	/*
+	 * Cache the b0 value on the first AP that comes up
+	 */
+	if (!(fix_bsp_b0 && cpuid))
+		return;
+
+	sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0];
+	printk ("Fixed BSP b0 value from CPU %d\n", cpuid);
+
+	fix_bsp_b0 = 0;
+#endif
+}
+
 void
 sync_master (void *arg)
 {
@@ -327,8 +375,9 @@ smp_setup_percpu_timer (void)
 static void __devinit
 smp_callin (void)
 {
-	int cpuid, phys_id;
+	int cpuid, phys_id, itc_master;
 	extern void ia64_init_itm(void);
+	extern volatile int time_keeper_id;
 
 #ifdef CONFIG_PERFMON
 	extern void pfm_init_percpu(void);
@@ -336,6 +385,7 @@ smp_callin (void)
 
 	cpuid = smp_processor_id();
 	phys_id = hard_smp_processor_id();
+	itc_master = time_keeper_id;
 
 	if (cpu_online(cpuid)) {
 		printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n",
@@ -343,6 +393,8 @@ smp_callin (void)
 		BUG();
 	}
 
+	fix_b0_for_bsp();
+
 	lock_ipi_calllock();
 	cpu_set(cpuid, cpu_online_map);
 	unlock_ipi_calllock();
@@ -365,8 +417,8 @@ smp_callin (void)
 		 * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls
 		 * local_bh_enable(), which bugs out if irqs are not enabled...
 		 */
-		Dprintk("Going to syncup ITC with BP.\n");
-		ia64_sync_itc(0);
+		Dprintk("Going to syncup ITC with ITC Master.\n");
+		ia64_sync_itc(itc_master);
 	}
 
 	/*
@@ -638,6 +690,47 @@ remove_siblinginfo(int cpu)
 }
 
 extern void fixup_irqs(void);
+
+int migrate_platform_irqs(unsigned int cpu)
+{
+	int new_cpei_cpu;
+	irq_desc_t *desc = NULL;
+	cpumask_t 	mask;
+	int 		retval = 0;
+
+	/*
+	 * Don't permit the CPEI target to be removed unless CPEI can be re-targeted.
+	 */
+	if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) {
+		printk ("CPU (%d) is CPEI Target\n", cpu);
+		if (can_cpei_retarget()) {
+			/*
+			 * Now re-target the CPEI to a different processor
+			 */
+			new_cpei_cpu = any_online_cpu(cpu_online_map);
+			mask = cpumask_of_cpu(new_cpei_cpu);
+			set_cpei_target_cpu(new_cpei_cpu);
+			desc = irq_descp(ia64_cpe_irq);
+			/*
+			 * Switch immediately for now; we need to do a fake intr as for
+			 * other interrupts, but need to study CPEI behaviour with
+			 * polling before making changes.
+			 */
+			if (desc) {
+				desc->handler->disable(ia64_cpe_irq);
+				desc->handler->set_affinity(ia64_cpe_irq, mask);
+				desc->handler->enable(ia64_cpe_irq);
+				printk ("Re-targetting CPEI to cpu %d\n", new_cpei_cpu);
+			}
+		}
+		if (!desc) {
+			printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu);
+			retval = -EBUSY;
+		}
+	}
+	return retval;
+}
+
 /* must be called with cpucontrol mutex held */
 int __cpu_disable(void)
 {
@@ -646,8 +739,17 @@ int __cpu_disable(void)
 	/*
 	 * dont permit boot processor for now
 	 */
-	if (cpu == 0)
-		return -EBUSY;
+	if (cpu == 0 && !bsp_remove_ok) {
+		printk ("Your platform does not support removal of BSP\n");
+		return (-EBUSY);
+	}
+
+	cpu_clear(cpu, cpu_online_map);
+
+	if (migrate_platform_irqs(cpu)) {
+		cpu_set(cpu, cpu_online_map);
+		return (-EBUSY);
+	}
 
 	remove_siblinginfo(cpu);
 	cpu_clear(cpu, cpu_online_map);
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 028a2b95936c..1ca130a83856 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -32,7 +32,7 @@
 
 extern unsigned long wall_jiffies;
 
-#define TIME_KEEPER_ID	0	/* smp_processor_id() of time-keeper */
+volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */
 
 #ifdef CONFIG_IA64_DEBUG_IRQ
 
@@ -71,7 +71,7 @@ timer_interrupt (int irq, void *dev_id, struct pt_regs *regs)
 
 		new_itm += local_cpu_data->itm_delta;
 
-		if (smp_processor_id() == TIME_KEEPER_ID) {
+		if (smp_processor_id() == time_keeper_id) {
 			/*
 			 * Here we are in the timer irq handler. We have irqs locally
 			 * disabled, but we don't know if the timer_bh is running on
@@ -236,6 +236,11 @@ static struct irqaction timer_irqaction = {
 	.name =		"timer"
 };
 
+void __devinit ia64_disable_timer(void)
+{
+	ia64_set_itv(1 << 16);
+}
+
 void __init
 time_init (void)
 {
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index acaaec4e4681..9855ba318094 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -181,13 +181,15 @@ per_cpu_init (void)
 {
 	void *cpu_data;
 	int cpu;
+	static int first_time=1;
 
 	/*
 	 * get_free_pages() cannot be used before cpu_init() done.  BSP
 	 * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls
 	 * get_zeroed_page().
 	 */
-	if (smp_processor_id() == 0) {
+	if (first_time) {
+		first_time=0;
 		cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS,
 					   PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c87d6d1d5813..573d5cc63e2b 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -528,12 +528,17 @@ void __init find_memory(void)
 void *per_cpu_init(void)
 {
 	int cpu;
+	static int first_time = 1;
+
 
 	if (smp_processor_id() != 0)
 		return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
+	if (first_time) {
+		first_time = 0;
+		for (cpu = 0; cpu < NR_CPUS; cpu++)
+			per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu];
+	}
 
 	return __per_cpu_start + __per_cpu_offset[smp_processor_id()];
 }
diff --git a/include/asm-ia64/mca.h b/include/asm-ia64/mca.h
index c7d9c9ed38ba..bfbbb8da79c7 100644
--- a/include/asm-ia64/mca.h
+++ b/include/asm-ia64/mca.h
@@ -131,6 +131,8 @@ struct ia64_mca_cpu {
 /* Array of physical addresses of each CPU's MCA area.  */
 extern unsigned long __per_cpu_mca[NR_CPUS];
 
+extern int cpe_vector;
+extern int ia64_cpe_irq;
 extern void ia64_mca_init(void);
 extern void ia64_mca_cpu_init(void *);
 extern void ia64_os_mca_dispatch(void);
-- 
cgit v1.2.3


From b88e926584bf100bc23f5e76b7b674d4257edcb0 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Thu, 19 Jan 2006 16:18:47 -0800
Subject: [IA64] Fix UP build with BSP removal support.

A UP build fails with an undefined reference to force_cpei_retarget, which is
defined in arch/ia64/kernel/smpboot.c. Push the unneeded code inside
#ifdef CONFIG_HOTPLUG_CPU.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/acpi.c     | 2 ++
 arch/ia64/kernel/iosapic.c  | 4 ++--
 arch/ia64/kernel/topology.c | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index fe1d90b0c6ea..8d350b33a20f 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -284,6 +284,7 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header,
 	return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 unsigned int can_cpei_retarget(void)
 {
 	extern int cpe_vector;
@@ -319,6 +320,7 @@ void set_cpei_target_cpu(unsigned int cpu)
 {
 	acpi_cpei_phys_cpuid = cpu_physical_id(cpu);
 }
+#endif
 
 unsigned int get_cpei_target_cpu(void)
 {
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 37ac742da8ed..8832c553230a 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -655,8 +655,8 @@ get_target_cpu (unsigned int gsi, int vector)
 		return cpu_physical_id(smp_processor_id());
 
 #ifdef CONFIG_ACPI
-		if (cpe_vector > 0 && vector == IA64_CPEP_VECTOR)
-			return get_cpei_target_cpu();
+	if (cpe_vector > 0 && vector == IA64_CPEP_VECTOR)
+		return get_cpei_target_cpu();
 #endif
 
 #ifdef CONFIG_NUMA
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 706b7734e191..c9562d94b9c3 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -36,7 +36,7 @@ int arch_register_cpu(int num)
 	parent = &sysfs_nodes[cpu_to_node(num)];
 #endif /* CONFIG_NUMA */
 
-#ifdef CONFIG_ACPI
+#if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU)
 	/*
 	 * If CPEI cannot be re-targetted, and this is
 	 * CPEI target, then dont create the control file
-- 
cgit v1.2.3


From a454c2f3d1fd1cab7073b53c6c14d6d4b61f4e09 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Wed, 11 Jan 2006 17:11:09 -0800
Subject: [IA64] implement ia64 specific mutex primitives

Implement ia64 optimized mutex primitives.  It properly uses
acquire/release memory ordering semantics in lock/unlock path.
2nd version making them all static inline functions.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/mutex.h | 93 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 88 insertions(+), 5 deletions(-)

diff --git a/include/asm-ia64/mutex.h b/include/asm-ia64/mutex.h
index 458c1f7fbc18..5a3224f6af38 100644
--- a/include/asm-ia64/mutex.h
+++ b/include/asm-ia64/mutex.h
@@ -1,9 +1,92 @@
 /*
- * Pull in the generic implementation for the mutex fastpath.
+ * ia64 implementation of the mutex fastpath.
  *
- * TODO: implement optimized primitives instead, or leave the generic
- * implementation in place, or pick the atomic_xchg() based generic
- * implementation. (see asm-generic/mutex-xchg.h for details)
+ * Copyright (C) 2006 Ken Chen <kenneth.w.chen@intel.com>
+ *
+ */
+
+#ifndef _ASM_MUTEX_H
+#define _ASM_MUTEX_H
+
+/**
+ *  __mutex_fastpath_lock - try to take the lock by moving the count
+ *                          from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function MUST leave the value lower than
+ * 1 even when the "1" assertion wasn't true.
+ */
+static inline void
+__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	if (unlikely(ia64_fetchadd4_acq(count, -1) != 1))
+		fail_fn(count);
+}
+
+/**
+ *  __mutex_fastpath_lock_retval - try to take the lock by moving the count
+ *                                 from 1 to a 0 value
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 1
+ *
+ * Change the count from 1 to a value lower than 1, and call <fail_fn> if
+ * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
+ * or anything the slow path function returns.
+ */
+static inline int
+__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	if (unlikely(ia64_fetchadd4_acq(count, -1) != 1))
+		return fail_fn(count);
+	return 0;
+}
+
+/**
+ *  __mutex_fastpath_unlock - try to promote the count from 0 to 1
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: function to call if the original value was not 0
+ *
+ * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>.
+ * In the failure case, this function is allowed to either set the value to
+ * 1, or to set it to a value lower than 1.
+ *
+ * If the implementation sets it to a value of lower than 1, then the
+ * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
+ * to return 0 otherwise.
+ */
+static inline void
+__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *))
+{
+	int ret = ia64_fetchadd4_rel(count, 1);
+	if (unlikely(ret < 0))
+		fail_fn(count);
+}
+
+#define __mutex_slowpath_needs_to_unlock()		1
+
+/**
+ * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
+ *
+ *  @count: pointer of type atomic_t
+ *  @fail_fn: fallback function
+ *
+ * Change the count from 1 to a value lower than 1, and return 0 (failure)
+ * if it wasn't 1 originally, or return 1 (success) otherwise. This function
+ * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
+ * Additionally, if the value was < 0 originally, this function must not leave
+ * it to 0 on failure.
+ *
+ * If the architecture has no effective trylock variant, it should call the
+ * <fail_fn> spinlock-based trylock variant unconditionally.
  */
+static inline int
+__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *))
+{
+	/* cmpxchg_acq() returns the old count; the lock was taken only if it
+	 * was exactly 1 (a negative old count must not be reported as success). */
+	return cmpxchg_acq(count, 1, 0) == 1;
+}
 
-#include <asm-generic/mutex-dec.h>
+#endif
-- 
cgit v1.2.3


From 6e586f32931d6c98431d54cd0430d4366195b0ba Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Tue, 17 Jan 2006 12:24:39 -0500
Subject: [IA64-SGI] sem2mutex ioc4.c

Convert to use a single mutex instead of two rwsems as this isn't
performance critical.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Brent Casavant <bcasavan@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/sn/ioc4.c | 41 ++++++++++++++++++-----------------------
 1 file changed, 18 insertions(+), 23 deletions(-)

diff --git a/drivers/sn/ioc4.c b/drivers/sn/ioc4.c
index ea75b3d0612b..67140a5804f5 100644
--- a/drivers/sn/ioc4.c
+++ b/drivers/sn/ioc4.c
@@ -31,7 +31,7 @@
 #include <linux/ioc4.h>
 #include <linux/mmtimer.h>
 #include <linux/rtc.h>
-#include <linux/rwsem.h>
+#include <linux/mutex.h>
 #include <asm/sn/addrs.h>
 #include <asm/sn/clksupport.h>
 #include <asm/sn/shub_mmr.h>
@@ -54,11 +54,10 @@
  * Submodule management *
  ************************/
 
-static LIST_HEAD(ioc4_devices);
-static DECLARE_RWSEM(ioc4_devices_rwsem);
+static DEFINE_MUTEX(ioc4_mutex);
 
+static LIST_HEAD(ioc4_devices);
 static LIST_HEAD(ioc4_submodules);
-static DECLARE_RWSEM(ioc4_submodules_rwsem);
 
 /* Register an IOC4 submodule */
 int
@@ -66,15 +65,13 @@ ioc4_register_submodule(struct ioc4_submodule *is)
 {
 	struct ioc4_driver_data *idd;
 
-	down_write(&ioc4_submodules_rwsem);
+	mutex_lock(&ioc4_mutex);
 	list_add(&is->is_list, &ioc4_submodules);
-	up_write(&ioc4_submodules_rwsem);
 
 	/* Initialize submodule for each IOC4 */
 	if (!is->is_probe)
-		return 0;
+		goto out;
 
-	down_read(&ioc4_devices_rwsem);
 	list_for_each_entry(idd, &ioc4_devices, idd_list) {
 		if (is->is_probe(idd)) {
 			printk(KERN_WARNING
@@ -84,8 +81,8 @@ ioc4_register_submodule(struct ioc4_submodule *is)
 			       pci_name(idd->idd_pdev));
 		}
 	}
-	up_read(&ioc4_devices_rwsem);
-
+ out:
+	mutex_unlock(&ioc4_mutex);
 	return 0;
 }
 
@@ -95,15 +92,13 @@ ioc4_unregister_submodule(struct ioc4_submodule *is)
 {
 	struct ioc4_driver_data *idd;
 
-	down_write(&ioc4_submodules_rwsem);
+	mutex_lock(&ioc4_mutex);
 	list_del(&is->is_list);
-	up_write(&ioc4_submodules_rwsem);
 
 	/* Remove submodule for each IOC4 */
 	if (!is->is_remove)
-		return;
+		goto out;
 
-	down_read(&ioc4_devices_rwsem);
 	list_for_each_entry(idd, &ioc4_devices, idd_list) {
 		if (is->is_remove(idd)) {
 			printk(KERN_WARNING
@@ -113,7 +108,8 @@ ioc4_unregister_submodule(struct ioc4_submodule *is)
 			       pci_name(idd->idd_pdev));
 		}
 	}
-	up_read(&ioc4_devices_rwsem);
+ out:
+	mutex_unlock(&ioc4_mutex);
 }
 
 /*********************
@@ -312,12 +308,11 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id)
 	/* Track PCI-device specific data */
 	idd->idd_serial_data = NULL;
 	pci_set_drvdata(idd->idd_pdev, idd);
-	down_write(&ioc4_devices_rwsem);
+
+	mutex_lock(&ioc4_mutex);
 	list_add(&idd->idd_list, &ioc4_devices);
-	up_write(&ioc4_devices_rwsem);
 
 	/* Add this IOC4 to all submodules */
-	down_read(&ioc4_submodules_rwsem);
 	list_for_each_entry(is, &ioc4_submodules, is_list) {
 		if (is->is_probe && is->is_probe(idd)) {
 			printk(KERN_WARNING
@@ -327,7 +322,7 @@ ioc4_probe(struct pci_dev *pdev, const struct pci_device_id *pci_id)
 			       pci_name(idd->idd_pdev));
 		}
 	}
-	up_read(&ioc4_submodules_rwsem);
+	mutex_unlock(&ioc4_mutex);
 
 	return 0;
 
@@ -351,7 +346,7 @@ ioc4_remove(struct pci_dev *pdev)
 	idd = pci_get_drvdata(pdev);
 
 	/* Remove this IOC4 from all submodules */
-	down_read(&ioc4_submodules_rwsem);
+	mutex_lock(&ioc4_mutex);
 	list_for_each_entry(is, &ioc4_submodules, is_list) {
 		if (is->is_remove && is->is_remove(idd)) {
 			printk(KERN_WARNING
@@ -361,7 +356,7 @@ ioc4_remove(struct pci_dev *pdev)
 			       pci_name(idd->idd_pdev));
 		}
 	}
-	up_read(&ioc4_submodules_rwsem);
+	mutex_unlock(&ioc4_mutex);
 
 	/* Release resources */
 	iounmap(idd->idd_misc_regs);
@@ -377,9 +372,9 @@ ioc4_remove(struct pci_dev *pdev)
 	pci_disable_device(pdev);
 
 	/* Remove and free driver data */
-	down_write(&ioc4_devices_rwsem);
+	mutex_lock(&ioc4_mutex);
 	list_del(&idd->idd_list);
-	up_write(&ioc4_devices_rwsem);
+	mutex_unlock(&ioc4_mutex);
 	kfree(idd);
 }
 
-- 
cgit v1.2.3


From b0a06623dc4caf6dfb6a84419507643471676d20 Mon Sep 17 00:00:00 2001
From: Keith Owens <kaos@sgi.com>
Date: Sun, 22 Jan 2006 10:55:25 +1100
Subject: [IA64] Delete MCA/INIT sigdelayed code

The only user of the MCA/INIT sigdelayed code (SGI's I/O probing) has
moved from the kernel into SAL.  Delete the MCA/INIT sigdelayed code.

Signed-off-by: Keith Owens <kaos@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/entry.S       |  14 ------
 arch/ia64/kernel/signal.c      | 101 -----------------------------------------
 include/asm-ia64/signal.h      |   2 -
 include/asm-ia64/thread_info.h |  11 +----
 4 files changed, 1 insertion(+), 127 deletions(-)

diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 7a6ffd613789..9dda7a36d1ee 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1100,9 +1100,6 @@ skip_rbs_switch:
 	st8 [r2]=r8
 	st8 [r3]=r10
 .work_pending:
-	tbit.nz p6,p0=r31,TIF_SIGDELAYED		// signal delayed from  MCA/INIT/NMI/PMI context?
-(p6)	br.cond.sptk.few .sigdelayed
-	;;
 	tbit.z p6,p0=r31,TIF_NEED_RESCHED		// current_thread_info()->need_resched==0?
 (p6)	br.cond.sptk.few .notify
 #ifdef CONFIG_PREEMPT
@@ -1129,17 +1126,6 @@ skip_rbs_switch:
 (pLvSys)br.cond.sptk.few  .work_pending_syscall_end
 	br.cond.sptk.many .work_processed_kernel	// don't re-check
 
-// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where
-// it could not be delivered.  Deliver it now.  The signal might be for us and
-// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed
-// signal.
-
-.sigdelayed:
-	br.call.sptk.many rp=do_sigdelayed
-	cmp.eq p6,p0=r0,r0				// p6 <- 1, always re-check
-(pLvSys)br.cond.sptk.few  .work_pending_syscall_end
-	br.cond.sptk.many .work_processed_kernel	// re-check
-
 .work_pending_syscall_end:
 	adds r2=PT(R8)+16,r12
 	adds r3=PT(R10)+16,r12
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 463f6bb44d07..1d7903ee2126 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -588,104 +588,3 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall)
 	}
 	return 0;
 }
-
-/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it
- * could not be delivered.  It is important that the target process is not
- * allowed to do any more work in user space.  Possible cases for the target
- * process:
- *
- * - It is sleeping and will wake up soon.  Store the data in the current task,
- *   the signal will be sent when the current task returns from the next
- *   interrupt.
- *
- * - It is running in user context.  Store the data in the current task, the
- *   signal will be sent when the current task returns from the next interrupt.
- *
- * - It is running in kernel context on this or another cpu and will return to
- *   user context.  Store the data in the target task, the signal will be sent
- *   to itself when the target task returns to user space.
- *
- * - It is running in kernel context on this cpu and will sleep before
- *   returning to user context.  Because this is also the current task, the
- *   signal will not get delivered and the task could sleep indefinitely.
- *   Store the data in the idle task for this cpu, the signal will be sent
- *   after the idle task processes its next interrupt.
- *
- * To cover all cases, store the data in the target task, the current task and
- * the idle task on this cpu.  Whatever happens, the signal will be delivered
- * to the target task before it can do any useful user space work.  Multiple
- * deliveries have no unwanted side effects.
- *
- * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts
- * disabled.  It must not take any locks nor use kernel structures or services
- * that require locks.
- */
-
-/* To ensure that we get the right pid, check its start time.  To avoid extra
- * include files in thread_info.h, convert the task start_time to unsigned long,
- * giving us a cycle time of > 580 years.
- */
-static inline unsigned long
-start_time_ul(const struct task_struct *t)
-{
-	return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec;
-}
-
-void
-set_sigdelayed(pid_t pid, int signo, int code, void __user *addr)
-{
-	struct task_struct *t;
-	unsigned long start_time =  0;
-	int i;
-
-	for (i = 1; i <= 3; ++i) {
-		switch (i) {
-		case 1:
-			t = find_task_by_pid(pid);
-			if (t)
-				start_time = start_time_ul(t);
-			break;
-		case 2:
-			t = current;
-			break;
-		default:
-			t = idle_task(smp_processor_id());
-			break;
-		}
-
-		if (!t)
-			return;
-		task_thread_info(t)->sigdelayed.signo = signo;
-		task_thread_info(t)->sigdelayed.code = code;
-		task_thread_info(t)->sigdelayed.addr = addr;
-		task_thread_info(t)->sigdelayed.start_time = start_time;
-		task_thread_info(t)->sigdelayed.pid = pid;
-		wmb();
-		set_tsk_thread_flag(t, TIF_SIGDELAYED);
-	}
-}
-
-/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that
- * was detected in MCA/INIT/NMI/PMI context where it could not be delivered.
- */
-
-void
-do_sigdelayed(void)
-{
-	struct siginfo siginfo;
-	pid_t pid;
-	struct task_struct *t;
-
-	clear_thread_flag(TIF_SIGDELAYED);
-	memset(&siginfo, 0, sizeof(siginfo));
-	siginfo.si_signo = current_thread_info()->sigdelayed.signo;
-	siginfo.si_code = current_thread_info()->sigdelayed.code;
-	siginfo.si_addr = current_thread_info()->sigdelayed.addr;
-	pid = current_thread_info()->sigdelayed.pid;
-	t = find_task_by_pid(pid);
-	if (!t)
-		return;
-	if (current_thread_info()->sigdelayed.start_time != start_time_ul(t))
-		return;
-	force_sig_info(siginfo.si_signo, &siginfo, t);
-}
diff --git a/include/asm-ia64/signal.h b/include/asm-ia64/signal.h
index 608168d713d3..5e328ed5d01d 100644
--- a/include/asm-ia64/signal.h
+++ b/include/asm-ia64/signal.h
@@ -158,8 +158,6 @@ struct k_sigaction {
 
 #define ptrace_signal_deliver(regs, cookie) do { } while (0)
 
-void set_sigdelayed(pid_t pid, int signo, int code, void __user *addr);
-
 #endif /* __KERNEL__ */
 
 # endif /* !__ASSEMBLY__ */
diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h
index 1d6518fe1f02..a6ee27343a4a 100644
--- a/include/asm-ia64/thread_info.h
+++ b/include/asm-ia64/thread_info.h
@@ -29,13 +29,6 @@ struct thread_info {
 	mm_segment_t addr_limit;	/* user-level address space limit */
 	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
 	struct restart_block restart_block;
-	struct {
-		int signo;
-		int code;
-		void __user *addr;
-		unsigned long start_time;
-		pid_t pid;
-	} sigdelayed;			/* Saved information for TIF_SIGDELAYED */
 };
 
 #define THREAD_SIZE			KERNEL_STACK_SIZE
@@ -89,7 +82,6 @@ struct thread_info {
 #define TIF_NEED_RESCHED	2	/* rescheduling necessary */
 #define TIF_SYSCALL_TRACE	3	/* syscall trace active */
 #define TIF_SYSCALL_AUDIT	4	/* syscall auditing active */
-#define TIF_SIGDELAYED		5	/* signal delayed from MCA/INIT/NMI/PMI context */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17
 #define TIF_MCA_INIT		18	/* this task is processing MCA or INIT */
@@ -101,13 +93,12 @@ struct thread_info {
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
-#define _TIF_SIGDELAYED		(1 << TIF_SIGDELAYED)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_MCA_INIT		(1 << TIF_MCA_INIT)
 #define _TIF_DB_DISABLED	(1 << TIF_DB_DISABLED)
 
 /* "work to do on user-return" bits */
-#define TIF_ALLWORK_MASK	(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SIGDELAYED)
+#define TIF_ALLWORK_MASK	(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
 /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */
 #define TIF_WORK_MASK		(TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
 
-- 
cgit v1.2.3


From 28ff6b9b2fc01d2c2746c72ce8af1729344fae27 Mon Sep 17 00:00:00 2001
From: Aaron Young <ayoung@google.engr.sgi.com>
Date: Mon, 23 Jan 2006 09:00:51 -0800
Subject: [IA64-SGI] Handle SC env. powerdown events

Handle system controller power down pending events
on SN systems. This allows the system to gracefully shutdown
before the system controller removes power due to
an adverse environmental condition.

Signed-off-by: Aaron Young <ayoung@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/char/snsc.h       |  5 ++++-
 drivers/char/snsc_event.c | 32 ++++++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/char/snsc.h b/drivers/char/snsc.h
index a9efc13cc858..8a98169b60c1 100644
--- a/drivers/char/snsc.h
+++ b/drivers/char/snsc.h
@@ -5,7 +5,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc. All rights reserved.
  */
 
 /*
@@ -70,6 +70,9 @@ struct sysctl_data_s {
 #define EV_CLASS_TEST_WARNING	0x6000ul
 #define EV_CLASS_PWRD_NOTIFY	0x8000ul
 
+/* ENV class codes */
+#define ENV_PWRDN_PEND		0x4101ul
+
 #define EV_SEVERITY_POWER_STABLE	0x0000ul
 #define EV_SEVERITY_POWER_LOW_WARNING	0x0100ul
 #define EV_SEVERITY_POWER_HIGH_WARNING	0x0200ul
diff --git a/drivers/char/snsc_event.c b/drivers/char/snsc_event.c
index baaa365285fa..a4fa507eed9e 100644
--- a/drivers/char/snsc_event.c
+++ b/drivers/char/snsc_event.c
@@ -5,7 +5,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2004 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc. All rights reserved.
  */
 
 /*
@@ -187,7 +187,8 @@ scdrv_event_severity(int code)
 static void
 scdrv_dispatch_event(char *event, int len)
 {
-	int code, esp_code, src;
+	static int snsc_shutting_down = 0;
+	int code, esp_code, src, class;
 	char desc[CHUNKSIZE];
 	char *severity;
 
@@ -199,9 +200,25 @@ scdrv_dispatch_event(char *event, int len)
 	/* how urgent is the message? */
 	severity = scdrv_event_severity(code);
 
-	if ((code & EV_CLASS_MASK) == EV_CLASS_PWRD_NOTIFY) {
+	class = (code & EV_CLASS_MASK);
+
+	if (class == EV_CLASS_PWRD_NOTIFY || code == ENV_PWRDN_PEND) {
 		struct task_struct *p;
 
+		if (snsc_shutting_down)
+			return;
+
+		snsc_shutting_down = 1;
+
+		/* give a message for each type of event */
+		if (class == EV_CLASS_PWRD_NOTIFY)
+			printk(KERN_NOTICE "Power off indication received."
+			       " Sending SIGPWR to init...\n");
+		else if (code == ENV_PWRDN_PEND)
+			printk(KERN_CRIT "WARNING: Shutting down the system"
+			       " due to a critical environmental condition."
+			       " Sending SIGPWR to init...\n");
+
 		/* give a SIGPWR signal to init proc */
 
 		/* first find init's task */
@@ -210,12 +227,11 @@ scdrv_dispatch_event(char *event, int len)
 			if (p->pid == 1)
 				break;
 		}
-		if (p) { /* we found init's task */
-			printk(KERN_EMERG "Power off indication received. Initiating power fail sequence...\n");
+		if (p) {
 			force_sig(SIGPWR, p);
-		} else { /* failed to find init's task - just give message(s) */
-			printk(KERN_WARNING "Failed to find init proc to handle power off!\n");
-			printk("%s|$(0x%x)%s\n", severity, esp_code, desc);
+		} else {
+			printk(KERN_ERR "Failed to signal init!\n");
+			snsc_shutting_down = 0; /* so can try again (?) */
 		}
 		read_unlock(&tasklist_lock);
 	} else {
-- 
cgit v1.2.3


From 13938ca7a1ad9a4788cf73309f187d99c97ddfde Mon Sep 17 00:00:00 2001
From: Mark Maule <maule@sgi.com>
Date: Thu, 26 Jan 2006 14:46:39 -0600
Subject: [IA64-SGI] driver bugfixes and hardware workarounds for CE1.0 asic

Various bugfixes and hardware bug workarounds necessary for the rev 1.0 version
of the altix TIO CE asic.

Signed-off-by: Mark Maule <maule@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/pci/tioce_provider.c | 326 +++++++++++++++++++++++++++++++++++---
 include/asm-ia64/sn/addrs.h       |   8 +
 include/asm-ia64/sn/tioce.h       |  36 ++++-
 3 files changed, 345 insertions(+), 25 deletions(-)

diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c
index e52831ed93eb..fa073cc4b565 100644
--- a/arch/ia64/sn/pci/tioce_provider.c
+++ b/arch/ia64/sn/pci/tioce_provider.c
@@ -15,6 +15,124 @@
 #include <asm/sn/pcidev.h>
 #include <asm/sn/pcibus_provider_defs.h>
 #include <asm/sn/tioce_provider.h>
+#include <asm/sn/sn2/sn_hwperf.h>
+
+/*
+ * 1/26/2006
+ *
+ * WAR for SGI PV 944642.  For revA TIOCE, need to use the following recipe
+ * (taken from the above PV) before and after accessing tioce internal MMR's
+ * to avoid tioce lockups.
+ *
+ * The recipe as taken from the PV:
+ *
+ *	if(mmr address < 0x45000) {
+ *		if(mmr address == 0 or 0x80)
+ *			mmr wrt or read address 0xc0
+ *		else if(mmr address == 0x148 or 0x200)
+ *			mmr wrt or read address 0x28
+ *		else
+ *			mmr wrt or read address 0x158
+ *
+ *		do desired mmr access (rd or wrt)
+ *
+ *		if(mmr address == 0x100)
+ *			mmr wrt or read address 0x38
+ *		mmr wrt or read address 0xb050
+ *	} else
+ *		do desired mmr access
+ *
+ * According to hw, we can use reads instead of writes to the above addresses.
+ *
+ * Note this WAR can only be used for accessing internal MMR's in the
+ * TIOCE Coretalk Address Range 0x0 - 0x07ff_ffff.  This includes the
+ * "Local CE Registers and Memories" and "PCI Compatible Config Space" address
+ * spaces from table 2-1 of the "CE Programmer's Reference Overview" document.
+ *
+ * All registers defined in struct tioce will meet that criteria.
+ */
+
+static inline void
+tioce_mmr_war_pre(struct tioce_kernel *kern, void *mmr_addr)
+{
+	u64 mmr_base;
+	u64 mmr_offset;
+
+	if (kern->ce_common->ce_rev != TIOCE_REV_A)
+		return;		/* recipe applies to rev A parts only */
+
+	mmr_base = kern->ce_common->ce_pcibus.bs_base;
+	mmr_offset = (u64)mmr_addr - mmr_base;
+
+	if (mmr_offset < 0x45000) {
+		u64 mmr_war_offset;
+
+		if (mmr_offset == 0 || mmr_offset == 0x80)
+			mmr_war_offset = 0xc0;
+		else if (mmr_offset == 0x148 || mmr_offset == 0x200)
+			mmr_war_offset = 0x28;
+		else
+			mmr_war_offset = 0x158;
+		/* "pre" step of the recipe: dummy read before the real access */
+		readq_relaxed((void *)(mmr_base + mmr_war_offset));
+	}
+}
+
+static inline void
+tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr)
+{
+	u64 mmr_base;
+	u64 mmr_offset;
+
+	if (kern->ce_common->ce_rev != TIOCE_REV_A)
+		return;		/* recipe applies to rev A parts only */
+
+	mmr_base = kern->ce_common->ce_pcibus.bs_base;
+	mmr_offset = (u64)mmr_addr - mmr_base;
+
+	if (mmr_offset < 0x45000) {	/* "post" step: dummy read(s) */
+		if (mmr_offset == 0x100)
+			readq_relaxed((void *)(mmr_base + 0x38));
+		readq_relaxed((void *)(mmr_base + 0xb050));
+	}
+}
+
+/* load mmr contents into a variable */
+#define tioce_mmr_load(kern, mmrp, varp) do {\
+	tioce_mmr_war_pre(kern, mmrp); \
+	*(varp) = readq_relaxed(mmrp); \
+	tioce_mmr_war_post(kern, mmrp); \
+} while (0)
+
+/* store the value pointed to by varp into mmr (with pre/post WAR accesses) */
+#define tioce_mmr_store(kern, mmrp, varp) do {\
+	tioce_mmr_war_pre(kern, mmrp); \
+	writeq(*(varp), mmrp); \
+	tioce_mmr_war_post(kern, mmrp); \
+} while (0)
+
+/* store immediate value into mmr */
+#define tioce_mmr_storei(kern, mmrp, val) do {\
+	tioce_mmr_war_pre(kern, mmrp); \
+	writeq(val, mmrp); \
+	tioce_mmr_war_post(kern, mmrp); \
+} while (0)
+
+/* set bits (immediate value) into mmr */
+#define tioce_mmr_seti(kern, mmrp, bits) do {\
+	u64 tmp; \
+	tioce_mmr_load(kern, mmrp, &tmp); \
+	tmp |= (bits); \
+	tioce_mmr_store(kern, mmrp, &tmp); \
+} while (0)
+
+/* clear bits (immediate value) in mmr */
+#define tioce_mmr_clri(kern, mmrp, bits) do { \
+	u64 tmp; \
+	tioce_mmr_load(kern, mmrp, &tmp); \
+	tmp &= ~(bits); \
+	tioce_mmr_store(kern, mmrp, &tmp); \
+} while (0)
 
 /**
  * Bus address ranges for the 5 flavors of TIOCE DMA
@@ -62,9 +180,9 @@
 #define TIOCE_ATE_M40	2
 #define TIOCE_ATE_M40S	3
 
-#define KB(x)	((x) << 10)
-#define MB(x)	((x) << 20)
-#define GB(x)	((x) << 30)
+#define KB(x)	((u64)(x) << 10)
+#define MB(x)	((u64)(x) << 20)
+#define GB(x)	((u64)(x) << 30)
 
 /**
  * tioce_dma_d64 - create a DMA mapping using 64-bit direct mode
@@ -151,7 +269,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
 	int last;
 	int entries;
 	int nates;
-	int pagesize;
+	u64 pagesize;
 	u64 *ate_shadow;
 	u64 *ate_reg;
 	u64 addr;
@@ -228,7 +346,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port,
 
 		ate = ATE_MAKE(addr, pagesize);
 		ate_shadow[i + j] = ate;
-		writeq(ate, &ate_reg[i + j]);
+		tioce_mmr_storei(ce_kern, &ate_reg[i + j], ate);
 		addr += pagesize;
 	}
 
@@ -272,7 +390,8 @@ tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr)
 		u64 tmp;
 
 		ce_kern->ce_port[port].dirmap_shadow = ct_upper;
-		writeq(ct_upper, &ce_mmr->ce_ure_dir_map[port]);
+		tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port],
+				 ct_upper);
 		tmp = ce_mmr->ce_ure_dir_map[port];
 		dma_ok = 1;
 	} else
@@ -344,7 +463,8 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir)
 	if (TIOCE_D32_ADDR(bus_addr)) {
 		if (--ce_kern->ce_port[port].dirmap_refcnt == 0) {
 			ce_kern->ce_port[port].dirmap_shadow = 0;
-			writeq(0, &ce_mmr->ce_ure_dir_map[port]);
+			tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port],
+					 0);
 		}
 	} else {
 		struct tioce_dmamap *map;
@@ -365,7 +485,7 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir)
 		} else if (--map->refcnt == 0) {
 			for (i = 0; i < map->ate_count; i++) {
 				map->ate_shadow[i] = 0;
-				map->ate_hw[i] = 0;
+				tioce_mmr_storei(ce_kern, &map->ate_hw[i], 0);
 			}
 
 			list_del(&map->ce_dmamap_list);
@@ -486,7 +606,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count,
 	spin_unlock_irqrestore(&ce_kern->ce_lock, flags);
 
 dma_map_done:
-	if (mapaddr & barrier)
+	if (mapaddr && barrier)
 		mapaddr = tioce_dma_barrier(mapaddr, 1);
 
 	return mapaddr;
@@ -541,17 +661,61 @@ tioce_error_intr_handler(int irq, void *arg, struct pt_regs *pt)
 			soft->ce_pcibus.bs_persist_segment,
 			soft->ce_pcibus.bs_persist_busnum, 0, 0, 0, 0, 0);
 
+	if (ret_stuff.v0)
+		panic("tioce_error_intr_handler:  Fatal TIOCE error");
+
 	return IRQ_HANDLED;
 }
 
+/**
+ * tioce_reserve_m32 - reserve M32 ate's for the indicated address range
+ * @ce_kern: TIOCE context to reserve ate's for
+ * @base: starting bus address to reserve
+ * @limit: last bus address to reserve
+ *
+ * If base/limit falls within the range of bus space mapped through the
+ * M32 space, reserve the resources corresponding to the range.
+ */
+static void
+tioce_reserve_m32(struct tioce_kernel *ce_kern, u64 base, u64 limit)
+{
+	int ate_index, last_ate, ps;
+	struct tioce *ce_mmr;
+
+	if (!TIOCE_M32_ADDR(base))
+		return;
+
+	ce_mmr = (struct tioce *)ce_kern->ce_common->ce_pcibus.bs_base;
+	ps = ce_kern->ce_ate3240_pagesize;
+	ate_index = ATE_PAGE(base, ps);
+	last_ate = ate_index + ATE_NPAGES(base, limit-base+1, ps) - 1;
+
+	if (ate_index < 64)
+		ate_index = 64;
+
+	while (ate_index <= last_ate) {
+		u64 ate;
+
+		ate = ATE_MAKE(0xdeadbeef, ps);
+		ce_kern->ce_ate3240_shadow[ate_index] = ate;
+		tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_ate3240[ate_index],
+				 ate);
+		ate_index++;
+	}
+}
+
 /**
  * tioce_kern_init - init kernel structures related to a given TIOCE
  * @tioce_common: ptr to a cached tioce_common struct that originated in prom
- */ static struct tioce_kernel *
+ */
+static struct tioce_kernel *
 tioce_kern_init(struct tioce_common *tioce_common)
 {
 	int i;
+	int ps;
+	int dev;
 	u32 tmp;
+	unsigned int seg, bus;
 	struct tioce *tioce_mmr;
 	struct tioce_kernel *tioce_kern;
 
@@ -572,9 +736,10 @@ tioce_kern_init(struct tioce_common *tioce_common)
 	 * here to use pci_read_config_xxx() so use the raw_pci_ops vector.
 	 */
 
-	raw_pci_ops->read(tioce_common->ce_pcibus.bs_persist_segment,
-			  tioce_common->ce_pcibus.bs_persist_busnum,
-			  PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1, &tmp);
+	seg = tioce_common->ce_pcibus.bs_persist_segment;
+	bus = tioce_common->ce_pcibus.bs_persist_busnum;
+
+	raw_pci_ops->read(seg, bus, PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1,&tmp);
 	tioce_kern->ce_port1_secondary = (u8) tmp;
 
 	/*
@@ -583,18 +748,76 @@ tioce_kern_init(struct tioce_common *tioce_common)
 	 */
 
 	tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base;
-	__sn_clrq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_PAGESIZE_MASK);
-	__sn_setq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_256K_PAGESIZE);
-	tioce_kern->ce_ate3240_pagesize = KB(256);
+	tioce_mmr_clri(tioce_kern, &tioce_mmr->ce_ure_page_map,
+		       CE_URE_PAGESIZE_MASK);
+	tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_ure_page_map,
+		       CE_URE_256K_PAGESIZE);
+	ps = tioce_kern->ce_ate3240_pagesize = KB(256);
 
 	for (i = 0; i < TIOCE_NUM_M40_ATES; i++) {
 		tioce_kern->ce_ate40_shadow[i] = 0;
-		writeq(0, &tioce_mmr->ce_ure_ate40[i]);
+		tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate40[i], 0);
 	}
 
 	for (i = 0; i < TIOCE_NUM_M3240_ATES; i++) {
 		tioce_kern->ce_ate3240_shadow[i] = 0;
-		writeq(0, &tioce_mmr->ce_ure_ate3240[i]);
+		tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate3240[i], 0);
+	}
+
+	/*
+	 * Reserve ATE's corresponding to reserved address ranges.  These
+	 * include:
+	 *
+	 *	Memory space covered by each PPB mem base/limit register
+	 * 	Memory space covered by each PPB prefetch base/limit register
+	 *
+	 * These bus ranges are for pio (downstream) traffic only, and so
+	 * cannot be used for DMA.
+	 */
+
+	for (dev = 1; dev <= 2; dev++) {
+		u64 base, limit;
+
+		/* mem base/limit */
+
+		raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
+				  PCI_MEMORY_BASE, 2, &tmp);
+		base = (u64)tmp << 16;
+
+		raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
+				  PCI_MEMORY_LIMIT, 2, &tmp);
+		limit = (u64)tmp << 16;
+		limit |= 0xfffffUL;
+
+		if (base < limit)
+			tioce_reserve_m32(tioce_kern, base, limit);
+
+		/*
+		 * prefetch mem base/limit.  The tioce ppb's have 64-bit
+		 * decoders, so read the upper portions w/o checking the
+		 * attributes.
+		 */
+
+		raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
+				  PCI_PREF_MEMORY_BASE, 2, &tmp);
+		base = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16;
+
+		raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
+				  PCI_PREF_BASE_UPPER32, 4, &tmp);
+		base |= (u64)tmp << 32;
+
+		raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
+				  PCI_PREF_MEMORY_LIMIT, 2, &tmp);
+
+		limit = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16;
+		limit |= 0xfffffUL;
+
+		raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0),
+				  PCI_PREF_LIMIT_UPPER32, 4, &tmp);
+		limit |= (u64)tmp << 32;
+
+		if ((base < limit) && TIOCE_M32_ADDR(base))
+			tioce_reserve_m32(tioce_kern, base, limit);
 	}
 
 	return tioce_kern;
@@ -614,6 +837,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info)
 {
 	struct pcidev_info *pcidev_info;
 	struct tioce_common *ce_common;
+	struct tioce_kernel *ce_kern;
 	struct tioce *ce_mmr;
 	u64 force_int_val;
 
@@ -629,6 +853,29 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info)
 
 	ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info;
 	ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base;
+	ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private;
+
+	/*
+	 * TIOCE Rev A workaround (PV 945826), force an interrupt by writing
+	 * the TIO_INTx register directly (1/26/2006)
+	 */
+	if (ce_common->ce_rev == TIOCE_REV_A) {
+		u64 int_bit_mask = (1ULL << sn_irq_info->irq_int_bit);
+		u64 status;
+
+		tioce_mmr_load(ce_kern, &ce_mmr->ce_adm_int_status, &status);
+		if (status & int_bit_mask) {
+			u64 force_irq = (1 << 8) | sn_irq_info->irq_irq;
+			u64 ctalk = sn_irq_info->irq_xtalkaddr;
+			u64 nasid, offset;
+
+			nasid = (ctalk & CTALK_NASID_MASK) >> CTALK_NASID_SHFT;
+			offset = (ctalk & CTALK_NODE_OFFSET);
+			HUB_S(TIO_IOSPACE_ADDR(nasid, offset), force_irq);
+		}
+
+		return;
+	}
 
 	/*
 	 * irq_int_bit is originally set up by prom, and holds the interrupt
@@ -666,7 +913,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info)
 	default:
 		return;
 	}
-	writeq(force_int_val, &ce_mmr->ce_adm_force_int);
+	tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_force_int, force_int_val);
 }
 
 /**
@@ -685,6 +932,7 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info)
 {
 	struct pcidev_info *pcidev_info;
 	struct tioce_common *ce_common;
+	struct tioce_kernel *ce_kern;
 	struct tioce *ce_mmr;
 	int bit;
 	u64 vector;
@@ -695,14 +943,15 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info)
 
 	ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info;
 	ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base;
+	ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private;
 
 	bit = sn_irq_info->irq_int_bit;
 
-	__sn_setq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit));
+	tioce_mmr_seti(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit));
 	vector = (u64)sn_irq_info->irq_irq << INTR_VECTOR_SHFT;
 	vector |= sn_irq_info->irq_xtalkaddr;
-	writeq(vector, &ce_mmr->ce_adm_int_dest[bit]);
-	__sn_clrq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit));
+	tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_int_dest[bit], vector);
+	tioce_mmr_clri(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit));
 
 	tioce_force_interrupt(sn_irq_info);
 }
@@ -721,7 +970,11 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info)
 static void *
 tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *controller)
 {
+	int my_nasid;
+	cnodeid_t my_cnode, mem_cnode;
 	struct tioce_common *tioce_common;
+	struct tioce_kernel *tioce_kern;
+	struct tioce *tioce_mmr;
 
 	/*
 	 * Allocate kernel bus soft and copy from prom.
@@ -734,11 +987,23 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 	memcpy(tioce_common, prom_bussoft, sizeof(struct tioce_common));
 	tioce_common->ce_pcibus.bs_base |= __IA64_UNCACHED_OFFSET;
 
-	if (tioce_kern_init(tioce_common) == NULL) {
+	tioce_kern = tioce_kern_init(tioce_common);
+	if (tioce_kern == NULL) {
 		kfree(tioce_common);
 		return NULL;
 	}
 
+	/*
+	 * Clear out any transient errors before registering the error
+	 * interrupt handler.
+	 */
+
+	tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base;
+	tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_int_status_alias, ~0ULL);
+	tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_error_summary_alias,
+		       ~0ULL);
+	tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_dre_comp_err_addr, ~0ULL);
+
 	if (request_irq(SGI_PCIASIC_ERROR,
 			tioce_error_intr_handler,
 			SA_SHIRQ, "TIOCE error", (void *)tioce_common))
@@ -750,6 +1015,21 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont
 		       tioce_common->ce_pcibus.bs_persist_segment,
 		       tioce_common->ce_pcibus.bs_persist_busnum);
 
+	/*
+	 * identify closest nasid for memory allocations
+	 */
+
+	my_nasid = NASID_GET(tioce_common->ce_pcibus.bs_base);
+	my_cnode = nasid_to_cnodeid(my_nasid);
+
+	if (sn_hwperf_get_nearest_node(my_cnode, &mem_cnode, NULL) < 0) {
+		printk(KERN_WARNING "tioce_bus_fixup: failed to find "
+		       "closest node with MEM to TIO node %d\n", my_cnode);
+		mem_cnode = (cnodeid_t)-1; /* use any node */
+	}
+
+	controller->node = mem_cnode;
+
 	return tioce_common;
 }
 
diff --git a/include/asm-ia64/sn/addrs.h b/include/asm-ia64/sn/addrs.h
index 2c32e4b77b54..1d9efe541662 100644
--- a/include/asm-ia64/sn/addrs.h
+++ b/include/asm-ia64/sn/addrs.h
@@ -283,5 +283,13 @@
 #define REMOTE_HUB_L(n, a)		HUB_L(REMOTE_HUB_ADDR((n), (a)))
 #define REMOTE_HUB_S(n, a, d)		HUB_S(REMOTE_HUB_ADDR((n), (a)), (d))
 
+/*
+ * Coretalk address breakdown
+ */
+#define CTALK_NASID_SHFT		40
+#define CTALK_NASID_MASK		(0x3FFFULL << CTALK_NASID_SHFT)
+#define CTALK_CID_SHFT			38
+#define CTALK_CID_MASK			(0x3ULL << CTALK_CID_SHFT)
+#define CTALK_NODE_OFFSET		0x3FFFFFFFFF
 
 #endif /* _ASM_IA64_SN_ADDRS_H */
diff --git a/include/asm-ia64/sn/tioce.h b/include/asm-ia64/sn/tioce.h
index d4c990712eac..893468e1b41b 100644
--- a/include/asm-ia64/sn/tioce.h
+++ b/include/asm-ia64/sn/tioce.h
@@ -11,7 +11,7 @@
 
 /* CE ASIC part & mfgr information  */
 #define TIOCE_PART_NUM			0xCE00
-#define TIOCE_MFGR_NUM			0x36
+#define TIOCE_SRC_ID			0x01
 #define TIOCE_REV_A			0x1
 
 /* CE Virtual PPB Vendor/Device IDs */
@@ -20,7 +20,7 @@
 
 /* CE Host Bridge Vendor/Device IDs */
 #define CE_HOST_BRIDGE_VENDOR_ID	0x10a9
-#define CE_HOST_BRIDGE_DEVICE_ID	0x4003
+#define CE_HOST_BRIDGE_DEVICE_ID	0x4001
 
 
 #define TIOCE_NUM_M40_ATES		4096
@@ -463,6 +463,25 @@ typedef volatile struct tioce {
 	u64	ce_end_of_struct;			/* 0x044400 */
 } tioce_t;
 
+/* ce_lsiX_gb_cfg1 register bit masks & shifts */
+#define CE_LSI_GB_CFG1_RXL0S_THS_SHFT	0
+#define CE_LSI_GB_CFG1_RXL0S_THS_MASK	(0xffULL << 0)
+#define CE_LSI_GB_CFG1_RXL0S_SMP_SHFT	8
+#define CE_LSI_GB_CFG1_RXL0S_SMP_MASK	(0xfULL << 8)
+#define CE_LSI_GB_CFG1_RXL0S_ADJ_SHFT	12
+#define CE_LSI_GB_CFG1_RXL0S_ADJ_MASK	(0x7ULL << 12)
+#define CE_LSI_GB_CFG1_RXL0S_FLT_SHFT	15
+#define CE_LSI_GB_CFG1_RXL0S_FLT_MASK	(0x1ULL << 15)
+#define CE_LSI_GB_CFG1_LPBK_SEL_SHFT	16
+#define CE_LSI_GB_CFG1_LPBK_SEL_MASK	(0x3ULL << 16)
+#define CE_LSI_GB_CFG1_LPBK_EN_SHFT	18
+#define CE_LSI_GB_CFG1_LPBK_EN_MASK	(0x1ULL << 18)
+#define CE_LSI_GB_CFG1_RVRS_LB_SHFT	19
+#define CE_LSI_GB_CFG1_RVRS_LB_MASK	(0x1ULL << 19)
+#define CE_LSI_GB_CFG1_RVRS_CLK_SHFT	20
+#define CE_LSI_GB_CFG1_RVRS_CLK_MASK	(0x3ULL << 20)
+#define CE_LSI_GB_CFG1_SLF_TS_SHFT	24
+#define CE_LSI_GB_CFG1_SLF_TS_MASK	(0xfULL << 24)
 
 /* ce_adm_int_mask/ce_adm_int_status register bit defines */
 #define CE_ADM_INT_CE_ERROR_SHFT		0
@@ -592,6 +611,11 @@ typedef volatile struct tioce {
 #define CE_URE_RD_MRG_ENABLE		(0x1ULL << 0)
 #define CE_URE_WRT_MRG_ENABLE1		(0x1ULL << 4)
 #define CE_URE_WRT_MRG_ENABLE2		(0x1ULL << 5)
+#define CE_URE_WRT_MRG_TIMER_SHFT	12
+#define CE_URE_WRT_MRG_TIMER_MASK	(0x7FFULL << CE_URE_WRT_MRG_TIMER_SHFT)
+#define CE_URE_WRT_MRG_TIMER(x)		(((u64)(x) << \
+					  CE_URE_WRT_MRG_TIMER_SHFT) & \
+					 CE_URE_WRT_MRG_TIMER_MASK)
 #define CE_URE_RSPQ_BYPASS_DISABLE	(0x1ULL << 24)
 #define CE_URE_UPS_DAT1_PAR_DISABLE	(0x1ULL << 32)
 #define CE_URE_UPS_HDR1_PAR_DISABLE	(0x1ULL << 33)
@@ -653,8 +677,12 @@ typedef volatile struct tioce {
 #define CE_URE_SI			(0x1ULL << 0)
 #define CE_URE_ELAL_SHFT		4
 #define CE_URE_ELAL_MASK		(0x7ULL << CE_URE_ELAL_SHFT)
+#define CE_URE_ELAL_SET(n)		(((u64)(n) << CE_URE_ELAL_SHFT) & \
+					 CE_URE_ELAL_MASK)
 #define CE_URE_ELAL1_SHFT		8
 #define CE_URE_ELAL1_MASK		(0x7ULL << CE_URE_ELAL1_SHFT)
+#define CE_URE_ELAL1_SET(n)		(((u64)(n) << CE_URE_ELAL1_SHFT) & \
+					 CE_URE_ELAL1_MASK)
 #define CE_URE_SCC			(0x1ULL << 12)
 #define CE_URE_PN1_SHFT			16
 #define CE_URE_PN1_MASK			(0xFFULL << CE_URE_PN1_SHFT)
@@ -675,8 +703,12 @@ typedef volatile struct tioce {
 #define CE_URE_HPC			(0x1ULL << 6)
 #define CE_URE_SPLV_SHFT		7
 #define CE_URE_SPLV_MASK		(0xFFULL << CE_URE_SPLV_SHFT)
+#define CE_URE_SPLV_SET(n)		(((u64)(n) << CE_URE_SPLV_SHFT) & \
+					 CE_URE_SPLV_MASK)
 #define CE_URE_SPLS_SHFT		15
 #define CE_URE_SPLS_MASK		(0x3ULL << CE_URE_SPLS_SHFT)
+#define CE_URE_SPLS_SET(n)		(((u64)(n) << CE_URE_SPLS_SHFT) & \
+					 CE_URE_SPLS_MASK)
 #define CE_URE_PSN1_SHFT		19
 #define CE_URE_PSN1_MASK		(0x1FFFULL << CE_URE_PSN1_SHFT)
 #define CE_URE_PSN2_SHFT		32
-- 
cgit v1.2.3


From e08e6c521355cd33e647b2f739885bc3050eead6 Mon Sep 17 00:00:00 2001
From: Brent Casavant <bcasavan@sgi.com>
Date: Thu, 26 Jan 2006 15:55:52 -0800
Subject: [IA64] hooks to wait for mmio writes to drain when migrating
 processes

On SN2, MMIO writes which are issued from separate processors are not
guaranteed to arrive in any particular order at the IO hardware.  When
performing such writes from the kernel this is not a problem, as a
kernel thread will not migrate to another CPU during execution, and
mmiowb() calls can guarantee write ordering when control of the IO
resource is allowed to move between threads.

However, when MMIO writes can be performed from user space (e.g. DRM)
there are no such guarantees and mechanisms, as the process may
context-switch at any time, and may migrate to a different CPU as part
of the switch.  For such programs/hardware to operate correctly, it is
required that the MMIO writes from the old CPU be accepted by the IO
hardware before subsequent writes from the new CPU can be issued.

The following patch implements this behavior on SN2 by waiting for a
Shub register to indicate that these writes have been accepted.  This
is placed in the context switch-in path, and only performs the wait
when the newly scheduled task changes CPUs.

Signed-off-by: Prarit Bhargava <prarit@sgi.com>
Signed-off-by: Brent Casavant <bcasavan@sgi.com>
---
 arch/ia64/sn/kernel/setup.c       |  6 ++++--
 arch/ia64/sn/kernel/sn2/sn2_smp.c | 23 ++++++++++++++++++++++-
 include/asm-ia64/machvec.h        | 13 +++++++++++++
 include/asm-ia64/machvec_sn2.h    |  4 +++-
 include/asm-ia64/processor.h      |  3 ++-
 include/asm-ia64/system.h         |  7 +++++++
 include/asm-ia64/thread_info.h    |  1 +
 7 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index e510dce9971f..f1c1338b10b4 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -3,7 +3,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 1999,2001-2005 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 1999,2001-2006 Silicon Graphics, Inc. All rights reserved.
  */
 
 #include <linux/config.h>
@@ -496,6 +496,7 @@ void __init sn_setup(char **cmdline_p)
 	 * for sn.
 	 */
 	pm_power_off = ia64_sn_power_down;
+	current->thread.flags |= IA64_THREAD_MIGRATION;
 }
 
 /**
@@ -654,7 +655,8 @@ void __init sn_cpu_init(void)
 			SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3};
 		u64 *pio;
 		pio = is_shub1() ? pio1 : pio2;
-		pda->pio_write_status_addr = (volatile unsigned long *) LOCAL_MMR_ADDR(pio[slice]);
+		pda->pio_write_status_addr =
+		   (volatile unsigned long *)GLOBAL_MMR_ADDR(nasid, pio[slice]);
 		pda->pio_write_status_val = is_shub1() ? SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK : 0;
 	}
 
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 471bbaa65d1b..1b33fd5e4e3a 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -5,7 +5,7 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2000-2006 Silicon Graphics, Inc. All rights reserved.
  */
 
 #include <linux/init.h>
@@ -169,6 +169,27 @@ static inline unsigned long wait_piowc(void)
 	return ws;
 }
 
+/**
+ * sn_migrate - SN-specific task migration actions
+ * @task: Task being migrated to new CPU
+ *
+ * SN2 PIO writes from separate CPUs are not guaranteed to arrive in order.
+ * Context switching user threads which have memory-mapped MMIO may cause
+ * PIOs to issue from separate CPUs, thus the PIO writes must be drained
+ * from the previous CPU's Shub before execution resumes on the new CPU.
+ */
+void sn_migrate(struct task_struct *task)
+{
+	pda_t *last_pda = pdacpu(task_thread_info(task)->last_cpu);
+	volatile unsigned long *adr = last_pda->pio_write_status_addr;
+	unsigned long val = last_pda->pio_write_status_val;
+
+	/* Drain PIO writes from old CPU's Shub */
+	while (unlikely((*adr & SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK)
+			!= val))
+		cpu_relax();
+}
+
 void sn_tlb_migrate_finish(struct mm_struct *mm)
 {
 	if (mm == current->mm)
diff --git a/include/asm-ia64/machvec.h b/include/asm-ia64/machvec.h
index ca5ea994d688..c3e4ed8a3e17 100644
--- a/include/asm-ia64/machvec.h
+++ b/include/asm-ia64/machvec.h
@@ -20,6 +20,7 @@ struct scatterlist;
 struct page;
 struct mm_struct;
 struct pci_bus;
+struct task_struct;
 
 typedef void ia64_mv_setup_t (char **);
 typedef void ia64_mv_cpu_init_t (void);
@@ -34,6 +35,7 @@ typedef int ia64_mv_pci_legacy_read_t (struct pci_bus *, u16 port, u32 *val,
 				       u8 size);
 typedef int ia64_mv_pci_legacy_write_t (struct pci_bus *, u16 port, u32 val,
 					u8 size);
+typedef void ia64_mv_migrate_t(struct task_struct * task);
 
 /* DMA-mapping interface: */
 typedef void ia64_mv_dma_init (void);
@@ -85,6 +87,11 @@ machvec_noop_mm (struct mm_struct *mm)
 {
 }
 
+static inline void
+machvec_noop_task (struct task_struct *task)
+{
+}
+
 extern void machvec_setup (char **);
 extern void machvec_timer_interrupt (int, void *, struct pt_regs *);
 extern void machvec_dma_sync_single (struct device *, dma_addr_t, size_t, int);
@@ -146,6 +153,7 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *);
 #  define platform_readw_relaxed        ia64_mv.readw_relaxed
 #  define platform_readl_relaxed        ia64_mv.readl_relaxed
 #  define platform_readq_relaxed        ia64_mv.readq_relaxed
+#  define platform_migrate		ia64_mv.migrate
 # endif
 
 /* __attribute__((__aligned__(16))) is required to make size of the
@@ -194,6 +202,7 @@ struct ia64_machine_vector {
 	ia64_mv_readw_relaxed_t *readw_relaxed;
 	ia64_mv_readl_relaxed_t *readl_relaxed;
 	ia64_mv_readq_relaxed_t *readq_relaxed;
+	ia64_mv_migrate_t *migrate;
 } __attribute__((__aligned__(16))); /* align attrib? see above comment */
 
 #define MACHVEC_INIT(name)			\
@@ -238,6 +247,7 @@ struct ia64_machine_vector {
 	platform_readw_relaxed,			\
 	platform_readl_relaxed,			\
 	platform_readq_relaxed,			\
+	platform_migrate,			\
 }
 
 extern struct ia64_machine_vector ia64_mv;
@@ -386,5 +396,8 @@ extern ia64_mv_dma_supported		swiotlb_dma_supported;
 #ifndef platform_readq_relaxed
 # define platform_readq_relaxed	__ia64_readq_relaxed
 #endif
+#ifndef platform_migrate
+# define platform_migrate machvec_noop_task
+#endif
 
 #endif /* _ASM_IA64_MACHVEC_H */
diff --git a/include/asm-ia64/machvec_sn2.h b/include/asm-ia64/machvec_sn2.h
index e1b6cd63f49e..6f0021bb3874 100644
--- a/include/asm-ia64/machvec_sn2.h
+++ b/include/asm-ia64/machvec_sn2.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2002-2003,2006 Silicon Graphics, Inc.  All Rights Reserved.
  * 
  * This program is free software; you can redistribute it and/or modify it 
  * under the terms of version 2 of the GNU General Public License 
@@ -71,6 +71,7 @@ extern ia64_mv_dma_sync_single_for_device sn_dma_sync_single_for_device;
 extern ia64_mv_dma_sync_sg_for_device	sn_dma_sync_sg_for_device;
 extern ia64_mv_dma_mapping_error	sn_dma_mapping_error;
 extern ia64_mv_dma_supported		sn_dma_supported;
+extern ia64_mv_migrate_t		sn_migrate;
 
 /*
  * This stuff has dual use!
@@ -120,6 +121,7 @@ extern ia64_mv_dma_supported		sn_dma_supported;
 #define platform_dma_sync_sg_for_device	sn_dma_sync_sg_for_device
 #define platform_dma_mapping_error		sn_dma_mapping_error
 #define platform_dma_supported		sn_dma_supported
+#define platform_migrate		sn_migrate
 
 #include <asm/sn/io.h>
 
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 09b99029ac1a..29d5574d4375 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -50,7 +50,8 @@
 #define IA64_THREAD_PM_VALID	(__IA64_UL(1) << 2)	/* performance registers valid? */
 #define IA64_THREAD_UAC_NOPRINT	(__IA64_UL(1) << 3)	/* don't log unaligned accesses */
 #define IA64_THREAD_UAC_SIGBUS	(__IA64_UL(1) << 4)	/* generate SIGBUS on unaligned acc. */
-							/* bit 5 is currently unused */
+#define IA64_THREAD_MIGRATION	(__IA64_UL(1) << 5)	/* require migration
+							   sync at ctx sw */
 #define IA64_THREAD_FPEMU_NOPRINT (__IA64_UL(1) << 6)	/* don't log any fpswa faults */
 #define IA64_THREAD_FPEMU_SIGFPE  (__IA64_UL(1) << 7)	/* send a SIGFPE for fpswa faults */
 
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h
index 80c5a234e259..99b6f307e94b 100644
--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -244,6 +244,13 @@ extern void ia64_load_extra (struct task_struct *task);
 		__ia64_save_fpu((prev)->thread.fph);				\
 	}									\
 	__switch_to(prev, next, last);						\
+	/* "next" in old context is "current" in new context */			\
+	if (unlikely((current->thread.flags & IA64_THREAD_MIGRATION) &&	       \
+		     (task_cpu(current) !=				       \
+		      		      task_thread_info(current)->last_cpu))) { \
+		platform_migrate(current);				       \
+		task_thread_info(current)->last_cpu = task_cpu(current);       \
+	}								       \
 } while (0)
 #else
 # define switch_to(prev,next,last)	__switch_to(prev, next, last)
diff --git a/include/asm-ia64/thread_info.h b/include/asm-ia64/thread_info.h
index 1d6518fe1f02..81641a6905d1 100644
--- a/include/asm-ia64/thread_info.h
+++ b/include/asm-ia64/thread_info.h
@@ -26,6 +26,7 @@ struct thread_info {
 	struct exec_domain *exec_domain;/* execution domain */
 	__u32 flags;			/* thread_info flags (see TIF_*) */
 	__u32 cpu;			/* current CPU */
+	__u32 last_cpu;			/* Last CPU thread ran on */
 	mm_segment_t addr_limit;	/* user-level address space limit */
 	int preempt_count;		/* 0=premptable, <0=BUG; will also serve as bh-counter */
 	struct restart_block restart_block;
-- 
cgit v1.2.3


From 4bf64e72bd499d2bf3509c2dc60d09c39f72c782 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Thu, 26 Jan 2006 16:58:52 -0800
Subject: [IA64] map ia64_hint definition to intel compiler intrinsic

Map ia64_hint() to internal intel compiler intrinsic.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/intel_intrin.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/asm-ia64/intel_intrin.h b/include/asm-ia64/intel_intrin.h
index a7122d850177..88261ce79860 100644
--- a/include/asm-ia64/intel_intrin.h
+++ b/include/asm-ia64/intel_intrin.h
@@ -122,7 +122,8 @@ __s64 _m64_popcnt(__s64 a);
 #define ia64_getreg		__getReg
 #define ia64_setreg		__setReg
 
-#define ia64_hint(x)
+#define ia64_hint		__hint
+#define ia64_hint_pause		__hint_pause
 
 #define ia64_mux1_brcst	 0
 #define ia64_mux1_mix		 8
-- 
cgit v1.2.3


From c583f66dc41cfa4055b6ac8f50cc1ebf362298f7 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Thu, 26 Jan 2006 17:08:47 -0800
Subject: [IA64] clean up asm/intel_intrin.h

Include intrinsic header file from icc compiler.  Remove
duplicate definition from kernel source.

Signed-off-by: HJ Lu <hongjiu.lu@intel.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/intel_intrin.h | 109 ++--------------------------------------
 1 file changed, 3 insertions(+), 106 deletions(-)

diff --git a/include/asm-ia64/intel_intrin.h b/include/asm-ia64/intel_intrin.h
index 88261ce79860..c7ac04945c12 100644
--- a/include/asm-ia64/intel_intrin.h
+++ b/include/asm-ia64/intel_intrin.h
@@ -5,113 +5,10 @@
  *
  * Copyright (C) 2002,2003 Jun Nakajima <jun.nakajima@intel.com>
  * Copyright (C) 2002,2003 Suresh Siddha <suresh.b.siddha@intel.com>
+ * Copyright (C) 2005,2006 Hongjiu Lu <hongjiu.lu@intel.com>
  *
  */
-#include <asm/types.h>
-
-void  __lfetch(int lfhint, void *y);
-void  __lfetch_excl(int lfhint, void *y);
-void  __lfetch_fault(int lfhint, void *y);
-void  __lfetch_fault_excl(int lfhint, void *y);
-
-/* In the following, whichFloatReg should be an integer from 0-127 */
-void  __ldfs(const int whichFloatReg, void *src);
-void  __ldfd(const int whichFloatReg, void *src);
-void  __ldfe(const int whichFloatReg, void *src);
-void  __ldf8(const int whichFloatReg, void *src);
-void  __ldf_fill(const int whichFloatReg, void *src);
-void  __stfs(void *dst, const int whichFloatReg);
-void  __stfd(void *dst, const int whichFloatReg);
-void  __stfe(void *dst, const int whichFloatReg);
-void  __stf8(void *dst, const int whichFloatReg);
-void  __stf_spill(void *dst, const int whichFloatReg);
-
-void  __st1_rel(void *dst, const __s8  value);
-void  __st2_rel(void *dst, const __s16 value);
-void  __st4_rel(void *dst, const __s32 value);
-void  __st8_rel(void *dst, const __s64 value);
-__u8  __ld1_acq(void *src);
-__u16 __ld2_acq(void *src);
-__u32 __ld4_acq(void *src);
-__u64 __ld8_acq(void *src);
-
-__u64 __fetchadd4_acq(__u32 *addend, const int increment);
-__u64 __fetchadd4_rel(__u32 *addend, const int increment);
-__u64 __fetchadd8_acq(__u64 *addend, const int increment);
-__u64 __fetchadd8_rel(__u64 *addend, const int increment);
-
-__u64 __getf_exp(double d);
-
-/* OS Related Itanium(R) Intrinsics  */
-
-/* The names to use for whichReg and whichIndReg below come from
-   the include file asm/ia64regs.h */
-
-__u64 __getIndReg(const int whichIndReg, __s64 index);
-__u64 __getReg(const int whichReg);
-
-void  __setIndReg(const int whichIndReg, __s64 index, __u64 value);
-void  __setReg(const int whichReg, __u64 value);
-
-void  __mf(void);
-void  __mfa(void);
-void  __synci(void);
-void  __itcd(__s64 pa);
-void  __itci(__s64 pa);
-void  __itrd(__s64 whichTransReg, __s64 pa);
-void  __itri(__s64 whichTransReg, __s64 pa);
-void  __ptce(__s64 va);
-void  __ptcl(__s64 va, __s64 pagesz);
-void  __ptcg(__s64 va, __s64 pagesz);
-void  __ptcga(__s64 va, __s64 pagesz);
-void  __ptri(__s64 va, __s64 pagesz);
-void  __ptrd(__s64 va, __s64 pagesz);
-void  __invala (void);
-void  __invala_gr(const int whichGeneralReg /* 0-127 */ );
-void  __invala_fr(const int whichFloatReg /* 0-127 */ );
-void  __nop(const int);
-void  __fc(__u64 *addr);
-void  __sum(int mask);
-void  __rum(int mask);
-void  __ssm(int mask);
-void  __rsm(int mask);
-__u64 __thash(__s64);
-__u64 __ttag(__s64);
-__s64 __tpa(__s64);
-
-/* Intrinsics for implementing get/put_user macros */
-void __st_user(const char *tableName, __u64 addr, char size, char relocType, __u64 val);
-void __ld_user(const char *tableName, __u64 addr, char size, char relocType);
-
-/* This intrinsic does not generate code, it creates a barrier across which
- * the compiler will not schedule data access instructions.
- */
-void __memory_barrier(void);
-
-void __isrlz(void);
-void __dsrlz(void);
-
-__u64  _m64_mux1(__u64 a, const int n);
-__u64  __thash(__u64);
-
-/* Lock and Atomic Operation Related Intrinsics */
-__u64 _InterlockedExchange8(volatile __u8 *trgt, __u8 value);
-__u64 _InterlockedExchange16(volatile __u16 *trgt, __u16 value);
-__s64 _InterlockedExchange(volatile __u32 *trgt, __u32 value);
-__s64 _InterlockedExchange64(volatile __u64 *trgt, __u64 value);
-
-__u64 _InterlockedCompareExchange8_rel(volatile __u8 *dest, __u64 xchg, __u64 comp);
-__u64 _InterlockedCompareExchange8_acq(volatile __u8 *dest, __u64 xchg, __u64 comp);
-__u64 _InterlockedCompareExchange16_rel(volatile __u16 *dest, __u64 xchg, __u64 comp);
-__u64 _InterlockedCompareExchange16_acq(volatile __u16 *dest, __u64 xchg, __u64 comp);
-__u64 _InterlockedCompareExchange_rel(volatile __u32 *dest, __u64 xchg, __u64 comp);
-__u64 _InterlockedCompareExchange_acq(volatile __u32 *dest, __u64 xchg, __u64 comp);
-__u64 _InterlockedCompareExchange64_rel(volatile __u64 *dest, __u64 xchg, __u64 comp);
-__u64 _InterlockedCompareExchange64_acq(volatile __u64 *dest, __u64 xchg, __u64 comp);
-
-__s64 _m64_dep_mi(const int v, __s64 s, const int p, const int len);
-__s64 _m64_shrp(__s64 a, __s64 b, const int count);
-__s64 _m64_popcnt(__s64 a);
+#include <ia64intrin.h>
 
 #define ia64_barrier()		__memory_barrier()
 
@@ -131,7 +28,7 @@ __s64 _m64_popcnt(__s64 a);
 #define ia64_mux1_alt		10
 #define ia64_mux1_rev		11
 
-#define ia64_mux1		_m64_mux1
+#define ia64_mux1(x,v)		_m_to_int64(_m64_mux1(_m_from_int64(x), (v)))
 #define ia64_popcnt		_m64_popcnt
 #define ia64_getf_exp		__getf_exp
 #define ia64_shrp		_m64_shrp
-- 
cgit v1.2.3


From 9df79decc395b2f9484ff93a1383ba705ff34b10 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Thu, 26 Jan 2006 17:12:02 -0800
Subject: [IA64] add __builtin_trap definition for icc build

Map __builtin_trap function to break 0 instruction.

Signed-off-by: HJ Lu <hongjiu.lu@intel.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/intel_intrin.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/asm-ia64/intel_intrin.h b/include/asm-ia64/intel_intrin.h
index c7ac04945c12..67a4bafdc306 100644
--- a/include/asm-ia64/intel_intrin.h
+++ b/include/asm-ia64/intel_intrin.h
@@ -152,4 +152,6 @@ do {							\
 	}						\
 } while (0)
 
+#define __builtin_trap()	__break(0)	/* icc: map to "break 0" */
+
 #endif /* _ASM_IA64_INTEL_INTRIN_H */
-- 
cgit v1.2.3


From 412e6a378260608bf28f29d4fa8a9241e0240a2d Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Thu, 26 Jan 2006 17:19:46 -0800
Subject: [IA64] use icc defined constant

Use icc defined constant instead of magic number.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/asm-ia64/intel_intrin.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/asm-ia64/intel_intrin.h b/include/asm-ia64/intel_intrin.h
index 67a4bafdc306..d069b6acddce 100644
--- a/include/asm-ia64/intel_intrin.h
+++ b/include/asm-ia64/intel_intrin.h
@@ -22,11 +22,11 @@
 #define ia64_hint		__hint
 #define ia64_hint_pause		__hint_pause
 
-#define ia64_mux1_brcst	 0
-#define ia64_mux1_mix		 8
-#define ia64_mux1_shuf		 9
-#define ia64_mux1_alt		10
-#define ia64_mux1_rev		11
+#define ia64_mux1_brcst		_m64_mux1_brcst
+#define ia64_mux1_mix		_m64_mux1_mix
+#define ia64_mux1_shuf		_m64_mux1_shuf
+#define ia64_mux1_alt		_m64_mux1_alt
+#define ia64_mux1_rev		_m64_mux1_rev
 
 #define ia64_mux1(x,v)		_m_to_int64(_m64_mux1(_m_from_int64(x), (v)))
 #define ia64_popcnt		_m64_popcnt
@@ -56,7 +56,7 @@
 #define ia64_stf8		__stf8
 #define ia64_stf_spill		__stf_spill
 
-#define ia64_mf		__mf
+#define ia64_mf			__mf
 #define ia64_mfa		__mfa
 
 #define ia64_fetchadd4_acq	__fetchadd4_acq
@@ -132,10 +132,10 @@
 
 /* Values for lfhint in __lfetch and __lfetch_fault */
 
-#define ia64_lfhint_none   	0
-#define ia64_lfhint_nt1    	1
-#define ia64_lfhint_nt2    	2
-#define ia64_lfhint_nta    	3
+#define ia64_lfhint_none	__lfhint_none
+#define ia64_lfhint_nt1		__lfhint_nt1
+#define ia64_lfhint_nt2		__lfhint_nt2
+#define ia64_lfhint_nta		__lfhint_nta
 
 #define ia64_lfetch		__lfetch
 #define ia64_lfetch_excl	__lfetch_excl
-- 
cgit v1.2.3


From dcc1dd2366a7c355fd8b6543c52685b864a2044f Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Tue, 7 Feb 2006 09:24:14 -0800
Subject: [IA64-SGI] - Eliminate SN pio_phys_xxx macros. Move to assembly

Rewrite the SN pio_phys_xxx macros in assembly language. This
avoids issues with the Intel icc compiler. Function call
overhead is not an issue - the functions reference PIOs
and take 100's nsec to complete.

In addition, the functions should likely be in assembly
language anyway - they reference memory using physical
addressing mode. One function executes with psr.ic disabled.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/Makefile   |  3 +-
 arch/ia64/sn/kernel/pio_phys.S | 71 ++++++++++++++++++++++++++++++++++++++++++
 include/asm-ia64/sn/rw_mmr.h   | 56 +++------------------------------
 3 files changed, 78 insertions(+), 52 deletions(-)
 create mode 100644 arch/ia64/sn/kernel/pio_phys.S

diff --git a/arch/ia64/sn/kernel/Makefile b/arch/ia64/sn/kernel/Makefile
index 3e9b4eea7418..ab9c48c88012 100644
--- a/arch/ia64/sn/kernel/Makefile
+++ b/arch/ia64/sn/kernel/Makefile
@@ -10,7 +10,8 @@
 CPPFLAGS += -I$(srctree)/arch/ia64/sn/include
 
 obj-y				+= setup.o bte.o bte_error.o irq.o mca.o idle.o \
-				   huberror.o io_init.o iomv.o klconflib.o sn2/
+				   huberror.o io_init.o iomv.o klconflib.o pio_phys.o \
+				   sn2/
 obj-$(CONFIG_IA64_GENERIC)      += machvec.o
 obj-$(CONFIG_SGI_TIOCX)		+= tiocx.o
 obj-$(CONFIG_IA64_SGI_SN_XP)	+= xp.o
diff --git a/arch/ia64/sn/kernel/pio_phys.S b/arch/ia64/sn/kernel/pio_phys.S
new file mode 100644
index 000000000000..3c7d48d6ecb8
--- /dev/null
+++ b/arch/ia64/sn/kernel/pio_phys.S
@@ -0,0 +1,71 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved.
+ *
+ * This file contains functions used to access MMR registers via
+ * uncached physical addresses.
+ *      pio_phys_read_mmr  - read an MMR
+ *      pio_phys_write_mmr - write an MMR
+ *      pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0
+ *              Second MMR will be skipped if address is NULL
+ *
+ * Addresses passed to these routines should be uncached physical addresses
+ * 	ie., 0x80000....
+ */
+
+
+
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+GLOBAL_ENTRY(pio_phys_read_mmr)
+	.prologue			// long pio_phys_read_mmr(volatile long *mmr)
+	.regstk 1,0,0,0			// one input register: r32 = mmr
+	.body
+	mov r2=psr			// save PSR so it can be restored below
+	rsm psr.i | psr.dt		// clear psr.i and psr.dt for the physical access
+	;;
+	srlz.d				// serialize the PSR change before the load
+	ld8.acq r8=[r32]		// 8-byte load from [r32] into r8 (return value)
+	;;
+	mov psr.l=r2;;			// restore the saved PSR bits
+	srlz.d
+	br.ret.sptk.many rp
+END(pio_phys_read_mmr)
+
+GLOBAL_ENTRY(pio_phys_write_mmr)
+	.prologue			// void pio_phys_write_mmr(volatile long *mmr, long val)
+	.regstk 2,0,0,0			// two input registers: r32 = mmr, r33 = val
+	.body
+	mov r2=psr			// save PSR so it can be restored below
+	rsm psr.i | psr.dt		// clear psr.i and psr.dt for the physical access
+	;;
+	srlz.d				// serialize the PSR change before the store
+	st8.rel [r32]=r33		// 8-byte store of r33 to [r32]
+	;;
+	mov psr.l=r2;;			// restore the saved PSR bits
+	srlz.d
+	br.ret.sptk.many rp
+END(pio_phys_write_mmr)
+
+GLOBAL_ENTRY(pio_atomic_phys_write_mmrs)
+	.prologue			// args: mmr1, val1, mmr2, val2 (see asm/sn/rw_mmr.h)
+	.regstk 4,0,0,0			// four input registers: r32..r35
+	.body
+	mov r2=psr			// save PSR so it can be restored below
+	cmp.ne p9,p0=r34,r0;		// p9 = (mmr2 != NULL)
+	rsm psr.i | psr.dt | psr.ic	// as above, plus psr.ic=0 for the paired writes
+	;;
+	srlz.d				// serialize the PSR change before the stores
+	st8.rel [r32]=r33		// first MMR: store r33 to [r32]
+(p9)	st8.rel [r34]=r35		// second MMR: skipped when its address is NULL
+	;;
+	mov psr.l=r2;;			// restore the saved PSR bits
+	srlz.d
+	br.ret.sptk.many rp
+END(pio_atomic_phys_write_mmrs)
+
+
diff --git a/include/asm-ia64/sn/rw_mmr.h b/include/asm-ia64/sn/rw_mmr.h
index f40fd1a5510d..2d78f4c5a45e 100644
--- a/include/asm-ia64/sn/rw_mmr.h
+++ b/include/asm-ia64/sn/rw_mmr.h
@@ -3,15 +3,14 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  *
- * Copyright (C) 2002-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (C) 2002-2006 Silicon Graphics, Inc.  All Rights Reserved.
  */
 #ifndef _ASM_IA64_SN_RW_MMR_H
 #define _ASM_IA64_SN_RW_MMR_H
 
 
 /*
- * This file contains macros used to access MMR registers via
- * uncached physical addresses.
+ * Declarations of functions that access MMRs via uncached physical addresses.
  * 	pio_phys_read_mmr  - read an MMR
  * 	pio_phys_write_mmr - write an MMR
  * 	pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0
@@ -22,53 +21,8 @@
  */
 
 
-extern inline long
-pio_phys_read_mmr(volatile long *mmr) 
-{
-	long val;
-        asm volatile
-            ("mov r2=psr;;"
-             "rsm psr.i | psr.dt;;"
-             "srlz.i;;"
-             "ld8.acq %0=[%1];;"
-             "mov psr.l=r2;;"
-             "srlz.i;;"
-             : "=r"(val)
-             : "r"(mmr)
-	     : "r2");
-        return val;
-}
-
-
-
-extern inline void
-pio_phys_write_mmr(volatile long *mmr, long val) 
-{
-        asm volatile
-            ("mov r2=psr;;"
-             "rsm psr.i | psr.dt;;"
-             "srlz.i;;"
-             "st8.rel [%0]=%1;;"
-             "mov psr.l=r2;;"
-             "srlz.i;;"
-	     :: "r"(mmr), "r"(val)
-             : "r2", "memory");
-}            
-
-extern inline void
-pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1, volatile long *mmr2, long val2) 
-{
-        asm volatile
-            ("mov r2=psr;;"
-             "rsm psr.i | psr.dt | psr.ic;;"
-	     "cmp.ne p9,p0=%2,r0;"
-             "srlz.i;;"
-             "st8.rel [%0]=%1;"
-             "(p9) st8.rel [%2]=%3;;"
-             "mov psr.l=r2;;"
-             "srlz.i;;"
-	     :: "r"(mmr1), "r"(val1), "r"(mmr2), "r"(val2)
-             : "p9", "r2", "memory");
-}            
+extern long pio_phys_read_mmr(volatile long *mmr); 
+extern void pio_phys_write_mmr(volatile long *mmr, long val);
+extern void pio_atomic_phys_write_mmrs(volatile long *mmr1, long val1, volatile long *mmr2, long val2); 
 
 #endif /* _ASM_IA64_SN_RW_MMR_H */
-- 
cgit v1.2.3


From 9336b0836bf789136b51caf9ddd49dcbf1726cf4 Mon Sep 17 00:00:00 2001
From: Keith Owens <kaos@sgi.com>
Date: Wed, 8 Feb 2006 13:40:59 +1100
Subject: [IA64] MCA: print messages in MCA handler

Print a message identifying the monarch MCA handler.  Print a summary
of the status of the slave MCA cpus.

Signed-off-by: Keith Owens <kaos@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/mca.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index ee7eec9ee576..23d54413c006 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -908,7 +908,7 @@ no_mod:
 static void
 ia64_wait_for_slaves(int monarch)
 {
-	int c, wait = 0;
+	int c, wait = 0, missing = 0;
 	for_each_online_cpu(c) {
 		if (c == monarch)
 			continue;
@@ -919,15 +919,32 @@ ia64_wait_for_slaves(int monarch)
 		}
 	}
 	if (!wait)
-		return;
+		goto all_in;
 	for_each_online_cpu(c) {
 		if (c == monarch)
 			continue;
 		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) {
 			udelay(5*1000000);	/* wait 5 seconds for slaves (arbitrary) */
+			if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
+				missing = 1;
 			break;
 		}
 	}
+	if (!missing)
+		goto all_in;
+	printk(KERN_INFO "OS MCA slave did not rendezvous on cpu");
+	for_each_online_cpu(c) {
+		if (c == monarch)
+			continue;
+		if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
+			printk(" %d", c);
+	}
+	printk("\n");
+	return;
+
+all_in:
+	printk(KERN_INFO "All OS MCA slaves have reached rendezvous\n");
+	return;
 }
 
 /*
@@ -953,6 +970,10 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 	task_t *previous_current;
 
 	oops_in_progress = 1;	/* FIXME: make printk NMI/MCA/INIT safe */
+	console_loglevel = 15;	/* make sure printks make it to console */
+	printk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d monarch=%ld\n",
+		sos->proc_state_param, cpu, sos->monarch);
+
 	previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
 	monarch_cpu = cpu;
 	if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0)
-- 
cgit v1.2.3


From e9ac054daaecf8a11f2113b60f2b6ce381c4f131 Mon Sep 17 00:00:00 2001
From: Keith Owens <kaos@sgi.com>
Date: Wed, 8 Feb 2006 13:41:04 +1100
Subject: [IA64] MCA: update MCA comm field for user space tasks

Update the comm field on the MCA handler for user tasks as well as for
verified kernel tasks.  This helps to identify the task that was
running when the MCA occurred.

Signed-off-by: Keith Owens <kaos@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/mca.c | 52 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 23d54413c006..4f8464ead634 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -630,6 +630,32 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat)
 	*tnat |= (nat << tslot);
 }
 
+/* Change the comm field on the MCA/INIT task to include the pid that
+ * was interrupted, it makes for easier debugging.  If that pid was 0
+ * (swapper or nested MCA/INIT) then use the start of the previous comm
+ * field (up to its first space) suffixed with its cpu.
+ */
+static void
+ia64_mca_modify_comm(const task_t *previous_current)
+{
+	char *p, comm[sizeof(current->comm)];
+	if (previous_current->pid)
+		snprintf(comm, sizeof(comm), "%s %d",
+			current->comm, previous_current->pid);
+	else {
+		int l;	/* length of previous comm up to its first space */
+		if ((p = strchr(previous_current->comm, ' ')))
+			l = p - previous_current->comm;
+		else
+			l = strlen(previous_current->comm);
+		/* ".*" precision cuts comm at l chars; a "*" width only pads */
+		snprintf(comm, sizeof(comm), "%s %.*s %d",
+			current->comm, l, previous_current->comm,
+			task_thread_info(previous_current)->cpu);
+	}
+	memcpy(current->comm, comm, sizeof(current->comm));
+}
+
 /* On entry to this routine, we are running on the per cpu stack, see
  * mca_asm.h.  The original stack has not been touched by this event.  Some of
  * the original stack's registers will be in the RBS on this stack.  This stack
@@ -648,7 +674,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 		struct ia64_sal_os_state *sos,
 		const char *type)
 {
-	char *p, comm[sizeof(current->comm)];
+	char *p;
 	ia64_va va;
 	extern char ia64_leave_kernel[];	/* Need asm address, not function descriptor */
 	const pal_min_state_area_t *ms = sos->pal_min_state;
@@ -721,6 +747,10 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 	/* Verify the previous stack state before we change it */
 	if (user_mode(regs)) {
 		msg = "occurred in user space";
+		/* previous_current is guaranteed to be valid when the task was
+		 * in user space, so ...
+		 */
+		ia64_mca_modify_comm(previous_current);
 		goto no_mod;
 	}
 	if (r13 != sos->prev_IA64_KR_CURRENT) {
@@ -750,25 +780,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 		goto no_mod;
 	}
 
-	/* Change the comm field on the MCA/INT task to include the pid that
-	 * was interrupted, it makes for easier debugging.  If that pid was 0
-	 * (swapper or nested MCA/INIT) then use the start of the previous comm
-	 * field suffixed with its cpu.
-	 */
-	if (previous_current->pid)
-		snprintf(comm, sizeof(comm), "%s %d",
-			current->comm, previous_current->pid);
-	else {
-		int l;
-		if ((p = strchr(previous_current->comm, ' ')))
-			l = p - previous_current->comm;
-		else
-			l = strlen(previous_current->comm);
-		snprintf(comm, sizeof(comm), "%s %*s %d",
-			current->comm, l, previous_current->comm,
-			task_thread_info(previous_current)->cpu);
-	}
-	memcpy(current->comm, comm, sizeof(current->comm));
+	ia64_mca_modify_comm(previous_current);
 
 	/* Make the original task look blocked.  First stack a struct pt_regs,
 	 * describing the state at the time of interrupt.  mca_asm.S built a
-- 
cgit v1.2.3


From 2730c9295a9a797a22b800d3befd6a64fdc56b02 Mon Sep 17 00:00:00 2001
From: Keith Owens <kaos@sgi.com>
Date: Wed, 8 Feb 2006 13:41:10 +1100
Subject: [IA64] MCA: remove obsolete ifdef

No platform in the community tree uses PLATFORM_MCA_HANDLERS, remove
the references.

Signed-off-by: Keith Owens <kaos@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/mca.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 4f8464ead634..909fed2c2499 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -281,11 +281,6 @@ ia64_mca_log_sal_error_record(int sal_info_type)
 		ia64_sal_clear_state_info(sal_info_type);
 }
 
-/*
- * platform dependent error handling
- */
-#ifndef PLATFORM_MCA_HANDLERS
-
 #ifdef CONFIG_ACPI
 
 int cpe_vector = -1;
@@ -377,8 +372,6 @@ ia64_mca_register_cpev (int cpev)
 }
 #endif /* CONFIG_ACPI */
 
-#endif /* PLATFORM_MCA_HANDLERS */
-
 /*
  * ia64_mca_cmc_vector_setup
  *
-- 
cgit v1.2.3


From 7aa6ba41362a7f888ad11fdcfe51ca8d92226cd3 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Fri, 17 Feb 2006 05:18:43 -0500
Subject: [IA64-SGI] SN2-XP reduce kmalloc wrapper inlining

Take advantage of kzalloc() as well as reduce the size of code generated
for the error returns in xpc_setup_infrastructure().

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Dean Nelson <dcn@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/sn/kernel/xpc_channel.c   | 102 +++++++++++++++++++++---------------
 arch/ia64/sn/kernel/xpc_main.c      |   1 -
 arch/ia64/sn/kernel/xpc_partition.c |  28 +++++++++-
 include/asm-ia64/sn/xpc.h           |  22 --------
 4 files changed, 85 insertions(+), 68 deletions(-)

diff --git a/arch/ia64/sn/kernel/xpc_channel.c b/arch/ia64/sn/kernel/xpc_channel.c
index cdf6856ce089..d0abddd9ffe6 100644
--- a/arch/ia64/sn/kernel/xpc_channel.c
+++ b/arch/ia64/sn/kernel/xpc_channel.c
@@ -21,7 +21,6 @@
 #include <linux/sched.h>
 #include <linux/cache.h>
 #include <linux/interrupt.h>
-#include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/completion.h>
 #include <asm/sn/bte.h>
@@ -29,6 +28,31 @@
 #include <asm/sn/xpc.h>
 
 
+/*
+ * kzalloc cacheline-aligned memory (NULL on failure); kfree() via *base.
+ */
+static void *
+xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
+{
+	/* see if kzalloc will give us cacheline aligned memory by default */
+	*base = kzalloc(size, flags);
+	if (*base == NULL) {
+		return NULL;
+	}
+	if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) {
+		return *base;
+	}
+	kfree(*base);
+
+	/* nope, so over-allocate by L1_CACHE_BYTES and align within that */
+	*base = kzalloc(size + L1_CACHE_BYTES, flags);
+	if (*base == NULL) {
+		return NULL;
+	}
+	return (void *) L1_CACHE_ALIGN((u64) *base);
+}
+
+
 /*
  * Set up the initial values for the XPartition Communication channels.
  */
@@ -93,20 +117,19 @@ xpc_setup_infrastructure(struct xpc_partition *part)
 	 * Allocate all of the channel structures as a contiguous chunk of
 	 * memory.
 	 */
-	part->channels = kmalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS,
+	part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS,
 								GFP_KERNEL);
 	if (part->channels == NULL) {
 		dev_err(xpc_chan, "can't get memory for channels\n");
 		return xpcNoMemory;
 	}
-	memset(part->channels, 0, sizeof(struct xpc_channel) * XPC_NCHANNELS);
 
 	part->nchannels = XPC_NCHANNELS;
 
 
 	/* allocate all the required GET/PUT values */
 
-	part->local_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE,
+	part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
 					GFP_KERNEL, &part->local_GPs_base);
 	if (part->local_GPs == NULL) {
 		kfree(part->channels);
@@ -115,55 +138,51 @@ xpc_setup_infrastructure(struct xpc_partition *part)
 			"values\n");
 		return xpcNoMemory;
 	}
-	memset(part->local_GPs, 0, XPC_GP_SIZE);
 
-	part->remote_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE,
+	part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE,
 					GFP_KERNEL, &part->remote_GPs_base);
 	if (part->remote_GPs == NULL) {
-		kfree(part->channels);
-		part->channels = NULL;
-		kfree(part->local_GPs_base);
-		part->local_GPs = NULL;
 		dev_err(xpc_chan, "can't get memory for remote get/put "
 			"values\n");
+		kfree(part->local_GPs_base);
+		part->local_GPs = NULL;
+		kfree(part->channels);
+		part->channels = NULL;
 		return xpcNoMemory;
 	}
-	memset(part->remote_GPs, 0, XPC_GP_SIZE);
 
 
 	/* allocate all the required open and close args */
 
-	part->local_openclose_args = xpc_kmalloc_cacheline_aligned(
+	part->local_openclose_args = xpc_kzalloc_cacheline_aligned(
 					XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
 					&part->local_openclose_args_base);
 	if (part->local_openclose_args == NULL) {
-		kfree(part->channels);
-		part->channels = NULL;
-		kfree(part->local_GPs_base);
-		part->local_GPs = NULL;
+		dev_err(xpc_chan, "can't get memory for local connect args\n");
 		kfree(part->remote_GPs_base);
 		part->remote_GPs = NULL;
-		dev_err(xpc_chan, "can't get memory for local connect args\n");
+		kfree(part->local_GPs_base);
+		part->local_GPs = NULL;
+		kfree(part->channels);
+		part->channels = NULL;
 		return xpcNoMemory;
 	}
-	memset(part->local_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE);
 
-	part->remote_openclose_args = xpc_kmalloc_cacheline_aligned(
+	part->remote_openclose_args = xpc_kzalloc_cacheline_aligned(
 					XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL,
 					&part->remote_openclose_args_base);
 	if (part->remote_openclose_args == NULL) {
-		kfree(part->channels);
-		part->channels = NULL;
-		kfree(part->local_GPs_base);
-		part->local_GPs = NULL;
-		kfree(part->remote_GPs_base);
-		part->remote_GPs = NULL;
+		dev_err(xpc_chan, "can't get memory for remote connect args\n");
 		kfree(part->local_openclose_args_base);
 		part->local_openclose_args = NULL;
-		dev_err(xpc_chan, "can't get memory for remote connect args\n");
+		kfree(part->remote_GPs_base);
+		part->remote_GPs = NULL;
+		kfree(part->local_GPs_base);
+		part->local_GPs = NULL;
+		kfree(part->channels);
+		part->channels = NULL;
 		return xpcNoMemory;
 	}
-	memset(part->remote_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE);
 
 
 	xpc_initialize_channels(part, partid);
@@ -186,18 +205,18 @@ xpc_setup_infrastructure(struct xpc_partition *part)
 	ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, SA_SHIRQ,
 				part->IPI_owner, (void *) (u64) partid);
 	if (ret != 0) {
-		kfree(part->channels);
-		part->channels = NULL;
-		kfree(part->local_GPs_base);
-		part->local_GPs = NULL;
-		kfree(part->remote_GPs_base);
-		part->remote_GPs = NULL;
-		kfree(part->local_openclose_args_base);
-		part->local_openclose_args = NULL;
-		kfree(part->remote_openclose_args_base);
-		part->remote_openclose_args = NULL;
 		dev_err(xpc_chan, "can't register NOTIFY IRQ handler, "
 			"errno=%d\n", -ret);
+		kfree(part->remote_openclose_args_base);
+		part->remote_openclose_args = NULL;
+		kfree(part->local_openclose_args_base);
+		part->local_openclose_args = NULL;
+		kfree(part->remote_GPs_base);
+		part->remote_GPs = NULL;
+		kfree(part->local_GPs_base);
+		part->local_GPs = NULL;
+		kfree(part->channels);
+		part->channels = NULL;
 		return xpcLackOfResources;
 	}
 
@@ -446,22 +465,20 @@ xpc_allocate_local_msgqueue(struct xpc_channel *ch)
 	for (nentries = ch->local_nentries; nentries > 0; nentries--) {
 
 		nbytes = nentries * ch->msg_size;
-		ch->local_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes,
+		ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
 						GFP_KERNEL,
 						&ch->local_msgqueue_base);
 		if (ch->local_msgqueue == NULL) {
 			continue;
 		}
-		memset(ch->local_msgqueue, 0, nbytes);
 
 		nbytes = nentries * sizeof(struct xpc_notify);
-		ch->notify_queue = kmalloc(nbytes, GFP_KERNEL);
+		ch->notify_queue = kzalloc(nbytes, GFP_KERNEL);
 		if (ch->notify_queue == NULL) {
 			kfree(ch->local_msgqueue_base);
 			ch->local_msgqueue = NULL;
 			continue;
 		}
-		memset(ch->notify_queue, 0, nbytes);
 
 		spin_lock_irqsave(&ch->lock, irq_flags);
 		if (nentries < ch->local_nentries) {
@@ -501,13 +518,12 @@ xpc_allocate_remote_msgqueue(struct xpc_channel *ch)
 	for (nentries = ch->remote_nentries; nentries > 0; nentries--) {
 
 		nbytes = nentries * ch->msg_size;
-		ch->remote_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes,
+		ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes,
 						GFP_KERNEL,
 						&ch->remote_msgqueue_base);
 		if (ch->remote_msgqueue == NULL) {
 			continue;
 		}
-		memset(ch->remote_msgqueue, 0, nbytes);
 
 		spin_lock_irqsave(&ch->lock, irq_flags);
 		if (nentries < ch->remote_nentries) {
diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c
index 8cbf16432570..99b123a6421a 100644
--- a/arch/ia64/sn/kernel/xpc_main.c
+++ b/arch/ia64/sn/kernel/xpc_main.c
@@ -52,7 +52,6 @@
 #include <linux/syscalls.h>
 #include <linux/cache.h>
 #include <linux/interrupt.h>
-#include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/reboot.h>
 #include <linux/completion.h>
diff --git a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c
index 88a730e6cfdb..94211429fd0c 100644
--- a/arch/ia64/sn/kernel/xpc_partition.c
+++ b/arch/ia64/sn/kernel/xpc_partition.c
@@ -80,6 +80,31 @@ char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RP_HEADER_SIZE +
 							XP_NASID_MASK_BYTES];
 
 
+/*
+ * Return a cacheline-aligned kmalloc'd buffer (or NULL); *base is what to kfree.
+ */
+static void *
+xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
+{
+	/* see if kmalloc will give us cacheline aligned memory by default */
+	*base = kmalloc(size, flags);
+	if (*base == NULL) {
+		return NULL;
+	}
+	if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) {
+		return *base;
+	}
+	kfree(*base);
+
+	/* nope, so over-allocate by L1_CACHE_BYTES and round the pointer up */
+	*base = kmalloc(size + L1_CACHE_BYTES, flags);
+	if (*base == NULL) {
+		return NULL;
+	}
+	return (void *) L1_CACHE_ALIGN((u64) *base);
+}
+
+
 /*
  * Given a nasid, get the physical address of the  partition's reserved page
  * for that nasid. This function returns 0 on any error.
@@ -1038,13 +1063,12 @@ xpc_discovery(void)
 	remote_vars = (struct xpc_vars *) remote_rp;
 
 
-	discovered_nasids = kmalloc(sizeof(u64) * xp_nasid_mask_words,
+	discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words,
 							GFP_KERNEL);
 	if (discovered_nasids == NULL) {
 		kfree(remote_rp_base);
 		return;
 	}
-	memset(discovered_nasids, 0, sizeof(u64) * xp_nasid_mask_words);
 
 	rp = (struct xpc_rsvd_page *) xpc_rsvd_page;
 
diff --git a/include/asm-ia64/sn/xpc.h b/include/asm-ia64/sn/xpc.h
index df7f5f4f3cde..aa3b8ace9030 100644
--- a/include/asm-ia64/sn/xpc.h
+++ b/include/asm-ia64/sn/xpc.h
@@ -1227,28 +1227,6 @@ xpc_map_bte_errors(bte_result_t error)
 
 
-static inline void *
-xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
-{
-	/* see if kmalloc will give us cachline aligned memory by default */
-	*base = kmalloc(size, flags);
-	if (*base == NULL) {
-		return NULL;
-	}
-	if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) {
-		return *base;
-	}
-	kfree(*base);
-
-	/* nope, we'll have to do it ourselves */
-	*base = kmalloc(size + L1_CACHE_BYTES, flags);
-	if (*base == NULL) {
-		return NULL;
-	}
-	return (void *) L1_CACHE_ALIGN((u64) *base);
-}
-
-
 /*
  * Check to see if there is any channel activity to/from the specified
  * partition.
-- 
cgit v1.2.3