summaryrefslogtreecommitdiff
path: root/arch/powerpc/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kernel')
-rw-r--r--arch/powerpc/kernel/Makefile4
-rw-r--r--arch/powerpc/kernel/asm-offsets.c7
-rw-r--r--arch/powerpc/kernel/cacheinfo.c36
-rw-r--r--arch/powerpc/kernel/eeh.c1070
-rw-r--r--arch/powerpc/kernel/eeh_cache.c318
-rw-r--r--arch/powerpc/kernel/eeh_dev.c112
-rw-r--r--arch/powerpc/kernel/eeh_driver.c661
-rw-r--r--arch/powerpc/kernel/eeh_event.c182
-rw-r--r--arch/powerpc/kernel/eeh_pe.c800
-rw-r--r--arch/powerpc/kernel/eeh_sysfs.c74
-rw-r--r--arch/powerpc/kernel/entry_64.S30
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S56
-rw-r--r--arch/powerpc/kernel/hw_breakpoint.c3
-rw-r--r--arch/powerpc/kernel/idle.c4
-rw-r--r--arch/powerpc/kernel/io-workarounds.c11
-rw-r--r--arch/powerpc/kernel/iommu.c323
-rw-r--r--arch/powerpc/kernel/irq.c2
-rw-r--r--arch/powerpc/kernel/kprobes.c20
-rw-r--r--arch/powerpc/kernel/nvram_64.c20
-rw-r--r--arch/powerpc/kernel/pci-hotplug.c111
-rw-r--r--arch/powerpc/kernel/process.c4
-rw-r--r--arch/powerpc/kernel/prom.c42
-rw-r--r--arch/powerpc/kernel/ptrace.c4
-rw-r--r--arch/powerpc/kernel/reloc_32.S3
-rw-r--r--arch/powerpc/kernel/rtas.c4
-rw-r--r--arch/powerpc/kernel/setup_64.c2
-rw-r--r--arch/powerpc/kernel/signal_32.c70
-rw-r--r--arch/powerpc/kernel/signal_64.c8
-rw-r--r--arch/powerpc/kernel/smp.c12
-rw-r--r--arch/powerpc/kernel/sysfs.c6
-rw-r--r--arch/powerpc/kernel/time.c1
-rw-r--r--arch/powerpc/kernel/tm.S18
-rw-r--r--arch/powerpc/kernel/traps.c79
-rw-r--r--arch/powerpc/kernel/udbg.c2
-rw-r--r--arch/powerpc/kernel/vdso.c2
35 files changed, 3944 insertions, 157 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f960a7944553..a8619bfe879e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -58,6 +58,8 @@ obj-$(CONFIG_RTAS_PROC) += rtas-proc.o
obj-$(CONFIG_LPARCFG) += lparcfg.o
obj-$(CONFIG_IBMVIO) += vio.o
obj-$(CONFIG_IBMEBUS) += ibmebus.o
+obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
+ eeh_driver.o eeh_event.o eeh_sysfs.o
obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_FA_DUMP) += fadump.o
@@ -100,7 +102,7 @@ obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o
-pci64-$(CONFIG_PPC64) += pci_dn.o isa-bridge.o
+pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o
obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
pci-common.o pci_of_scan.o
obj-$(CONFIG_PCI_MSI) += msi.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6f16ffafa6f0..c7e8afc2ead0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -105,9 +105,6 @@ int main(void)
DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid));
#else /* CONFIG_PPC64 */
DEFINE(PGDIR, offsetof(struct thread_struct, pgdir));
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
- DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
-#endif
#ifdef CONFIG_SPE
DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0]));
DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc));
@@ -115,6 +112,9 @@ int main(void)
DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe));
#endif /* CONFIG_SPE */
#endif /* CONFIG_PPC64 */
+#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
+ DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0));
+#endif
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
#endif
@@ -132,7 +132,6 @@ int main(void)
DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier));
DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0));
DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2));
- DEFINE(THREAD_MMCRA, offsetof(struct thread_struct, mmcra));
#endif
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch));
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 92c6b008dd2b..9262cf2bec4b 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -131,7 +131,8 @@ static const char *cache_type_string(const struct cache *cache)
return cache_type_info[cache->type].name;
}
-static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode)
+static void cache_init(struct cache *cache, int type, int level,
+ struct device_node *ofnode)
{
cache->type = type;
cache->level = level;
@@ -140,7 +141,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc
list_add(&cache->list, &cache_list);
}
-static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode)
+static struct cache *new_cache(int type, int level, struct device_node *ofnode)
{
struct cache *cache;
@@ -324,7 +325,8 @@ static bool cache_node_is_unified(const struct device_node *np)
return of_get_property(np, "cache-unified", NULL);
}
-static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_unified(struct device_node *node,
+ int level)
{
struct cache *cache;
@@ -335,7 +337,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *
return cache;
}
-static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode_split(struct device_node *node,
+ int level)
{
struct cache *dcache, *icache;
@@ -357,7 +360,7 @@ err:
return NULL;
}
-static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level)
+static struct cache *cache_do_one_devnode(struct device_node *node, int level)
{
struct cache *cache;
@@ -369,7 +372,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in
return cache;
}
-static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level)
+static struct cache *cache_lookup_or_instantiate(struct device_node *node,
+ int level)
{
struct cache *cache;
@@ -385,7 +389,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n
return cache;
}
-static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger)
+static void link_cache_lists(struct cache *smaller, struct cache *bigger)
{
while (smaller->next_local) {
if (smaller->next_local == bigger)
@@ -396,13 +400,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg
smaller->next_local = bigger;
}
-static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache)
+static void do_subsidiary_caches_debugcheck(struct cache *cache)
{
WARN_ON_ONCE(cache->level != 1);
WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu"));
}
-static void __cpuinit do_subsidiary_caches(struct cache *cache)
+static void do_subsidiary_caches(struct cache *cache)
{
struct device_node *subcache_node;
int level = cache->level;
@@ -423,7 +427,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache)
}
}
-static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id)
+static struct cache *cache_chain_instantiate(unsigned int cpu_id)
{
struct device_node *cpu_node;
struct cache *cpu_cache = NULL;
@@ -448,7 +452,7 @@ out:
return cpu_cache;
}
-static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id)
+static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id)
{
struct cache_dir *cache_dir;
struct device *dev;
@@ -653,7 +657,7 @@ static struct kobj_type cache_index_type = {
.default_attrs = cache_index_default_attrs,
};
-static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
+static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
{
const char *cache_name;
const char *cache_type;
@@ -696,7 +700,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d
kfree(buf);
}
-static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir)
+static void cacheinfo_create_index_dir(struct cache *cache, int index,
+ struct cache_dir *cache_dir)
{
struct cache_index_dir *index_dir;
int rc;
@@ -722,7 +727,8 @@ err:
kfree(index_dir);
}
-static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list)
+static void cacheinfo_sysfs_populate(unsigned int cpu_id,
+ struct cache *cache_list)
{
struct cache_dir *cache_dir;
struct cache *cache;
@@ -740,7 +746,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache
}
}
-void __cpuinit cacheinfo_cpu_online(unsigned int cpu_id)
+void cacheinfo_cpu_online(unsigned int cpu_id)
{
struct cache *cache;
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
new file mode 100644
index 000000000000..39954fe941b8
--- /dev/null
+++ b/arch/powerpc/kernel/eeh.c
@@ -0,0 +1,1070 @@
+/*
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ * Copyright 2001-2012 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/of.h>
+
+#include <linux/atomic.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+
+/** Overview:
+ * EEH, or "Extended Error Handling" is a PCI bridge technology for
+ * dealing with PCI bus errors that can't be dealt with within the
+ * usual PCI framework, except by check-stopping the CPU. Systems
+ * that are designed for high-availability/reliability cannot afford
+ * to crash due to a "mere" PCI error, thus the need for EEH.
+ * An EEH-capable bridge operates by converting a detected error
+ * into a "slot freeze", taking the PCI adapter off-line, making
+ * the slot behave, from the OS'es point of view, as if the slot
+ * were "empty": all reads return 0xff's and all writes are silently
+ * ignored. EEH slot isolation events can be triggered by parity
+ * errors on the address or data busses (e.g. during posted writes),
+ * which in turn might be caused by low voltage on the bus, dust,
+ * vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ * Note, however, that one of the leading causes of EEH slot
+ * freeze events are buggy device drivers, buggy device microcode,
+ * or buggy device hardware. This is because any attempt by the
+ * device to bus-master data to a memory address that is not
+ * assigned to the device will trigger a slot freeze. (The idea
+ * is to prevent devices-gone-wild from corrupting system memory).
+ * Buggy hardware/drivers will have a miserable time co-existing
+ * with EEH.
+ *
+ * Ideally, a PCI device driver, when suspecting that an isolation
+ * event has occurred (e.g. by reading 0xff's), will then ask EEH
+ * whether this is the case, and then take appropriate steps to
+ * reset the PCI slot, the PCI device, and then resume operations.
+ * However, until that day, the checking is done here, with the
+ * eeh_check_failure() routine embedded in the MMIO macros. If
+ * the slot is found to be isolated, an "EEH Event" is synthesized
+ * and sent out for processing.
+ */
+
+/* If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event, it might be broken.
+ * This sets the threshold for how many read attempts we allow
+ * before printing an error message.
+ */
+#define EEH_MAX_FAILS 2100000
+
+/* Time to wait for a PCI slot to report status, in milliseconds */
+#define PCI_BUS_RESET_WAIT_MSEC (60*1000)
+
+/* Platform dependent EEH operations */
+struct eeh_ops *eeh_ops = NULL;
+
+int eeh_subsystem_enabled;
+EXPORT_SYMBOL(eeh_subsystem_enabled);
+
+/*
+ * EEH probe mode support. The intention is to support multiple
+ * platforms for EEH. Some platforms like pSeries do PCI emunation
+ * based on device tree. However, other platforms like powernv probe
+ * PCI devices from hardware. The flag is used to distinguish that.
+ * In addition, struct eeh_ops::probe would be invoked for particular
+ * OF node or PCI device so that the corresponding PE would be created
+ * there.
+ */
+int eeh_probe_mode;
+
+/* Lock to avoid races due to multiple reports of an error */
+DEFINE_RAW_SPINLOCK(confirm_error_lock);
+
+/* Buffer for reporting pci register dumps. Its here in BSS, and
+ * not dynamically alloced, so that it ends up in RMO where RTAS
+ * can access it.
+ */
+#define EEH_PCI_REGS_LOG_LEN 4096
+static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
+
+/*
+ * The struct is used to maintain the EEH global statistic
+ * information. Besides, the EEH global statistics will be
+ * exported to user space through procfs
+ */
+struct eeh_stats {
+ u64 no_device; /* PCI device not found */
+ u64 no_dn; /* OF node not found */
+ u64 no_cfg_addr; /* Config address not found */
+ u64 ignored_check; /* EEH check skipped */
+ u64 total_mmio_ffs; /* Total EEH checks */
+ u64 false_positives; /* Unnecessary EEH checks */
+ u64 slot_resets; /* PE reset */
+};
+
+static struct eeh_stats eeh_stats;
+
+#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
+
+/**
+ * eeh_gather_pci_data - Copy assorted PCI config space registers to buff
+ * @edev: device to report data for
+ * @buf: point to buffer in which to log
+ * @len: amount of room in buffer
+ *
+ * This routine captures assorted PCI configuration space data,
+ * and puts them into a buffer for RTAS error logging.
+ */
+static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len)
+{
+ struct device_node *dn = eeh_dev_to_of_node(edev);
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ u32 cfg;
+ int cap, i;
+ int n = 0;
+
+ n += scnprintf(buf+n, len-n, "%s\n", dn->full_name);
+ printk(KERN_WARNING "EEH: of node=%s\n", dn->full_name);
+
+ eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
+ printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg);
+
+ eeh_ops->read_config(dn, PCI_COMMAND, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
+ printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg);
+
+ if (!dev) {
+ printk(KERN_WARNING "EEH: no PCI device for this of node\n");
+ return n;
+ }
+
+ /* Gather bridge-specific registers */
+ if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) {
+ eeh_ops->read_config(dn, PCI_SEC_STATUS, 2, &cfg);
+ n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
+ printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg);
+
+ eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &cfg);
+ n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
+ printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg);
+ }
+
+ /* Dump out the PCI-X command and status regs */
+ cap = pci_find_capability(dev, PCI_CAP_ID_PCIX);
+ if (cap) {
+ eeh_ops->read_config(dn, cap, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
+ printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg);
+
+ eeh_ops->read_config(dn, cap+4, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
+ printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg);
+ }
+
+ /* If PCI-E capable, dump PCI-E cap 10, and the AER */
+ cap = pci_find_capability(dev, PCI_CAP_ID_EXP);
+ if (cap) {
+ n += scnprintf(buf+n, len-n, "pci-e cap10:\n");
+ printk(KERN_WARNING
+ "EEH: PCI-E capabilities and status follow:\n");
+
+ for (i=0; i<=8; i++) {
+ eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+ printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg);
+ }
+
+ cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
+ if (cap) {
+ n += scnprintf(buf+n, len-n, "pci-e AER:\n");
+ printk(KERN_WARNING
+ "EEH: PCI-E AER capability register set follows:\n");
+
+ for (i=0; i<14; i++) {
+ eeh_ops->read_config(dn, cap+4*i, 4, &cfg);
+ n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
+ printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg);
+ }
+ }
+ }
+
+ return n;
+}
+
+/**
+ * eeh_slot_error_detail - Generate combined log including driver log and error log
+ * @pe: EEH PE
+ * @severity: temporary or permanent error log
+ *
+ * This routine should be called to generate the combined log, which
+ * is comprised of driver log and error log. The driver log is figured
+ * out from the config space of the corresponding PCI device, while
+ * the error log is fetched through platform dependent function call.
+ */
+void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
+{
+ size_t loglen = 0;
+ struct eeh_dev *edev;
+ bool valid_cfg_log = true;
+
+ /*
+ * When the PHB is fenced or dead, it's pointless to collect
+ * the data from PCI config space because it should return
+ * 0xFF's. For ER, we still retrieve the data from the PCI
+ * config space.
+ */
+ if (eeh_probe_mode_dev() &&
+ (pe->type & EEH_PE_PHB) &&
+ (pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)))
+ valid_cfg_log = false;
+
+ if (valid_cfg_log) {
+ eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+ eeh_ops->configure_bridge(pe);
+ eeh_pe_restore_bars(pe);
+
+ pci_regs_buf[0] = 0;
+ eeh_pe_for_each_dev(pe, edev) {
+ loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen,
+ EEH_PCI_REGS_LOG_LEN - loglen);
+ }
+ }
+
+ eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
+}
+
+/**
+ * eeh_token_to_phys - Convert EEH address token to phys address
+ * @token: I/O token, should be address in the form 0xA....
+ *
+ * This routine should be called to convert virtual I/O address
+ * to physical one.
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+ pte_t *ptep;
+ unsigned long pa;
+ int hugepage_shift;
+
+ /*
+ * We won't find hugepages here, iomem
+ */
+ ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+ if (!ptep)
+ return token;
+ WARN_ON(hugepage_shift);
+ pa = pte_pfn(*ptep) << PAGE_SHIFT;
+
+ return pa | (token & (PAGE_SIZE-1));
+}
+
+/*
+ * On PowerNV platform, we might already have fenced PHB there.
+ * For that case, it's meaningless to recover frozen PE. Intead,
+ * We have to handle fenced PHB firstly.
+ */
+static int eeh_phb_check_failure(struct eeh_pe *pe)
+{
+ struct eeh_pe *phb_pe;
+ unsigned long flags;
+ int ret;
+
+ if (!eeh_probe_mode_dev())
+ return -EPERM;
+
+ /* Find the PHB PE */
+ phb_pe = eeh_phb_pe_get(pe->phb);
+ if (!phb_pe) {
+ pr_warning("%s Can't find PE for PHB#%d\n",
+ __func__, pe->phb->global_number);
+ return -EEXIST;
+ }
+
+ /* If the PHB has been in problematic state */
+ eeh_serialize_lock(&flags);
+ if (phb_pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Check PHB state */
+ ret = eeh_ops->get_state(phb_pe, NULL);
+ if ((ret < 0) ||
+ (ret == EEH_STATE_NOT_SUPPORT) ||
+ (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+ (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Isolate the PHB and send event */
+ eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+ eeh_serialize_unlock(flags);
+ eeh_send_failure_event(phb_pe);
+
+ pr_err("EEH: PHB#%x failure detected\n",
+ phb_pe->phb->global_number);
+ dump_stack();
+
+ return 1;
+out:
+ eeh_serialize_unlock(flags);
+ return ret;
+}
+
+/**
+ * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @edev: eeh device
+ *
+ * Check for an EEH failure for the given device node. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze. This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a slot isolation event notification.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dev_check_failure(struct eeh_dev *edev)
+{
+ int ret;
+ unsigned long flags;
+ struct device_node *dn;
+ struct pci_dev *dev;
+ struct eeh_pe *pe;
+ int rc = 0;
+ const char *location;
+
+ eeh_stats.total_mmio_ffs++;
+
+ if (!eeh_subsystem_enabled)
+ return 0;
+
+ if (!edev) {
+ eeh_stats.no_dn++;
+ return 0;
+ }
+ dn = eeh_dev_to_of_node(edev);
+ dev = eeh_dev_to_pci_dev(edev);
+ pe = edev->pe;
+
+ /* Access to IO BARs might get this far and still not want checking. */
+ if (!pe) {
+ eeh_stats.ignored_check++;
+ pr_debug("EEH: Ignored check for %s %s\n",
+ eeh_pci_name(dev), dn->full_name);
+ return 0;
+ }
+
+ if (!pe->addr && !pe->config_addr) {
+ eeh_stats.no_cfg_addr++;
+ return 0;
+ }
+
+ /*
+ * On PowerNV platform, we might already have fenced PHB
+ * there and we need take care of that firstly.
+ */
+ ret = eeh_phb_check_failure(pe);
+ if (ret > 0)
+ return ret;
+
+ /* If we already have a pending isolation event for this
+ * slot, we know it's bad already, we don't need to check.
+ * Do this checking under a lock; as multiple PCI devices
+ * in one slot might report errors simultaneously, and we
+ * only want one error recovery routine running.
+ */
+ eeh_serialize_lock(&flags);
+ rc = 1;
+ if (pe->state & EEH_PE_ISOLATED) {
+ pe->check_count++;
+ if (pe->check_count % EEH_MAX_FAILS == 0) {
+ location = of_get_property(dn, "ibm,loc-code", NULL);
+ printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
+ "location=%s driver=%s pci addr=%s\n",
+ pe->check_count, location,
+ eeh_driver_name(dev), eeh_pci_name(dev));
+ printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+ eeh_driver_name(dev));
+ dump_stack();
+ }
+ goto dn_unlock;
+ }
+
+ /*
+ * Now test for an EEH failure. This is VERY expensive.
+ * Note that the eeh_config_addr may be a parent device
+ * in the case of a device behind a bridge, or it may be
+ * function zero of a multi-function device.
+ * In any case they must share a common PHB.
+ */
+ ret = eeh_ops->get_state(pe, NULL);
+
+ /* Note that config-io to empty slots may fail;
+ * they are empty when they don't have children.
+ * We will punt with the following conditions: Failure to get
+ * PE's state, EEH not support and Permanently unavailable
+ * state, PE is in good state.
+ */
+ if ((ret < 0) ||
+ (ret == EEH_STATE_NOT_SUPPORT) ||
+ (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+ (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+ eeh_stats.false_positives++;
+ pe->false_positives++;
+ rc = 0;
+ goto dn_unlock;
+ }
+
+ eeh_stats.slot_resets++;
+
+ /* Avoid repeated reports of this failure, including problems
+ * with other functions on this device, and functions under
+ * bridges.
+ */
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
+ eeh_serialize_unlock(flags);
+
+ eeh_send_failure_event(pe);
+
+ /* Most EEH events are due to device driver bugs. Having
+ * a stack trace will help the device-driver authors figure
+ * out what happened. So print that out.
+ */
+ pr_err("EEH: Frozen PE#%x detected on PHB#%x\n",
+ pe->addr, pe->phb->global_number);
+ dump_stack();
+
+ return 1;
+
+dn_unlock:
+ eeh_serialize_unlock(flags);
+ return rc;
+}
+
+EXPORT_SYMBOL_GPL(eeh_dev_check_failure);
+
+/**
+ * eeh_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @token: I/O token, should be address in the form 0xA....
+ * @val: value, should be all 1's (XXX why do we need this arg??)
+ *
+ * Check for an EEH failure at the given token address. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze event. This routine
+ * will query firmware for the EEH status.
+ *
+ * Note this routine is safe to call in an interrupt context.
+ */
+unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val)
+{
+ unsigned long addr;
+ struct eeh_dev *edev;
+
+ /* Finding the phys addr + pci device; this is pretty quick. */
+ addr = eeh_token_to_phys((unsigned long __force) token);
+ edev = eeh_addr_cache_get_dev(addr);
+ if (!edev) {
+ eeh_stats.no_device++;
+ return val;
+ }
+
+ eeh_dev_check_failure(edev);
+
+ pci_dev_put(eeh_dev_to_pci_dev(edev));
+ return val;
+}
+
+EXPORT_SYMBOL(eeh_check_failure);
+
+
+/**
+ * eeh_pci_enable - Enable MMIO or DMA transfers for this slot
+ * @pe: EEH PE
+ *
+ * This routine should be called to reenable frozen MMIO or DMA
+ * so that it would work correctly again. It's useful while doing
+ * recovery or log collection on the indicated device.
+ */
+int eeh_pci_enable(struct eeh_pe *pe, int function)
+{
+ int rc;
+
+ rc = eeh_ops->set_option(pe, function);
+ if (rc)
+ pr_warning("%s: Unexpected state change %d on PHB#%d-PE#%x, err=%d\n",
+ __func__, function, pe->phb->global_number, pe->addr, rc);
+
+ rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+ if (rc > 0 && (rc & EEH_STATE_MMIO_ENABLED) &&
+ (function == EEH_OPT_THAW_MMIO))
+ return 0;
+
+ return rc;
+}
+
+/**
+ * pcibios_set_pcie_slot_reset - Set PCI-E reset state
+ * @dev: pci device struct
+ * @state: reset state to enter
+ *
+ * Return value:
+ * 0 if success
+ */
+int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
+{
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+ struct eeh_pe *pe = edev->pe;
+
+ if (!pe) {
+ pr_err("%s: No PE found on PCI device %s\n",
+ __func__, pci_name(dev));
+ return -EINVAL;
+ }
+
+ switch (state) {
+ case pcie_deassert_reset:
+ eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+ break;
+ case pcie_hot_reset:
+ eeh_ops->reset(pe, EEH_RESET_HOT);
+ break;
+ case pcie_warm_reset:
+ eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+ break;
+ default:
+ return -EINVAL;
+ };
+
+ return 0;
+}
+
+/**
+ * eeh_set_pe_freset - Check the required reset for the indicated device
+ * @data: EEH device
+ * @flag: return value
+ *
+ * Each device might have its preferred reset type: fundamental or
+ * hot reset. The routine is used to collected the information for
+ * the indicated device and its children so that the bunch of the
+ * devices could be reset properly.
+ */
+static void *eeh_set_dev_freset(void *data, void *flag)
+{
+ struct pci_dev *dev;
+ unsigned int *freset = (unsigned int *)flag;
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+
+ dev = eeh_dev_to_pci_dev(edev);
+ if (dev)
+ *freset |= dev->needs_freset;
+
+ return NULL;
+}
+
+/**
+ * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second
+ * @pe: EEH PE
+ *
+ * Assert the PCI #RST line for 1/4 second.
+ */
+static void eeh_reset_pe_once(struct eeh_pe *pe)
+{
+ unsigned int freset = 0;
+
+ /* Determine type of EEH reset required for
+ * Partitionable Endpoint, a hot-reset (1)
+ * or a fundamental reset (3).
+ * A fundamental reset required by any device under
+ * Partitionable Endpoint trumps hot-reset.
+ */
+ eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
+
+ if (freset)
+ eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
+ else
+ eeh_ops->reset(pe, EEH_RESET_HOT);
+
+ /* The PCI bus requires that the reset be held high for at least
+ * a 100 milliseconds. We wait a bit longer 'just in case'.
+ */
+#define PCI_BUS_RST_HOLD_TIME_MSEC 250
+ msleep(PCI_BUS_RST_HOLD_TIME_MSEC);
+
+ /* We might get hit with another EEH freeze as soon as the
+ * pci slot reset line is dropped. Make sure we don't miss
+ * these, and clear the flag now.
+ */
+ eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+
+ eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
+
+ /* After a PCI slot has been reset, the PCI Express spec requires
+ * a 1.5 second idle time for the bus to stabilize, before starting
+ * up traffic.
+ */
+#define PCI_BUS_SETTLE_TIME_MSEC 1800
+ msleep(PCI_BUS_SETTLE_TIME_MSEC);
+}
+
+/**
+ * eeh_reset_pe - Reset the indicated PE
+ * @pe: EEH PE
+ *
+ * This routine should be called to reset indicated device, including
+ * PE. A PE might include multiple PCI devices and sometimes PCI bridges
+ * might be involved as well.
+ */
+int eeh_reset_pe(struct eeh_pe *pe)
+{
+ int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+ int i, rc;
+
+ /* Take three shots at resetting the bus */
+ for (i=0; i<3; i++) {
+ eeh_reset_pe_once(pe);
+
+ rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
+ if ((rc & flags) == flags)
+ return 0;
+
+ if (rc < 0) {
+ pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x",
+ __func__, pe->phb->global_number, pe->addr);
+ return -1;
+ }
+ pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n",
+ i+1, pe->phb->global_number, pe->addr, rc);
+ }
+
+ return -1;
+}
+
+/**
+ * eeh_save_bars - Save device bars
+ * @edev: PCI device associated EEH device
+ *
+ * Save the values of the device bars. Unlike the restore
+ * routine, this routine is *not* recursive. This is because
+ * PCI devices are added individually; but, for the restore,
+ * an entire slot is reset at a time.
+ */
+void eeh_save_bars(struct eeh_dev *edev)
+{
+ int i;
+ struct device_node *dn;
+
+ if (!edev)
+ return;
+ dn = eeh_dev_to_of_node(edev);
+
+ for (i = 0; i < 16; i++)
+ eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
+}
+
+/**
+ * eeh_ops_register - Register platform dependent EEH operations
+ * @ops: platform dependent EEH operations
+ *
+ * Register the platform dependent EEH operation callback
+ * functions. The platform should call this function before
+ * any other EEH operations.
+ */
+int __init eeh_ops_register(struct eeh_ops *ops)
+{
+ if (!ops->name) {
+ pr_warning("%s: Invalid EEH ops name for %p\n",
+ __func__, ops);
+ return -EINVAL;
+ }
+
+ if (eeh_ops && eeh_ops != ops) {
+ pr_warning("%s: EEH ops of platform %s already existing (%s)\n",
+ __func__, eeh_ops->name, ops->name);
+ return -EEXIST;
+ }
+
+ eeh_ops = ops;
+
+ return 0;
+}
+
+/**
+ * eeh_ops_unregister - Unreigster platform dependent EEH operations
+ * @name: name of EEH platform operations
+ *
+ * Unregister the platform dependent EEH operation callback
+ * functions.
+ */
+int __exit eeh_ops_unregister(const char *name)
+{
+ if (!name || !strlen(name)) {
+ pr_warning("%s: Invalid EEH ops name\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ if (eeh_ops && !strcmp(eeh_ops->name, name)) {
+ eeh_ops = NULL;
+ return 0;
+ }
+
+ return -EEXIST;
+}
+
+/**
+ * eeh_init - EEH initialization
+ *
+ * Initialize EEH by trying to enable it for all of the adapters in the system.
+ * As a side effect we can determine here if eeh is supported at all.
+ * Note that we leave EEH on so failed config cycles won't cause a machine
+ * check. If a user turns off EEH for a particular adapter they are really
+ * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't
+ * grant access to a slot if EEH isn't enabled, and so we always enable
+ * EEH for all slots/all devices.
+ *
+ * The eeh-force-off option disables EEH checking globally, for all slots.
+ * Even if force-off is set, the EEH hardware is still enabled, so that
+ * newer systems can boot.
+ */
+int eeh_init(void)
+{
+ struct pci_controller *hose, *tmp;
+ struct device_node *phb;
+ static int cnt = 0;
+ int ret = 0;
+
+ /*
+ * We have to delay the initialization on PowerNV after
+ * the PCI hierarchy tree has been built because the PEs
+ * are figured out based on PCI devices instead of device
+ * tree nodes
+ */
+ if (machine_is(powernv) && cnt++ <= 0)
+ return ret;
+
+ /* call platform initialization function */
+ if (!eeh_ops) {
+ pr_warning("%s: Platform EEH operation not found\n",
+ __func__);
+ return -EEXIST;
+ } else if ((ret = eeh_ops->init())) {
+ pr_warning("%s: Failed to call platform init function (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+
+ /* Initialize EEH event */
+ ret = eeh_event_init();
+ if (ret)
+ return ret;
+
+ /* Enable EEH for all adapters */
+ if (eeh_probe_mode_devtree()) {
+ list_for_each_entry_safe(hose, tmp,
+ &hose_list, list_node) {
+ phb = hose->dn;
+ traverse_pci_devices(phb, eeh_ops->of_probe, NULL);
+ }
+ } else if (eeh_probe_mode_dev()) {
+ list_for_each_entry_safe(hose, tmp,
+ &hose_list, list_node)
+ pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL);
+ } else {
+ pr_warning("%s: Invalid probe mode %d\n",
+ __func__, eeh_probe_mode);
+ return -EINVAL;
+ }
+
+ /*
+ * Call platform post-initialization. Actually, It's good chance
+ * to inform platform that EEH is ready to supply service if the
+ * I/O cache stuff has been built up.
+ */
+ if (eeh_ops->post_init) {
+ ret = eeh_ops->post_init();
+ if (ret)
+ return ret;
+ }
+
+ if (eeh_subsystem_enabled)
+ pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
+ else
+ pr_warning("EEH: No capable adapters found\n");
+
+ return ret;
+}
+
+core_initcall_sync(eeh_init);
+
+/**
+ * eeh_add_device_early - Enable EEH for the indicated device_node
+ * @dn: device node for which to set up EEH
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ * This routine must be called before any i/o is performed to the
+ * adapter (inluding any config-space i/o).
+ * Whether this actually enables EEH or not for this device depends
+ * on the CEC architecture, type of the device, on earlier boot
+ * command-line arguments & etc.
+ */
+static void eeh_add_device_early(struct device_node *dn)
+{
+ struct pci_controller *phb;
+
+ /*
+ * If we're doing EEH probe based on PCI device, we
+ * would delay the probe until late stage because
+ * the PCI device isn't available this moment.
+ */
+ if (!eeh_probe_mode_devtree())
+ return;
+
+ if (!of_node_to_eeh_dev(dn))
+ return;
+ phb = of_node_to_eeh_dev(dn)->phb;
+
+ /* USB Bus children of PCI devices will not have BUID's */
+ if (NULL == phb || 0 == phb->buid)
+ return;
+
+ eeh_ops->of_probe(dn, NULL);
+}
+
+/**
+ * eeh_add_device_tree_early - Enable EEH for the indicated device
+ * @dn: device node
+ *
+ * This routine must be used to perform EEH initialization for the
+ * indicated PCI device that was added after system boot (e.g.
+ * hotplug, dlpar).
+ */
+void eeh_add_device_tree_early(struct device_node *dn)
+{
+ struct device_node *sib;
+
+ for_each_child_of_node(dn, sib)
+ eeh_add_device_tree_early(sib);
+ eeh_add_device_early(dn);
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
+
+/**
+ * eeh_add_device_late - Perform EEH initialization for the indicated pci device
+ * @dev: pci device for which to set up EEH
+ *
+ * This routine must be used to complete EEH initialization for PCI
+ * devices that were added after system boot (e.g. hotplug, dlpar).
+ */
+static void eeh_add_device_late(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ struct eeh_dev *edev;
+
+ if (!dev || !eeh_subsystem_enabled)
+ return;
+
+ pr_debug("EEH: Adding device %s\n", pci_name(dev));
+
+ dn = pci_device_to_OF_node(dev);
+ edev = of_node_to_eeh_dev(dn);
+ if (edev->pdev == dev) {
+ pr_debug("EEH: Already referenced !\n");
+ return;
+ }
+ WARN_ON(edev->pdev);
+
+ pci_dev_get(dev);
+ edev->pdev = dev;
+ dev->dev.archdata.edev = edev;
+
+ /*
+ * We have to do the EEH probe here because the PCI device
+ * hasn't been created yet in the early stage.
+ */
+ if (eeh_probe_mode_dev())
+ eeh_ops->dev_probe(dev, NULL);
+
+ eeh_addr_cache_insert_dev(dev);
+}
+
+/**
+ * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_device_tree_late(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ eeh_add_device_late(dev);
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ struct pci_bus *subbus = dev->subordinate;
+ if (subbus)
+ eeh_add_device_tree_late(subbus);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
+
+/**
+ * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to add EEH sysfs files for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
+void eeh_add_sysfs_files(struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ eeh_sysfs_add_device(dev);
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ struct pci_bus *subbus = dev->subordinate;
+ if (subbus)
+ eeh_add_sysfs_files(subbus);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(eeh_add_sysfs_files);
+
+/**
+ * eeh_remove_device - Undo EEH setup for the indicated pci device
+ * @dev: pci device to be removed
+ * @purge_pe: remove the PE or not
+ *
+ * This routine should be called when a device is removed from
+ * a running system (e.g. by hotplug or dlpar). It unregisters
+ * the PCI device from the EEH subsystem. I/O errors affecting
+ * this device will no longer be detected after this call; thus,
+ * i/o errors affecting this slot may leave this device unusable.
+ */
+static void eeh_remove_device(struct pci_dev *dev, int purge_pe)
+{
+ struct eeh_dev *edev;
+
+ if (!dev || !eeh_subsystem_enabled)
+ return;
+ edev = pci_dev_to_eeh_dev(dev);
+
+ /* Unregister the device with the EEH/PCI address search system */
+ pr_debug("EEH: Removing device %s\n", pci_name(dev));
+
+ if (!edev || !edev->pdev) {
+ pr_debug("EEH: Not referenced !\n");
+ return;
+ }
+ edev->pdev = NULL;
+ dev->dev.archdata.edev = NULL;
+ pci_dev_put(dev);
+
+ eeh_rmv_from_parent_pe(edev, purge_pe);
+ eeh_addr_cache_rmv_dev(dev);
+ eeh_sysfs_remove_device(dev);
+}
+
+/**
+ * eeh_remove_bus_device - Undo EEH setup for the indicated PCI device
+ * @dev: PCI device
+ * @purge_pe: remove the corresponding PE or not
+ *
+ * This routine must be called when a device is removed from the
+ * running system through hotplug or dlpar. The corresponding
+ * PCI address cache will be removed.
+ */
+void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe)
+{
+ struct pci_bus *bus = dev->subordinate;
+ struct pci_dev *child, *tmp;
+
+ eeh_remove_device(dev, purge_pe);
+
+ if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
+ list_for_each_entry_safe(child, tmp, &bus->devices, bus_list)
+ eeh_remove_bus_device(child, purge_pe);
+ }
+}
+EXPORT_SYMBOL_GPL(eeh_remove_bus_device);
+
+static int proc_eeh_show(struct seq_file *m, void *v)
+{
+ if (0 == eeh_subsystem_enabled) {
+ seq_printf(m, "EEH Subsystem is globally disabled\n");
+ seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs);
+ } else {
+ seq_printf(m, "EEH Subsystem is enabled\n");
+ seq_printf(m,
+ "no device=%llu\n"
+ "no device node=%llu\n"
+ "no config address=%llu\n"
+ "check not wanted=%llu\n"
+ "eeh_total_mmio_ffs=%llu\n"
+ "eeh_false_positives=%llu\n"
+ "eeh_slot_resets=%llu\n",
+ eeh_stats.no_device,
+ eeh_stats.no_dn,
+ eeh_stats.no_cfg_addr,
+ eeh_stats.ignored_check,
+ eeh_stats.total_mmio_ffs,
+ eeh_stats.false_positives,
+ eeh_stats.slot_resets);
+ }
+
+ return 0;
+}
+
+static int proc_eeh_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, proc_eeh_show, NULL);
+}
+
+static const struct file_operations proc_eeh_operations = {
+ .open = proc_eeh_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init eeh_init_proc(void)
+{
+ if (machine_is(pseries))
+ proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations);
+ return 0;
+}
+__initcall(eeh_init_proc);
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
new file mode 100644
index 000000000000..f9ac1232a746
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -0,0 +1,318 @@
+/*
+ * PCI address cache; allows the lookup of PCI devices based on I/O address
+ *
+ * Copyright IBM Corporation 2004
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2004
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+
+/**
+ * The pci address cache subsystem. This subsystem places
+ * PCI device address resources into a red-black tree, sorted
+ * according to the address range, so that given only an i/o
+ * address, the corresponding PCI device can be **quickly**
+ * found. It is safe to perform an address lookup in an interrupt
+ * context; this ability is an important feature.
+ *
+ * Currently, the only customer of this code is the EEH subsystem;
+ * thus, this code has been somewhat tailored to suit EEH better.
+ * In particular, the cache does *not* hold the addresses of devices
+ * for which EEH is not enabled.
+ *
+ * (Implementation Note: The RB tree seems to be better/faster
+ * than any hash algo I could think of for this problem, even
+ * with the penalty of slow pointer chases for d-cache misses).
+ */
+struct pci_io_addr_range {
+ struct rb_node rb_node;
+ unsigned long addr_lo;
+ unsigned long addr_hi;
+ struct eeh_dev *edev;
+ struct pci_dev *pcidev;
+ unsigned int flags;
+};
+
+static struct pci_io_addr_cache {
+ struct rb_root rb_root;
+ spinlock_t piar_lock;
+} pci_io_addr_cache_root;
+
+static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr)
+{
+ struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node;
+
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+ if (addr < piar->addr_lo) {
+ n = n->rb_left;
+ } else {
+ if (addr > piar->addr_hi) {
+ n = n->rb_right;
+ } else {
+ pci_dev_get(piar->pcidev);
+ return piar->edev;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_addr_cache_get_dev - Get device, given only address
+ * @addr: mmio (PIO) phys address or i/o port number
+ *
+ * Given an mmio phys address, or a port number, find a pci device
+ * that implements this address. Be sure to pci_dev_put the device
+ * when finished. I/O port numbers are assumed to be offset
+ * from zero (that is, they do *not* have pci_io_addr added in).
+ * It is safe to call this function within an interrupt.
+ */
+struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr)
+{
+ struct eeh_dev *edev;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ edev = __eeh_addr_cache_get_device(addr);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+ return edev;
+}
+
+#ifdef DEBUG
+/*
+ * Handy-dandy debug print routine, does nothing more
+ * than print out the contents of our addr cache.
+ */
+static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
+{
+ struct rb_node *n;
+ int cnt = 0;
+
+ n = rb_first(&cache->rb_root);
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+ pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n",
+ (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
+ piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
+ cnt++;
+ n = rb_next(n);
+ }
+}
+#endif
+
+/* Insert address range into the rb tree. */
+static struct pci_io_addr_range *
+eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
+ unsigned long ahi, unsigned int flags)
+{
+ struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
+ struct rb_node *parent = NULL;
+ struct pci_io_addr_range *piar;
+
+ /* Walk tree, find a place to insert into tree */
+ while (*p) {
+ parent = *p;
+ piar = rb_entry(parent, struct pci_io_addr_range, rb_node);
+ if (ahi < piar->addr_lo) {
+ p = &parent->rb_left;
+ } else if (alo > piar->addr_hi) {
+ p = &parent->rb_right;
+ } else {
+ if (dev != piar->pcidev ||
+ alo != piar->addr_lo || ahi != piar->addr_hi) {
+ pr_warning("PIAR: overlapping address range\n");
+ }
+ return piar;
+ }
+ }
+ piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC);
+ if (!piar)
+ return NULL;
+
+ pci_dev_get(dev);
+ piar->addr_lo = alo;
+ piar->addr_hi = ahi;
+ piar->edev = pci_dev_to_eeh_dev(dev);
+ piar->pcidev = dev;
+ piar->flags = flags;
+
+#ifdef DEBUG
+ pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n",
+ alo, ahi, pci_name(dev));
+#endif
+
+ rb_link_node(&piar->rb_node, parent, p);
+ rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root);
+
+ return piar;
+}
+
+static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+ struct device_node *dn;
+ struct eeh_dev *edev;
+ int i;
+
+ dn = pci_device_to_OF_node(dev);
+ if (!dn) {
+ pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev));
+ return;
+ }
+
+ edev = of_node_to_eeh_dev(dn);
+ if (!edev) {
+ pr_warning("PCI: no EEH dev found for dn=%s\n",
+ dn->full_name);
+ return;
+ }
+
+ /* Skip any devices for which EEH is not enabled. */
+ if (!eeh_probe_mode_dev() && !edev->pe) {
+#ifdef DEBUG
+ pr_info("PCI: skip building address cache for=%s - %s\n",
+ pci_name(dev), dn->full_name);
+#endif
+ return;
+ }
+
+ /* Walk resources on this device, poke them into the tree */
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ unsigned long start = pci_resource_start(dev,i);
+ unsigned long end = pci_resource_end(dev,i);
+ unsigned int flags = pci_resource_flags(dev,i);
+
+ /* We are interested only bus addresses, not dma or other stuff */
+ if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+ continue;
+ if (start == 0 || ~start == 0 || end == 0 || ~end == 0)
+ continue;
+ eeh_addr_cache_insert(dev, start, end, flags);
+ }
+}
+
+/**
+ * eeh_addr_cache_insert_dev - Add a device to the address cache
+ * @dev: PCI device whose I/O addresses we are interested in.
+ *
+ * In order to support the fast lookup of devices based on addresses,
+ * we maintain a cache of devices that can be quickly searched.
+ * This routine adds a device to that cache.
+ */
+void eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+ unsigned long flags;
+
+ /* Ignore PCI bridges */
+ if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
+ return;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ __eeh_addr_cache_insert_dev(dev);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+ struct rb_node *n;
+
+restart:
+ n = rb_first(&pci_io_addr_cache_root.rb_root);
+ while (n) {
+ struct pci_io_addr_range *piar;
+ piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+ if (piar->pcidev == dev) {
+ rb_erase(n, &pci_io_addr_cache_root.rb_root);
+ pci_dev_put(piar->pcidev);
+ kfree(piar);
+ goto restart;
+ }
+ n = rb_next(n);
+ }
+}
+
+/**
+ * eeh_addr_cache_rmv_dev - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+ __eeh_addr_cache_rmv_dev(dev);
+ spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * eeh_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses. This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ */
+void eeh_addr_cache_build(void)
+{
+ struct device_node *dn;
+ struct eeh_dev *edev;
+ struct pci_dev *dev = NULL;
+
+ spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+ for_each_pci_dev(dev) {
+ dn = pci_device_to_OF_node(dev);
+ if (!dn)
+ continue;
+
+ edev = of_node_to_eeh_dev(dn);
+ if (!edev)
+ continue;
+
+ pci_dev_get(dev); /* matching put is in eeh_remove_device() */
+ dev->dev.archdata.edev = edev;
+ edev->pdev = dev;
+
+ eeh_addr_cache_insert_dev(dev);
+
+ eeh_sysfs_add_device(dev);
+ }
+
+#ifdef DEBUG
+ /* Verify tree built up above, echo back the list of addrs. */
+ eeh_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
new file mode 100644
index 000000000000..1efa28f5fc54
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -0,0 +1,112 @@
+/*
+ * The file intends to implement dynamic creation of EEH device, which will
+ * be bound with OF node and PCI device simutaneously. The EEH devices would
+ * be foundamental information for EEH core components to work proerly. Besides,
+ * We have to support multiple situations where dynamic creation of EEH device
+ * is required:
+ *
+ * 1) Before PCI emunation starts, we need create EEH devices according to the
+ * PCI sensitive OF nodes.
+ * 2) When PCI emunation is done, we need do the binding between PCI device and
+ * the associated EEH device.
+ * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH device
+ * will be created while PCI sensitive OF node is detected from DR.
+ * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. If
+ * PHB is newly inserted, we also need create EEH devices accordingly.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+/**
+ * eeh_dev_init - Create EEH device according to OF node
+ * @dn: device node
+ * @data: PHB
+ *
+ * It will create EEH device according to the given OF node. The function
+ * might be called by PCI emunation, DR, PHB hotplug.
+ */
+void *eeh_dev_init(struct device_node *dn, void *data)
+{
+ struct pci_controller *phb = data;
+ struct eeh_dev *edev;
+
+ /* Allocate EEH device */
+ edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+ if (!edev) {
+ pr_warning("%s: out of memory\n", __func__);
+ return NULL;
+ }
+
+ /* Associate EEH device with OF node */
+ PCI_DN(dn)->edev = edev;
+ edev->dn = dn;
+ edev->phb = phb;
+ INIT_LIST_HEAD(&edev->list);
+
+ return NULL;
+}
+
+/**
+ * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
+ * @phb: PHB
+ *
+ * Scan the PHB OF node and its child association, then create the
+ * EEH devices accordingly
+ */
+void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
+{
+ struct device_node *dn = phb->dn;
+
+ /* EEH PE for PHB */
+ eeh_phb_pe_create(phb);
+
+ /* EEH device for PHB */
+ eeh_dev_init(dn, phb);
+
+ /* EEH devices for children OF nodes */
+ traverse_pci_devices(dn, eeh_dev_init, phb);
+}
+
+/**
+ * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
+ *
+ * Scan all the existing PHBs and create EEH devices for their OF
+ * nodes and their children OF nodes
+ */
+static int __init eeh_dev_phb_init(void)
+{
+ struct pci_controller *phb, *tmp;
+
+ list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+ eeh_dev_phb_init_dynamic(phb);
+
+ pr_info("EEH: devices created\n");
+
+ return 0;
+}
+
+core_initcall(eeh_dev_phb_init);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
new file mode 100644
index 000000000000..2b1ce17cae50
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -0,0 +1,661 @@
+/*
+ * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
+ * Copyright IBM Corp. 2004 2005
+ * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+#include <asm/prom.h>
+#include <asm/rtas.h>
+
+/**
+ * eeh_pcid_name - Retrieve name of PCI device driver
+ * @pdev: PCI device
+ *
+ * This routine is used to retrieve the name of PCI device driver
+ * if that's valid.
+ */
+static inline const char *eeh_pcid_name(struct pci_dev *pdev)
+{
+ if (pdev && pdev->dev.driver)
+ return pdev->dev.driver->name;
+ return "";
+}
+
+/**
+ * eeh_pcid_get - Get the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is used to retrieve the PCI device driver for
+ * the indicated PCI device. Besides, we will increase the reference
+ * of the PCI device driver to prevent that being unloaded on
+ * the fly. Otherwise, kernel crash would be seen.
+ */
+static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
+{
+ if (!pdev || !pdev->driver)
+ return NULL;
+
+ if (!try_module_get(pdev->driver->driver.owner))
+ return NULL;
+
+ return pdev->driver;
+}
+
+/**
+ * eeh_pcid_put - Dereference on the PCI device driver
+ * @pdev: PCI device
+ *
+ * The function is called to do dereference on the PCI device
+ * driver of the indicated PCI device.
+ */
+static inline void eeh_pcid_put(struct pci_dev *pdev)
+{
+ if (!pdev || !pdev->driver)
+ return;
+
+ module_put(pdev->driver->driver.owner);
+}
+
+#if 0
+static void print_device_node_tree(struct pci_dn *pdn, int dent)
+{
+ int i;
+ struct device_node *pc;
+
+ if (!pdn)
+ return;
+ for (i = 0; i < dent; i++)
+ printk(" ");
+ printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n",
+ pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr,
+ pdn->eeh_pe_config_addr, pdn->node->full_name);
+ dent += 3;
+ pc = pdn->node->child;
+ while (pc) {
+ print_device_node_tree(PCI_DN(pc), dent);
+ pc = pc->sibling;
+ }
+}
+#endif
+
+/**
+ * eeh_disable_irq - Disable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called when reporting temporary or permanent
+ * error to the particular PCI device to disable interrupt of that
+ * device. If the device has enabled MSI or MSI-X interrupt, we needn't
+ * do real work because EEH should freeze DMA transfers for those PCI
+ * devices encountering EEH errors, which includes MSI or MSI-X.
+ */
+static void eeh_disable_irq(struct pci_dev *dev)
+{
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+ /* Don't disable MSI and MSI-X interrupts. They are
+ * effectively disabled by the DMA Stopped state
+ * when an EEH error occurs.
+ */
+ if (dev->msi_enabled || dev->msix_enabled)
+ return;
+
+ if (!irq_has_action(dev->irq))
+ return;
+
+ edev->mode |= EEH_DEV_IRQ_DISABLED;
+ disable_irq_nosync(dev->irq);
+}
+
+/**
+ * eeh_enable_irq - Enable interrupt for the recovering device
+ * @dev: PCI device
+ *
+ * This routine must be called to enable interrupt while failed
+ * device could be resumed.
+ */
+static void eeh_enable_irq(struct pci_dev *dev)
+{
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(dev);
+
+ if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
+ edev->mode &= ~EEH_DEV_IRQ_DISABLED;
+ enable_irq(dev->irq);
+ }
+}
+
+/**
+ * eeh_report_error - Report pci error to each device driver
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * Report an EEH error to each device driver, collect up and
+ * merge the device driver responses. Cumulative response
+ * passed back in "userdata".
+ */
+static void *eeh_report_error(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver;
+
+ /* We might not have the associated PCI device,
+ * then we should continue for next one.
+ */
+ if (!dev) return NULL;
+ dev->error_state = pci_channel_io_frozen;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ eeh_disable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->error_detected) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
+
+ /* A driver that needs a reset trumps all others */
+ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+ if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * Tells each device driver that IO ports, MMIO and config space I/O
+ * are now enabled. Collects up and merges the device driver responses.
+ * Cumulative response passed back in "userdata".
+ */
+static void *eeh_report_mmio_enabled(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ if (!driver->err_handler ||
+ !driver->err_handler->mmio_enabled) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ rc = driver->err_handler->mmio_enabled(dev);
+
+ /* A driver that needs a reset trumps all others */
+ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+ if (*res == PCI_ERS_RESULT_NONE) *res = rc;
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_report_reset - Tell device that slot has been reset
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This routine must be called while EEH tries to reset particular
+ * PCI device so that the associated PCI device driver could take
+ * some actions, usually to save data the driver needs so that the
+ * driver can work again while the device is recovered.
+ */
+static void *eeh_report_reset(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ enum pci_ers_result rc, *res = userdata;
+ struct pci_driver *driver;
+
+ if (!dev) return NULL;
+ dev->error_state = pci_channel_io_normal;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ eeh_enable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->slot_reset) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ rc = driver->err_handler->slot_reset(dev);
+ if ((*res == PCI_ERS_RESULT_NONE) ||
+ (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc;
+ if (*res == PCI_ERS_RESULT_DISCONNECT &&
+ rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_report_resume - Tell device to resume normal operations
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This routine must be called to notify the device driver that it
+ * could resume so that the device driver can do some initialization
+ * to make the recovered device work again.
+ */
+static void *eeh_report_resume(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ struct pci_driver *driver;
+
+ if (!dev) return NULL;
+ dev->error_state = pci_channel_io_normal;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ eeh_enable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->resume) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ driver->err_handler->resume(dev);
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_report_failure - Tell device driver that device is dead.
+ * @data: eeh device
+ * @userdata: return value
+ *
+ * This informs the device driver that the device is permanently
+ * dead, and that no further recovery attempts will be made on it.
+ */
+static void *eeh_report_failure(void *data, void *userdata)
+{
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+ struct pci_driver *driver;
+
+ if (!dev) return NULL;
+ dev->error_state = pci_channel_io_perm_failure;
+
+ driver = eeh_pcid_get(dev);
+ if (!driver) return NULL;
+
+ eeh_disable_irq(dev);
+
+ if (!driver->err_handler ||
+ !driver->err_handler->error_detected) {
+ eeh_pcid_put(dev);
+ return NULL;
+ }
+
+ driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
+
+ eeh_pcid_put(dev);
+ return NULL;
+}
+
+/**
+ * eeh_reset_device - Perform actual reset of a pci slot
+ * @pe: EEH PE
+ * @bus: PCI bus corresponding to the isolcated slot
+ *
+ * This routine must be called to do reset on the indicated PE.
+ * During the reset, udev might be invoked because those affected
+ * PCI devices will be removed and then added.
+ */
+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
+{
+ struct timeval tstamp;
+ int cnt, rc;
+
+ /* pcibios will clear the counter; save the value */
+ cnt = pe->freeze_count;
+ tstamp = pe->tstamp;
+
+ /*
+ * We don't remove the corresponding PE instances because
+ * we need the information afterwords. The attached EEH
+ * devices are expected to be attached soon when calling
+ * into pcibios_add_pci_devices().
+ */
+ if (bus)
+ __pcibios_remove_pci_devices(bus, 0);
+
+ /* Reset the pci controller. (Asserts RST#; resets config space).
+ * Reconfigure bridges and devices. Don't try to bring the system
+ * up if the reset failed for some reason.
+ */
+ rc = eeh_reset_pe(pe);
+ if (rc)
+ return rc;
+
+ /* Restore PE */
+ eeh_ops->configure_bridge(pe);
+ eeh_pe_restore_bars(pe);
+
+ /* Give the system 5 seconds to finish running the user-space
+ * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
+ * this is a hack, but if we don't do this, and try to bring
+ * the device up before the scripts have taken it down,
+ * potentially weird things happen.
+ */
+ if (bus) {
+ ssleep(5);
+ pcibios_add_pci_devices(bus);
+ }
+
+ pe->tstamp = tstamp;
+ pe->freeze_count = cnt;
+
+ return 0;
+}
+
+/* The longest amount of time to wait for a pci device
+ * to come back on line, in seconds.
+ */
+#define MAX_WAIT_FOR_RECOVERY 150
+
+static void eeh_handle_normal_event(struct eeh_pe *pe)
+{
+ struct pci_bus *frozen_bus;
+ int rc = 0;
+ enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+
+ frozen_bus = eeh_pe_bus_get(pe);
+ if (!frozen_bus) {
+ pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
+ __func__, pe->phb->global_number, pe->addr);
+ return;
+ }
+
+ eeh_pe_update_time_stamp(pe);
+ pe->freeze_count++;
+ if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES)
+ goto excess_failures;
+ pr_warning("EEH: This PCI device has failed %d times in the last hour\n",
+ pe->freeze_count);
+
+ /* Walk the various device drivers attached to this slot through
+ * a reset sequence, giving each an opportunity to do what it needs
+ * to accomplish the reset. Each child gets a report of the
+ * status ... if any child can't handle the reset, then the entire
+ * slot is dlpar removed and added.
+ */
+ pr_info("EEH: Notify device drivers to shutdown\n");
+ eeh_pe_dev_traverse(pe, eeh_report_error, &result);
+
+ /* Get the current PCI slot state. This can take a long time,
+ * sometimes over 3 seconds for certain systems.
+ */
+ rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
+ if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
+ pr_warning("EEH: Permanent failure\n");
+ goto hard_fail;
+ }
+
+ /* Since rtas may enable MMIO when posting the error log,
+ * don't post the error log until after all dev drivers
+ * have been informed.
+ */
+ pr_info("EEH: Collect temporary log\n");
+ eeh_slot_error_detail(pe, EEH_LOG_TEMP);
+
+ /* If all device drivers were EEH-unaware, then shut
+ * down all of the device drivers, and hope they
+ * go down willingly, without panicing the system.
+ */
+ if (result == PCI_ERS_RESULT_NONE) {
+ pr_info("EEH: Reset with hotplug activity\n");
+ rc = eeh_reset_device(pe, frozen_bus);
+ if (rc) {
+ pr_warning("%s: Unable to reset, err=%d\n",
+ __func__, rc);
+ goto hard_fail;
+ }
+ }
+
+ /* If all devices reported they can proceed, then re-enable MMIO */
+ if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+ pr_info("EEH: Enable I/O for affected devices\n");
+ rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+
+ if (rc < 0)
+ goto hard_fail;
+ if (rc) {
+ result = PCI_ERS_RESULT_NEED_RESET;
+ } else {
+ pr_info("EEH: Notify device drivers to resume I/O\n");
+ result = PCI_ERS_RESULT_NONE;
+ eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result);
+ }
+ }
+
+ /* If all devices reported they can proceed, then re-enable DMA */
+ if (result == PCI_ERS_RESULT_CAN_RECOVER) {
+ pr_info("EEH: Enabled DMA for affected devices\n");
+ rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
+
+ if (rc < 0)
+ goto hard_fail;
+ if (rc)
+ result = PCI_ERS_RESULT_NEED_RESET;
+ else
+ result = PCI_ERS_RESULT_RECOVERED;
+ }
+
+ /* If any device has a hard failure, then shut off everything. */
+ if (result == PCI_ERS_RESULT_DISCONNECT) {
+ pr_warning("EEH: Device driver gave up\n");
+ goto hard_fail;
+ }
+
+ /* If any device called out for a reset, then reset the slot */
+ if (result == PCI_ERS_RESULT_NEED_RESET) {
+ pr_info("EEH: Reset without hotplug activity\n");
+ rc = eeh_reset_device(pe, NULL);
+ if (rc) {
+ pr_warning("%s: Cannot reset, err=%d\n",
+ __func__, rc);
+ goto hard_fail;
+ }
+
+ pr_info("EEH: Notify device drivers "
+ "the completion of reset\n");
+ result = PCI_ERS_RESULT_NONE;
+ eeh_pe_dev_traverse(pe, eeh_report_reset, &result);
+ }
+
+ /* All devices should claim they have recovered by now. */
+ if ((result != PCI_ERS_RESULT_RECOVERED) &&
+ (result != PCI_ERS_RESULT_NONE)) {
+ pr_warning("EEH: Not recovered\n");
+ goto hard_fail;
+ }
+
+ /* Tell all device drivers that they can resume operations */
+ pr_info("EEH: Notify device driver to resume\n");
+ eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
+
+ return;
+
+excess_failures:
+ /*
+ * About 90% of all real-life EEH failures in the field
+ * are due to poorly seated PCI cards. Only 10% or so are
+ * due to actual, failed cards.
+ */
+ pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
+ "last hour and has been permanently disabled.\n"
+ "Please try reseating or replacing it.\n",
+ pe->phb->global_number, pe->addr,
+ pe->freeze_count);
+ goto perm_error;
+
+hard_fail:
+ pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
+ "Please try reseating or replacing it\n",
+ pe->phb->global_number, pe->addr);
+
+perm_error:
+ eeh_slot_error_detail(pe, EEH_LOG_PERM);
+
+ /* Notify all devices that they're about to go down. */
+ eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
+
+ /* Shut down the device drivers for good. */
+ if (frozen_bus)
+ pcibios_remove_pci_devices(frozen_bus);
+}
+
+static void eeh_handle_special_event(void)
+{
+ struct eeh_pe *pe, *phb_pe;
+ struct pci_bus *bus;
+ struct pci_controller *hose, *tmp;
+ unsigned long flags;
+ int rc = 0;
+
+ /*
+ * The return value from next_error() has been classified as follows.
+ * It might be good to enumerate them. However, next_error() is only
+ * supported by PowerNV platform for now. So it would be fine to use
+ * integer directly:
+ *
+ * 4 - Dead IOC 3 - Dead PHB
+ * 2 - Fenced PHB 1 - Frozen PE
+ * 0 - No error found
+ *
+ */
+ rc = eeh_ops->next_error(&pe);
+ if (rc <= 0)
+ return;
+
+ switch (rc) {
+ case 4:
+ /* Mark all PHBs in dead state */
+ eeh_serialize_lock(&flags);
+ list_for_each_entry_safe(hose, tmp,
+ &hose_list, list_node) {
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe) continue;
+
+ eeh_pe_state_mark(phb_pe,
+ EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+ }
+ eeh_serialize_unlock(flags);
+
+ /* Purge all events */
+ eeh_remove_event(NULL);
+ break;
+ case 3:
+ case 2:
+ case 1:
+ /* Mark the PE in fenced state */
+ eeh_serialize_lock(&flags);
+ if (rc == 3)
+ eeh_pe_state_mark(pe,
+ EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+ else
+ eeh_pe_state_mark(pe,
+ EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+ eeh_serialize_unlock(flags);
+
+ /* Purge all events of the PHB */
+ eeh_remove_event(pe);
+ break;
+ default:
+ pr_err("%s: Invalid value %d from next_error()\n",
+ __func__, rc);
+ return;
+ }
+
+ /*
+ * For fenced PHB and frozen PE, it's handled as normal
+ * event. We have to remove the affected PHBs for dead
+ * PHB and IOC
+ */
+ if (rc == 2 || rc == 1)
+ eeh_handle_normal_event(pe);
+ else {
+ list_for_each_entry_safe(hose, tmp,
+ &hose_list, list_node) {
+ phb_pe = eeh_phb_pe_get(hose);
+ if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
+ continue;
+
+ bus = eeh_pe_bus_get(phb_pe);
+ /* Notify all devices that they're about to go down. */
+ eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
+ pcibios_remove_pci_devices(bus);
+ }
+ }
+}
+
+/**
+ * eeh_handle_event - Reset a PCI device after hard lockup.
+ * @pe: EEH PE
+ *
+ * While PHB detects address or data parity errors on particular PCI
+ * slot, the associated PE will be frozen. Besides, DMA's occurring
+ * to wild addresses (which usually happen due to bugs in device
+ * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
+ * #PERR or other misc PCI-related errors also can trigger EEH errors.
+ *
+ * Recovery process consists of unplugging the device driver (which
+ * generated hotplug events to userspace), then issuing a PCI #RST to
+ * the device, then reconfiguring the PCI config space for all bridges
+ * & devices under this slot, and then finally restarting the device
+ * drivers (which cause a second set of hotplug events to go out to
+ * userspace).
+ */
+void eeh_handle_event(struct eeh_pe *pe)
+{
+ if (pe)
+ eeh_handle_normal_event(pe);
+ else
+ eeh_handle_special_event();
+}
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
new file mode 100644
index 000000000000..d27c5afc90ae
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -0,0 +1,182 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
+ */
+
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <asm/eeh_event.h>
+#include <asm/ppc-pci.h>
+
+/** Overview:
+ * EEH error states may be detected within exception handlers;
+ * however, the recovery processing needs to occur asynchronously
+ * in a normal kernel context and not an interrupt context.
+ * This pair of routines creates an event and queues it onto a
+ * work-queue, where a worker thread can drive recovery.
+ */
+
+static DEFINE_SPINLOCK(eeh_eventlist_lock);
+static struct semaphore eeh_eventlist_sem;
+LIST_HEAD(eeh_eventlist);
+
+/**
+ * eeh_event_handler - Dispatch EEH events.
+ * @dummy - unused
+ *
+ * The detection of a frozen slot can occur inside an interrupt,
+ * where it can be hard to do anything about it. The goal of this
+ * routine is to pull these detection events out of the context
+ * of the interrupt handler, and re-dispatch them for processing
+ * at a later time in a normal context.
+ */
+static int eeh_event_handler(void * dummy)
+{
+ unsigned long flags;
+ struct eeh_event *event;
+ struct eeh_pe *pe;
+
+ while (!kthread_should_stop()) {
+ if (down_interruptible(&eeh_eventlist_sem))
+ break;
+
+ /* Fetch EEH event from the queue */
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ event = NULL;
+ if (!list_empty(&eeh_eventlist)) {
+ event = list_entry(eeh_eventlist.next,
+ struct eeh_event, list);
+ list_del(&event->list);
+ }
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+ if (!event)
+ continue;
+
+ /* We might have event without binding PE */
+ pe = event->pe;
+ if (pe) {
+ eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+ pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n",
+ pe->phb->global_number, pe->addr);
+ eeh_handle_event(pe);
+ eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
+ } else {
+ eeh_handle_event(NULL);
+ }
+
+ kfree(event);
+ }
+
+ return 0;
+}
+
+/**
+ * eeh_event_init - Start kernel thread to handle EEH events
+ *
+ * This routine is called to start the kernel thread for processing
+ * EEH event.
+ */
+int eeh_event_init(void)
+{
+ struct task_struct *t;
+ int ret = 0;
+
+ /* Initialize semaphore */
+ sema_init(&eeh_eventlist_sem, 0);
+
+ t = kthread_run(eeh_event_handler, NULL, "eehd");
+ if (IS_ERR(t)) {
+ ret = PTR_ERR(t);
+ pr_err("%s: Failed to start EEH daemon (%d)\n",
+ __func__, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * eeh_send_failure_event - Generate a PCI error event
+ * @pe: EEH PE
+ *
+ * This routine can be called within an interrupt context;
+ * the actual event will be delivered in a normal context
+ * (from a workqueue).
+ */
+int eeh_send_failure_event(struct eeh_pe *pe)
+{
+ unsigned long flags;
+ struct eeh_event *event;
+
+ event = kzalloc(sizeof(*event), GFP_ATOMIC);
+ if (!event) {
+ pr_err("EEH: out of memory, event not handled\n");
+ return -ENOMEM;
+ }
+ event->pe = pe;
+
+ /* We may or may not be called in an interrupt context */
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ list_add(&event->list, &eeh_eventlist);
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+
+ /* For EEH deamon to knick in */
+ up(&eeh_eventlist_sem);
+
+ return 0;
+}
+
+/**
+ * eeh_remove_event - Remove EEH event from the queue
+ * @pe: Event binding to the PE
+ *
+ * On PowerNV platform, we might have subsequent coming events
+ * is part of the former one. For that case, those subsequent
+ * coming events are totally duplicated and unnecessary, thus
+ * they should be removed.
+ */
+void eeh_remove_event(struct eeh_pe *pe)
+{
+ unsigned long flags;
+ struct eeh_event *event, *tmp;
+
+ spin_lock_irqsave(&eeh_eventlist_lock, flags);
+ list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
+ /*
+ * If we don't have valid PE passed in, that means
+ * we already have event corresponding to dead IOC
+ * and all events should be purged.
+ */
+ if (!pe) {
+ list_del(&event->list);
+ kfree(event);
+ } else if (pe->type & EEH_PE_PHB) {
+ if (event->pe && event->pe->phb == pe->phb) {
+ list_del(&event->list);
+ kfree(event);
+ }
+ } else if (event->pe == pe) {
+ list_del(&event->list);
+ kfree(event);
+ }
+ }
+ spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
+}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
new file mode 100644
index 000000000000..016588a6f5ed
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -0,0 +1,800 @@
+/*
+ * The file intends to implement PE based on the information from
+ * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device.
+ * All the PEs should be organized as hierarchy tree. The first level
+ * of the tree will be associated to existing PHBs since the particular
+ * PE is only meaningful in one PHB domain.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+static LIST_HEAD(eeh_phb_pe);
+
+/**
+ * eeh_pe_alloc - Allocate PE
+ * @phb: PCI controller
+ * @type: PE type
+ *
+ * Allocate PE instance dynamically.
+ */
+static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type)
+{
+ struct eeh_pe *pe;
+
+ /* Allocate PHB PE */
+ pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL);
+ if (!pe) return NULL;
+
+ /* Initialize PHB PE */
+ pe->type = type;
+ pe->phb = phb;
+ INIT_LIST_HEAD(&pe->child_list);
+ INIT_LIST_HEAD(&pe->child);
+ INIT_LIST_HEAD(&pe->edevs);
+
+ return pe;
+}
+
+/**
+ * eeh_phb_pe_create - Create PHB PE
+ * @phb: PCI controller
+ *
+ * The function should be called while the PHB is detected during
+ * system boot or PCI hotplug in order to create PHB PE.
+ */
+int eeh_phb_pe_create(struct pci_controller *phb)
+{
+ struct eeh_pe *pe;
+
+ /* Allocate PHB PE */
+ pe = eeh_pe_alloc(phb, EEH_PE_PHB);
+ if (!pe) {
+ pr_err("%s: out of memory!\n", __func__);
+ return -ENOMEM;
+ }
+
+ /* Put it into the list */
+ list_add_tail(&pe->child, &eeh_phb_pe);
+
+ pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
+
+ return 0;
+}
+
+/**
+ * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB
+ * @phb: PCI controller
+ *
+ * The overall PEs form hierarchy tree. The first layer of the
+ * hierarchy tree is composed of PHB PEs. The function is used
+ * to retrieve the corresponding PHB PE according to the given PHB.
+ */
+struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb)
+{
+ struct eeh_pe *pe;
+
+ list_for_each_entry(pe, &eeh_phb_pe, child) {
+ /*
+ * Actually, we needn't check the type since
+ * the PE for PHB has been determined when that
+ * was created.
+ */
+ if ((pe->type & EEH_PE_PHB) && pe->phb == phb)
+ return pe;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_next - Retrieve the next PE in the tree
+ * @pe: current PE
+ * @root: root PE
+ *
+ * The function is used to retrieve the next PE in the
+ * hierarchy PE tree.
+ */
+static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe,
+ struct eeh_pe *root)
+{
+ struct list_head *next = pe->child_list.next;
+
+ if (next == &pe->child_list) {
+ while (1) {
+ if (pe == root)
+ return NULL;
+ next = pe->child.next;
+ if (next != &pe->parent->child_list)
+ break;
+ pe = pe->parent;
+ }
+ }
+
+ return list_entry(next, struct eeh_pe, child);
+}
+
+/**
+ * eeh_pe_traverse - Traverse PEs in the specified PHB
+ * @root: root PE
+ * @fn: callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the specified PE and its
+ * child PEs. The traversing is to be terminated once the
+ * callback returns something other than NULL, or no more PEs
+ * to be traversed.
+ */
+static void *eeh_pe_traverse(struct eeh_pe *root,
+ eeh_traverse_func fn, void *flag)
+{
+ struct eeh_pe *pe;
+ void *ret;
+
+ for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+ ret = fn(pe, flag);
+ if (ret) return ret;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_dev_traverse - Traverse the devices from the PE
+ * @root: EEH PE
+ * @fn: function callback
+ * @flag: extra parameter to callback
+ *
+ * The function is used to traverse the devices of the specified
+ * PE and its child PEs.
+ */
+void *eeh_pe_dev_traverse(struct eeh_pe *root,
+ eeh_traverse_func fn, void *flag)
+{
+ struct eeh_pe *pe;
+ struct eeh_dev *edev;
+ void *ret;
+
+ if (!root) {
+ pr_warning("%s: Invalid PE %p\n", __func__, root);
+ return NULL;
+ }
+
+ /* Traverse root PE */
+ for (pe = root; pe; pe = eeh_pe_next(pe, root)) {
+ eeh_pe_for_each_dev(pe, edev) {
+ ret = fn(edev, flag);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * __eeh_pe_get - Check the PE address
+ * @data: EEH PE
+ * @flag: EEH device
+ *
+ * For one particular PE, it can be identified by PE address
+ * or tranditional BDF address. BDF address is composed of
+ * Bus/Device/Function number. The extra data referred by flag
+ * indicates which type of address should be used.
+ */
+static void *__eeh_pe_get(void *data, void *flag)
+{
+ struct eeh_pe *pe = (struct eeh_pe *)data;
+ struct eeh_dev *edev = (struct eeh_dev *)flag;
+
+ /* Unexpected PHB PE */
+ if (pe->type & EEH_PE_PHB)
+ return NULL;
+
+ /* We prefer PE address */
+ if (edev->pe_config_addr &&
+ (edev->pe_config_addr == pe->addr))
+ return pe;
+
+ /* Try BDF address */
+ if (edev->config_addr &&
+ (edev->config_addr == pe->config_addr))
+ return pe;
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_get - Search PE based on the given address
+ * @edev: EEH device
+ *
+ * Search the corresponding PE based on the specified address which
+ * is included in the eeh device. The function is used to check if
+ * the associated PE has been created against the PE address. It's
+ * notable that the PE address has 2 format: traditional PE address
+ * which is composed of PCI bus/device/function number, or unified
+ * PE address.
+ */
+struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+{
+ struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
+ struct eeh_pe *pe;
+
+ pe = eeh_pe_traverse(root, __eeh_pe_get, edev);
+
+ return pe;
+}
+
+/**
+ * eeh_pe_get_parent - Retrieve the parent PE
+ * @edev: EEH device
+ *
+ * The whole PEs existing in the system are organized as hierarchy
+ * tree. The function is used to retrieve the parent PE according
+ * to the parent EEH device.
+ */
+static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
+{
+ struct device_node *dn;
+ struct eeh_dev *parent;
+
+ /*
+ * It might have the case for the indirect parent
+ * EEH device already having associated PE, but
+ * the direct parent EEH device doesn't have yet.
+ */
+ dn = edev->dn->parent;
+ while (dn) {
+ /* We're poking out of PCI territory */
+ if (!PCI_DN(dn)) return NULL;
+
+ parent = of_node_to_eeh_dev(dn);
+ /* We're poking out of PCI territory */
+ if (!parent) return NULL;
+
+ if (parent->pe)
+ return parent->pe;
+
+ dn = dn->parent;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_add_to_parent_pe - Add EEH device to parent PE
+ * @edev: EEH device
+ *
+ * Add EEH device to the parent PE. If the parent PE already
+ * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
+ * we have to create new PE to hold the EEH device and the new
+ * PE will be linked to its parent PE as well.
+ */
+int eeh_add_to_parent_pe(struct eeh_dev *edev)
+{
+ struct eeh_pe *pe, *parent;
+
+ /*
+ * Search the PE has been existing or not according
+ * to the PE address. If that has been existing, the
+ * PE should be composed of PCI bus and its subordinate
+ * components.
+ */
+ pe = eeh_pe_get(edev);
+ if (pe && !(pe->type & EEH_PE_INVALID)) {
+ if (!edev->pe_config_addr) {
+ pr_err("%s: PE with addr 0x%x already exists\n",
+ __func__, edev->config_addr);
+ return -EEXIST;
+ }
+
+ /* Mark the PE as type of PCI bus */
+ pe->type = EEH_PE_BUS;
+ edev->pe = pe;
+
+ /* Put the edev to PE */
+ list_add_tail(&edev->list, &pe->edevs);
+ pr_debug("EEH: Add %s to Bus PE#%x\n",
+ edev->dn->full_name, pe->addr);
+
+ return 0;
+ } else if (pe && (pe->type & EEH_PE_INVALID)) {
+ list_add_tail(&edev->list, &pe->edevs);
+ edev->pe = pe;
+ /*
+ * We're running to here because of PCI hotplug caused by
+ * EEH recovery. We need clear EEH_PE_INVALID until the top.
+ */
+ parent = pe;
+ while (parent) {
+ if (!(parent->type & EEH_PE_INVALID))
+ break;
+ parent->type &= ~EEH_PE_INVALID;
+ parent = parent->parent;
+ }
+ pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+ edev->dn->full_name, pe->addr, pe->parent->addr);
+
+ return 0;
+ }
+
+ /* Create a new EEH PE */
+ pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+ if (!pe) {
+ pr_err("%s: out of memory!\n", __func__);
+ return -ENOMEM;
+ }
+ pe->addr = edev->pe_config_addr;
+ pe->config_addr = edev->config_addr;
+
+ /*
+ * While doing PE reset, we probably hot-reset the
+ * upstream bridge. However, the PCI devices including
+ * the associated EEH devices might be removed when EEH
+ * core is doing recovery. So that won't safe to retrieve
+ * the bridge through downstream EEH device. We have to
+ * trace the parent PCI bus, then the upstream bridge.
+ */
+ if (eeh_probe_mode_dev())
+ pe->bus = eeh_dev_to_pci_dev(edev)->bus;
+
+ /*
+ * Put the new EEH PE into hierarchy tree. If the parent
+ * can't be found, the newly created PE will be attached
+ * to PHB directly. Otherwise, we have to associate the
+ * PE with its parent.
+ */
+ parent = eeh_pe_get_parent(edev);
+ if (!parent) {
+ parent = eeh_phb_pe_get(edev->phb);
+ if (!parent) {
+ pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
+ __func__, edev->phb->global_number);
+ edev->pe = NULL;
+ kfree(pe);
+ return -EEXIST;
+ }
+ }
+ pe->parent = parent;
+
+ /*
+ * Put the newly created PE into the child list and
+ * link the EEH device accordingly.
+ */
+ list_add_tail(&pe->child, &parent->child_list);
+ list_add_tail(&edev->list, &pe->edevs);
+ edev->pe = pe;
+ pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n",
+ edev->dn->full_name, pe->addr, pe->parent->addr);
+
+ return 0;
+}
+
+/**
+ * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE
+ * @edev: EEH device
+ * @purge_pe: remove PE or not
+ *
+ * The PE hierarchy tree might be changed when doing PCI hotplug.
+ * Also, the PCI devices or buses could be removed from the system
+ * during EEH recovery. So we have to call the function remove the
+ * corresponding PE accordingly if necessary.
+ */
+int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe)
+{
+ struct eeh_pe *pe, *parent, *child;
+ int cnt;
+
+ if (!edev->pe) {
+ pr_warning("%s: No PE found for EEH device %s\n",
+ __func__, edev->dn->full_name);
+ return -EEXIST;
+ }
+
+ /* Remove the EEH device */
+ pe = edev->pe;
+ edev->pe = NULL;
+ list_del(&edev->list);
+
+ /*
+ * Check if the parent PE includes any EEH devices.
+ * If not, we should delete that. Also, we should
+ * delete the parent PE if it doesn't have associated
+ * child PEs and EEH devices.
+ */
+ while (1) {
+ parent = pe->parent;
+ if (pe->type & EEH_PE_PHB)
+ break;
+
+ if (purge_pe) {
+ if (list_empty(&pe->edevs) &&
+ list_empty(&pe->child_list)) {
+ list_del(&pe->child);
+ kfree(pe);
+ } else {
+ break;
+ }
+ } else {
+ if (list_empty(&pe->edevs)) {
+ cnt = 0;
+ list_for_each_entry(child, &pe->child_list, child) {
+ if (!(child->type & EEH_PE_INVALID)) {
+ cnt++;
+ break;
+ }
+ }
+
+ if (!cnt)
+ pe->type |= EEH_PE_INVALID;
+ else
+ break;
+ }
+ }
+
+ pe = parent;
+ }
+
+ return 0;
+}
+
+/**
+ * eeh_pe_update_time_stamp - Update PE's frozen time stamp
+ * @pe: EEH PE
+ *
+ * We have time stamp for each PE to trace its time of getting
+ * frozen in last hour. The function should be called to update
+ * the time stamp on first error of the specific PE. On the other
+ * handle, we needn't account for errors happened in last hour.
+ */
+void eeh_pe_update_time_stamp(struct eeh_pe *pe)
+{
+ struct timeval tstamp;
+
+ if (!pe) return;
+
+ if (pe->freeze_count <= 0) {
+ pe->freeze_count = 0;
+ do_gettimeofday(&pe->tstamp);
+ } else {
+ do_gettimeofday(&tstamp);
+ if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+ pe->tstamp = tstamp;
+ pe->freeze_count = 0;
+ }
+ }
+}
+
+/**
+ * __eeh_pe_state_mark - Mark the state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to mark the indicated state for the given
+ * PE. Also, the associated PCI devices will be put into IO frozen
+ * state as well.
+ */
+static void *__eeh_pe_state_mark(void *data, void *flag)
+{
+ struct eeh_pe *pe = (struct eeh_pe *)data;
+ int state = *((int *)flag);
+ struct eeh_dev *tmp;
+ struct pci_dev *pdev;
+
+ /*
+ * Mark the PE with the indicated state. Also,
+ * the associated PCI device will be put into
+ * I/O frozen state to avoid I/O accesses from
+ * the PCI device driver.
+ */
+ pe->state |= state;
+ eeh_pe_for_each_dev(pe, tmp) {
+ pdev = eeh_dev_to_pci_dev(tmp);
+ if (pdev)
+ pdev->error_state = pci_channel_io_frozen;
+ }
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_state_mark - Mark specified state for PE and its associated device
+ * @pe: EEH PE
+ *
+ * EEH error affects the current PE and its child PEs. The function
+ * is used to mark appropriate state for the affected PEs and the
+ * associated devices.
+ */
+void eeh_pe_state_mark(struct eeh_pe *pe, int state)
+{
+ eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
+}
+
+/**
+ * __eeh_pe_state_clear - Clear state for the PE
+ * @data: EEH PE
+ * @flag: state
+ *
+ * The function is used to clear the indicated state from the
+ * given PE. Besides, we also clear the check count of the PE
+ * as well.
+ */
+static void *__eeh_pe_state_clear(void *data, void *flag)
+{
+ struct eeh_pe *pe = (struct eeh_pe *)data;
+ int state = *((int *)flag);
+
+ pe->state &= ~state;
+ pe->check_count = 0;
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_state_clear - Clear state for the PE and its children
+ * @pe: PE
+ * @state: state to be cleared
+ *
+ * When the PE and its children has been recovered from error,
+ * we need clear the error state for that. The function is used
+ * for the purpose.
+ */
+void eeh_pe_state_clear(struct eeh_pe *pe, int state)
+{
+ eeh_pe_traverse(pe, __eeh_pe_state_clear, &state);
+}
+
+/*
+ * Some PCI bridges (e.g. PLX bridges) have primary/secondary
+ * buses assigned explicitly by firmware, and we probably have
+ * lost that after reset. So we have to delay the check until
+ * the PCI-CFG registers have been restored for the parent
+ * bridge.
+ *
+ * Don't use normal PCI-CFG accessors, which probably has been
+ * blocked on normal path during the stage. So we need utilize
+ * eeh operations, which is always permitted.
+ */
+static void eeh_bridge_check_link(struct pci_dev *pdev,
+ struct device_node *dn)
+{
+ int cap;
+ uint32_t val;
+ int timeout = 0;
+
+ /*
+ * We only check root port and downstream ports of
+ * PCIe switches
+ */
+ if (!pci_is_pcie(pdev) ||
+ (pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT &&
+ pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM))
+ return;
+
+ pr_debug("%s: Check PCIe link for %s ...\n",
+ __func__, pci_name(pdev));
+
+ /* Check slot status */
+ cap = pdev->pcie_cap;
+ eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val);
+ if (!(val & PCI_EXP_SLTSTA_PDS)) {
+ pr_debug(" No card in the slot (0x%04x) !\n", val);
+ return;
+ }
+
+ /* Check power status if we have the capability */
+ eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val);
+ if (val & PCI_EXP_SLTCAP_PCP) {
+ eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val);
+ if (val & PCI_EXP_SLTCTL_PCC) {
+ pr_debug(" In power-off state, power it on ...\n");
+ val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC);
+ val |= (0x0100 & PCI_EXP_SLTCTL_PIC);
+ eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val);
+ msleep(2 * 1000);
+ }
+ }
+
+ /* Enable link */
+ eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val);
+ val &= ~PCI_EXP_LNKCTL_LD;
+ eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val);
+
+ /* Check link */
+ eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val);
+ if (!(val & PCI_EXP_LNKCAP_DLLLARC)) {
+ pr_debug(" No link reporting capability (0x%08x) \n", val);
+ msleep(1000);
+ return;
+ }
+
+ /* Wait the link is up until timeout (5s) */
+ timeout = 0;
+ while (timeout < 5000) {
+ msleep(20);
+ timeout += 20;
+
+ eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val);
+ if (val & PCI_EXP_LNKSTA_DLLLA)
+ break;
+ }
+
+ if (val & PCI_EXP_LNKSTA_DLLLA)
+ pr_debug(" Link up (%s)\n",
+ (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? "2.5GB" : "5GB");
+ else
+ pr_debug(" Link not ready (0x%04x)\n", val);
+}
+
+#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
+#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)])
+
+static void eeh_restore_bridge_bars(struct pci_dev *pdev,
+ struct eeh_dev *edev,
+ struct device_node *dn)
+{
+ int i;
+
+ /*
+ * Device BARs: 0x10 - 0x18
+ * Bus numbers and windows: 0x18 - 0x30
+ */
+ for (i = 4; i < 13; i++)
+ eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+ /* Rom: 0x38 */
+ eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]);
+
+ /* Cache line & Latency timer: 0xC 0xD */
+ eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+ SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+ eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+ SAVED_BYTE(PCI_LATENCY_TIMER));
+ /* Max latency, min grant, interrupt ping and line: 0x3C */
+ eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+ /* PCI Command: 0x4 */
+ eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]);
+
+ /* Check the PCIe link is ready */
+ eeh_bridge_check_link(pdev, dn);
+}
+
+static void eeh_restore_device_bars(struct eeh_dev *edev,
+ struct device_node *dn)
+{
+ int i;
+ u32 cmd;
+
+ for (i = 4; i < 10; i++)
+ eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]);
+ /* 12 == Expansion ROM Address */
+ eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]);
+
+ eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1,
+ SAVED_BYTE(PCI_CACHE_LINE_SIZE));
+ eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1,
+ SAVED_BYTE(PCI_LATENCY_TIMER));
+
+ /* max latency, min grant, interrupt pin and line */
+ eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]);
+
+ /*
+ * Restore PERR & SERR bits, some devices require it,
+ * don't touch the other command bits
+ */
+ eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd);
+ if (edev->config_space[1] & PCI_COMMAND_PARITY)
+ cmd |= PCI_COMMAND_PARITY;
+ else
+ cmd &= ~PCI_COMMAND_PARITY;
+ if (edev->config_space[1] & PCI_COMMAND_SERR)
+ cmd |= PCI_COMMAND_SERR;
+ else
+ cmd &= ~PCI_COMMAND_SERR;
+ eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd);
+}
+
+/**
+ * eeh_restore_one_device_bars - Restore the Base Address Registers for one device
+ * @data: EEH device
+ * @flag: Unused
+ *
+ * Loads the PCI configuration space base address registers,
+ * the expansion ROM base address, the latency timer, and etc.
+ * from the saved values in the device node.
+ */
+static void *eeh_restore_one_device_bars(void *data, void *flag)
+{
+ struct pci_dev *pdev = NULL;
+ struct eeh_dev *edev = (struct eeh_dev *)data;
+ struct device_node *dn = eeh_dev_to_of_node(edev);
+
+ /* Trace the PCI bridge */
+ if (eeh_probe_mode_dev()) {
+ pdev = eeh_dev_to_pci_dev(edev);
+ if (pdev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
+ pdev = NULL;
+ }
+
+ if (pdev)
+ eeh_restore_bridge_bars(pdev, edev, dn);
+ else
+ eeh_restore_device_bars(edev, dn);
+
+ return NULL;
+}
+
+/**
+ * eeh_pe_restore_bars - Restore the PCI config space info
+ * @pe: EEH PE
+ *
+ * This routine performs a recursive walk to the children
+ * of this device as well.
+ */
+void eeh_pe_restore_bars(struct eeh_pe *pe)
+{
+ /*
+ * We needn't take the EEH lock since eeh_pe_dev_traverse()
+ * will take that.
+ */
+ eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL);
+}
+
+/**
+ * eeh_pe_bus_get - Retrieve PCI bus according to the given PE
+ * @pe: EEH PE
+ *
+ * Retrieve the PCI bus according to the given PE. Basically,
+ * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the
+ * primary PCI bus will be retrieved. The parent bus will be
+ * returned for BUS PE. However, we don't have associated PCI
+ * bus for DEVICE PE.
+ */
+struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
+{
+ struct pci_bus *bus = NULL;
+ struct eeh_dev *edev;
+ struct pci_dev *pdev;
+
+ if (pe->type & EEH_PE_PHB) {
+ bus = pe->phb->bus;
+ } else if (pe->type & EEH_PE_BUS ||
+ pe->type & EEH_PE_DEVICE) {
+ if (pe->bus) {
+ bus = pe->bus;
+ goto out;
+ }
+
+ edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
+ pdev = eeh_dev_to_pci_dev(edev);
+ if (pdev)
+ bus = pdev->bus;
+ }
+
+out:
+ return bus;
+}
diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
new file mode 100644
index 000000000000..e7ae3484918c
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -0,0 +1,74 @@
+/*
+ * Sysfs entries for PCI Error Recovery for PAPR-compliant platform.
+ * Copyright IBM Corporation 2007
+ * Copyright Linas Vepstas <linas@austin.ibm.com> 2007
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <asm/ppc-pci.h>
+#include <asm/pci-bridge.h>
+
+/**
+ * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic
+ * @_name: name of file in sysfs directory
+ * @_memb: name of member in struct pci_dn to access
+ * @_format: printf format for display
+ *
+ * All of the attributes look very similar, so just
+ * auto-gen a cut-n-paste routine to display them.
+ */
+#define EEH_SHOW_ATTR(_name,_memb,_format) \
+static ssize_t eeh_show_##_name(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ struct pci_dev *pdev = to_pci_dev(dev); \
+ struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); \
+ \
+ if (!edev) \
+ return 0; \
+ \
+ return sprintf(buf, _format "\n", edev->_memb); \
+} \
+static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
+
+EEH_SHOW_ATTR(eeh_mode, mode, "0x%x");
+EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x");
+EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x");
+
+void eeh_sysfs_add_device(struct pci_dev *pdev)
+{
+ int rc=0;
+
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
+ rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+
+ if (rc)
+ printk(KERN_WARNING "EEH: Unable to create sysfs entries\n");
+}
+
+void eeh_sysfs_remove_device(struct pci_dev *pdev)
+{
+ device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
+ device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
+}
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 8741c854e03d..ab15b8d057ad 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -629,21 +629,43 @@ _GLOBAL(ret_from_except_lite)
CURRENT_THREAD_INFO(r9, r1)
ld r3,_MSR(r1)
+#ifdef CONFIG_PPC_BOOK3E
+ ld r10,PACACURRENT(r13)
+#endif /* CONFIG_PPC_BOOK3E */
ld r4,TI_FLAGS(r9)
andi. r3,r3,MSR_PR
beq resume_kernel
+#ifdef CONFIG_PPC_BOOK3E
+ lwz r3,(THREAD+THREAD_DBCR0)(r10)
+#endif /* CONFIG_PPC_BOOK3E */
/* Check current_thread_info()->flags */
andi. r0,r4,_TIF_USER_WORK_MASK
+#ifdef CONFIG_PPC_BOOK3E
+ bne 1f
+ /*
+ * Check to see if the dbcr0 register is set up to debug.
+ * Use the internal debug mode bit to do this.
+ */
+ andis. r0,r3,DBCR0_IDM@h
beq restore
-
- andi. r0,r4,_TIF_NEED_RESCHED
- beq 1f
+ mfmsr r0
+ rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */
+ mtmsr r0
+ mtspr SPRN_DBCR0,r3
+ li r10, -1
+ mtspr SPRN_DBSR,r10
+ b restore
+#else
+ beq restore
+#endif
+1: andi. r0,r4,_TIF_NEED_RESCHED
+ beq 2f
bl .restore_interrupts
SCHEDULE_USER
b .ret_from_except_lite
-1: bl .save_nvgprs
+2: bl .save_nvgprs
bl .restore_interrupts
addi r3,r1,STACK_FRAME_OVERHEAD
bl .do_notify_resume
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 40e4a17c8ba0..4e00d223b2e3 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -341,10 +341,17 @@ vsx_unavailable_pSeries_1:
EXCEPTION_PROLOG_0(PACA_EXGEN)
b vsx_unavailable_pSeries
+facility_unavailable_trampoline:
. = 0xf60
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXGEN)
- b tm_unavailable_pSeries
+ b facility_unavailable_pSeries
+
+hv_facility_unavailable_trampoline:
+ . = 0xf80
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b facility_unavailable_hv
#ifdef CONFIG_CBE_RAS
STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
@@ -522,8 +529,10 @@ denorm_done:
KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20)
STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40)
- STD_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable)
+ STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60)
+ STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable)
+ KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82)
/*
* An interrupt came in while soft-disabled. We set paca->irq_happened, then:
@@ -793,14 +802,10 @@ system_call_relon_pSeries:
STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step)
. = 0x4e00
- SET_SCRATCH0(r13)
- EXCEPTION_PROLOG_0(PACA_EXGEN)
- b h_data_storage_relon_hv
+ b . /* Can't happen, see v2.07 Book III-S section 6.5 */
. = 0x4e20
- SET_SCRATCH0(r13)
- EXCEPTION_PROLOG_0(PACA_EXGEN)
- b h_instr_storage_relon_hv
+ b . /* Can't happen, see v2.07 Book III-S section 6.5 */
. = 0x4e40
SET_SCRATCH0(r13)
@@ -808,9 +813,7 @@ system_call_relon_pSeries:
b emulation_assist_relon_hv
. = 0x4e60
- SET_SCRATCH0(r13)
- EXCEPTION_PROLOG_0(PACA_EXGEN)
- b hmi_exception_relon_hv
+ b . /* Can't happen, see v2.07 Book III-S section 6.5 */
. = 0x4e80
SET_SCRATCH0(r13)
@@ -835,11 +838,17 @@ vsx_unavailable_relon_pSeries_1:
EXCEPTION_PROLOG_0(PACA_EXGEN)
b vsx_unavailable_relon_pSeries
-tm_unavailable_relon_pSeries_1:
+facility_unavailable_relon_trampoline:
. = 0x4f60
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXGEN)
- b tm_unavailable_relon_pSeries
+ b facility_unavailable_relon_pSeries
+
+hv_facility_unavailable_relon_trampoline:
+ . = 0x4f80
+ SET_SCRATCH0(r13)
+ EXCEPTION_PROLOG_0(PACA_EXGEN)
+ b facility_unavailable_relon_hv
STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint)
#ifdef CONFIG_PPC_DENORMALISATION
@@ -1165,36 +1174,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
bl .vsx_unavailable_exception
b .ret_from_except
- .align 7
- .globl tm_unavailable_common
-tm_unavailable_common:
- EXCEPTION_PROLOG_COMMON(0xf60, PACA_EXGEN)
- bl .save_nvgprs
- DISABLE_INTS
- addi r3,r1,STACK_FRAME_OVERHEAD
- bl .tm_unavailable_exception
- b .ret_from_except
+ STD_EXCEPTION_COMMON(0xf60, facility_unavailable, .facility_unavailable_exception)
.align 7
.globl __end_handlers
__end_handlers:
/* Equivalents to the above handlers for relocation-on interrupt vectors */
- STD_RELON_EXCEPTION_HV_OOL(0xe00, h_data_storage)
- KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe00)
- STD_RELON_EXCEPTION_HV_OOL(0xe20, h_instr_storage)
- KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe20)
STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
- KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40)
- STD_RELON_EXCEPTION_HV_OOL(0xe60, hmi_exception)
- KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe60)
MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
- KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe80)
STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
- STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable)
+ STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
+ STD_RELON_EXCEPTION_HV_OOL(0xf80, facility_unavailable)
#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
/*
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index a949bdfc9623..f0b47d1a6b0e 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -176,7 +176,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
length_max = 512 ; /* 64 doublewords */
/* DAWR region can't cross 512 boundary */
if ((bp->attr.bp_addr >> 10) !=
- ((bp->attr.bp_addr + bp->attr.bp_len) >> 10))
+ ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10))
return -EINVAL;
}
if (info->len >
@@ -250,6 +250,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args)
* we still need to single-step the instruction, but we don't
* generate an event.
*/
+ info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
if (!((bp->attr.bp_addr <= dar) &&
(dar - bp->attr.bp_addr < bp->attr.bp_len)))
info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 939ea7ef0dc8..d7216c9abda1 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -85,7 +85,7 @@ int powersave_nap;
/*
* Register the sysctl to set/clear powersave_nap.
*/
-static ctl_table powersave_nap_ctl_table[]={
+static struct ctl_table powersave_nap_ctl_table[] = {
{
.procname = "powersave-nap",
.data = &powersave_nap,
@@ -95,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={
},
{}
};
-static ctl_table powersave_nap_sysctl_root[] = {
+static struct ctl_table powersave_nap_sysctl_root[] = {
{
.procname = "kernel",
.mode = 0555,
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 50e90b7e7139..fa0b54b2a362 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -55,6 +55,7 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr)
struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
{
+ unsigned hugepage_shift;
struct iowa_bus *bus;
int token;
@@ -70,11 +71,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
return NULL;
- ptep = find_linux_pte(init_mm.pgd, vaddr);
+ ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+ &hugepage_shift);
if (ptep == NULL)
paddr = 0;
- else
+ else {
+ /*
+ * we don't have hugepages backing iomem
+ */
+ WARN_ON(hugepage_shift);
paddr = pte_pfn(*ptep) << PAGE_SHIFT;
+ }
bus = iowa_pci_find(vaddr, paddr);
if (bus == NULL)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index c0d0dbddfba1..b20ff173a671 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -36,6 +36,8 @@
#include <linux/hash.h>
#include <linux/fault-inject.h>
#include <linux/pci.h>
+#include <linux/iommu.h>
+#include <linux/sched.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
@@ -44,6 +46,7 @@
#include <asm/kdump.h>
#include <asm/fadump.h>
#include <asm/vio.h>
+#include <asm/tce.h>
#define DBG(...)
@@ -724,6 +727,13 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
if (tbl->it_offset == 0)
clear_bit(0, tbl->it_map);
+#ifdef CONFIG_IOMMU_API
+ if (tbl->it_group) {
+ iommu_group_put(tbl->it_group);
+ BUG_ON(tbl->it_group);
+ }
+#endif
+
/* verify that table contains no entries */
if (!bitmap_empty(tbl->it_map, tbl->it_size))
pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
@@ -860,3 +870,316 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
free_pages((unsigned long)vaddr, get_order(size));
}
}
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void group_release(void *iommu_data)
+{
+ struct iommu_table *tbl = iommu_data;
+ tbl->it_group = NULL;
+}
+
+void iommu_register_group(struct iommu_table *tbl,
+ int pci_domain_number, unsigned long pe_num)
+{
+ struct iommu_group *grp;
+ char *name;
+
+ grp = iommu_group_alloc();
+ if (IS_ERR(grp)) {
+ pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
+ PTR_ERR(grp));
+ return;
+ }
+ tbl->it_group = grp;
+ iommu_group_set_iommudata(grp, tbl, group_release);
+ name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
+ pci_domain_number, pe_num);
+ if (!name)
+ return;
+ iommu_group_set_name(grp, name);
+ kfree(name);
+}
+
+enum dma_data_direction iommu_tce_direction(unsigned long tce)
+{
+ if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
+ return DMA_BIDIRECTIONAL;
+ else if (tce & TCE_PCI_READ)
+ return DMA_TO_DEVICE;
+ else if (tce & TCE_PCI_WRITE)
+ return DMA_FROM_DEVICE;
+ else
+ return DMA_NONE;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_direction);
+
+void iommu_flush_tce(struct iommu_table *tbl)
+{
+ /* Flush/invalidate TLB caches if necessary */
+ if (ppc_md.tce_flush)
+ ppc_md.tce_flush(tbl);
+
+ /* Make sure updates are seen by hardware */
+ mb();
+}
+EXPORT_SYMBOL_GPL(iommu_flush_tce);
+
+int iommu_tce_clear_param_check(struct iommu_table *tbl,
+ unsigned long ioba, unsigned long tce_value,
+ unsigned long npages)
+{
+ /* ppc_md.tce_free() does not support any value but 0 */
+ if (tce_value)
+ return -EINVAL;
+
+ if (ioba & ~IOMMU_PAGE_MASK)
+ return -EINVAL;
+
+ ioba >>= IOMMU_PAGE_SHIFT;
+ if (ioba < tbl->it_offset)
+ return -EINVAL;
+
+ if ((ioba + npages) > (tbl->it_offset + tbl->it_size))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
+
+int iommu_tce_put_param_check(struct iommu_table *tbl,
+ unsigned long ioba, unsigned long tce)
+{
+ if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ return -EINVAL;
+
+ if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
+ return -EINVAL;
+
+ if (ioba & ~IOMMU_PAGE_MASK)
+ return -EINVAL;
+
+ ioba >>= IOMMU_PAGE_SHIFT;
+ if (ioba < tbl->it_offset)
+ return -EINVAL;
+
+ if ((ioba + 1) > (tbl->it_offset + tbl->it_size))
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
+
+unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+{
+ unsigned long oldtce;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ spin_lock(&(pool->lock));
+
+ oldtce = ppc_md.tce_get(tbl, entry);
+ if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
+ ppc_md.tce_free(tbl, entry, 1);
+ else
+ oldtce = 0;
+
+ spin_unlock(&(pool->lock));
+
+ return oldtce;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tce);
+
+int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+ unsigned long entry, unsigned long pages)
+{
+ unsigned long oldtce;
+ struct page *page;
+
+ for ( ; pages; --pages, ++entry) {
+ oldtce = iommu_clear_tce(tbl, entry);
+ if (!oldtce)
+ continue;
+
+ page = pfn_to_page(oldtce >> PAGE_SHIFT);
+ WARN_ON(!page);
+ if (page) {
+ if (oldtce & TCE_PCI_WRITE)
+ SetPageDirty(page);
+ put_page(page);
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
+
+/*
+ * hwaddr is a kernel virtual address here (0xc... bazillion),
+ * tce_build converts it to a physical address.
+ */
+int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+ unsigned long hwaddr, enum dma_data_direction direction)
+{
+ int ret = -EBUSY;
+ unsigned long oldtce;
+ struct iommu_pool *pool = get_pool(tbl, entry);
+
+ spin_lock(&(pool->lock));
+
+ oldtce = ppc_md.tce_get(tbl, entry);
+ /* Add new entry if it is not busy */
+ if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+ ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
+
+ spin_unlock(&(pool->lock));
+
+ /* if (unlikely(ret))
+ pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
+ __func__, hwaddr, entry << IOMMU_PAGE_SHIFT,
+ hwaddr, ret); */
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_build);
+
+int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
+ unsigned long tce)
+{
+ int ret;
+ struct page *page = NULL;
+ unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
+ enum dma_data_direction direction = iommu_tce_direction(tce);
+
+ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+ direction != DMA_TO_DEVICE, &page);
+ if (unlikely(ret != 1)) {
+ /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
+ tce, entry << IOMMU_PAGE_SHIFT, ret); */
+ return -EFAULT;
+ }
+ hwaddr = (unsigned long) page_address(page) + offset;
+
+ ret = iommu_tce_build(tbl, entry, hwaddr, direction);
+ if (ret)
+ put_page(page);
+
+ if (ret < 0)
+ pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
+ __func__, entry << IOMMU_PAGE_SHIFT, tce, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+
+int iommu_take_ownership(struct iommu_table *tbl)
+{
+ unsigned long sz = (tbl->it_size + 7) >> 3;
+
+ if (tbl->it_offset == 0)
+ clear_bit(0, tbl->it_map);
+
+ if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
+ pr_err("iommu_tce: it_map is not empty");
+ return -EBUSY;
+ }
+
+ memset(tbl->it_map, 0xff, sz);
+ iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_take_ownership);
+
+void iommu_release_ownership(struct iommu_table *tbl)
+{
+ unsigned long sz = (tbl->it_size + 7) >> 3;
+
+ iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+ memset(tbl->it_map, 0, sz);
+
+ /* Restore bit#0 set by iommu_init_table() */
+ if (tbl->it_offset == 0)
+ set_bit(0, tbl->it_map);
+}
+EXPORT_SYMBOL_GPL(iommu_release_ownership);
+
+static int iommu_add_device(struct device *dev)
+{
+ struct iommu_table *tbl;
+ int ret = 0;
+
+ if (WARN_ON(dev->iommu_group)) {
+ pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
+ dev_name(dev),
+ iommu_group_id(dev->iommu_group));
+ return -EBUSY;
+ }
+
+ tbl = get_iommu_table_base(dev);
+ if (!tbl || !tbl->it_group) {
+ pr_debug("iommu_tce: skipping device %s with no tbl\n",
+ dev_name(dev));
+ return 0;
+ }
+
+ pr_debug("iommu_tce: adding %s to iommu group %d\n",
+ dev_name(dev), iommu_group_id(tbl->it_group));
+
+ ret = iommu_group_add_device(tbl->it_group, dev);
+ if (ret < 0)
+ pr_err("iommu_tce: %s has not been added, ret=%d\n",
+ dev_name(dev), ret);
+
+ return ret;
+}
+
+static void iommu_del_device(struct device *dev)
+{
+ iommu_group_remove_device(dev);
+}
+
+static int iommu_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ return iommu_add_device(dev);
+ case BUS_NOTIFY_DEL_DEVICE:
+ iommu_del_device(dev);
+ return 0;
+ default:
+ return 0;
+ }
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+ .notifier_call = iommu_bus_notifier,
+};
+
+static int __init tce_iommu_init(void)
+{
+ struct pci_dev *pdev = NULL;
+
+ BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+ for_each_pci_dev(pdev)
+ iommu_add_device(&pdev->dev);
+
+ bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+ return 0;
+}
+
+subsys_initcall_sync(tce_iommu_init);
+
+#else
+
+void iommu_register_group(struct iommu_table *tbl,
+ int pci_domain_number, unsigned long pe_num)
+{
+}
+
+#endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ea185e0b3cae..2e51cde616d2 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -116,8 +116,6 @@ static inline notrace int decrementer_check_overflow(void)
u64 now = get_tb_or_rtc();
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
- if (now >= *next_tb)
- set_dec(1);
return now >= *next_tb;
}
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 11f5b03a0b06..2156ea90eb54 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -36,12 +36,6 @@
#include <asm/sstep.h>
#include <asm/uaccess.h>
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
-#define MSR_SINGLESTEP (MSR_DE)
-#else
-#define MSR_SINGLESTEP (MSR_SE)
-#endif
-
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -104,19 +98,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
{
- /* We turn off async exceptions to ensure that the single step will
- * be for the instruction we have the kprobe on, if we dont its
- * possible we'd get the single step reported for an exception handler
- * like Decrementer or External Interrupt */
- regs->msr &= ~MSR_EE;
- regs->msr |= MSR_SINGLESTEP;
-#ifdef CONFIG_PPC_ADV_DEBUG_REGS
- regs->msr &= ~MSR_CE;
- mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
-#ifdef CONFIG_PPC_47x
- isync();
-#endif
-#endif
+ enable_single_step(regs);
/*
* On powerpc we should single step on the original
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 48fbc2b97e95..8213ee1eb05a 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -84,22 +84,30 @@ static ssize_t dev_nvram_read(struct file *file, char __user *buf,
char *tmp = NULL;
ssize_t size;
- ret = -ENODEV;
- if (!ppc_md.nvram_size)
+ if (!ppc_md.nvram_size) {
+ ret = -ENODEV;
goto out;
+ }
- ret = 0;
size = ppc_md.nvram_size();
- if (*ppos >= size || size < 0)
+ if (size < 0) {
+ ret = size;
+ goto out;
+ }
+
+ if (*ppos >= size) {
+ ret = 0;
goto out;
+ }
count = min_t(size_t, count, size - *ppos);
count = min(count, PAGE_SIZE);
- ret = -ENOMEM;
tmp = kmalloc(count, GFP_KERNEL);
- if (!tmp)
+ if (!tmp) {
+ ret = -ENOMEM;
goto out;
+ }
ret = ppc_md.nvram_read(tmp, count, ppos);
if (ret <= 0)
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
new file mode 100644
index 000000000000..3f608800c06b
--- /dev/null
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -0,0 +1,111 @@
+/*
+ * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c"
+ *
+ * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com>
+ * Copyright (C) 2005 International Business Machines
+ *
+ * Updates, 2005, John Rose <johnrose@austin.ibm.com>
+ * Updates, 2005, Linas Vepstas <linas@austin.ibm.com>
+ * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/pci.h>
+#include <linux/export.h>
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+#include <asm/firmware.h>
+#include <asm/eeh.h>
+
+/**
+ * __pcibios_remove_pci_devices - remove all devices under this bus
+ * @bus: the indicated PCI bus
+ * @purge_pe: destroy the PE on removal of PCI devices
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ * By default, the corresponding PE will be destroied during the
+ * normal PCI hotplug path. For PCI hotplug during EEH recovery,
+ * the corresponding PE won't be destroied and deallocated.
+ */
+void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe)
+{
+ struct pci_dev *dev, *tmp;
+ struct pci_bus *child_bus;
+
+ /* First go down child busses */
+ list_for_each_entry(child_bus, &bus->children, node)
+ __pcibios_remove_pci_devices(child_bus, purge_pe);
+
+ pr_debug("PCI: Removing devices on bus %04x:%02x\n",
+ pci_domain_nr(bus), bus->number);
+ list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
+ pr_debug(" * Removing %s...\n", pci_name(dev));
+ eeh_remove_bus_device(dev, purge_pe);
+ pci_stop_and_remove_bus_device(dev);
+ }
+}
+
+/**
+ * pcibios_remove_pci_devices - remove all devices under this bus
+ * @bus: the indicated PCI bus
+ *
+ * Remove all of the PCI devices under this bus both from the
+ * linux pci device tree, and from the powerpc EEH address cache.
+ */
+void pcibios_remove_pci_devices(struct pci_bus *bus)
+{
+ __pcibios_remove_pci_devices(bus, 1);
+}
+EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices);
+
+/**
+ * pcibios_add_pci_devices - adds new pci devices to bus
+ * @bus: the indicated PCI bus
+ *
+ * This routine will find and fixup new pci devices under
+ * the indicated bus. This routine presumes that there
+ * might already be some devices under this bridge, so
+ * it carefully tries to add only new devices. (And that
+ * is how this routine differs from other, similar pcibios
+ * routines.)
+ */
+void pcibios_add_pci_devices(struct pci_bus * bus)
+{
+ int slotno, num, mode, pass, max;
+ struct pci_dev *dev;
+ struct device_node *dn = pci_bus_to_OF_node(bus);
+
+ eeh_add_device_tree_early(dn);
+
+ mode = PCI_PROBE_NORMAL;
+ if (ppc_md.pci_probe_mode)
+ mode = ppc_md.pci_probe_mode(bus);
+
+ if (mode == PCI_PROBE_DEVTREE) {
+ /* use ofdt-based probe */
+ of_rescan_bus(dn, bus);
+ } else if (mode == PCI_PROBE_NORMAL) {
+ /* use legacy probe */
+ slotno = PCI_SLOT(PCI_DN(dn->child)->devfn);
+ num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0));
+ if (!num)
+ return;
+ pcibios_setup_bus_devices(bus);
+ max = bus->busn_res.start;
+ for (pass = 0; pass < 2; pass++) {
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE ||
+ dev->hdr_type == PCI_HEADER_TYPE_CARDBUS)
+ max = pci_scan_bridge(bus, dev,
+ max, pass);
+ }
+ }
+ }
+ pcibios_finish_adding_to_bus(bus);
+}
+EXPORT_SYMBOL_GPL(pcibios_add_pci_devices);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 076d1242507a..c517dbe705fd 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -916,7 +916,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
flush_altivec_to_thread(src);
flush_vsx_to_thread(src);
flush_spe_to_thread(src);
+
*dst = *src;
+
+ clear_task_ebb(dst);
+
return 0;
}
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 8b6f7a99cce2..eb23ac92abb9 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -559,6 +559,35 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start,
}
#endif
+static void __init early_reserve_mem_dt(void)
+{
+ unsigned long i, len, dt_root;
+ const __be32 *prop;
+
+ dt_root = of_get_flat_dt_root();
+
+ prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len);
+
+ if (!prop)
+ return;
+
+ DBG("Found new-style reserved-ranges\n");
+
+ /* Each reserved range is an (address,size) pair, 2 cells each,
+ * totalling 4 cells per range. */
+ for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+ u64 base, size;
+
+ base = of_read_number(prop + (i * 4) + 0, 2);
+ size = of_read_number(prop + (i * 4) + 2, 2);
+
+ if (size) {
+ DBG("reserving: %llx -> %llx\n", base, size);
+ memblock_reserve(base, size);
+ }
+ }
+}
+
static void __init early_reserve_mem(void)
{
u64 base, size;
@@ -574,12 +603,16 @@ static void __init early_reserve_mem(void)
self_size = initial_boot_params->totalsize;
memblock_reserve(self_base, self_size);
+ /* Look for the new "reserved-regions" property in the DT */
+ early_reserve_mem_dt();
+
#ifdef CONFIG_BLK_DEV_INITRD
- /* then reserve the initrd, if any */
- if (initrd_start && (initrd_end > initrd_start))
+ /* Then reserve the initrd, if any */
+ if (initrd_start && (initrd_end > initrd_start)) {
memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE),
_ALIGN_UP(initrd_end, PAGE_SIZE) -
_ALIGN_DOWN(initrd_start, PAGE_SIZE));
+ }
#endif /* CONFIG_BLK_DEV_INITRD */
#ifdef CONFIG_PPC32
@@ -591,6 +624,8 @@ static void __init early_reserve_mem(void)
u32 base_32, size_32;
u32 *reserve_map_32 = (u32 *)reserve_map;
+ DBG("Found old 32-bit reserve map\n");
+
while (1) {
base_32 = *(reserve_map_32++);
size_32 = *(reserve_map_32++);
@@ -605,6 +640,9 @@ static void __init early_reserve_mem(void)
return;
}
#endif
+ DBG("Processing reserve map\n");
+
+ /* Handle the reserve map in the fdt blob if it exists */
while (1) {
base = *(reserve_map++);
size = *(reserve_map++);
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 98c2fc198712..64f7bd5b1b0f 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -1449,7 +1449,9 @@ static long ppc_set_hwdebug(struct task_struct *child,
*/
if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) {
len = bp_info->addr2 - bp_info->addr;
- } else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) {
+ } else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT)
+ len = 1;
+ else {
ptrace_put_breakpoints(child);
return -EINVAL;
}
diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S
index ef46ba6e094f..f366fedb0872 100644
--- a/arch/powerpc/kernel/reloc_32.S
+++ b/arch/powerpc/kernel/reloc_32.S
@@ -166,7 +166,7 @@ ha16:
/* R_PPC_ADDR16_LO */
lo16:
cmpwi r4, R_PPC_ADDR16_LO
- bne nxtrela
+ bne unknown_type
lwz r4, 0(r9) /* r_offset */
lwz r0, 8(r9) /* r_addend */
add r0, r0, r3
@@ -191,6 +191,7 @@ nxtrela:
dcbst r4,r7
sync /* Ensure the data is flushed before icbi */
icbi r4,r7
+unknown_type:
cmpwi r8, 0 /* relasz = 0 ? */
ble done
add r9, r9, r6 /* move to next entry in the .rela table */
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 52add6f3e201..80b5ef403f68 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1172,7 +1172,7 @@ int __init early_init_dt_scan_rtas(unsigned long node,
static arch_spinlock_t timebase_lock;
static u64 timebase = 0;
-void __cpuinit rtas_give_timebase(void)
+void rtas_give_timebase(void)
{
unsigned long flags;
@@ -1189,7 +1189,7 @@ void __cpuinit rtas_give_timebase(void)
local_irq_restore(flags);
}
-void __cpuinit rtas_take_timebase(void)
+void rtas_take_timebase(void)
{
while (!timebase)
barrier();
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index e379d3fd1694..389fb8077cc9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -76,7 +76,7 @@
#endif
int boot_cpuid = 0;
-int __initdata spinning_secondaries;
+int spinning_secondaries;
u64 ppc64_pft_size;
/* Pick defaults since we might want to patch instructions
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 201385c3a1ae..0f83122e6676 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -407,7 +407,8 @@ inline unsigned long copy_transact_fpr_from_user(struct task_struct *task,
* altivec/spe instructions at some point.
*/
static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
- int sigret, int ctx_has_vsx_region)
+ struct mcontext __user *tm_frame, int sigret,
+ int ctx_has_vsx_region)
{
unsigned long msr = regs->msr;
@@ -475,6 +476,12 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame,
if (__put_user(msr, &frame->mc_gregs[PT_MSR]))
return 1;
+ /* We need to write 0 the MSR top 32 bits in the tm frame so that we
+ * can check it on the restore to see if TM is active
+ */
+ if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR]))
+ return 1;
+
if (sigret) {
/* Set up the sigreturn trampoline: li r0,sigret; sc */
if (__put_user(0x38000000UL + sigret, &frame->tramp[0])
@@ -747,7 +754,7 @@ static long restore_tm_user_regs(struct pt_regs *regs,
struct mcontext __user *tm_sr)
{
long err;
- unsigned long msr;
+ unsigned long msr, msr_hi;
#ifdef CONFIG_VSX
int i;
#endif
@@ -852,8 +859,11 @@ static long restore_tm_user_regs(struct pt_regs *regs,
tm_enable();
/* This loads the checkpointed FP/VEC state, if used */
tm_recheckpoint(&current->thread, msr);
- /* The task has moved into TM state S, so ensure MSR reflects this */
- regs->msr = (regs->msr & ~MSR_TS_MASK) | MSR_TS_S;
+ /* Get the top half of the MSR */
+ if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR]))
+ return 1;
+ /* Pull in MSR TM from user context */
+ regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK);
/* This loads the speculative FP/VEC state, if used */
if (msr & MSR_FP) {
@@ -952,6 +962,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
{
struct rt_sigframe __user *rt_sf;
struct mcontext __user *frame;
+ struct mcontext __user *tm_frame = NULL;
void __user *addr;
unsigned long newsp = 0;
int sigret;
@@ -985,23 +996,24 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
}
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ tm_frame = &rt_sf->uc_transact.uc_mcontext;
if (MSR_TM_ACTIVE(regs->msr)) {
- if (save_tm_user_regs(regs, &rt_sf->uc.uc_mcontext,
- &rt_sf->uc_transact.uc_mcontext, sigret))
+ if (save_tm_user_regs(regs, frame, tm_frame, sigret))
goto badframe;
}
else
#endif
- if (save_user_regs(regs, frame, sigret, 1))
+ {
+ if (save_user_regs(regs, frame, tm_frame, sigret, 1))
goto badframe;
+ }
regs->link = tramp;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (MSR_TM_ACTIVE(regs->msr)) {
if (__put_user((unsigned long)&rt_sf->uc_transact,
&rt_sf->uc.uc_link)
- || __put_user(to_user_ptr(&rt_sf->uc_transact.uc_mcontext),
- &rt_sf->uc_transact.uc_regs))
+ || __put_user((unsigned long)tm_frame, &rt_sf->uc_transact.uc_regs))
goto badframe;
}
else
@@ -1170,7 +1182,7 @@ long sys_swapcontext(struct ucontext __user *old_ctx,
mctx = (struct mcontext __user *)
((unsigned long) &old_ctx->uc_mcontext & ~0xfUL);
if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
- || save_user_regs(regs, mctx, 0, ctx_has_vsx_region)
+ || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region)
|| put_sigset_t(&old_ctx->uc_sigmask, &current->blocked)
|| __put_user(to_user_ptr(mctx), &old_ctx->uc_regs))
return -EFAULT;
@@ -1233,7 +1245,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR]))
goto bad;
- if (MSR_TM_SUSPENDED(msr_hi<<32)) {
+ if (MSR_TM_ACTIVE(msr_hi<<32)) {
/* We only recheckpoint on return if we're
* transaction.
*/
@@ -1392,6 +1404,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
{
struct sigcontext __user *sc;
struct sigframe __user *frame;
+ struct mcontext __user *tm_mctx = NULL;
unsigned long newsp = 0;
int sigret;
unsigned long tramp;
@@ -1425,6 +1438,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
}
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ tm_mctx = &frame->mctx_transact;
if (MSR_TM_ACTIVE(regs->msr)) {
if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact,
sigret))
@@ -1432,8 +1446,10 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
}
else
#endif
- if (save_user_regs(regs, &frame->mctx, sigret, 1))
+ {
+ if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1))
goto badframe;
+ }
regs->link = tramp;
@@ -1481,16 +1497,22 @@ badframe:
long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
struct pt_regs *regs)
{
+ struct sigframe __user *sf;
struct sigcontext __user *sc;
struct sigcontext sigctx;
struct mcontext __user *sr;
void __user *addr;
sigset_t set;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ struct mcontext __user *mcp, *tm_mcp;
+ unsigned long msr_hi;
+#endif
/* Always make any pending restarted system calls return -EINTR */
current_thread_info()->restart_block.fn = do_no_restart_syscall;
- sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+ sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE);
+ sc = &sf->sctx;
addr = sc;
if (copy_from_user(&sigctx, sc, sizeof(sigctx)))
goto badframe;
@@ -1507,11 +1529,25 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8,
#endif
set_current_blocked(&set);
- sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
- addr = sr;
- if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
- || restore_user_regs(regs, sr, 1))
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ mcp = (struct mcontext __user *)&sf->mctx;
+ tm_mcp = (struct mcontext __user *)&sf->mctx_transact;
+ if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR]))
goto badframe;
+ if (MSR_TM_ACTIVE(msr_hi<<32)) {
+ if (!cpu_has_feature(CPU_FTR_TM))
+ goto badframe;
+ if (restore_tm_user_regs(regs, mcp, tm_mcp))
+ goto badframe;
+ } else
+#endif
+ {
+ sr = (struct mcontext __user *)from_user_ptr(sigctx.regs);
+ addr = sr;
+ if (!access_ok(VERIFY_READ, sr, sizeof(*sr))
+ || restore_user_regs(regs, sr, 1))
+ goto badframe;
+ }
set_thread_flag(TIF_RESTOREALL);
return 0;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 345947367ec0..887e99d85bc2 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -410,6 +410,10 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
/* get MSR separately, transfer the LE bit if doing signal return */
err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
+ /* pull in MSR TM from user context */
+ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK);
+
+ /* pull in MSR LE from user context */
regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE);
/* The following non-GPR non-FPR non-VR state is also checkpointed: */
@@ -505,8 +509,6 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
tm_enable();
/* This loads the checkpointed FP/VEC state, if used */
tm_recheckpoint(&current->thread, msr);
- /* The task has moved into TM state S, so ensure MSR reflects this: */
- regs->msr = (regs->msr & ~MSR_TS_MASK) | __MASK(33);
/* This loads the speculative FP/VEC state, if used */
if (msr & MSR_FP) {
@@ -654,7 +656,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR]))
goto badframe;
- if (MSR_TM_SUSPENDED(msr)) {
+ if (MSR_TM_ACTIVE(msr)) {
/* We recheckpoint on return. */
struct ucontext __user *uc_transact;
if (__get_user(uc_transact, &uc->uc_link))
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ee7ac5e6e28a..38b0ba65a735 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -480,7 +480,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
secondary_ti = current_set[cpu] = ti;
}
-int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
+int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int rc, c;
@@ -610,7 +610,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
}
/* Activate a secondary processor. */
-__cpuinit void start_secondary(void *unused)
+void start_secondary(void *unused)
{
unsigned int cpu = smp_processor_id();
struct device_node *l2_cache;
@@ -637,12 +637,10 @@ __cpuinit void start_secondary(void *unused)
vdso_getcpu_init();
#endif
- notify_cpu_starting(cpu);
- set_cpu_online(cpu, true);
/* Update sibling maps */
base = cpu_first_thread_sibling(cpu);
for (i = 0; i < threads_per_core; i++) {
- if (cpu_is_offline(base + i))
+ if (cpu_is_offline(base + i) && (cpu != base + i))
continue;
cpumask_set_cpu(cpu, cpu_sibling_mask(base + i));
cpumask_set_cpu(base + i, cpu_sibling_mask(cpu));
@@ -667,6 +665,10 @@ __cpuinit void start_secondary(void *unused)
}
of_node_put(l2_cache);
+ smp_wmb();
+ notify_cpu_starting(cpu);
+ set_cpu_online(cpu, true);
+
local_irq_enable();
cpu_startup_entry(CPUHP_ONLINE);
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e68a84568b8b..27a90b99ef67 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -341,7 +341,7 @@ static struct device_attribute pa6t_attrs[] = {
#endif /* HAS_PPC_PMC_PA6T */
#endif /* HAS_PPC_PMC_CLASSIC */
-static void __cpuinit register_cpu_online(unsigned int cpu)
+static void register_cpu_online(unsigned int cpu)
{
struct cpu *c = &per_cpu(cpu_devices, cpu);
struct device *s = &c->dev;
@@ -502,7 +502,7 @@ ssize_t arch_cpu_release(const char *buf, size_t count)
#endif /* CONFIG_HOTPLUG_CPU */
-static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
+static int sysfs_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned int)(long)hcpu;
@@ -522,7 +522,7 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block __cpuinitdata sysfs_cpu_nb = {
+static struct notifier_block sysfs_cpu_nb = {
.notifier_call = sysfs_cpu_notify,
};
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 5fc29ad7e26f..65ab9e909377 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -631,7 +631,6 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
return found;
}
-/* should become __cpuinit when secondary_cpu_time_init also is */
void start_cpu_decrementer(void)
{
#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 2da67e7a16d5..51be8fb24803 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -112,9 +112,18 @@ _GLOBAL(tm_reclaim)
std r3, STACK_PARAM(0)(r1)
SAVE_NVGPRS(r1)
+ /* We need to setup MSR for VSX register save instructions. Here we
+ * also clear the MSR RI since when we do the treclaim, we won't have a
+ * valid kernel pointer for a while. We clear RI here as it avoids
+ * adding another mtmsr closer to the treclaim. This makes the region
+ * maked as non-recoverable wider than it needs to be but it saves on
+ * inserting another mtmsrd later.
+ */
mfmsr r14
mr r15, r14
ori r15, r15, MSR_FP
+ li r16, MSR_RI
+ andc r15, r15, r16
oris r15, r15, MSR_VEC@h
#ifdef CONFIG_VSX
BEGIN_FTR_SECTION
@@ -349,9 +358,10 @@ restore_gprs:
mtcr r5
mtxer r6
- /* MSR and flags: We don't change CRs, and we don't need to alter
- * MSR.
+ /* Clear the MSR RI since we are about to change R1. EE is already off
*/
+ li r4, 0
+ mtmsrd r4, 1
REST_4GPRS(0, r7) /* GPR0-3 */
REST_GPR(4, r7) /* GPR4-6 */
@@ -377,6 +387,10 @@ restore_gprs:
GET_PACA(r13)
GET_SCRATCH0(r1)
+ /* R1 is restored, so we are recoverable again. EE is still off */
+ li r4, MSR_RI
+ mtmsrd r4, 1
+
REST_NVGPRS(r1)
addi r1, r1, TM_FRAME_SIZE
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index c0e5caf8ccc7..bf33c22e38a4 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -866,6 +866,10 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword)
u8 val;
u32 shift = 8 * (3 - (pos & 0x3));
+ /* if process is 32-bit, clear upper 32 bits of EA */
+ if ((regs->msr & MSR_64BIT) == 0)
+ EA &= 0xFFFFFFFF;
+
switch ((instword & PPC_INST_STRING_MASK)) {
case PPC_INST_LSWX:
case PPC_INST_LSWI:
@@ -1125,7 +1129,17 @@ void __kprobes program_check_exception(struct pt_regs *regs)
* ESR_DST (!?) or 0. In the process of chasing this with the
* hardware people - not sure if it can happen on any illegal
* instruction or only on FP instructions, whether there is a
- * pattern to occurrences etc. -dgibson 31/Mar/2003 */
+ * pattern to occurrences etc. -dgibson 31/Mar/2003
+ */
+
+ /*
+ * If we support a HW FPU, we need to ensure the FP state
+ * if flushed into the thread_struct before attempting
+ * emulation
+ */
+#ifdef CONFIG_PPC_FPU
+ flush_fp_to_thread(current);
+#endif
switch (do_mathemu(regs)) {
case 0:
emulate_single_step(regs);
@@ -1282,25 +1296,50 @@ void vsx_unavailable_exception(struct pt_regs *regs)
die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT);
}
-void tm_unavailable_exception(struct pt_regs *regs)
+void facility_unavailable_exception(struct pt_regs *regs)
{
+ static char *facility_strings[] = {
+ "FPU",
+ "VMX/VSX",
+ "DSCR",
+ "PMU SPRs",
+ "BHRB",
+ "TM",
+ "AT",
+ "EBB",
+ "TAR",
+ };
+ char *facility, *prefix;
+ u64 value;
+
+ if (regs->trap == 0xf60) {
+ value = mfspr(SPRN_FSCR);
+ prefix = "";
+ } else {
+ value = mfspr(SPRN_HFSCR);
+ prefix = "Hypervisor ";
+ }
+
+ value = value >> 56;
+
/* We restore the interrupt state now */
if (!arch_irq_disabled_regs(regs))
local_irq_enable();
- /* Currently we never expect a TMU exception. Catch
- * this and kill the process!
- */
- printk(KERN_EMERG "Unexpected TM unavailable exception at %lx "
- "(msr %lx)\n",
- regs->nip, regs->msr);
+ if (value < ARRAY_SIZE(facility_strings))
+ facility = facility_strings[value];
+ else
+ facility = "unknown";
+
+ pr_err("%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
+ prefix, facility, regs->nip, regs->msr);
if (user_mode(regs)) {
_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
return;
}
- die("Unexpected TM unavailable exception", regs, SIGABRT);
+ die("Unexpected facility unavailable exception", regs, SIGABRT);
}
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -1396,8 +1435,7 @@ void performance_monitor_exception(struct pt_regs *regs)
void SoftwareEmulation(struct pt_regs *regs)
{
extern int do_mathemu(struct pt_regs *);
- extern int Soft_emulate_8xx(struct pt_regs *);
-#if defined(CONFIG_MATH_EMULATION) || defined(CONFIG_8XX_MINIMAL_FPEMU)
+#if defined(CONFIG_MATH_EMULATION)
int errcode;
#endif
@@ -1430,23 +1468,6 @@ void SoftwareEmulation(struct pt_regs *regs)
_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
return;
}
-
-#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
- errcode = Soft_emulate_8xx(regs);
- if (errcode >= 0)
- PPC_WARN_EMULATED(8xx, regs);
-
- switch (errcode) {
- case 0:
- emulate_single_step(regs);
- return;
- case 1:
- _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
- return;
- case -EFAULT:
- _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip);
- return;
- }
#else
_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
#endif
@@ -1796,8 +1817,6 @@ struct ppc_emulated ppc_emulated = {
WARN_EMULATED_SETUP(unaligned),
#ifdef CONFIG_MATH_EMULATION
WARN_EMULATED_SETUP(math),
-#elif defined(CONFIG_8XX_MINIMAL_FPEMU)
- WARN_EMULATED_SETUP(8xx),
#endif
#ifdef CONFIG_VSX
WARN_EMULATED_SETUP(vsx),
diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c
index 9d3fdcd66290..a15837519dca 100644
--- a/arch/powerpc/kernel/udbg.c
+++ b/arch/powerpc/kernel/udbg.c
@@ -50,7 +50,7 @@ void __init udbg_early_init(void)
udbg_init_debug_beat();
#elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE)
udbg_init_pas_realmode();
-#elif defined(CONFIG_BOOTX_TEXT)
+#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX)
udbg_init_btext();
#elif defined(CONFIG_PPC_EARLY_DEBUG_44x)
/* PPC44x debug */
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index d4f463ac65b1..1d9c92621b36 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -711,7 +711,7 @@ static void __init vdso_setup_syscall_map(void)
}
#ifdef CONFIG_PPC64
-int __cpuinit vdso_getcpu_init(void)
+int vdso_getcpu_init(void)
{
unsigned long cpu, node, val;