From 1b2439dbb703ae8d95a9ce7ece6b7800b80f41f0 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Aug 2008 15:29:38 -0700 Subject: debug: add notifier chain debugging during some development we suspected a case where we left something in a notifier chain that was from a module that was unloaded already... and that sort of thing is rather hard to track down. This patch adds a very simple sanity check (which isn't all that expensive) to make sure the notifier we're about to call is actually from either the kernel itself of from a still-loaded module, avoiding a hard-to-chase-down crash. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/notifier.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 823be11584ef..143fdd77dbf7 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -21,6 +21,10 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { + if (!kernel_text_address((unsigned long)n->notifier_call)) { + WARN(1, "Invalid notifier registered!"); + return 0; + } while ((*nl) != NULL) { if (n->priority > (*nl)->priority) break; @@ -34,6 +38,10 @@ static int notifier_chain_register(struct notifier_block **nl, static int notifier_chain_cond_register(struct notifier_block **nl, struct notifier_block *n) { + if (!kernel_text_address((unsigned long)n->notifier_call)) { + WARN(1, "Invalid notifier registered!"); + return 0; + } while ((*nl) != NULL) { if ((*nl) == n) return 0; @@ -82,6 +90,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, while (nb && nr_to_call) { next_nb = rcu_dereference(nb->next); + +#ifdef CONFIG_DEBUG_NOTIFIERS + if (!kernel_text_address((unsigned long)nb->notifier_call)) { + WARN(1, "Invalid notifier called!"); + nb = next_nb; + continue; + } +#endif ret = nb->notifier_call(nb, val, v); if (nr_calls) -- cgit v1.2.3 From fb822db465bd9fd4208eef1af4490539b236c54e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Aug 2008 11:17:40 +0200 Subject: softlockup: increase hung tasks check from 2 minutes to 8 minutes Andrew says: > Seems that about 100% of the reports we get of this warning triggering > are sys_sync, transaction commit, etc. increase the timeout. If it still triggers for people, we can kill it. 
Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b75b492fbfcf..17a058065331 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024; /* * Zero means infinite timeout - no checking done: */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; unsigned long __read_mostly sysctl_hung_task_warnings = 10; -- cgit v1.2.3 From ab7476cf76e560f0efda2a631a70aabe93009025 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Aug 2008 15:29:38 -0700 Subject: debug: add notifier chain debugging, v2 - unbreak ia64 (and powerpc) where function pointers dont point at code but at data (reported by Tony Luck) [ mingo@elte.hu: various cleanups ] Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/extable.c | 16 ++++++++++++++++ kernel/notifier.c | 10 +--------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index a26cb2e17023..adf0cc9c02d6 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -66,3 +66,19 @@ int kernel_text_address(unsigned long addr) return 1; return module_text_address(addr) != NULL; } + +/* + * On some architectures (PPC64, IA64) function pointers + * are actually only tokens to some data that then holds the + * real function address. As a result, to find if a function + * pointer is part of the kernel text, we need to do some + * special dereferencing first. + */ +int func_ptr_is_kernel_text(void *ptr) +{ + unsigned long addr; + addr = (unsigned long) dereference_function_descriptor(ptr); + if (core_kernel_text(addr)) + return 1; + return module_text_address(addr) != NULL; +} diff --git a/kernel/notifier.c b/kernel/notifier.c index 143fdd77dbf7..0f39e398ef60 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -21,10 +21,6 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { - if (!kernel_text_address((unsigned long)n->notifier_call)) { - WARN(1, "Invalid notifier registered!"); - return 0; - } while ((*nl) != NULL) { if (n->priority > (*nl)->priority) break; @@ -38,10 +34,6 @@ static int notifier_chain_register(struct notifier_block **nl, static int notifier_chain_cond_register(struct notifier_block **nl, struct notifier_block *n) { - if (!kernel_text_address((unsigned long)n->notifier_call)) { - WARN(1, "Invalid notifier registered!"); - return 0; - } while ((*nl) != NULL) { if ((*nl) == n) return 0; @@ -92,7 +84,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, next_nb = rcu_dereference(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS - if (!kernel_text_address((unsigned long)nb->notifier_call)) { + if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { WARN(1, "Invalid notifier called!"); nb = next_nb; continue; -- cgit v1.2.3 From 6918bc5c830e890681eabb3c6cb6b8d117a52d14 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:41 +0200 Subject: lockstat: fixup signed division Some recent modification to this code made me notice the little todo mark. Now that we have more elaborate 64-bit division functions this isn't hard. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 20dbcbf9c7dd..8d3a6eba8d5a 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length) static void snprint_time(char *buf, size_t bufsiz, s64 nr) { - unsigned long rem; + s64 div; + s32 rem; nr += 5; /* for display rounding */ - rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); + div = div_s64_rem(nr, 1000, &rem); + snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) -- cgit v1.2.3 From 38d47c1b7075bd7ec3881141bb3629da58f88dab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:20 +0200 Subject: futex: rely on get_user_pages() for shared futexes On the way of getting rid of the mmap_sem requirement for shared futexes, start by relying on get_user_pages(). Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 162 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 80 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7d1136e97c14..a4c39fa0a7a3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -161,6 +161,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) && key1->both.offset == key2->both.offset); } +/* + * Take a reference to the resource addressed by a key. + * Can be called while holding spinlocks. + * + */ +static void get_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + atomic_inc(&key->shared.inode->i_count); + break; + case FUT_OFF_MMSHARED: + atomic_inc(&key->private.mm->mm_count); + break; + } +} + +/* + * Drop a reference to the resource addressed by a key. + * The hash bucket spinlock must not be held. + */ +static void drop_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + iput(key->shared.inode); + break; + case FUT_OFF_MMSHARED: + mmdrop(key->private.mm); + break; + } +} + /** * get_futex_key - Get parameters which are the keys for a futex. * @uaddr: virtual address of the futex @@ -184,7 +223,6 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; struct page *page; int err; @@ -210,98 +248,47 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, key->private.address = address; return 0; } - /* - * The futex is hashed differently depending on whether - * it's in a shared or private mapping. So check vma first. - */ - vma = find_extend_vma(mm, address); - if (unlikely(!vma)) - return -EFAULT; - /* - * Permissions. - */ - if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) - return (vma->vm_flags & VM_IO) ? 
-EPERM : -EACCES; +again: + err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); + if (err < 0) + return err; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + put_page(page); + goto again; + } /* * Private mappings are handled in a simple way. * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to - * the object not the particular process. Therefore we use - * VM_MAYSHARE here, not VM_SHARED which is restricted to shared - * mappings of _writable_ handles. + * the object not the particular process. */ - if (likely(!(vma->vm_flags & VM_MAYSHARE))) { - key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ + if (PageAnon(page)) { + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; - return 0; - } - - /* - * Linear file mappings are also simple. - */ - key->shared.inode = vma->vm_file->f_path.dentry->d_inode; - key->both.offset |= FUT_OFF_INODE; /* inode-based key. */ - if (likely(!(vma->vm_flags & VM_NONLINEAR))) { - key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) - + vma->vm_pgoff); - return 0; + } else { + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.inode = page->mapping->host; + key->shared.pgoff = page->index; } - /* - * We could walk the page table to read the non-linear - * pte, and get the page index without fetching the page - * from swap. But that's a lot of code to duplicate here - * for a rare case, so we simply fetch the page. - */ - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - if (err >= 0) { - key->shared.pgoff = - page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - put_page(page); - return 0; - } - return err; -} + get_futex_key_refs(key); -/* - * Take a reference to the resource addressed by a key. - * Can be called while holding spinlocks. - * - */ -static void get_futex_key_refs(union futex_key *key) -{ - if (key->both.ptr == NULL) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - atomic_inc(&key->shared.inode->i_count); - break; - case FUT_OFF_MMSHARED: - atomic_inc(&key->private.mm->mm_count); - break; - } + unlock_page(page); + put_page(page); + return 0; } -/* - * Drop a reference to the resource addressed by a key. - * The hash bucket spinlock must not be held. 
- */ -static void drop_futex_key_refs(union futex_key *key) +static inline +void put_futex_key(struct rw_semaphore *fshared, union futex_key *key) { - if (!key->both.ptr) - return; - switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { - case FUT_OFF_INODE: - iput(key->shared.inode); - break; - case FUT_OFF_MMSHARED: - mmdrop(key->private.mm); - break; - } + drop_futex_key_refs(key); } static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) @@ -385,6 +372,7 @@ static int refill_pi_state_cache(void) /* pi_mutex gets initialized later */ pi_state->owner = NULL; atomic_set(&pi_state->refcount, 1); + pi_state->key = FUTEX_KEY_INIT; current->pi_state_cache = pi_state; @@ -462,7 +450,7 @@ void exit_pi_state_list(struct task_struct *curr) struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; if (!futex_cmpxchg_enabled) return; @@ -725,7 +713,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, struct futex_hash_bucket *hb; struct futex_q *this, *next; struct plist_head *head; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; int ret; if (!bitset) @@ -760,6 +748,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: + put_futex_key(fshared, &key); futex_unlock_mm(fshared); return ret; } @@ -773,7 +762,7 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { - union futex_key key1, key2; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; @@ -873,6 +862,8 @@ retry: if (hb1 != hb2) spin_unlock(&hb2->lock); out: + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); futex_unlock_mm(fshared); return ret; @@ -886,7 +877,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { - union futex_key key1, key2; + union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; @@ -974,6 +965,8 @@ out_unlock: drop_futex_key_refs(&key1); out: + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); futex_unlock_mm(fshared); return ret; } @@ -1220,6 +1213,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, retry: futex_lock_mm(fshared); + q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1360,6 +1354,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: + put_futex_key(fshared, &q.key); futex_unlock_mm(fshared); return ret; } @@ -1411,6 +1406,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, retry: futex_lock_mm(fshared); + q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) goto out_release_sem; @@ -1625,6 +1621,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); out_release_sem: + put_futex_key(fshared, &q.key); futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); @@ -1671,7 +1668,7 @@ static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) struct futex_q *this, *next; u32 uval; struct plist_head 
*head; - union futex_key key; + union futex_key key = FUTEX_KEY_INIT; int ret, attempt = 0; retry: @@ -1744,6 +1741,7 @@ retry_unlocked: out_unlock: spin_unlock(&hb->lock); out: + put_futex_key(fshared, &key); futex_unlock_mm(fshared); return ret; -- cgit v1.2.3 From 61270708ecf1cda148e84fbf6e0703ee5aa81814 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:21 +0200 Subject: futex: reduce mmap_sem usage now that we rely on get_user_pages() for the shared key handling move all the mmap_sem stuff closely around the slow paths. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 83 +++------------------------------------------------------- 1 file changed, 4 insertions(+), 79 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a4c39fa0a7a3..6a726684217e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -122,24 +122,6 @@ struct futex_hash_bucket { static struct futex_hash_bucket futex_queues[1<mmap_sem, when futex is shared - */ -static inline void futex_lock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - down_read(fshared); -} - -/* - * Release mm->mmap_sem, when the futex is shared - */ -static inline void futex_unlock_mm(struct rw_semaphore *fshared) -{ - if (fshared) - up_read(fshared); -} - /* * We hash on the keys returned from get_futex_key (see below). */ @@ -250,7 +232,9 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, } again: + down_read(&mm->mmap_sem); err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); + up_read(&mm->mmap_sem); if (err < 0) return err; @@ -327,8 +311,7 @@ static int futex_handle_fault(unsigned long address, if (attempt > 2) return ret; - if (!fshared) - down_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); vma = find_vma(mm, address); if (vma && address >= vma->vm_start && (vma->vm_flags & VM_WRITE)) { @@ -348,8 +331,7 @@ static int futex_handle_fault(unsigned long address, current->min_flt++; } } - if (!fshared) - up_read(&mm->mmap_sem); + up_read(&mm->mmap_sem); return ret; } @@ -719,8 +701,6 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, if (!bitset) return -EINVAL; - futex_lock_mm(fshared); - ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) goto out; @@ -749,7 +729,6 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, spin_unlock(&hb->lock); out: put_futex_key(fshared, &key); - futex_unlock_mm(fshared); return ret; } @@ -769,8 +748,6 @@ futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, op_ret, attempt = 0; retryfull: - futex_lock_mm(fshared); - ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -821,12 +798,6 @@ retry: goto retry; } - /* - * If we would have faulted, release mmap_sem, - * fault it in and start all over again. 
- */ - futex_unlock_mm(fshared); - ret = get_user(dummy, uaddr2); if (ret) return ret; @@ -864,7 +835,6 @@ retry: out: put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - futex_unlock_mm(fshared); return ret; } @@ -884,8 +854,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, int ret, drop_count = 0; retry: - futex_lock_mm(fshared); - ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -908,12 +876,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, if (hb1 != hb2) spin_unlock(&hb2->lock); - /* - * If we would have faulted, release mmap_sem, fault - * it in and start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(curval, uaddr1); if (!ret) @@ -967,7 +929,6 @@ out_unlock: out: put_futex_key(fshared, &key2); put_futex_key(fshared, &key1); - futex_unlock_mm(fshared); return ret; } @@ -1211,8 +1172,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; q.bitset = bitset; retry: - futex_lock_mm(fshared); - q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1245,12 +1204,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret)) { queue_unlock(&q, hb); - /* - * If we would have faulted, release mmap_sem, fault it in and - * start all over again. - */ - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret) @@ -1264,12 +1217,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, /* Only actually queue if *uaddr contained val. */ queue_me(&q, hb); - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - /* * There might have been scheduling since the queue_me(), as we * cannot hold a spinlock across the get_user() in case it @@ -1355,7 +1302,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, out_release_sem: put_futex_key(fshared, &q.key); - futex_unlock_mm(fshared); return ret; } @@ -1404,8 +1350,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, q.pi_state = NULL; retry: - futex_lock_mm(fshared); - q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); if (unlikely(ret != 0)) @@ -1495,7 +1439,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * exit to complete. */ queue_unlock(&q, hb); - futex_unlock_mm(fshared); cond_resched(); goto retry; @@ -1527,12 +1470,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, */ queue_me(&q, hb); - /* - * Now the futex is queued and we have checked the data, we - * don't want to hold mmap_sem while we sleep. - */ - futex_unlock_mm(fshared); - WARN_ON(!q.pi_state); /* * Block on the PI mutex: @@ -1545,7 +1482,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, ret = ret ? 
0 : -EWOULDBLOCK; } - futex_lock_mm(fshared); spin_lock(q.lock_ptr); if (!ret) { @@ -1611,7 +1547,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* Unqueue and drop the lock */ unqueue_me_pi(&q); - futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); @@ -1622,7 +1557,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, out_release_sem: put_futex_key(fshared, &q.key); - futex_unlock_mm(fshared); if (to) destroy_hrtimer_on_stack(&to->timer); return ret; @@ -1646,8 +1580,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto retry_unlocked; } - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; @@ -1679,10 +1611,6 @@ retry: */ if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; - /* - * First take all the futex related locks: - */ - futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); if (unlikely(ret != 0)) @@ -1742,7 +1670,6 @@ out_unlock: spin_unlock(&hb->lock); out: put_futex_key(fshared, &key); - futex_unlock_mm(fshared); return ret; @@ -1766,8 +1693,6 @@ pi_faulted: goto retry_unlocked; } - futex_unlock_mm(fshared); - ret = get_user(uval, uaddr); if (!ret && (uval != -EFAULT)) goto retry; -- cgit v1.2.3 From 734b05b10e51d4ba38c8fc3ee02e846aab09eedf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:22 +0200 Subject: futex: use fast_gup() Change the get_user_pages() call with fast_gup() which doesn't require holding the mmap_sem thereby removing the mmap_sem from all fast paths. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6a726684217e..facf17d1a705 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -232,9 +232,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, } again: - down_read(&mm->mmap_sem); - err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); - up_read(&mm->mmap_sem); + err = get_user_pages_fast(address, 1, 0, &page); if (err < 0) return err; -- cgit v1.2.3 From c2f9f20154bfb137ccdf8c9159992429a40dfe20 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:23 +0200 Subject: futex: cleanup fshared fshared doesn't need to be a rw_sem pointer anymore, so clean that up. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index facf17d1a705..60b47bb9e3dd 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -200,8 +200,7 @@ static void drop_futex_key_refs(union futex_key *key) * For other futexes, it points to ¤t->mm->mmap_sem and * caller must have taken the reader lock. but NOT any spinlocks. 
*/ -static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, - union futex_key *key) +static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -268,7 +267,7 @@ again: } static inline -void put_futex_key(struct rw_semaphore *fshared, union futex_key *key) +void put_futex_key(int fshared, union futex_key *key) { drop_futex_key_refs(key); } @@ -297,10 +296,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) /* * Fault handling. - * if fshared is non NULL, current->mm->mmap_sem is already held */ -static int futex_handle_fault(unsigned long address, - struct rw_semaphore *fshared, int attempt) +static int futex_handle_fault(unsigned long address, int attempt) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; @@ -687,8 +684,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, - int nr_wake, u32 bitset) +static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -735,8 +731,7 @@ out: * to this virtual address: */ static int -futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, +futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -790,7 +785,7 @@ retry: */ if (attempt++) { ret = futex_handle_fault((unsigned long)uaddr2, - fshared, attempt); + attempt); if (ret) goto out; goto retry; @@ -841,8 +836,7 @@ out: * Requeue all waiters hashed on one physical page to another * physical page. */ -static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, - u32 __user *uaddr2, +static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, int nr_wake, int nr_requeue, u32 *cmpval) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; @@ -1048,8 +1042,7 @@ static void unqueue_me_pi(struct futex_q *q) * private futexes. 
*/ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner, - struct rw_semaphore *fshared) + struct task_struct *newowner, int fshared) { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; @@ -1128,7 +1121,7 @@ retry: handle_fault: spin_unlock(q->lock_ptr); - ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); + ret = futex_handle_fault((unsigned long)uaddr, attempt++); spin_lock(q->lock_ptr); @@ -1152,7 +1145,7 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); -static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, +static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset) { struct task_struct *curr = current; @@ -1307,13 +1300,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; - struct rw_semaphore *fshared = NULL; + int fshared = 0; ktime_t t; t.tv64 = restart->futex.time; restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) - fshared = ¤t->mm->mmap_sem; + fshared = 1; return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, restart->futex.bitset); } @@ -1325,7 +1318,7 @@ static long futex_wait_restart(struct restart_block *restart) * if there are waiters then it will block, it does PI, etc. (Due to * races the kernel might see a 0 value of the futex too.) */ -static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, +static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; @@ -1571,8 +1564,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, queue_unlock(&q, hb); if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); + ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out_release_sem; goto retry_unlocked; @@ -1592,7 +1584,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * This is the in-kernel slowpath: we look up the PI state (if any), * and do the rt-mutex unlock. 
*/ -static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) +static int futex_unlock_pi(u32 __user *uaddr, int fshared) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -1683,8 +1675,7 @@ pi_faulted: spin_unlock(&hb->lock); if (attempt++) { - ret = futex_handle_fault((unsigned long)uaddr, fshared, - attempt); + ret = futex_handle_fault((unsigned long)uaddr, attempt); if (ret) goto out; uval = 0; @@ -1816,8 +1807,7 @@ retry: * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, &curr->mm->mmap_sem, 1, - FUTEX_BITSET_MATCH_ANY); + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); } return 0; } @@ -1913,10 +1903,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, { int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; - struct rw_semaphore *fshared = NULL; + int fshared = 0; if (!(op & FUTEX_PRIVATE_FLAG)) - fshared = ¤t->mm->mmap_sem; + fshared = 1; switch (cmd) { case FUTEX_WAIT: -- cgit v1.2.3 From 42569c39917a08e8de1e8b5685463be7b74baebd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Sep 2008 12:33:07 +0200 Subject: futex: fixup get_futex_key() for private futexes With the get_user_pages_fast() patches we made get_futex_key() obtain a reference on the returned key, but failed to do so for private futexes. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- kernel/futex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 60b47bb9e3dd..62cbd648e28a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -227,6 +227,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) return -EFAULT; key->private.mm = mm; key->private.address = address; + get_futex_key_refs(key); return 0; } -- cgit v1.2.3 From 7317d7b87edb41a9135e30be1ec3f7ef817c53dd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 30 Sep 2008 20:50:27 +1000 Subject: sched: improve preempt debugging This patch helped me out with a problem I recently had.... Basically, when the kernel lock is held, then preempt_count underflow does not get detected until it is released which may be a long time (and arbitrarily, eg at different points it may be rescheduled). If the bkl is released at schedule, the resulting output is actually fairly cryptic... With any other lock that elevates preempt_count, it is illegal to schedule under it (which would get found pretty quickly). bkl allows scheduling with preempt_count elevated, which makes underflows hard to debug. Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 98890807375b..ec3bd1f398b3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4305,7 +4305,7 @@ void __kprobes sub_preempt_count(int val) /* * Underflow? */ - if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) return; /* * Is the spinlock portion underflowing? -- cgit v1.2.3 From c7e78cff6b7518212247fb20b1dc6411540dc9af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Oct 2008 23:17:09 +0200 Subject: lockstat: contend with points We currently only provide points that have to wait on contention, also lists the points we have to wait for. 
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 33 +++++++++++++++++++++------------ kernel/lockdep_proc.c | 21 +++++++++++++++++++-- kernel/mutex.c | 2 +- 3 files changed, 41 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index dbda475b13bd..234a9dccb4be 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -136,16 +136,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); -static int lock_contention_point(struct lock_class *class, unsigned long ip) +static int lock_point(unsigned long points[], unsigned long ip) { int i; - for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { - if (class->contention_point[i] == 0) { - class->contention_point[i] = ip; + for (i = 0; i < LOCKSTAT_POINTS; i++) { + if (points[i] == 0) { + points[i] = ip; break; } - if (class->contention_point[i] == ip) + if (points[i] == ip) break; } @@ -185,6 +185,9 @@ struct lock_class_stats lock_stats(struct lock_class *class) for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) stats.contention_point[i] += pcs->contention_point[i]; + for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) + stats.contending_point[i] += pcs->contending_point[i]; + lock_time_add(&pcs->read_waittime, &stats.read_waittime); lock_time_add(&pcs->write_waittime, &stats.write_waittime); @@ -209,6 +212,7 @@ void clear_lock_stats(struct lock_class *class) memset(cpu_stats, 0, sizeof(struct lock_class_stats)); } memset(class->contention_point, 0, sizeof(class->contention_point)); + memset(class->contending_point, 0, sizeof(class->contending_point)); } static struct lock_class_stats *get_lock_stats(struct lock_class *class) @@ -3001,7 +3005,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) struct held_lock *hlock, *prev_hlock; struct lock_class_stats *stats; unsigned int depth; - int i, point; + int i, contention_point, contending_point; depth = curr->lockdep_depth; if (DEBUG_LOCKS_WARN_ON(!depth)) @@ -3025,18 +3029,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) found_it: hlock->waittime_stamp = sched_clock(); - point = lock_contention_point(hlock_class(hlock), ip); + contention_point = lock_point(hlock_class(hlock)->contention_point, ip); + contending_point = lock_point(hlock_class(hlock)->contending_point, + lock->ip); stats = get_lock_stats(hlock_class(hlock)); - if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[point]++; + if (contention_point < LOCKSTAT_POINTS) + stats->contention_point[contention_point]++; + if (contending_point < LOCKSTAT_POINTS) + stats->contending_point[contending_point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); } static void -__lock_acquired(struct lockdep_map *lock) +__lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; @@ -3085,6 +3093,7 @@ found_it: put_lock_stats(stats); lock->cpu = cpu; + lock->ip = ip; } void lock_contended(struct lockdep_map *lock, unsigned long ip) @@ -3106,7 +3115,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -void lock_acquired(struct lockdep_map *lock) +void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; @@ -3119,7 +3128,7 @@ void lock_acquired(struct lockdep_map *lock) 
raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; - __lock_acquired(lock); + __lock_acquired(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 8d3a6eba8d5a..13716b813896 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -557,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (stats->read_holdtime.nr) namelen += 2; - for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { + for (i = 0; i < LOCKSTAT_POINTS; i++) { char sym[KSYM_SYMBOL_LEN]; char ip[32]; @@ -574,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) stats->contention_point[i], ip, sym); } + for (i = 0; i < LOCKSTAT_POINTS; i++) { + char sym[KSYM_SYMBOL_LEN]; + char ip[32]; + + if (class->contending_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + sprint_symbol(sym, class->contending_point[i]); + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contending_point[i]); + seq_printf(m, "%40s %14lu %29s %s\n", name, + stats->contending_point[i], + ip, sym); + } if (i) { seq_puts(m, "\n"); seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); @@ -583,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) static void seq_header(struct seq_file *m) { - seq_printf(m, "lock_stat version 0.2\n"); + seq_printf(m, "lock_stat version 0.3\n"); seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " "%14s %14s\n", diff --git a/kernel/mutex.c b/kernel/mutex.c index 12c779dc65d4..39a3816b68d9 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } done: - lock_acquired(&lock->dep_map); + lock_acquired(&lock->dep_map, ip); /* got the lock - rejoice! */ mutex_remove_waiter(lock, &waiter, task_thread_info(task)); debug_mutex_set_owner(lock, task_thread_info(task)); -- cgit v1.2.3 From be19ef82e068e92a28df50341938fdeb5ea56436 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 3 Nov 2008 14:35:12 +0100 Subject: rcu: make rcu-stall debug printout more standard Impact: change debug printout Change "RCU detected CPU stall" to "INFO: RCU detected CPU stall" message, to make it easier for tools to pick up the warning. Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 37f72e551542..e503a002f330 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) /* OK, time to rat on our buddy... 
*/ - printk(KERN_ERR "RCU detected CPU stalls:"); + printk(KERN_ERR "INFO: RCU detected CPU stalls:"); for_each_possible_cpu(cpu) { if (cpu_isset(cpu, rcp->cpumask)) printk(" %d", cpu); @@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp) { unsigned long flags; - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", smp_processor_id(), jiffies, jiffies - rcp->gp_start); dump_stack(); -- cgit v1.2.3 From 74fcd524e808975dd546dac847119f1995a7c622 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:39:52 +0100 Subject: account_steal_time: kill the unneeded account_group_system_time() Impact: remove unnecessary accounting call I don't actually understand account_steal_time() and I failed to find the commit which added account_group_system_time(), but this looks bogus. In any case rq->idle must be single-threaded, so it can't have ->totals. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/sched.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c94baf2969e7..b388c9b243e9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4202,7 +4202,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); - account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else -- cgit v1.2.3 From ce394471d13bf071939a9a0b48c64c297676d233 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:40:01 +0100 Subject: thread_group_cputime: kill the bogus ->signal != NULL check Impact: simplify the code thread_group_cputime() is called by current when it must have the valid ->signal, or under ->siglock, or under tasklist_lock after the ->signal check, or the caller is wait_task_zombie() which reaps the child. In any case ->signal can't be NULL. But the point of this patch is not optimization. If it is possible to call thread_group_cputime() when ->signal == NULL we are doing something wrong, and we should not mask the problem. thread_group_cputime() fills *times and the caller will use it, if we silently use task_struct->*times* we report the wrong values. 
Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 895337b16a24..3f4377e0aa04 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -58,21 +58,21 @@ void thread_group_cputime( struct task_struct *tsk, struct task_cputime *times) { - struct signal_struct *sig; + struct task_cputime *totals, *tot; int i; - struct task_cputime *tot; - sig = tsk->signal; - if (unlikely(!sig) || !sig->cputime.totals) { + totals = tsk->signal->cputime.totals; + if (!totals) { times->utime = tsk->utime; times->stime = tsk->stime; times->sum_exec_runtime = tsk->se.sum_exec_runtime; return; } + times->stime = times->utime = cputime_zero; times->sum_exec_runtime = 0; for_each_possible_cpu(i) { - tot = per_cpu_ptr(tsk->signal->cputime.totals, i); + tot = per_cpu_ptr(totals, i); times->utime = cputime_add(times->utime, tot->utime); times->stime = cputime_add(times->stime, tot->stime); times->sum_exec_runtime += tot->sum_exec_runtime; -- cgit v1.2.3 From 2b5fe6de58276d0b5a7c884d5dbfc300ca47db78 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:40:08 +0100 Subject: thread_group_cputime: move a couple of callsites outside of ->siglock Impact: relax the locking of cpu-time accounting calls ->siglock buys nothing for thread_group_cputime() in do_sys_times() and wait_task_zombie() (which btw takes the unrelated parent's ->siglock). Actually I think do_sys_times() doesn't need ->siglock at all. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/exit.c | 2 +- kernel/sys.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92be5fae..b9c4d8bb72e5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1330,10 +1330,10 @@ static int wait_task_zombie(struct task_struct *p, int options, * group, which consolidates times for all threads in the * group including the group leader. */ + thread_group_cputime(p, &cputime); spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; - thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, cputime_add(cputime.utime, diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d16..5fc3a0cfb994 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -858,8 +858,8 @@ void do_sys_times(struct tms *tms) struct task_cputime cputime; cputime_t cutime, cstime; - spin_lock_irq(¤t->sighand->siglock); thread_group_cputime(current, &cputime); + spin_lock_irq(¤t->sighand->siglock); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); -- cgit v1.2.3 From 7918baa555140989eeee1270f48533987d48fdba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=B6r=C3=B6k=20Edwin?= Date: Mon, 24 Nov 2008 10:17:42 +0200 Subject: mutex: __used is needed for function referenced only from inline asm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix build failure on llvm-gcc-4.2 According to the gcc manual, the 'used' attribute should be applied to functions referenced only from inline assembly. This fixes a build failure with llvm-gcc-4.2, which deleted __mutex_lock_slowpath, __mutex_unlock_slowpath. 
Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- kernel/mutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 39a3816b68d9..4f45d4b658ef 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static void noinline __sched +static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count); /*** @@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif -static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /*** * mutex_unlock - release the mutex @@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static noinline void +static __used noinline void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static noinline void __sched +static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); -- cgit v1.2.3 From 1acdac104668a0834cfa267de9946fac7764d486 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Nov 2008 10:02:53 -0800 Subject: futex: make clock selectable for FUTEX_WAIT_BITSET FUTEX_WAIT_BITSET could be used instead of FUTEX_WAIT by setting the bit set to FUTEX_BITSET_MATCH_ANY, but FUTEX_WAIT uses CLOCK_REALTIME while FUTEX_WAIT_BITSET uses CLOCK_MONOTONIC. Add a flag to select CLOCK_REALTIME for FUTEX_WAIT_BITSET so glibc can replace the FUTEX_WAIT logic which needs to do gettimeofday() calls before and after the syscall to convert the absolute timeout to a relative timeout for FUTEX_WAIT. Signed-off-by: Thomas Gleixner Cc: Ulrich Drepper --- kernel/futex.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e10c5c8786a6..ba0d3b83c091 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1142,12 +1142,13 @@ handle_fault: * In case we must use restart_block to restart a futex_wait, * we encode in the 'flags' shared capability */ -#define FLAGS_SHARED 1 +#define FLAGS_SHARED 0x01 +#define FLAGS_CLOCKRT 0x02 static long futex_wait_restart(struct restart_block *restart); static int futex_wait(u32 __user *uaddr, int fshared, - u32 val, ktime_t *abs_time, u32 bitset) + u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); @@ -1233,8 +1234,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, slack = current->timer_slack_ns; if (rt_task(current)) slack = 0; - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&t.timer, + clockrt ? 
CLOCK_REALTIME : + CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); @@ -1289,6 +1292,8 @@ static int futex_wait(u32 __user *uaddr, int fshared, if (fshared) restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; return -ERESTART_RESTARTBLOCK; } @@ -1312,7 +1317,8 @@ static long futex_wait_restart(struct restart_block *restart) if (restart->futex.flags & FLAGS_SHARED) fshared = 1; return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, - restart->futex.bitset); + restart->futex.bitset, + restart->futex.flags & FLAGS_CLOCKRT); } @@ -1905,18 +1911,22 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret = -ENOSYS; + int clockrt, ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; int fshared = 0; if (!(op & FUTEX_PRIVATE_FLAG)) fshared = 1; + clockrt = op & FUTEX_CLOCK_REALTIME; + if (clockrt && cmd != FUTEX_WAIT_BITSET) + return -ENOSYS; + switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, fshared, val, timeout, val3); + ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt); break; case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; -- cgit v1.2.3 From 7807fafa52b990abb321f1212416c71e64523ecb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Nov 2008 08:44:24 +0100 Subject: lockdep: fix unused function warning in kernel/lockdep.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: fix build warning this warning: kernel/lockdep.c:584: warning: ‘print_lock_dependencies’ defined but not used triggers because print_lock_dependencies() is only used if both CONFIG_TRACE_IRQFLAGS and CONFIG_PROVE_LOCKING are enabled. But adding #ifdefs is not an option here - it would spread out to 4-5 other helper functions and uglify the file. So mark this function as __used - it's static and the compiler can eliminate it just fine. Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index a42858303233..c137953420e0 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -580,7 +580,8 @@ static void print_lock_class_header(struct lock_class *class, int depth) /* * printk all lock dependencies starting at : */ -static void print_lock_dependencies(struct lock_class *class, int depth) +static void __used +print_lock_dependencies(struct lock_class *class, int depth) { struct lock_list *entry; -- cgit v1.2.3 From 8b752e3ef6e3f5cde87afc649dd51d92b1e549c1 Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 28 Nov 2008 09:52:40 +0800 Subject: softirq: remove useless function __local_bh_enable Impact: remove unused code __local_bh_enable has been replaced with _local_bh_enable. As comments says "it always nests inside local_bh_enable() sections" has not been valid now. Also there is no reason to use __local_bh_enable anywhere, so we can remove this useless function. 
Signed-off-by: Liming Wang Signed-off-by: Ingo Molnar --- kernel/softirq.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index e7c69a720d69..8d9934b4162a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -102,20 +102,6 @@ void local_bh_disable(void) EXPORT_SYMBOL(local_bh_disable); -void __local_bh_enable(void) -{ - WARN_ON_ONCE(in_irq()); - - /* - * softirqs should never be enabled by __local_bh_enable(), - * it always nests inside local_bh_enable() sections: - */ - WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); - - sub_preempt_count(SOFTIRQ_OFFSET); -} -EXPORT_SYMBOL_GPL(__local_bh_enable); - /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), -- cgit v1.2.3 From 74853dba2f7a1a9b0905a09abcf65c1f3ce0b14f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 28 Nov 2008 08:35:25 -0800 Subject: debug warnings: consolidate warn_slowpath and warn_on_slowpath Impact: cleanup, code reduction warn_slowpath is a superset of warn_on_slowpath; just have warn_on_slowpath call warn_slowpath with a NULL 3rd argument. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/panic.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6513aac8e992..6bbf7b905c76 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -320,23 +320,6 @@ void oops_exit(void) } #ifdef WANT_WARN_ON_SLOWPATH -void warn_on_slowpath(const char *file, int line) -{ - char function[KSYM_SYMBOL_LEN]; - unsigned long caller = (unsigned long) __builtin_return_address(0); - sprint_symbol(function, caller); - - printk(KERN_WARNING "------------[ cut here ]------------\n"); - printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, - line, function); - print_modules(); - dump_stack(); - print_oops_end_marker(); - add_taint(TAINT_WARN); -} -EXPORT_SYMBOL(warn_on_slowpath); - - void warn_slowpath(const char *file, int line, const char *fmt, ...) { va_list args; @@ -347,9 +330,12 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, line, function); - va_start(args, fmt); - vprintk(fmt, args); - va_end(args); + + if (fmt) { + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + } print_modules(); dump_stack(); @@ -357,6 +343,12 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) add_taint(TAINT_WARN); } EXPORT_SYMBOL(warn_slowpath); + +void warn_on_slowpath(const char *file, int line) +{ + warn_slowpath(file, line, NULL); +} +EXPORT_SYMBOL(warn_on_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR -- cgit v1.2.3 From bd89bb29a01503c5cffa367eccb0b356f910cb8d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 28 Nov 2008 08:36:09 -0800 Subject: debug warnings: print the DMI board info name in a WARN/WARN_ON Impact: extend WARN_ON() output with DMI_PRODUCT_NAME It's very useful for many low level WARN_ON's to find out which motherboard has the broken BIOS etc... this patch adds a printk to the WARN_ON code for this. On architectures without DMI, gcc should optimize the code out. 
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/panic.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6bbf7b905c76..73d365199c3f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -21,6 +21,7 @@ #include #include #include +#include int panic_on_oops; static unsigned long tainted_mask; @@ -325,11 +326,16 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) va_list args; char function[KSYM_SYMBOL_LEN]; unsigned long caller = (unsigned long)__builtin_return_address(0); + const char *board; + sprint_symbol(function, caller); printk(KERN_WARNING "------------[ cut here ]------------\n"); printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, line, function); + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (board) + printk(KERN_WARNING "Hardware name: %s\n", board); if (fmt) { va_start(args, fmt); -- cgit v1.2.3 From ec5679e513305f1411753e5f5489935bd638af23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 28 Nov 2008 17:56:14 +0100 Subject: debug warnings: eliminate warn_on_slowpath() Impact: cleanup, eliminate code now that warn_on_slowpath() uses warn_slowpath(...,NULL), we can eliminate warn_on_slowpath() altogether and use warn_slowpath(). Signed-off-by: Ingo Molnar --- kernel/panic.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 73d365199c3f..50349a41fba7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -349,12 +349,6 @@ void warn_slowpath(const char *file, int line, const char *fmt, ...) add_taint(TAINT_WARN); } EXPORT_SYMBOL(warn_slowpath); - -void warn_on_slowpath(const char *file, int line) -{ - warn_slowpath(file, line, NULL); -} -EXPORT_SYMBOL(warn_on_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR -- cgit v1.2.3 From 470c66239ef0336429b35345f3f615d47341e13b Mon Sep 17 00:00:00 2001 From: David Brownell Date: Mon, 1 Dec 2008 14:31:37 -0800 Subject: genirq: warn when IRQF_DISABLED may be ignored Impact: emit new warning We periodically waste time tracking down problems from the genirq framework not respecting IRQF_DISABLED for some shared IRQ cases. Linus views this as "will not fix", but we're still left with the bugs caused by this misbehavior. This patch adds a nag message in request_irq(), so that drivers can fix their IRQ handlers to avoid this problem. Note that developers will never see the relevant bugs when they run with LOCKDEP, so it's no wonder these bugs are hard to find. (That also means LOCKDEP is overlooking some IRQ-related bugs involving IRQ handlers that don't set IRQF_DISABLED...) Signed-off-by: David Brownell Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c498a1b8c621..7fd891c3a33d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -635,6 +635,18 @@ int request_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + /* + * handle_IRQ_event() always ignores IRQF_DISABLED except for + * the _first_ irqaction (sigh). That can cause oopsing, but + * the behavior is classified as "will not fix" so we need to + * start nudging drivers away from using that idiom. 
+ */ + if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) + == (IRQF_SHARED|IRQF_DISABLED)) + pr_warning("IRQ %d/%s: IRQF_DISABLED is not " + "guaranteed on shared IRQs\n", + irq, devname); + #ifdef CONFIG_LOCKDEP /* * Lockdep wants atomic interrupt handlers: -- cgit v1.2.3 From 00ef9f7348dfd2fc223ec42aceb30836e86b367f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Dec 2008 09:00:17 +0100 Subject: lockdep: change a held lock's class Impact: introduce new lockdep API Allow to change a held lock's class. Basically the same as the existing code to change a subclass therefore reuse all that. The XFS code will be able to use this to annotate their inode locking. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 90f3fb64dbce..4fa6eeb4e8a7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -291,14 +291,12 @@ void lockdep_off(void) { current->lockdep_recursion++; } - EXPORT_SYMBOL(lockdep_off); void lockdep_on(void) { current->lockdep_recursion--; } - EXPORT_SYMBOL(lockdep_on); /* @@ -2513,7 +2511,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (subclass) register_lock_class(lock, subclass, 1); } - EXPORT_SYMBOL_GPL(lockdep_init_map); /* @@ -2694,8 +2691,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, } static int -__lock_set_subclass(struct lockdep_map *lock, - unsigned int subclass, unsigned long ip) +__lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip) { struct task_struct *curr = current; struct held_lock *hlock, *prev_hlock; @@ -2722,6 +2720,7 @@ __lock_set_subclass(struct lockdep_map *lock, return print_unlock_inbalance_bug(curr, lock, ip); found_it: + lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes + 1; @@ -2906,9 +2905,9 @@ static void check_flags(unsigned long flags) #endif } -void -lock_set_subclass(struct lockdep_map *lock, - unsigned int subclass, unsigned long ip) +void lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip) { unsigned long flags; @@ -2918,13 +2917,12 @@ lock_set_subclass(struct lockdep_map *lock, raw_local_irq_save(flags); current->lockdep_recursion = 1; check_flags(flags); - if (__lock_set_subclass(lock, subclass, ip)) + if (__lock_set_class(lock, name, key, subclass, ip)) check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } - -EXPORT_SYMBOL_GPL(lock_set_subclass); +EXPORT_SYMBOL_GPL(lock_set_class); /* * We are not always called with irqs disabled - do that here, @@ -2948,7 +2946,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 0; raw_local_irq_restore(flags); } - EXPORT_SYMBOL_GPL(lock_acquire); void lock_release(struct lockdep_map *lock, int nested, @@ -2966,7 +2963,6 @@ void lock_release(struct lockdep_map *lock, int nested, current->lockdep_recursion = 0; raw_local_irq_restore(flags); } - EXPORT_SYMBOL_GPL(lock_release); #ifdef CONFIG_LOCK_STAT @@ -3451,7 +3447,6 @@ retry: if (unlock) read_unlock(&tasklist_lock); } - EXPORT_SYMBOL_GPL(debug_show_all_locks); /* @@ -3472,7 +3467,6 @@ void debug_show_held_locks(struct task_struct *task) { __debug_show_held_locks(task); } - 
EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) -- cgit v1.2.3 From 3ac52669c7a24b93663acfcab606d1065ed1accd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 13 Dec 2008 09:15:27 -0800 Subject: resources: skip sanity check of busy resources Impact: reduce false positives in iomem_map_sanity_check() Some drivers (vesafb) only map/reserve a portion of a resource. If then some other driver comes in and maps the whole resource, the current code WARN_ON's. This is not the intent of the checks in iomem_map_sanity_check(); rather these checks want to warn when crossing *hardware* resources only. This patch skips BUSY resources as suggested by Linus. Note: having two drivers talk to the same hardware at the same time is obviously not optimal behavior, but that's a separate story. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/resource.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4337063663ef..e633106b12f6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -853,6 +853,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size) if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) continue; + /* + * if a resource is "BUSY", it's not a hardware resource + * but a driver mapping of such a resource; we don't want + * to warn for those; some drivers legitimately map only + * partial hardware resources. (example: vesafb) + */ + if (p->flags & IORESOURCE_BUSY) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " "0x%llx 0x%llx 0x%llx 0x%llx %s\n", (unsigned long long)addr, -- cgit v1.2.3 From 343e9099c8152daff20e10d6269edec21da44fc0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Dec 2008 16:13:07 -0800 Subject: rcu: fix rcutorture behavior during reboot Impact: fix very rare reboot hang Because rcutorture ignored all signals, it does not terminate in response to the signals sent at shutdown time. This can cause strange failures due to its continuing to make use of kernel function too late in the shutdown sequence. This patch therefore adds a shutdown notifier to rcutorture, causing it to shut down in response to a reboot or an orderly shutdown. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcutorture.c | 66 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 85cb90588a55..b31065522104 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,6 @@ struct rcu_torture { int rtort_mbtest; }; -static int fullstop = 0; /* stop generating callbacks at test end. */ static LIST_HEAD(rcu_torture_freelist); static struct rcu_torture *rcu_torture_current = NULL; static long rcu_torture_current_version = 0; @@ -136,6 +136,30 @@ static int stutter_pause_test = 0; #endif int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ +#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ +static int fullstop; /* stop generating callbacks at test end. */ +DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ + /* spawning of kthreads. */ + +/* + * Detect and respond to a signal-based shutdown. 
+ */ +static int +rcutorture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + if (fullstop) + return NOTIFY_DONE; + if (signal_pending(current)) { + mutex_lock(&fullstop_mutex); + if (!ACCESS_ONCE(fullstop)) + fullstop = FULLSTOP_SIGNALED; + mutex_unlock(&fullstop_mutex); + } + return NOTIFY_DONE; +} + /* * Allocate an element from the rcu_tortures pool. */ @@ -199,11 +223,12 @@ rcu_random(struct rcu_random_state *rrsp) static void rcu_stutter_wait(void) { - while (stutter_pause_test || !rcutorture_runnable) + while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) { if (rcutorture_runnable) schedule_timeout_interruptible(1); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); + } } /* @@ -599,7 +624,7 @@ rcu_torture_writer(void *arg) rcu_stutter_wait(); } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - while (!kthread_should_stop()) + while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) schedule_timeout_uninterruptible(1); return 0; } @@ -624,7 +649,7 @@ rcu_torture_fakewriter(void *arg) } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - while (!kthread_should_stop()) + while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) schedule_timeout_uninterruptible(1); return 0; } @@ -734,7 +759,7 @@ rcu_torture_reader(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); if (irqreader && cur_ops->irqcapable) del_timer_sync(&t); - while (!kthread_should_stop()) + while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) schedule_timeout_uninterruptible(1); return 0; } @@ -831,7 +856,7 @@ rcu_torture_stats(void *arg) do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - } while (!kthread_should_stop()); + } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); return 0; } @@ -899,7 +924,7 @@ rcu_torture_shuffle(void *arg) do { schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); - } while (!kthread_should_stop()); + } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); return 0; } @@ -914,10 +939,10 @@ rcu_torture_stutter(void *arg) do { schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 1; - if (!kthread_should_stop()) + if (!kthread_should_stop() && !fullstop) schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 0; - } while (!kthread_should_stop()); + } while (!kthread_should_stop() && !fullstop); VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); return 0; } @@ -934,12 +959,27 @@ rcu_torture_print_module_parms(char *tag) stutter, irqreader); } +static struct notifier_block rcutorture_nb = { + .notifier_call = rcutorture_shutdown_notify, +}; + static void rcu_torture_cleanup(void) { int i; - fullstop = 1; + mutex_lock(&fullstop_mutex); + if (!fullstop) { + /* If being signaled, let it happen, then exit. 
*/ + mutex_unlock(&fullstop_mutex); + schedule_timeout_interruptible(10 * HZ); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + return; + } + fullstop = FULLSTOP_CLEANUP; + mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&rcutorture_nb); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); @@ -1015,6 +1055,8 @@ rcu_torture_init(void) { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops, &sched_ops, &sched_ops_sync, }; + mutex_lock(&fullstop_mutex); + /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { cur_ops = torture_ops[i]; @@ -1024,6 +1066,7 @@ rcu_torture_init(void) if (i == ARRAY_SIZE(torture_ops)) { printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", torture_type); + mutex_unlock(&fullstop_mutex); return (-EINVAL); } if (cur_ops->init) @@ -1146,9 +1189,12 @@ rcu_torture_init(void) goto unwind; } } + register_reboot_notifier(&rcutorture_nb); + mutex_unlock(&fullstop_mutex); return 0; unwind: + mutex_unlock(&fullstop_mutex); rcu_torture_cleanup(); return firsterr; } -- cgit v1.2.3 From 73500ac545d24610eb2cf8579ffc88957e9c5847 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 17 Dec 2008 17:29:56 -0800 Subject: futex: rename field in futex_q to clarify single waiter semantics Impact: simplify code I've tripped over the naming of this field a couple times. The futex_q uses a "waiters" list to represent a single blocked task and then calles wake_up_all(). This can lead to confusion in trying to understand the intent of the code, which is to have a single futex_q for every task waiting on a futex. This patch corrects the problem, using a single pointer to the waiting task, and an appropriate call to wake_up, rather than wake_up_all. Compile and boot tested on an 8way x86_64 machine. Signed-off-by: Darren Hart Acked-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/futex.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ba0d3b83c091..99f8acce08bf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -92,11 +92,12 @@ struct futex_pi_state { * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. * The order of wakup is always to make the first condition true, then - * wake up q->waiters, then make the second condition true. + * wake up q->waiter, then make the second condition true. */ struct futex_q { struct plist_node list; - wait_queue_head_t waiters; + /* There can only be a single waiter */ + wait_queue_head_t waiter; /* Which hash list lock to use: */ spinlock_t *lock_ptr; @@ -573,7 +574,7 @@ static void wake_futex(struct futex_q *q) * The lock in wake_up_all() is a crucial memory barrier after the * plist_del() and also before assigning to q->lock_ptr. */ - wake_up_all(&q->waiters); + wake_up(&q->waiter); /* * The waiting task can free the futex_q as soon as this is written, * without taking any locks. This must come last. @@ -930,7 +931,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - init_waitqueue_head(&q->waiters); + init_waitqueue_head(&q->waiter); get_futex_key_refs(&q->key); hb = hash_futex(&q->key); @@ -1221,7 +1222,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, /* add_wait_queue is the barrier after __set_current_state. 
*/ __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiters, &wait); + add_wait_queue(&q.waiter, &wait); /* * !plist_node_empty() is safe here without any lock. * q.lock_ptr != 0 is not safe, because of ordering against wakeup. -- cgit v1.2.3 From 64db4cfff99c04cd5f550357edcc8780f96b54a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Dec 2008 21:55:32 +0100 Subject: "Tree RCU": scalable classic RCU implementation This patch fixes a long-standing performance bug in classic RCU that results in massive internal-to-RCU lock contention on systems with more than a few hundred CPUs. Although this patch creates a separate flavor of RCU for ease of review and patch maintenance, it is intended to replace classic RCU. This patch still handles stress better than does mainline, so I am still calling it ready for inclusion. This patch is against the -tip tree. Nevertheless, experience on an actual 1000+ CPU machine would still be most welcome. Most of the changes noted below were found while creating an rcutiny (which should permit ejecting the current rcuclassic) and while doing detailed line-by-line documentation. Updates from v9 (http://lkml.org/lkml/2008/12/2/334): o Fixes from remainder of line-by-line code walkthrough, including comment spelling, initialization, undesirable narrowing due to type conversion, removing redundant memory barriers, removing redundant local-variable initialization, and removing redundant local variables. I do not believe that any of these fixes address the CPU-hotplug issues that Andi Kleen was seeing, but please do give it a whirl in case the machine is smarter than I am. A writeup from the walkthrough may be found at the following URL, in case you are suffering from terminal insomnia or masochism: http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf o Made rcutree tracing use seq_file, as suggested some time ago by Lai Jiangshan. o Added a .csv variant of the rcudata debugfs trace file, to allow people having thousands of CPUs to drop the data into a spreadsheet. Tested with oocalc and gnumeric. Updated documentation to suit. Updates from v8 (http://lkml.org/lkml/2008/11/15/139): o Fix a theoretical race between grace-period initialization and force_quiescent_state() that could occur if more than three jiffies were required to carry out the grace-period initialization. Which it might, if you had enough CPUs. o Apply Ingo's printk-standardization patch. o Substitute local variables for repeated accesses to global variables. o Fix comment misspellings and redundant (but harmless) increments of ->n_rcu_pending (this latter after having explicitly added it). o Apply checkpatch fixes. Updates from v7 (http://lkml.org/lkml/2008/10/10/291): o Fixed a number of problems noted by Gautham Shenoy, including the cpu-stall-detection bug that he was having difficulty convincing me was real. ;-) o Changed cpu-stall detection to wait for ten seconds rather than three in order to reduce false positive, as suggested by Ingo Molnar. o Produced a design document (http://lwn.net/Articles/305782/). The act of writing this document uncovered a number of both theoretical and "here and now" bugs as noted below. o Fix dynticks_nesting accounting confusion, simplify WARN_ON() condition, fix kerneldoc comments, and add memory barriers in dynticks interface functions. o Add more data to tracing. o Remove unused "rcu_barrier" field from rcu_data structure. 
o Count calls to rcu_pending() from scheduling-clock interrupt to use as a surrogate timebase should jiffies stop counting. o Fix a theoretical race between force_quiescent_state() and grace-period initialization. Yes, initialization does have to go on for some jiffies for this race to occur, but given enough CPUs... Updates from v6 (http://lkml.org/lkml/2008/9/23/448): o Fix a number of checkpatch.pl complaints. o Apply review comments from Ingo Molnar and Lai Jiangshan on the stall-detection code. o Fix several bugs in !CONFIG_SMP builds. o Fix a misspelled config-parameter name so that RCU now announces at boot time if stall detection is configured. o Run tests on numerous combinations of configurations parameters, which after the fixes above, now build and run correctly. Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line): o Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a changeset some time ago, and finally got around to retesting this option). o Fix some tracing bugs in rcupreempt that caused incorrect totals to be printed. o I now test with a more brutal random-selection online/offline script (attached). Probably more brutal than it needs to be on the people reading it as well, but so it goes. o A number of optimizations and usability improvements: o Make rcu_pending() ignore the grace-period timeout when there is no grace period in progress. o Make force_quiescent_state() avoid going for a global lock in the case where there is no grace period in progress. o Rearrange struct fields to improve struct layout. o Make call_rcu() initiate a grace period if RCU was idle, rather than waiting for the next scheduling clock interrupt. o Invoke rcu_irq_enter() and rcu_irq_exit() only when idle, as suggested by Andi Kleen. I still don't completely trust this change, and might back it out. o Make CONFIG_RCU_TRACE be the single config variable manipulated for all forms of RCU, instead of the prior confusion. o Document tracing files and formats for both rcupreempt and rcutree. Updates from v4 for those missing v5 given its bad subject line: o Separated dynticks interface so that NMIs and irqs call separate functions, greatly simplifying it. In particular, this code no longer requires a proof of correctness. ;-) o Separated dynticks state out into its own per-CPU structure, avoiding the duplicated accounting. o The case where a dynticks-idle CPU runs an irq handler that invokes call_rcu() is now correctly handled, forcing that CPU out of dynticks-idle mode. o Review comments have been applied (thank you all!!!). For but one example, fixed the dynticks-ordering issue that Manfred pointed out, saving me much debugging. ;-) o Adjusted rcuclassic and rcupreempt to handle dynticks changes. Attached is an updated patch to Classic RCU that applies a hierarchy, greatly reducing the contention on the top-level lock for large machines. This passes 10-hour concurrent rcutorture and online-offline testing on 128-CPU ppc64 without dynticks enabled, and exposes some timekeeping bugs in presence of dynticks (exciting working on a system where "sleep 1" hangs until interrupted...), which were fixed in the 2.6.27 kernel. It is getting more reliable than mainline by some measures, so the next version will be against -tip for inclusion. See also Manfred Spraul's recent patches (or his earlier work from 2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2). 
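[Editor's note: the following stand-alone toy model is not part of the patch; it only makes the contention argument above concrete. The single rule it borrows from the design is that each CPU reports its quiescent state to its leaf node and that only the CPU clearing the last outstanding bit of a node proceeds toward the root; the constants (128 CPUs, fanout 64) are illustrative.]

#include <stdio.h>

#define NCPUS	128
#define FANOUT	64
#define NLEAVES	((NCPUS + FANOUT - 1) / FANOUT)

int main(void)
{
	int remaining[NLEAVES];
	int root_acquisitions = 0;

	for (int i = 0; i < NLEAVES; i++)
		remaining[i] = (i == NLEAVES - 1 && NCPUS % FANOUT) ?
			       NCPUS % FANOUT : FANOUT;

	/* Each CPU reports one quiescent state per grace period. */
	for (int cpu = 0; cpu < NCPUS; cpu++)
		if (--remaining[cpu / FANOUT] == 0)
			root_acquisitions++;	/* last CPU of this leaf goes up */

	printf("global-lock scheme: %d acquisitions of the top-level lock\n",
	       NCPUS);
	printf("hierarchical scheme: %d (one per leaf rcu_node)\n",
	       root_acquisitions);
	return 0;
}

In the flat scheme every CPU bounces the same lock once per grace period; in the tree, only the last reporter of each leaf ever touches an upper-level lock.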
We will converge onto a common patch in the fullness of time, but are currently exploring different regions of the design space. That said, I have already gratefully stolen quite a few of Manfred's ideas. This patch provides CONFIG_RCU_FANOUT, which controls the bushiness of the RCU hierarchy. Defaults to 32 on 32-bit machines and 64 on 64-bit machines. If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT, there is no hierarchy. By default, the RCU initialization code will adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable this balancing, allowing the hierarchy to be exactly aligned to the underlying hardware. Up to two levels of hierarchy are permitted (in addition to the root node), allowing up to 16,384 CPUs on 32-bit systems and up to 262,144 CPUs on 64-bit systems. I just know that I am going to regret saying this, but this seems more than sufficient for the foreseeable future. (Some architectures might wish to set CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs. If this becomes a real problem, additional levels can be added, but I doubt that it will make a significant difference on real hardware.) In the common case, a given CPU will manipulate its private rcu_data structure and the rcu_node structure that it shares with its immediate neighbors. This can reduce both lock and memory contention by multiple orders of magnitude, which should eliminate the need for the strange manipulations that are reported to be required when running Linux on very large systems. Some shortcomings: o More bugs will probably surface as a result of an ongoing line-by-line code inspection. Patches will be provided as required. o There are probably hangs, rcutorture failures, &c. Seems quite stable on a 128-CPU machine, but that is kind of small compared to 4096 CPUs. However, seems to do better than mainline. Patches will be provided as required. o The memory footprint of this version is several KB larger than rcuclassic. A separate UP-only rcutiny patch will be provided, which will reduce the memory footprint significantly, even compared to the old rcuclassic. One such patch passes light testing, and has a memory footprint smaller even than rcuclassic. Initial reaction from various embedded guys was "it is not worth it", so am putting it aside. Credits: o Manfred Spraul for ideas, review comments, and bugs spotted, as well as some good friendly competition. ;-) o Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers, Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton for reviews and comments. o Thomas Gleixner for much-needed help with some timer issues (see patches below). o Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos, Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines alive despite my heavy abuse^Wtesting. Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar --- kernel/Kconfig.preempt | 62 +- kernel/Makefile | 6 +- kernel/rcupreempt.c | 10 + kernel/rcupreempt_trace.c | 10 +- kernel/rcutree.c | 1535 +++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree_trace.c | 271 ++++++++ kernel/softirq.c | 5 +- 7 files changed, 1883 insertions(+), 16 deletions(-) create mode 100644 kernel/rcutree.c create mode 100644 kernel/rcutree_trace.c (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 9fdba03dc1fc..463f29743ea0 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -52,10 +52,29 @@ config PREEMPT endchoice +choice + prompt "RCU Implementation" + default CLASSIC_RCU + +config CLASSIC_RCU + bool "Classic RCU" + help + This option selects the classic RCU implementation that is + designed for best read-side performance on non-realtime + systems. + + Select this option if you are unsure. + +config TREE_RCU + bool "Tree-based hierarchical RCU" + help + This option selects the RCU implementation that is + designed for very large SMP system with hundreds or + thousands of CPUs. + config PREEMPT_RCU bool "Preemptible RCU" depends on PREEMPT - default n help This option reduces the latency of the kernel by making certain RCU sections preemptible. Normally RCU code is non-preemptible, if @@ -64,16 +83,47 @@ config PREEMPT_RCU now-naive assumptions about each RCU read-side critical section remaining on a given CPU through its execution. - Say N if you are unsure. +endchoice config RCU_TRACE - bool "Enable tracing for RCU - currently stats in debugfs" - depends on PREEMPT_RCU - select DEBUG_FS - default y + bool "Enable tracing for RCU" + depends on TREE_RCU || PREEMPT_RCU help This option provides tracing in RCU which presents stats in debugfs for debugging RCU implementation. Say Y here if you want to enable RCU tracing Say N if you are unsure. + +config RCU_FANOUT + int "Tree-based hierarchical RCU fanout value" + range 2 64 if 64BIT + range 2 32 if !64BIT + depends on TREE_RCU + default 64 if 64BIT + default 32 if !64BIT + help + This option controls the fanout of hierarchical implementations + of RCU, allowing RCU to work efficiently on machines with + large numbers of CPUs. This value must be at least the cube + root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit + systems and up to 262,144 for 64-bit systems. + + Select a specific number if testing RCU itself. + Take the default if unsure. + +config RCU_FANOUT_EXACT + bool "Disable tree-based hierarchical RCU auto-balancing" + depends on TREE_RCU + default n + help + This option forces use of the exact RCU_FANOUT value specified, + regardless of imbalances in the hierarchy. This is useful for + testing RCU itself, and might one day be useful on systems with + strong NUMA behavior. + + Without RCU_FANOUT_EXACT, the code will balance the hierarchy. + + Say n if unsure. 
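[Editor's note: a quick back-of-the-envelope check of the capacity figures quoted in the help text above, not part of the patch. With a root node plus up to two further levels, a fanout of f covers f^3 CPUs, which is where the 32,768 (32-bit) and 262,144 (64-bit) limits in the Kconfig help come from.]

#include <stdio.h>

int main(void)
{
	/* Default CONFIG_RCU_FANOUT values for 32-bit and 64-bit builds. */
	const long fanouts[] = { 32, 64 };

	for (int i = 0; i < 2; i++) {
		long f = fanouts[i];
		/* Root node plus two more levels: each level multiplies by f. */
		printf("RCU_FANOUT=%ld -> up to %ld CPUs\n", f, f * f * f);
	}
	return 0;
}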
+ + diff --git a/kernel/Makefile b/kernel/Makefile index 19fad003b19d..b4fdbbff5ec0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o +obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o -ifeq ($(CONFIG_PREEMPT_RCU),y) -obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o -endif +obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o +obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 59236e8b9daa..04982659875a 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -551,6 +551,16 @@ void rcu_irq_exit(void) } } +void rcu_nmi_enter(void) +{ + rcu_irq_enter(); +} + +void rcu_nmi_exit(void) +{ + rcu_irq_exit(); +} + static void dyntick_save_progress_counter(int cpu) { struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 35c2d3360ecf..7c2665cac172 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) sp->done_length += cp->done_length; sp->done_add += cp->done_add; sp->done_remove += cp->done_remove; - atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); + atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked); sp->rcu_check_callbacks += cp->rcu_check_callbacks; - atomic_set(&sp->rcu_try_flip_1, - atomic_read(&cp->rcu_try_flip_1)); - atomic_set(&sp->rcu_try_flip_e1, - atomic_read(&cp->rcu_try_flip_e1)); + atomic_add(atomic_read(&cp->rcu_try_flip_1), + &sp->rcu_try_flip_1); + atomic_add(atomic_read(&cp->rcu_try_flip_e1), + &sp->rcu_try_flip_e1); sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; diff --git a/kernel/rcutree.c b/kernel/rcutree.c new file mode 100644 index 000000000000..a342b032112c --- /dev/null +++ b/kernel/rcutree.c @@ -0,0 +1,1535 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * Paul E. McKenney Hierarchical version + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
+ * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); +#endif + +/* Data structures. */ + +#define RCU_STATE_INITIALIZER(name) { \ + .level = { &name.node[0] }, \ + .levelcnt = { \ + NUM_RCU_LVL_0, /* root of hierarchy. */ \ + NUM_RCU_LVL_1, \ + NUM_RCU_LVL_2, \ + NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ + }, \ + .signaled = RCU_SIGNAL_INIT, \ + .gpnum = -300, \ + .completed = -300, \ + .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ + .n_force_qs = 0, \ + .n_force_qs_ngp = 0, \ +} + +struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); +DEFINE_PER_CPU(struct rcu_data, rcu_data); + +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + +#ifdef CONFIG_NO_HZ +DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); +#endif /* #ifdef CONFIG_NO_HZ */ + +static int blimit = 10; /* Maximum callbacks per softirq. */ +static int qhimark = 10000; /* If this many pending, ignore blimit. */ +static int qlowmark = 100; /* Once only this many pending, use blimit. */ + +static void force_quiescent_state(struct rcu_state *rsp, int relaxed); + +/* + * Return the number of RCU batches processed thus far for debug & stats. + */ +long rcu_batches_completed(void) +{ + return rcu_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU BH batches processed thus far for debug & stats. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); + +/* + * Does the CPU have callbacks ready to be invoked? + */ +static int +cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) +{ + return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; +} + +/* + * Does the current CPU require a yet-as-unscheduled grace period? + */ +static int +cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) +{ + /* ACCESS_ONCE() because we are accessing outside of lock. */ + return *rdp->nxttail[RCU_DONE_TAIL] && + ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum); +} + +/* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ + return &rsp->node[0]; +} + +#ifdef CONFIG_SMP + +/* + * If the specified CPU is offline, tell the caller that it is in + * a quiescent state. Otherwise, whack it with a reschedule IPI. + * Grace periods can end up waiting on an offline CPU when that + * CPU is in the process of coming online -- it will be added to the + * rcu_node bitmasks before it actually makes it online. The same thing + * can happen while a CPU is in the process of coming online. Because this + * race is quite rare, we check for it after detecting that the grace + * period has been delayed rather than checking each and every CPU + * each and every time we start a new grace period. + */ +static int rcu_implicit_offline_qs(struct rcu_data *rdp) +{ + /* + * If the CPU is offline, it is in a quiescent state. We can + * trust its state not to change because interrupts are disabled. 
+ */ + if (cpu_is_offline(rdp->cpu)) { + rdp->offline_fqs++; + return 1; + } + + /* The CPU is online, so send it a reschedule IPI. */ + if (rdp->cpu != smp_processor_id()) + smp_send_reschedule(rdp->cpu); + else + set_need_resched(); + rdp->resched_ipi++; + return 0; +} + +#endif /* #ifdef CONFIG_SMP */ + +#ifdef CONFIG_NO_HZ +static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5); + +/** + * rcu_enter_nohz - inform RCU that current CPU is entering nohz + * + * Enter nohz mode, in other words, -leave- the mode in which RCU + * read-side critical sections can occur. (Though RCU read-side + * critical sections can occur in irq handlers in nohz mode, a possibility + * handled by rcu_irq_enter() and rcu_irq_exit()). + */ +void rcu_enter_nohz(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + rdtp->dynticks++; + rdtp->dynticks_nesting--; + WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); + local_irq_restore(flags); +} + +/* + * rcu_exit_nohz - inform RCU that current CPU is leaving nohz + * + * Exit nohz mode, in other words, -enter- the mode in which RCU + * read-side critical sections normally occur. + */ +void rcu_exit_nohz(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + rdtp->dynticks++; + rdtp->dynticks_nesting++; + WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); + local_irq_restore(flags); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ +} + +/** + * rcu_nmi_enter - inform RCU of entry to NMI context + * + * If the CPU was idle with dynamic ticks active, and there is no + * irq handler running, this updates rdtp->dynticks_nmi to let the + * RCU grace-period handling know that the CPU is active. + */ +void rcu_nmi_enter(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks & 0x1) + return; + rdtp->dynticks_nmi++; + WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ +} + +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If the CPU was idle with dynamic ticks active, and there is no + * irq handler running, this updates rdtp->dynticks_nmi to let the + * RCU grace-period handling know that the CPU is no longer active. + */ +void rcu_nmi_exit(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks & 0x1) + return; + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + rdtp->dynticks_nmi++; + WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); +} + +/** + * rcu_irq_enter - inform RCU of entry to hard irq context + * + * If the CPU was idle with dynamic ticks active, this updates the + * rdtp->dynticks to let the RCU handling know that the CPU is active. + */ +void rcu_irq_enter(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks_nesting++) + return; + rdtp->dynticks++; + WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ +} + +/** + * rcu_irq_exit - inform RCU of exit from hard irq context + * + * If the CPU was idle with dynamic ticks active, update the rdp->dynticks + * to put let the RCU handling be aware that the CPU is going back to idle + * with no ticks. 
+ */ +void rcu_irq_exit(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (--rdtp->dynticks_nesting) + return; + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + rdtp->dynticks++; + WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); + + /* If the interrupt queued a callback, get out of dyntick mode. */ + if (__get_cpu_var(rcu_data).nxtlist || + __get_cpu_var(rcu_bh_data).nxtlist) + set_need_resched(); +} + +/* + * Record the specified "completed" value, which is later used to validate + * dynticks counter manipulations. Specify "rsp->completed - 1" to + * unconditionally invalidate any future dynticks manipulations (which is + * useful at the beginning of a grace period). + */ +static void dyntick_record_completed(struct rcu_state *rsp, long comp) +{ + rsp->dynticks_completed = comp; +} + +#ifdef CONFIG_SMP + +/* + * Recall the previously recorded value of the completion for dynticks. + */ +static long dyntick_recall_completed(struct rcu_state *rsp) +{ + return rsp->dynticks_completed; +} + +/* + * Snapshot the specified CPU's dynticks counter so that we can later + * credit them with an implicit quiescent state. Return 1 if this CPU + * is already in a quiescent state courtesy of dynticks idle mode. + */ +static int dyntick_save_progress_counter(struct rcu_data *rdp) +{ + int ret; + int snap; + int snap_nmi; + + snap = rdp->dynticks->dynticks; + snap_nmi = rdp->dynticks->dynticks_nmi; + smp_mb(); /* Order sampling of snap with end of grace period. */ + rdp->dynticks_snap = snap; + rdp->dynticks_nmi_snap = snap_nmi; + ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); + if (ret) + rdp->dynticks_fqs++; + return ret; +} + +/* + * Return true if the specified CPU has passed through a quiescent + * state by virtue of being in or having passed through an dynticks + * idle state since the last call to dyntick_save_progress_counter() + * for this same CPU. + */ +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +{ + long curr; + long curr_nmi; + long snap; + long snap_nmi; + + curr = rdp->dynticks->dynticks; + snap = rdp->dynticks_snap; + curr_nmi = rdp->dynticks->dynticks_nmi; + snap_nmi = rdp->dynticks_nmi_snap; + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq/NMI handlers, then we can safely pretend that the CPU + * already acknowledged the request to pass through a quiescent + * state. Either way, that CPU cannot possibly be in an RCU + * read-side critical section that started before the beginning + * of the current RCU grace period. + */ + if ((curr != snap || (curr & 0x1) == 0) && + (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { + rdp->dynticks_fqs++; + return 1; + } + + /* Go check for the CPU being offline. */ + return rcu_implicit_offline_qs(rdp); +} + +#endif /* #ifdef CONFIG_SMP */ + +#else /* #ifdef CONFIG_NO_HZ */ + +static void dyntick_record_completed(struct rcu_state *rsp, long comp) +{ +} + +#ifdef CONFIG_SMP + +/* + * If there are no dynticks, then the only way that a CPU can passively + * be in a quiescent state is to be offline. Unlike dynticks idle, which + * is a point in time during the prior (already finished) grace period, + * an offline CPU is always in a quiescent state, and thus can be + * unconditionally applied. So just return the current value of completed. 
+ */ +static long dyntick_recall_completed(struct rcu_state *rsp) +{ + return rsp->completed; +} + +static int dyntick_save_progress_counter(struct rcu_data *rdp) +{ + return 0; +} + +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +{ + return rcu_implicit_offline_qs(rdp); +} + +#endif /* #ifdef CONFIG_SMP */ + +#endif /* #else #ifdef CONFIG_NO_HZ */ + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_state *rsp) +{ + rsp->gp_start = jiffies; + rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_state *rsp) +{ + int cpu; + long delta; + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; + struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rnp->lock, flags); + delta = jiffies - rsp->jiffies_stall; + if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rnp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "INFO: RCU detected CPU stalls:"); + for (; rnp_cur < rnp_end; rnp_cur++) { + if (rnp_cur->qsmask == 0) + continue; + for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) + if (rnp_cur->qsmask & (1UL << cpu)) + printk(" %d", rnp_cur->grplo + cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rsp->gp_start)); + force_quiescent_state(rsp, 0); /* Kick them all. */ +} + +static void print_cpu_stall(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(rsp); + + printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", + smp_processor_id(), jiffies - rsp->gp_start); + dump_stack(); + spin_lock_irqsave(&rnp->lock, flags); + if ((long)(jiffies - rsp->jiffies_stall) >= 0) + rsp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rnp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) +{ + long delta; + struct rcu_node *rnp; + + delta = jiffies - rsp->jiffies_stall; + rnp = rdp->mynode; + if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rsp); + + } else if (rsp->gpnum != rsp->completed && + delta >= RCU_STALL_RAT_DELAY) { + + /* They had two time units to dump stack, so complain. */ + print_other_cpu_stall(rsp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_state *rsp) +{ +} + +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * Update CPU-local rcu_data state to record the newly noticed grace period. + * This is used both when we started the grace period and when we notice + * that someone else started the grace period. + */ +static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) +{ + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + rdp->gpnum = rsp->gpnum; + rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + + RCU_JIFFIES_TILL_FORCE_QS; +} + +/* + * Did someone else start a new RCU grace period start since we last + * checked? 
Update local state appropriately if so. Must be called + * on the CPU corresponding to rdp. + */ +static int +check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + if (rdp->gpnum != rsp->gpnum) { + note_new_gpnum(rsp, rdp); + ret = 1; + } + local_irq_restore(flags); + return ret; +} + +/* + * Start a new RCU grace period if warranted, re-initializing the hierarchy + * in preparation for detecting the next grace period. The caller must hold + * the root node's ->lock, which is released before return. Hard irqs must + * be disabled. + */ +static void +rcu_start_gp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_node *rnp_cur; + struct rcu_node *rnp_end; + + if (!cpu_needs_another_gp(rsp, rdp)) { + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + /* Advance to a new grace period and initialize state. */ + rsp->gpnum++; + rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + + RCU_JIFFIES_TILL_FORCE_QS; + record_gp_stall_check_time(rsp); + dyntick_record_completed(rsp, rsp->completed - 1); + note_new_gpnum(rsp, rdp); + + /* + * Because we are first, we know that all our callbacks will + * be covered by this upcoming grace period, even the ones + * that were registered arbitrarily recently. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Special-case the common single-level case. */ + if (NUM_RCU_NODES == 1) { + rnp->qsmask = rnp->qsmaskinit; + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + spin_unlock(&rnp->lock); /* leave irqs disabled. */ + + + /* Exclude any concurrent CPU-hotplug operations. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + + /* + * Set the quiescent-state-needed bits in all the non-leaf RCU + * nodes for all currently online CPUs. This operation relies + * on the layout of the hierarchy within the rsp->node[] array. + * Note that other CPUs will access only the leaves of the + * hierarchy, which still indicate that no grace period is in + * progress. In addition, we have excluded CPU-hotplug operations. + * + * We therefore do not need to hold any locks. Any required + * memory barriers will be supplied by the locks guarding the + * leaf rcu_nodes in the hierarchy. + */ + + rnp_end = rsp->level[NUM_RCU_LVLS - 1]; + for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++) + rnp_cur->qsmask = rnp_cur->qsmaskinit; + + /* + * Now set up the leaf nodes. Here we must be careful. First, + * we need to hold the lock in order to exclude other CPUs, which + * might be contending for the leaf nodes' locks. Second, as + * soon as we initialize a given leaf node, its CPUs might run + * up the rest of the hierarchy. We must therefore acquire locks + * for each node that we touch during this stage. (But we still + * are excluding CPU-hotplug operations.) + * + * Note that the grace period cannot complete until we finish + * the initialization process, as there will be at least one + * qsmask bit set in the root node until that time, namely the + * one corresponding to this CPU. 
+ */ + rnp_end = &rsp->node[NUM_RCU_NODES]; + rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; + for (; rnp_cur < rnp_end; rnp_cur++) { + spin_lock(&rnp_cur->lock); /* irqs already disabled. */ + rnp_cur->qsmask = rnp_cur->qsmaskinit; + spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ + } + + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +/* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. + */ +static void +rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +{ + long completed_snap; + unsigned long flags; + + local_irq_save(flags); + completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */ + + /* Did another grace period end? */ + if (rdp->completed != completed_snap) { + + /* Advance callbacks. No harm if list empty. */ + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Remember that we saw this grace-period completion. */ + rdp->completed = completed_snap; + } + local_irq_restore(flags); +} + +/* + * Similar to cpu_quiet(), for which it is a helper function. Allows + * a group of CPUs to be quieted at one go, though all the CPUs in the + * group must be represented by the same leaf rcu_node structure. + * That structure's lock must be held upon entry, and it is released + * before return. + */ +static void +cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long flags) + __releases(rnp->lock) +{ + /* Walk up the rcu_node hierarchy. */ + for (;;) { + if (!(rnp->qsmask & mask)) { + + /* Our bit has already been cleared, so done. */ + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rnp->qsmask &= ~mask; + if (rnp->qsmask != 0) { + + /* Other bits still set at this level, so done. */ + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + mask = rnp->grpmask; + if (rnp->parent == NULL) { + + /* No more levels. Exit loop holding root lock. */ + + break; + } + spin_unlock_irqrestore(&rnp->lock, flags); + rnp = rnp->parent; + spin_lock_irqsave(&rnp->lock, flags); + } + + /* + * Get here if we are the last CPU to pass through a quiescent + * state for this grace period. Clean up and let rcu_start_gp() + * start up the next grace period if one is needed. Note that + * we still hold rnp->lock, as required by rcu_start_gp(), which + * will release it. + */ + rsp->completed = rsp->gpnum; + rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); + rcu_start_gp(rsp, flags); /* releases rnp->lock. */ +} + +/* + * Record a quiescent state for the specified CPU, which must either be + * the current CPU or an offline CPU. The lastcomp argument is used to + * make sure we are still in the grace period of interest. We don't want + * to end the current grace period based on quiescent states detected in + * an earlier grace period! + */ +static void +cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp; + + rnp = rdp->mynode; + spin_lock_irqsave(&rnp->lock, flags); + if (lastcomp != ACCESS_ONCE(rsp->completed)) { + + /* + * Someone beat us to it for this grace period, so leave. 
+ * The race with GP start is resolved by the fact that we + * hold the leaf rcu_node lock, so that the per-CPU bits + * cannot yet be initialized -- so we would simply find our + * CPU's bit already cleared in cpu_quiet_msk() if this race + * occurred. + */ + rdp->passed_quiesc = 0; /* try again later! */ + spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + mask = rdp->grpmask; + if ((rnp->qsmask & mask) == 0) { + spin_unlock_irqrestore(&rnp->lock, flags); + } else { + rdp->qs_pending = 0; + + /* + * This GP can't end until cpu checks in, so all of our + * callbacks can be processed during the next GP. + */ + rdp = rsp->rda[smp_processor_id()]; + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ + } +} + +/* + * Check to see if there is a new grace period of which this CPU + * is not yet aware, and if so, set up local rcu_data state for it. + * Otherwise, see if this CPU has just passed through its first + * quiescent state for this grace period, and record that fact if so. + */ +static void +rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) +{ + /* If there is now a new grace period, record and return. */ + if (check_for_new_grace_period(rsp, rdp)) + return; + + /* + * Does this CPU still need to do its part for current grace period? + * If no, return and let the other CPUs do their part as well. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. + */ + if (!rdp->passed_quiesc) + return; + + /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ + cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy + * and move all callbacks from the outgoing CPU to the current one. + */ +static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) +{ + int i; + unsigned long flags; + long lastcomp; + unsigned long mask; + struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp_me; + struct rcu_node *rnp; + + /* Exclude any attempts to start a new grace period. */ + spin_lock_irqsave(&rsp->onofflock, flags); + + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ + rnp = rdp->mynode; + mask = rdp->grpmask; /* rnp->grplo is constant. */ + do { + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit != 0) { + spin_unlock(&rnp->lock); /* irqs already disabled. */ + break; + } + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp = rnp->parent; + } while (rnp != NULL); + lastcomp = rsp->completed; + + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + + /* Being offline is a quiescent state, so go record it. */ + cpu_quiet(cpu, rsp, rdp, lastcomp); + + /* + * Move callbacks from the outgoing CPU to the running CPU. + * Note that the outgoing CPU is now quiscent, so it is now + * (uncharacteristically) safe to access it rcu_data structure. + * Note also that we must carefully retain the order of the + * outgoing CPU's callbacks in order for rcu_barrier() to work + * correctly. Finally, note that we start all the callbacks + * afresh, even those that have passed through a grace period + * and are therefore ready to invoke. 
The theory is that hotplug + * events are rare, and that if they are frequent enough to + * indefinitely delay callbacks, you have far worse things to + * be worrying about. + */ + rdp_me = rsp->rda[smp_processor_id()]; + if (rdp->nxtlist != NULL) { + *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp_me->qlen += rdp->qlen; + rdp->qlen = 0; + } + local_irq_restore(flags); +} + +/* + * Remove the specified CPU from the RCU hierarchy and move any pending + * callbacks that it might have to the current CPU. This code assumes + * that at least one CPU in the system will remain running at all times. + * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. + */ +static void rcu_offline_cpu(int cpu) +{ + __rcu_offline_cpu(cpu, &rcu_state); + __rcu_offline_cpu(cpu, &rcu_bh_state); +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_offline_cpu(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Invoke any RCU callbacks that have made it to the end of their grace + * period. Thottle as specified by rdp->blimit. + */ +static void rcu_do_batch(struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_head *next, *list, **tail; + int count; + + /* If no callbacks are ready, just return.*/ + if (!cpu_has_callbacks_ready_to_invoke(rdp)) + return; + + /* + * Extract the list of ready callbacks, disabling to prevent + * races with call_rcu() from interrupt handlers. + */ + local_irq_save(flags); + list = rdp->nxtlist; + rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = NULL; + tail = rdp->nxttail[RCU_DONE_TAIL]; + for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) + if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[count] = &rdp->nxtlist; + local_irq_restore(flags); + + /* Invoke callbacks. */ + count = 0; + while (list) { + next = list->next; + prefetch(next); + list->func(list); + list = next; + if (++count >= rdp->blimit) + break; + } + + local_irq_save(flags); + + /* Update count, and requeue any remaining callbacks. */ + rdp->qlen -= count; + if (list != NULL) { + *tail = rdp->nxtlist; + rdp->nxtlist = list; + for (count = 0; count < RCU_NEXT_SIZE; count++) + if (&rdp->nxtlist == rdp->nxttail[count]) + rdp->nxttail[count] = tail; + else + break; + } + + /* Reinstate batch limit if we have worked down the excess. */ + if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; + + local_irq_restore(flags); + + /* Re-raise the RCU softirq if there are callbacks remaining. */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Check to see if this CPU is in a non-context-switch quiescent state + * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). + * Also schedule the RCU softirq handler. + * + * This function must be called with hardirqs disabled. It is normally + * invoked from the scheduling-clock interrupt. If rcu_pending returns + * false, there is no point in invoking rcu_check_callbacks(). + */ +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so count it. 
+ * + * No memory barrier is required here because both + * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference + * only CPU-local variables that other CPUs neither + * access nor modify, at least not while the corresponding + * CPU is online. + */ + + rcu_qsctr_inc(cpu); + rcu_bh_qsctr_inc(cpu); + + } else if (!in_softirq()) { + + /* + * Get here if this CPU did not take its interrupt from + * softirq, in other words, if it is not interrupting + * a rcu_bh read-side critical section. This is an _bh + * critical section, so count it. + */ + + rcu_bh_qsctr_inc(cpu); + } + raise_softirq(RCU_SOFTIRQ); +} + +#ifdef CONFIG_SMP + +/* + * Scan the leaf rcu_node structures, processing dyntick state for any that + * have not yet encountered a quiescent state, using the function specified. + * Returns 1 if the current grace period ends while scanning (possibly + * because we made it end). + */ +static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, + int (*f)(struct rcu_data *)) +{ + unsigned long bit; + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; + struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + + for (; rnp_cur < rnp_end; rnp_cur++) { + mask = 0; + spin_lock_irqsave(&rnp_cur->lock, flags); + if (rsp->completed != lastcomp) { + spin_unlock_irqrestore(&rnp_cur->lock, flags); + return 1; + } + if (rnp_cur->qsmask == 0) { + spin_unlock_irqrestore(&rnp_cur->lock, flags); + continue; + } + cpu = rnp_cur->grplo; + bit = 1; + for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { + if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) + mask |= bit; + } + if (mask != 0 && rsp->completed == lastcomp) { + + /* cpu_quiet_msk() releases rnp_cur->lock. */ + cpu_quiet_msk(mask, rsp, rnp_cur, flags); + continue; + } + spin_unlock_irqrestore(&rnp_cur->lock, flags); + } + return 0; +} + +/* + * Force quiescent states on reluctant CPUs, and also detect which + * CPUs are in dyntick-idle mode. + */ +static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +{ + unsigned long flags; + long lastcomp; + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_node *rnp = rcu_get_root(rsp); + u8 signaled; + + if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) + return; /* No grace period in progress, nothing to force. */ + if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { + rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ + return; /* Someone else is already on the job. */ + } + if (relaxed && + (long)(rsp->jiffies_force_qs - jiffies) >= 0 && + (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0) + goto unlock_ret; /* no emergency and done recently. */ + rsp->n_force_qs++; + spin_lock(&rnp->lock); + lastcomp = rsp->completed; + signaled = rsp->signaled; + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending + + RCU_JIFFIES_TILL_FORCE_QS; + if (lastcomp == rsp->gpnum) { + rsp->n_force_qs_ngp++; + spin_unlock(&rnp->lock); + goto unlock_ret; /* no GP in progress, time updated. */ + } + spin_unlock(&rnp->lock); + switch (signaled) { + case RCU_GP_INIT: + + break; /* grace period still initializing, ignore. */ + + case RCU_SAVE_DYNTICK: + + if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) + break; /* So gcc recognizes the dead code. */ + + /* Record dyntick-idle state. */ + if (rcu_process_dyntick(rsp, lastcomp, + dyntick_save_progress_counter)) + goto unlock_ret; + + /* Update state, record completion counter. 
*/ + spin_lock(&rnp->lock); + if (lastcomp == rsp->completed) { + rsp->signaled = RCU_FORCE_QS; + dyntick_record_completed(rsp, lastcomp); + } + spin_unlock(&rnp->lock); + break; + + case RCU_FORCE_QS: + + /* Check dyntick-idle state, send IPI to laggarts. */ + if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), + rcu_implicit_dynticks_qs)) + goto unlock_ret; + + /* Leave state in case more forcing is required. */ + + break; + } +unlock_ret: + spin_unlock_irqrestore(&rsp->fqslock, flags); +} + +#else /* #ifdef CONFIG_SMP */ + +static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +{ + set_need_resched(); +} + +#endif /* #else #ifdef CONFIG_SMP */ + +/* + * This does the RCU processing work from softirq context for the + * specified rcu_state and rcu_data structures. This may be called + * only from the CPU to whom the rdp belongs. + */ +static void +__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + + /* + * If an RCU GP has gone long enough, go check for dyntick + * idle CPUs and, if needed, send resched IPIs. + */ + if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || + (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0) + force_quiescent_state(rsp, 1); + + /* + * Advance callbacks in response to end of earlier grace + * period that some other CPU ended. + */ + rcu_process_gp_end(rsp, rdp); + + /* Update RCU state based on any recent quiescent states. */ + rcu_check_quiescent_state(rsp, rdp); + + /* Does this CPU require a not-yet-started grace period? */ + if (cpu_needs_another_gp(rsp, rdp)) { + spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + rcu_start_gp(rsp, flags); /* releases above lock */ + } + + /* If there are callbacks ready, invoke them. */ + rcu_do_batch(rdp); +} + +/* + * Do softirq processing for the current CPU. + */ +static void rcu_process_callbacks(struct softirq_action *unused) +{ + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be seen before any RCU + * grace-period manipulations below. + */ + smp_mb(); /* See above block comment. */ + + __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); + __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be seen after any RCU + * grace-period manipulations above. + */ + smp_mb(); /* See above block comment. */ +} + +static void +__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), + struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + head->func = func; + head->next = NULL; + + smp_mb(); /* Ensure RCU update seen before callback registry. */ + + /* + * Opportunistically note grace-period endings and beginnings. + * Note that we might see a beginning right after we see an + * end, but never vice versa, since this CPU has to pass through + * a quiescent state betweentimes. + */ + local_irq_save(flags); + rdp = rsp->rda[smp_processor_id()]; + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Add the callback to our list. */ + *rdp->nxttail[RCU_NEXT_TAIL] = head; + rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + + /* Start a new grace period if one not already started. 
*/ + if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ + } + + /* Force the grace period if too many callbacks or too long waiting. */ + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = LONG_MAX; + force_quiescent_state(rsp, 0); + } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || + (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0) + force_quiescent_state(rsp, 1); + local_irq_restore(flags); +} + +/* + * Queue an RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_state); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/* + * Queue an RCU for invocation after a quicker grace period. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_state); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, for the specified type of RCU, returning 1 if so. + * The checks are in order of increasing expense: checks that can be + * carried out against CPU-local state are performed first. However, + * we must check for CPU stalls first, else we might not get a chance. + */ +static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) +{ + rdp->n_rcu_pending++; + + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rsp, rdp); + + /* Is the RCU core waiting for a quiescent state from this CPU? */ + if (rdp->qs_pending) + return 1; + + /* Does this CPU have callbacks ready to invoke? */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) + return 1; + + /* Has RCU gone idle with this CPU needing another grace period? */ + if (cpu_needs_another_gp(rsp, rdp)) + return 1; + + /* Has another RCU grace period completed? */ + if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ + return 1; + + /* Has a new RCU grace period started? */ + if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ + return 1; + + /* Has an RCU GP gone long enough to send resched IPIs &c? */ + if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || + (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)) + return 1; + + /* nothing to do */ + return 0; +} + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. + */ +int rcu_pending(int cpu) +{ + return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || + __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); +} + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu(int cpu) +{ + /* RCU callbacks either ready or pending? */ + return per_cpu(rcu_data, cpu).nxtlist || + per_cpu(rcu_bh_data, cpu).nxtlist; +} + +/* + * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" + * approach so that we don't have to worry about how long the CPU has + * been gone, or whether it ever was online previously. 
We do trust the + * ->mynode field, as it is constant for a given struct rcu_data and + * initialized during early boot. + * + * Note that only one online or offline event can be happening at a given + * time. Note also that we can accept some slop in the rsp->completed + * access due to the fact that this CPU cannot possibly have any RCU + * callbacks in flight yet. + */ +static void +rcu_init_percpu_data(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + int i; + long lastcomp; + unsigned long mask; + struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Set up local state, ensuring consistent view of global state. */ + spin_lock_irqsave(&rnp->lock, flags); + lastcomp = rsp->completed; + rdp->completed = lastcomp; + rdp->gpnum = lastcomp; + rdp->passed_quiesc = 0; /* We could be racing with new GP, */ + rdp->qs_pending = 1; /* so set up to respond to current GP. */ + rdp->beenonline = 1; /* We have now been online. */ + rdp->passed_quiesc_completed = lastcomp - 1; + rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen = 0; + rdp->blimit = blimit; +#ifdef CONFIG_NO_HZ + rdp->dynticks = &per_cpu(rcu_dynticks, cpu); +#endif /* #ifdef CONFIG_NO_HZ */ + rdp->cpu = cpu; + spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * A new grace period might start here. If so, we won't be part + * of it, but that is OK, as we are currently in a quiescent state. + */ + + /* Exclude any attempts to start a new GP on large systems. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + + /* Add CPU to rcu_node bitmasks. */ + rnp = rdp->mynode; + mask = rdp->grpmask; + do { + /* Exclude any attempts to start a new GP on small systems. */ + spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit |= mask; + mask = rnp->grpmask; + spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp = rnp->parent; + } while (rnp != NULL && !(rnp->qsmaskinit & mask)); + + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + + /* + * A new grace period might start here. If so, we will be part of + * it, and its gpnum will be greater than ours, so we will + * participate. It is also possible for the gpnum to have been + * incremented before this function was called, and the bitmasks + * to not be filled out until now, in which case we will also + * participate due to our gpnum being behind. + */ + + /* Since it is coming online, the CPU is in a quiescent state. */ + cpu_quiet(cpu, rsp, rdp, lastcomp); + local_irq_restore(flags); +} + +static void __cpuinit rcu_online_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + rdtp->dynticks_nesting = 1; + rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */ + rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1; +#endif /* #ifdef CONFIG_NO_HZ */ + rcu_init_percpu_data(cpu, &rcu_state); + rcu_init_percpu_data(cpu, &rcu_bh_state); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} + +/* + * Handle CPU online/offline notifcation events. 
+ */ +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + rcu_online_cpu(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + rcu_offline_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. + */ +#ifdef CONFIG_RCU_FANOUT_EXACT +static void __init rcu_init_levelspread(struct rcu_state *rsp) +{ + int i; + + for (i = NUM_RCU_LVLS - 1; i >= 0; i--) + rsp->levelspread[i] = CONFIG_RCU_FANOUT; +} +#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ +static void __init rcu_init_levelspread(struct rcu_state *rsp) +{ + int ccur; + int cprv; + int i; + + cprv = NR_CPUS; + for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + ccur = rsp->levelcnt[i]; + rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } +} +#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ + +/* + * Helper function for rcu_init() that initializes one rcu_state structure. + */ +static void __init rcu_init_one(struct rcu_state *rsp) +{ + int cpustride = 1; + int i; + int j; + struct rcu_node *rnp; + + /* Initialize the level-tracking arrays. */ + + for (i = 1; i < NUM_RCU_LVLS; i++) + rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; + rcu_init_levelspread(rsp); + + /* Initialize the elements themselves, starting from the leaves. */ + + for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + cpustride *= rsp->levelspread[i]; + rnp = rsp->level[i]; + for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + spin_lock_init(&rnp->lock); + rnp->qsmask = 0; + rnp->qsmaskinit = 0; + rnp->grplo = j * cpustride; + rnp->grphi = (j + 1) * cpustride - 1; + if (rnp->grphi >= NR_CPUS) + rnp->grphi = NR_CPUS - 1; + if (i == 0) { + rnp->grpnum = 0; + rnp->grpmask = 0; + rnp->parent = NULL; + } else { + rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpmask = 1UL << rnp->grpnum; + rnp->parent = rsp->level[i - 1] + + j / rsp->levelspread[i - 1]; + } + rnp->level = i; + } + } +} + +/* + * Helper macro for __rcu_init(). To be used nowhere else! + * Assigns leaf node pointers into each CPU's rcu_data structure. + */ +#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ +do { \ + rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ + j = 0; \ + for_each_possible_cpu(i) { \ + if (i > rnp[j].grphi) \ + j++; \ + per_cpu(rcu_data, i).mynode = &rnp[j]; \ + (rsp)->rda[i] = &per_cpu(rcu_data, i); \ + } \ +} while (0) + +static struct notifier_block __cpuinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + +void __init __rcu_init(void) +{ + int i; /* All used by RCU_DATA_PTR_INIT(). 
*/ + int j; + struct rcu_node *rnp; + + printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + rcu_init_one(&rcu_state); + RCU_DATA_PTR_INIT(&rcu_state, rcu_data); + rcu_init_one(&rcu_bh_state); + RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); + + for_each_online_cpu(i) + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i); + /* Register notifier for non-boot CPUs */ + register_cpu_notifier(&rcu_nb); + printk(KERN_WARNING "Experimental hierarchical RCU init done.\n"); +} + +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c new file mode 100644 index 000000000000..d6db3e837826 --- /dev/null +++ b/kernel/rcutree_trace.c @@ -0,0 +1,271 @@ +/* + * Read-Copy Update tracing for classic implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/rcupdate.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <asm/atomic.h> +#include <linux/bitops.h> +#include <linux/module.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/mutex.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) +{ + if (!rdp->beenonline) + return; + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!'
: ' ', + rdp->completed, rdp->gpnum, + rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->qs_pending, + rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending, + (int)(rdp->n_rcu_pending & 0xffff)); +#ifdef CONFIG_NO_HZ + seq_printf(m, " dt=%d/%d dn=%d df=%lu", + rdp->dynticks->dynticks, + rdp->dynticks->dynticks_nesting, + rdp->dynticks->dynticks_nmi, + rdp->dynticks_fqs); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); +} + +#define PRINT_RCU_DATA(name, func, m) \ + do { \ + int _p_r_d_i; \ + \ + for_each_possible_cpu(_p_r_d_i) \ + func(m, &per_cpu(name, _p_r_d_i)); \ + } while (0) + +static int show_rcudata(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); + seq_puts(m, "rcu_bh:\n"); + PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); + return 0; +} + +static int rcudata_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcudata, NULL); +} + +static struct file_operations rcudata_fops = { + .owner = THIS_MODULE, + .open = rcudata_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) +{ + if (!rdp->beenonline) + return; + seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", + rdp->completed, rdp->gpnum, + rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->qs_pending, + rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending, + rdp->n_rcu_pending); +#ifdef CONFIG_NO_HZ + seq_printf(m, ",%d,%d,%d,%lu", + rdp->dynticks->dynticks, + rdp->dynticks->dynticks_nesting, + rdp->dynticks->dynticks_nmi, + rdp->dynticks_fqs); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); +} + +static int show_rcudata_csv(struct seq_file *m, void *unused) +{ + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\","); +#ifdef CONFIG_NO_HZ + seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); + seq_puts(m, "\"rcu:\"\n"); + PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); + seq_puts(m, "\"rcu_bh:\"\n"); + PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); + return 0; +} + +static int rcudata_csv_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcudata_csv, NULL); +} + +static struct file_operations rcudata_csv_fops = { + .owner = THIS_MODULE, + .open = rcudata_csv_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) +{ + int level = 0; + struct rcu_node *rnp; + + seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + rsp->completed, rsp->gpnum, rsp->signaled, + (long)(rsp->jiffies_force_qs - jiffies), + (int)(jiffies & 0xffff), + rsp->n_force_qs, rsp->n_force_qs_ngp, + rsp->n_force_qs - rsp->n_force_qs_ngp, + rsp->n_force_qs_lh); + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { + if (rnp->level != level) { + seq_puts(m, "\n"); + level = rnp->level; + } + seq_printf(m, "%lx/%lx %d:%d ^%d ", + rnp->qsmask, rnp->qsmaskinit, + rnp->grplo, rnp->grphi, rnp->grpnum); + } + seq_puts(m, "\n"); +} + +static int show_rcuhier(struct seq_file *m, 
void *unused) +{ + seq_puts(m, "rcu:\n"); + print_one_rcu_state(m, &rcu_state); + seq_puts(m, "rcu_bh:\n"); + print_one_rcu_state(m, &rcu_bh_state); + return 0; +} + +static int rcuhier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcuhier, NULL); +} + +static struct file_operations rcuhier_fops = { + .owner = THIS_MODULE, + .open = rcuhier_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int show_rcugp(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", + rcu_state.completed, rcu_state.gpnum); + seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", + rcu_bh_state.completed, rcu_bh_state.gpnum); + return 0; +} + +static int rcugp_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcugp, NULL); +} + +static struct file_operations rcugp_fops = { + .owner = THIS_MODULE, + .open = rcugp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static int __init rcuclassic_trace_init(void) +{ + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto out; + + datadir = debugfs_create_file("rcudata", 0444, rcudir, + NULL, &rcudata_fops); + if (!datadir) + goto free_out; + + datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, + NULL, &rcudata_csv_fops); + if (!datadir_csv) + goto free_out; + + gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); + if (!gpdir) + goto free_out; + + hierdir = debugfs_create_file("rcuhier", 0444, rcudir, + NULL, &rcuhier_fops); + if (!hierdir) + goto free_out; + return 0; +free_out: + if (datadir) + debugfs_remove(datadir); + if (datadir_csv) + debugfs_remove(datadir_csv); + if (gpdir) + debugfs_remove(gpdir); + debugfs_remove(rcudir); +out: + return 1; +} + +static void __exit rcuclassic_trace_cleanup(void) +{ + debugfs_remove(datadir); + debugfs_remove(datadir_csv); + debugfs_remove(gpdir); + debugfs_remove(hierdir); + debugfs_remove(rcudir); +} + + +module_init(rcuclassic_trace_init); +module_exit(rcuclassic_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); +MODULE_LICENSE("GPL"); diff --git a/kernel/softirq.c b/kernel/softirq.c index e7c69a720d69..80d323e6f61a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -269,6 +269,7 @@ void irq_enter(void) { int cpu = smp_processor_id(); + rcu_irq_enter(); if (idle_cpu(cpu) && !in_interrupt()) { __irq_enter(); tick_check_idle(cpu); @@ -295,9 +296,9 @@ void irq_exit(void) #ifdef CONFIG_NO_HZ /* Make sure that timer wheel updates are propagated */ - if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) - tick_nohz_stop_sched_tick(0); rcu_irq_exit(); + if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) + tick_nohz_stop_sched_tick(0); #endif preempt_enable_no_resched(); } -- cgit v1.2.3 From b56863630ddbdea6e22df8835f78f0b1da037103 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 18 Dec 2008 15:06:34 -0800 Subject: futex: clean up futex_(un)lock_pi fault handling Impact: cleanup Some apparently left over cruft code was complicating the fault logic: Testing if uval != -EFAULT doesn't have any meaning, get_user() sets ret to either 0 or -EFAULT, there's no need to compare uval, especially not against EFAULT which it will never be. This patch removes the superfluous test and clarifies the comment blocks. 
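As an aside, and not part of the patch itself: the reasoning above relies on get_user()'s contract of returning 0 on success or -EFAULT on a fault, with the fetched word being ordinary futex data rather than an error code. The sketch below models that contract in plain user-space C so it can be compiled and run; fake_get_user() and every other name in it are made up for illustration only.

/*
 * Models the get_user() contract assumed above: the return value alone
 * says whether the access worked; the fetched value is just data.
 */
#include <errno.h>
#include <stdio.h>

static int fake_get_user(unsigned int *dst, const unsigned int *uaddr)
{
	if (uaddr == NULL)
		return -EFAULT;		/* simulated fault; *dst untouched */
	*dst = *uaddr;			/* simulated copy from user space */
	return 0;
}

int main(void)
{
	/* A futex word may legally hold any bit pattern, even this one. */
	unsigned int futex_word = (unsigned int)-EFAULT;
	unsigned int uval = 0;
	int ret;

	ret = fake_get_user(&uval, &futex_word);

	/* The check the patch keeps: the return value is all that matters. */
	if (!ret)
		printf("fetch succeeded, retry the operation (uval=%#x)\n", uval);

	/* The check the patch removes: uval is data, so comparing it with
	 * -EFAULT says nothing about whether the access faulted, and here
	 * it would wrongly veto the retry even though the fetch worked. */
	if (!ret && uval != (unsigned int)-EFAULT)
		printf("old test: retry\n");
	else
		printf("old test: no retry, despite a successful fetch\n");

	return 0;
}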
Build and boot tested on an 8-way x86_64 system. Signed-off-by: Darren Hart Signed-off-by: Ingo Molnar --- kernel/futex.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 99f8acce08bf..b4f87bac91c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1565,12 +1565,11 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, uaddr_faulted: /* - * We have to r/w *(int __user *)uaddr, but we can't modify it - * non-atomically. Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - * - * ... and hb->lock. :-) --ANK + * We have to r/w *(int __user *)uaddr, and we have to modify it + * atomically. Therefore, if we continue to fault after get_user() + * below, we need to handle the fault ourselves, while still holding + * the mmap_sem. This can occur if the uaddr is under contention as + * we have to drop the mmap_sem in order to call get_user(). */ queue_unlock(&q, hb); @@ -1582,7 +1581,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } ret = get_user(uval, uaddr); - if (!ret && (uval != -EFAULT)) + if (!ret) goto retry; if (to) @@ -1676,12 +1675,11 @@ out: pi_faulted: /* - * We have to r/w *(int __user *)uaddr, but we can't modify it - * non-atomically. Therefore, if get_user below is not - * enough, we need to handle the fault ourselves, while - * still holding the mmap_sem. - * - * ... and hb->lock. --ANK + * We have to r/w *(int __user *)uaddr, and we have to modify it + * atomically. Therefore, if we continue to fault after get_user() + * below, we need to handle the fault ourselves, while still holding + * the mmap_sem. This can occur if the uaddr is under contention as + * we have to drop the mmap_sem in order to call get_user(). */ spin_unlock(&hb->lock); @@ -1694,7 +1692,7 @@ pi_faulted: } ret = get_user(uval, uaddr); - if (!ret && (uval != -EFAULT)) + if (!ret) goto retry; return ret; -- cgit v1.2.3 From 26cc271db798cf211d35967cbfbb53e997126b84 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 19 Dec 2008 10:23:03 -0800 Subject: printk: fix discarding message when recursion_bug Impact: fix truncated recursion bug message printout When recursion_bug is true, the kernel discards the original message: printed_len is set to sizeof(recursion_bug_msg), which includes the NUL terminator that strcpy() left in printk_buf, so the original text is appended after that embedded NUL and never makes it out. Use strlen() instead to get the correct length, without the NUL terminator.
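As an aside, not part of the commit: the whole fix boils down to sizeof versus strlen on a NUL-terminated string. A minimal user-space sketch, where the buffer and message names are invented stand-ins for printk_buf and recursion_bug_msg:

/*
 * sizeof on a string constant counts the trailing '\0'; strlen does not.
 * Using sizeof as the "already printed" length makes the next append land
 * one byte past that embedded terminator, so anything that walks the
 * buffer up to the first NUL never sees the appended text.
 */
#include <stdio.h>
#include <string.h>

static const char recursion_msg[] = "BUG: recent printk recursion!\n";

int main(void)
{
	char buf[128];
	size_t printed_len;

	strcpy(buf, recursion_msg);
	printed_len = sizeof(recursion_msg);	/* includes the trailing '\0' */
	snprintf(buf + printed_len, sizeof(buf) - printed_len, "original message");
	printf("with sizeof: %s", buf);		/* original message is lost */

	strcpy(buf, recursion_msg);
	printed_len = strlen(recursion_msg);	/* excludes the terminator */
	snprintf(buf + printed_len, sizeof(buf) - printed_len, "original message");
	printf("with strlen: %s\n", buf);	/* both messages survive */

	return 0;
}

The second case shows the behaviour the one-line change restores: the appended text starts on top of the copied-in NUL instead of one byte past it.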
Reported-by: Toshikazu Nakayama Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index f492f1583d77..e651ab05655f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) if (recursion_bug) { recursion_bug = 0; strcpy(printk_buf, recursion_bug_msg); - printed_len = sizeof(recursion_bug_msg); + printed_len = strlen(recursion_bug_msg); } /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, -- cgit v1.2.3 From 12d79bafb75639f406a9f71aab94808c414c836e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 09:31:28 +0100 Subject: rcu: provide RCU options on non-preempt architectures too Impact: build fix Some old architectures still do not use kernel/Kconfig.preempt, so the moving of the RCU options there broke their build: In file included from /home/mingo/tip/include/linux/sem.h:81, from /home/mingo/tip/include/linux/sched.h:69, from /home/mingo/tip/arch/alpha/kernel/asm-offsets.c:9: /home/mingo/tip/include/linux/rcupdate.h:62:2: error: #error "Unknown RCU implementation specified to kernel configuration" Move these options back to init/Kconfig, which every architecture includes. Signed-off-by: Ingo Molnar --- kernel/Kconfig.preempt | 75 -------------------------------------------------- 1 file changed, 75 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 463f29743ea0..bf987b95b356 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -52,78 +52,3 @@ config PREEMPT endchoice -choice - prompt "RCU Implementation" - default CLASSIC_RCU - -config CLASSIC_RCU - bool "Classic RCU" - help - This option selects the classic RCU implementation that is - designed for best read-side performance on non-realtime - systems. - - Select this option if you are unsure. - -config TREE_RCU - bool "Tree-based hierarchical RCU" - help - This option selects the RCU implementation that is - designed for very large SMP system with hundreds or - thousands of CPUs. - -config PREEMPT_RCU - bool "Preemptible RCU" - depends on PREEMPT - help - This option reduces the latency of the kernel by making certain - RCU sections preemptible. Normally RCU code is non-preemptible, if - this option is selected then read-only RCU sections become - preemptible. This helps latency, but may expose bugs due to - now-naive assumptions about each RCU read-side critical section - remaining on a given CPU through its execution. - -endchoice - -config RCU_TRACE - bool "Enable tracing for RCU" - depends on TREE_RCU || PREEMPT_RCU - help - This option provides tracing in RCU which presents stats - in debugfs for debugging RCU implementation. - - Say Y here if you want to enable RCU tracing - Say N if you are unsure. - -config RCU_FANOUT - int "Tree-based hierarchical RCU fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on TREE_RCU - default 64 if 64BIT - default 32 if !64BIT - help - This option controls the fanout of hierarchical implementations - of RCU, allowing RCU to work efficiently on machines with - large numbers of CPUs. This value must be at least the cube - root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit - systems and up to 262,144 for 64-bit systems. - - Select a specific number if testing RCU itself. - Take the default if unsure. 
- -config RCU_FANOUT_EXACT - bool "Disable tree-based hierarchical RCU auto-balancing" - depends on TREE_RCU - default n - help - This option forces use of the exact RCU_FANOUT value specified, - regardless of imbalances in the hierarchy. This is useful for - testing RCU itself, and might one day be useful on systems with - strong NUMA behavior. - - Without RCU_FANOUT_EXACT, the code will balance the hierarchy. - - Say n if unsure. - - -- cgit v1.2.3 From 9212ddb5eada64fec5a08b28207401f3cc3d0876 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 11:21:20 +0100 Subject: stacktrace: provide save_stack_trace_tsk() weak alias Impact: build fix Some architectures have not implemented save_stack_trace_tsk() yet: fs/built-in.o: In function `proc_pid_stack': base.c:(.text+0x3f140): undefined reference to `save_stack_trace_tsk' So warn about that if the facility is used. Signed-off-by: Ingo Molnar --- kernel/stacktrace.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 94b527ef1d1e..eb212f8f8bc8 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar */ #include +#include #include #include #include @@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces) } EXPORT_SYMBOL_GPL(print_stack_trace); +/* + * Architectures that do not implement save_stack_trace_tsk get this + * weak alias and a once-per-bootup warning (whenever this facility + * is utilized - for example by procfs): + */ +__weak void +save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); +} -- cgit v1.2.3
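A closing aside, not part of any of the patches above: the __weak annotation in the stacktrace fix is the compiler's weak-symbol mechanism, which is what lets the generic stub lose to any architecture that provides a real save_stack_trace_tsk(). A small stand-alone sketch of the same linking behaviour, with the kernel types reduced to opaque placeholders:

/*
 * Weak-symbol fallback pattern: this definition is used only if no other
 * object file in the link provides a normal (strong) definition with the
 * same name. Requires a toolchain with weak-symbol support (GCC/Clang on
 * ELF targets).
 */
#include <stdio.h>

struct task_struct;	/* opaque stand-ins, just to mirror the signature */
struct stack_trace;

__attribute__((weak))
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
	/* Fallback body, analogous to the WARN_ONCE() in the patch. */
	fprintf(stderr, "save_stack_trace_tsk() not implemented yet.\n");
}

int main(void)
{
	save_stack_trace_tsk(NULL, NULL);	/* resolves to whichever definition won */
	return 0;
}

Built on its own, the weak fallback runs and prints the warning; link in another object that defines save_stack_trace_tsk() without the weak attribute and the linker silently picks that one instead, which is why the kernel can add the stub without touching architectures that already implement the facility.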