From 271f5a9b8f8ae0db95de72779d115c9d0b9d3cc5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 1 Sep 2008 12:32:52 +1000
Subject: [PATCH 1/2] Remove invalidate_partition call from do_md_stop.

When stopping an md array, or just switching to read-only, we
currently call invalidate_partition while holding the mddev lock.
The main reason for this is probably to ensure all dirty buffers
are flushed (invalidate_partition calls fsync_bdev).

However if any dirty buffers are found, it will almost certainly cause
a deadlock as starting writeout will require an update to the
superblock, and performing that updates requires taking the mddev
lock - which is already held.

This deadlock can be demonstrated by running "reboot -f -n" with
a root filesystem on md/raid, and some dirty buffers in memory.

All other calls to stop an array should already happen after a flush.
The normal sequence is to stop using the array (e.g. umount) which
will cause __blkdev_put to call sync_blockdev.  Then open the
array and issue the STOP_ARRAY ioctl while the buffers are all still
clean.

So this invalidate_partition is normally a no-op, except for one case
where it will cause a deadlock.

So remove it.

This patch possibly addresses the regression recored in
   http://bugzilla.kernel.org/show_bug.cgi?id=11460
and
   http://bugzilla.kernel.org/show_bug.cgi?id=11452

though it isn't yet clear how it ever worked.


Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8cfadc5bd2ba..4790c83d78d0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3841,8 +3841,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 
 		del_timer_sync(&mddev->safemode_timer);
 
-		invalidate_partition(disk, 0);
-
 		switch(mode) {
 		case 1: /* readonly */
 			err  = -ENXIO;

From b2d2c4ceaddc3098f19637a732f74b820a81a9e7 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 1 Sep 2008 12:48:13 +1000
Subject: [PATCH 2/2] Fix problem with waiting while holding rcu read lock in
 md/bitmap.c

A recent patch to protect the rdev list with rcu locking leaves us
with a problem because we can sleep on memalloc while holding the
rcu lock.

The rcu lock is only needed while walking the linked list as
uninteresting devices (failed or spares) can be removed at any time.

So only take the rcu lock while actually walking the linked list.
Take a refcount on the rdev during the time when we drop the lock
and do the memalloc to start IO.
When we return to the locked code, all the interesting devices
on the list will not have moved, so we can simply use
list_for_each_continue_rcu to pick up where we left off.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 45 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7e65bad522cb..ac89a5deaca2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -238,15 +238,47 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde
 
 }
 
+static mdk_rdev_t *next_active_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
+{
+	/* Iterate the disks of an mddev, using rcu to protect access to the
+	 * linked list, and raising the refcount of devices we return to ensure
+	 * they don't disappear while in use.
+	 * As devices are only added or removed when raid_disk is < 0 and
+	 * nr_pending is 0 and In_sync is clear, the entries we return will
+	 * still be in the same position on the list when we re-enter
+	 * list_for_each_continue_rcu.
+	 */
+	struct list_head *pos;
+	rcu_read_lock();
+	if (rdev == NULL)
+		/* start at the beginning */
+		pos = &mddev->disks;
+	else {
+		/* release the previous rdev and start from there. */
+		rdev_dec_pending(rdev, mddev);
+		pos = &rdev->same_set;
+	}
+	list_for_each_continue_rcu(pos, &mddev->disks) {
+		rdev = list_entry(pos, mdk_rdev_t, same_set);
+		if (rdev->raid_disk >= 0 &&
+		    test_bit(In_sync, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags)) {
+			/* this is a usable devices */
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
+			return rdev;
+		}
+	}
+	rcu_read_unlock();
+	return NULL;
+}
+
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	mdk_rdev_t *rdev;
+	mdk_rdev_t *rdev = NULL;
 	mddev_t *mddev = bitmap->mddev;
 
-	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (test_bit(In_sync, &rdev->flags)
-		    && !test_bit(Faulty, &rdev->flags)) {
+	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 			int size = PAGE_SIZE;
 			if (page->index == bitmap->file_pages-1)
 				size = roundup(bitmap->last_page_size,
@@ -281,8 +313,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				       + page->index * (PAGE_SIZE/512),
 				       size,
 				       page);
-		}
-	rcu_read_unlock();
+	}
 
 	if (wait)
 		md_super_wait(mddev);