[PATCH v2 1/2] btrfs: handle volume split brain scenario

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



In RAID configurations (RAID1/RAID5/RAID6) it's possible to have some
devices missing, in which case btrfs can only be mounted in degraded
state but remains operational. In those cases it's possible (albeit
highly undesirable) that the degraded and the missing parts of the
filesystem are mounted independently. When writes occur in such a
split-brain scenario (caused by intentional user action), one of the
sides of the RAID config will have to be blown away when bringing the
volume back to a consistent state.

Handle split-brain volumes by setting a new flag,
BTRFS_SUPER_FLAG_DEGRADED, when the volume is mounted degraded, so that
we can detect and fail the mount if all the disks contain this flag.

To reassemble a split-brain volume, first mount the good disk and then
scan in the device on which new writes can be ignored. (This needs the
patch "btrfs: handle dynamically reappearing missing device".)

Warning: A raid1 root device in split-brain condition would fail to
boot up, in order to protect against arbitrary loss of data.

Signed-off-by: Anand Jain <anand.jain@xxxxxxxxxx>
---
On top of misc-next kdave.
v2:
 Improve commit log.
 Rename to BTRFS_SUPER_FLAG_DEGRADED.
 Rename variables to fs_devices and device.
 In open_ctree() check for split-brain after btrfs_read_chunk_tree()

 fs/btrfs/disk-io.c              | 55 ++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/btrfs_tree.h |  1 +
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b302db90598c..e87924b7145b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -61,7 +61,8 @@
 				 BTRFS_HEADER_FLAG_RELOC |\
 				 BTRFS_SUPER_FLAG_ERROR |\
 				 BTRFS_SUPER_FLAG_SEEDING |\
-				 BTRFS_SUPER_FLAG_METADUMP)
+				 BTRFS_SUPER_FLAG_METADUMP|\
+				 BTRFS_SUPER_FLAG_DEGRADED)
 
 static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -2383,6 +2384,43 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
+bool volume_has_split_brain(struct btrfs_fs_info *fs_info)
+{
+	unsigned long devs_moved_on = 0;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct list_head *head = &fs_devices->devices;
+	struct btrfs_device *device;
+
+again:
+	list_for_each_entry(device, head, dev_list) {
+		struct buffer_head *bh;
+		struct btrfs_super_block *sb;
+
+		if (!device->devid)
+			continue;
+
+		bh = btrfs_read_dev_super(device->bdev);
+		if (IS_ERR(bh))
+			continue;
+
+		sb = (struct btrfs_super_block *)bh->b_data;
+		if (btrfs_super_flags(sb) & BTRFS_SUPER_FLAG_DEGRADED)
+			devs_moved_on++;
+		brelse(bh);
+	}
+
+	fs_devices = fs_devices->seed;
+	if (fs_devices) {
+		head = &fs_devices->devices;
+		goto again;
+	}
+
+	if (devs_moved_on == fs_info->fs_devices->total_devices)
+		return true;
+	else
+		return false;
+}
+
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options)
@@ -2765,6 +2803,21 @@ int open_ctree(struct super_block *sb,
 		goto fail_tree_roots;
 	}
 
+	if (fs_info->fs_devices->missing_devices) {
+		btrfs_set_super_flags(fs_info->super_copy,
+				      fs_info->super_copy->flags |
+				      BTRFS_SUPER_FLAG_DEGRADED);
+	} else if (fs_info->super_copy->flags & BTRFS_SUPER_FLAG_DEGRADED) {
+		if (volume_has_split_brain(fs_info)) {
+			btrfs_err(fs_info,
+				  "Detected 'degraded' flag on all devices");
+			goto fail_tree_roots;
+		}
+		btrfs_set_super_flags(fs_info->super_copy,
+				      fs_info->super_copy->flags &
+				      ~BTRFS_SUPER_FLAG_DEGRADED);
+	}
+
 	/*
 	 * keep the device that is marked to be the target device for the
 	 * dev_replace procedure
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 33e814ef992f..c08b9b89e285 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2057,8 +2057,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
 	device->fs_devices->num_devices--;
 	device->fs_devices->total_devices--;
 
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 		device->fs_devices->missing_devices--;
+		if (!device->fs_devices->missing_devices)
+			btrfs_set_super_flags(fs_info->super_copy,
+				fs_info->super_copy->flags &
+				~BTRFS_SUPER_FLAG_DEGRADED);
+	}
 
 	btrfs_assign_next_active_device(fs_info, device, NULL);
 
@@ -2132,8 +2137,13 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
 	list_del_rcu(&srcdev->dev_list);
 	list_del(&srcdev->dev_alloc_list);
 	fs_devices->num_devices--;
-	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
+	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) {
 		fs_devices->missing_devices--;
+		if (!fs_devices->missing_devices)
+			btrfs_set_super_flags(fs_info->super_copy,
+				fs_info->super_copy->flags &
+				~BTRFS_SUPER_FLAG_DEGRADED);
+	}
 
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
 		fs_devices->rw_devices--;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 6d6e5da51527..ed1325d04033 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -456,6 +456,7 @@ struct btrfs_free_space_header {
 
 #define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
 #define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
+#define BTRFS_SUPER_FLAG_DEGRADED	(1ULL << 36)
 
 
 /*
-- 
2.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Filesystem Development]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux