On 2020/7/1 下午10:44, Josef Bacik wrote:
> One of the things that came up consistently in talking with Fedora about
> switching to btrfs as default is that btrfs is particularly vulnerable
> to metadata corruption. If any of the core global roots are corrupted,
> the fs is unmountable and fsck can't usually do anything for you without
> some special options.
>
> Qu addressed this sort of with rescue=skipbg, but that's poorly named as
> what it really does is just allow you to operate without an extent root.
> However there are a lot of other roots, and I'd rather not have to do
>
> mount -o rescue=skipbg,rescue=nocsum,rescue=nofreespacetree,rescue=blah
Yep, that's the problem.
Extent tree is chosen mostly by the fact that, extent tree seems to be
the first victim for transid error.
I still like the idea to ignore more trees.
>
> Instead take his original idea and modify it so it just works for
> everything. Turn it into rescue=onlyfs, and then any major root we fail
> to read just gets left empty and we carry on.
That's indeed an improvement, other trees, especially extent, uuid,
reloc, dev trees are completely useless in completely RO environment.
>
> Obviously if the fs roots are screwed then the user is in trouble, but
> otherwise this makes it much easier to pull stuff off the disk without
> needing our special rescue tools. I tested this with my TEST_DEV that
> had a bunch of data on it by corrupting the csum tree and then reading
> files off the disk.
And since we still try to read csum tree, and if it's not corrupted, it
still get read and we go regular csum verification, so I'm completely
fine with this enhancement.
>
> Signed-off-by: Josef Bacik <josef@xxxxxxxxxxxxxx>
> ---
>
> I'm not married to the rescue=onlyfs name, if we can think of something better
> I'm good.
Naming is never easy to do.
What about 'skipall'?
Despite that, I'm pretty happy with the enhancement, with only one
concern below.
...
> @@ -2275,21 +2274,27 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
> location.objectid = BTRFS_DEV_TREE_OBJECTID;
> root = btrfs_read_tree_root(tree_root, &location);
> if (IS_ERR(root)) {
> - ret = PTR_ERR(root);
> - goto out;
> + if (!btrfs_test_opt(fs_info, ONLYFS)) {
> + ret = PTR_ERR(root);
> + goto out;
> + }
> + } else {
> + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> + fs_info->dev_root = root;
> + btrfs_init_devices_late(fs_info);
> }
> - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> - fs_info->dev_root = root;
> - btrfs_init_devices_late(fs_info);
>
> location.objectid = BTRFS_CSUM_TREE_OBJECTID;
> root = btrfs_read_tree_root(tree_root, &location);
> if (IS_ERR(root)) {
> - ret = PTR_ERR(root);
> - goto out;
> + if (!btrfs_test_opt(fs_info, ONLYFS)) {
> + ret = PTR_ERR(root);
> + goto out;
> + }
One of my concern here, I didn't see btrfs_lookup_csums_range() have the
ability to skip NULL csum root.
Wouldn't this cause NULL pointer dereference if the csum root is corrupted?
Thanks,
Qu
> + } else {
> + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> + fs_info->csum_root = root;
> }
> - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> - fs_info->csum_root = root;
>
> /*
> * This tree can share blocks with some other fs tree during relocation
> @@ -2298,11 +2303,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
> root = btrfs_get_fs_root(tree_root->fs_info,
> BTRFS_DATA_RELOC_TREE_OBJECTID, true);
> if (IS_ERR(root)) {
> - ret = PTR_ERR(root);
> - goto out;
> + if (!btrfs_test_opt(fs_info, ONLYFS)) {
> + ret = PTR_ERR(root);
> + goto out;
> + }
> + } else {
> + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> + fs_info->data_reloc_root = root;
> }
> - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> - fs_info->data_reloc_root = root;
>
> location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
> root = btrfs_read_tree_root(tree_root, &location);
> @@ -2315,9 +2323,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
> location.objectid = BTRFS_UUID_TREE_OBJECTID;
> root = btrfs_read_tree_root(tree_root, &location);
> if (IS_ERR(root)) {
> - ret = PTR_ERR(root);
> - if (ret != -ENOENT)
> - goto out;
> + if (!btrfs_test_opt(fs_info, ONLYFS)) {
> + ret = PTR_ERR(root);
> + if (ret != -ENOENT)
> + goto out;
> + }
> } else {
> set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> fs_info->uuid_root = root;
> @@ -2327,11 +2337,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
> location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
> root = btrfs_read_tree_root(tree_root, &location);
> if (IS_ERR(root)) {
> - ret = PTR_ERR(root);
> - goto out;
> + if (!btrfs_test_opt(fs_info, ONLYFS)) {
> + ret = PTR_ERR(root);
> + goto out;
> + }
> + } else {
> + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> + fs_info->free_space_root = root;
> }
> - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
> - fs_info->free_space_root = root;
> }
>
> return 0;
> @@ -3047,20 +3060,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
> }
>
> /* Skip bg needs RO and no tree-log to replay */
> - if (btrfs_test_opt(fs_info, SKIPBG)) {
> - if (!sb_rdonly(sb)) {
> - btrfs_err(fs_info,
> - "rescue=skipbg can only be used on read-only mount");
> - err = -EINVAL;
> - goto fail_alloc;
> - }
> - if (btrfs_super_log_root(disk_super) &&
> - !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
> - btrfs_err(fs_info,
> -"rescue=skipbg must be used with rescue=nologreplay when tree-log needs to replayed");
> - err = -EINVAL;
> - goto fail_alloc;
> - }
> + if (btrfs_test_opt(fs_info, ONLYFS) && !sb_rdonly(sb)) {
> + btrfs_err(fs_info,
> + "rescue=onlyfs can only be used on read-only mount");
> + err = -EINVAL;
> + goto fail_alloc;
> }
>
> ret = btrfs_init_workqueues(fs_info, fs_devices);
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index d301550b9c70..9f8ef22ac65e 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2209,7 +2209,8 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
> int skip_sum;
> int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
>
> - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
> + skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
> + btrfs_test_opt(fs_info, ONLYFS);
>
> if (btrfs_is_free_space_inode(BTRFS_I(inode)))
> metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
> @@ -2866,6 +2867,9 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
> if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
> return 0;
>
> + if (btrfs_test_opt(root->fs_info, ONLYFS))
> + return 0;
> +
> if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
> test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
> clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 3c9ebd4f2b61..7ea9f8f53156 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -345,7 +345,7 @@ enum {
> Opt_rescue,
> Opt_usebackuproot,
> Opt_nologreplay,
> - Opt_rescue_skipbg,
> + Opt_rescue_onlyfs,
>
> /* Deprecated options */
> Opt_alloc_start,
> @@ -445,7 +445,7 @@ static const match_table_t tokens = {
> static const match_table_t rescue_tokens = {
> {Opt_usebackuproot, "usebackuproot"},
> {Opt_nologreplay, "nologreplay"},
> - {Opt_rescue_skipbg, "skipbg"},
> + {Opt_rescue_onlyfs, "onlyfs"},
> {Opt_err, NULL},
> };
>
> @@ -478,9 +478,10 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
> btrfs_set_and_info(info, NOLOGREPLAY,
> "disabling log replay at mount time");
> break;
> - case Opt_rescue_skipbg:
> - btrfs_set_and_info(info, SKIPBG,
> - "skip mount time block group searching");
> + case Opt_rescue_onlyfs:
> + btrfs_set_and_info(info, ONLYFS,
> + "only reading fs roots, also setting nologreplay");
> + btrfs_set_opt(info->mount_opt, NOLOGREPLAY);
> break;
> case Opt_err:
> btrfs_info(info, "unrecognized rescue option '%s'", p);
> @@ -1418,8 +1419,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
> seq_puts(seq, ",notreelog");
> if (btrfs_test_opt(info, NOLOGREPLAY))
> seq_puts(seq, ",rescue=nologreplay");
> - if (btrfs_test_opt(info, SKIPBG))
> - seq_puts(seq, ",rescue=skipbg");
> + if (btrfs_test_opt(info, ONLYFS))
> + seq_puts(seq, ",rescue=onlyfs");
> if (btrfs_test_opt(info, FLUSHONCOMMIT))
> seq_puts(seq, ",flushoncommit");
> if (btrfs_test_opt(info, DISCARD_SYNC))
> @@ -1859,10 +1860,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
> if (ret)
> goto restore;
>
> - if (btrfs_test_opt(fs_info, SKIPBG) !=
> - (old_opts & BTRFS_MOUNT_SKIPBG)) {
> + if (btrfs_test_opt(fs_info, ONLYFS) !=
> + (old_opts & BTRFS_MOUNT_ONLYFS)) {
> btrfs_err(fs_info,
> - "rescue=skipbg mount option can't be changed during remount");
> + "rescue=onlyfs mount option can't be changed during remount");
> ret = -EINVAL;
> goto restore;
> }
> @@ -1932,9 +1933,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
> goto restore;
> }
>
> - if (btrfs_test_opt(fs_info, SKIPBG)) {
> + if (btrfs_test_opt(fs_info, ONLYFS)) {
> btrfs_err(fs_info,
> - "remounting read-write with rescue=skipbg is not allowed");
> + "remounting read-write with rescue=onlyfs is not allowed");
> ret = -EINVAL;
> goto restore;
> }
> @@ -2245,7 +2246,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
> *
> * Or if we're rescuing, set available to 0 anyway.
> */
> - if (btrfs_test_opt(fs_info, SKIPBG) ||
> + if (btrfs_test_opt(fs_info, ONLYFS) ||
> (!mixed && block_rsv->space_info->full &&
> total_free_meta - thresh < block_rsv->size))
> buf->f_bavail = 0;
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index aabc6c922e04..a5d124f95ce2 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -7595,10 +7595,10 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
> int ret = 0;
>
> /*
> - * For rescue=skipbg mount option, we're already RO and are salvaging
> + * For rescue=onlyfs mount option, we're already RO and are salvaging
> * data, no need for such strict check.
> */
> - if (btrfs_test_opt(fs_info, SKIPBG))
> + if (btrfs_test_opt(fs_info, ONLYFS))
> return 0;
>
> key.objectid = 1;
>
Attachment:
signature.asc
Description: OpenPGP digital signature
