Re: [PATCH RFC] btrfs: scrub: Don't use inode pages for device replace

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 5/31/18 4:35 AM, Qu Wenruo wrote:
> [BUG]
> Btrfs can easily create compressed extent without checksum (even
> though it shouldn't), and if we then try to replace device containing
> such extent, the result device will contain all the uncompressed data
> instead of the compressed one.
> 
> Leading to data corruption.
> 
> [Cause]
> When handling compressed extent without checksum, device replace will
> goes into copy_nocow_pages() function.
> 
> In that function, btrfs will get all inodes referring to this data
> extents and then use find_or_create_page() to get pages direct from that
> inode.
> 
> The problem here is, pages directly from inode are always uncompressed.
> And for compressed data extent, they mismatch with on-disk data.
> Thus this leads to corrupted compressed data extent written to replace
> device.
> 
> [Fix]
> The fix is a little tricky.
> As we can't determine if an extent is compressed until we iterate
> through at least one of its refereners, so we still need to go into
> copy_nocow_pages() function.
> 
> And before we really begin to copy data, we check if the extent is
> compressed, and if it is, we fallback to scrub_pages() to read the
> on-disk data other than inode's uncompressed data.
> 
> To be able to call scrub_pages(), we need extra parameters, add all of
> them (@physical, @dev, @gen, excluding @flags which is fixed to
> BTRFS_EXTENT_FLAG_DATA) to copy_nocow_pages() and nocow_ctx structure.
> 
> Reported-by: James Harvey <jamespharvey20@xxxxxxxxx>
> Signed-off-by: Qu Wenruo <wqu@xxxxxxxx>
> ---
> 
> In fact, the copy_nocow_pages() and its children functions are just a
> kind of "clever" optimization to skip some read IO.
> 
> However I'm never a fan of such "optimization", no to mention when it's
> causing bugs.
> Although the ability to use page cache to write data back looks pretty
> nice, it's in fact pretty niche.
> 
> I'm pretty happy if we could remove the whole copy_nocow_pages() branch,
> and use the only and unified scrub_pages() to handle everything.
> At least it's way more cleaner than current solution.

I think this is the way to go.  It's added complexity for not a lot of
benefit.

-Jeff

> ---
>  fs/btrfs/scrub.c | 50 +++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 45 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 52b39a0924e9..307a06450e5c 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -212,6 +212,11 @@ struct scrub_copy_nocow_ctx {
>  	u64			physical_for_dev_replace;
>  	struct list_head	inodes;
>  	struct btrfs_work	work;
> +
> +	/* All these members are only for falling back to scrub_pages() */
> +	u64			fb_physical;
> +	struct btrfs_device	*fb_dev;
> +	u64			fb_gen;
>  };
>  
>  struct scrub_warning {
> @@ -282,6 +287,7 @@ static int write_page_nocow(struct scrub_ctx *sctx,
>  static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
>  				      struct scrub_copy_nocow_ctx *ctx);
>  static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
> +			    u64 physical, struct btrfs_device *dev, u64 gen,
>  			    int mirror_num, u64 physical_for_dev_replace);
>  static void copy_nocow_pages_worker(struct btrfs_work *work);
>  static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
> @@ -2801,8 +2807,8 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
>  				++sctx->stat.no_csum;
>  			if (sctx->is_dev_replace && !have_csum) {
>  				ret = copy_nocow_pages(sctx, logical, l,
> -						       mirror_num,
> -						      physical_for_dev_replace);
> +						physical, dev, gen, mirror_num,
> +						physical_for_dev_replace);
>  				goto behind_scrub_pages;
>  			}
>  		}
> @@ -4359,6 +4365,7 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
>  }
>  
>  static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
> +			    u64 physical, struct btrfs_device *dev, u64 gen,
>  			    int mirror_num, u64 physical_for_dev_replace)
>  {
>  	struct scrub_copy_nocow_ctx *nocow_ctx;
> @@ -4379,6 +4386,11 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
>  	nocow_ctx->len = len;
>  	nocow_ctx->mirror_num = mirror_num;
>  	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
> +
> +	nocow_ctx->fb_physical = physical;
> +	nocow_ctx->fb_gen = gen;
> +	nocow_ctx->fb_dev = dev;
> +
>  	btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
>  			copy_nocow_pages_worker, NULL, NULL);
>  	INIT_LIST_HEAD(&nocow_ctx->inodes);
> @@ -4487,7 +4499,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
>  }
>  
>  static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
> -				 u64 logical)
> +				 u64 logical, int *compressed)
>  {
>  	struct extent_state *cached_state = NULL;
>  	struct btrfs_ordered_extent *ordered;
> @@ -4523,6 +4535,7 @@ static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
>  		ret = 1;
>  		goto out_unlock;
>  	}
> +	*compressed = em->compress_type;
>  	free_extent_map(em);
>  
>  out_unlock:
> @@ -4543,6 +4556,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
>  	u64 nocow_ctx_logical;
>  	u64 len = nocow_ctx->len;
>  	unsigned long index;
> +	int compressed;
>  	int srcu_index;
>  	int ret = 0;
>  	int err = 0;
> @@ -4576,12 +4590,20 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
>  	nocow_ctx_logical = nocow_ctx->logical;
>  
>  	ret = check_extent_to_block(BTRFS_I(inode), offset, len,
> -			nocow_ctx_logical);
> +			nocow_ctx_logical, &compressed);
>  	if (ret) {
>  		ret = ret > 0 ? 0 : ret;
>  		goto out;
>  	}
>  
> +	/*
> +	 * We hit the damn nodatasum compressed extent, we can't use any page
> +	 * from inode as they are all *UNCOMPRESSED*.
> +	 * We fall back to scrub_pages() for such case.
> +	 */
> +	if (compressed)
> +		goto fallback;
> +
>  	while (len >= PAGE_SIZE) {
>  		index = offset >> PAGE_SHIFT;
>  again:
> @@ -4624,11 +4646,16 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
>  		}
>  
>  		ret = check_extent_to_block(BTRFS_I(inode), offset, len,
> -					    nocow_ctx_logical);
> +					    nocow_ctx_logical, &compressed);
>  		if (ret) {
>  			ret = ret > 0 ? 0 : ret;
>  			goto next_page;
>  		}
> +		if (compressed) {
> +			unlock_page(page);
> +			put_page(page);
> +			goto fallback;
> +		}
>  
>  		err = write_page_nocow(nocow_ctx->sctx,
>  				       physical_for_dev_replace, page);
> @@ -4651,6 +4678,19 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
>  	inode_unlock(inode);
>  	iput(inode);
>  	return ret;
> +
> +fallback:
> +	inode_unlock(inode);
> +	iput(inode);
> +
> +	ret = scrub_pages(nocow_ctx->sctx, nocow_ctx->logical,
> +			  nocow_ctx->len, nocow_ctx->fb_physical,
> +			  nocow_ctx->fb_dev, BTRFS_EXTENT_FLAG_DATA,
> +			  nocow_ctx->fb_gen, nocow_ctx->mirror_num,
> +			  NULL, 0, physical_for_dev_replace);
> +	if (!ret)
> +		ret = COPY_COMPLETE;
> +	return ret;
>  }
>  
>  static int write_page_nocow(struct scrub_ctx *sctx,
> 


-- 
Jeff Mahoney
SUSE Labs

Attachment: signature.asc
Description: OpenPGP digital signature


[Index of Archives]     [Linux Filesystem Development]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux