On 18.02.19 г. 7:27 ч., Qu Wenruo wrote:
> There are at least 2 reports about memory bit flip sneaking into on-disk
> data.
>
> Currently we only have a relaxed check triggered at
> btrfs_mark_buffer_dirty() time. As it's not mandatory and only runs on
> builds with CONFIG_BTRFS_FS_CHECK_INTEGRITY enabled, it doesn't help users
> detect such problems.
>
> This patch addresses the hole by triggering a comprehensive check on
> tree blocks before writing them back to disk.
>
> The design points are:
> - Timing of the check: Tree block write hook
> This timing is chosen to reduce the overhead.
> The comprehensive check should be as expensive as csum.
> Doing full check at btrfs_mark_buffer_dirty() is too expensive for end
> user.
>
> - Loose empty leaf check
> Originally for empty leaf, tree-checker will report error if it's not
> a tree root.
> The problem for such check at write time is:
> * False alert for tree root created in current transaction
> In that case, the commit root still needs to be written to disk.
> And since current root can differ from commit root, then it will
> cause false alert.
> This happens for log tree.
>
> * False alert for relocated tree block
> Relocated tree block can be written to disk due to memory pressure,
> in that case an empty csum tree root can be written to disk and
> cause false alert, since csum root node hasn't been updated.
>
> However, the more reliable empty leaf checks are still kept as is.
> Namely, essential trees (e.g. extent, chunk) should never be empty.
>
> The example error output will be something like:
> BTRFS critical (device dm-3): corrupt leaf: root=2 block=1350630375424 slot=68, bad key order, prev (10510212874240 169 0) current (1714119868416 169 0)
> BTRFS error (device dm-3): block=1350630375424 write time tree block corruption detected
> BTRFS: error (device dm-3) in btrfs_commit_transaction:2220: errno=-5 IO failure (Error while writing out transaction)
> BTRFS info (device dm-3): forced readonly
> BTRFS warning (device dm-3): Skipping commit of aborted transaction.
> BTRFS: error (device dm-3) in cleanup_transaction:1839: errno=-5 IO failure
> BTRFS info (device dm-3): delayed_refs has NO entry
>
> Reported-by: Leonard Lausen <leonard@xxxxxxxxx>
> Signed-off-by: Qu Wenruo <wqu@xxxxxxxx>
> ---
> fs/btrfs/disk-io.c | 10 ++++++++++
> fs/btrfs/tree-checker.c | 24 +++++++++++++++++++++---
> fs/btrfs/tree-checker.h | 8 ++++++++
> 3 files changed, 39 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 6052ab508f84..fff789f8db63 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -313,6 +313,16 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
> return -EUCLEAN;
> }
> } else {
> + if (btrfs_header_level(buf))
> + err = btrfs_check_node(fs_info, buf);
> + else
> + err = btrfs_check_leaf_write(fs_info, buf);
> + if (err < 0) {
> + btrfs_err(fs_info,
> + "block=%llu write time tree block corruption detected",
> + buf->start);
> + return err;
> + }
This code should be moved into csum_dirty_buffer. Currently there are
pending cleanups in csum_tree_block: the final if there will be
removed and the respective read/write code factored out into
csum_dirty_buffer/btree_readpage_end_io_hook.
Eventually csum_tree_block's sole purpose should be to calculate the
checksum and nothing more.
> write_extent_buffer(buf, result, 0, csum_size);
> }
>
> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
> index a62e1e837a89..b8cdaf472031 100644
> --- a/fs/btrfs/tree-checker.c
> +++ b/fs/btrfs/tree-checker.c
> @@ -477,7 +477,7 @@ static int check_leaf_item(struct btrfs_fs_info *fs_info,
> }
>
> static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
> - bool check_item_data)
> + bool check_item_data, bool check_empty_leaf)
> {
> /* No valid key type is 0, so all key should be larger than this key */
> struct btrfs_key prev_key = {0, 0, 0};
> @@ -516,6 +516,18 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
> owner);
> return -EUCLEAN;
> }
> +
> + /*
> + * Skip empty leaf check, mostly for write time tree block
> + *
> + * Such skip mostly happens for tree block write time, as
> + * we can't use @owner as accurate owner indicator.
> + * Case like balance and new tree block created for commit root
> + * can break owner check easily.
> + */
> + if (!check_empty_leaf)
> + return 0;
> +
> key.objectid = owner;
> key.type = BTRFS_ROOT_ITEM_KEY;
> key.offset = (u64)-1;
> @@ -636,13 +648,19 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
> int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
> struct extent_buffer *leaf)
> {
> - return check_leaf(fs_info, leaf, true);
> + return check_leaf(fs_info, leaf, true, true);
> }
>
> int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
> struct extent_buffer *leaf)
> {
> - return check_leaf(fs_info, leaf, false);
> + return check_leaf(fs_info, leaf, false, true);
> +}
> +
> +int btrfs_check_leaf_write(struct btrfs_fs_info *fs_info,
> + struct extent_buffer *leaf)
> +{
> + return check_leaf(fs_info, leaf, false, false);
> }
>
> int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
> diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
> index ff043275b784..6f8d1b627c53 100644
> --- a/fs/btrfs/tree-checker.h
> +++ b/fs/btrfs/tree-checker.h
> @@ -23,6 +23,14 @@ int btrfs_check_leaf_full(struct btrfs_fs_info *fs_info,
> */
> int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info,
> struct extent_buffer *leaf);
> +
> +/*
> + * Write time specific leaf checker.
> + * Don't check if the empty leaf belongs to a tree root. Mostly for balance
> + * and new tree created in current transaction.
> + */
> +int btrfs_check_leaf_write(struct btrfs_fs_info *fs_info,
> + struct extent_buffer *leaf);
> int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node);
>
> #endif
>