On Sun, Feb 15, 2015 at 10:38:54PM +0000, Filipe Manana wrote:
> When punching a file hole, if we end up only zeroing parts of a page,
> because the start offset isn't a multiple of the sector size or the
> start offset and length fall within the same page, we were not updating
> the inode item. This prevented an fsync from doing anything, if no other
> file changes happened in the current transaction, because the fields
> in btrfs_inode used to check if the inode needs to be fsync'ed weren't
> updated.
>
> This issue is easy to reproduce and the following excerpt from the
> xfstest case I made shows how to trigger it:
>
> _scratch_mkfs >> $seqres.full 2>&1
> _init_flakey
> _mount_flakey
>
> # Create our test file.
> $XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \
> $SCRATCH_MNT/foo | _filter_xfs_io
>
> # Fsync the file, this makes btrfs update some btrfs inode specific fields
> # that are used to track if the inode needs to be written/updated to the fsync
> # log or not. After this fsync, the new values for those fields indicate that
> # a subsequent fsync does not need to touch the fsync log.
> $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
>
> # Force a commit of the current transaction. After this point, any operation
> # that modifies the data or metadata of our file, should update those fields in
> # the btrfs inode with values that make the next fsync operation write to the
> # fsync log.
> sync
>
> # Punch a hole in our file. This small range affects only 1 page.
> # This made the btrfs hole punching implementation write only some zeroes in
> # one page, but it did not update the btrfs inode fields used to determine if
> # the next fsync needs to write to the fsync log.
> $XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo
>
> # Another variation of the previously mentioned case.
> $XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo
>
> # Now fsync the file. This was a no-operation because the previous hole punch
> # operation didn't update the inode's fields mentioned before, so they remained
> # with the values they had after the first fsync - that is, they indicate that
> # it is not needed to write to the fsync log.
> $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo
>
> echo "File content before:"
> od -t x1 $SCRATCH_MNT/foo
>
> # Simulate a crash/power loss.
> _load_flakey_table $FLAKEY_DROP_WRITES
> _unmount_flakey
>
> # Enable writes and mount the fs. This makes the fsync log replay code run.
> _load_flakey_table $FLAKEY_ALLOW_WRITES
> _mount_flakey
>
> # Because the last fsync didn't do anything, here the file content matched what
> # it was after the first fsync, before the holes were punched, and not what it
> # was after the holes were punched.
> echo "File content after:"
> od -t x1 $SCRATCH_MNT/foo
>
> This issue has been around since 2012, when the punch hole implementation
> was added, commit 2aaa66558172 ("Btrfs: add hole punching").
Reviewed-by: Liu Bo <bo.li.liu@xxxxxxxxxx>
Thanks,
-liubo
>
> A test case for xfstests follows soon.
>
> Signed-off-by: Filipe Manana <fdmanana@xxxxxxxx>
> ---
> fs/btrfs/file.c | 31 ++++++++++++++++++++++++++++---
> 1 file changed, 28 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index e409025..b476e56 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -2276,6 +2276,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> bool same_page;
> bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
> u64 ino_size;
> + bool truncated_page = false;
> + bool updated_inode = false;
>
> ret = btrfs_wait_ordered_range(inode, offset, len);
> if (ret)
> @@ -2307,13 +2309,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> * entire page.
> */
> if (same_page && len < PAGE_CACHE_SIZE) {
> - if (offset < ino_size)
> + if (offset < ino_size) {
> + truncated_page = true;
> ret = btrfs_truncate_page(inode, offset, len, 0);
> + } else {
> + ret = 0;
> + }
> goto out_only_mutex;
> }
>
> /* zero back part of the first page */
> if (offset < ino_size) {
> + truncated_page = true;
> ret = btrfs_truncate_page(inode, offset, 0, 0);
> if (ret) {
> mutex_unlock(&inode->i_mutex);
> @@ -2349,6 +2356,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> if (!ret) {
> /* zero the front end of the last page */
> if (tail_start + tail_len < ino_size) {
> + truncated_page = true;
> ret = btrfs_truncate_page(inode,
> tail_start + tail_len, 0, 1);
> if (ret)
> @@ -2358,8 +2366,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> }
>
> if (lockend < lockstart) {
> - mutex_unlock(&inode->i_mutex);
> - return 0;
> + ret = 0;
> + goto out_only_mutex;
> }
>
> while (1) {
> @@ -2507,6 +2515,7 @@ out_trans:
>
> trans->block_rsv = &root->fs_info->trans_block_rsv;
> ret = btrfs_update_inode(trans, root, inode);
> + updated_inode = true;
> btrfs_end_transaction(trans, root);
> btrfs_btree_balance_dirty(root);
> out_free:
> @@ -2516,6 +2525,22 @@ out:
> unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
> &cached_state, GFP_NOFS);
> out_only_mutex:
> + if (!updated_inode && truncated_page && !ret && !err) {
> + /*
> + * If we only end up zeroing part of a page, we still need to
> + * update the inode item, so that all the time fields are
> + * updated as well as the necessary btrfs inode in memory fields
> + * for detecting, at fsync time, if the inode isn't yet in the
> + * log tree or it's there but not up to date.
> + */
> + trans = btrfs_start_transaction(root, 1);
> + if (IS_ERR(trans)) {
> + err = PTR_ERR(trans);
> + } else {
> + err = btrfs_update_inode(trans, root, inode);
> + ret = btrfs_end_transaction(trans, root);
> + }
> + }
> mutex_unlock(&inode->i_mutex);
> if (ret && !err)
> err = ret;
> --
> 2.1.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html