On Thu, Jan 05, 2012 at 04:32:46PM +0800, Miao Xie wrote:
> The original truncation of btrfs has a bug, that is the orphan item will not be
> dropped when the truncation fails. This bug will trigger BUG() when unlink that
> truncated file. And besides that, if the user does pre-allocation for the file
> which is truncated unsuccessfully, after re-mount(umount-mount, not -o remount),
> the pre-allocated extent will be dropped.
>
> This patch modified the relative functions of the truncation, and makes the
> truncation update i_size and disk_i_size of i-nodes every time we drop the file
> extent successfully, and set them to the real value. By this way, we needn't
> add orphan items to guarantee the consistency of the meta-data.
>
> By this patch, it is possible that the file may not be truncated to the size
> that the user expects(may be <= the orignal size and >= the expected one), so I
> think it is better that we shouldn't lose the data that lies within the range
> <the expected size, the real size>, because the user may take it for granted
> that the data in that extent is not lost. In order to implement it, we just
> write out all the dirty pages which are beyond the expected size of the file.
> This operation will spend lots of time if there are many dirty pages. It is
> also the only disadvantage of this patch. (Maybe I'm overcautious, we needn't
> hold that data.)
>
> Signed-off-by: Miao Xie <miaox@xxxxxxxxxxxxxx>
> ---
> fs/btrfs/inode.c | 159 +++++++++++++++++-------------------------------------
> 1 files changed, 49 insertions(+), 110 deletions(-)
>
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index df6060f..9ace01b 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
> };
>
> static int btrfs_setsize(struct inode *inode, loff_t newsize);
> -static int btrfs_truncate(struct inode *inode);
> +static int btrfs_truncate(struct inode *inode, loff_t newsize);
> static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
> static noinline int cow_file_range(struct inode *inode,
> struct page *locked_page,
> @@ -2230,7 +2230,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
> * btrfs_delalloc_reserve_space to catch offenders.
> */
> mutex_lock(&inode->i_mutex);
> - ret = btrfs_truncate(inode);
> + ret = btrfs_truncate(inode, inode->i_size);
> mutex_unlock(&inode->i_mutex);
> } else {
> nr_unlink++;
> @@ -2993,7 +2993,7 @@ static int btrfs_release_and_test_inline_data_extent(
> return 0;
>
> /*
> - * Truncate inline items is special, we have done it by
> + * Truncate inline items is special, we will do it by
> * btrfs_truncate_page();
> */
> if (offset < new_size)
> @@ -3121,9 +3121,9 @@ static int btrfs_release_and_test_data_extent(struct btrfs_trans_handle *trans,
> * will kill all the items on this inode, including the INODE_ITEM_KEY.
> */
> int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
> - struct btrfs_root *root,
> - struct inode *inode,
> - u64 new_size, u32 min_type)
> + struct btrfs_root *root,
> + struct inode *inode,
> + u64 new_size, u32 min_type)
> {
> struct btrfs_path *path;
> struct extent_buffer *leaf;
> @@ -3131,6 +3131,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
> struct btrfs_key found_key;
> u64 mask = root->sectorsize - 1;
> u64 ino = btrfs_ino(inode);
> + u64 old_size = i_size_read(inode);
> u32 found_type;
> int pending_del_nr = 0;
> int pending_del_slot = 0;
> @@ -3138,6 +3139,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
> int err = 0;
>
> BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
> + BUG_ON(new_size & mask);
>
> path = btrfs_alloc_path();
> if (!path)
> @@ -3190,6 +3192,13 @@ search_again:
> ret = btrfs_release_and_test_data_extent(trans, root,
> path, inode, found_key.offset,
> new_size);
> + if (root->ref_cows ||
> + root == root->fs_info->tree_root) {
> + if (ret && found_key.offset < old_size)
> + i_size_write(inode, found_key.offset);
> + else if (!ret)
> + i_size_write(inode, new_size);
> + }
> if (!ret)
> break;
> }
> @@ -3247,12 +3256,10 @@ out:
> static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
> {
> struct inode *inode = mapping->host;
> - struct btrfs_root *root = BTRFS_I(inode)->root;
> struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> struct btrfs_ordered_extent *ordered;
> struct extent_state *cached_state = NULL;
> char *kaddr;
> - u32 blocksize = root->sectorsize;
> pgoff_t index = from >> PAGE_CACHE_SHIFT;
> unsigned offset = from & (PAGE_CACHE_SIZE-1);
> struct page *page;
> @@ -3261,8 +3268,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
> u64 page_start;
> u64 page_end;
>
> - if ((offset & (blocksize - 1)) == 0)
> - goto out;
> ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
> if (ret)
> goto out;
> @@ -3329,6 +3334,7 @@ again:
> }
> ClearPageChecked(page);
> set_page_dirty(page);
> + i_size_write(inode, from);
> unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
> GFP_NOFS);
>
> @@ -3459,7 +3465,9 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
> ret = btrfs_update_inode(trans, root, inode);
> btrfs_end_transaction_throttle(trans, root);
> } else {
> -
> + btrfs_wait_ordered_range(inode,
> + newsize & ~(root->sectorsize - 1),
> + (u64)-1);
> /*
> * We're truncating a file that used to have good data down to
> * zero. Make sure it gets into the ordered flush list so that
> @@ -3469,8 +3477,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
> BTRFS_I(inode)->ordered_data_close = 1;
>
> /* we don't support swapfiles, so vmtruncate shouldn't fail */
> - truncate_setsize(inode, newsize);
> - ret = btrfs_truncate(inode);
> + truncate_pagecache(inode, oldsize, newsize);
> + ret = btrfs_truncate(inode, newsize);
> }
>
> return ret;
> @@ -6522,87 +6530,43 @@ out:
> return ret;
> }
>
> -static int btrfs_truncate(struct inode *inode)
> +static int btrfs_truncate(struct inode *inode, loff_t newsize)
> {
> struct btrfs_root *root = BTRFS_I(inode)->root;
> - struct btrfs_block_rsv *rsv;
> int ret;
> int err = 0;
> struct btrfs_trans_handle *trans;
> unsigned long nr;
> u64 mask = root->sectorsize - 1;
> - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
> -
> - ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
> - if (ret)
> - return ret;
> -
> - btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
> - btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
>
> /*
> * Yes ladies and gentelment, this is indeed ugly. The fact is we have
> - * 3 things going on here
> + * 2 things going on here
> *
> - * 1) We need to reserve space for our orphan item and the space to
> - * delete our orphan item. Lord knows we don't want to have a dangling
> - * orphan item because we didn't reserve space to remove it.
> + * 1) We need to reserve space to update our inode.
> *
> - * 2) We need to reserve space to update our inode.
> - *
> - * 3) We need to have something to cache all the space that is going to
> + * 2) We need to have something to cache all the space that is going to
> * be free'd up by the truncate operation, but also have some slack
> * space reserved in case it uses space during the truncate (thank you
> * very much snapshotting).
> *
> - * And we need these to all be seperate. The fact is we can use alot of
> - * space doing the truncate, and we have no earthly idea how much space
> - * we will use, so we need the truncate reservation to be seperate so it
> - * doesn't end up using space reserved for updating the inode or
> - * removing the orphan item. We also need to be able to stop the
> - * transaction and start a new one, which means we need to be able to
> - * update the inode several times, and we have no idea of knowing how
> - * many times that will be, so we can't just reserve 1 item for the
> - * entirety of the opration, so that has to be done seperately as well.
> - * Then there is the orphan item, which does indeed need to be held on
> - * to for the whole operation, and we need nobody to touch this reserved
> - * space except the orphan code.
> - *
> - * So that leaves us with
> - *
> - * 1) root->orphan_block_rsv - for the orphan deletion.
> - * 2) rsv - for the truncate reservation, which we will steal from the
> - * transaction reservation.
> - * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
> - * updating the inode.
> + * And we need these to all be seperate. The fact is we can use a lot
> + * of space doing the truncate, and we have no earthly idea how much
> + * space we will use, so we need the truncate reservation to be
> + * seperate. We also need to be able to stop the transaction and start
> + * a new one, which means we need to be able to update the inode several
> + * times, and we have no idea of knowing how many times that will be,
> + * so we can't just reserve 1 item for the entirety of the operation,
> + * so that has to be done seperately as well.
> */
> - rsv = btrfs_alloc_block_rsv(root);
> - if (!rsv)
> - return -ENOMEM;
> - rsv->size = min_size;
>
> /*
> * 1 for the truncate slack space
> - * 1 for the orphan item we're going to add
> - * 1 for the orphan item deletion
> * 1 for updating the inode.
> */
> - trans = btrfs_start_transaction(root, 4);
> - if (IS_ERR(trans)) {
> - err = PTR_ERR(trans);
> - goto out;
> - }
> -
> - /* Migrate the slack space for the truncate to our reserve */
> - ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
> - min_size);
> - BUG_ON(ret);
> -
> - ret = btrfs_orphan_add(trans, inode);
> - if (ret) {
> - btrfs_end_transaction(trans, root);
> - goto out;
> - }
> + trans = btrfs_start_transaction(root, 2);
> + if (IS_ERR(trans))
> + return PTR_ERR(trans);
>
> /*
> * setattr is responsible for setting the ordered_data_close flag,
> @@ -6621,26 +6585,12 @@ static int btrfs_truncate(struct inode *inode)
> * using truncate to replace the contents of the file will
> * end up with a zero length file after a crash.
> */
> - if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
> + if (newsize == 0 && BTRFS_I(inode)->ordered_data_close)
> btrfs_add_ordered_operation(trans, root, inode);
>
> while (1) {
> - ret = btrfs_block_rsv_refill(root, rsv, min_size);
> - if (ret) {
> - /*
> - * This can only happen with the original transaction we
> - * started above, every other time we shouldn't have a
> - * transaction started yet.
> - */
> - if (ret == -EAGAIN)
> - goto end_trans;
> - err = ret;
> - break;
> - }
> -
Taking this part out is wrong, we need to have this slack space to account for
any COW that truncate does. Other than that this looks pretty good. Thanks,
Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html