On 05/09/2017 09:36 PM, Qu Wenruo wrote:
> Introduce a new parameter, struct extent_changeset for
> btrfs_qgroup_reserved_data() and its callers.
>
> Such extent_changeset was used in btrfs_qgroup_reserve_data() to record
> which range it reserved in current reserve, so it can free it at error
> path.
>
> The reason we need to export it to callers is, at buffered write error
> path, without knowing what exactly which range we reserved in current
> allocation, we can free space which is not reserved by us.
>
> This will lead to qgroup reserved space underflow.
>
> Reviewed-by: Chandan Rajendra <chandan@xxxxxxxxxxxxxxxxxx>
> Signed-off-by: Qu Wenruo <quwenruo@xxxxxxxxxxxxxx>
> ---
> fs/btrfs/ctree.h | 6 ++++--
> fs/btrfs/extent-tree.c | 16 +++++++++++-----
> fs/btrfs/extent_io.h | 34 ++++++++++++++++++++++++++++++++++
> fs/btrfs/file.c | 12 +++++++++---
> fs/btrfs/inode-map.c | 4 +++-
> fs/btrfs/inode.c | 18 ++++++++++++++----
> fs/btrfs/ioctl.c | 5 ++++-
> fs/btrfs/qgroup.c | 41 +++++++++++++++++++++++++++++------------
> fs/btrfs/qgroup.h | 3 ++-
> fs/btrfs/relocation.c | 4 +++-
> 10 files changed, 113 insertions(+), 30 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1e82516fe2d8..52a0147cd612 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -2704,8 +2704,9 @@ enum btrfs_flush_state {
> COMMIT_TRANS = 6,
> };
>
> -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
> int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
> +int btrfs_check_data_free_space(struct inode *inode,
> + struct extent_changeset **reserved, u64 start, u64 len);
> void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
> void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
> u64 len);
> @@ -2723,7 +2724,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
> struct btrfs_block_rsv *rsv);
> int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
> void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
> -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
> +int btrfs_delalloc_reserve_space(struct inode *inode,
> + struct extent_changeset **reserved, u64 start, u64 len);
> void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
> void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
> struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 4f62696131a6..782e0f5feb69 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -3364,6 +3364,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
> struct btrfs_fs_info *fs_info = block_group->fs_info;
> struct btrfs_root *root = fs_info->tree_root;
> struct inode *inode = NULL;
> + struct extent_changeset *data_reserved = NULL;
> u64 alloc_hint = 0;
> int dcs = BTRFS_DC_ERROR;
> u64 num_pages = 0;
> @@ -3483,7 +3484,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
> num_pages *= 16;
> num_pages *= PAGE_SIZE;
>
> - ret = btrfs_check_data_free_space(inode, 0, num_pages);
> + ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
> if (ret)
> goto out_put;
>
> @@ -3514,6 +3515,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
> block_group->disk_cache_state = dcs;
> spin_unlock(&block_group->lock);
>
> + extent_changeset_free(data_reserved);
> return ret;
> }
>
> @@ -4282,7 +4284,8 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
> * Will replace old btrfs_check_data_free_space(), but for patch split,
> * add a new function first and then replace it.
> */
> -int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
> +int btrfs_check_data_free_space(struct inode *inode,
> + struct extent_changeset **reserved, u64 start, u64 len)
> {
> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> int ret;
> @@ -4297,9 +4300,11 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
> return ret;
>
> /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
> - ret = btrfs_qgroup_reserve_data(inode, start, len);
> + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
> if (ret < 0)
> btrfs_free_reserved_data_space_noquota(inode, start, len);
> + else
> + ret = 0;
> return ret;
> }
>
> @@ -6140,11 +6145,12 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
> * Return 0 for success
> * Return <0 for error(-ENOSPC or -EQUOT)
> */
> -int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
> +int btrfs_delalloc_reserve_space(struct inode *inode,
> + struct extent_changeset **reserved, u64 start, u64 len)
Please change the comment before the function name to include
"@reserved" description.
> {
> int ret;
>
> - ret = btrfs_check_data_free_space(inode, start, len);
> + ret = btrfs_check_data_free_space(inode, reserved, start, len);
> if (ret < 0)
> return ret;
> ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index cc1b08fa9fe7..afb3553f4f78 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -211,6 +211,40 @@ struct extent_changeset {
> struct ulist range_changed;
> };
>
> +static inline void extent_changeset_init(struct extent_changeset *changeset)
> +{
> + changeset->bytes_changed = 0;
> + ulist_init(&changeset->range_changed);
> +}
> +
> +static inline struct extent_changeset *extent_changeset_alloc(void)
> +{
> + struct extent_changeset *ret;
> +
> + ret = kmalloc(sizeof(*ret), GFP_KERNEL);
> + if (!ret)
> + return NULL;
> +
> + extent_changeset_init(ret);
> + return ret;
> +}
> +
> +static inline void extent_changeset_release(struct extent_changeset *changeset)
> +{
> + if (!changeset)
> + return;
> + changeset->bytes_changed = 0;
> + ulist_release(&changeset->range_changed);
> +}
> +
> +static inline void extent_changeset_free(struct extent_changeset *changeset)
> +{
> + if (!changeset)
> + return;
> + extent_changeset_release(changeset);
> + kfree(changeset);
> +}
> +
> static inline void extent_set_compress_type(unsigned long *bio_flags,
> int compress_type)
> {
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index da1096eb1a40..d31c935b36ed 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1581,6 +1581,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
> struct btrfs_root *root = BTRFS_I(inode)->root;
> struct page **pages = NULL;
> struct extent_state *cached_state = NULL;
> + struct extent_changeset *data_reserved = NULL;
> u64 release_bytes = 0;
> u64 lockstart;
> u64 lockend;
> @@ -1628,7 +1629,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
> reserve_bytes = round_up(write_bytes + sector_offset,
> fs_info->sectorsize);
>
> - ret = btrfs_check_data_free_space(inode, pos, write_bytes);
> + extent_changeset_release(data_reserved);
> + ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
> + write_bytes);
> if (ret < 0) {
> if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
> BTRFS_INODE_PREALLOC)) &&
> @@ -1802,6 +1805,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
> }
> }
>
> + extent_changeset_free(data_reserved);
> return num_written ? num_written : ret;
> }
>
> @@ -2769,6 +2773,7 @@ static long btrfs_fallocate(struct file *file, int mode,
> {
> struct inode *inode = file_inode(file);
> struct extent_state *cached_state = NULL;
> + struct extent_changeset *data_reserved = NULL;
> struct falloc_range *range;
> struct falloc_range *tmp;
> struct list_head reserve_list;
> @@ -2898,8 +2903,8 @@ static long btrfs_fallocate(struct file *file, int mode,
> free_extent_map(em);
> break;
> }
> - ret = btrfs_qgroup_reserve_data(inode, cur_offset,
> - last_byte - cur_offset);
> + ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
> + cur_offset, last_byte - cur_offset);
> if (ret < 0) {
> free_extent_map(em);
> break;
> @@ -2971,6 +2976,7 @@ static long btrfs_fallocate(struct file *file, int mode,
> if (ret != 0)
> btrfs_free_reserved_data_space(inode, alloc_start,
> alloc_end - cur_offset);
> + extent_changeset_free(data_reserved);
> return ret;
> }
>
> diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
> index 5c6c20ec64d8..d02019747d00 100644
> --- a/fs/btrfs/inode-map.c
> +++ b/fs/btrfs/inode-map.c
> @@ -400,6 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
> struct btrfs_path *path;
> struct inode *inode;
> struct btrfs_block_rsv *rsv;
> + struct extent_changeset *data_reserved = NULL;
> u64 num_bytes;
> u64 alloc_hint = 0;
> int ret;
> @@ -492,7 +493,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
> /* Just to make sure we have enough space */
> prealloc += 8 * PAGE_SIZE;
>
> - ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
> + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, 0, prealloc);
> if (ret)
> goto out_put;
>
> @@ -516,6 +517,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
> trans->bytes_reserved = num_bytes;
>
> btrfs_free_path(path);
> + extent_changeset_free(data_reserved);
> return ret;
> }
>
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index a1294d5baef5..4e211b54b267 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -2035,6 +2035,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
> struct btrfs_writepage_fixup *fixup;
> struct btrfs_ordered_extent *ordered;
> struct extent_state *cached_state = NULL;
> + struct extent_changeset *data_reserved = NULL;
> struct page *page;
> struct inode *inode;
> u64 page_start;
> @@ -2072,7 +2073,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
> goto again;
> }
>
> - ret = btrfs_delalloc_reserve_space(inode, page_start,
> + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
> PAGE_SIZE);
> if (ret) {
> mapping_set_error(page->mapping, ret);
> @@ -2092,6 +2093,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
> unlock_page(page);
> put_page(page);
> kfree(fixup);
> + extent_changeset_free(data_reserved);
> }
>
> /*
> @@ -4767,6 +4769,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
> struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> struct btrfs_ordered_extent *ordered;
> struct extent_state *cached_state = NULL;
> + struct extent_changeset *data_reserved = NULL;
> char *kaddr;
> u32 blocksize = fs_info->sectorsize;
> pgoff_t index = from >> PAGE_SHIFT;
> @@ -4781,7 +4784,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
> (!len || ((len & (blocksize - 1)) == 0)))
> goto out;
>
> - ret = btrfs_delalloc_reserve_space(inode,
> + ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
> round_down(from, blocksize), blocksize);
> if (ret)
> goto out;
> @@ -4866,6 +4869,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
> unlock_page(page);
> put_page(page);
> out:
> + extent_changeset_free(data_reserved);
> return ret;
> }
>
> @@ -8725,6 +8729,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> struct inode *inode = file->f_mapping->host;
> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> struct btrfs_dio_data dio_data = { 0 };
> + struct extent_changeset *data_reserved = NULL;
> loff_t offset = iocb->ki_pos;
> size_t count = 0;
> int flags = 0;
> @@ -8761,7 +8766,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> inode_unlock(inode);
> relock = true;
> }
> - ret = btrfs_delalloc_reserve_space(inode, offset, count);
> + ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
> + offset, count);
> if (ret)
> goto out;
> dio_data.outstanding_extents = count_max_extents(count);
> @@ -8818,6 +8824,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
> if (relock)
> inode_lock(inode);
>
> + extent_changeset_free(data_reserved);
> return ret;
> }
>
> @@ -9050,6 +9057,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
> struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
> struct btrfs_ordered_extent *ordered;
> struct extent_state *cached_state = NULL;
> + struct extent_changeset *data_reserved = NULL;
> char *kaddr;
> unsigned long zero_start;
> loff_t size;
> @@ -9075,7 +9083,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
> * end up waiting indefinitely to get a lock on the page currently
> * being processed by btrfs_page_mkwrite() function.
> */
> - ret = btrfs_delalloc_reserve_space(inode, page_start,
> + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
> reserved_space);
> if (!ret) {
> ret = file_update_time(vmf->vma->vm_file);
> @@ -9181,6 +9189,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
> out_unlock:
> if (!ret) {
> sb_end_pagefault(inode->i_sb);
> + extent_changeset_free(data_reserved);
> return VM_FAULT_LOCKED;
> }
> unlock_page(page);
> @@ -9188,6 +9197,7 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
> btrfs_delalloc_release_space(inode, page_start, reserved_space);
> out_noreserve:
> sb_end_pagefault(inode->i_sb);
> + extent_changeset_free(data_reserved);
> return ret;
> }
>
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index a29dc3fd7152..a16a16a66ed9 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -1127,6 +1127,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
> struct btrfs_ordered_extent *ordered;
> struct extent_state *cached_state = NULL;
> struct extent_io_tree *tree;
> + struct extent_changeset *data_reserved = NULL;
> gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
>
> file_end = (isize - 1) >> PAGE_SHIFT;
> @@ -1135,7 +1136,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
>
> page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
>
> - ret = btrfs_delalloc_reserve_space(inode,
> + ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
> start_index << PAGE_SHIFT,
> page_cnt << PAGE_SHIFT);
> if (ret)
> @@ -1247,6 +1248,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
> unlock_page(pages[i]);
> put_page(pages[i]);
> }
> + extent_changeset_free(data_reserved);
> return i_done;
> out:
> for (i = 0; i < i_done; i++) {
> @@ -1256,6 +1258,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
> btrfs_delalloc_release_space(inode,
> start_index << PAGE_SHIFT,
> page_cnt << PAGE_SHIFT);
> + extent_changeset_free(data_reserved);
> return ret;
>
> }
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index ad2e99491395..1b24c40729bd 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -2824,43 +2824,60 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
> * Return <0 for error (including -EQUOT)
> *
> * NOTE: this function may sleep for memory allocation.
> + * if btrfs_qgroup_reserve_data() is called multiple times with
> + * same @reserved, caller must ensure when error happens it's OK
> + * to free *ALL* reserved space.
> */
> -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
> +int btrfs_qgroup_reserve_data(struct inode *inode,
> + struct extent_changeset **reserved_ret, u64 start,
> + u64 len)
> {
> struct btrfs_root *root = BTRFS_I(inode)->root;
> - struct extent_changeset changeset;
> struct ulist_node *unode;
> struct ulist_iterator uiter;
> + struct extent_changeset *reserved;
> + u64 orig_reserved;
> + u64 to_reserve;
> int ret;
>
> if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
> !is_fstree(root->objectid) || len == 0)
> return 0;
>
> - changeset.bytes_changed = 0;
> - ulist_init(&changeset.range_changed);
> + /* @reserved parameter is mandatory for qgroup */
> + if (WARN_ON(!reserved_ret))
> + return -EINVAL;
> + if (!*reserved_ret) {
> + *reserved_ret = extent_changeset_alloc();
> + if (!*reserved_ret)
> + return -ENOMEM;
> + }
> + reserved = *reserved_ret;
> + /* Record already reserved space */
> + orig_reserved = reserved->bytes_changed;
> ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
> - start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
> + start + len -1, EXTENT_QGROUP_RESERVED, reserved);
> +
> + /* Newly reserved space */
> + to_reserve = reserved->bytes_changed - orig_reserved;
> trace_btrfs_qgroup_reserve_data(inode, start, len,
> - changeset.bytes_changed,
> - QGROUP_RESERVE);
> + to_reserve, QGROUP_RESERVE);
> if (ret < 0)
> goto cleanup;
> - ret = qgroup_reserve(root, changeset.bytes_changed, true);
> + ret = qgroup_reserve(root, to_reserve, true);
> if (ret < 0)
> goto cleanup;
>
> - ulist_release(&changeset.range_changed);
> return ret;
>
> cleanup:
> - /* cleanup already reserved ranges */
> + /* cleanup *ALL* already reserved ranges */
> ULIST_ITER_INIT(&uiter);
> - while ((unode = ulist_next(&changeset.range_changed, &uiter)))
> + while ((unode = ulist_next(&reserved->range_changed, &uiter)))
> clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
> unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
> GFP_NOFS);
> - ulist_release(&changeset.range_changed);
> + extent_changeset_release(reserved);
> return ret;
> }
There are some instances where you can use
extent_changeset_init()/release() instead of the ulist
initialization/release. This would reduce code as well.
>
> diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
> index 38d14d4575c0..c7a2bcaff5d5 100644
> --- a/fs/btrfs/qgroup.h
> +++ b/fs/btrfs/qgroup.h
> @@ -241,7 +241,8 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
> #endif
>
> /* New io_tree based accurate qgroup reserve API */
> -int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
> +int btrfs_qgroup_reserve_data(struct inode *inode,
> + struct extent_changeset **reserved, u64 start, u64 len);
> int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
> int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
>
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index d60df51959f7..5823e2d67a84 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3093,11 +3093,12 @@ int prealloc_file_extent_cluster(struct inode *inode,
> u64 prealloc_start = cluster->start - offset;
> u64 prealloc_end = cluster->end - offset;
> u64 cur_offset;
> + struct extent_changeset *data_reserved = NULL;
>
> BUG_ON(cluster->start != cluster->boundary[0]);
> inode_lock(inode);
>
> - ret = btrfs_check_data_free_space(inode, prealloc_start,
> + ret = btrfs_check_data_free_space(inode, &data_reserved, prealloc_start,
> prealloc_end + 1 - prealloc_start);
> if (ret)
> goto out;
> @@ -3129,6 +3130,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
> prealloc_end + 1 - cur_offset);
> out:
> inode_unlock(inode);
> + extent_changeset_free(data_reserved);
> return ret;
> }
>
>
--
Goldwyn
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html