> [ENHANCEMENT]
> This patch will introduce btrfs_device::hint_free_dev_extent member to
> give some hint for chunk allocator to find free dev extents.
>
> The hint itself is pretty simple, only tells where the first free slot
> could possibly be.
>
> It is not 100% correct, unlike free space cache, but since
> find_free_dev_extent_start() is already robust enough to handle
> search_hint, there is no need to introduce a complex and fancy free
> dev extent cache.
>
> With this patch, allocating 4G on a 4T filled fs will be much
> faster:
>
> v5.0-rc1 | patched | function
> ---------------------------------------------------------------------
> 7) | 7) | __btrfs_alloc_chunk [btrfs]() {
> 7) ! 152.496 us | 7) 7.885 us | find_free_dev_extent_start [btrfs]();
> 7) ! 185.488 us | 7) + 36.649 us | }
> 7) | 7) | __btrfs_alloc_chunk [btrfs]() {
> 7) ! 132.889 us | 7) 2.454 us | find_free_dev_extent_start [btrfs]();
> 7) ! 152.115 us | 7) + 24.145 us | }
> 7) | 7) | __btrfs_alloc_chunk [btrfs]() {
> 7) ! 127.689 us | 7) 2.245 us | find_free_dev_extent_start [btrfs]();
> 7) ! 146.595 us | 7) + 19.376 us | }
> 7) | 7) | __btrfs_alloc_chunk [btrfs]() {
> 7) ! 126.657 us | 7) 2.174 us | find_free_dev_extent_start [btrfs]();
> 7) ! 144.521 us | 7) + 16.321 us | }
For anyone who is interested in an unrealistic workload: without this
patch, fallocating a 1PiB file TiB by TiB will take 5+ hours!!
With this patch, it's just going to take around 15~20min.
Anyway, we're still far from customer-oriented 1PiB HDDs, so that's not
something we need to bother with yet.
Thanks,
Qu
>
> Signed-off-by: Qu Wenruo <wqu@xxxxxxxx>
> ---
> fs/btrfs/volumes.c | 23 +++++++++++++++---
> fs/btrfs/volumes.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 78 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 8e932d7d2fe6..cc15bf70dc72 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -411,6 +411,7 @@ static struct btrfs_device *__alloc_device(void)
> btrfs_device_data_ordered_init(dev);
> INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
> INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
> + dev->hint_free_dev_extent = (u64)-1;
>
> return dev;
> }
> @@ -1741,9 +1742,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
> struct btrfs_device *device, u64 num_bytes,
> u64 *start, u64 *len)
> {
> - /* FIXME use last free of some kind */
> - return find_free_dev_extent_start(trans->transaction, device,
> - num_bytes, 0, start, len);
> + return find_free_dev_extent_start(trans->transaction, device, num_bytes,
> + device->hint_free_dev_extent, start,
> + len);
> }
>
> static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
> @@ -1799,6 +1800,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
> "Failed to remove dev extent item");
> } else {
> set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
> + btrfs_device_hint_add_free(device, key.offset, *dev_extent_len);
> }
> out:
> btrfs_free_path(path);
> @@ -1841,6 +1843,7 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
> btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
>
> btrfs_set_dev_extent_length(leaf, extent, num_bytes);
> + btrfs_device_hint_del_free(device, key.offset, num_bytes);
> btrfs_mark_buffer_dirty(leaf);
> out:
> btrfs_free_path(path);
> @@ -7913,6 +7916,14 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
> devid = key.objectid;
> physical_offset = key.offset;
>
> + /*
> + * previous device verification is done, update its free dev
> + * extent hint
> + */
> + if (device && devid != device->devid)
> + btrfs_device_hint_add_free(device, prev_dev_ext_end,
> + device->disk_total_bytes - prev_dev_ext_end);
> +
> if (!device || devid != device->devid) {
> device = btrfs_find_device(fs_info, devid, NULL, NULL);
> if (!device) {
> @@ -7940,6 +7951,10 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
> physical_offset, physical_len);
> if (ret < 0)
> goto out;
> +
> + btrfs_device_hint_add_free(device, prev_dev_ext_end,
> + physical_offset - prev_dev_ext_end);
> +
> prev_devid = devid;
> prev_dev_ext_end = physical_offset + physical_len;
>
> @@ -7951,6 +7966,8 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
> break;
> }
> }
> + btrfs_device_hint_add_free(device, prev_dev_ext_end,
> + device->disk_total_bytes - prev_dev_ext_end);
>
> /* Ensure all chunks have corresponding dev extents */
> ret = verify_chunk_dev_extent_mapping(fs_info);
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index ed806649a473..00f7ef72466f 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -108,6 +108,14 @@ struct btrfs_device {
>
> /* bytes used on the current transaction */
> u64 commit_bytes_used;
> +
> + /*
> + * hint about where the first possible free dev extent is.
> + *
> + * u64(-1) means no hint.
> + */
> + u64 hint_free_dev_extent;
> +
> /*
> * used to manage the device which is resized
> *
> @@ -569,4 +577,54 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
> int btrfs_bg_type_to_factor(u64 flags);
> int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
>
> +static inline void btrfs_device_hint_add_free(struct btrfs_device *dev,
> + u64 start, u64 len)
> +{
> + if (dev->disk_total_bytes == 0 || start + len > dev->disk_total_bytes)
> + return;
> + if (len < SZ_16M)
> + return;
> + if (start > dev->hint_free_dev_extent)
> + return;
> + dev->hint_free_dev_extent = start;
> +}
> +
> +static inline void btrfs_device_hint_del_free(struct btrfs_device *dev,
> + u64 start, u64 len)
> +{
> + u64 free_hint = dev->hint_free_dev_extent;
> +
> + if (dev->disk_total_bytes == 0 || start + len > dev->disk_total_bytes)
> + return;
> + /*
> + * |<- to be removed ->|
> + * | free hint
> + * Not affecting free hint
> + */
> + if (start + len <= free_hint)
> + return;
> + /*
> + * |<- to be removed ->|
> + * | free hint
> + * Or
> + * |<- to be removed ->|
> + * | free hint
> + * |<-->| Less than 16M
> + *
> + * Move the hint to the range end
> + */
> + if ((start <= free_hint && start + len > free_hint) ||
> + (start > free_hint && free_hint - start < SZ_16M)) {
> + dev->hint_free_dev_extent = start + len;
> + return;
> + }
> +
> + /*
> + * |<- to be removed ->|
> + * | free hint
> + *
> + * We still have larger than 16M free space, no need to update
> + * free hint
> + */
> +}
> #endif
>
Attachment:
signature.asc
Description: OpenPGP digital signature
