On 9.02.2018 09:44, Qu Wenruo wrote:
> The kernel uses delayed chunk allocation for metadata chunks:
>
> KERNEL:
> btrfs_alloc_chunk()
> |- __btrfs_alloc_chunk(): Only allocate chunk mapping
> |- btrfs_make_block_group(): Add corresponding bg to fs_info->new_bgs
>
> Then at transaction commit time, it finishes the remaining work:
> btrfs_start_dirty_block_groups():
> |- btrfs_create_pending_block_groups()
> |- btrfs_insert_item(): To insert block group item
> |- btrfs_finish_chunk_alloc(): Insert chunk items/dev extents
>
> Although btrfs-progs' btrfs_alloc_chunk() does all the work in one
> function, it can still benefit from a similar refactor:
> btrfs-progs:
> btrfs_alloc_chunk(): Wrapper for both normal and convert chunks
> |- __btrfs_alloc_chunk(): Only alloc chunk mapping
> | |- btrfs_make_block_group(): <<Insert bg items into extent tree>>
> |- btrfs_finish_chunk_alloc(): Insert chunk items/dev extents
>
> With such a refactor, the following functions can now share most of
> their code with the kernel:
> __btrfs_alloc_chunk()
> btrfs_finish_chunk_alloc()
> btrfs_alloc_dev_extent()
>
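For readers following the commit message, the practical difference is
only *when* btrfs_finish_chunk_alloc() runs; a compressed sketch of the
two orderings (my own illustration, not part of the patch):

    /* kernel: split across the transaction's lifetime */
    btrfs_alloc_chunk();                /* chunk mapping + pending bg only */
    /* ... other transaction work ... */
    btrfs_commit_transaction();
        /* -> bg item, chunk item and dev extents get inserted here */

    /* btrfs-progs after this patch: same helpers, called back to back */
    btrfs_alloc_chunk();
        /* -> __btrfs_alloc_chunk():       mapping + bg item */
        /* -> btrfs_finish_chunk_alloc():  chunk item + dev extents */
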
> Signed-off-by: Qu Wenruo <wqu@xxxxxxxx>
> ---
> volumes.c | 421 ++++++++++++++++++++++++++++++++++++++------------------------
> 1 file changed, 260 insertions(+), 161 deletions(-)
>
> diff --git a/volumes.c b/volumes.c
> index cff54c612872..e89520326314 100644
> --- a/volumes.c
> +++ b/volumes.c
> @@ -523,55 +523,40 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
> return find_free_dev_extent_start(device, num_bytes, 0, start, len);
> }
>
> -static int btrfs_insert_dev_extents(struct btrfs_trans_handle *trans,
> - struct btrfs_fs_info *fs_info,
> - struct map_lookup *map, u64 stripe_size)
> +static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
> + struct btrfs_device *device,
> + u64 chunk_offset, u64 physical,
> + u64 stripe_size)
> {
> - int ret = 0;
> - struct btrfs_path path;
> + int ret;
> + struct btrfs_path *path;
> + struct btrfs_fs_info *fs_info = device->fs_info;
> struct btrfs_root *root = fs_info->dev_root;
> struct btrfs_dev_extent *extent;
> struct extent_buffer *leaf;
> struct btrfs_key key;
> - int i;
>
> - btrfs_init_path(&path);
> -
> - for (i = 0; i < map->num_stripes; i++) {
> - struct btrfs_device *device = map->stripes[i].dev;
> -
> - key.objectid = device->devid;
> - key.offset = map->stripes[i].physical;
> - key.type = BTRFS_DEV_EXTENT_KEY;
> + path = btrfs_alloc_path();
> + if (!path)
> + return -ENOMEM;
>
> - ret = btrfs_insert_empty_item(trans, root, &path, &key,
> - sizeof(*extent));
> - if (ret < 0)
> - goto out;
> - leaf = path.nodes[0];
> - extent = btrfs_item_ptr(leaf, path.slots[0],
> - struct btrfs_dev_extent);
> - btrfs_set_dev_extent_chunk_tree(leaf, extent,
> + key.objectid = device->devid;
> + key.offset = physical;
> + key.type = BTRFS_DEV_EXTENT_KEY;
> + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
> + if (ret)
> + goto out;
> + leaf = path->nodes[0];
> + extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
> + btrfs_set_dev_extent_chunk_tree(leaf, extent,
> BTRFS_CHUNK_TREE_OBJECTID);
> - btrfs_set_dev_extent_chunk_objectid(leaf, extent,
> - BTRFS_FIRST_CHUNK_TREE_OBJECTID);
> - btrfs_set_dev_extent_chunk_offset(leaf, extent, map->ce.start);
> -
> - write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
> - (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
> - BTRFS_UUID_SIZE);
> -
> - btrfs_set_dev_extent_length(leaf, extent, stripe_size);
> - btrfs_mark_buffer_dirty(leaf);
> - btrfs_release_path(&path);
> -
> - device->bytes_used += stripe_size;
> - ret = btrfs_update_device(trans, device);
> - if (ret < 0)
> - goto out;
> - }
> + btrfs_set_dev_extent_chunk_objectid(leaf, extent,
> + BTRFS_FIRST_CHUNK_TREE_OBJECTID);
> + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
> + btrfs_set_dev_extent_length(leaf, extent, stripe_size);
> + btrfs_mark_buffer_dirty(leaf);
> out:
> - btrfs_release_path(&path);
> + btrfs_free_path(path);
> return ret;
> }
>
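A side note that may help when reading the rest of the patch: each
stripe of a chunk gets exactly one call into this helper, and the dev
extent it writes back-references the owning chunk. A sketch of the
resulting two-way mapping, derived from the accessors above (not from
the patch itself):

    chunk item:  key (FIRST_CHUNK_TREE_OBJECTID, CHUNK_ITEM_KEY, logical)
                 stripe[i] = (devid, physical)
    dev extent:  key (devid, DEV_EXTENT_KEY, physical)
                 chunk_offset = logical, length = stripe_size
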
> @@ -813,28 +798,28 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
> / sizeof(struct btrfs_stripe) + 1)
>
> /*
> - * Alloc a chunk, will insert dev extents, chunk item, and insert new
> - * block group and update space info (so that extent allocator can use
> - * newly allocated chunk).
> + * Alloc a chunk mapping.
> + * Will do the chunk size calculation and the free dev extent search, and
> + * insert the chunk mapping into the chunk mapping tree.
> + *
> + * NOTE: This function doesn't insert the chunk item or dev extents;
> + * those are handled later by btrfs_finish_chunk_alloc().
> + * And for convert chunks (1:1 mapped, more flexible chunk location),
> + * the mapping is set up by __btrfs_alloc_convert_chunk() instead.
> + *
> + * Qu: The block group item is still inserted in this function by
> + * btrfs_make_block_group(); this still differs from the kernel.
> *
If it still differs, does that mean this function should undergo further
refactoring to unify it with the kernel, or that it will never be the
same as the kernel's code? I thought the idea of unifying the
userspace/kernel code was to have only one set of functions which works
in both kernel and userspace?
> * @start: return value of allocated chunk start bytenr.
> * @num_bytes: return value of allocated chunk size
> * @type: chunk type (including both profile and type)
> - * @convert: if the chunk is allocated for convert case.
> - * If @convert is true, chunk allocator will skip device extent
> - * search, but use *start and *num_bytes as chunk start/num_bytes
> - * and devive offset, to build a 1:1 chunk mapping for convert.
> */
> -int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> - struct btrfs_fs_info *info, u64 *start,
> - u64 *num_bytes, u64 type, bool convert)
> +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> + struct btrfs_fs_info *info, u64 *start,
> + u64 *num_bytes, u64 type)
> {
> - struct btrfs_root *extent_root = info->extent_root;
> - struct btrfs_root *chunk_root = info->chunk_root;
> struct btrfs_device *device = NULL;
> - struct btrfs_chunk *chunk;
> struct list_head *dev_list = &info->fs_devices->devices;
> - struct btrfs_stripe *stripe;
> struct map_lookup *map;
> struct btrfs_device_info *devices_info = NULL;
> u64 percent_max;
> @@ -845,8 +830,6 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> int index;
> int ndevs = 0;
> int rw_devs = 0;
> - int stripe_len = BTRFS_STRIPE_LEN;
> - struct btrfs_key key;
> u64 offset;
> int data_stripes; /* number of stripes that counts for bg size */
> int sub_stripes; /* sub_stripes info for map */
> @@ -874,34 +857,6 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> devs_increment = btrfs_raid_array[index].devs_increment;
> ncopies = btrfs_raid_array[index].ncopies;
>
> - if (convert) {
> - /* For convert, profile must be SINGLE */
> - if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> - error("convert only suports SINGLE profile");
> - return -EINVAL;
> - }
> - if (!IS_ALIGNED(*start, info->sectorsize)) {
> - error("chunk start not aligned, start=%llu sectorsize=%u",
> - *start, info->sectorsize);
> - return -EINVAL;
> - }
> - if (!IS_ALIGNED(*num_bytes, info->sectorsize)) {
> - error("chunk size not aligned, size=%llu sectorsize=%u",
> - *num_bytes, info->sectorsize);
> - return -EINVAL;
> - }
> - num_stripes = 1;
> - data_stripes = 1;
> - offset = *start;
> - stripe_size = *num_bytes;
> - /*
> - * For convert, we use 1:1 chunk mapping specified by @start and
> - * @num_bytes, so there is no need to go through dev_extent
> - * searching.
> - */
> - goto alloc_chunk;
> - }
> -
> /*
> * Chunk size calculation part.
> */
> @@ -1047,55 +1002,23 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> /*
> * Fill chunk mapping and chunk stripes
> */
> -alloc_chunk:
> - if (!convert) {
> - ret = find_next_chunk(info, &offset);
> - if (ret)
> - goto out_devices_info;
> - }
> - key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> - key.type = BTRFS_CHUNK_ITEM_KEY;
> - key.offset = offset;
> - *num_bytes = stripe_size * data_stripes;
> -
> - chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
> - if (!chunk)
> + ret = find_next_chunk(info, &offset);
> + if (ret)
> goto out_devices_info;
> + *num_bytes = stripe_size * data_stripes;
>
> map = kmalloc(btrfs_map_lookup_size(num_stripes), GFP_NOFS);
> if (!map)
> goto out_chunk_map;
> map->num_stripes = num_stripes;
>
> - if (convert) {
> - device = list_entry(dev_list->next, struct btrfs_device,
> - dev_list);
> + for (i = 0; i < ndevs; i++) {
> + for (j = 0; j < dev_stripes; ++j) {
> + int s = i * dev_stripes + j;
>
> - map->stripes[0].dev = device;
> - map->stripes[0].physical = *start;
> - stripe = &chunk->stripe;
> - btrfs_set_stack_stripe_devid(stripe, device->devid);
> - btrfs_set_stack_stripe_offset(stripe, *start);
> - memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
> - } else {
> - for (i = 0; i < ndevs; i++) {
> - for (j = 0; j < dev_stripes; ++j) {
> - int s = i * dev_stripes + j;
> -
> - device = devices_info[i].dev;
> - map->stripes[s].dev = device;
> - map->stripes[s].physical =
> - devices_info[i].dev_offset +
> - j * stripe_size;
> - stripe = &chunk->stripe + s;
> - btrfs_set_stack_stripe_devid(stripe,
> - device->devid);
> - btrfs_set_stack_stripe_offset(stripe,
> - devices_info[i].dev_offset +
> - j * stripe_size);
> - memcpy(stripe->dev_uuid, device->uuid,
> - BTRFS_UUID_SIZE);
> - }
> + map->stripes[s].dev = devices_info[i].dev;
> + map->stripes[s].physical = devices_info[i].dev_offset +
> + j * stripe_size;
> }
> }
> map->stripe_len = BTRFS_STRIPE_LEN;
> @@ -1104,60 +1027,236 @@ alloc_chunk:
> map->type = type;
> map->sub_stripes = sub_stripes;
> map->sector_size = info->sectorsize;
> - map->ce.start = key.offset;
> + map->ce.start = offset;
> map->ce.size = *num_bytes;
>
> + kfree(devices_info);
> + devices_info = NULL; /* the error labels below would free it again */
>
> + /*
> + * Insert chunk mapping and block group
> + */
> ret = insert_cache_extent(&info->mapping_tree.cache_tree, &map->ce);
> if (ret < 0)
> goto out_chunk_map;
>
> - /* key was set above */
> - btrfs_set_stack_chunk_length(chunk, *num_bytes);
> - btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
> - btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
> - btrfs_set_stack_chunk_type(chunk, type);
> - btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
> - btrfs_set_stack_chunk_io_align(chunk, stripe_len);
> - btrfs_set_stack_chunk_io_width(chunk, stripe_len);
> - btrfs_set_stack_chunk_sector_size(chunk, info->sectorsize);
> - btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
> + ret = btrfs_make_block_group(trans, info, 0, type, offset,
> + *num_bytes);
> + *start = offset;
> + return ret;
> +
> +out_chunk_map:
> + kfree(map);
> +out_devices_info:
> + kfree(devices_info);
> + return ret;
> +}
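The flattened stripe indexing above is easier to see with concrete
numbers (a made-up example, not from the patch): with ndevs = 2 and
dev_stripes = 2, num_stripes = 4 and

    s = i * dev_stripes + j;
    /* i=0: s=0,1 -> devices_info[0].dev, offsets dev_offset + {0, stripe_size}
     * i=1: s=2,3 -> devices_info[1].dev, offsets dev_offset + {0, stripe_size}
     */

so the stripes that live on the same device end up adjacent in
map->stripes[].
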
> +
> +/*
> + * Alloc a chunk mapping for convert.
> + * Convert needs a special SINGLE chunk whose logical bytenr is the same
> + * as its physical bytenr.
> + * Chunk start bytenr and size are specified by @start and @num_bytes.
> + *
> + * And just like __btrfs_alloc_chunk(), this only handles the chunk
> + * mapping and the block group item.
> + */
> +static int __btrfs_alloc_convert_chunk(struct btrfs_trans_handle *trans,
> + struct btrfs_fs_info *fs_info, u64 start,
> + u64 num_bytes, u64 type)
> +{
> + struct list_head *dev_list = &fs_info->fs_devices->devices;
> + struct map_lookup *map;
> + struct btrfs_device *device;
> + int ret;
> +
> + /* For convert, profile must be SINGLE */
> + if (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
> + error("convert only supports SINGLE profile");
> + return -EINVAL;
> + }
> + if (!IS_ALIGNED(start, fs_info->sectorsize)) {
> + error("chunk start not aligned, start=%llu sectorsize=%u",
> + start, fs_info->sectorsize);
> + return -EINVAL;
> + }
> + if (!IS_ALIGNED(num_bytes, fs_info->sectorsize)) {
> + error("chunk size not aligned, size=%llu sectorsize=%u",
> + num_bytes, fs_info->sectorsize);
> + return -EINVAL;
> + }
> + if (list_empty(dev_list)) {
> + error("no writable device");
> + return -ENODEV;
> + }
> +
> + device = list_entry(dev_list->next, struct btrfs_device, dev_list);
> + map = malloc(btrfs_map_lookup_size(1));
> + if (!map)
> + return -ENOMEM;
> + map->num_stripes = 1;
> + map->stripes[0].dev = device;
> + map->stripes[0].physical = start;
> + map->stripe_len = BTRFS_STRIPE_LEN;
> + map->io_align = BTRFS_STRIPE_LEN;
> + map->io_width = BTRFS_STRIPE_LEN;
> + map->type = type;
> + map->sub_stripes = 1;
> + map->sector_size = fs_info->sectorsize;
> + map->ce.start = start;
> + map->ce.size = num_bytes;
> +
> + ret = insert_cache_extent(&fs_info->mapping_tree.cache_tree, &map->ce);
> + if (ret < 0)
> + goto error;
> + ret = btrfs_make_block_group(trans, fs_info, 0, type, start, num_bytes);
> + return ret;
> +error:
> + free(map);
> + return ret;
> +}
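To make the 1:1 property concrete, a caller sketch (values invented,
only for illustration):

    u64 start = 1024ULL * 1024 * 1024;  /* logical == physical bytenr */
    u64 len = 256ULL * 1024 * 1024;
    int ret;

    ret = btrfs_alloc_chunk(trans, fs_info, &start, &len,
                            BTRFS_BLOCK_GROUP_DATA, true);
    /* on success, logical [start, start + len) maps to the identical
     * physical range on the first writable device */
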
> +
> +/*
> + * Finish the chunk allocation by inserting the needed chunk item and
> + * device extents, and updating the device used bytes
> + */
> +static int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
> + struct btrfs_fs_info *fs_info,
> + u64 chunk_start, u64 chunk_size)
> +{
> + struct btrfs_root *extent_root = fs_info->extent_root;
> + struct btrfs_root *chunk_root = fs_info->chunk_root;
> + struct btrfs_key key;
> + struct btrfs_device *device;
> + struct btrfs_chunk *chunk;
> + struct btrfs_stripe *stripe;
> + struct cache_extent *ce;
> + struct map_lookup *map;
> + size_t item_size;
> + u64 dev_offset;
> + u64 stripe_size;
> + int i = 0;
> + int ret = 0;
> +
> + ce = search_cache_extent(&fs_info->mapping_tree.cache_tree, chunk_start);
> + if (!ce)
> + return -ENOENT;
> +
> + map = container_of(ce, struct map_lookup, ce);
> + item_size = btrfs_chunk_item_size(map->num_stripes);
> + stripe_size = calc_stripe_length(map->type, map->ce.size,
> + map->num_stripes);
> +
> + chunk = kzalloc(item_size, GFP_NOFS);
> + if (!chunk) {
> + ret = -ENOMEM;
> + goto out;
> + }
>
> /*
> - * Insert chunk item and chunk mapping.
> + * Take the device list mutex to prevent races with the final phase of
> + * a device replace operation that replaces the device object associated
> + * with the map's stripes, because the device object's id can change
> + * at any time during that final phase of the device replace operation
> + * (dev-replace.c:btrfs_dev_replace_finishing()).
> */
> - ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
> - btrfs_chunk_item_size(num_stripes));
> - BUG_ON(ret);
> - *start = key.offset;
> + /* mutex_lock(&fs_info->fs_devices->device_list_mutex); */
> + for (i = 0; i < map->num_stripes; i++) {
> + device = map->stripes[i].dev;
> + dev_offset = map->stripes[i].physical;
>
> - if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
> - ret = btrfs_add_system_chunk(info, &key,
> - chunk, btrfs_chunk_item_size(num_stripes));
> - if (ret < 0)
> - goto out_chunk;
> + device->bytes_used += stripe_size;
> + ret = btrfs_update_device(trans, device);
> + if (ret)
> + break;
> + ret = btrfs_alloc_dev_extent(trans, device, chunk_start,
> + dev_offset, stripe_size);
> + if (ret)
> + break;
> + }
> + if (ret) {
> + /* mutex_unlock(&fs_info->fs_devices->device_list_mutex); */
> + goto out;
> }
>
> + stripe = &chunk->stripe;
> + for (i = 0; i < map->num_stripes; i++) {
> + device = map->stripes[i].dev;
> + dev_offset = map->stripes[i].physical;
> +
> + btrfs_set_stack_stripe_devid(stripe, device->devid);
> + btrfs_set_stack_stripe_offset(stripe, dev_offset);
> + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
> + stripe++;
> + }
> + /* mutex_unlock(&fs_info->fs_devices->device_list_mutex); */
> +
> + btrfs_set_stack_chunk_length(chunk, chunk_size);
> + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
> + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
> + btrfs_set_stack_chunk_type(chunk, map->type);
> + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
> + btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
> + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
> + btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
> + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
> +
> + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> + key.type = BTRFS_CHUNK_ITEM_KEY;
> + key.offset = chunk_start;
> +
> + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
> + if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
> + /*
> + * TODO: Clean up the inserted chunk item in case of
> + * failure.
> + */
> + ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
> + }
> +
> +out:
> kfree(chunk);
> + return ret;
> +}
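For reference, item_size grows with the stripe count because struct
btrfs_chunk embeds the first stripe; if I read the existing headers
correctly, btrfs_chunk_item_size() boils down to:

    item_size = sizeof(struct btrfs_chunk) +
                sizeof(struct btrfs_stripe) * (num_stripes - 1);
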
> +
> +/*
> + * Alloc a chunk.
> + * Will do all the needed work: search for free device extents and insert
> + * the chunk mapping, chunk item, block group item and device extents.
> + *
> + * @start: return value of allocated chunk start bytenr.
> + * @num_bytes: return value of allocated chunk size
> + * @type: chunk type (including both profile and type)
> + * @convert: if the chunk is allocated for convert case.
> + * If @convert is true, chunk allocator will skip device extent
> + * search, but use *start and *num_bytes as chunk start/num_bytes
> + * and device offset, to build a 1:1 chunk mapping for convert.
> + */
> +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
> + struct btrfs_fs_info *fs_info, u64 *start, u64 *num_bytes,
> + u64 type, bool convert)
> +{
> + int ret;
>
> /*
> - * Insert device extents
> + * Allocate chunk mapping
> */
> - ret = btrfs_insert_dev_extents(trans, info, map, stripe_size);
> - if (ret < 0)
> - goto out_devices_info;
> -
> - ret = btrfs_make_block_group(trans, info, 0, type, map->ce.start,
> - map->ce.size);
> - kfree(devices_info);
> - return ret;
> + if (convert)
> + ret = __btrfs_alloc_convert_chunk(trans, fs_info, *start,
> + *num_bytes, type);
> + else
> + ret = __btrfs_alloc_chunk(trans, fs_info, start, num_bytes,
> + type);
> + if (ret < 0) {
> + error("failed to allocate chunk mapping: %s", strerror(-ret));
> + return ret;
> + }
>
> -out_chunk_map:
> - kfree(map);
> -out_chunk:
> - kfree(chunk);
> -out_devices_info:
> - kfree(devices_info);
> + /*
> + * Insert the remaining parts (the various on-disk items)
> + */
> + ret = btrfs_finish_chunk_alloc(trans, fs_info, *start, *num_bytes);
> + if (ret < 0)
> + error("failed to finish chunk allocation: %s", strerror(-ret));
> return ret;
> }
>
>
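With the refactor applied, a normal (non-convert) caller only ever sees
the wrapper. A minimal usage sketch (my own illustration, not from the
patch):

    u64 start, num_bytes;
    int ret;

    ret = btrfs_alloc_chunk(trans, fs_info, &start, &num_bytes,
                            BTRFS_BLOCK_GROUP_METADATA, false);
    if (ret < 0)
        return ret;
    /* start/num_bytes now describe the new chunk; the chunk item, block
     * group item and dev extents have all been inserted already */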