Re: [PATCH v4 3/3] btrfs: statfs: Use virtual chunk allocation to calculation available data space

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2020/1/8 下午10:35, Josef Bacik wrote:
> On 1/8/20 2:05 AM, Qu Wenruo wrote:
>> Although btrfs_calc_avail_data_space() is trying to do an estimation
>> on how many data chunks it can allocate, the estimation is far from
>> perfect:
>>
>> - Metadata over-commit is not considered at all
>> - Chunk allocation doesn't take RAID5/6 into consideration
>>
>> This patch will change btrfs_calc_avail_data_space() to use
>> pre-calculated per-profile available space.
>>
>> This provides the following benefits:
>> - Accurate unallocated data space estimation, including RAID5/6
>>    It's as accurate as chunk allocator, and can handle RAID5/6.
>>
>> Although it still can't handle metadata over-commit that accurately, we
>> still have fallback method for over-commit, by using factor based
>> estimation.
>>
>> The good news is, over-commit can only happen when we have enough
>> unallocated space, so even we may not report byte accurate result when
>> the fs is empty, the result will get more and more accurate when
>> unallocated space is reducing.
>>
>> So the metadata over-commit shouldn't cause too many problem.
>>
>> Since we're keeping the old lock-free design, statfs should not
>> experience
>> any extra delay.
>>
>> Signed-off-by: Qu Wenruo <wqu@xxxxxxxx>
>> ---
>>   fs/btrfs/super.c   | 174 +++++++++++----------------------------------
>>   fs/btrfs/volumes.h |   4 ++
>>   2 files changed, 47 insertions(+), 131 deletions(-)
>>
>> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
>> index f452a94abdc3..ecca25c40637 100644
>> --- a/fs/btrfs/super.c
>> +++ b/fs/btrfs/super.c
>> @@ -1894,118 +1894,53 @@ static inline void
>> btrfs_descending_sort_devices(
>>    * The helper to calc the free space on the devices that can be used
>> to store
>>    * file data.
>>    */
>> -static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info
>> *fs_info,
>> -                          u64 *free_bytes)
>> +static u64 btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
>> +                       u64 free_meta)
>>   {
>> -    struct btrfs_device_info *devices_info;
>>       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>> -    struct btrfs_device *device;
>> -    u64 type;
>> -    u64 avail_space;
>> -    u64 min_stripe_size;
>> -    int num_stripes = 1;
>> -    int i = 0, nr_devices;
>> -    const struct btrfs_raid_attr *rattr;
>> +    enum btrfs_raid_types data_type;
>> +    u64 data_profile = btrfs_data_alloc_profile(fs_info);
>> +    u64 data_avail;
>> +    u64 meta_rsv;
>>   -    /*
>> -     * We aren't under the device list lock, so this is racy-ish, but
>> good
>> -     * enough for our purposes.
>> -     */
>> -    nr_devices = fs_info->fs_devices->open_devices;
>> -    if (!nr_devices) {
>> -        smp_mb();
>> -        nr_devices = fs_info->fs_devices->open_devices;
>> -        ASSERT(nr_devices);
>> -        if (!nr_devices) {
>> -            *free_bytes = 0;
>> -            return 0;
>> -        }
>> -    }
>> -
>> -    devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
>> -                   GFP_KERNEL);
>> -    if (!devices_info)
>> -        return -ENOMEM;
>> -
>> -    /* calc min stripe number for data space allocation */
>> -    type = btrfs_data_alloc_profile(fs_info);
>> -    rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];
>> -
>> -    if (type & BTRFS_BLOCK_GROUP_RAID0)
>> -        num_stripes = nr_devices;
>> -    else if (type & BTRFS_BLOCK_GROUP_RAID1)
>> -        num_stripes = 2;
>> -    else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
>> -        num_stripes = 3;
>> -    else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
>> -        num_stripes = 4;
>> -    else if (type & BTRFS_BLOCK_GROUP_RAID10)
>> -        num_stripes = 4;
>> +    spin_lock(&fs_info->global_block_rsv.lock);
>> +    meta_rsv = fs_info->global_block_rsv.size;
>> +    spin_unlock(&fs_info->global_block_rsv.lock);
>>   -    /* Adjust for more than 1 stripe per device */
>> -    min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
>> +    data_type = btrfs_bg_flags_to_raid_index(data_profile);
>>   -    rcu_read_lock();
>> -    list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
>> -        if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
>> -                        &device->dev_state) ||
>> -            !device->bdev ||
>> -            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
>> -            continue;
>> +    spin_lock(&fs_devices->per_profile_lock);
>> +    data_avail = fs_devices->per_profile_avail[data_type];
>> +    spin_unlock(&fs_devices->per_profile_lock);
>>   -        if (i >= nr_devices)
>> -            break;
>> -
>> -        avail_space = device->total_bytes - device->bytes_used;
>> -
>> -        /* align with stripe_len */
>> -        avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
>> -
>> -        /*
>> -         * In order to avoid overwriting the superblock on the drive,
>> -         * btrfs starts at an offset of at least 1MB when doing chunk
>> -         * allocation.
>> -         *
>> -         * This ensures we have at least min_stripe_size free space
>> -         * after excluding 1MB.
>> -         */
>> -        if (avail_space <= SZ_1M + min_stripe_size)
>> -            continue;
>> -
>> -        avail_space -= SZ_1M;
>> -
>> -        devices_info[i].dev = device;
>> -        devices_info[i].max_avail = avail_space;
>> -
>> -        i++;
>> -    }
>> -    rcu_read_unlock();
>> -
>> -    nr_devices = i;
>> -
>> -    btrfs_descending_sort_devices(devices_info, nr_devices);
>> -
>> -    i = nr_devices - 1;
>> -    avail_space = 0;
>> -    while (nr_devices >= rattr->devs_min) {
>> -        num_stripes = min(num_stripes, nr_devices);
>> -
>> -        if (devices_info[i].max_avail >= min_stripe_size) {
>> -            int j;
>> -            u64 alloc_size;
>> -
>> -            avail_space += devices_info[i].max_avail * num_stripes;
>> -            alloc_size = devices_info[i].max_avail;
>> -            for (j = i + 1 - num_stripes; j <= i; j++)
>> -                devices_info[j].max_avail -= alloc_size;
>> -        }
>> -        i--;
>> -        nr_devices--;
>> +    /*
>> +     * We have meta over-committed, do some wild guess using factor.
>> +     *
>> +     * To get an accurate result, we should allocate a metadata virtual
>> +     * chunk first, then allocate data virtual chunks to get real
>> +     * estimation.
>> +     * But that needs chunk_mutex, which could be very slow to accquire.
>> +     *
>> +     * So here we trade for non-blocking statfs. And meta
>> over-committing is
>> +     * less a problem because:
>> +     * - Meta over-commit only happens when we have unallocated space
>> +     *   So no over-commit if we're low on available space.
>> +     *
>> +     * This may not be as accurate as virtual chunk based one, but it
>> +     * should be good enough for statfs usage.
>> +     */
>> +    if (free_meta < meta_rsv) {
>> +        u64 meta_needed = meta_rsv - free_meta;
>> +        int data_factor = btrfs_bg_type_to_factor(data_profile);
>> +        int meta_factor = btrfs_bg_type_to_factor(
>> +                btrfs_metadata_alloc_profile(fs_info));
>> +
>> +        if (data_avail < meta_needed * meta_factor / data_factor)
>> +            data_avail = 0;
>> +        else
>> +            data_avail -= meta_needed * meta_factor / data_factor;
>>       }
>> -
>> -    kfree(devices_info);
>> -    *free_bytes = avail_space;
>> -    return 0;
>> +    return data_avail;
>>   }
>>     /*
>> @@ -2033,8 +1968,6 @@ static int btrfs_statfs(struct dentry *dentry,
>> struct kstatfs *buf)
>>       __be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
>>       unsigned factor = 1;
>>       struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
>> -    int ret;
>> -    u64 thresh = 0;
>>       int mixed = 0;
>>         rcu_read_lock();
>> @@ -2082,31 +2015,10 @@ static int btrfs_statfs(struct dentry *dentry,
>> struct kstatfs *buf)
>>           buf->f_bfree = 0;
>>       spin_unlock(&block_rsv->lock);
>>   -    buf->f_bavail = div_u64(total_free_data, factor);
>> -    ret = btrfs_calc_avail_data_space(fs_info, &total_free_data);
>> -    if (ret)
>> -        return ret;
>> -    buf->f_bavail += div_u64(total_free_data, factor);
>> +    buf->f_bavail = btrfs_calc_avail_data_space(fs_info,
>> total_free_meta);
>> +    if (buf->f_bavail > 0)
>> +        buf->f_bavail += total_free_data;
> 
> I'm not sure I understand this, we're only including the free space of
> already allocated data extents _if_ we have free data chunks?  That
> doesn't seem right, shouldn't we always have buf->f_bavail =
> total_free_data and then we add whatever comes from
> btrfs_calc_avail_data_space()?  Thanks,

Oh, you're right, it should be before the call.

Thanks,
Qu

> 
> Josef

Attachment: signature.asc
Description: OpenPGP digital signature


[Index of Archives]     [Linux Filesystem Development]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux