Re: [PATCH 07/19] btrfs: discard one region at a time in async discard

On Thu, Oct 10, 2019 at 11:22:44AM -0400, Josef Bacik wrote:
> On Mon, Oct 07, 2019 at 04:17:38PM -0400, Dennis Zhou wrote:
> > The prior two patches added discarding via a background workqueue. This
> > just piggybacked off of the fstrim code to trim the whole block at once.
> > Well inevitably this is worse performance wise and will aggressively
> > overtrim. But it was nice to plumb the other infrastructure to keep the
> > patches easier to review.
> > 
> > This adds the real goal of this series which is discarding slowly (ie a
> > slow long running fstrim). The discarding is split into two phases,
> > extents and then bitmaps. The reason for this is two fold. First, the
> > bitmap regions overlap the extent regions. Second, discarding the
> > extents first will let the newly trimmed bitmaps have the highest chance
> > of coalescing when being readded to the free space cache.
> > 
> > Signed-off-by: Dennis Zhou <dennis@xxxxxxxxxx>
> > ---
> >  fs/btrfs/block-group.h      |   2 +
> >  fs/btrfs/discard.c          |  73 ++++++++++++++++++++-----
> >  fs/btrfs/discard.h          |  16 ++++++
> >  fs/btrfs/extent-tree.c      |   3 +-
> >  fs/btrfs/free-space-cache.c | 106 ++++++++++++++++++++++++++----------
> >  fs/btrfs/free-space-cache.h |   6 +-
> >  6 files changed, 159 insertions(+), 47 deletions(-)
> > 
> > diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
> > index 0f9a1c91753f..b59e6a8ed73d 100644
> > --- a/fs/btrfs/block-group.h
> > +++ b/fs/btrfs/block-group.h
> > @@ -120,6 +120,8 @@ struct btrfs_block_group_cache {
> >  	struct list_head discard_list;
> >  	int discard_index;
> >  	u64 discard_delay;
> > +	u64 discard_cursor;
> > +	u32 discard_flags;
> >  
> 
> Same comment as the free space flags, this is just a state holder and never has
> more than one bit set, so switch it to an enum and treat it like a state.
> 

Yeah, I've switched it to an enum.
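
For reference, a minimal sketch of what that could look like (the names
here are illustrative, not necessarily what the respun patch uses):

enum btrfs_discard_state {
	BTRFS_DISCARD_EXTENTS,		/* trimming free extents */
	BTRFS_DISCARD_BITMAPS,		/* trimming bitmap regions */
	BTRFS_DISCARD_RESET_CURSOR,	/* restart from the block group start */
};

and in struct btrfs_block_group_cache the u32 discard_flags becomes:

	enum btrfs_discard_state discard_state;

Since a block group is only ever in one of these states at a time, this
avoids the flags-that-are-really-a-state problem you pointed out.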


> >  	/* For dirty block groups */
> >  	struct list_head dirty_list;
> > diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
> > index fb92b888774d..26a1e44b4bfa 100644
> > --- a/fs/btrfs/discard.c
> > +++ b/fs/btrfs/discard.c
> > @@ -22,21 +22,28 @@ btrfs_get_discard_list(struct btrfs_discard_ctl *discard_ctl,
> >  	return &discard_ctl->discard_list[cache->discard_index];
> >  }
> >  
> > -void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> > -			       struct btrfs_block_group_cache *cache)
> > +static void __btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> > +					struct btrfs_block_group_cache *cache)
> >  {
> >  	u64 now = ktime_get_ns();
> >  
> > -	spin_lock(&discard_ctl->lock);
> > -
> >  	if (list_empty(&cache->discard_list) || !cache->discard_index) {
> >  		if (!cache->discard_index)
> >  			cache->discard_index = 1;
> >  		cache->discard_delay = now + BTRFS_DISCARD_DELAY;
> > +		cache->discard_flags |= BTRFS_DISCARD_RESET_CURSOR;
> >  	}
> >  
> >  	list_move_tail(&cache->discard_list,
> >  		       btrfs_get_discard_list(discard_ctl, cache));
> > +}
> > +
> > +void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> > +			       struct btrfs_block_group_cache *cache)
> > +{
> > +	spin_lock(&discard_ctl->lock);
> > +
> > +	__btrfs_add_to_discard_list(discard_ctl, cache);
> >  
> >  	spin_unlock(&discard_ctl->lock);
> >  }
> > @@ -53,6 +60,7 @@ void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
> >  
> >  	cache->discard_index = 0;
> >  	cache->discard_delay = now;
> > +	cache->discard_flags |= BTRFS_DISCARD_RESET_CURSOR;
> >  	list_add_tail(&cache->discard_list, &discard_ctl->discard_list[0]);
> >  
> >  	spin_unlock(&discard_ctl->lock);
> > @@ -114,13 +122,24 @@ peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
> >  
> >  	spin_lock(&discard_ctl->lock);
> >  
> > +again:
> >  	cache = find_next_cache(discard_ctl, now);
> >  
> > -	if (cache && now < cache->discard_delay)
> > +	if (cache && now > cache->discard_delay) {
> > +		discard_ctl->cache = cache;
> > +		if (cache->discard_index == 0 &&
> > +		    cache->free_space_ctl->free_space != cache->key.offset) {
> > +			__btrfs_add_to_discard_list(discard_ctl, cache);
> > +			goto again;
> 
> The magic number thing again, it needs to be discard_index == UNUSED_DISCARD or
> some such descriptive thing.  Also needs to be btrfs_block_group_used(cache) ==
> 0;
> 

I added #defines along with an explanation of the names. The gist is that
0 is reserved for fully free block groups that are being trimmed before
going down the unused_bg free path, and >= 1 is a set of monotonically
decreasing filters used to prioritize which regions get trimmed.
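
As a rough sketch (again, the names are illustrative):

/*
 * Index 0 is reserved for fully free block groups being trimmed before
 * going down the unused_bg free path; indices >= 1 are monotonically
 * decreasing size filters used to prioritize which regions get trimmed.
 */
#define BTRFS_DISCARD_INDEX_UNUSED	0
#define BTRFS_DISCARD_INDEX_START	1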

> > +		}
> > +		if (btrfs_discard_reset_cursor(cache)) {
> > +			cache->discard_cursor = cache->key.objectid;
> > +			cache->discard_flags &= ~(BTRFS_DISCARD_RESET_CURSOR |
> > +						  BTRFS_DISCARD_BITMAPS);
> > +		}
> > +	} else {
> >  		cache = NULL;
> > -
> > -	discard_ctl->cache = cache;
> > -
> > +	}
> >  	spin_unlock(&discard_ctl->lock);
> >  
> >  	return cache;
> > @@ -173,18 +192,42 @@ static void btrfs_discard_workfn(struct work_struct *work)
> >  
> >  	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
> >  
> > +again:
> >  	cache = peek_discard_list(discard_ctl);
> >  	if (!cache || !btrfs_run_discard_work(discard_ctl))
> >  		return;
> >  
> > -	btrfs_trim_block_group(cache, &trimmed, cache->key.objectid,
> > -			       btrfs_block_group_end(cache), 0);
> > +	if (btrfs_discard_bitmaps(cache))
> > +		btrfs_trim_block_group_bitmaps(cache, &trimmed,
> > +					       cache->discard_cursor,
> > +					       btrfs_block_group_end(cache),
> > +					       0, true);
> > +	else
> > +		btrfs_trim_block_group(cache, &trimmed, cache->discard_cursor,
> > +				       btrfs_block_group_end(cache), 0, true);
> > +
> > +	if (cache->discard_cursor >= btrfs_block_group_end(cache)) {
> > +		if (btrfs_discard_bitmaps(cache)) {
> > +			remove_from_discard_list(discard_ctl, cache);
> > +			if (btrfs_is_free_space_trimmed(cache))
> > +				btrfs_mark_bg_unused(cache);
> > +			else if (cache->free_space_ctl->free_space ==
> > +				 cache->key.offset)
> 
> btrfs_block_group_used(cache) == 0;
> 

Fixed.
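
That is, following your suggestion, the check becomes something like:

			else if (btrfs_block_group_used(cache) == 0)
				btrfs_add_to_discard_free_list(discard_ctl,
							       cache);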

> > +				btrfs_add_to_discard_free_list(discard_ctl,
> > +							       cache);
> > +		} else {
> > +			cache->discard_cursor = cache->key.objectid;
> > +			cache->discard_flags |= BTRFS_DISCARD_BITMAPS;
> > +		}
> > +	}
> > +
> > +	spin_lock(&discard_ctl->lock);
> > +	discard_ctl->cache = NULL;
> > +	spin_unlock(&discard_ctl->lock);
> >  
> > -	remove_from_discard_list(discard_ctl, cache);
> > -	if (btrfs_is_free_space_trimmed(cache))
> > -		btrfs_mark_bg_unused(cache);
> > -	else if (cache->free_space_ctl->free_space == cache->key.offset)
> > -		btrfs_add_to_discard_free_list(discard_ctl, cache);
> > +	/* we didn't trim anything but we really ought to so try again */
> > +	if (trimmed == 0)
> > +		goto again;
> 
> Why?  We'll reschedule if we need to.  We unconditionally do this, I feel like
> there's going to be some corner case where we end up seeing this workfn using
> 100% on a bunch of sandcastle boxes and we'll all be super sad.
> 

I only added this as a "let's make as much forward progress as possible"
kind of deal, since the rate limits are pretty aggressive. Sandcastle
seemed fine with it, but I can take it out.
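
If the unconditional retry is the worry, a middle ground (just a sketch,
not what the patch currently does) would be to bound it, with retries
being a local counter in the workfn:

	/*
	 * Sketch only: cap the "nothing trimmed, try again" loop so a
	 * pathological block group can't keep the workfn spinning.
	 * BTRFS_DISCARD_MAX_RETRIES is a hypothetical constant.
	 */
	if (trimmed == 0 && ++retries < BTRFS_DISCARD_MAX_RETRIES)
		goto again;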

> >  
> >  	btrfs_discard_schedule_work(discard_ctl, false);
> >  }
> > diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
> > index 55f79b624943..22cfa7e401bb 100644
> > --- a/fs/btrfs/discard.h
> > +++ b/fs/btrfs/discard.h
> > @@ -13,6 +13,22 @@
> >  #include "block-group.h"
> >  #include "free-space-cache.h"
> >  
> > +/* discard flags */
> > +#define BTRFS_DISCARD_RESET_CURSOR	(1UL << 0)
> > +#define BTRFS_DISCARD_BITMAPS           (1UL << 1)
> > +
> > +static inline
> > +bool btrfs_discard_reset_cursor(struct btrfs_block_group_cache *cache)
> > +{
> > +	return (cache->discard_flags & BTRFS_DISCARD_RESET_CURSOR);
> > +}
> > +
> > +static inline
> > +bool btrfs_discard_bitmaps(struct btrfs_block_group_cache *cache)
> > +{
> > +	return (cache->discard_flags & BTRFS_DISCARD_BITMAPS);
> > +}
> > +
> >  void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> >  			       struct btrfs_block_group_cache *cache);
> >  void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
> > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> > index d69ee5f51b38..ff42e4abb01d 100644
> > --- a/fs/btrfs/extent-tree.c
> > +++ b/fs/btrfs/extent-tree.c
> > @@ -5683,7 +5683,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
> >  						     &group_trimmed,
> >  						     start,
> >  						     end,
> > -						     range->minlen);
> > +						     range->minlen,
> > +						     false);
> >  
> >  			trimmed += group_trimmed;
> >  			if (ret) {
> > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> > index ed0e7ee4c78d..97b3074e83c0 100644
> > --- a/fs/btrfs/free-space-cache.c
> > +++ b/fs/btrfs/free-space-cache.c
> > @@ -3267,7 +3267,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
> >  }
> >  
> >  static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> > -			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
> > +			  u64 *total_trimmed, u64 start, u64 end, u64 minlen,
> > +			  bool async)
> >  {
> >  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> >  	struct btrfs_free_space *entry;
> > @@ -3284,36 +3285,25 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> >  		mutex_lock(&ctl->cache_writeout_mutex);
> >  		spin_lock(&ctl->tree_lock);
> >  
> > -		if (ctl->free_space < minlen) {
> > -			spin_unlock(&ctl->tree_lock);
> > -			mutex_unlock(&ctl->cache_writeout_mutex);
> > -			break;
> > -		}
> > +		if (ctl->free_space < minlen)
> > +			goto out_unlock;
> >  
> >  		entry = tree_search_offset(ctl, start, 0, 1);
> > -		if (!entry) {
> > -			spin_unlock(&ctl->tree_lock);
> > -			mutex_unlock(&ctl->cache_writeout_mutex);
> > -			break;
> > -		}
> > +		if (!entry)
> > +			goto out_unlock;
> >  
> >  		/* skip bitmaps */
> > -		while (entry->bitmap) {
> > +		while (entry->bitmap || (async &&
> > +					 btrfs_free_space_trimmed(entry))) {
> 
> Update the comment to say we're skipping already trimmed entries as well please.
> 

Done.
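
i.e. roughly:

		/* skip bitmaps and, if async, entries that are already trimmed */
		while (entry->bitmap || (async &&
					 btrfs_free_space_trimmed(entry))) {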

> >  			node = rb_next(&entry->offset_index);
> > -			if (!node) {
> > -				spin_unlock(&ctl->tree_lock);
> > -				mutex_unlock(&ctl->cache_writeout_mutex);
> > -				goto out;
> > -			}
> > +			if (!node)
> > +				goto out_unlock;
> >  			entry = rb_entry(node, struct btrfs_free_space,
> >  					 offset_index);
> >  		}
> >  
> > -		if (entry->offset >= end) {
> > -			spin_unlock(&ctl->tree_lock);
> > -			mutex_unlock(&ctl->cache_writeout_mutex);
> > -			break;
> > -		}
> > +		if (entry->offset >= end)
> > +			goto out_unlock;
> >  
> >  		extent_start = entry->offset;
> >  		extent_bytes = entry->bytes;
> > @@ -3338,10 +3328,15 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> >  		ret = do_trimming(block_group, total_trimmed, start, bytes,
> >  				  extent_start, extent_bytes, extent_flags,
> >  				  &trim_entry);
> > -		if (ret)
> > +		if (ret) {
> > +			block_group->discard_cursor = start + bytes;
> >  			break;
> > +		}
> >  next:
> >  		start += bytes;
> > +		block_group->discard_cursor = start;
> > +		if (async && *total_trimmed)
> > +			break;
> 
> Alright so this means we'll only trim one entry and then return if we're async?
> It seems to be the same below for bitmaps.  This deserves a comment for the
> functions, it fundamentally changes the behavior of the function if we're async.
> 

Yeah, I added a short comment at the beginning of each function.
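
Something along these lines (the wording is just a sketch):

/*
 * If async is set, we are in the async discard path: trim at most one
 * untrimmed region and return, leaving block_group->discard_cursor where
 * we stopped so the next invocation can continue from there.  Synchronous
 * callers (fstrim) trim the full [start, end) range in one call.
 */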

> >  
> >  		if (fatal_signal_pending(current)) {
> >  			ret = -ERESTARTSYS;
> > @@ -3350,7 +3345,14 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> >  
> >  		cond_resched();
> >  	}
> > -out:
> > +
> > +	return ret;
> > +
> > +out_unlock:
> > +	block_group->discard_cursor = btrfs_block_group_end(block_group);
> > +	spin_unlock(&ctl->tree_lock);
> > +	mutex_unlock(&ctl->cache_writeout_mutex);
> > +
> >  	return ret;
> >  }
> >  
> > @@ -3390,7 +3392,8 @@ static void end_trimming_bitmap(struct btrfs_free_space *entry)
> >  }
> >  
> >  static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> > -			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
> > +			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
> > +			bool async)
> >  {
> >  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> >  	struct btrfs_free_space *entry;
> > @@ -3407,13 +3410,16 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> >  		spin_lock(&ctl->tree_lock);
> >  
> >  		if (ctl->free_space < minlen) {
> > +			block_group->discard_cursor =
> > +				btrfs_block_group_end(block_group);
> >  			spin_unlock(&ctl->tree_lock);
> >  			mutex_unlock(&ctl->cache_writeout_mutex);
> >  			break;
> >  		}
> >  
> >  		entry = tree_search_offset(ctl, offset, 1, 0);
> > -		if (!entry) {
> > +		if (!entry || (async && start == offset &&
> > +			       btrfs_free_space_trimmed(entry))) {
> >  			spin_unlock(&ctl->tree_lock);
> >  			mutex_unlock(&ctl->cache_writeout_mutex);
> >  			next_bitmap = true;
> > @@ -3446,6 +3452,16 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> >  			goto next;
> >  		}
> >  
> > +		/*
> > +		 * We already trimmed a region, but are using the locking above
> > +		 * to reset the BTRFS_FSC_TRIMMING_BITMAP flag.
> > +		 */
> > +		if (async && *total_trimmed) {
> > +			spin_unlock(&ctl->tree_lock);
> > +			mutex_unlock(&ctl->cache_writeout_mutex);
> > +			return ret;
> > +		}
> > +
> >  		bytes = min(bytes, end - start);
> >  		if (bytes < minlen) {
> >  			entry->flags &= ~BTRFS_FSC_TRIMMING_BITMAP;
> > @@ -3468,6 +3484,8 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> >  				  start, bytes, 0, &trim_entry);
> >  		if (ret) {
> >  			reset_trimming_bitmap(ctl, offset);
> > +			block_group->discard_cursor =
> > +				btrfs_block_group_end(block_group);
> >  			break;
> >  		}
> >  next:
> > @@ -3477,6 +3495,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> >  		} else {
> >  			start += bytes;
> >  		}
> > +		block_group->discard_cursor = start;
> >  
> >  		if (fatal_signal_pending(current)) {
> >  			if (start != offset)
> > @@ -3488,6 +3507,9 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> >  		cond_resched();
> >  	}
> >  
> > +	if (offset >= end)
> > +		block_group->discard_cursor = end;
> > +
> >  	return ret;
> >  }
> >  
> > @@ -3532,7 +3554,8 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
> >  }
> >  
> >  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
> > -			   u64 *trimmed, u64 start, u64 end, u64 minlen)
> > +			   u64 *trimmed, u64 start, u64 end, u64 minlen,
> > +			   bool async)
> >  {
> >  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> >  	int ret;
> > @@ -3547,11 +3570,11 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
> >  	btrfs_get_block_group_trimming(block_group);
> >  	spin_unlock(&block_group->lock);
> >  
> > -	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
> > -	if (ret)
> > +	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
> > +	if (ret || async)
> >  		goto out;
> >  
> 
> You already separate out btrfs_trim_block_group_bitmaps, so this function really
> only trims the whole block group if !async.  Make a separate helper for trimming
> only the extents so it's clear what the async stuff is doing, and we're not
> relying on the async to change the behavior of btrfs_trim_block_group().
> Thanks,

Done.
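
Roughly, the extent-only helper ends up looking something like this (a
sketch; the exact shape may differ in the respin):

int btrfs_trim_block_group_extents(struct btrfs_block_group_cache *block_group,
				   u64 *trimmed, u64 start, u64 end,
				   u64 minlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_get_block_group_trimming(block_group);
	spin_unlock(&block_group->lock);

	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);

	btrfs_put_block_group_trimming(block_group);
	return ret;
}

so btrfs_trim_block_group() keeps its current whole-block-group behavior
for fstrim, and the async worker calls the extent and bitmap helpers
explicitly.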

Thanks,
Dennis


