Re: [PATCH 3/3] Btrfs: account for large extents with enospc

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Feb 11, 2015 at 03:08:59PM -0500, Josef Bacik wrote:
> On our gluster boxes we stream large tar balls of backups onto our fses.  With
> 160gb of ram this means we get really large contiguous ranges of dirty data, but
> the way our ENOSPC stuff works is that as long as it's contiguous we only hold
> metadata reservation for one extent.  The problem is we limit our extents to
> 128mb, so we'll end up with at least 800 extents so our enospc accounting is
> quite a bit lower than what we need.  To keep track of this make sure we
> increase outstanding_extents for every multiple of the max extent size so we can
> be sure to have enough reserved metadata space.  Thanks,
> 
> Signed-off-by: Josef Bacik <jbacik@xxxxxx>
> ---
>  fs/btrfs/ctree.h       |  2 ++
>  fs/btrfs/extent-tree.c | 16 +++++++++----
>  fs/btrfs/extent_io.c   |  2 +-
>  fs/btrfs/inode.c       | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 76 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 0b4683f..1675602 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
>  
>  #define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
>  
> +#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
> +
>  /*
>   * The key defines the order in the tree, and so it also defines (optimal)
>   * block layout.
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index a346487..eb30b90 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4966,19 +4966,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
>  /**
>   * drop_outstanding_extent - drop an outstanding extent
>   * @inode: the inode we're dropping the extent for
> + * @num_bytes: the number of bytes we're relaseing.
>   *
>   * This is called when we are freeing up an outstanding extent, either called
>   * after an error or after an extent is written.  This will return the number of
>   * reserved extents that need to be freed.  This must be called with
>   * BTRFS_I(inode)->lock held.
>   */
> -static unsigned drop_outstanding_extent(struct inode *inode)
> +static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
>  {
>  	unsigned drop_inode_space = 0;
>  	unsigned dropped_extents = 0;
> +	unsigned num_extents = 0;
>  
> -	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
> -	BTRFS_I(inode)->outstanding_extents--;
> +	num_extents = (unsigned)div64_u64(num_bytes +
> +					  BTRFS_MAX_EXTENT_SIZE - 1,
> +					  BTRFS_MAX_EXTENT_SIZE);

A fastpath is better, like btrfs_merge_extent_hook().

(num_extents < BTRFS_MAX_EXTENT_SIZE) ? num_extents = 1 : (div64_u64(...))

> +	ASSERT(num_extents);
> +	ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
> +	BTRFS_I(inode)->outstanding_extents -= num_extents;
>  
>  	if (BTRFS_I(inode)->outstanding_extents == 0 &&
>  	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> @@ -5149,7 +5155,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
>  
>  out_fail:
>  	spin_lock(&BTRFS_I(inode)->lock);
> -	dropped = drop_outstanding_extent(inode);
> +	dropped = drop_outstanding_extent(inode, num_bytes);
>  	/*
>  	 * If the inodes csum_bytes is the same as the original
>  	 * csum_bytes then we know we haven't raced with any free()ers
> @@ -5228,7 +5234,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
>  
>  	num_bytes = ALIGN(num_bytes, root->sectorsize);
>  	spin_lock(&BTRFS_I(inode)->lock);
> -	dropped = drop_outstanding_extent(inode);
> +	dropped = drop_outstanding_extent(inode, num_bytes);
>  
>  	if (num_bytes)
>  		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 4ebabd2..3fbc177 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -3239,7 +3239,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
>  					       page,
>  					       &delalloc_start,
>  					       &delalloc_end,
> -					       128 * 1024 * 1024);
> +					       BTRFS_MAX_EXTENT_SIZE);
>  		if (nr_delalloc == 0) {
>  			delalloc_start = delalloc_end + 1;
>  			continue;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index fb16fd3..4564975 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
>  static void btrfs_split_extent_hook(struct inode *inode,
>  				    struct extent_state *orig, u64 split)
>  {
> +	u64 size;
> +
>  	/* not delalloc, ignore it */
>  	if (!(orig->state & EXTENT_DELALLOC))
>  		return;
>  
> +	size = orig->end - orig->start + 1;
> +	if (size > BTRFS_MAX_EXTENT_SIZE) {
> +		u64 num_extents;
> +		u64 new_size;
> +
> +		/*
> +		 * We need the largest size of the remaining extent to see if we
> +		 * need to add a new outstanding extent.  Think of the following
> +		 * case
> +		 *
> +		 * [MEAX_EXTENT_SIZEx2 - 4k][4k]

s/MEAX/MAX

> +		 *
> +		 * The new_size would just be 4k and we'd think we had enough
> +		 * outstanding extents for this if we only took one side of the
> +		 * split, same goes for the other direction.  We need to see if
> +		 * the larger size still is the same amount of extents as the
> +		 * original size, because if it is we need to add a new
> +		 * outstanding extent.  But if we split up and the larger size
> +		 * is less than the original then we are good to go since we've
> +		 * already accounted for the extra extent in our original
> +		 * accounting.
> +		 */
> +		new_size = orig->end - split + 1;
> +		if ((split - orig->start) > new_size)
> +			new_size = split - orig->start;
> +
> +		num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
> +					BTRFS_MAX_EXTENT_SIZE);
> +		if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
> +			      BTRFS_MAX_EXTENT_SIZE) < num_extents)

BTRFS_MAX_EXTENT_SIZE (128mb) means 1 << BTRFS_MAX_EXTENT_BIT (27), can we use '>>' instead of div64_u64?

Thanks,

-liubo
> +			return;
> +	}
> +
>  	spin_lock(&BTRFS_I(inode)->lock);
>  	BTRFS_I(inode)->outstanding_extents++;
>  	spin_unlock(&BTRFS_I(inode)->lock);
> @@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode,
>  				    struct extent_state *new,
>  				    struct extent_state *other)
>  {
> +	u64 new_size, old_size;
> +	u64 num_extents;
> +
>  	/* not delalloc, ignore it */
>  	if (!(other->state & EXTENT_DELALLOC))
>  		return;
>  
> +	old_size = other->end - other->start + 1;
> +	new_size = old_size + (new->end - new->start + 1);
> +
> +	/* we're not bigger than the max, unreserve the space and go */
> +	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
> +		spin_lock(&BTRFS_I(inode)->lock);
> +		BTRFS_I(inode)->outstanding_extents--;
> +		spin_unlock(&BTRFS_I(inode)->lock);
> +		return;
> +	}
> +
> +	/*
> +	 * If we grew by another max_extent, just return, we want to keep that
> +	 * reserved amount.
> +	 */
> +	num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
> +				BTRFS_MAX_EXTENT_SIZE);
> +	if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
> +		      BTRFS_MAX_EXTENT_SIZE) > num_extents)
> +		return;
> +
>  	spin_lock(&BTRFS_I(inode)->lock);
>  	BTRFS_I(inode)->outstanding_extents--;
>  	spin_unlock(&BTRFS_I(inode)->lock);
> @@ -1648,6 +1707,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
>  				 unsigned long *bits)
>  {
>  	u64 len = state->end + 1 - state->start;
> +	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
> +				    BTRFS_MAX_EXTENT_SIZE);
>  
>  	spin_lock(&BTRFS_I(inode)->lock);
>  	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
> @@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
>  			*bits &= ~EXTENT_FIRST_DELALLOC;
>  		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
>  			spin_lock(&BTRFS_I(inode)->lock);
> -			BTRFS_I(inode)->outstanding_extents--;
> +			BTRFS_I(inode)->outstanding_extents -= num_extents;
>  			spin_unlock(&BTRFS_I(inode)->lock);
>  		}
>  
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Filesystem Development]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux