Re: [PATCH 06/42] btrfs: introduce delayed_refs_rsv

On 28.09.2018 14:17, Josef Bacik wrote:
> From: Josef Bacik <jbacik@xxxxxx>
> 
> Traditionally we've had voodoo in btrfs to account for the space that
> delayed refs may take up by having a global_block_rsv.  This works most
> of the time, except when it doesn't.  We've had issues reported and seen
> in production where sometimes the global reserve is exhausted during
> transaction commit before we can run all of our delayed refs, resulting
> in an aborted transaction.  Because of this voodoo we have equally
> dubious flushing semantics around throttling delayed refs which we often
> get wrong.
> 
> So instead give them their own block_rsv.  This way we can always know
> exactly how much outstanding space we need for delayed refs.  This
> allows us to make sure we are constantly filling that reservation up
> with space, and allows us to put more precise pressure on the enospc
>  system.  Instead of doing math to see if it's a good time to throttle,
> the normal enospc code will be invoked if we have a lot of delayed refs
> pending, and they will be run via the normal flushing mechanism.
> 
> For now the delayed_refs_rsv will hold the reservations for the delayed
> refs, the block group updates, and deleting csums.  We could have a
> separate rsv for the block group updates, but the csum deletion stuff is
> still handled via the delayed_refs so that will stay there.
> 
> Signed-off-by: Josef Bacik <jbacik@xxxxxx>
> ---
>  fs/btrfs/ctree.h             |  27 +++--
>  fs/btrfs/delayed-ref.c       |  28 ++++-
>  fs/btrfs/disk-io.c           |   4 +
>  fs/btrfs/extent-tree.c       | 279 +++++++++++++++++++++++++++++++++++--------
>  fs/btrfs/inode.c             |   2 +-
>  fs/btrfs/transaction.c       |  77 ++++++------
>  include/trace/events/btrfs.h |   2 +
>  7 files changed, 312 insertions(+), 107 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 66f1d3895bca..1a2c3b629af2 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -452,8 +452,9 @@ struct btrfs_space_info {
>  #define	BTRFS_BLOCK_RSV_TRANS		3
>  #define	BTRFS_BLOCK_RSV_CHUNK		4
>  #define	BTRFS_BLOCK_RSV_DELOPS		5
> -#define	BTRFS_BLOCK_RSV_EMPTY		6
> -#define	BTRFS_BLOCK_RSV_TEMP		7
> +#define BTRFS_BLOCK_RSV_DELREFS		6
> +#define	BTRFS_BLOCK_RSV_EMPTY		7
> +#define	BTRFS_BLOCK_RSV_TEMP		8
>  
>  struct btrfs_block_rsv {
>  	u64 size;
> @@ -794,6 +795,8 @@ struct btrfs_fs_info {
>  	struct btrfs_block_rsv chunk_block_rsv;
>  	/* block reservation for delayed operations */
>  	struct btrfs_block_rsv delayed_block_rsv;
> +	/* block reservation for delayed refs */
> +	struct btrfs_block_rsv delayed_refs_rsv;
>  
>  	struct btrfs_block_rsv empty_block_rsv;
>  
> @@ -2608,8 +2611,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
>  
>  int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
>  				       struct btrfs_fs_info *fs_info);
> -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
> -				       struct btrfs_fs_info *fs_info);
> +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
>  void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
>  					 const u64 start);
>  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
> @@ -2723,10 +2725,12 @@ enum btrfs_reserve_flush_enum {
>  enum btrfs_flush_state {
>  	FLUSH_DELAYED_ITEMS_NR	=	1,
>  	FLUSH_DELAYED_ITEMS	=	2,
> -	FLUSH_DELALLOC		=	3,
> -	FLUSH_DELALLOC_WAIT	=	4,
> -	ALLOC_CHUNK		=	5,
> -	COMMIT_TRANS		=	6,
> +	FLUSH_DELAYED_REFS_NR	=	3,
> +	FLUSH_DELAYED_REFS	=	4,
> +	FLUSH_DELALLOC		=	5,
> +	FLUSH_DELALLOC_WAIT	=	6,
> +	ALLOC_CHUNK		=	7,
> +	COMMIT_TRANS		=	8,
>  };
>  
>  int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
> @@ -2777,6 +2781,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
>  void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
>  			     struct btrfs_block_rsv *block_rsv,
>  			     u64 num_bytes);
> +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
> +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
> +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info,
> +				enum btrfs_reserve_flush_enum flush);
> +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
> +				       struct btrfs_block_rsv *src,
> +				       u64 num_bytes);
>  int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
>  void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
>  void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 27f7dd4e3d52..96ce087747b2 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -467,11 +467,14 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
>   * existing and update must have the same bytenr
>   */
>  static noinline void
> -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
> +update_existing_head_ref(struct btrfs_trans_handle *trans,
>  			 struct btrfs_delayed_ref_head *existing,
>  			 struct btrfs_delayed_ref_head *update,
>  			 int *old_ref_mod_ret)
>  {
> +	struct btrfs_delayed_ref_root *delayed_refs =
> +		&trans->transaction->delayed_refs;
> +	struct btrfs_fs_info *fs_info = trans->fs_info;
>  	int old_ref_mod;
>  
>  	BUG_ON(existing->is_data != update->is_data);
> @@ -529,10 +532,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
>  	 * versa we need to make sure to adjust pending_csums accordingly.
>  	 */
>  	if (existing->is_data) {
> -		if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
> +		u64 csum_items =
> +			btrfs_csum_bytes_to_leaves(fs_info,
> +						   existing->num_bytes);
> +
> +		if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
>  			delayed_refs->pending_csums -= existing->num_bytes;
> -		if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
> +			btrfs_delayed_refs_rsv_release(fs_info, csum_items);
> +		}
> +		if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
>  			delayed_refs->pending_csums += existing->num_bytes;
> +			trans->delayed_ref_updates += csum_items;
> +		}
>  	}
>  	spin_unlock(&existing->lock);
>  }
> @@ -638,7 +649,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
>  			&& head_ref->qgroup_reserved
>  			&& existing->qgroup_ref_root
>  			&& existing->qgroup_reserved);
> -		update_existing_head_ref(delayed_refs, existing, head_ref,
> +		update_existing_head_ref(trans, existing, head_ref,
>  					 old_ref_mod);
>  		/*
>  		 * we've updated the existing ref, free the newly
> @@ -649,8 +660,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
>  	} else {
>  		if (old_ref_mod)
>  			*old_ref_mod = 0;
> -		if (head_ref->is_data && head_ref->ref_mod < 0)
> +		if (head_ref->is_data && head_ref->ref_mod < 0) {
>  			delayed_refs->pending_csums += head_ref->num_bytes;
> +			trans->delayed_ref_updates +=
> +				btrfs_csum_bytes_to_leaves(trans->fs_info,
> +							   head_ref->num_bytes);
> +		}
>  		delayed_refs->num_heads++;
>  		delayed_refs->num_heads_ready++;
>  		atomic_inc(&delayed_refs->num_entries);
> @@ -785,6 +800,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
>  
>  	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
>  	spin_unlock(&delayed_refs->lock);
> +	btrfs_update_delayed_refs_rsv(trans);

You haven't addressed my initial point about merging the modification of
delayed_ref_updates and the call to btrfs_update_delayed_refs_rsv into
one function; as it stands this seems error prone. I don't see why this
cannot be done; if there is some reason I'm missing, please explain it.

As it stands, this btrfs_update_delayed_refs_rsv call is paired with
modifications made in one of the 2nd-level callees:

 btrfs_add_delayed_tree_ref
   add_delayed_ref_head
    update_existing_head_ref

I'd rather have btrfs_update_delayed_refs_rsv renamed to something with
'inc' in its name and called every time we modify delayed_ref_updates.
I'm willing to bet 50 bucks that in 6 months' time someone will change
delayed_ref_updates and forget to call btrfs_update_delayed_refs_rsv.
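
Roughly what I have in mind (an untested sketch; the helper's name and
exact shape are just for illustration):

static void btrfs_inc_delayed_refs_rsv(struct btrfs_trans_handle *trans,
				       int nr_items)
{
	struct btrfs_block_rsv *delayed_rsv =
		&trans->fs_info->delayed_refs_rsv;
	/* One metadata item's worth of space per queued ref update. */
	u64 num_bytes = btrfs_calc_trans_metadata_size(trans->fs_info,
						       nr_items);

	spin_lock(&delayed_rsv->lock);
	delayed_rsv->size += num_bytes;
	delayed_rsv->full = 0;
	spin_unlock(&delayed_rsv->lock);
}

Then places like update_existing_head_ref would call
btrfs_inc_delayed_refs_rsv(trans, csum_items) instead of open-coding
trans->delayed_ref_updates += csum_items, and the rsv adjustment could
never be forgotten.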


WRT locking, in update_existing_head_ref we are guaranteed to hold
delayed_refs->lock, and the same is true in add_delayed_extent_op. The
only places where we don't hold it are the bg-related paths. But that's
easily solvable by simply breaking the function down into an internal
helper that does the actual work, with
lockdep_assert_held(&delayed_refs->lock) at the top, and a "public" API
which takes the lock and calls the helper. WRT performance, you would
not be putting much extra code in the critical section, i.e. just the
check plus the arithmetic of btrfs_calc_trans_metadata_size.
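
Something like this (again an untested sketch, reusing the logic of
btrfs_update_delayed_refs_rsv from this patch):

static void __btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	u64 num_bytes;

	/* Callers of the internal helper must hold the refs lock. */
	lockdep_assert_held(&trans->transaction->delayed_refs.lock);

	if (!trans->delayed_ref_updates)
		return;

	num_bytes = btrfs_calc_trans_metadata_size(fs_info,
						   trans->delayed_ref_updates);
	spin_lock(&delayed_rsv->lock);
	delayed_rsv->size += num_bytes;
	delayed_rsv->full = 0;
	spin_unlock(&delayed_rsv->lock);
	trans->delayed_ref_updates = 0;
}

/* "Public" variant for the bg-related paths that don't hold the lock. */
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;

	spin_lock(&delayed_refs->lock);
	__btrfs_update_delayed_refs_rsv(trans);
	spin_unlock(&delayed_refs->lock);
}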


>  
>  	trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
>  				   action == BTRFS_ADD_DELAYED_EXTENT ?
> @@ -866,6 +882,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
>  
>  	ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
>  	spin_unlock(&delayed_refs->lock);
> +	btrfs_update_delayed_refs_rsv(trans);
>  
>  	trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
>  				   action == BTRFS_ADD_DELAYED_EXTENT ?
> @@ -903,6 +920,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
>  			     NULL, NULL, NULL);
>  
>  	spin_unlock(&delayed_refs->lock);
> +	btrfs_update_delayed_refs_rsv(trans);
>  	return 0;
>  }
>  
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 5124c15705ce..377ad9c1cb17 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2692,6 +2692,9 @@ int open_ctree(struct super_block *sb,
>  	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
>  	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
>  			     BTRFS_BLOCK_RSV_DELOPS);
> +	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
> +			     BTRFS_BLOCK_RSV_DELREFS);
> +
>  	atomic_set(&fs_info->async_delalloc_pages, 0);
>  	atomic_set(&fs_info->defrag_running, 0);
>  	atomic_set(&fs_info->qgroup_op_seq, 0);
> @@ -4419,6 +4422,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
>  
>  		spin_unlock(&cur_trans->dirty_bgs_lock);
>  		btrfs_put_block_group(cache);
> +		btrfs_delayed_refs_rsv_release(fs_info, 1);
>  		spin_lock(&cur_trans->dirty_bgs_lock);
>  	}
>  	spin_unlock(&cur_trans->dirty_bgs_lock);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index b32bd38390dd..1213f573eea2 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2481,6 +2481,7 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
>  	struct btrfs_fs_info *fs_info = trans->fs_info;
>  	struct btrfs_delayed_ref_root *delayed_refs =
>  		&trans->transaction->delayed_refs;
> +	int nr_items = 1;
>  
>  	if (head->total_ref_mod < 0) {
>  		struct btrfs_space_info *space_info;
> @@ -2502,12 +2503,15 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
>  			spin_lock(&delayed_refs->lock);
>  			delayed_refs->pending_csums -= head->num_bytes;
>  			spin_unlock(&delayed_refs->lock);
> +			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
> +				head->num_bytes);
>  		}
>  	}
>  
>  	/* Also free its reserved qgroup space */
>  	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
>  				      head->qgroup_reserved);
> +	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
>  }
>  
>  static int cleanup_ref_head(struct btrfs_trans_handle *trans,
> @@ -2802,40 +2806,22 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
>  	return num_csums;
>  }
>  
> -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
> -				       struct btrfs_fs_info *fs_info)
> +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
>  {
> -	struct btrfs_block_rsv *global_rsv;
> -	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
> -	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
> -	unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
> -	u64 num_bytes, num_dirty_bgs_bytes;
> -	int ret = 0;
> -
> -	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
> -	num_heads = heads_to_leaves(fs_info, num_heads);
> -	if (num_heads > 1)
> -		num_bytes += (num_heads - 1) * fs_info->nodesize;
> -	num_bytes <<= 1;
> -	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
> -							fs_info->nodesize;
> -	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
> -							     num_dirty_bgs);
> -	global_rsv = &fs_info->global_block_rsv;
> -
> -	/*
> -	 * If we can't allocate any more chunks lets make sure we have _lots_ of
> -	 * wiggle room since running delayed refs can create more delayed refs.
> -	 */
> -	if (global_rsv->space_info->full) {
> -		num_dirty_bgs_bytes <<= 1;
> -		num_bytes <<= 1;
> -	}
> +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> +	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
> +	u64 reserved;
> +	bool ret = false;
>  
>  	spin_lock(&global_rsv->lock);
> -	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
> -		ret = 1;
> +	reserved = global_rsv->reserved;
>  	spin_unlock(&global_rsv->lock);
> +
> +	spin_lock(&delayed_refs_rsv->lock);
> +	reserved += delayed_refs_rsv->reserved;
> +	if (delayed_refs_rsv->size >= reserved)
> +		ret = true;
> +	spin_unlock(&delayed_refs_rsv->lock);
>  	return ret;
>  }
>  
> @@ -2855,7 +2841,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
>  	if (val >= NSEC_PER_SEC / 2)
>  		return 2;
>  
> -	return btrfs_check_space_for_delayed_refs(trans, fs_info);
> +	return btrfs_check_space_for_delayed_refs(fs_info) ? 1 : 0;
>  }
>  
>  struct async_delayed_refs {
> @@ -3610,6 +3596,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
>  	 */
>  	mutex_lock(&trans->transaction->cache_write_mutex);
>  	while (!list_empty(&dirty)) {
> +		bool drop_reserve = true;
> +
>  		cache = list_first_entry(&dirty,
>  					 struct btrfs_block_group_cache,
>  					 dirty_list);
> @@ -3682,6 +3670,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
>  					list_add_tail(&cache->dirty_list,
>  						      &cur_trans->dirty_bgs);
>  					btrfs_get_block_group(cache);
> +					drop_reserve = false;
>  				}
>  				spin_unlock(&cur_trans->dirty_bgs_lock);
>  			} else if (ret) {
> @@ -3692,6 +3681,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
>  		/* if its not on the io list, we need to put the block group */
>  		if (should_put)
>  			btrfs_put_block_group(cache);
> +		if (drop_reserve)
> +			btrfs_delayed_refs_rsv_release(fs_info, 1);
>  
>  		if (ret)
>  			break;
> @@ -3840,6 +3831,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
>  		/* if its not on the io list, we need to put the block group */
>  		if (should_put)
>  			btrfs_put_block_group(cache);
> +		btrfs_delayed_refs_rsv_release(fs_info, 1);
>  		spin_lock(&cur_trans->dirty_bgs_lock);
>  	}
>  	spin_unlock(&cur_trans->dirty_bgs_lock);
> @@ -4816,8 +4808,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
>  {
>  	struct reserve_ticket *ticket = NULL;
>  	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
> +	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
>  	struct btrfs_trans_handle *trans;
>  	u64 bytes;
> +	u64 reclaim_bytes = 0;
>  
>  	trans = (struct btrfs_trans_handle *)current->journal_info;
>  	if (trans)
> @@ -4850,12 +4844,16 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
>  		return -ENOSPC;
>  
>  	spin_lock(&delayed_rsv->lock);
> -	if (delayed_rsv->size > bytes)
> -		bytes = 0;
> -	else
> -		bytes -= delayed_rsv->size;
> +	reclaim_bytes += delayed_rsv->reserved;
>  	spin_unlock(&delayed_rsv->lock);
>  
> +	spin_lock(&delayed_refs_rsv->lock);
> +	reclaim_bytes += delayed_refs_rsv->reserved;
> +	spin_unlock(&delayed_refs_rsv->lock);
> +	if (reclaim_bytes >= bytes)
> +		goto commit;
> +	bytes -= reclaim_bytes;
> +
>  	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
>  				   bytes,
>  				   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
> @@ -4905,6 +4903,20 @@ static void flush_space(struct btrfs_fs_info *fs_info,
>  		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
>  				state == FLUSH_DELALLOC_WAIT);
>  		break;
> +	case FLUSH_DELAYED_REFS_NR:
> +	case FLUSH_DELAYED_REFS:
> +		trans = btrfs_join_transaction(root);
> +		if (IS_ERR(trans)) {
> +			ret = PTR_ERR(trans);
> +			break;
> +		}
> +		if (state == FLUSH_DELAYED_REFS_NR)
> +			nr = calc_reclaim_items_nr(fs_info, num_bytes);
> +		else
> +			nr = 0;
> +		btrfs_run_delayed_refs(trans, nr);
> +		btrfs_end_transaction(trans);
> +		break;
>  	case ALLOC_CHUNK:
>  		trans = btrfs_join_transaction(root);
>  		if (IS_ERR(trans)) {
> @@ -5377,6 +5389,91 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
>  	return 0;
>  }
>  
> +/**
> + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
> + * @fs_info - the fs info for our fs.
> + * @src - the source block rsv to transfer from.
> + * @num_bytes - the number of bytes to transfer.
> + *
> + * This transfers up to the num_bytes amount from the src rsv to the
> + * delayed_refs_rsv.  Any extra bytes are returned to the space info.
> + */
> +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
> +				       struct btrfs_block_rsv *src,
> +				       u64 num_bytes)
> +{
> +	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
> +	u64 to_free = 0;
> +
> +	spin_lock(&src->lock);
> +	src->reserved -= num_bytes;
> +	src->size -= num_bytes;
> +	spin_unlock(&src->lock);
> +
> +	spin_lock(&delayed_refs_rsv->lock);
> +	if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
> +		u64 delta = delayed_refs_rsv->size -
> +			delayed_refs_rsv->reserved;
> +		if (num_bytes > delta) {
> +			to_free = num_bytes - delta;
> +			num_bytes = delta;
> +		}
> +	} else {
> +		to_free = num_bytes;
> +		num_bytes = 0;
> +	}
> +
> +	if (num_bytes)
> +		delayed_refs_rsv->reserved += num_bytes;
> +	if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
> +		delayed_refs_rsv->full = 1;
> +	spin_unlock(&delayed_refs_rsv->lock);
> +
> +	if (num_bytes)
> +		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
> +					      0, num_bytes, 1);
> +	if (to_free)
> +		space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
> +					 to_free);
> +}
> +
> +/**
> + * btrfs_throttle_delayed_refs - throttle based on our delayed refs usage.
> + * @fs_info - the fs_info for our fs.
> + * @flush - control how we can flush for this reservation.
> + *
> + * This will refill the delayed refs rsv with up to one item's worth of space and
> + * will return -ENOSPC if we can't make the reservation.
> + */
> +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info,
> +				enum btrfs_reserve_flush_enum flush)
> +{
> +	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
> +	u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
> +	u64 num_bytes = 0;
> +	int ret = -ENOSPC;
> +
> +	spin_lock(&block_rsv->lock);
> +	if (block_rsv->reserved < block_rsv->size) {
> +		num_bytes = block_rsv->size - block_rsv->reserved;
> +		num_bytes = min(num_bytes, limit);
> +	}
> +	spin_unlock(&block_rsv->lock);
> +
> +	if (!num_bytes)
> +		return 0;
> +
> +	ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
> +				     num_bytes, flush);
> +	if (ret)
> +		return ret;
> +	block_rsv_add_bytes(block_rsv, num_bytes, 0);
> +	trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
> +				      0, num_bytes, 1);
> +	return 0;
> +}
> +
> +
>  /*
>   * This is for space we already have accounted in space_info->bytes_may_use, so
>   * basically when we're returning space from block_rsv's.
> @@ -5699,6 +5796,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
>  	return ret;
>  }
>  
> +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> +				     struct btrfs_block_rsv *block_rsv,
> +				     u64 num_bytes, u64 *qgroup_to_release)
> +{
> +	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> +	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
> +	struct btrfs_block_rsv *target = delayed_rsv;
> +
> +	if (target->full || target == block_rsv)
> +		target = global_rsv;
> +
> +	if (block_rsv->space_info != target->space_info)
> +		target = NULL;
> +
> +	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
> +				       qgroup_to_release);
> +}
> +
> +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> +			     struct btrfs_block_rsv *block_rsv,
> +			     u64 num_bytes)
> +{
> +	__btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
> +}
> +
>  /**
>   * btrfs_inode_rsv_release - release any excessive reservation.
>   * @inode - the inode we need to release from.
> @@ -5713,7 +5835,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
>  static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
>  {
>  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
> -	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
>  	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
>  	u64 released = 0;
>  	u64 qgroup_to_release = 0;
> @@ -5723,8 +5844,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
>  	 * are releasing 0 bytes, and then we'll just get the reservation over
>  	 * the size free'd.
>  	 */
> -	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
> -					   &qgroup_to_release);
> +	released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
> +					     &qgroup_to_release);
>  	if (released > 0)
>  		trace_btrfs_space_reservation(fs_info, "delalloc",
>  					      btrfs_ino(inode), released, 0);
> @@ -5735,16 +5856,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
>  						   qgroup_to_release);
>  }
>  
> -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> -			     struct btrfs_block_rsv *block_rsv,
> -			     u64 num_bytes)
> +/**
> + * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
> + * @fs_info - the fs_info for our fs.
> + * @nr - the number of items to drop.
> + *
> + * This drops the delayed ref head's count from the delayed refs rsv and frees
> + * any excess reservation we had.
> + */
> +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
>  {
> +	struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
>  	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> +	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
> +	u64 released = 0;
>  
> -	if (global_rsv == block_rsv ||
> -	    block_rsv->space_info != global_rsv->space_info)
> -		global_rsv = NULL;
> -	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
> +	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
> +					   num_bytes, NULL);
> +	if (released)
> +		trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
> +					      0, released, 0);
>  }
>  
>  static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
> @@ -5809,9 +5940,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
>  	fs_info->trans_block_rsv.space_info = space_info;
>  	fs_info->empty_block_rsv.space_info = space_info;
>  	fs_info->delayed_block_rsv.space_info = space_info;
> +	fs_info->delayed_refs_rsv.space_info = space_info;
>  
> -	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
> -	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
> +	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
> +	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
>  	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
>  	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
>  	if (fs_info->quota_root)
> @@ -5831,8 +5963,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
>  	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
>  	WARN_ON(fs_info->delayed_block_rsv.size > 0);
>  	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
> +	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
> +	WARN_ON(fs_info->delayed_refs_rsv.size > 0);
>  }
>  
> +/*
> + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
> + * @trans - the trans that may have generated delayed refs
> + *
> + * This is to be called any time we may have adjusted trans->delayed_ref_updates;
> + * it will calculate the additional size and add it to the delayed_refs_rsv.
> + */
> +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
> +{
> +	struct btrfs_fs_info *fs_info = trans->fs_info;
> +	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
> +	u64 num_bytes;
> +
> +	if (!trans->delayed_ref_updates)
> +		return;
> +
> +	num_bytes = btrfs_calc_trans_metadata_size(fs_info,
> +						   trans->delayed_ref_updates);
> +	spin_lock(&delayed_rsv->lock);
> +	delayed_rsv->size += num_bytes;
> +	delayed_rsv->full = 0;
> +	spin_unlock(&delayed_rsv->lock);
> +	trans->delayed_ref_updates = 0;
> +}
>  
>  /*
>   * To be called after all the new block groups attached to the transaction
> @@ -6126,6 +6284,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
>  	u64 old_val;
>  	u64 byte_in_group;
>  	int factor;
> +	int ret = 0;
>  
>  	/* block accounting for super block */
>  	spin_lock(&info->delalloc_root_lock);
> @@ -6139,8 +6298,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
>  
>  	while (total) {
>  		cache = btrfs_lookup_block_group(info, bytenr);
> -		if (!cache)
> -			return -ENOENT;
> +		if (!cache) {
> +			ret = -ENOENT;
> +			break;
> +		}
>  		factor = btrfs_bg_type_to_factor(cache->flags);
>  
>  		/*
> @@ -6199,6 +6360,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
>  			list_add_tail(&cache->dirty_list,
>  				      &trans->transaction->dirty_bgs);
>  			trans->transaction->num_dirty_bgs++;
> +			trans->delayed_ref_updates++;
>  			btrfs_get_block_group(cache);
>  		}
>  		spin_unlock(&trans->transaction->dirty_bgs_lock);
> @@ -6216,7 +6378,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
>  		total -= num_bytes;
>  		bytenr += num_bytes;
>  	}
> -	return 0;
> +
> +	/* Modified block groups are accounted for in the delayed_refs_rsv. */
> +	btrfs_update_delayed_refs_rsv(trans);
> +	return ret;
>  }
>  
>  static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
> @@ -8230,7 +8395,12 @@ use_block_rsv(struct btrfs_trans_handle *trans,
>  		goto again;
>  	}
>  
> -	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
> +	/*
> +	 * The global reserve still exists to save us from ourselves, so don't
> +	 * warn_on if we are short on our delayed refs reserve.
> +	 */
> +	if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
> +	    btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
>  		static DEFINE_RATELIMIT_STATE(_rs,
>  				DEFAULT_RATELIMIT_INTERVAL * 10,
>  				/*DEFAULT_RATELIMIT_BURST*/ 1);
> @@ -10146,6 +10316,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
>  		add_block_group_free_space(trans, block_group);
>  		/* already aborted the transaction if it failed. */
>  next:
> +		btrfs_delayed_refs_rsv_release(fs_info, 1);
>  		list_del_init(&block_group->bg_list);
>  	}
>  	trans->can_flush_pending_bgs = can_flush_pending_bgs;
> @@ -10223,6 +10394,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
>  	link_block_group(cache);
>  
>  	list_add_tail(&cache->bg_list, &trans->new_bgs);
> +	trans->delayed_ref_updates++;
> +	btrfs_update_delayed_refs_rsv(trans);
>  
>  	set_avail_alloc_bits(fs_info, type);
>  	return 0;
> @@ -10260,6 +10433,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  	int factor;
>  	struct btrfs_caching_control *caching_ctl = NULL;
>  	bool remove_em;
> +	bool remove_rsv = false;
>  
>  	block_group = btrfs_lookup_block_group(fs_info, group_start);
>  	BUG_ON(!block_group);
> @@ -10324,6 +10498,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  
>  	if (!list_empty(&block_group->dirty_list)) {
>  		list_del_init(&block_group->dirty_list);
> +		remove_rsv = true;
>  		btrfs_put_block_group(block_group);
>  	}
>  	spin_unlock(&trans->transaction->dirty_bgs_lock);
> @@ -10533,6 +10708,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  
>  	ret = btrfs_del_item(trans, root, path);
>  out:
> +	if (remove_rsv)
> +		btrfs_delayed_refs_rsv_release(fs_info, 1);
>  	btrfs_free_path(path);
>  	return ret;
>  }
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 212fa71317d6..cd00ec869c96 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -5382,7 +5382,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
>  		 * Try to steal from the global reserve if there is space for
>  		 * it.
>  		 */
> -		if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
> +		if (!btrfs_check_space_for_delayed_refs(fs_info) &&
>  		    !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
>  			return trans;
>  
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 3b84f5015029..117e0c4a914a 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -455,7 +455,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>  		  bool enforce_qgroups)
>  {
>  	struct btrfs_fs_info *fs_info = root->fs_info;
> -
> +	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
>  	struct btrfs_trans_handle *h;
>  	struct btrfs_transaction *cur_trans;
>  	u64 num_bytes = 0;
> @@ -484,13 +484,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>  	 * the appropriate flushing if need be.
>  	 */
>  	if (num_items && root != fs_info->chunk_root) {
> +		struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
> +		u64 delayed_refs_bytes = 0;
> +
>  		qgroup_reserved = num_items * fs_info->nodesize;
>  		ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
>  				enforce_qgroups);
>  		if (ret)
>  			return ERR_PTR(ret);
>  
> +		/*
> +		 * We want to reserve all the bytes we may need all at once, so
> +		 * we only do 1 enospc flushing cycle per transaction start.  We
> +		 * accomplish this by simply assuming we'll do 2 x num_items
> +		 * worth of delayed refs updates in this trans handle, and
> +		 * refill that amount for whatever is missing in the reserve.
> +		 */
>  		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
> +		if (delayed_refs_rsv->full == 0) {
> +			delayed_refs_bytes = num_bytes;
> +			num_bytes <<= 1;
> +		}
> +
>  		/*
>  		 * Do the reservation for the relocation root creation
>  		 */
> @@ -499,8 +514,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>  			reloc_reserved = true;
>  		}
>  
> -		ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
> -					  num_bytes, flush);
> +		ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
> +		if (ret)
> +			goto reserve_fail;
> +		if (delayed_refs_bytes) {
> +			btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
> +							  delayed_refs_bytes);
> +			num_bytes -= delayed_refs_bytes;
> +		}
> +	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
> +		   !delayed_refs_rsv->full) {
> +		/*
> +		 * Some people call with btrfs_start_transaction(root, 0)
> +		 * because they can be throttled, but have some other mechanism
> +		 * for reserving space.  We still want these guys to refill the
> +		 * delayed refs rsv, so just add one item's worth of reservation
> +		 * here.
> +		 */
> +		ret = btrfs_throttle_delayed_refs(fs_info, flush);
>  		if (ret)
>  			goto reserve_fail;
>  	}
> @@ -759,7 +790,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
>  {
>  	struct btrfs_fs_info *fs_info = trans->fs_info;
>  
> -	if (btrfs_check_space_for_delayed_refs(trans, fs_info))
> +	if (btrfs_check_space_for_delayed_refs(fs_info))
>  		return 1;
>  
>  	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
> @@ -768,22 +799,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
>  int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
>  {
>  	struct btrfs_transaction *cur_trans = trans->transaction;
> -	int updates;
> -	int err;
>  
>  	smp_mb();
>  	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
>  	    cur_trans->delayed_refs.flushing)
>  		return 1;
>  
> -	updates = trans->delayed_ref_updates;
> -	trans->delayed_ref_updates = 0;
> -	if (updates) {
> -		err = btrfs_run_delayed_refs(trans, updates * 2);
> -		if (err) /* Error code will also eval true */
> -			return err;
> -	}
> -
>  	return should_end_transaction(trans);
>  }
>  
> @@ -813,11 +834,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
>  {
>  	struct btrfs_fs_info *info = trans->fs_info;
>  	struct btrfs_transaction *cur_trans = trans->transaction;
> -	u64 transid = trans->transid;
> -	unsigned long cur = trans->delayed_ref_updates;
>  	int lock = (trans->type != TRANS_JOIN_NOLOCK);
>  	int err = 0;
> -	int must_run_delayed_refs = 0;
>  
>  	if (refcount_read(&trans->use_count) > 1) {
>  		refcount_dec(&trans->use_count);
> @@ -828,27 +846,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
>  	btrfs_trans_release_metadata(trans);
>  	trans->block_rsv = NULL;
>  
> -	if (!list_empty(&trans->new_bgs))
> -		btrfs_create_pending_block_groups(trans);
> -
> -	trans->delayed_ref_updates = 0;
> -	if (!trans->sync) {
> -		must_run_delayed_refs =
> -			btrfs_should_throttle_delayed_refs(trans, info);
> -		cur = max_t(unsigned long, cur, 32);
> -
> -		/*
> -		 * don't make the caller wait if they are from a NOLOCK
> -		 * or ATTACH transaction, it will deadlock with commit
> -		 */
> -		if (must_run_delayed_refs == 1 &&
> -		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
> -			must_run_delayed_refs = 2;
> -	}
> -
> -	btrfs_trans_release_metadata(trans);
> -	trans->block_rsv = NULL;
> -
>  	if (!list_empty(&trans->new_bgs))
>  		btrfs_create_pending_block_groups(trans);
>  
> @@ -893,10 +890,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
>  	}
>  
>  	kmem_cache_free(btrfs_trans_handle_cachep, trans);
> -	if (must_run_delayed_refs) {
> -		btrfs_async_run_delayed_refs(info, cur, transid,
> -					     must_run_delayed_refs == 1);
> -	}
>  	return err;
>  }
>  
> diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
> index b401c4e36394..7d205e50b09c 100644
> --- a/include/trace/events/btrfs.h
> +++ b/include/trace/events/btrfs.h
> @@ -1048,6 +1048,8 @@ TRACE_EVENT(btrfs_trigger_flush,
>  		{ FLUSH_DELAYED_ITEMS,		"FLUSH_DELAYED_ITEMS"},		\
>  		{ FLUSH_DELALLOC,		"FLUSH_DELALLOC"},		\
>  		{ FLUSH_DELALLOC_WAIT,		"FLUSH_DELALLOC_WAIT"},		\
> +		{ FLUSH_DELAYED_REFS_NR,	"FLUSH_DELAYED_REFS_NR"},	\
> +		{ FLUSH_DELAYED_REFS,		"FLUSH_DELAYED_REFS"},		\
>  		{ ALLOC_CHUNK,			"ALLOC_CHUNK"},			\
>  		{ COMMIT_TRANS,			"COMMIT_TRANS"})
>  
> 


