Re: [PATCH 2/2] Btrfs: serialize flushers in reserve_metadata_bytes

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Jun 07, 2011 at 04:44:09PM -0400, Josef Bacik wrote:
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2932,6 +2932,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
>  	found->full = 0;
>  	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
>  	found->chunk_alloc = 0;
> +	found->flush = 0;
> +	init_waitqueue_head(&found->wait);
>  	*space_info = found;
>  	list_add_rcu(&found->list, &info->space_info);
>  	atomic_set(&found->caching_threads, 0);
> @@ -3314,9 +3316,13 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
>  	if (reserved == 0)
>  		return 0;
>  
> -	/* nothing to shrink - nothing to reclaim */
> -	if (root->fs_info->delalloc_bytes == 0)
> +	smp_mb();

can you please explain why do you use the barrier here? (and add that
explanation as a comment)

it's for the delalloc_bytes test, right? this is being modified in
btrfs_set_bit_hook/btrfs_clear_bit_hook, protected by a spinlock.
the counter is sum of all delalloc_bytes for each delalloc inode.
if the counter is nonzero, then fs_info->delalloc_inodes is expected to
be nonempty, but the list is not touched in this function, but
indirectly from writeback_inodes_sb_nr_if_idle and the respective
.writepages callback, ending up in __extent_writepage which starts
messing with delalloc.

it think it's possible to reach state, where counter is nonzero, but the
delalloc_inodes list is empty. then writeback will just skip the
delalloc work in this round and will process it later.
even with a barrier in place:

btrfs_set_bit_hook:
# counter increased, a barrier will assure len is obtained from
# delalloc_bytes in shrink_delalloc
1349                 root->fs_info->delalloc_bytes += len;
# but fs_info->delalloc_list may be empty
1350                 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1351				list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1352				&root->fs_info->delalloc_inodes);
1353                 }

although this does not seem to crash or cause corruption, I suggest to
use the spinlock as the synchronization mechanism to protect reading
fs_info->delalloc_bytes


david



> +	if (root->fs_info->delalloc_bytes == 0) {
> +		if (trans)
> +			return 0;
> +		btrfs_wait_ordered_extents(root, 0, 0);
>  		return 0;
> +	}
>  
>  	max_reclaim = min(reserved, to_reclaim);
>  
> @@ -3360,6 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
>  		}
>  
>  	}
> +	if (reclaimed >= to_reclaim && !trans)
> +		btrfs_wait_ordered_extents(root, 0, 0);
>  	return reclaimed >= to_reclaim;
>  }
>  
> @@ -3384,15 +3392,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
>  	u64 num_bytes = orig_bytes;
>  	int retries = 0;
>  	int ret = 0;
> -	bool reserved = false;
>  	bool committed = false;
> +	bool flushing = false;
>  
>  again:
> -	ret = -ENOSPC;
> -	if (reserved)
> -		num_bytes = 0;
> -
> +	ret = 0;
>  	spin_lock(&space_info->lock);
> +	/*
> +	 * We only want to wait if somebody other than us is flushing and we are
> +	 * actually alloed to flush.
> +	 */
> +	while (flush && !flushing && space_info->flush) {
> +		spin_unlock(&space_info->lock);
> +		/*
> +		 * If we have a trans handle we can't wait because the flusher
> +		 * may have to commit the transaction, which would mean we would
> +		 * deadlock since we are waiting for the flusher to finish, but
> +		 * hold the current transaction open.
> +		 */
> +		if (trans)
> +			return -EAGAIN;
> +		ret = wait_event_interruptible(space_info->wait,
> +					       !space_info->flush);
> +		/* Must have been interrupted, return */
> +		if (ret)
> +			return -EINTR;
> +
> +		spin_lock(&space_info->lock);
> +	}
> +
> +	ret = -ENOSPC;
>  	unused = space_info->bytes_used + space_info->bytes_reserved +
>  		 space_info->bytes_pinned + space_info->bytes_readonly +
>  		 space_info->bytes_may_use;
> @@ -3407,8 +3436,7 @@ again:
>  	if (unused <= space_info->total_bytes) {
>  		unused = space_info->total_bytes - unused;
>  		if (unused >= num_bytes) {
> -			if (!reserved)
> -				space_info->bytes_may_use += orig_bytes;
> +			space_info->bytes_may_use += orig_bytes;
>  			ret = 0;
>  		} else {
>  			/*
> @@ -3433,17 +3461,14 @@ again:
>  	 * to reclaim space we can actually use it instead of somebody else
>  	 * stealing it from us.
>  	 */
> -	if (ret && !reserved) {
> -		space_info->bytes_may_use += orig_bytes;
> -		reserved = true;
> +	if (ret && flush) {
> +		flushing = true;
> +		space_info->flush = 1;
>  	}
>  
>  	spin_unlock(&space_info->lock);
>  
> -	if (!ret)
> -		return 0;
> -
> -	if (!flush)
> +	if (!ret || !flush)
>  		goto out;
>  
>  	/*
> @@ -3451,9 +3476,7 @@ again:
>  	 * metadata until after the IO is completed.
>  	 */
>  	ret = shrink_delalloc(trans, root, num_bytes, 1);
> -	if (ret > 0)
> -		return 0;
> -	else if (ret < 0)
> +	if (ret < 0)
>  		goto out;
>  
>  	/*
> @@ -3466,11 +3489,11 @@ again:
>  		goto again;
>  	}
>  
> -	spin_lock(&space_info->lock);
>  	/*
>  	 * Not enough space to be reclaimed, don't bother committing the
>  	 * transaction.
>  	 */
> +	spin_lock(&space_info->lock);
>  	if (space_info->bytes_pinned < orig_bytes)
>  		ret = -ENOSPC;
>  	spin_unlock(&space_info->lock);
> @@ -3493,12 +3516,12 @@ again:
>  	}
>  
>  out:
> -	if (reserved) {
> +	if (flushing) {
>  		spin_lock(&space_info->lock);
> -		space_info->bytes_may_use -= orig_bytes;
> +		space_info->flush = 0;
> +		wake_up_all(&space_info->wait);
>  		spin_unlock(&space_info->lock);
>  	}
> -
>  	return ret;
>  }
>  
> -- 
> 1.7.2.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Filesystem Development]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux