Re: [RFC PATCH 2/2] Btrfs: introduce free space cluster for each node

Any comments?

On Thu, 08 Sep 2011 16:18:09 +0800, Miao Xie wrote:
> This patch introduces a free space cluster for each node in the b+tree, and
> also simplifies how the cluster is filled and how space is allocated from it:
> - Allocate a free space cluster for each node.
> - Allocate a free space extent (>=256KB, by splitting a large extent or by
>   taking contiguous blocks from the bitmaps) from the block group cache and
>   cache it in the cluster, instead of moving free space entries from the
>   block group cache into the cluster directly. This way, we manage just one
>   extent in the cluster.
> - When allocating a tree block, we can get the space simply by splitting the
>   extent in the parent's cluster. This way, we can allocate tree blocks
>   concurrently and also store the child nodes/leaves in contiguous blocks
>   (see the sketch below).
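> 
> The idea, as a rough userspace sketch (illustrative only: the names and
> structures below are made up, the real code is in the patch):
> 
> 	#include <stdio.h>
> 	#include <stdint.h>
> 
> 	struct free_extent {
> 		uint64_t offset;	/* start of the cached contiguous space */
> 		uint64_t bytes;		/* bytes left in the cached extent */
> 	};
> 
> 	/* each tree node owns a cluster caching exactly one extent */
> 	struct node_cluster {
> 		struct free_extent info;
> 	};
> 
> 	/*
> 	 * Split 'bytes' off the front of the cluster's extent; 0 means
> 	 * the cluster is exhausted and the caller falls back to the
> 	 * per-fs metadata cluster.
> 	 */
> 	static uint64_t alloc_from_cluster(struct node_cluster *c,
> 					   uint64_t bytes)
> 	{
> 		uint64_t ret;
> 
> 		if (c->info.bytes < bytes)
> 			return 0;
> 
> 		ret = c->info.offset;
> 		c->info.offset += bytes;
> 		c->info.bytes -= bytes;
> 		return ret;
> 	}
> 
> 	int main(void)
> 	{
> 		/* a parent whose cluster cached 256KB at offset 1MB */
> 		struct node_cluster parent = { { 1024 * 1024, 256 * 1024 } };
> 		int i;
> 
> 		/* consecutive child allocations come out contiguous */
> 		for (i = 0; i < 4; i++)
> 			printf("child %d at %llu\n", i,
> 			       (unsigned long long)
> 			       alloc_from_cluster(&parent, 4096));
> 		return 0;
> 	}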
> 
> Besides that, we modify the space cache code to fit the above change. When we
> write out the free space information, we also write out the extent cached in
> each cluster. And when we load the free space information, we first try to
> merge the free space fragments, because the extents from the clusters are
> small and adjacent ones may merge into a large one.
> (Before this patch, we built the free space tree from the cached information
> directly, so I think we needn't worry about compatibility. At worst, we may
> create lots of small extents that can be merged into a large one, but it will
> not break the use of Btrfs.)
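> 
> A similarly rough sketch of the load-time merge (again illustrative only,
> not the code from free-space-cache.c):
> 
> 	#include <stdbool.h>
> 	#include <stdint.h>
> 	#include <stdio.h>
> 
> 	struct fs_entry {
> 		uint64_t offset;
> 		uint64_t bytes;
> 	};
> 
> 	/*
> 	 * Coalesce 'e' into 'prev' when the two extents are adjacent;
> 	 * returns true if merged, so 'e' needn't be linked separately.
> 	 */
> 	static bool try_merge(struct fs_entry *prev,
> 			      const struct fs_entry *e)
> 	{
> 		if (prev->offset + prev->bytes == e->offset) {
> 			prev->bytes += e->bytes;
> 			return true;
> 		}
> 		return false;
> 	}
> 
> 	int main(void)
> 	{
> 		/* two 4KB cluster leftovers written out back to back */
> 		struct fs_entry a = { 0, 4096 }, b = { 4096, 4096 };
> 
> 		if (try_merge(&a, &b))
> 			printf("merged into %llu bytes at %llu\n",
> 			       (unsigned long long)a.bytes,
> 			       (unsigned long long)a.offset);
> 		return 0;
> 	}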
> 
> We ran some performance tests with this patch; it works well and improves
> small file performance.
> 
> The sequential write performance of the small file:
> N_files		N_threads	File Size	Before		After
> 10240		8		2KB(Inline)	2.2055MB/s	4.1779MB/s
> 10240		8		4KB		1.001MB/s	1.1893MB/s
> 
> Signed-off-by: Miao Xie <miaox@xxxxxxxxxxxxxx>
> ---
>  fs/btrfs/ctree.c            |   28 ++-
>  fs/btrfs/ctree.h            |   50 +++-
>  fs/btrfs/disk-io.c          |    2 +-
>  fs/btrfs/extent-tree.c      |  107 +++++---
>  fs/btrfs/extent_io.c        |    7 +-
>  fs/btrfs/extent_io.h        |    3 +
>  fs/btrfs/free-space-cache.c |  667 ++++++++++++++++---------------------------
>  fs/btrfs/free-space-cache.h |   13 +
>  fs/btrfs/inode-map.c        |    1 +
>  fs/btrfs/inode.c            |   25 +-
>  fs/btrfs/ioctl.c            |    2 +-
>  fs/btrfs/tree-log.c         |    7 +
>  12 files changed, 419 insertions(+), 493 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 011cab3..1d3edd8 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -238,7 +238,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
>  	else
>  		btrfs_node_key(buf, &disk_key, 0);
>  
> -	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
> +	cow = btrfs_alloc_free_block(trans, root, buf->len, NULL, 0,
>  				     new_root_objectid, &disk_key, level,
>  				     buf->start, 0);
>  	if (IS_ERR(cow))
> @@ -444,9 +444,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
>  	} else
>  		parent_start = 0;
>  
> -	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
> -				     root->root_key.objectid, &disk_key,
> -				     level, search_start, empty_size);
> +	cow = btrfs_alloc_free_block(trans, root, buf->len, parent,
> +				     parent_start, root->root_key.objectid,
> +				     &disk_key, level, search_start,
> +				     empty_size);
>  	if (IS_ERR(cow))
>  		return PTR_ERR(cow);
>  
> @@ -467,6 +468,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
>  			    (unsigned long)btrfs_header_fsid(cow),
>  			    BTRFS_FSID_SIZE);
>  
> +	WARN_ON(cow->cluster);
> +	cow->cluster = buf->cluster;
> +	buf->cluster = NULL;
> +
>  	update_ref_for_cow(trans, root, buf, cow, &last_ref);
>  
>  	if (root->ref_cows)
> @@ -2070,7 +2075,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
>  	else
>  		btrfs_node_key(lower, &lower_key, 0);
>  
> -	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
> +	c = btrfs_alloc_free_block(trans, root, root->nodesize, NULL, 0,
>  				   root->root_key.objectid, &lower_key,
>  				   level, root->node->start, 0);
>  	if (IS_ERR(c))
> @@ -2170,6 +2175,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
>  {
>  	struct extent_buffer *c;
>  	struct extent_buffer *split;
> +	struct extent_buffer *parent = NULL;
>  	struct btrfs_disk_key disk_key;
>  	int mid;
>  	int ret;
> @@ -2197,9 +2203,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
>  	mid = (c_nritems + 1) / 2;
>  	btrfs_node_key(c, &disk_key, mid);
>  
> -	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
> -					root->root_key.objectid,
> -					&disk_key, level, c->start, 0);
> +	if (level + 1 < BTRFS_MAX_LEVEL)
> +		parent = path->nodes[level + 1];
> +
> +	split = btrfs_alloc_free_block(trans, root, root->nodesize, parent, 0,
> +				       root->root_key.objectid,
> +				       &disk_key, level, c->start, 0);
>  	if (IS_ERR(split))
>  		return PTR_ERR(split);
>  
> @@ -2951,7 +2960,8 @@ again:
>  	else
>  		btrfs_item_key(l, &disk_key, mid);
>  
> -	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
> +	right = btrfs_alloc_free_block(trans, root, root->leafsize,
> +					path->nodes[1], 0,
>  					root->root_key.objectid,
>  					&disk_key, 0, l->start, 0);
>  	if (IS_ERR(right))
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index c364d50..8c33a74 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -789,14 +789,9 @@ struct btrfs_block_rsv {
>   */
>  struct btrfs_free_cluster {
>  	spinlock_t lock;
> -	spinlock_t refill_lock;
> -	struct rb_root root;
> +	struct mutex refill_lock;
>  
> -	/* largest extent in this cluster */
> -	u64 max_size;
> -
> -	/* first extent starting offset */
> -	u64 window_start;
> +	struct btrfs_free_space *info;
>  
>  	struct btrfs_block_group_cache *block_group;
>  	/*
> @@ -2156,7 +2151,9 @@ u64 btrfs_find_block_group(struct btrfs_root *root,
>  			   u64 search_start, u64 search_hint, int owner);
>  struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>  					struct btrfs_root *root, u32 blocksize,
> -					u64 parent, u64 root_objectid,
> +					struct extent_buffer *parent,
> +					u64 parent_start,
> +					u64 root_objectid,
>  					struct btrfs_disk_key *key, int level,
>  					u64 hint, u64 empty_size);
>  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
> @@ -2175,12 +2172,37 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
>  				   struct btrfs_root *root,
>  				   u64 root_objectid, u64 owner, u64 offset,
>  				   struct btrfs_key *ins);
> -int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
> -				  struct btrfs_root *root,
> -				  u64 num_bytes, u64 min_alloc_size,
> -				  u64 empty_size, u64 hint_byte,
> -				  u64 search_end, struct btrfs_key *ins,
> -				  u64 data);
> +int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
> +			   struct btrfs_root *root,
> +			   u64 num_bytes, u64 min_alloc_size,
> +			   u64 empty_size, u64 hint_byte,
> +			   u64 search_end, struct btrfs_key *ins,
> +			   u64 data, struct btrfs_free_cluster *cluster);
> +static inline int btrfs_reserve_extent_data(struct btrfs_trans_handle *trans,
> +					    struct btrfs_root *root,
> +					    u64 num_bytes, u64 min_alloc_size,
> +					    u64 empty_size, u64 hint_byte,
> +					    u64 search_end,
> +					    struct btrfs_key *ins)
> +{
> +	return __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
> +				      empty_size, hint_byte, search_end, ins,
> +				      1, NULL);
> +}
> +
> +static inline int btrfs_reserve_extent_mdata(struct btrfs_trans_handle *trans,
> +					     struct btrfs_root *root,
> +					     u64 num_bytes, u64 min_alloc_size,
> +					     u64 empty_size, u64 hint_byte,
> +					     u64 search_end,
> +					     struct btrfs_key *ins,
> +					     struct btrfs_free_cluster *cluster)
> +{
> +	return __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
> +				      empty_size, hint_byte, search_end, ins,
> +				      0, cluster);
> +}
> +
>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>  		  struct extent_buffer *buf, int full_backref);
>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 07b3ac6..951a57f 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1171,7 +1171,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
>  	 */
>  	root->ref_cows = 0;
>  
> -	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
> +	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, NULL, 0,
>  				      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
>  	if (IS_ERR(leaf)) {
>  		kfree(root);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 80d6148..43cc5c4 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4672,6 +4672,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
>  	struct btrfs_block_group_cache *cache = NULL;
>  	int ret;
>  
> +	btrfs_free_extent_cluster(buf);
> +
>  	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
>  		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
>  						parent, root->root_key.objectid,
> @@ -4853,8 +4855,10 @@ enum btrfs_loop_type {
>  	LOOP_FIND_IDEAL = 0,
>  	LOOP_CACHING_NOWAIT = 1,
>  	LOOP_CACHING_WAIT = 2,
> -	LOOP_ALLOC_CHUNK = 3,
> -	LOOP_NO_EMPTY_SIZE = 4,
> +	LOOP_RECLAIM_CLUSTERS = 3,
> +	LOOP_RECLAIM_ALL_CLUSTERS = 4,
> +	LOOP_ALLOC_CHUNK = 5,
> +	LOOP_NO_EMPTY_SIZE = 6,
>  };
>  
>  /*
> @@ -4870,7 +4874,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
>  				     u64 num_bytes, u64 empty_size,
>  				     u64 search_start, u64 search_end,
>  				     u64 hint_byte, struct btrfs_key *ins,
> -				     u64 data)
> +				     u64 data,
> +				     struct btrfs_free_cluster *cluster)
>  {
>  	int ret = 0;
>  	struct btrfs_root *root = orig_root->fs_info->extent_root;
> @@ -4912,9 +4917,12 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
>  		allowed_chunk_alloc = 1;
>  
>  	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
> -		last_ptr = &root->fs_info->meta_alloc_cluster;
> +		if (cluster)
> +			last_ptr = cluster;
> +		else
> +			last_ptr = &root->fs_info->meta_alloc_cluster;
>  		if (!btrfs_test_opt(root, SSD))
> -			empty_cluster = 64 * 1024;
> +			empty_cluster = 256 * 1024;
>  	}
>  
>  	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
> @@ -4925,7 +4933,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
>  	if (last_ptr) {
>  		spin_lock(&last_ptr->lock);
>  		if (last_ptr->block_group)
> -			hint_byte = last_ptr->window_start;
> +			hint_byte = last_ptr->info->offset;
>  		spin_unlock(&last_ptr->lock);
>  	}
>  
> @@ -5043,6 +5051,13 @@ have_block_group:
>  		if (unlikely(block_group->ro))
>  			goto loop;
>  
> +
> +		if (loop == LOOP_RECLAIM_CLUSTERS) {
> +			btrfs_reclaim_extent_clusters(block_group,
> +						      empty_cluster * 2);
> +		} else if (loop == LOOP_RECLAIM_ALL_CLUSTERS)
> +			btrfs_reclaim_extent_clusters(block_group, 0);
> +
>  		spin_lock(&block_group->free_space_ctl->tree_lock);
>  		if (cached &&
>  		    block_group->free_space_ctl->free_space <
> @@ -5066,7 +5081,7 @@ have_block_group:
>  			 * the refill lock keeps out other
>  			 * people trying to start a new cluster
>  			 */
> -			spin_lock(&last_ptr->refill_lock);
> +			mutex_lock(&last_ptr->refill_lock);
>  			if (last_ptr->block_group &&
>  			    (last_ptr->block_group->ro ||
>  			    !block_group_bits(last_ptr->block_group, data))) {
> @@ -5078,7 +5093,7 @@ have_block_group:
>  						 num_bytes, search_start);
>  			if (offset) {
>  				/* we have a block, we're done */
> -				spin_unlock(&last_ptr->refill_lock);
> +				mutex_unlock(&last_ptr->refill_lock);
>  				goto checks;
>  			}
>  
> @@ -5097,7 +5112,7 @@ have_block_group:
>  				block_group = last_ptr->block_group;
>  				btrfs_get_block_group(block_group);
>  				spin_unlock(&last_ptr->lock);
> -				spin_unlock(&last_ptr->refill_lock);
> +				mutex_unlock(&last_ptr->refill_lock);
>  
>  				last_ptr_loop = 1;
>  				search_start = block_group->key.objectid;
> @@ -5123,7 +5138,7 @@ refill_cluster:
>  			/* allocate a cluster in this block group */
>  			ret = btrfs_find_space_cluster(trans, root,
>  					       block_group, last_ptr,
> -					       offset, num_bytes,
> +					       search_start, num_bytes,
>  					       empty_cluster + empty_size);
>  			if (ret == 0) {
>  				/*
> @@ -5135,12 +5150,12 @@ refill_cluster:
>  						  search_start);
>  				if (offset) {
>  					/* we found one, proceed */
> -					spin_unlock(&last_ptr->refill_lock);
> +					mutex_unlock(&last_ptr->refill_lock);
>  					goto checks;
>  				}
>  			} else if (!cached && loop > LOOP_CACHING_NOWAIT
>  				   && !failed_cluster_refill) {
> -				spin_unlock(&last_ptr->refill_lock);
> +				mutex_unlock(&last_ptr->refill_lock);
>  
>  				failed_cluster_refill = true;
>  				wait_block_group_cache_progress(block_group,
> @@ -5155,7 +5170,7 @@ refill_cluster:
>  			 * to use, and go to the next block group
>  			 */
>  			btrfs_return_cluster_to_free_space(NULL, last_ptr);
> -			spin_unlock(&last_ptr->refill_lock);
> +			mutex_unlock(&last_ptr->refill_lock);
>  			goto loop;
>  		}
>  
> @@ -5197,9 +5212,11 @@ checks:
>  		ins->objectid = search_start;
>  		ins->offset = num_bytes;
>  
> -		if (offset < search_start)
> +		if (offset < search_start) {
>  			btrfs_add_free_space(block_group, offset,
>  					     search_start - offset);
> +			offset = search_start;
> +		}
>  		BUG_ON(offset > search_start);
>  
>  		ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
> @@ -5210,13 +5227,6 @@ checks:
>  		}
>  
>  		/* we are all good, lets return */
> -		ins->objectid = search_start;
> -		ins->offset = num_bytes;
> -
> -		if (offset < search_start)
> -			btrfs_add_free_space(block_group, offset,
> -					     search_start - offset);
> -		BUG_ON(offset > search_start);
>  		btrfs_put_block_group(block_group);
>  		break;
>  loop:
> @@ -5236,6 +5246,12 @@ loop:
>  	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
>  	 *			caching kthreads as we move along
>  	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
> +	 * LOOP_RECLAIM_CLUSTERS, reclaim free space from some clusters, and
> +	 *			  by this way, we may find enough continuous
> +	 *			  space to fill the cluster, and then search
> +	 *			  the free space again.
> +	 * LOOP_RECLAIM_ALL_CLUSTERS, reclaim free space from all the clusters,
> +	 *			      and then search again.
>  	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
>  	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
>  	 *			again
> @@ -5362,12 +5378,12 @@ again:
>  	up_read(&info->groups_sem);
>  }
>  
> -int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
> -			 struct btrfs_root *root,
> -			 u64 num_bytes, u64 min_alloc_size,
> -			 u64 empty_size, u64 hint_byte,
> -			 u64 search_end, struct btrfs_key *ins,
> -			 u64 data)
> +int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
> +			   struct btrfs_root *root,
> +			   u64 num_bytes, u64 min_alloc_size,
> +			   u64 empty_size, u64 hint_byte,
> +			   u64 search_end, struct btrfs_key *ins,
> +			   u64 data, struct btrfs_free_cluster *cluster)
>  {
>  	int ret;
>  	u64 search_start = 0;
> @@ -5386,7 +5402,7 @@ again:
>  	WARN_ON(num_bytes < root->sectorsize);
>  	ret = find_free_extent(trans, root, num_bytes, empty_size,
>  			       search_start, search_end, hint_byte,
> -			       ins, data);
> +			       ins, data, cluster);
>  
>  	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
>  		num_bytes = num_bytes >> 1;
> @@ -5741,13 +5757,15 @@ static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
>   */
>  struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>  					struct btrfs_root *root, u32 blocksize,
> -					u64 parent, u64 root_objectid,
> +					struct extent_buffer *parent,
> +					u64 parent_start, u64 root_objectid,
>  					struct btrfs_disk_key *key, int level,
>  					u64 hint, u64 empty_size)
>  {
>  	struct btrfs_key ins;
>  	struct btrfs_block_rsv *block_rsv;
>  	struct extent_buffer *buf;
> +	struct btrfs_free_cluster *cluster = NULL;
>  	u64 flags = 0;
>  	int ret;
>  
> @@ -5756,8 +5774,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>  	if (IS_ERR(block_rsv))
>  		return ERR_CAST(block_rsv);
>  
> -	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
> -				   empty_size, hint, (u64)-1, &ins, 0);
> +	/*
> +	 * We needn't worry about allocation failure, because if failed,
> +	 * we will use the default metadata cluster in fs_info
> +	 */
> +	if (parent && !parent->cluster)
> +		parent->cluster = btrfs_alloc_free_cluster();
> +
> +	if (parent)
> +		cluster = parent->cluster;
> +
> +	ret = btrfs_reserve_extent_mdata(trans, root, blocksize, blocksize,
> +					 empty_size, hint, (u64)-1, &ins,
> +					 cluster);
>  	if (ret) {
>  		unuse_block_rsv(block_rsv, blocksize);
>  		return ERR_PTR(ret);
> @@ -5768,11 +5797,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>  	BUG_ON(IS_ERR(buf));
>  
>  	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
> -		if (parent == 0)
> -			parent = ins.objectid;
> +		if (parent_start == 0)
> +			parent_start = ins.objectid;
>  		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
>  	} else
> -		BUG_ON(parent > 0);
> +		BUG_ON(parent_start > 0);
>  
>  	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
>  		struct btrfs_delayed_extent_op *extent_op;
> @@ -5788,7 +5817,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
>  		extent_op->is_data = 0;
>  
>  		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
> -					ins.offset, parent, root_objectid,
> +					ins.offset, parent_start, root_objectid,
>  					level, BTRFS_ADD_DELAYED_EXTENT,
>  					extent_op);
>  		BUG_ON(ret);
> @@ -7231,18 +7260,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  
>  	/* make sure this block group isn't part of an allocation cluster */
>  	cluster = &root->fs_info->data_alloc_cluster;
> -	spin_lock(&cluster->refill_lock);
> +	mutex_lock(&cluster->refill_lock);
>  	btrfs_return_cluster_to_free_space(block_group, cluster);
> -	spin_unlock(&cluster->refill_lock);
> +	mutex_unlock(&cluster->refill_lock);
>  
>  	/*
>  	 * make sure this block group isn't part of a metadata
>  	 * allocation cluster
>  	 */
>  	cluster = &root->fs_info->meta_alloc_cluster;
> -	spin_lock(&cluster->refill_lock);
> +	mutex_lock(&cluster->refill_lock);
>  	btrfs_return_cluster_to_free_space(block_group, cluster);
> -	spin_unlock(&cluster->refill_lock);
> +	mutex_unlock(&cluster->refill_lock);
>  
>  	path = btrfs_alloc_path();
>  	if (!path) {
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d418164..78196f2 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -17,6 +17,7 @@
>  #include "compat.h"
>  #include "ctree.h"
>  #include "btrfs_inode.h"
> +#include "free-space-cache.h"
>  
>  static struct kmem_cache *extent_state_cache;
>  static struct kmem_cache *extent_buffer_cache;
> @@ -2988,6 +2989,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
>  	spin_unlock_irqrestore(&leak_lock, flags);
>  #endif
>  	atomic_set(&eb->refs, 1);
> +	eb->cluster = NULL;
>  
>  	return eb;
>  }
> @@ -3827,7 +3829,10 @@ out:
>  	spin_unlock(&tree->buffer_lock);
>  
>  	/* at this point we can safely release the extent buffer */
> -	if (atomic_read(&eb->refs) == 0)
> +	if (atomic_read(&eb->refs) == 0) {
> +		btrfs_free_extent_cluster(eb);
>  		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
> +	}
>  	return ret;
>  }
> +
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 7b2f0c3..8ee8953 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -146,6 +146,9 @@ struct extent_buffer {
>  	 * to unlock
>  	 */
>  	wait_queue_head_t read_lock_wq;
> +
> +	/* Used for the node to allocate space for its children */
> +	struct btrfs_free_cluster *cluster;
>  };
>  
>  static inline void extent_set_compress_type(unsigned long *bio_flags,
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index e555899..e540e77 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -31,9 +31,15 @@
>  #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
>  
>  static struct kmem_cache *btrfs_free_space_cachep;
> +static struct kmem_cache *btrfs_free_cluster_cachep;
>  
>  static int link_free_space(struct btrfs_free_space_ctl *ctl,
>  			   struct btrfs_free_space *info);
> +static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
> +				 struct btrfs_free_space *info,
> +				 bool update_stat);
> +static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
> +			      struct btrfs_free_space *info, bool update_stat);
>  
>  int __init free_space_cache_init(void)
>  {
> @@ -43,6 +49,14 @@ int __init free_space_cache_init(void)
>  	if (!btrfs_free_space_cachep)
>  		return -ENOMEM;
>  
> +	btrfs_free_cluster_cachep = kmem_cache_create("extent_clusters",
> +			sizeof(struct btrfs_free_cluster), 0,
> +			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,	NULL);
> +	if (!btrfs_free_cluster_cachep) {
> +		kmem_cache_destroy(btrfs_free_space_cachep);
> +		return -ENOMEM;
> +	}
> +
>  	return 0;
>  }
>  
> @@ -50,6 +64,8 @@ void free_space_cache_exit(void)
>  {
>  	if (btrfs_free_space_cachep)
>  		kmem_cache_destroy(btrfs_free_space_cachep);
> +	if (btrfs_free_cluster_cachep)
> +		kmem_cache_destroy(btrfs_free_cluster_cachep);
>  }
>  
>  static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
> @@ -397,11 +413,32 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
>  
>  			if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
>  				spin_lock(&ctl->tree_lock);
> +				if (try_merge_free_space(ctl, e, true))
> +					goto link;
> +
> +				ret = insert_into_bitmap(ctl, e, true);
> +				if (ret < 0) {
> +					spin_unlock(&ctl->tree_lock);
> +					printk(KERN_ERR "Inserting into bitmap "
> +					       "failed, dumping\n");
> +					kmem_cache_free(btrfs_free_space_cachep,
> +							e);
> +					kunmap(page);
> +					unlock_page(page);
> +					page_cache_release(page);
> +					goto free_cache;
> +				} else if (ret) {
> +					ret = 0;
> +					goto next_entry;
> +				}
> +link:
>  				ret = link_free_space(ctl, e);
>  				spin_unlock(&ctl->tree_lock);
>  				if (ret) {
>  					printk(KERN_ERR "Duplicate entries in "
>  					       "free space cache, dumping\n");
> +					kmem_cache_free(btrfs_free_space_cachep,
> +							e);
>  					kunmap(page);
>  					unlock_page(page);
>  					page_cache_release(page);
> @@ -425,6 +462,9 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
>  				if (ret) {
>  					printk(KERN_ERR "Duplicate entries in "
>  					       "free space cache, dumping\n");
> +					kfree(e->bitmap);
> +					kmem_cache_free(
> +						btrfs_free_space_cachep, e);
>  					kunmap(page);
>  					unlock_page(page);
>  					page_cache_release(page);
> @@ -432,7 +472,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
>  				}
>  				list_add_tail(&e->list, &bitmaps);
>  			}
> -
> +next_entry:
>  			num_entries--;
>  			offset += sizeof(struct btrfs_free_space_entry);
>  			if (offset + sizeof(struct btrfs_free_space_entry) >=
> @@ -676,7 +716,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>  		while (node && !next_page) {
>  			struct btrfs_free_space *e;
>  
> -			e = rb_entry(node, struct btrfs_free_space, offset_index);
> +			e = rb_entry(node, struct btrfs_free_space,
> +				     offset_index);
>  			entries++;
>  
>  			entry->offset = cpu_to_le64(e->offset);
> @@ -689,10 +730,39 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
>  				entry->type = BTRFS_FREE_SPACE_EXTENT;
>  			}
>  			node = rb_next(node);
> -			if (!node && cluster) {
> -				node = rb_first(&cluster->root);
> +			offset += sizeof(struct btrfs_free_space_entry);
> +			if (offset + sizeof(struct btrfs_free_space_entry) >=
> +			    PAGE_CACHE_SIZE)
> +				next_page = true;
> +			entry++;
> +		}
> +
> +		/*
> +		 * We want to write the extent in the cluster to our free space
> +		 * cache.
> +		 */
> +		while (cluster && !next_page) {
> +			struct btrfs_free_space *e;
> +
> +			e = cluster->info;
> +			if (!e || !e->bytes)
> +				break;
> +
> +			entries++;
> +
> +			entry->offset = cpu_to_le64(e->offset);
> +			entry->bytes = cpu_to_le64(e->bytes);
> +			entry->type = BTRFS_FREE_SPACE_EXTENT;
> +
> +			if (list_is_last(&cluster->block_group_list,
> +					 &block_group->cluster_list))
>  				cluster = NULL;
> -			}
> +			else
> +				cluster = list_entry(
> +						cluster->block_group_list.next,
> +						struct btrfs_free_cluster,
> +						block_group_list);
> +
>  			offset += sizeof(struct btrfs_free_space_entry);
>  			if (offset + sizeof(struct btrfs_free_space_entry) >=
>  			    PAGE_CACHE_SIZE)
> @@ -1125,19 +1195,30 @@ static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
>  	ctl->free_space -= info->bytes;
>  }
>  
> +static int __link_free_space(struct btrfs_free_space_ctl *ctl,
> +			     struct btrfs_free_space *info)
> +{
> +	int ret;
> +
> +	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
> +				 &info->offset_index, (info->bitmap != NULL));
> +	if (ret)
> +		return ret;
> +	ctl->free_extents++;
> +	return 0;
> +}
> +
>  static int link_free_space(struct btrfs_free_space_ctl *ctl,
>  			   struct btrfs_free_space *info)
>  {
> -	int ret = 0;
> +	int ret;
>  
>  	BUG_ON(!info->bitmap && !info->bytes);
> -	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
> -				 &info->offset_index, (info->bitmap != NULL));
> +	ret = __link_free_space(ctl, info);
>  	if (ret)
>  		return ret;
>  
>  	ctl->free_space += info->bytes;
> -	ctl->free_extents++;
>  	return ret;
>  }
>  
> @@ -1212,7 +1293,7 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
>  
>  static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
>  			    struct btrfs_free_space *info, u64 offset,
> -			    u64 bytes)
> +			    u64 bytes, bool update_stat)
>  {
>  	unsigned long start, count;
>  
> @@ -1223,7 +1304,8 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
>  	bitmap_set(info->bitmap, start, count);
>  
>  	info->bytes += bytes;
> -	ctl->free_space += bytes;
> +	if (update_stat)
> +		ctl->free_space += bytes;
>  }
>  
>  static int search_bitmap(struct btrfs_free_space_ctl *ctl,
> @@ -1394,7 +1476,7 @@ again:
>  
>  static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
>  			       struct btrfs_free_space *info, u64 offset,
> -			       u64 bytes)
> +			       u64 bytes, bool update_stat)
>  {
>  	u64 bytes_to_set = 0;
>  	u64 end;
> @@ -1403,7 +1485,7 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
>  
>  	bytes_to_set = min(end - offset, bytes);
>  
> -	bitmap_set_bits(ctl, info, offset, bytes_to_set);
> +	bitmap_set_bits(ctl, info, offset, bytes_to_set, update_stat);
>  
>  	return bytes_to_set;
>  
> @@ -1451,10 +1533,9 @@ static struct btrfs_free_space_op free_space_op = {
>  };
>  
>  static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
> -			      struct btrfs_free_space *info)
> +			      struct btrfs_free_space *info, bool update_stat)
>  {
>  	struct btrfs_free_space *bitmap_info;
> -	struct btrfs_block_group_cache *block_group = NULL;
>  	int added = 0;
>  	u64 bytes, offset, bytes_added;
>  	int ret;
> @@ -1465,49 +1546,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
>  	if (!ctl->op->use_bitmap(ctl, info))
>  		return 0;
>  
> -	if (ctl->op == &free_space_op)
> -		block_group = ctl->private;
>  again:
> -	/*
> -	 * Since we link bitmaps right into the cluster we need to see if we
> -	 * have a cluster here, and if so and it has our bitmap we need to add
> -	 * the free space to that bitmap.
> -	 */
> -	if (block_group && !list_empty(&block_group->cluster_list)) {
> -		struct btrfs_free_cluster *cluster;
> -		struct rb_node *node;
> -		struct btrfs_free_space *entry;
> -
> -		cluster = list_entry(block_group->cluster_list.next,
> -				     struct btrfs_free_cluster,
> -				     block_group_list);
> -		spin_lock(&cluster->lock);
> -		node = rb_first(&cluster->root);
> -		if (!node) {
> -			spin_unlock(&cluster->lock);
> -			goto no_cluster_bitmap;
> -		}
> -
> -		entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -		if (!entry->bitmap) {
> -			spin_unlock(&cluster->lock);
> -			goto no_cluster_bitmap;
> -		}
> -
> -		if (entry->offset == offset_to_bitmap(ctl, offset)) {
> -			bytes_added = add_bytes_to_bitmap(ctl, entry,
> -							  offset, bytes);
> -			bytes -= bytes_added;
> -			offset += bytes_added;
> -		}
> -		spin_unlock(&cluster->lock);
> -		if (!bytes) {
> -			ret = 1;
> -			goto out;
> -		}
> -	}
> -
> -no_cluster_bitmap:
>  	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
>  					 1, 0);
>  	if (!bitmap_info) {
> @@ -1515,7 +1554,8 @@ no_cluster_bitmap:
>  		goto new_bitmap;
>  	}
>  
> -	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
> +	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes,
> +					  update_stat);
>  	bytes -= bytes_added;
>  	offset += bytes_added;
>  	added = 0;
> @@ -1533,6 +1573,12 @@ new_bitmap:
>  		info = NULL;
>  		goto again;
>  	} else {
> +		if (current->flags & PF_MEMALLOC) {
> +			printk(KERN_INFO "Alloc memory when reclaim "
> +			       "the pages\n");
> +			return 0;
> +		}
> +
>  		spin_unlock(&ctl->tree_lock);
>  
>  		/* no pre-allocated info, allocate a new one */
> @@ -1635,7 +1681,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
>  	 * extent then we know we're going to have to allocate a new extent, so
>  	 * before we do that see if we need to drop this into a bitmap
>  	 */
> -	ret = insert_into_bitmap(ctl, info);
> +	ret = insert_into_bitmap(ctl, info, true);
>  	if (ret < 0) {
>  		goto out;
>  	} else if (ret) {
> @@ -1829,36 +1875,50 @@ __btrfs_return_cluster_to_free_space(
>  {
>  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>  	struct btrfs_free_space *entry;
> -	struct rb_node *node;
> +	int ret = 0;
>  
>  	spin_lock(&cluster->lock);
> -	if (cluster->block_group != block_group)
> +	if (cluster->block_group != block_group) {
> +		spin_unlock(&cluster->lock);
>  		goto out;
> +	}
>  
>  	cluster->block_group = NULL;
> -	cluster->window_start = 0;
> +
> +	entry = cluster->info;
> +	cluster->info = NULL;
> +
>  	list_del_init(&cluster->block_group_list);
> +	spin_unlock(&cluster->lock);
>  
> -	node = rb_first(&cluster->root);
> -	while (node) {
> -		bool bitmap;
> +	if (!entry)
> +		goto out;
>  
> -		entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -		node = rb_next(&entry->offset_index);
> -		rb_erase(&entry->offset_index, &cluster->root);
> -
> -		bitmap = (entry->bitmap != NULL);
> -		if (!bitmap)
> -			try_merge_free_space(ctl, entry, false);
> -		tree_insert_offset(&ctl->free_space_offset,
> -				   entry->offset, &entry->offset_index, bitmap);
> +	if (!entry->bytes) {
> +		kmem_cache_free(btrfs_free_space_cachep, entry);
> +		goto out;
>  	}
> -	cluster->root = RB_ROOT;
>  
> +	if (try_merge_free_space(ctl, entry, false))
> +		goto link;
> +
> +	ret = insert_into_bitmap(ctl, entry, false);
> +	if (ret < 0)
> +		goto out;
> +	else if (ret) {
> +		ret = 0;
> +		goto out;
> +	}
> +link:
> +	ret = __link_free_space(ctl, entry);
> +	if (ret) {
> +		kmem_cache_free(btrfs_free_space_cachep, entry);
> +		printk(KERN_ERR "Duplicate entries in free space cache, "
> +				"dumping\n");
> +	}
>  out:
> -	spin_unlock(&cluster->lock);
>  	btrfs_put_block_group(block_group);
> -	return 0;
> +	return ret;
>  }
>  
>  void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
> @@ -1889,29 +1949,58 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
>  	spin_unlock(&ctl->tree_lock);
>  }
>  
> -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
> +static void __btrfs_reclaim_extent_clusters(
> +				struct btrfs_block_group_cache *block_group,
> +				u64 to_reclaim)
>  {
> -	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>  	struct btrfs_free_cluster *cluster;
>  	struct list_head *head;
> +	u64 bytes;
> +	bool reclaim_all = (to_reclaim == 0);
>  
> -	spin_lock(&ctl->tree_lock);
>  	while ((head = block_group->cluster_list.next) !=
>  	       &block_group->cluster_list) {
>  		cluster = list_entry(head, struct btrfs_free_cluster,
>  				     block_group_list);
>  
>  		WARN_ON(cluster->block_group != block_group);
> +
> +		bytes = cluster->info->bytes;
>  		__btrfs_return_cluster_to_free_space(block_group, cluster);
> +
> +		if (!reclaim_all) {
> +			if (to_reclaim > bytes)
> +				to_reclaim -= bytes;
> +			else {
> +				to_reclaim = 0;
> +				break;
> +			}
> +		}
> +
>  		if (need_resched()) {
> -			spin_unlock(&ctl->tree_lock);
> +			spin_unlock(&block_group->free_space_ctl->tree_lock);
>  			cond_resched();
> -			spin_lock(&ctl->tree_lock);
> +			spin_lock(&block_group->free_space_ctl->tree_lock);
>  		}
>  	}
> +}
> +
> +void btrfs_reclaim_extent_clusters(struct btrfs_block_group_cache *block_group,
> +				   u64 to_reclaim)
> +{
> +	spin_lock(&block_group->free_space_ctl->tree_lock);
> +	__btrfs_reclaim_extent_clusters(block_group, to_reclaim);
> +	spin_unlock(&block_group->free_space_ctl->tree_lock);
> +}
> +
> +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
> +{
> +	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> +
> +	spin_lock(&ctl->tree_lock);
> +	__btrfs_reclaim_extent_clusters(block_group, 0);
>  	__btrfs_remove_free_space_cache_locked(ctl);
>  	spin_unlock(&ctl->tree_lock);
> -
>  }
>  
>  u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
> @@ -1991,30 +2080,6 @@ int btrfs_return_cluster_to_free_space(
>  	return ret;
>  }
>  
> -static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
> -				   struct btrfs_free_cluster *cluster,
> -				   struct btrfs_free_space *entry,
> -				   u64 bytes, u64 min_start)
> -{
> -	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -	int err;
> -	u64 search_start = cluster->window_start;
> -	u64 search_bytes = bytes;
> -	u64 ret = 0;
> -
> -	search_start = min_start;
> -	search_bytes = bytes;
> -
> -	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
> -	if (err)
> -		return 0;
> -
> -	ret = search_start;
> -	__bitmap_clear_bits(ctl, entry, ret, bytes);
> -
> -	return ret;
> -}
> -
>  /*
>   * given a cluster, try to allocate 'bytes' from it, returns 0
>   * if it couldn't find anything suitably large, or a logical disk offset
> @@ -2025,56 +2090,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
>  			     u64 min_start)
>  {
>  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -	struct btrfs_free_space *entry = NULL;
> -	struct rb_node *node;
>  	u64 ret = 0;
>  
>  	spin_lock(&cluster->lock);
> -	if (bytes > cluster->max_size)
> -		goto out;
> -
>  	if (cluster->block_group != block_group)
>  		goto out;
>  
> -	node = rb_first(&cluster->root);
> -	if (!node)
> +	/*
> +	 * If we set ->block_group, it means we have filled this cluster,
> +	 * and ->info also has been set. So we needn't check ->info is
> +	 * NULL or not now.
> +	 */
> +	if (bytes > cluster->info->bytes)
>  		goto out;
>  
> -	entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -	while(1) {
> -		if (entry->bytes < bytes ||
> -		    (!entry->bitmap && entry->offset < min_start)) {
> -			node = rb_next(&entry->offset_index);
> -			if (!node)
> -				break;
> -			entry = rb_entry(node, struct btrfs_free_space,
> -					 offset_index);
> -			continue;
> -		}
> -
> -		if (entry->bitmap) {
> -			ret = btrfs_alloc_from_bitmap(block_group,
> -						      cluster, entry, bytes,
> -						      min_start);
> -			if (ret == 0) {
> -				node = rb_next(&entry->offset_index);
> -				if (!node)
> -					break;
> -				entry = rb_entry(node, struct btrfs_free_space,
> -						 offset_index);
> -				continue;
> -			}
> -		} else {
> -			ret = entry->offset;
> -
> -			entry->offset += bytes;
> -			entry->bytes -= bytes;
> -		}
> +	if (min_start >= cluster->info->offset + cluster->info->bytes)
> +		goto out;
>  
> -		if (entry->bytes == 0)
> -			rb_erase(&entry->offset_index, &cluster->root);
> -		break;
> -	}
> +	ret = cluster->info->offset;
> +	cluster->info->offset += bytes;
> +	cluster->info->bytes -= bytes;
>  out:
>  	spin_unlock(&cluster->lock);
>  
> @@ -2082,260 +2117,12 @@ out:
>  		return 0;
>  
>  	spin_lock(&ctl->tree_lock);
> -
>  	ctl->free_space -= bytes;
> -	if (entry->bytes == 0) {
> -		ctl->free_extents--;
> -		if (entry->bitmap) {
> -			kfree(entry->bitmap);
> -			ctl->total_bitmaps--;
> -			ctl->op->recalc_thresholds(ctl);
> -		}
> -		kmem_cache_free(btrfs_free_space_cachep, entry);
> -	}
> -
>  	spin_unlock(&ctl->tree_lock);
>  
>  	return ret;
>  }
>  
> -static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
> -				struct btrfs_free_space *entry,
> -				struct btrfs_free_cluster *cluster,
> -				u64 offset, u64 bytes, u64 min_bytes)
> -{
> -	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -	unsigned long next_zero;
> -	unsigned long i;
> -	unsigned long search_bits;
> -	unsigned long total_bits;
> -	unsigned long found_bits;
> -	unsigned long start = 0;
> -	unsigned long total_found = 0;
> -	int ret;
> -	bool found = false;
> -
> -	i = offset_to_bit(entry->offset, block_group->sectorsize,
> -			  max_t(u64, offset, entry->offset));
> -	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
> -	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
> -
> -again:
> -	found_bits = 0;
> -	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
> -	     i < BITS_PER_BITMAP;
> -	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
> -		next_zero = find_next_zero_bit(entry->bitmap,
> -					       BITS_PER_BITMAP, i);
> -		if (next_zero - i >= search_bits) {
> -			found_bits = next_zero - i;
> -			break;
> -		}
> -		i = next_zero;
> -	}
> -
> -	if (!found_bits)
> -		return -ENOSPC;
> -
> -	if (!found) {
> -		start = i;
> -		found = true;
> -	}
> -
> -	total_found += found_bits;
> -
> -	if (cluster->max_size < found_bits * block_group->sectorsize)
> -		cluster->max_size = found_bits * block_group->sectorsize;
> -
> -	if (total_found < total_bits) {
> -		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
> -		if (i - start > total_bits * 2) {
> -			total_found = 0;
> -			cluster->max_size = 0;
> -			found = false;
> -		}
> -		goto again;
> -	}
> -
> -	cluster->window_start = start * block_group->sectorsize +
> -		entry->offset;
> -	rb_erase(&entry->offset_index, &ctl->free_space_offset);
> -	ret = tree_insert_offset(&cluster->root, entry->offset,
> -				 &entry->offset_index, 1);
> -	BUG_ON(ret);
> -
> -	return 0;
> -}
> -
> -/*
> - * This searches the block group for just extents to fill the cluster with.
> - */
> -static noinline int
> -setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
> -			struct btrfs_free_cluster *cluster,
> -			struct list_head *bitmaps, u64 offset, u64 bytes,
> -			u64 min_bytes)
> -{
> -	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -	struct btrfs_free_space *first = NULL;
> -	struct btrfs_free_space *entry = NULL;
> -	struct btrfs_free_space *prev = NULL;
> -	struct btrfs_free_space *last;
> -	struct rb_node *node;
> -	u64 window_start;
> -	u64 window_free;
> -	u64 max_extent;
> -	u64 max_gap = 128 * 1024;
> -
> -	entry = tree_search_offset(ctl, offset, 0, 1);
> -	if (!entry)
> -		return -ENOSPC;
> -
> -	/*
> -	 * We don't want bitmaps, so just move along until we find a normal
> -	 * extent entry.
> -	 */
> -	while (entry->bitmap) {
> -		if (list_empty(&entry->list))
> -			list_add_tail(&entry->list, bitmaps);
> -		node = rb_next(&entry->offset_index);
> -		if (!node)
> -			return -ENOSPC;
> -		entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -	}
> -
> -	window_start = entry->offset;
> -	window_free = entry->bytes;
> -	max_extent = entry->bytes;
> -	first = entry;
> -	last = entry;
> -	prev = entry;
> -
> -	while (window_free <= min_bytes) {
> -		node = rb_next(&entry->offset_index);
> -		if (!node)
> -			return -ENOSPC;
> -		entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -
> -		if (entry->bitmap) {
> -			if (list_empty(&entry->list))
> -				list_add_tail(&entry->list, bitmaps);
> -			continue;
> -		}
> -
> -		/*
> -		 * we haven't filled the empty size and the window is
> -		 * very large.  reset and try again
> -		 */
> -		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
> -		    entry->offset - window_start > (min_bytes * 2)) {
> -			first = entry;
> -			window_start = entry->offset;
> -			window_free = entry->bytes;
> -			last = entry;
> -			max_extent = entry->bytes;
> -		} else {
> -			last = entry;
> -			window_free += entry->bytes;
> -			if (entry->bytes > max_extent)
> -				max_extent = entry->bytes;
> -		}
> -		prev = entry;
> -	}
> -
> -	cluster->window_start = first->offset;
> -
> -	node = &first->offset_index;
> -
> -	/*
> -	 * now we've found our entries, pull them out of the free space
> -	 * cache and put them into the cluster rbtree
> -	 */
> -	do {
> -		int ret;
> -
> -		entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -		node = rb_next(&entry->offset_index);
> -		if (entry->bitmap)
> -			continue;
> -
> -		rb_erase(&entry->offset_index, &ctl->free_space_offset);
> -		ret = tree_insert_offset(&cluster->root, entry->offset,
> -					 &entry->offset_index, 0);
> -		BUG_ON(ret);
> -	} while (node && entry != last);
> -
> -	cluster->max_size = max_extent;
> -
> -	return 0;
> -}
> -
> -/*
> - * This specifically looks for bitmaps that may work in the cluster, we assume
> - * that we have already failed to find extents that will work.
> - */
> -static noinline int
> -setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
> -		     struct btrfs_free_cluster *cluster,
> -		     struct list_head *bitmaps, u64 offset, u64 bytes,
> -		     u64 min_bytes)
> -{
> -	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -	struct btrfs_free_space *entry;
> -	struct rb_node *node;
> -	int ret = -ENOSPC;
> -
> -	if (ctl->total_bitmaps == 0)
> -		return -ENOSPC;
> -
> -	/*
> -	 * First check our cached list of bitmaps and see if there is an entry
> -	 * here that will work.
> -	 */
> -	list_for_each_entry(entry, bitmaps, list) {
> -		if (entry->bytes < min_bytes)
> -			continue;
> -		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
> -					   bytes, min_bytes);
> -		if (!ret)
> -			return 0;
> -	}
> -
> -	/*
> -	 * If we do have entries on our list and we are here then we didn't find
> -	 * anything, so go ahead and get the next entry after the last entry in
> -	 * this list and start the search from there.
> -	 */
> -	if (!list_empty(bitmaps)) {
> -		entry = list_entry(bitmaps->prev, struct btrfs_free_space,
> -				   list);
> -		node = rb_next(&entry->offset_index);
> -		if (!node)
> -			return -ENOSPC;
> -		entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -		goto search;
> -	}
> -
> -	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
> -	if (!entry)
> -		return -ENOSPC;
> -
> -search:
> -	node = &entry->offset_index;
> -	do {
> -		entry = rb_entry(node, struct btrfs_free_space, offset_index);
> -		node = rb_next(&entry->offset_index);
> -		if (!entry->bitmap)
> -			continue;
> -		if (entry->bytes < min_bytes)
> -			continue;
> -		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
> -					   bytes, min_bytes);
> -	} while (ret && node);
> -
> -	return ret;
> -}
> -
>  /*
>   * here we try to find a cluster of blocks in a block group.  The goal
>   * is to find at least bytes free and up to empty_size + bytes free.
> @@ -2351,11 +2138,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
>  			     u64 offset, u64 bytes, u64 empty_size)
>  {
>  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> -	struct list_head bitmaps;
> -	struct btrfs_free_space *entry, *tmp;
> +	struct btrfs_free_space *entry, *info;
>  	u64 min_bytes;
>  	int ret;
>  
> +	BUG_ON(cluster->info);
> +
>  	/* for metadata, allow allocates with more holes */
>  	if (btrfs_test_opt(root, SSD_SPREAD)) {
>  		min_bytes = bytes + empty_size;
> @@ -2369,9 +2157,16 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
>  			min_bytes = max(bytes, (bytes + empty_size) >> 1);
>  		else
>  			min_bytes = max(bytes, (bytes + empty_size) >> 4);
> +		min_bytes = max(min_bytes, (u64)256 * 1024);
>  	} else
>  		min_bytes = max(bytes, (bytes + empty_size) >> 2);
>  
> +	min_bytes = round_up(min_bytes, root->sectorsize);
> +
> +	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
> +	if (!info)
> +		return -ENOMEM;
> +
>  	spin_lock(&ctl->tree_lock);
>  
>  	/*
> @@ -2380,6 +2175,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
>  	 */
>  	if (ctl->free_space < min_bytes) {
>  		spin_unlock(&ctl->tree_lock);
> +		kmem_cache_free(btrfs_free_space_cachep, info);
>  		return -ENOSPC;
>  	}
>  
> @@ -2388,26 +2184,44 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
>  	/* someone already found a cluster, hooray */
>  	if (cluster->block_group) {
>  		ret = 0;
> +		kmem_cache_free(btrfs_free_space_cachep, info);
>  		goto out;
>  	}
>  
> -	INIT_LIST_HEAD(&bitmaps);
> -	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
> -				      bytes, min_bytes);
> -	if (ret)
> -		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
> -					   offset, bytes, min_bytes);
> +	bytes = min_bytes;
> +	entry = find_free_space(ctl, &offset, &min_bytes);
> +	if (!entry) {
> +		ret = -ENOSPC;
> +		kmem_cache_free(btrfs_free_space_cachep, info);
> +		goto out;
> +	}
>  
> -	/* Clear our temporary list */
> -	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
> -		list_del_init(&entry->list);
> +	BUG_ON(min_bytes < bytes);
>  
> -	if (!ret) {
> -		atomic_inc(&block_group->count);
> -		list_add_tail(&cluster->block_group_list,
> -			      &block_group->cluster_list);
> -		cluster->block_group = block_group;
> +	ret = 0;
> +	if (entry->bitmap) {
> +		__bitmap_clear_bits(ctl, entry, offset, bytes);
> +		if (!entry->bytes)
> +			free_bitmap(ctl, entry);
> +	} else {
> +		__unlink_free_space(ctl, entry);
> +		entry->offset += bytes;
> +		entry->bytes -= bytes;
> +		if (!entry->bytes)
> +			kmem_cache_free(btrfs_free_space_cachep, entry);
> +		else
> +			__link_free_space(ctl, entry);
>  	}
> +
> +	info->offset = offset;
> +	info->bytes = bytes;
> +
> +	cluster->info = info;
> +
> +	atomic_inc(&block_group->count);
> +	list_add_tail(&cluster->block_group_list,
> +		      &block_group->cluster_list);
> +	cluster->block_group = block_group;
>  out:
>  	spin_unlock(&cluster->lock);
>  	spin_unlock(&ctl->tree_lock);
> @@ -2421,11 +2235,10 @@ out:
>  void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
>  {
>  	spin_lock_init(&cluster->lock);
> -	spin_lock_init(&cluster->refill_lock);
> -	cluster->root = RB_ROOT;
> -	cluster->max_size = 0;
> +	mutex_init(&cluster->refill_lock);
>  	INIT_LIST_HEAD(&cluster->block_group_list);
>  	cluster->block_group = NULL;
> +	cluster->info = NULL;
>  }
>  
>  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
> @@ -2665,3 +2478,27 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
>  	iput(inode);
>  	return ret;
>  }
> +
> +struct btrfs_free_cluster *btrfs_alloc_free_cluster(void)
> +{
> +	struct btrfs_free_cluster *cluster;
> +
> +	cluster = kmem_cache_zalloc(btrfs_free_cluster_cachep, GFP_NOFS);
> +	if (!cluster)
> +		return NULL;
> +
> +	btrfs_init_free_cluster(cluster);
> +	return cluster;
> +}
> +
> +void btrfs_release_free_cluster(struct btrfs_free_cluster *cluster)
> +{
> +	/* return the free space in the cluster to the space cache. */
> +	mutex_lock(&cluster->refill_lock);
> +	btrfs_return_cluster_to_free_space(NULL, cluster);
> +	mutex_unlock(&cluster->refill_lock);
> +
> +	/* free the cluster. */
> +	kmem_cache_free(btrfs_free_cluster_cachep, cluster);
> +}
> +
> diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
> index c27ccba..1f7e75c 100644
> --- a/fs/btrfs/free-space-cache.h
> +++ b/fs/btrfs/free-space-cache.h
> @@ -34,6 +34,7 @@ struct btrfs_free_space_ctl {
>  	int extents_thresh;
>  	int free_extents;
>  	int total_bitmaps;
> +	int total_clusters;
>  	int unit;
>  	u64 start;
>  	struct btrfs_free_space_op *op;
> @@ -113,4 +114,16 @@ int btrfs_return_cluster_to_free_space(
>  			       struct btrfs_free_cluster *cluster);
>  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
>  			   u64 *trimmed, u64 start, u64 end, u64 minlen);
> +void btrfs_reclaim_extent_clusters(struct btrfs_block_group_cache *block_group,
> +				   u64 to_reclaim);
> +struct btrfs_free_cluster *btrfs_alloc_free_cluster(void);
> +void btrfs_release_free_cluster(struct btrfs_free_cluster *cluster);
> +static inline void btrfs_free_extent_cluster(struct extent_buffer *eb)
> +{
> +	if (!eb->cluster)
> +		return;
> +
> +	btrfs_release_free_cluster(eb->cluster);
> +	eb->cluster = NULL;
> +}
>  #endif
> diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
> index b4087e0..d501c1f 100644
> --- a/fs/btrfs/inode-map.c
> +++ b/fs/btrfs/inode-map.c
> @@ -457,6 +457,7 @@ again:
>  	spin_unlock(&root->cache_lock);
>  
>  	spin_lock(&ctl->tree_lock);
> +	WARN_ON(ctl->total_clusters);
>  	prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
>  	prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
>  	prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 7a9e01f..38a5da9 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -51,7 +51,6 @@
>  #include "tree-log.h"
>  #include "compression.h"
>  #include "locking.h"
> -#include "free-space-cache.h"
>  #include "inode-map.h"
>  
>  struct btrfs_iget_args {
> @@ -623,11 +622,10 @@ retry:
>  		trans = btrfs_join_transaction(root);
>  		BUG_ON(IS_ERR(trans));
>  		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
> -		ret = btrfs_reserve_extent(trans, root,
> -					   async_extent->compressed_size,
> -					   async_extent->compressed_size,
> -					   0, alloc_hint,
> -					   (u64)-1, &ins, 1);
> +		ret = btrfs_reserve_extent_data(trans, root,
> +						async_extent->compressed_size,
> +						async_extent->compressed_size,
> +						0, alloc_hint, (u64)-1, &ins);
>  		btrfs_end_transaction(trans, root);
>  
>  		if (ret) {
> @@ -828,9 +826,9 @@ static noinline int cow_file_range(struct inode *inode,
>  		unsigned long op;
>  
>  		cur_alloc_size = disk_num_bytes;
> -		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
> -					   root->sectorsize, 0, alloc_hint,
> -					   (u64)-1, &ins, 1);
> +		ret = btrfs_reserve_extent_data(trans, root, cur_alloc_size,
> +						root->sectorsize, 0, alloc_hint,
> +						(u64)-1, &ins);
>  		BUG_ON(ret);
>  
>  		em = alloc_extent_map();
> @@ -5409,8 +5407,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
>  	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
>  
>  	alloc_hint = get_extent_allocation_hint(inode, start, len);
> -	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
> -				   alloc_hint, (u64)-1, &ins, 1);
> +	ret = btrfs_reserve_extent_data(trans, root, len, root->sectorsize, 0,
> +					alloc_hint, (u64)-1, &ins);
>  	if (ret) {
>  		em = ERR_PTR(ret);
>  		goto out;
> @@ -7276,8 +7274,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
>  			}
>  		}
>  
> -		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
> -					   0, *alloc_hint, (u64)-1, &ins, 1);
> +		ret = btrfs_reserve_extent_data(trans, root, num_bytes,
> +						min_size, 0, *alloc_hint,
> +						(u64)-1, &ins);
>  		if (ret) {
>  			if (own_trans)
>  				btrfs_end_transaction(trans, root);
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 970977a..808203c 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -347,7 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root,
>  	if (IS_ERR(trans))
>  		return PTR_ERR(trans);
>  
> -	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
> +	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, NULL,
>  				      0, objectid, NULL, 0, 0, 0);
>  	if (IS_ERR(leaf)) {
>  		ret = PTR_ERR(leaf);
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 786639f..21d08e2 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -25,6 +25,7 @@
>  #include "print-tree.h"
>  #include "compat.h"
>  #include "tree-log.h"
> +#include "free-space-cache.h"
>  
>  /* magic values for the inode_only field in btrfs_log_inode:
>   *
> @@ -1763,6 +1764,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
>  				ret = btrfs_free_reserved_extent(root,
>  							 bytenr, blocksize);
>  				BUG_ON(ret);
> +
> +				btrfs_free_extent_cluster(next);
>  			}
>  			free_extent_buffer(next);
>  			continue;
> @@ -1832,6 +1835,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
>  						path->nodes[*level]->start,
>  						path->nodes[*level]->len);
>  				BUG_ON(ret);
> +
> +				btrfs_free_extent_cluster(next);
>  			}
>  			free_extent_buffer(path->nodes[*level]);
>  			path->nodes[*level] = NULL;
> @@ -1900,6 +1905,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
>  			ret = btrfs_free_reserved_extent(log, next->start,
>  							 next->len);
>  			BUG_ON(ret);
> +
> +			btrfs_free_extent_cluster(next);
>  		}
>  	}
>  
