On Fri, Dec 19, 2014 at 01:41:35PM -0500, Josef Bacik wrote:
> Currently any time we try to update the block groups on disk we will walk _all_
> block groups and check for the ->dirty flag to see if it is set. This function
> can get called several times during a commit. So if you have several terabytes
> of data you will be a very sad panda, as we will loop through _all_ of the block
> groups several times, which makes the commit take a while and in turn slows down
> the rest of the file system operations.
>
> This patch introduces a dirty list that block groups get added to the first
> time we dirty them. Then we simply update any
> block groups that have been dirtied since the last time we called
> btrfs_write_dirty_block_groups. This allows us to clean up how we write the
> free space cache out so it is much cleaner. Thanks,
>
> Signed-off-by: Josef Bacik <jbacik@xxxxxx>
> ---
> V1->V2: Don't unconditionally take the dirty bg list lock in update_block_group,
> only do it if our dirty_bg list is empty.
>
> fs/btrfs/ctree.h | 5 +-
> fs/btrfs/extent-tree.c | 169 ++++++++++++++------------------------------
> fs/btrfs/free-space-cache.c | 8 ++-
> fs/btrfs/transaction.c | 14 ++--
> fs/btrfs/transaction.h | 2 +
> 5 files changed, 74 insertions(+), 124 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index b62315b..e5bc509 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1237,7 +1237,6 @@ enum btrfs_disk_cache_state {
> BTRFS_DC_ERROR = 1,
> BTRFS_DC_CLEAR = 2,
> BTRFS_DC_SETUP = 3,
> - BTRFS_DC_NEED_WRITE = 4,
> };
>
> struct btrfs_caching_control {
> @@ -1275,7 +1274,6 @@ struct btrfs_block_group_cache {
> unsigned long full_stripe_len;
>
> unsigned int ro:1;
> - unsigned int dirty:1;
> unsigned int iref:1;
>
> int disk_cache_state;
> @@ -1309,6 +1307,9 @@ struct btrfs_block_group_cache {
>
> /* For read-only block groups */
> struct list_head ro_list;
> +
> + /* For dirty block groups */
> + struct list_head dirty_list;
> };
>
> /* delayed seq elem */
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 74eb29d..71a9752 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -74,8 +74,9 @@ enum {
> RESERVE_ALLOC_NO_ACCOUNT = 2,
> };
>
> -static int update_block_group(struct btrfs_root *root,
> - u64 bytenr, u64 num_bytes, int alloc);
> +static int update_block_group(struct btrfs_trans_handle *trans,
> + struct btrfs_root *root, u64 bytenr,
> + u64 num_bytes, int alloc);
> static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
> struct btrfs_root *root,
> u64 bytenr, u64 num_bytes, u64 parent,
> @@ -3315,120 +3316,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
> struct btrfs_root *root)
> {
> struct btrfs_block_group_cache *cache;
> - int err = 0;
> + struct btrfs_transaction *cur_trans = trans->transaction;
> + int ret = 0;
> struct btrfs_path *path;
> - u64 last = 0;
> +
> + if (list_empty(&cur_trans->dirty_bgs))
> + return 0;
>
> path = btrfs_alloc_path();
> if (!path)
> return -ENOMEM;
>
> -again:
> - while (1) {
> - cache = btrfs_lookup_first_block_group(root->fs_info, last);
> - while (cache) {
> - if (cache->disk_cache_state == BTRFS_DC_CLEAR)
> - break;
> - cache = next_block_group(root, cache);
> - }
> - if (!cache) {
> - if (last == 0)
> - break;
> - last = 0;
> - continue;
> - }
> - err = cache_save_setup(cache, trans, path);
> - last = cache->key.objectid + cache->key.offset;
> - btrfs_put_block_group(cache);
> - }
> -
> - while (1) {
> - if (last == 0) {
> - err = btrfs_run_delayed_refs(trans, root,
> - (unsigned long)-1);
> - if (err) /* File system offline */
> - goto out;
> - }
> -
> - cache = btrfs_lookup_first_block_group(root->fs_info, last);
> - while (cache) {
> - if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
> - btrfs_put_block_group(cache);
> - goto again;
> - }
> -
> - if (cache->dirty)
> - break;
> - cache = next_block_group(root, cache);
> - }
> - if (!cache) {
> - if (last == 0)
> - break;
> - last = 0;
> - continue;
> - }
> -
> - if (cache->disk_cache_state == BTRFS_DC_SETUP)
> - cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
> - cache->dirty = 0;
> - last = cache->key.objectid + cache->key.offset;
> -
> - err = write_one_cache_group(trans, root, path, cache);
> - btrfs_put_block_group(cache);
> - if (err) /* File system offline */
> - goto out;
> - }
> -
> - while (1) {
> - /*
> - * I don't think this is needed since we're just marking our
> - * preallocated extent as written, but just in case it can't
> - * hurt.
> - */
> - if (last == 0) {
> - err = btrfs_run_delayed_refs(trans, root,
> - (unsigned long)-1);
> - if (err) /* File system offline */
> - goto out;
> - }
> -
> - cache = btrfs_lookup_first_block_group(root->fs_info, last);
> - while (cache) {
> - /*
> - * Really this shouldn't happen, but it could if we
> - * couldn't write the entire preallocated extent and
> - * splitting the extent resulted in a new block.
> - */
> - if (cache->dirty) {
> - btrfs_put_block_group(cache);
> - goto again;
> - }
> - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
> - break;
> - cache = next_block_group(root, cache);
> - }
> - if (!cache) {
> - if (last == 0)
> - break;
> - last = 0;
> - continue;
> - }
> -
> - err = btrfs_write_out_cache(root, trans, cache, path);
> -
> - /*
> - * If we didn't have an error then the cache state is still
> - * NEED_WRITE, so we can set it to WRITTEN.
> - */
> - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
> - cache->disk_cache_state = BTRFS_DC_WRITTEN;
> - last = cache->key.objectid + cache->key.offset;
> + /*
> + * We don't need the lock here since we are protected by the transaction
> + * commit. We want to do the cache_save_setup first and then run the
> + * delayed refs to make sure we have the best chance at doing this all
> + * in one shot.
> + */
> + while (!list_empty(&cur_trans->dirty_bgs)) {
> + cache = list_first_entry(&cur_trans->dirty_bgs,
> + struct btrfs_block_group_cache,
> + dirty_list);
> + list_del_init(&cache->dirty_list);
> + if (cache->disk_cache_state == BTRFS_DC_CLEAR)
> + cache_save_setup(cache, trans, path);
> + if (!ret)
> + ret = btrfs_run_delayed_refs(trans, root,
> + (unsigned long) -1);
> + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
> + btrfs_write_out_cache(root, trans, cache, path);
> + if (!ret)
> + ret = write_one_cache_group(trans, root, path, cache);
> btrfs_put_block_group(cache);
> }
> -out:
>
> btrfs_free_path(path);
> - return err;
> + return ret;
> }
>
> int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
> @@ -5375,8 +5298,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
> btrfs_free_reserved_data_space(inode, num_bytes);
> }
>
> -static int update_block_group(struct btrfs_root *root,
> - u64 bytenr, u64 num_bytes, int alloc)
> +static int update_block_group(struct btrfs_trans_handle *trans,
> + struct btrfs_root *root, u64 bytenr,
> + u64 num_bytes, int alloc)
> {
> struct btrfs_block_group_cache *cache = NULL;
> struct btrfs_fs_info *info = root->fs_info;
> @@ -5414,6 +5338,16 @@ static int update_block_group(struct btrfs_root *root,
> if (!alloc && cache->cached == BTRFS_CACHE_NO)
> cache_block_group(cache, 1);
>
> + if (list_empty(&cache->dirty_list)) {
> + spin_lock(&trans->transaction->dirty_bgs_lock);
> + if (list_empty(&cache->dirty_list)) {
> + list_add_tail(&cache->dirty_list,
> + &trans->transaction->dirty_bgs);
> + btrfs_get_block_group(cache);
> + }
> + spin_unlock(&trans->transaction->dirty_bgs_lock);
> + }
> +
> byte_in_group = bytenr - cache->key.objectid;
> WARN_ON(byte_in_group > cache->key.offset);
>
> @@ -5424,7 +5358,6 @@ static int update_block_group(struct btrfs_root *root,
> cache->disk_cache_state < BTRFS_DC_CLEAR)
> cache->disk_cache_state = BTRFS_DC_CLEAR;
>
> - cache->dirty = 1;
> old_val = btrfs_block_group_used(&cache->item);
> num_bytes = min(total, cache->key.offset - byte_in_group);
> if (alloc) {
> @@ -6102,7 +6035,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
> }
> }
>
> - ret = update_block_group(root, bytenr, num_bytes, 0);
> + ret = update_block_group(trans, root, bytenr, num_bytes, 0);
> if (ret) {
> btrfs_abort_transaction(trans, extent_root, ret);
> goto out;
> @@ -7062,7 +6995,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
> if (ret)
> return ret;
>
> - ret = update_block_group(root, ins->objectid, ins->offset, 1);
> + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
> if (ret) { /* -ENOENT, logic error */
> btrfs_err(fs_info, "update block group failed for %llu %llu",
> ins->objectid, ins->offset);
> @@ -7151,7 +7084,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> - ret = update_block_group(root, ins->objectid, root->nodesize, 1);
> + ret = update_block_group(trans, root, ins->objectid, root->nodesize,
> + 1);
> if (ret) { /* -ENOENT, logic error */
> btrfs_err(fs_info, "update block group failed for %llu %llu",
> ins->objectid, ins->offset);
> @@ -9003,6 +8937,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
> INIT_LIST_HEAD(&cache->cluster_list);
> INIT_LIST_HEAD(&cache->bg_list);
> INIT_LIST_HEAD(&cache->ro_list);
> + INIT_LIST_HEAD(&cache->dirty_list);
> btrfs_init_free_space_ctl(cache);
>
> return cache;
> @@ -9065,9 +9000,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
> * b) Setting 'dirty flag' makes sure that we flush
> * the new space cache info onto disk.
> */
> - cache->disk_cache_state = BTRFS_DC_CLEAR;
> if (btrfs_test_opt(root, SPACE_CACHE))
> - cache->dirty = 1;
> + cache->disk_cache_state = BTRFS_DC_CLEAR;
> }
>
> read_extent_buffer(leaf, &cache->item,
> @@ -9427,6 +9361,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> if (block_group->cached == BTRFS_CACHE_STARTED)
> wait_block_group_cache_done(block_group);
>
> + spin_lock(&trans->transaction->dirty_bgs_lock);
> + if (!list_empty(&block_group->dirty_list)) {
> + list_del_init(&block_group->dirty_list);
> + btrfs_put_block_group(block_group);
> + }
> + spin_unlock(&trans->transaction->dirty_bgs_lock);
> +
> btrfs_remove_free_space_cache(block_group);
>
> spin_lock(&block_group->space_info->lock);
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 3384819..e85de8c 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -1210,6 +1210,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
> struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
> struct inode *inode;
> int ret = 0;
> + enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
>
> root = root->fs_info->tree_root;
>
> @@ -1233,9 +1234,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
> ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
> path, block_group->key.objectid);
> if (ret) {
> - spin_lock(&block_group->lock);
> - block_group->disk_cache_state = BTRFS_DC_ERROR;
> - spin_unlock(&block_group->lock);
> + dcs = BTRFS_DC_ERROR;
> ret = 0;
> #ifdef DEBUG
> btrfs_err(root->fs_info,
> @@ -1244,6 +1243,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
> #endif
> }
>
> + spin_lock(&block_group->lock);
> + block_group->disk_cache_state = dcs;
> + spin_unlock(&block_group->lock);
> iput(inode);
> return ret;
> }
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 6b1ac53..4b3f999 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -220,6 +220,8 @@ loop:
> INIT_LIST_HEAD(&cur_trans->pending_snapshots);
> INIT_LIST_HEAD(&cur_trans->pending_chunks);
> INIT_LIST_HEAD(&cur_trans->switch_commits);
> + INIT_LIST_HEAD(&cur_trans->dirty_bgs);
> + spin_lock_init(&cur_trans->dirty_bgs_lock);
> list_add_tail(&cur_trans->list, &fs_info->trans_list);
> extent_io_tree_init(&cur_trans->dirty_pages,
> fs_info->btree_inode->i_mapping);
> @@ -957,7 +959,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
> while (1) {
> old_root_bytenr = btrfs_root_bytenr(&root->root_item);
> if (old_root_bytenr == root->node->start &&
> - old_root_used == btrfs_root_used(&root->root_item))
> + old_root_used == btrfs_root_used(&root->root_item) &&
> + (!extent_root ||
> + list_empty(&trans->transaction->dirty_bgs)))
> break;
>
> btrfs_set_root_node(&root->root_item, root->node);
> @@ -976,6 +980,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
> ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
> if (ret)
> return ret;
> + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
> + if (ret)
> + return ret;
Is the second btrfs_run_delayed_refs() meant to process the extent tree's delayed refs that were produced by the first one?
Thanks,
-liubo
> }
>
> return 0;
> @@ -996,10 +1003,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
> struct extent_buffer *eb;
> int ret;
>
> - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
> - if (ret)
> - return ret;
> -
> eb = btrfs_lock_root_node(fs_info->tree_root);
> ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
> 0, &eb);
> @@ -1897,6 +1900,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
> switch_commit_roots(cur_trans, root->fs_info);
>
> assert_qgroups_uptodate(trans);
> + ASSERT(list_empty(&cur_trans->dirty_bgs));
> update_super_roots(root);
>
> btrfs_set_super_log_root(root->fs_info->super_copy, 0);
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index d8f40e1..2aa345e 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -57,6 +57,8 @@ struct btrfs_transaction {
> struct list_head pending_snapshots;
> struct list_head pending_chunks;
> struct list_head switch_commits;
> + struct list_head dirty_bgs;
> + spinlock_t dirty_bgs_lock;
> struct btrfs_delayed_ref_root delayed_refs;
> int aborted;
> };
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html