I'm a little confused about what "avg_delayed_ref_runtime" means.
In __btrfs_run_delayed_refs(), "avg_delayed_ref_runtime" is set to the
runtime of all delayed refs processed in the current transaction commit.
However, in btrfs_should_throttle_delayed_refs(), we rely on the
following condition to decide whether to throttle refs or not:
*****************************************
avg_runtime = fs_info->avg_delayed_ref_runtime;
if (num_entries * avg_runtime >= NSEC_PER_SEC)
return 1;
*****************************************
It looks like "avg_delayed_ref_runtime" is used here as the average
runtime of each processed delayed ref. So what does it really mean?
Thanks,
Kai
2014-01-24 2:07 GMT+08:00 Josef Bacik <jbacik@xxxxxx>:
> On one of our gluster clusters we noticed some pretty big lag spikes. This
> turned out to be because our transaction commit was taking like 3 minutes to
> complete. This is because we have like 30 gigs of metadata, so our global
> reserve would end up being the max which is like 512 mb. So our throttling code
> would allow a ridiculous amount of delayed refs to build up and then they'd all
> get run at transaction commit time, and for a cold mounted file system that
> could take up to 3 minutes to run. So fix the throttling to be based on both
> the size of the global reserve and how long it takes us to run delayed refs.
> This patch tracks the time it takes to run delayed refs and then only allows 1
> seconds worth of outstanding delayed refs at a time. This way it will auto-tune
> itself from cold cache up to when everything is in memory and it no longer has
> to go to disk. This makes our transaction commits take much less time to run.
> Thanks,
>
> Signed-off-by: Josef Bacik <jbacik@xxxxxx>
> ---
> fs/btrfs/ctree.h | 3 +++
> fs/btrfs/disk-io.c | 2 +-
> fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++++++++++++++-
> fs/btrfs/transaction.c | 4 ++--
> 4 files changed, 46 insertions(+), 4 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 3cebb4a..ca6bcc3 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1360,6 +1360,7 @@ struct btrfs_fs_info {
>
> u64 generation;
> u64 last_trans_committed;
> + u64 avg_delayed_ref_runtime;
>
> /*
> * this is updated to the current trans every time a full commit
> @@ -3172,6 +3173,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
>
> int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
> struct btrfs_root *root);
> +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
> + struct btrfs_root *root);
> void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
> int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
> struct btrfs_root *root, unsigned long count);
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index ed23127..f0e7bbe 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb,
> fs_info->free_chunk_space = 0;
> fs_info->tree_mod_log = RB_ROOT;
> fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
> -
> + fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
> /* readahead state */
> INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
> spin_lock_init(&fs_info->reada_lock);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index c77156c..b532259 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2322,8 +2322,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
> struct btrfs_delayed_ref_head *locked_ref = NULL;
> struct btrfs_delayed_extent_op *extent_op;
> struct btrfs_fs_info *fs_info = root->fs_info;
> + ktime_t start = ktime_get();
> int ret;
> unsigned long count = 0;
> + unsigned long actual_count = 0;
> int must_insert_reserved = 0;
>
> delayed_refs = &trans->transaction->delayed_refs;
> @@ -2452,6 +2454,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
> &delayed_refs->href_root);
> spin_unlock(&delayed_refs->lock);
> } else {
> + actual_count++;
> ref->in_tree = 0;
> rb_erase(&ref->rb_node, &locked_ref->ref_root);
> }
> @@ -2502,6 +2505,26 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
> count++;
> cond_resched();
> }
> +
> + /*
> + * We don't want to include ref heads since we can have empty ref heads
> + * and those will drastically skew our runtime down since we just do
> + * accounting, no actual extent tree updates.
> + */
> + if (actual_count > 0) {
> + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
> + u64 avg;
> +
> + /*
> + * We weigh the current average higher than our current runtime
> + * to avoid large swings in the average.
> + */
> + spin_lock(&delayed_refs->lock);
> + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
> + avg = div64_u64(avg, 4);
> + fs_info->avg_delayed_ref_runtime = avg;
> + spin_unlock(&delayed_refs->lock);
> + }
> return 0;
> }
>
> @@ -2600,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
> return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
> }
>
> -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
> +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
> struct btrfs_root *root)
> {
> struct btrfs_block_rsv *global_rsv;
> @@ -2629,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
> + struct btrfs_root *root)
> +{
> + struct btrfs_fs_info *fs_info = root->fs_info;
> + u64 num_entries =
> + atomic_read(&trans->transaction->delayed_refs.num_entries);
> + u64 avg_runtime;
> +
> + smp_mb();
> + avg_runtime = fs_info->avg_delayed_ref_runtime;
> + if (num_entries * avg_runtime >= NSEC_PER_SEC)
> + return 1;
> +
> + return btrfs_check_space_for_delayed_refs(trans, root);
> +}
> +
> /*
> * this starts processing the delayed reference count updates and
> * extent insertions we have queued up so far. count can be
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index fd14464..5e2bfda 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -645,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
> struct btrfs_root *root)
> {
> if (root->fs_info->global_block_rsv.space_info->full &&
> - btrfs_should_throttle_delayed_refs(trans, root))
> + btrfs_check_space_for_delayed_refs(trans, root))
> return 1;
>
> return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
> @@ -710,7 +710,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
>
> trans->delayed_ref_updates = 0;
> if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
> - cur = max_t(unsigned long, cur, 1);
> + cur = max_t(unsigned long, cur, 32);
> trans->delayed_ref_updates = 0;
> btrfs_run_delayed_refs(trans, root, cur);
> }
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html