On Fri, Oct 21, 2016 at 05:05:07PM +0800, Wang Xiaoguang wrote:
> This issue was found when I tried to delete a heavily reflinked file.
> While such a file is being deleted, other transaction operations get no
> chance to make progress: for example, start_transaction() blocks in
> wait_current_trans(root) for a long time, sometimes even triggering
> soft lockups, and deleting such a heavily reflinked file takes a very
> long time, often hundreds of seconds. perf top reports:
>
> PerfTop: 7416 irqs/sec kernel:99.8% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
> ---------------------------------------------------------------------------------------
> 84.37% [btrfs] [k] __btrfs_run_delayed_refs.constprop.80
> 11.02% [kernel] [k] delay_tsc
> 0.79% [kernel] [k] _raw_spin_unlock_irq
> 0.78% [kernel] [k] _raw_spin_unlock_irqrestore
> 0.45% [kernel] [k] do_raw_spin_lock
> 0.18% [kernel] [k] __slab_alloc
> __btrfs_run_delayed_refs() takes most of the cpu time. After some
> debugging I found that select_delayed_ref() causes this: in our case
> the delayed ref head is full of BTRFS_DROP_DELAYED_REF nodes, but
> select_delayed_ref() first iterates the whole node list looking for
> BTRFS_ADD_DELAYED_REF nodes, which is a disaster here and wastes a
> great deal of time.
>
> To fix this issue, introduce a new ref_add_list in struct
> btrfs_delayed_ref_head; in select_delayed_ref(), if that list is not
> empty, we can use its nodes directly. With this patch it takes only
> about 10~15 seconds to delete the same file. perf top now reports:
>
> PerfTop: 2734 irqs/sec kernel:99.5% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
> ----------------------------------------------------------------------------------------
>
> 20.74% [kernel] [k] _raw_spin_unlock_irqrestore
> 16.33% [kernel] [k] __slab_alloc
> 5.41% [kernel] [k] lock_acquired
> 4.42% [kernel] [k] lock_acquire
> 4.05% [kernel] [k] lock_release
> 3.37% [kernel] [k] _raw_spin_unlock_irq
>
> This patch also helps for normal files: at the very least we no longer
> iterate the whole list to find BTRFS_ADD_DELAYED_REF nodes (see the
> sketch after this paragraph).
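As an aside for readers: here is a minimal userspace sketch of the
dual-list idea (my own illustration, not the kernel code). Every node
stays on ref_list; ADD nodes are additionally linked on ref_add_list, so
picking an ADD ref is O(1) instead of a scan over what may be millions
of DROP refs:

#include <stdio.h>
#include <stddef.h>

/* tiny stand-ins for the kernel's <linux/list.h> primitives */
struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}
/* like the kernel's list_del_init(): unlink and leave the entry empty */
static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next; n->next->prev = n->prev;
	INIT_LIST_HEAD(n);
}
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

enum { DROP_REF, ADD_REF };

struct ref_node {
	int action;
	struct list_head list;      /* always on head->ref_list */
	struct list_head add_list;  /* on head->ref_add_list iff ADD */
};

struct ref_head {
	struct list_head ref_list;
	struct list_head ref_add_list;
};

/* O(1) analogue of the patched select_delayed_ref() */
static struct ref_node *select_ref(struct ref_head *head)
{
	if (!list_empty(&head->ref_add_list))
		return container_of(head->ref_add_list.next,
				    struct ref_node, add_list);
	if (!list_empty(&head->ref_list))
		return container_of(head->ref_list.next,
				    struct ref_node, list);
	return NULL;
}

/* analogue of drop_delayed_ref(): unlink from both lists */
static void drop_ref(struct ref_node *ref)
{
	list_del_init(&ref->list);
	if (!list_empty(&ref->add_list))
		list_del_init(&ref->add_list);
}

int main(void)
{
	struct ref_head head;
	struct ref_node drop = { .action = DROP_REF };
	struct ref_node add = { .action = ADD_REF };

	INIT_LIST_HEAD(&head.ref_list);
	INIT_LIST_HEAD(&head.ref_add_list);
	INIT_LIST_HEAD(&drop.add_list);
	INIT_LIST_HEAD(&add.add_list);

	list_add_tail(&drop.list, &head.ref_list);
	list_add_tail(&add.list, &head.ref_list);
	list_add_tail(&add.add_list, &head.ref_add_list);

	/* the ADD node is found without scanning past any DROP nodes */
	printf("first pick: action %d (1 == ADD)\n", select_ref(&head)->action);
	drop_ref(select_ref(&head));
	printf("next pick:  action %d (0 == DROP)\n", select_ref(&head)->action);
	return 0;
}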
>
> Signed-off-by: Wang Xiaoguang <wangxg.fnst@xxxxxxxxxxxxxx>
> ---
> fs/btrfs/delayed-ref.c | 14 ++++++++++++++
> fs/btrfs/delayed-ref.h | 8 ++++++++
> fs/btrfs/disk-io.c | 2 ++
> fs/btrfs/extent-tree.c | 15 +++++++++------
> 4 files changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 8d93854..39c28e0 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -189,6 +189,8 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
> } else {
> assert_spin_locked(&head->lock);
> list_del(&ref->list);
> + if (!list_empty(&ref->add_list))
> + list_del(&ref->add_list);
> }
> ref->in_tree = 0;
> btrfs_put_delayed_ref(ref);
> @@ -431,6 +433,11 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
> exist->action = ref->action;
> mod = -exist->ref_mod;
> exist->ref_mod = ref->ref_mod;
> + if (ref->action == BTRFS_ADD_DELAYED_REF)
> + list_add_tail(&exist->add_list,
> + &href->ref_add_list);
> + else if (!list_empty(&exist->add_list))
> + list_del(&exist->add_list);
->action is either BTRFS_ADD_DELAYED_REF or BTRFS_DROP_DELAYED_REF, so
in the 'else' branch (!list_empty(&exist->add_list)) is always true.
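Put differently, the patch maintains the invariant that a node is linked
on ref_add_list exactly when its action is BTRFS_ADD_DELAYED_REF, so
under that invariant the branch could hypothetically be written without
the check (the posted patch keeps it as a cheap defensive test):

	/* hypothetical simplification, not the posted patch; at this
	 * point exist->action has already been overwritten with
	 * ref->action, and the old action was necessarily the
	 * opposite one */
	if (ref->action == BTRFS_ADD_DELAYED_REF)
		list_add_tail(&exist->add_list, &href->ref_add_list);
	else
		list_del(&exist->add_list);	/* old action was ADD */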
> } else
> mod = -ref->ref_mod;
> }
> @@ -444,6 +451,8 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
>
> add_tail:
> list_add_tail(&ref->list, &href->ref_list);
> + if (ref->action == BTRFS_ADD_DELAYED_REF)
> + list_add_tail(&ref->add_list, &href->ref_add_list);
> atomic_inc(&root->num_entries);
> trans->delayed_ref_updates++;
> spin_unlock(&href->lock);
> @@ -590,6 +599,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
> head_ref->must_insert_reserved = must_insert_reserved;
> head_ref->is_data = is_data;
> INIT_LIST_HEAD(&head_ref->ref_list);
> + INIT_LIST_HEAD(&head_ref->ref_add_list);
> head_ref->processing = 0;
> head_ref->total_ref_mod = count_mod;
> head_ref->qgroup_reserved = 0;
> @@ -671,6 +681,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
> ref->is_head = 0;
> ref->in_tree = 1;
> ref->seq = seq;
> + INIT_LIST_HEAD(&ref->list);
> + INIT_LIST_HEAD(&ref->add_list);
>
> full_ref = btrfs_delayed_node_to_tree_ref(ref);
> full_ref->parent = parent;
> @@ -726,6 +738,8 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
> ref->is_head = 0;
> ref->in_tree = 1;
> ref->seq = seq;
> + INIT_LIST_HEAD(&ref->list);
> + INIT_LIST_HEAD(&ref->add_list);
>
> full_ref = btrfs_delayed_node_to_data_ref(ref);
> full_ref->parent = parent;
> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> index 43f3629..dba9784 100644
> --- a/fs/btrfs/delayed-ref.h
> +++ b/fs/btrfs/delayed-ref.h
> @@ -42,6 +42,12 @@ struct btrfs_delayed_ref_node {
>
> /*data/tree ref use list, stored in ref_head->ref_list. */
> struct list_head list;
> + /*
> + * If action is BTRFS_ADD_DELAYED_REF, also link this node to
> + * ref_head->ref_add_list, then we do not need to iterate the
> + * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
> + */
> + struct list_head add_list;
>
> /* the starting bytenr of the extent */
> u64 bytenr;
> @@ -99,6 +105,8 @@ struct btrfs_delayed_ref_head {
>
> spinlock_t lock;
> struct list_head ref_list;
> + /* accumulate BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
> + struct list_head ref_add_list;
>
> struct rb_node href_node;
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 3a57f99..bc2edaf 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -4354,6 +4354,8 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
> list) {
> ref->in_tree = 0;
> list_del(&ref->list);
> + if (!list_empty(&ref->add_list))
> + list_del(&ref->add_list);
> atomic_dec(&delayed_refs->num_entries);
> btrfs_put_delayed_ref(ref);
> }
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 210c94a..1284222 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2454,13 +2454,14 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
> * the extent item from the extent tree, when there still are references
> * to add, which would fail because they would not find the extent item.
> */
> - list_for_each_entry(ref, &head->ref_list, list) {
> - if (ref->action == BTRFS_ADD_DELAYED_REF)
> - return ref;
> - }
> + if (!list_empty(&head->ref_add_list))
> + return list_entry(head->ref_add_list.next,
> + struct btrfs_delayed_ref_node, add_list);
>
> - return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
> - list);
> + ref = list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
> + list);
> + WARN_ON(!list_empty(&ref->add_list));
I'd prefer ASSERT() here, since this is only for developers to
troubleshoot.
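For context, WARN_ON() fires in every build, while btrfs's ASSERT() is
compiled out unless CONFIG_BTRFS_ASSERT is enabled; from fs/btrfs/ctree.h
(approximately, as of this kernel):

	#ifdef CONFIG_BTRFS_ASSERT
	#define ASSERT(expr)	\
		(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
	#else
	#define ASSERT(expr)	((void)0)
	#endif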
Others look good to me.
Reviewed-by: Liu Bo <bo.li.liu@xxxxxxxxxx>
I had a patch[1] while working on dedupe back then that tried to resolve
the same problem; somehow it didn't make it into this retry of dedupe.
[1]: https://patchwork.kernel.org/patch/3959021/
Thanks,
-liubo
> + return ref;
> }
>
> /*
> @@ -2620,6 +2621,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
> actual_count++;
> ref->in_tree = 0;
> list_del(&ref->list);
> + if (!list_empty(&ref->add_list))
> + list_del(&ref->add_list);
> }
> atomic_dec(&delayed_refs->num_entries);
>
> --
> 2.9.0