On wed, 13 Feb 2013 11:13:22 -0500, Josef Bacik wrote:
> Miao made the ordered operations stuff run async, which introduced a
> deadlock where we could get somebody (sync) racing in and committing the
> transaction while a commit was already happening. The new committer would
> try and flush ordered operations which would hang waiting for the commit to
> finish because it is done asynchronously and no longer inherits the callers
> trans handle. To fix this we need to make the ordered operations list a per
> transaction list. We can get new inodes added to the ordered operation list
> by truncating them and then having another process writing to them, so this
> makes it so that anybody trying to add an ordered operation _must_ start a
> transaction in order to add itself to the list, which will keep new inodes
> from getting added to the ordered operations list after we start committing.
> This should fix the deadlock and also keeps us from doing a lot more work
> than we need to during commit. Thanks,
Firstly, thanks to deal with the bug which was introduced by my patch.
But comparing with this fix method, I prefer the following one because:
- we won't worry the similar problem if we add more work during commit
in the future.
- it is unnecessary to get a new handle and commit it if the transaction
is under the commit.
Thanks
Miao
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fc03aa6..c449cb5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -277,7 +277,8 @@ static void wait_current_trans(struct btrfs_root *root)
}
}
-static int may_wait_transaction(struct btrfs_root *root, int type)
+static int may_wait_transaction(struct btrfs_root *root, int type,
+ bool is_joined)
{
if (root->fs_info->log_root_recovering)
return 0;
@@ -285,6 +286,14 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
if (type == TRANS_USERSPACE)
return 1;
+ /*
+ * If we are ATTACH, it means we just want to catch the current
+ * transaction and commit it. So if someone is committing the
+ * current transaction now, it is very glad to wait it.
+ */
+ if (is_joined && type == TRANS_ATTACH)
+ return 1;
+
if (type == TRANS_START &&
!atomic_read(&root->fs_info->open_ioctl_trans))
return 1;
@@ -355,7 +364,7 @@ again:
if (type < TRANS_JOIN_NOLOCK)
sb_start_intwrite(root->fs_info->sb);
- if (may_wait_transaction(root, type))
+ if (may_wait_transaction(root, type, false))
wait_current_trans(root);
do {
@@ -383,16 +392,26 @@ again:
h->block_rsv = NULL;
h->orig_rsv = NULL;
h->aborted = 0;
- h->qgroup_reserved = qgroup_reserved;
+ h->qgroup_reserved = 0;
h->delayed_ref_elem.seq = 0;
h->type = type;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->blocked && may_wait_transaction(root, type)) {
- btrfs_commit_transaction(h, root);
- goto again;
+ if (cur_trans->blocked && may_wait_transaction(root, type, true)) {
+ if (cur_trans->in_commit) {
+ btrfs_end_transaction(h, root);
+ wait_current_trans(root);
+ } else {
+ btrfs_commit_transaction(h, root);
+ }
+ if (unlikely(type == TRANS_ATTACH)) {
+ ret = -ENOENT;
+ goto alloc_fail;
+ } else {
+ goto again;
+ }
}
if (num_bytes) {
@@ -401,6 +420,7 @@ again:
h->block_rsv = &root->fs_info->trans_block_rsv;
h->bytes_reserved = num_bytes;
}
+ h->qgroup_reserved = qgroup_reserved;
got_it:
btrfs_record_root_in_trans(h, root);
--
1.6.5.2
>
> Signed-off-by: Josef Bacik <jbacik@xxxxxxxxxxxx>
> ---
> fs/btrfs/ctree.h | 7 -------
> fs/btrfs/disk-io.c | 11 ++++++-----
> fs/btrfs/file.c | 15 ++++++++++++++-
> fs/btrfs/ordered-data.c | 13 ++++++++-----
> fs/btrfs/ordered-data.h | 3 ++-
> fs/btrfs/transaction.c | 5 +++--
> fs/btrfs/transaction.h | 1 +
> 7 files changed, 34 insertions(+), 21 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 0c4e4df..9f72ec8 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1408,13 +1408,6 @@ struct btrfs_fs_info {
> struct list_head delalloc_inodes;
>
> /*
> - * special rename and truncate targets that must be on disk before
> - * we're allowed to commit. This is basically the ext3 style
> - * data=ordered list.
> - */
> - struct list_head ordered_operations;
> -
> - /*
> * there is a pool of worker threads for checksumming during writes
> * and a pool for checksumming after reads. This is because readers
> * can run with FS locks held, and the writers may be waiting for
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 52573d0..6baa77d 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -56,7 +56,8 @@ static void end_workqueue_fn(struct btrfs_work *work);
> static void free_fs_root(struct btrfs_root *root);
> static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
> int read_only);
> -static void btrfs_destroy_ordered_operations(struct btrfs_root *root);
> +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
> + struct btrfs_root *root);
> static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
> static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
> struct btrfs_root *root);
> @@ -2029,7 +2030,6 @@ int open_ctree(struct super_block *sb,
> INIT_LIST_HEAD(&fs_info->dead_roots);
> INIT_LIST_HEAD(&fs_info->delayed_iputs);
> INIT_LIST_HEAD(&fs_info->delalloc_inodes);
> - INIT_LIST_HEAD(&fs_info->ordered_operations);
> INIT_LIST_HEAD(&fs_info->caching_block_groups);
> spin_lock_init(&fs_info->delalloc_lock);
> spin_lock_init(&fs_info->trans_lock);
> @@ -3538,7 +3538,8 @@ void btrfs_error_commit_super(struct btrfs_root *root)
> btrfs_cleanup_transaction(root);
> }
>
> -static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
> +static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
> + struct btrfs_root *root)
> {
> struct btrfs_inode *btrfs_inode;
> struct list_head splice;
> @@ -3548,7 +3549,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
> mutex_lock(&root->fs_info->ordered_operations_mutex);
> spin_lock(&root->fs_info->ordered_extent_lock);
>
> - list_splice_init(&root->fs_info->ordered_operations, &splice);
> + list_splice_init(&t->ordered_operations, &splice);
> while (!list_empty(&splice)) {
> btrfs_inode = list_entry(splice.next, struct btrfs_inode,
> ordered_operations);
> @@ -3829,7 +3830,7 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
> while (!list_empty(&list)) {
> t = list_entry(list.next, struct btrfs_transaction, list);
>
> - btrfs_destroy_ordered_operations(root);
> + btrfs_destroy_ordered_operations(t, root);
>
> btrfs_destroy_ordered_extents(root);
>
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 237bf92..848442c 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1629,7 +1629,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
> */
> if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
> &BTRFS_I(inode)->runtime_flags)) {
> - btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
> + struct btrfs_trans_handle *trans;
> + struct btrfs_root *root = BTRFS_I(inode)->root;
> +
> + /*
> + * We need to block on a committing transaction to keep us from
> + * throwing a ordered operation on to the list and causing
> + * something like sync to deadlock trying to flush out this
> + * inode.
> + */
> + trans = btrfs_start_transaction(root, 0);
> + if (IS_ERR(trans))
> + return PTR_ERR(trans);
> + btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
> + btrfs_end_transaction(trans, root);
> if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
> filemap_flush(inode->i_mapping);
> }
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index cd8f6e9..346dca0 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -612,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput)
> * extra check to make sure the ordered operation list really is empty
> * before we return
> */
> -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
> +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
> + struct btrfs_root *root, int wait)
> {
> struct btrfs_inode *btrfs_inode;
> struct inode *inode;
> + struct btrfs_transaction *cur_trans = trans->transaction;
> struct list_head splice;
> struct list_head works;
> struct btrfs_delalloc_work *work, *next;
> @@ -626,7 +628,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
>
> mutex_lock(&root->fs_info->ordered_operations_mutex);
> spin_lock(&root->fs_info->ordered_extent_lock);
> - list_splice_init(&root->fs_info->ordered_operations, &splice);
> + list_splice_init(&cur_trans->ordered_operations, &splice);
> while (!list_empty(&splice)) {
> btrfs_inode = list_entry(splice.next, struct btrfs_inode,
> ordered_operations);
> @@ -643,7 +645,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
>
> if (!wait)
> list_add_tail(&BTRFS_I(inode)->ordered_operations,
> - &root->fs_info->ordered_operations);
> + &cur_trans->ordered_operations);
> spin_unlock(&root->fs_info->ordered_extent_lock);
>
> work = btrfs_alloc_delalloc_work(inode, wait, 1);
> @@ -653,7 +655,7 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
> list_add_tail(&btrfs_inode->ordered_operations,
> &splice);
> list_splice_tail(&splice,
> - &root->fs_info->ordered_operations);
> + &cur_trans->ordered_operations);
> spin_unlock(&root->fs_info->ordered_extent_lock);
> ret = -ENOMEM;
> goto out;
> @@ -1033,6 +1035,7 @@ out:
> void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
> struct btrfs_root *root, struct inode *inode)
> {
> + struct btrfs_transaction *cur_trans = trans->transaction;
> u64 last_mod;
>
> last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
> @@ -1047,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
> spin_lock(&root->fs_info->ordered_extent_lock);
> if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
> list_add_tail(&BTRFS_I(inode)->ordered_operations,
> - &root->fs_info->ordered_operations);
> + &cur_trans->ordered_operations);
> }
> spin_unlock(&root->fs_info->ordered_extent_lock);
> }
> diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
> index d523dbd..267ac99 100644
> --- a/fs/btrfs/ordered-data.h
> +++ b/fs/btrfs/ordered-data.h
> @@ -197,7 +197,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
> int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
> struct btrfs_ordered_extent *ordered);
> int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
> -int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
> +int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
> + struct btrfs_root *root, int wait);
> void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
> struct btrfs_root *root,
> struct inode *inode);
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 49c79b3..149ce09 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -157,6 +157,7 @@ loop:
> spin_lock_init(&cur_trans->delayed_refs.lock);
>
> INIT_LIST_HEAD(&cur_trans->pending_snapshots);
> + INIT_LIST_HEAD(&cur_trans->ordered_operations);
> list_add_tail(&cur_trans->list, &fs_info->trans_list);
> extent_io_tree_init(&cur_trans->dirty_pages,
> fs_info->btree_inode->i_mapping);
> @@ -1457,7 +1458,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
> * it here and no for sure that nothing new will be added
> * to the list
> */
> - ret = btrfs_run_ordered_operations(root, 1);
> + ret = btrfs_run_ordered_operations(trans, root, 1);
>
> return ret;
> }
> @@ -1480,7 +1481,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
> int should_grow = 0;
> unsigned long now = get_seconds();
>
> - ret = btrfs_run_ordered_operations(root, 0);
> + ret = btrfs_run_ordered_operations(trans, root, 0);
> if (ret) {
> btrfs_abort_transaction(trans, root, ret);
> btrfs_end_transaction(trans, root);
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index 4662821..3f772fd 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -43,6 +43,7 @@ struct btrfs_transaction {
> wait_queue_head_t writer_wait;
> wait_queue_head_t commit_wait;
> struct list_head pending_snapshots;
> + struct list_head ordered_operations;
> struct btrfs_delayed_ref_root delayed_refs;
> int aborted;
> };
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html