The following deadlock may happen when doing reservation for metadata:
Task0 Flush thread Task1
start_transaction()
shrink_delalloc()
writeback_inodes_sb_nr()
wait for flush thread
end.
btrfs_writepages()
cow_file_range()
btrfs_commit_transaction
wait num_writer == 1
(wait Task0 end
transaction)
start_transaction()
wait trans commit
end
Task0 -> Flush thread -> Task1 -> Task0
Fix the above deadlock by doing reservation before the trans handle has
been joined into the transaction.
Signed-off-by: Miao Xie <miaox@xxxxxxxxxxxxxx>
---
fs/btrfs/extent-tree.c | 25 +++++++++++++----------
fs/btrfs/transaction.c | 51 +++++++++++++++++++++++++++++++++--------------
2 files changed, 50 insertions(+), 26 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b42efc2..eefa432 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3290,9 +3290,11 @@ again:
/*
* shrink metadata reservation for delalloc
+ *
+ * NOTE: This function can not run in the transaction context, or deadlock
+ * will happen.
*/
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 to_reclaim, int sync)
+static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim)
{
struct btrfs_block_rsv *block_rsv;
struct btrfs_space_info *space_info;
@@ -3338,9 +3340,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
if (reserved == 0 || reclaimed >= max_reclaim)
break;
- if (trans && trans->transaction->blocked)
- return -EAGAIN;
-
time_left = schedule_timeout_interruptible(1);
/* We were interrupted, exit */
@@ -3449,12 +3448,16 @@ again:
/*
* We do synchronous shrinking since we don't actually unreserve
* metadata until after the IO is completed.
+ *
+ * shrink_delalloc() can not run in the transaction context.
*/
- ret = shrink_delalloc(trans, root, num_bytes, 1);
- if (ret > 0)
- return 0;
- else if (ret < 0)
- goto out;
+ if (!trans || !trans->transaction) {
+ ret = shrink_delalloc(root, num_bytes);
+ if (ret > 0)
+ return 0;
+ else if (ret < 0)
+ goto out;
+ }
/*
* So if we were overcommitted it's possible that somebody else flushed
@@ -3989,7 +3992,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
block_rsv_add_bytes(block_rsv, to_reserve, 1);
if (block_rsv->size > 512 * 1024 * 1024)
- shrink_delalloc(NULL, root, to_reserve, 0);
+ shrink_delalloc(root, to_reserve);
return 0;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2b3590b..173b15d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -222,9 +222,31 @@ again:
if (!h)
return ERR_PTR(-ENOMEM);
+ h->transid = 0;
+ h->transaction = NULL;
+ h->blocks_used = 0;
+ h->bytes_reserved = 0;
+ h->delayed_ref_updates = 0;
+ h->use_count = 1;
+ h->block_rsv = NULL;
+ h->orig_rsv = NULL;
+
if (may_wait_transaction(root, type))
wait_current_trans(root);
+ if (num_items > 0) {
+ /*
+ * Now the handle has not been joined into the transaction,
+ * so btrfs will shrink metadata reservation for delalloc if
+ * there is no enough free space to reserve.
+ */
+ ret = btrfs_trans_reserve_metadata(h, root, num_items);
+ if (ret < 0 && ret != -EAGAIN) {
+ kmem_cache_free(btrfs_trans_handle_cachep, h);
+ return ERR_PTR(ret);
+ }
+ }
+
do {
ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
if (ret == -EBUSY)
@@ -232,6 +254,7 @@ again:
} while (ret == -EBUSY);
if (ret < 0) {
+ btrfs_trans_release_metadata(h, root);
kmem_cache_free(btrfs_trans_handle_cachep, h);
return ERR_PTR(ret);
}
@@ -240,12 +263,6 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
- h->blocks_used = 0;
- h->bytes_reserved = 0;
- h->delayed_ref_updates = 0;
- h->use_count = 1;
- h->block_rsv = NULL;
- h->orig_rsv = NULL;
smp_mb();
if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -253,21 +270,25 @@ again:
goto again;
}
- if (num_items > 0) {
- ret = btrfs_trans_reserve_metadata(h, root, num_items);
- if (ret == -EAGAIN && !retries) {
+ /*
+ * Though we shrink metadata reservation for delalloc, we might still
+ * not get enough free space, so we will commit the transaction and try
+ * to reclaim the reservation.
+ *
+ * NOTE: In the transaction context, we won't shrink metadata
+ * reservation for delalloc, or the deadlock will happen.
+ */
+ if (num_items > 0 && !h->bytes_reserved) {
+ if (!retries) {
retries++;
btrfs_commit_transaction(h, root);
goto again;
- } else if (ret == -EAGAIN) {
+ } else {
/*
- * We have already retried and got EAGAIN, so really we
- * don't have space, so set ret to -ENOSPC.
+ * We have already retried, so really we don't have
+ * space, so set ret to -ENOSPC.
*/
ret = -ENOSPC;
- }
-
- if (ret < 0) {
btrfs_end_transaction(h, root);
return ERR_PTR(ret);
}
--
1.7.4
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html