cleaner_kthread() may run at any time and call btrfs_delete_unused_bgs()
to delete unused block groups. Because this work is asynchronous, it can
result in a false ENOSPC error. See the race window below:
CPU1 | CPU2
|
|-> btrfs_alloc_data_chunk_ondemand() |-> cleaner_kthread()
|-> do_chunk_alloc() | |
| assume it returns ENOSPC, which means | |
| btrfs_space_info is full and has no free| |
| space to satisfy the data request. | |
| | |- > btrfs_delete_unused_bgs()
| | | it will decrease btrfs_space_info's
| | | total_bytes and mark
| | | btrfs_space_info as not full.
| | |
In this case, we may get an ENOSPC error even though btrfs_space_info is no longer full.
To fix this issue, in btrfs_alloc_data_chunk_ondemand(), if we need to call
do_chunk_alloc() to allocate a new chunk, we should block btrfs_delete_unused_bgs().
So here we introduce a new struct rw_semaphore, bg_delete_sem, to do this job.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@xxxxxxxxxxxxxx>
---
fs/btrfs/ctree.h | 1 +
fs/btrfs/disk-io.c | 1 +
fs/btrfs/extent-tree.c | 40 ++++++++++++++++++++++++++++++++++------
3 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7eb2913..bf0751d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -800,6 +800,7 @@ struct btrfs_fs_info {
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex volume_mutex;
+ struct rw_semaphore bg_delete_sem;
/*
* this is taken to make sure we don't set block groups ro after
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60ce119..65a1465 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2676,6 +2676,7 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->chunk_mutex);
+ init_rwsem(&fs_info->bg_delete_sem);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df8d756..d1f8638 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4111,6 +4111,7 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
int ret = 0;
int need_commit = 2;
int have_pinned_space;
+ int have_bg_delete_sem = 0;
/* make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, root->sectorsize);
@@ -4121,8 +4122,11 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
}
data_sinfo = fs_info->data_sinfo;
- if (!data_sinfo)
+ if (!data_sinfo) {
+ down_read(&root->fs_info->bg_delete_sem);
+ have_bg_delete_sem = 1;
goto alloc;
+ }
again:
/* make sure we have enough space to handle the data first */
@@ -4134,10 +4138,21 @@ again:
if (used + bytes > data_sinfo->total_bytes) {
struct btrfs_trans_handle *trans;
+ spin_unlock(&data_sinfo->lock);
+ /*
+ * We may need to allocate new chunk, so we should block
+ * btrfs_delete_unused_bgs()
+ */
+ if (have_bg_delete_sem == 0) {
+ down_read(&root->fs_info->bg_delete_sem);
+ have_bg_delete_sem = 1;
+ }
+
/*
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
*/
+ spin_lock(&data_sinfo->lock);
if (!data_sinfo->full) {
u64 alloc_target;
@@ -4156,17 +4171,20 @@ alloc:
* the fs.
*/
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ up_read(&root->fs_info->bg_delete_sem);
return PTR_ERR(trans);
+ }
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
alloc_target,
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
if (ret < 0) {
- if (ret != -ENOSPC)
+ if (ret != -ENOSPC) {
+ up_read(&root->fs_info->bg_delete_sem);
return ret;
- else {
+ } else {
have_pinned_space = 1;
goto commit_trans;
}
@@ -4200,15 +4218,19 @@ commit_trans:
}
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ up_read(&root->fs_info->bg_delete_sem);
return PTR_ERR(trans);
+ }
if (have_pinned_space >= 0 ||
test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
&trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
- if (ret)
+ if (ret) {
+ up_read(&root->fs_info->bg_delete_sem);
return ret;
+ }
/*
* The cleaner kthread might still be doing iput
* operations. Wait for it to finish so that
@@ -4225,6 +4247,7 @@ commit_trans:
trace_btrfs_space_reservation(root->fs_info,
"space_info:enospc",
data_sinfo->flags, bytes, 1);
+ up_read(&root->fs_info->bg_delete_sem);
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
@@ -4232,6 +4255,9 @@ commit_trans:
data_sinfo->flags, bytes, 1);
spin_unlock(&data_sinfo->lock);
+ if (have_bg_delete_sem == 1)
+ up_read(&root->fs_info->bg_delete_sem);
+
return ret;
}
@@ -10594,6 +10620,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->unused_bgs_lock);
mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ down_write(&root->fs_info->bg_delete_sem);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
@@ -10721,6 +10748,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans, root);
next:
+ up_write(&root->fs_info->bg_delete_sem);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
--
2.9.0
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html