[PATCH 2/2] Btrfs: add TRANS_RESV_START ioctl to check/reserve free space on transaction start

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The current TRANS_START ioctl is problematic when the volume fills up
because a process can get ENOSPC on any operation that's supoosed to be
"contained" within its transaction without any prior warning.  This
defines a new ioctl, TRANS_RESV_START, that checks and sets aside space
for the entire transaction, so that the process will get ENOSPC right
away.

The bytes_ioctl_trans_reserved is only checked and adjusted by the
TRANS_RESV_START ioctl; any write()s that follow will be allowed to use
up that reserved space.  This is clearly imperfect (a mixture of an
ioctl transaction workload and a regular workload will violate the
reservations), but unavoidable with the current user transaction approach
because we don't know whether any given operation is contained by a
user transaction or not.

The 'ops' field isn't used yet.  Its intended to set a bound on the
number of transaction operations and thus btree modifications, for when
the metadata space reservation gets smarter.

Signed-off-by: Sage Weil <sage@xxxxxxxxxxxx>
---
 fs/btrfs/ctree.h       |    7 ++++
 fs/btrfs/extent-tree.c |   57 ++++++++++++++++++++++++++++--
 fs/btrfs/ioctl.c       |   92 +++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/ioctl.h       |    6 +++
 4 files changed, 151 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80599b4..62e00df 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -683,6 +683,9 @@ struct btrfs_space_info {
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc */
 
+	u64 bytes_ioctl_trans_reserved; /* number of bytes reserved by ioctl
+					   transactions */
+
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
@@ -2025,8 +2028,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_metadata_free_space(struct btrfs_root *root);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes);
+int btrfs_check_ioctl_trans_free_space(struct btrfs_root *root,
+				       u64 bytes, u64 ops);
 void btrfs_free_reserved_data_space(struct btrfs_root *root,
 				    struct inode *inode, u64 bytes);
+void btrfs_free_reserved_ioctl_trans_space(struct btrfs_root *root,
+					  u64 bytes, u64 ops);
 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 				 u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cb50944..b626ee2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2668,6 +2668,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
 	found->bytes_delalloc = 0;
+	found->bytes_ioctl_trans_reserved = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
@@ -2848,6 +2849,14 @@ static int __alloc_chunk(struct btrfs_root *root, u64 bytes)
 	return ret;
 }
 
+/*
+ * if @inode is defined, reserve bytes on that inode.  otherwise, we
+ * are reserving the space for an ioctl transaction, and need to
+ * subtract off other ioctl reserved space too.  we _only_ check ioctl
+ * reserved space here: the write() that follows needs to be able to
+ * use it, and we don't know which writes "belong" to which ioctl
+ * transactions.
+ */
 int btrfs_check_data_free_space_info(struct btrfs_root *root,
 				     struct btrfs_space_info *data_sinfo,
 				     u64 bytes,
@@ -2861,6 +2870,7 @@ again:
 	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
 	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
 	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+	    (inode ? 0 : data_sinfo->bytes_ioctl_trans_reserved) -
 	    data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
 		struct btrfs_trans_handle *trans;
 
@@ -2879,7 +2889,8 @@ again:
 		spin_unlock(&data_sinfo->lock);
 
 		/* commit the current transaction and try again */
-		if (!committed && !root->fs_info->open_ioctl_trans) {
+		if (!committed &&
+		    (!inode || !root->fs_info->open_ioctl_trans)) {
 			committed = 1;
 			trans = btrfs_join_transaction(root, 1);
 			if (!trans)
@@ -2903,8 +2914,12 @@ again:
 		       (unsigned long long)data_sinfo->total_bytes);
 		return -ENOSPC;
 	}
-	data_sinfo->bytes_may_use += bytes;
-	BTRFS_I(inode)->reserved_bytes += bytes;
+	if (inode) {
+		data_sinfo->bytes_may_use += bytes;
+		BTRFS_I(inode)->reserved_bytes += bytes;
+	} else {
+		data_sinfo->bytes_ioctl_trans_reserved += bytes;
+	}
 	spin_unlock(&data_sinfo->lock);
 	return 0;
 }
@@ -2937,6 +2952,42 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 	return btrfs_check_metadata_free_space(root);
 }
 
+int btrfs_check_ioctl_trans_free_space(struct btrfs_root *root, u64 bytes,
+				       u64 ops)
+{
+	struct btrfs_space_info *data_sinfo;
+	u64 alloc_target;
+	int ret;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	alloc_target = btrfs_get_alloc_profile(root, 1);
+	data_sinfo = __find_space_info(root->fs_info, alloc_target);
+	ret = btrfs_check_data_free_space_info(root, data_sinfo, bytes, NULL);
+	if (ret)
+		return ret;
+	return btrfs_check_metadata_free_space(root);
+}
+
+void btrfs_free_reserved_ioctl_trans_space(struct btrfs_root *root, u64 bytes,
+					   u64 ops)
+{
+	struct btrfs_space_info *data_sinfo;
+	u64 alloc_target;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	if (bytes || ops) {
+		alloc_target = btrfs_get_alloc_profile(root, 1);
+		data_sinfo = __find_space_info(root->fs_info, alloc_target);
+		spin_lock(&data_sinfo->lock);
+		data_sinfo->bytes_ioctl_trans_reserved -= bytes;
+		spin_unlock(&data_sinfo->lock);
+	}
+}
+
 /*
  * if there was an error for whatever reason after calling
  * btrfs_check_data_free_space, call this so we can cleanup the counters.
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 647838b..3765730 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1211,11 +1211,76 @@ static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
  * basically own the machine, and have a very in depth understanding
  * of all the possible deadlocks and enospc problems.
  */
+struct btrfs_ioctl_trans {
+	struct btrfs_trans_handle *trans;
+	u64 reserved_bytes, reserved_ops;
+};
+
+static long btrfs_ioctl_trans_resv_start(struct file *file, void __user *argp)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ioctl_trans *ioctl_trans;
+	struct btrfs_ioctl_trans_resv_start resv;
+	int ret;
+
+	ret = -EPERM;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = -EINPROGRESS;
+	if (file->private_data)
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_from_user(&resv, argp, sizeof(resv)))
+		goto out;
+
+	ret = mnt_want_write(file->f_path.mnt);
+	if (ret)
+		goto out;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	root->fs_info->open_ioctl_trans++;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	ret = btrfs_check_ioctl_trans_free_space(root, resv.bytes, resv.ops);
+	if (ret)
+		goto out_drop;
+
+	ret = -ENOMEM;
+	ioctl_trans = kzalloc(sizeof(*ioctl_trans), GFP_KERNEL);
+	if (!ioctl_trans)
+		goto out_free_bytes;
+
+	ioctl_trans->trans = btrfs_start_ioctl_transaction(root, 0);
+	if (!ioctl_trans->trans)
+		goto out_free_ioctl_trans;
+
+	ioctl_trans->reserved_bytes = resv.bytes;
+	ioctl_trans->reserved_ops = resv.ops;
+	file->private_data = ioctl_trans;
+	return 0;
+
+out_free_ioctl_trans:
+	kfree(ioctl_trans);
+out_free_bytes:
+	btrfs_free_reserved_ioctl_trans_space(root, resv.bytes, resv.ops);
+out_drop:
+	mutex_lock(&root->fs_info->trans_mutex);
+	root->fs_info->open_ioctl_trans--;
+	mutex_unlock(&root->fs_info->trans_mutex);
+	mnt_drop_write(file->f_path.mnt);
+out:
+	return ret;
+}
+
+
 static long btrfs_ioctl_trans_start(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_ioctl_trans *ioctl_trans;
 	int ret;
 
 	ret = -EPERM;
@@ -1235,13 +1300,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	ret = -ENOMEM;
-	trans = btrfs_start_ioctl_transaction(root, 0);
-	if (!trans)
+	ioctl_trans = kzalloc(sizeof(*ioctl_trans), GFP_KERNEL);
+	if (!ioctl_trans)
 		goto out_drop;
 
-	file->private_data = trans;
+	ioctl_trans->trans = btrfs_start_ioctl_transaction(root, 0);
+	if (ioctl_trans->trans)
+		goto out_free_ioctl_trans;
+
+	file->private_data = ioctl_trans;
 	return 0;
 
+out_free_ioctl_trans:
+	kfree(ioctl_trans);
 out_drop:
 	mutex_lock(&root->fs_info->trans_mutex);
 	root->fs_info->open_ioctl_trans--;
@@ -1261,14 +1332,17 @@ long btrfs_ioctl_trans_end(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_ioctl_trans *ioctl_trans;
 
-	trans = file->private_data;
-	if (!trans)
+	ioctl_trans = file->private_data;
+	if (!ioctl_trans)
 		return -EINVAL;
 	file->private_data = NULL;
 
-	btrfs_end_transaction(trans, root);
+	btrfs_free_reserved_ioctl_trans_space(root, ioctl_trans->reserved_bytes,
+					      ioctl_trans->reserved_ops);
+	btrfs_end_transaction(ioctl_trans->trans, root);
+	kfree(ioctl_trans);
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	root->fs_info->open_ioctl_trans--;
@@ -1311,6 +1385,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
 	case BTRFS_IOC_CLONE_RANGE:
 		return btrfs_ioctl_clone_range(file, argp);
+	case BTRFS_IOC_TRANS_RESV_START:
+		return btrfs_ioctl_trans_resv_start(file, argp);
 	case BTRFS_IOC_TRANS_START:
 		return btrfs_ioctl_trans_start(file);
 	case BTRFS_IOC_TRANS_END:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914..9810980 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -48,6 +48,12 @@ struct btrfs_ioctl_clone_range_args {
  * use by applications that know how to avoid the
  * resulting deadlocks
  */
+/* check/reserve disk space for the transaction up-front */
+struct btrfs_ioctl_trans_resv_start {
+	__u64 bytes, ops;
+};
+#define BTRFS_IOC_TRANS_RESV_START _IOW(BTRFS_IOCTL_MAGIC, 5,	\
+					struct btrfs_ioctl_trans_resv_start)
 #define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
 #define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
 #define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
-- 
1.5.6.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Filesystem Development]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux