On 11.05.2018 03:11, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@xxxxxx>
>
> Currently, we keep space reserved for all inode orphan items until the
> inode is evicted (i.e., all references to it are dropped). We hit an
> issue where an application would keep a bunch of deleted files open (by
> design) and thus keep a large amount of space reserved, causing ENOSPC
> errors when other operations tried to reserve space. This long-standing
> reservation isn't absolutely necessary for a couple of reasons:
>
> - We can almost always make the reservation we need or steal from the
> global reserve for the orphan item
> - If we can't, it's not the end of the world if we drop the orphan item
> on the floor and let the next mount clean it up
>
> So, get rid of persistent reservation and just reserve space in
> btrfs_evict_inode().
>
> Signed-off-by: Omar Sandoval <osandov@xxxxxx>
> ---
> fs/btrfs/btrfs_inode.h | 19 +++---
> fs/btrfs/inode.c | 142 ++++++++++++-----------------------------
> 2 files changed, 50 insertions(+), 111 deletions(-)
>
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 234bae55b85d..2f466cf55790 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -20,16 +20,15 @@
> * new data the application may have written before commit.
> */
> #define BTRFS_INODE_ORDERED_DATA_CLOSE 0
> -#define BTRFS_INODE_ORPHAN_META_RESERVED 1
> -#define BTRFS_INODE_DUMMY 2
> -#define BTRFS_INODE_IN_DEFRAG 3
> -#define BTRFS_INODE_HAS_ORPHAN_ITEM 4
> -#define BTRFS_INODE_HAS_ASYNC_EXTENT 5
> -#define BTRFS_INODE_NEEDS_FULL_SYNC 6
> -#define BTRFS_INODE_COPY_EVERYTHING 7
> -#define BTRFS_INODE_IN_DELALLOC_LIST 8
> -#define BTRFS_INODE_READDIO_NEED_LOCK 9
> -#define BTRFS_INODE_HAS_PROPS 10
> +#define BTRFS_INODE_DUMMY 1
> +#define BTRFS_INODE_IN_DEFRAG 2
> +#define BTRFS_INODE_HAS_ORPHAN_ITEM 3
> +#define BTRFS_INODE_HAS_ASYNC_EXTENT 4
> +#define BTRFS_INODE_NEEDS_FULL_SYNC 5
> +#define BTRFS_INODE_COPY_EVERYTHING 6
> +#define BTRFS_INODE_IN_DELALLOC_LIST 7
> +#define BTRFS_INODE_READDIO_NEED_LOCK 8
> +#define BTRFS_INODE_HAS_PROPS 9
>
> /* in memory btrfs inode */
> struct btrfs_inode {
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 348dc57920f5..b9a046b8c72c 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3343,88 +3343,25 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
> /*
> * This creates an orphan entry for the given inode in case something goes wrong
> * in the middle of an unlink.
> - *
> - * NOTE: caller of this function should reserve 5 units of metadata for
> - * this function.
> */
> int btrfs_orphan_add(struct btrfs_trans_handle *trans,
> struct btrfs_inode *inode)
> {
> - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> struct btrfs_root *root = inode->root;
> - struct btrfs_block_rsv *block_rsv = NULL;
> - int reserve = 0;
> - bool insert = false;
> int ret;
>
> - if (!root->orphan_block_rsv) {
> - block_rsv = btrfs_alloc_block_rsv(fs_info,
> - BTRFS_BLOCK_RSV_TEMP);
> - if (!block_rsv)
> - return -ENOMEM;
> - }
> -
> - if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> - &inode->runtime_flags))
> - insert = true;
> -
> - if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> - &inode->runtime_flags))
> - reserve = 1;
> -
> - spin_lock(&root->orphan_lock);
> - /* If someone has created ->orphan_block_rsv, be happy to use it. */
> - if (!root->orphan_block_rsv) {
> - root->orphan_block_rsv = block_rsv;
> - } else if (block_rsv) {
> - btrfs_free_block_rsv(fs_info, block_rsv);
> - block_rsv = NULL;
> - }
> + if (test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> + &inode->runtime_flags))
> + return 0;
How come this can return true? Shouldn't btrfs_orphan_add always be
called for an inode which doesn't have an orphan item? Having this check
seems to indicate there is some non-determinism in the lifetime of
orphan items.
>
> - if (insert)
> - atomic_inc(&root->orphan_inodes);
> - spin_unlock(&root->orphan_lock);
> + atomic_inc(&root->orphan_inodes);
>
> - /* grab metadata reservation from transaction handle */
> - if (reserve) {
> - ret = btrfs_orphan_reserve_metadata(trans, inode);
> - ASSERT(!ret);
> - if (ret) {
> - /*
> - * dec doesn't need spin_lock as ->orphan_block_rsv
> - * would be released only if ->orphan_inodes is
> - * zero.
> - */
> - atomic_dec(&root->orphan_inodes);
> - clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> - &inode->runtime_flags);
> - if (insert)
> - clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> - &inode->runtime_flags);
> - return ret;
> - }
> - }
> -
> - /* insert an orphan item to track this unlinked file */
> - if (insert) {
> - ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
> - if (ret && ret != -EEXIST) {
> - if (reserve) {
> - clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> - &inode->runtime_flags);
> - btrfs_orphan_release_metadata(inode);
> - }
> - /*
> - * btrfs_orphan_commit_root may race with us and set
> - * ->orphan_block_rsv to zero, in order to avoid that,
> - * decrease ->orphan_inodes after everything is done.
> - */
> - atomic_dec(&root->orphan_inodes);
> - clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> - &inode->runtime_flags);
> - btrfs_abort_transaction(trans, ret);
> - return ret;
> - }
> + ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
> + if (ret && ret != -EEXIST) {
> + atomic_dec(&root->orphan_inodes);
> + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags);
> + btrfs_abort_transaction(trans, ret);
> + return ret;
> }
>
> return 0;
> @@ -3438,27 +3375,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
> struct btrfs_inode *inode)
> {
> struct btrfs_root *root = inode->root;
> - int delete_item = 0;
> int ret = 0;
>
> - if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> - &inode->runtime_flags))
> - delete_item = 1;
> + if (!test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
> + &inode->runtime_flags))
> + return 0;
Similar comment as in btrfs_orphan_del. Shouldn't this always follow a
successful btrfs_orphan_add, guaranteeing there is an item for this
inode? Are there some "benign" races that could happen in the mean time
which could trigger this check ?
>
> - if (delete_item && trans)
> + if (trans)
> ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
>
> - if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
> - &inode->runtime_flags))
> - btrfs_orphan_release_metadata(inode);
> -
> - /*
> - * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv
> - * to zero, in order to avoid that, decrease ->orphan_inodes after
> - * everything is done.
> - */
> - if (delete_item)
> - atomic_dec(&root->orphan_inodes);
> + atomic_dec(&root->orphan_inodes);
>
> return ret;
> }
> @@ -5339,10 +5265,10 @@ void btrfs_evict_inode(struct inode *inode)
> trans->block_rsv = rsv;
>
> ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
> + trans->block_rsv = &fs_info->trans_block_rsv;
> + btrfs_end_transaction(trans);
> + btrfs_btree_balance_dirty(fs_info);
> if (ret) {
> - trans->block_rsv = &fs_info->trans_block_rsv;
> - btrfs_end_transaction(trans);
> - btrfs_btree_balance_dirty(fs_info);
> if (ret != -ENOSPC && ret != -EAGAIN) {
> btrfs_orphan_del(NULL, BTRFS_I(inode));
> btrfs_free_block_rsv(fs_info, rsv);
> @@ -5353,22 +5279,36 @@ void btrfs_evict_inode(struct inode *inode)
> }
> }
>
> - btrfs_free_block_rsv(fs_info, rsv);
> -
> /*
> - * Errors here aren't a big deal, it just means we leave orphan items
> - * in the tree. They will be cleaned up on the next mount.
> + * Errors here aren't a big deal, it just means we leave orphan items in
> + * the tree. They will be cleaned up on the next mount. If the inode
> + * number gets reused, cleanup deletes the orphan item without doing
> + * anything, and unlink reuses the existing orphan item.
> + *
> + * If it turns out that we are dropping too many of these, we might want
> + * to add a mechanism for retrying these after a commit.
> */
> - trans->block_rsv = root->orphan_block_rsv;
> - btrfs_orphan_del(trans, BTRFS_I(inode));
> + trans = evict_refill_and_join(root, rsv, min_size);
> + if (IS_ERR(trans)) {
> + btrfs_orphan_del(NULL, BTRFS_I(inode));
> + } else {
> + trans->block_rsv = rsv;
> + btrfs_orphan_del(trans, BTRFS_I(inode));
> + trans->block_rsv = &fs_info->trans_block_rsv;
> + btrfs_end_transaction(trans);
> + }
> +
> + btrfs_free_block_rsv(fs_info, rsv);
>
> - trans->block_rsv = &fs_info->trans_block_rsv;
> if (!(root == fs_info->tree_root ||
> root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
> btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
>
> - btrfs_end_transaction(trans);
> - btrfs_btree_balance_dirty(fs_info);
> + /*
> + * If we didn't successfully delete, the orphan item will still be in
> + * the tree and we'll retry on the next mount. Again, we might also want
> + * to retry these periodically in the future.
> + */
> no_delete:
> btrfs_remove_delayed_node(BTRFS_I(inode));
> clear_inode(inode);
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html