Hi Jan,
(2012/05/21 1:06), Jan Schmidt wrote:
> The tree mod log will log modifications made fs-tree nodes. Most
> modifications are done by autobalance of the tree. Such changes are recorded
> as long as a block entry exists. When released, the log is cleaned.
>
> With the tree modification log, it's possible to reconstruct a consistent
> old state of the tree. This is required to do backref walking on a busy
> file system.
>
> Signed-off-by: Jan Schmidt<list.btrfs@xxxxxxxxxxxxx>
> ---
> fs/btrfs/ctree.c | 409 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/ctree.h | 7 +-
> fs/btrfs/disk-io.c | 2 +-
> 3 files changed, 416 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
> index 56485b3..6420638 100644
> --- a/fs/btrfs/ctree.c
> +++ b/fs/btrfs/ctree.c
> @@ -18,6 +18,7 @@
>
> #include<linux/sched.h>
> #include<linux/slab.h>
> +#include<linux/rbtree.h>
> #include "ctree.h"
> #include "disk-io.h"
> #include "transaction.h"
> @@ -288,6 +289,414 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
> return 0;
> }
>
> +enum mod_log_op {
> + MOD_LOG_KEY_REPLACE,
> + MOD_LOG_KEY_ADD,
> + MOD_LOG_KEY_REMOVE,
> + MOD_LOG_KEY_REMOVE_WHILE_FREEING,
> + MOD_LOG_MOVE_KEYS,
> + MOD_LOG_ROOT_REPLACE,
> +};
> +
> +struct tree_mod_move {
> + int dst_slot;
> + int nr_items;
> +};
> +
> +struct tree_mod_root {
> + u64 logical;
> + u8 level;
> +};
> +
> +struct tree_mod_elem {
> + struct rb_node node;
> + u64 index; /* shifted logical */
> + struct seq_list elem;
> + enum mod_log_op op;
> +
> + /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
> + int slot;
> +
> + /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
> + u64 generation;
> +
> + /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
> + struct btrfs_disk_key key;
> + u64 blockptr;
> +
> + /* this is used for op == MOD_LOG_MOVE_KEYS */
> + struct tree_mod_move move;
> +
> + /* this is used for op == MOD_LOG_ROOT_REPLACE */
> + struct tree_mod_root old_root;
> +};
> +
> +static inline void
> +__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
> +{
> + elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
> + list_add_tail(&elem->list,&fs_info->tree_mod_seq_list);
> +}
> +
> +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
> + struct seq_list *elem)
> +{
> + elem->flags = 1;
> + spin_lock(&fs_info->tree_mod_seq_lock);
> + __get_tree_mod_seq(fs_info, elem);
> + spin_unlock(&fs_info->tree_mod_seq_lock);
> +}
> +
> +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
> + struct seq_list *elem)
> +{
> + struct rb_root *tm_root;
> + struct rb_node *node;
> + struct rb_node *next;
> + struct seq_list *cur_elem;
> + struct tree_mod_elem *tm;
> + u64 min_seq = (u64)-1;
> + u64 seq_putting = elem->seq;
> +
> + if (!seq_putting)
> + return;
> +
> + BUG_ON(!(elem->flags& 1));
> + spin_lock(&fs_info->tree_mod_seq_lock);
> + list_del(&elem->list);
> +
> + list_for_each_entry(cur_elem,&fs_info->tree_mod_seq_list, list) {
> + if ((cur_elem->flags& 1)&& cur_elem->seq< min_seq) {
> + if (seq_putting> cur_elem->seq) {
> + /*
> + * blocker with lower sequence number exists, we
> + * cannot remove anything from the log
> + */
> + goto out;
> + }
> + min_seq = cur_elem->seq;
> + }
> + }
> +
> + /*
> + * anything that's lower than the lowest existing (read: blocked)
> + * sequence number can be removed from the tree.
> + */
> + write_lock(&fs_info->tree_mod_log_lock);
> + tm_root =&fs_info->tree_mod_log;
> + for (node = rb_first(tm_root); node; node = next) {
> + next = rb_next(node);
> + tm = container_of(node, struct tree_mod_elem, node);
> + if (tm->elem.seq> min_seq)
> + continue;
> + rb_erase(node, tm_root);
> + list_del(&tm->elem.list);
> + kfree(tm);
> + }
> + write_unlock(&fs_info->tree_mod_log_lock);
> +out:
> + spin_unlock(&fs_info->tree_mod_seq_lock);
> +}
> +
> +/*
> + * key order of the log:
> + * index -> sequence
> + *
> + * the index is the shifted logical of the *new* root node for root replace
> + * operations, or the shifted logical of the affected block for all other
> + * operations.
> + */
> +static noinline int
> +__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
> +{
> + struct rb_root *tm_root;
> + struct rb_node **new;
> + struct rb_node *parent = NULL;
> + struct tree_mod_elem *cur;
> +
> + BUG_ON(!tm || !tm->elem.seq);
> +
> + write_lock(&fs_info->tree_mod_log_lock);
> + tm_root =&fs_info->tree_mod_log;
> + new =&tm_root->rb_node;
> + while (*new) {
> + cur = container_of(*new, struct tree_mod_elem, node);
> + parent = *new;
> + if (cur->index< tm->index)
> + new =&((*new)->rb_left);
> + else if (cur->index> tm->index)
> + new =&((*new)->rb_right);
> + else if (cur->elem.seq< tm->elem.seq)
> + new =&((*new)->rb_left);
> + else if (cur->elem.seq> tm->elem.seq)
> + new =&((*new)->rb_right);
> + else {
> + kfree(tm);
> + return -EEXIST;
I think a write_unlock() is necessary here, before the "return -EEXIST;" — otherwise the function returns with tree_mod_log_lock still held.
> + }
> + }
> +
> + rb_link_node(&tm->node, parent, new);
> + rb_insert_color(&tm->node, tm_root);
> + write_unlock(&fs_info->tree_mod_log_lock);
> +
> + return 0;
> +}
> +
> +int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
> + struct tree_mod_elem **tm_ret)
> +{
> + struct tree_mod_elem *tm;
> + u64 seq = 0;
> +
> + /*
> + * we want to avoid a malloc/free cycle if there's no blocker in the
> + * list.
> + * we also want to avoid atomic malloc. so we must drop the spinlock
> + * before calling kzalloc and recheck afterwards.
> + */
> + spin_lock(&fs_info->tree_mod_seq_lock);
> + if (list_empty(&fs_info->tree_mod_seq_list))
> + goto out;
> +
> + spin_unlock(&fs_info->tree_mod_seq_lock);
> + tm = *tm_ret = kzalloc(sizeof(*tm), flags);
> + if (!tm)
> + return -ENOMEM;
> +
> + spin_lock(&fs_info->tree_mod_seq_lock);
> + if (list_empty(&fs_info->tree_mod_seq_list)) {
> + kfree(tm);
> + goto out;
> + }
> +
> + __get_tree_mod_seq(fs_info,&tm->elem);
> + seq = tm->elem.seq;
> + tm->elem.flags = 0;
> +
> +out:
> + spin_unlock(&fs_info->tree_mod_seq_lock);
> + return seq;
> +}
> +
> +static noinline int
> +tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
> + struct extent_buffer *eb, int slot,
> + enum mod_log_op op, gfp_t flags)
> +{
> + struct tree_mod_elem *tm;
> + int ret;
> +
> + ret = tree_mod_alloc(fs_info, flags,&tm);
> + if (ret<= 0)
> + return ret;
> +
> + tm->index = eb->start>> PAGE_CACHE_SHIFT;
> + if (op != MOD_LOG_KEY_ADD) {
> + btrfs_node_key(eb,&tm->key, slot);
> + tm->blockptr = btrfs_node_blockptr(eb, slot);
> + }
> + tm->op = op;
> + tm->slot = slot;
> + tm->generation = btrfs_node_ptr_generation(eb, slot);
> +
> + return __tree_mod_log_insert(fs_info, tm);
> +}
> +
> +static noinline int
> +tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
> + int slot, enum mod_log_op op)
> +{
> + return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
> +}
> +
> +static noinline int
> +tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
> + struct extent_buffer *eb, int dst_slot, int src_slot,
> + int nr_items, gfp_t flags)
> +{
> + struct tree_mod_elem *tm;
> + int ret;
> +
> + ret = tree_mod_alloc(fs_info, flags,&tm);
> + if (ret<= 0)
> + return ret;
> +
> + tm->index = eb->start>> PAGE_CACHE_SHIFT;
> + tm->slot = src_slot;
> + tm->move.dst_slot = dst_slot;
> + tm->move.nr_items = nr_items;
> + tm->op = MOD_LOG_MOVE_KEYS;
> +
> + return __tree_mod_log_insert(fs_info, tm);
> +}
> +
> +static noinline int
> +tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
> + struct extent_buffer *old_root,
> + struct extent_buffer *new_root, gfp_t flags)
> +{
> + struct tree_mod_elem *tm;
> + int ret;
> +
> + ret = tree_mod_alloc(fs_info, flags,&tm);
> + if (ret<= 0)
> + return ret;
> +
> + tm->index = new_root->start>> PAGE_CACHE_SHIFT;
> + tm->old_root.logical = old_root->start;
> + tm->old_root.level = btrfs_header_level(old_root);
> + tm->generation = btrfs_header_generation(old_root);
> + tm->op = MOD_LOG_ROOT_REPLACE;
> +
> + return __tree_mod_log_insert(fs_info, tm);
> +}
> +
> +static struct tree_mod_elem *
> +__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
> + int smallest)
> +{
> + struct rb_root *tm_root;
> + struct rb_node *node;
> + struct tree_mod_elem *cur = NULL;
> + struct tree_mod_elem *found = NULL;
> + u64 index = start>> PAGE_CACHE_SHIFT;
> +
> + read_lock(&fs_info->tree_mod_log_lock);
> + tm_root =&fs_info->tree_mod_log;
> + node = tm_root->rb_node;
> + while (node) {
> + cur = container_of(node, struct tree_mod_elem, node);
> + if (cur->index< index) {
> + node = node->rb_left;
> + } else if (cur->index> index) {
> + node = node->rb_right;
> + } else if (cur->elem.seq< min_seq) {
> + node = node->rb_left;
> + } else if (!smallest) {
> + /* we want the node with the highest seq */
> + if (found)
> + BUG_ON(found->elem.seq> cur->elem.seq);
> + found = cur;
> + node = node->rb_left;
> + } else if (cur->elem.seq> min_seq) {
> + /* we want the node with the smallest seq */
> + if (found)
> + BUG_ON(found->elem.seq< cur->elem.seq);
> + found = cur;
> + node = node->rb_right;
> + } else {
I think a read_unlock() is necessary here, before the "return cur;" — otherwise the function returns with tree_mod_log_lock still held.
Thanks,
Tsutomu
> + return cur;
> + }
> + }
> + read_unlock(&fs_info->tree_mod_log_lock);
> +
> + return found;
> +}
> +
> +/*
> + * this returns the element from the log with the smallest time sequence
> + * value that's in the log (the oldest log item). any element with a time
> + * sequence lower than min_seq will be ignored.
> + */
> +static struct tree_mod_elem *
> +tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
> + u64 min_seq)
> +{
> + return __tree_mod_log_search(fs_info, start, min_seq, 1);
> +}
> +
> +/*
> + * this returns the element from the log with the largest time sequence
> + * value that's in the log (the most recent log item). any element with
> + * a time sequence lower than min_seq will be ignored.
> + */
> +static struct tree_mod_elem *
> +tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
> +{
> + return __tree_mod_log_search(fs_info, start, min_seq, 0);
> +}
> +
> +static inline void
> +__copy_extent_buffer_log(struct btrfs_fs_info *fs_info,
> + struct extent_buffer *dst, struct extent_buffer *src,
> + unsigned long dst_offset, unsigned long src_offset,
> + int nr_items, size_t item_size)
> +{
> + int ret;
> + int i;
> +
> + /* speed this up by single seq for all operations? */
> + for (i = 0; i< nr_items; i++) {
> + ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
> + MOD_LOG_KEY_REMOVE);
> + BUG_ON(ret< 0);
> + ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
> + MOD_LOG_KEY_ADD);
> + BUG_ON(ret< 0);
> + }
> +
> + copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_offset),
> + btrfs_node_key_ptr_offset(src_offset),
> + nr_items * item_size);
> +}
> +
> +static inline void
> +__memmove_extent_buffer_log(struct btrfs_fs_info *fs_info,
> + struct extent_buffer *dst,
> + int dst_offset, int src_offset, int nr_items,
> + size_t item_size, int tree_mod_log)
> +{
> + int ret;
> + if (tree_mod_log) {
> + ret = tree_mod_log_insert_move(fs_info, dst, dst_offset,
> + src_offset, nr_items, GFP_NOFS);
> + BUG_ON(ret< 0);
> + }
> + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst_offset),
> + btrfs_node_key_ptr_offset(src_offset),
> + nr_items * item_size);
> +}
> +
> +static inline void
> +__set_node_key_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
> + struct btrfs_disk_key *disk_key, int nr, int atomic)
> +{
> + int ret;
> +
> + ret = tree_mod_log_insert_key_mask(fs_info, eb, nr, MOD_LOG_KEY_REPLACE,
> + atomic ? GFP_ATOMIC : GFP_NOFS);
> + BUG_ON(ret< 0);
> +
> + btrfs_set_node_key(eb, disk_key, nr);
> +}
> +
> +static void __log_cleaning(struct btrfs_fs_info *fs_info,
> + struct extent_buffer *eb)
> +{
> + int i;
> + int ret;
> + u32 nritems;
> +
> + nritems = btrfs_header_nritems(eb);
> + for (i = nritems - 1; i>= 0; i--) {
> + ret = tree_mod_log_insert_key(fs_info, eb, i,
> + MOD_LOG_KEY_REMOVE_WHILE_FREEING);
> + BUG_ON(ret< 0);
> + }
> +}
> +
> +static inline void
> +set_root_pointer(struct btrfs_root *root, struct extent_buffer *new_root_node)
> +{
> + int ret;
> + __log_cleaning(root->fs_info, root->node);
> + ret = tree_mod_log_insert_root(root->fs_info, root->node,
> + new_root_node, GFP_NOFS);
> + BUG_ON(ret< 0);
> + rcu_assign_pointer(root->node, new_root_node);
> +}
> +
> /*
> * check if the tree block can be shared by multiple trees
> */
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 6774821..e53bfb9 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1132,7 +1132,7 @@ struct btrfs_fs_info {
> /* this protects tree_mod_seq_list */
> spinlock_t tree_mod_seq_lock;
> atomic_t tree_mod_seq;
> - struct list_head tree_mod_list;
> + struct list_head tree_mod_seq_list;
>
> /* this protects tree_mod_log */
> rwlock_t tree_mod_log_lock;
> @@ -3114,4 +3114,9 @@ struct seq_list {
> u32 flags;
> };
>
> +void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
> + struct seq_list *elem);
> +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
> + struct seq_list *elem);
> +
> #endif
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 6aec7c6..f51ad84 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -1921,7 +1921,7 @@ int open_ctree(struct super_block *sb,
> init_completion(&fs_info->kobj_unregister);
> INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
> INIT_LIST_HEAD(&fs_info->space_info);
> - INIT_LIST_HEAD(&fs_info->tree_mod_list);
> + INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
> btrfs_mapping_init(&fs_info->mapping_tree);
> btrfs_init_block_rsv(&fs_info->global_block_rsv);
> btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html