[PATCH 07/24] Btrfs: add tree modification log functions

The tree mod log records modifications made to fs-tree nodes. Most of
these modifications are caused by rebalancing of the tree. Changes are
recorded for as long as at least one blocker element is registered;
when the last blocker is released, the log is cleaned.

With the tree modification log, it's possible to reconstruct a
consistent old state of the tree. This is required for backref walking
on a busy file system.
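
A consumer that needs such a consistent view registers itself as a
blocker before it starts and unregisters when it is done. A minimal
usage sketch (assuming a struct btrfs_fs_info *fs_info in scope; the
actual backref walking caller only arrives with a later patch in this
series):

	struct seq_list elem = {};

	/* from now on, keep all log entries with a higher sequence number */
	btrfs_get_tree_mod_seq(fs_info, &elem);

	/* ... reconstruct old tree states as of elem.seq ... */

	/* allow log entries up to the next blocker to be removed again */
	btrfs_put_tree_mod_seq(fs_info, &elem);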

Signed-off-by: Jan Schmidt <list.btrfs@xxxxxxxxxxxxx>
---
 fs/btrfs/ctree.c   |  444 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ctree.h   |    7 +-
 fs/btrfs/disk-io.c |    2 +-
 3 files changed, 451 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 56485b3..6420638 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/rbtree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -288,6 +289,449 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+enum mod_log_op {
+	MOD_LOG_KEY_REPLACE,
+	MOD_LOG_KEY_ADD,
+	MOD_LOG_KEY_REMOVE,
+	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+	MOD_LOG_MOVE_KEYS,
+	MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+	int dst_slot;
+	int nr_items;
+};
+
+struct tree_mod_root {
+	u64 logical;
+	u8 level;
+};
+
+struct tree_mod_elem {
+	struct rb_node node;
+	u64 index;		/* shifted logical */
+	struct seq_list elem;
+	enum mod_log_op op;
+
+	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+	int slot;
+
+	/* this is used for MOD_LOG_KEY_* and MOD_LOG_ROOT_REPLACE */
+	u64 generation;
+
+	/* these are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+	struct btrfs_disk_key key;
+	u64 blockptr;
+
+	/* this is used for op == MOD_LOG_MOVE_KEYS */
+	struct tree_mod_move move;
+
+	/* this is used for op == MOD_LOG_ROOT_REPLACE */
+	struct tree_mod_root old_root;
+};
+
+static inline void
+__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+{
+	elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
+	list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+}
+
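+/*
+ * register the caller as a blocker: log entries with a sequence number
+ * higher than elem->seq are kept at least as long as the element stays
+ * on tree_mod_seq_list
+ */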
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem)
+{
+	elem->flags = 1;
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	__get_tree_mod_seq(fs_info, elem);
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
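+/*
+ * unregister a blocker and remove all log entries that no remaining
+ * blocker still needs
+ */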
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem)
+{
+	struct rb_root *tm_root;
+	struct rb_node *node;
+	struct rb_node *next;
+	struct seq_list *cur_elem;
+	struct tree_mod_elem *tm;
+	u64 min_seq = (u64)-1;
+	u64 seq_putting = elem->seq;
+
+	if (!seq_putting)
+		return;
+
+	BUG_ON(!(elem->flags & 1));
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	list_del(&elem->list);
+
+	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+		if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+			if (seq_putting > cur_elem->seq) {
+				/*
+				 * blocker with lower sequence number exists, we
+				 * cannot remove anything from the log
+				 */
+				goto out;
+			}
+			min_seq = cur_elem->seq;
+		}
+	}
+
+	/*
+	 * anything that's lower than the lowest existing (read: blocked)
+	 * sequence number can be removed from the tree.
+	 */
+	write_lock(&fs_info->tree_mod_log_lock);
+	tm_root = &fs_info->tree_mod_log;
+	for (node = rb_first(tm_root); node; node = next) {
+		next = rb_next(node);
+		tm = container_of(node, struct tree_mod_elem, node);
+		if (tm->elem.seq > min_seq)
+			continue;
+		rb_erase(node, tm_root);
+		list_del(&tm->elem.list);
+		kfree(tm);
+	}
+	write_unlock(&fs_info->tree_mod_log_lock);
+out:
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * key order of the log:
+ *       index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+	struct rb_root *tm_root;
+	struct rb_node **new;
+	struct rb_node *parent = NULL;
+	struct tree_mod_elem *cur;
+
+	BUG_ON(!tm || !tm->elem.seq);
+
+	write_lock(&fs_info->tree_mod_log_lock);
+	tm_root = &fs_info->tree_mod_log;
+	new = &tm_root->rb_node;
+	while (*new) {
+		cur = container_of(*new, struct tree_mod_elem, node);
+		parent = *new;
+		if (cur->index < tm->index)
+			new = &((*new)->rb_left);
+		else if (cur->index > tm->index)
+			new = &((*new)->rb_right);
+		else if (cur->elem.seq < tm->elem.seq)
+			new = &((*new)->rb_left);
+		else if (cur->elem.seq > tm->elem.seq)
+			new = &((*new)->rb_right);
+		else {
+			/* don't leak tree_mod_log_lock on the EEXIST path */
+			write_unlock(&fs_info->tree_mod_log_lock);
+			kfree(tm);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&tm->node, parent, new);
+	rb_insert_color(&tm->node, tm_root);
+	write_unlock(&fs_info->tree_mod_log_lock);
+
+	return 0;
+}
+
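+/*
+ * returns 0 when no logging is required (no blocker is registered),
+ * the positive sequence number assigned to the new element, or -ENOMEM
+ */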
+static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
+				 struct tree_mod_elem **tm_ret)
+{
+	struct tree_mod_elem *tm;
+	u64 seq = 0;
+
+	/*
+	 * we want to avoid a malloc/free cycle if there's no blocker in
+	 * the list. we also want to avoid an atomic allocation, so we
+	 * drop the spinlock before calling kzalloc and recheck
+	 * afterwards.
+	 */
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (list_empty(&fs_info->tree_mod_seq_list))
+		goto out;
+
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return -ENOMEM;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (list_empty(&fs_info->tree_mod_seq_list)) {
+		kfree(tm);
+		goto out;
+	}
+
+	__get_tree_mod_seq(fs_info, &tm->elem);
+	seq = tm->elem.seq;
+	tm->elem.flags = 0;
+
+out:
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	return seq;
+}
+
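+/* log a single key add/remove/replace for the given slot of eb */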
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+			     struct extent_buffer *eb, int slot,
+			     enum mod_log_op op, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+	int ret;
+
+	ret = tree_mod_alloc(fs_info, flags, &tm);
+	if (ret <= 0)
+		return ret;
+
+	tm->index = eb->start >> PAGE_CACHE_SHIFT;
+	if (op != MOD_LOG_KEY_ADD) {
+		btrfs_node_key(eb, &tm->key, slot);
+		tm->blockptr = btrfs_node_blockptr(eb, slot);
+	}
+	tm->op = op;
+	tm->slot = slot;
+	tm->generation = btrfs_node_ptr_generation(eb, slot);
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+			int slot, enum mod_log_op op)
+{
+	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *eb, int dst_slot, int src_slot,
+			 int nr_items, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+	int ret;
+
+	ret = tree_mod_alloc(fs_info, flags, &tm);
+	if (ret <= 0)
+		return ret;
+
+	tm->index = eb->start >> PAGE_CACHE_SHIFT;
+	tm->slot = src_slot;
+	tm->move.dst_slot = dst_slot;
+	tm->move.nr_items = nr_items;
+	tm->op = MOD_LOG_MOVE_KEYS;
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *old_root,
+			 struct extent_buffer *new_root, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+	int ret;
+
+	ret = tree_mod_alloc(fs_info, flags, &tm);
+	if (ret <= 0)
+		return ret;
+
+	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+	tm->old_root.logical = old_root->start;
+	tm->old_root.level = btrfs_header_level(old_root);
+	tm->generation = btrfs_header_generation(old_root);
+	tm->op = MOD_LOG_ROOT_REPLACE;
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
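+/*
+ * search the log for elements of a given block; with smallest set,
+ * return the one with the lowest sequence number not below min_seq,
+ * otherwise the one with the highest
+ */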
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+		      int smallest)
+{
+	struct rb_root *tm_root;
+	struct rb_node *node;
+	struct tree_mod_elem *cur = NULL;
+	struct tree_mod_elem *found = NULL;
+	u64 index = start >> PAGE_CACHE_SHIFT;
+
+	read_lock(&fs_info->tree_mod_log_lock);
+	tm_root = &fs_info->tree_mod_log;
+	node = tm_root->rb_node;
+	while (node) {
+		cur = container_of(node, struct tree_mod_elem, node);
+		if (cur->index < index) {
+			node = node->rb_left;
+		} else if (cur->index > index) {
+			node = node->rb_right;
+		} else if (cur->elem.seq < min_seq) {
+			node = node->rb_left;
+		} else if (!smallest) {
+			/* we want the node with the highest seq */
+			if (found)
+				BUG_ON(found->elem.seq > cur->elem.seq);
+			found = cur;
+			node = node->rb_left;
+		} else if (cur->elem.seq > min_seq) {
+			/* we want the node with the smallest seq */
+			if (found)
+				BUG_ON(found->elem.seq < cur->elem.seq);
+			found = cur;
+			node = node->rb_right;
+		} else {
+			/* exact match; drop the lock below before returning */
+			found = cur;
+			break;
+		}
+	}
+	read_unlock(&fs_info->tree_mod_log_lock);
+
+	return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+			   u64 min_seq)
+{
+	return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+	return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
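+/*
+ * log every key in the range as removed from src and added to dst,
+ * then perform the actual copy
+ */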
+static inline void
+__copy_extent_buffer_log(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *dst, struct extent_buffer *src,
+			 unsigned long dst_offset, unsigned long src_offset,
+			 int nr_items, size_t item_size)
+{
+	int ret;
+	int i;
+
+	/* TODO: speed this up by using a single seq for all operations? */
+	for (i = 0; i < nr_items; i++) {
+		ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
+					      MOD_LOG_KEY_REMOVE);
+		BUG_ON(ret < 0);
+		ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
+					      MOD_LOG_KEY_ADD);
+		BUG_ON(ret < 0);
+	}
+
+	copy_extent_buffer(dst, src, btrfs_node_key_ptr_offset(dst_offset),
+			   btrfs_node_key_ptr_offset(src_offset),
+			   nr_items * item_size);
+}
+
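+/* log a single move element for the whole range, then do the memmove */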
+static inline void
+__memmove_extent_buffer_log(struct btrfs_fs_info *fs_info,
+			    struct extent_buffer *dst,
+			    int dst_offset, int src_offset, int nr_items,
+			    size_t item_size, int tree_mod_log)
+{
+	int ret;
+
+	if (tree_mod_log) {
+		ret = tree_mod_log_insert_move(fs_info, dst, dst_offset,
+					       src_offset, nr_items, GFP_NOFS);
+		BUG_ON(ret < 0);
+	}
+	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst_offset),
+			      btrfs_node_key_ptr_offset(src_offset),
+			      nr_items * item_size);
+}
+
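+/* log the key replace, then update the key in the node */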
+static inline void
+__set_node_key_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+		   struct btrfs_disk_key *disk_key, int nr, int atomic)
+{
+	int ret;
+
+	ret = tree_mod_log_insert_key_mask(fs_info, eb, nr, MOD_LOG_KEY_REPLACE,
+					   atomic ? GFP_ATOMIC : GFP_NOFS);
+	BUG_ON(ret < 0);
+
+	btrfs_set_node_key(eb, disk_key, nr);
+}
+
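+/*
+ * log the removal of every key of a node that goes out of use as root,
+ * so that its content remains reconstructible from the log
+ */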
+static void __log_cleaning(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *eb)
+{
+	int i;
+	int ret;
+	u32 nritems;
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = nritems - 1; i >= 0; i--) {
+		ret = tree_mod_log_insert_key(fs_info, eb, i,
+					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+		BUG_ON(ret < 0);
+	}
+}
+
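+/* replace root->node, logging the old root's content and the switch */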
+static inline void
+set_root_pointer(struct btrfs_root *root, struct extent_buffer *new_root_node)
+{
+	int ret;
+	__log_cleaning(root->fs_info, root->node);
+	ret = tree_mod_log_insert_root(root->fs_info, root->node,
+				       new_root_node, GFP_NOFS);
+	BUG_ON(ret < 0);
+	rcu_assign_pointer(root->node, new_root_node);
+}
+
 /*
  * check if the tree block can be shared by multiple trees
  */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6774821..e53bfb9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1132,7 +1132,7 @@ struct btrfs_fs_info {
 	/* this protects tree_mod_seq_list */
 	spinlock_t tree_mod_seq_lock;
 	atomic_t tree_mod_seq;
-	struct list_head tree_mod_list;
+	struct list_head tree_mod_seq_list;
 
 	/* this protects tree_mod_log */
 	rwlock_t tree_mod_log_lock;
@@ -3114,4 +3114,9 @@ struct seq_list {
 	u32 flags;
 };
 
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem);
+
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6aec7c6..f51ad84 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1921,7 +1921,7 @@ int open_ctree(struct super_block *sb,
 	init_completion(&fs_info->kobj_unregister);
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
-	INIT_LIST_HEAD(&fs_info->tree_mod_list);
+	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv);
 	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-- 
1.7.3.4
