On Mon, Jul 16, 2012 at 4:56 PM, Arne Jansen <sensille@xxxxxxx> wrote:
> On 04.07.2012 15:38, Alexander Block wrote:
>> This patch introduces uuids for subvolumes. Each
>> subvolume has its own uuid. In case it was snapshotted,
>> it also contains parent_uuid. In case it was received,
>> it also contains received_uuid.
>>
>> It also introduces subvolume ctime/otime/stime/rtime. The
>> first two are comparable to the times found in inodes. otime
>> is the origin/creation time and ctime is the change time.
>> stime/rtime are only valid on received subvolumes.
>> stime is the time of the subvolume when it was
>> sent. rtime is the time of the subvolume when it was
>> received.
>>
>> In addition to the times, we have a transid for each
>> time. They are updated at the same place as the times.
>>
>> btrfs receive uses stransid and rtransid to find out
>> if a received subvolume changed in the meantime.
>>
>> If an older kernel mounts a filesystem with the
>> extended fields, all fields become invalid. The next
>> mount with a new kernel will detect this and reset the
>> fields.
>>
>> Signed-off-by: Alexander Block <ablock84@xxxxxxxxxxxxxx>
>> ---
>> fs/btrfs/ctree.h | 43 ++++++++++++++++++++++
>> fs/btrfs/disk-io.c | 2 +
>> fs/btrfs/inode.c | 4 ++
>> fs/btrfs/ioctl.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++--
>> fs/btrfs/ioctl.h | 13 +++++++
>> fs/btrfs/root-tree.c | 92 +++++++++++++++++++++++++++++++++++++++++++---
>> fs/btrfs/transaction.c | 17 +++++++++
>> 7 files changed, 258 insertions(+), 9 deletions(-)
>>
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 8cfde93..2bd5df8 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -709,6 +709,35 @@ struct btrfs_root_item {
>> struct btrfs_disk_key drop_progress;
>> u8 drop_level;
>> u8 level;
>> +
>> + /*
>> + * The following fields appear after subvol_uuids+subvol_times
>> + * were introduced.
>> + */
>> +
>> + /*
>> + * This generation number is used to test if the new fields are valid
>> + * and up to date while reading the root item. Every time the root item
>> + * is written out, the "generation" field is copied into this field. If
>> + * anyone ever mounted the fs with an older kernel, we will have
>> + * mismatching generation values here and thus must invalidate the
>> + * new fields. See btrfs_update_root and btrfs_find_last_root for
>> + * details.
>> + * the offset of generation_v2 is also used as the start for the memset
>> + * when invalidating the fields.
>> + */
>> + __le64 generation_v2;
>> + u8 uuid[BTRFS_UUID_SIZE];
>> + u8 parent_uuid[BTRFS_UUID_SIZE];
>> + u8 received_uuid[BTRFS_UUID_SIZE];
>> + __le64 ctransid; /* updated when an inode changes */
>> + __le64 otransid; /* trans when created */
>> + __le64 stransid; /* trans when sent. non-zero for received subvol */
>> + __le64 rtransid; /* trans when received. non-zero for received subvol */
>> + struct btrfs_timespec ctime;
>> + struct btrfs_timespec otime;
>> + struct btrfs_timespec stime;
>> + struct btrfs_timespec rtime;
>> } __attribute__ ((__packed__));
>>
>> /*
>> @@ -1416,6 +1445,8 @@ struct btrfs_root {
>> dev_t anon_dev;
>>
>> int force_cow;
>> +
>> + spinlock_t root_times_lock;
>> };
>>
>> struct btrfs_ioctl_defrag_range_args {
>> @@ -2189,6 +2220,16 @@ BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
>> BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
>> BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
>> last_snapshot, 64);
>> +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
>> + generation_v2, 64);
>> +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
>> + ctransid, 64);
>> +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
>> + otransid, 64);
>> +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
>> + stransid, 64);
>> +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
>> + rtransid, 64);
>>
>> static inline bool btrfs_root_readonly(struct btrfs_root *root)
>> {
>> @@ -2829,6 +2870,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
>> void btrfs_set_root_node(struct btrfs_root_item *item,
>> struct extent_buffer *node);
>> void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
>> +void btrfs_update_root_times(struct btrfs_trans_handle *trans,
>> + struct btrfs_root *root);
>>
>> /* dir-item.c */
>> int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 7b845ff..d3b49ad 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -1182,6 +1182,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
>> root->defrag_running = 0;
>> root->root_key.objectid = objectid;
>> root->anon_dev = 0;
>> +
>> + spin_lock_init(&root->root_times_lock);
>> }
>>
>> static int __must_check find_and_setup_root(struct btrfs_root *tree_root,
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index 139be17..0f6a65d 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -2734,6 +2734,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
>> */
>> if (!btrfs_is_free_space_inode(root, inode)
>> && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
>> + btrfs_update_root_times(trans, root);
>> +
>> ret = btrfs_delayed_update_inode(trans, root, inode);
>> if (!ret)
>> btrfs_set_inode_last_trans(trans, inode);
>> @@ -4728,6 +4730,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
>> trace_btrfs_inode_new(inode);
>> btrfs_set_inode_last_trans(trans, inode);
>>
>> + btrfs_update_root_times(trans, root);
>> +
>> return inode;
>> fail:
>> if (dir)
>> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
>> index 7011871..8d258cb 100644
>> --- a/fs/btrfs/ioctl.c
>> +++ b/fs/btrfs/ioctl.c
>> @@ -41,6 +41,7 @@
>> #include <linux/vmalloc.h>
>> #include <linux/slab.h>
>> #include <linux/blkdev.h>
>> +#include <linux/uuid.h>
>> #include "compat.h"
>> #include "ctree.h"
>> #include "disk-io.h"
>> @@ -346,11 +347,13 @@ static noinline int create_subvol(struct btrfs_root *root,
>> struct btrfs_root *new_root;
>> struct dentry *parent = dentry->d_parent;
>> struct inode *dir;
>> + struct timespec cur_time = CURRENT_TIME;
>> int ret;
>> int err;
>> u64 objectid;
>> u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
>> u64 index = 0;
>> + uuid_le new_uuid;
>>
>> ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
>> if (ret)
>> @@ -389,8 +392,9 @@ static noinline int create_subvol(struct btrfs_root *root,
>> BTRFS_UUID_SIZE);
>> btrfs_mark_buffer_dirty(leaf);
>>
>> + memset(&root_item, 0, sizeof(root_item));
>> +
>> inode_item = &root_item.inode;
>> - memset(inode_item, 0, sizeof(*inode_item));
>> inode_item->generation = cpu_to_le64(1);
>> inode_item->size = cpu_to_le64(3);
>> inode_item->nlink = cpu_to_le32(1);
>> @@ -408,8 +412,15 @@ static noinline int create_subvol(struct btrfs_root *root,
>> btrfs_set_root_used(&root_item, leaf->len);
>> btrfs_set_root_last_snapshot(&root_item, 0);
>>
>> - memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
>> - root_item.drop_level = 0;
>> + btrfs_set_root_generation_v2(&root_item,
>> + btrfs_root_generation(&root_item));
>> + uuid_le_gen(&new_uuid);
>> + memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
>> + root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
>> + root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec);
>> + root_item.ctime = root_item.otime;
>> + btrfs_set_root_ctransid(&root_item, trans->transid);
>> + btrfs_set_root_otransid(&root_item, trans->transid);
>>
>> btrfs_tree_unlock(leaf);
>> free_extent_buffer(leaf);
>> @@ -3395,6 +3406,83 @@ out:
>> return ret;
>> }
>>
>> +static long btrfs_ioctl_set_received_subvol(struct file *file,
>> + void __user *arg)
>> +{
>> + struct btrfs_ioctl_received_subvol_args *sa = NULL;
>> + struct inode *inode = fdentry(file)->d_inode;
>> + struct btrfs_root *root = BTRFS_I(inode)->root;
>> + struct btrfs_root_item *root_item = &root->root_item;
>> + struct btrfs_trans_handle *trans;
>> + int ret = 0;
>> +
>> + ret = mnt_want_write_file(file);
>> + if (ret < 0)
>> + return ret;
>> +
>> + down_write(&root->fs_info->subvol_sem);
>> +
>> + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> +
>> + if (btrfs_root_readonly(root)) {
>> + ret = -EROFS;
>> + goto out;
>> + }
>> +
>> + if (!inode_owner_or_capable(inode)) {
>> + ret = -EACCES;
>> + goto out;
>> + }
>> +
>> + sa = memdup_user(arg, sizeof(*sa));
>> + if (IS_ERR(sa)) {
>> + ret = PTR_ERR(sa);
>> + sa = NULL;
>> + goto out;
>> + }
>> +
>> + trans = btrfs_start_transaction(root, 1);
>> + if (IS_ERR(trans)) {
>> + ret = PTR_ERR(trans);
>> + trans = NULL;
>> + goto out;
>> + }
>> +
>> + sa->rtransid = trans->transid;
>> + sa->rtime = CURRENT_TIME;
>> +
>> + memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
>> + btrfs_set_root_stransid(root_item, sa->stransid);
>> + btrfs_set_root_rtransid(root_item, sa->rtransid);
>> + root_item->stime.sec = cpu_to_le64(sa->stime.tv_sec);
>> + root_item->stime.nsec = cpu_to_le64(sa->stime.tv_nsec);
>> + root_item->rtime.sec = cpu_to_le64(sa->rtime.tv_sec);
>> + root_item->rtime.nsec = cpu_to_le64(sa->rtime.tv_nsec);
>> +
>> + ret = btrfs_update_root(trans, root->fs_info->tree_root,
>> + &root->root_key, &root->root_item);
>> + if (ret < 0) {
>> + goto out;
>
> are you leaking a trans handle here?
>
btrfs_update_root already aborts the transaction when it fails. Do I
still need to call end_transaction in that case?
>> + } else {
>> + ret = btrfs_commit_transaction(trans, root);
>> + if (ret < 0)
>> + goto out;
>> + }
>> +
>> + ret = copy_to_user(arg, sa, sizeof(*sa));
>> + if (ret)
>> + ret = -EFAULT;
>> +
>> +out:
>> + kfree(sa);
>> + up_write(&root->fs_info->subvol_sem);
>> + mnt_drop_write_file(file);
>> + return ret;
>> +}
>> +
>> long btrfs_ioctl(struct file *file, unsigned int
>> cmd, unsigned long arg)
>> {
>> @@ -3477,6 +3565,8 @@ long btrfs_ioctl(struct file *file, unsigned int
>> return btrfs_ioctl_balance_ctl(root, arg);
>> case BTRFS_IOC_BALANCE_PROGRESS:
>> return btrfs_ioctl_balance_progress(root, argp);
>> + case BTRFS_IOC_SET_RECEIVED_SUBVOL:
>> + return btrfs_ioctl_set_received_subvol(file, argp);
>> case BTRFS_IOC_GET_DEV_STATS:
>> return btrfs_ioctl_get_dev_stats(root, argp, 0);
>> case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
>> diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
>> index e440aa6..c9e3fac 100644
>> --- a/fs/btrfs/ioctl.h
>> +++ b/fs/btrfs/ioctl.h
>> @@ -295,6 +295,15 @@ struct btrfs_ioctl_get_dev_stats {
>> __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
>> };
>>
>> +struct btrfs_ioctl_received_subvol_args {
>> + char uuid[BTRFS_UUID_SIZE]; /* in */
>> + __u64 stransid; /* in */
>> + __u64 rtransid; /* out */
>> + struct timespec stime; /* in */
>> + struct timespec rtime; /* out */
>> + __u64 reserved[16];
>
> What is this reserved field used for? I don't see a mechanism that could
> be used to signal that there is useful information here, other than
> using a different ioctl.
>
The reserved is a result of a suggestion made by David. I can remove
it again if you want...
>> +};
>> +
>> #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
>> struct btrfs_ioctl_vol_args)
>> #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
>> @@ -359,6 +368,10 @@ struct btrfs_ioctl_get_dev_stats {
>> struct btrfs_ioctl_ino_path_args)
>> #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
>> struct btrfs_ioctl_ino_path_args)
>> +
>> +#define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \
>> + struct btrfs_ioctl_received_subvol_args)
>> +
>> #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
>> struct btrfs_ioctl_get_dev_stats)
>> #define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
>> diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
>> index 24fb8ce..17d638e 100644
>> --- a/fs/btrfs/root-tree.c
>> +++ b/fs/btrfs/root-tree.c
>> @@ -16,6 +16,7 @@
>> * Boston, MA 021110-1307, USA.
>> */
>>
>> +#include <linux/uuid.h>
>> #include "ctree.h"
>> #include "transaction.h"
>> #include "disk-io.h"
>> @@ -25,6 +26,9 @@
>> * lookup the root with the highest offset for a given objectid. The key we do
>> * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
>> * on error.
>> + * We also check if the root was once mounted with an older kernel. If we detect
>> + * this, the new fields coming after 'level' get overwritten with zeros to
>> + * invalidate them.
>
> ... "This is detected by a mismatch of the 2 generation fields" ... or something
> like that.
>
The current version (so far only in git) adds this new function, which
is called from btrfs_find_last_root:
void btrfs_read_root_item(struct btrfs_root *root,
struct extent_buffer *eb, int slot,
struct btrfs_root_item *item)
The comment above this function explains what happens.
>> */
>> int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
>> struct btrfs_root_item *item, struct btrfs_key *key)
>> @@ -35,6 +39,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
>> struct extent_buffer *l;
>> int ret;
>> int slot;
>> + int len;
>> + int need_reset = 0;
>> + uuid_le uuid;
>>
>> search_key.objectid = objectid;
>> search_key.type = BTRFS_ROOT_ITEM_KEY;
>> @@ -60,11 +67,36 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
>> ret = 1;
>> goto out;
>> }
>> - if (item)
>> + if (item) {
>> + len = btrfs_item_size_nr(l, slot);
>> read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
>> - sizeof(*item));
>> + min_t(int, len, (int)sizeof(*item)));
>> + if (len < sizeof(*item))
>> + need_reset = 1;
>> + if (!need_reset && btrfs_root_generation(item)
>> + != btrfs_root_generation_v2(item)) {
>> + if (btrfs_root_generation_v2(item) != 0) {
>> + printk(KERN_WARNING "btrfs: mismatching "
>> + "generation and generation_v2 "
>> + "found in root item. This root "
>> + "was probably mounted with an "
>> + "older kernel. Resetting all "
>> + "new fields.\n");
>> + }
>> + need_reset = 1;
>> + }
>> + if (need_reset) {
>> + memset(&item->generation_v2, 0,
>> + sizeof(*item) - offsetof(struct btrfs_root_item,
>> + generation_v2));
>> +
>> + uuid_le_gen(&uuid);
>> + memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
>> + }
>> + }
>> if (key)
>> memcpy(key, &found_key, sizeof(found_key));
>> +
>> ret = 0;
>> out:
>> btrfs_free_path(path);
>> @@ -91,16 +123,15 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
>> int ret;
>> int slot;
>> unsigned long ptr;
>> + int old_len;
>>
>> path = btrfs_alloc_path();
>> if (!path)
>> return -ENOMEM;
>>
>> ret = btrfs_search_slot(trans, root, key, path, 0, 1);
>> - if (ret < 0) {
>> - btrfs_abort_transaction(trans, root, ret);
>> - goto out;
>> - }
>> + if (ret < 0)
>> + goto out_abort;
>>
>> if (ret != 0) {
>> btrfs_print_leaf(root, path->nodes[0]);
>> @@ -113,11 +144,47 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
>> l = path->nodes[0];
>> slot = path->slots[0];
>> ptr = btrfs_item_ptr_offset(l, slot);
>> + old_len = btrfs_item_size_nr(l, slot);
>> +
>> + /*
>> + * If this is the first time we update the root item which originated
>> + * from an older kernel, we need to enlarge the item size to make room
>> + * for the added fields.
>> + */
>> + if (old_len < sizeof(*item)) {
>> + btrfs_release_path(path);
>> + ret = btrfs_search_slot(trans, root, key, path,
>> + -1, 1);
>> + if (ret < 0)
>> + goto out_abort;
>> + ret = btrfs_del_item(trans, root, path);
>> + if (ret < 0)
>> + goto out_abort;
>> + btrfs_release_path(path);
>> + ret = btrfs_insert_empty_item(trans, root, path,
>> + key, sizeof(*item));
>> + if (ret < 0)
>> + goto out_abort;
>> + l = path->nodes[0];
>> + slot = path->slots[0];
>> + ptr = btrfs_item_ptr_offset(l, slot);
>> + }
>> +
>> + /*
>> + * Update generation_v2 so at the next mount we know the new root
>> + * fields are valid.
>> + */
>> + btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
>> +
>> write_extent_buffer(l, item, ptr, sizeof(*item));
>> btrfs_mark_buffer_dirty(path->nodes[0]);
>> out:
>> btrfs_free_path(path);
>> return ret;
>> +
>> +out_abort:
>> + btrfs_abort_transaction(trans, root, ret);
>> + goto out;
>> }
>>
>> int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>> @@ -454,3 +521,16 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
>> root_item->byte_limit = 0;
>> }
>> }
>> +
>> +void btrfs_update_root_times(struct btrfs_trans_handle *trans,
>> + struct btrfs_root *root)
>> +{
>> + struct btrfs_root_item *item = &root->root_item;
>> + struct timespec ct = CURRENT_TIME;
>> +
>> + spin_lock(&root->root_times_lock);
>> + item->ctransid = trans->transid;
>> + item->ctime.sec = cpu_to_le64(ct.tv_sec);
>> + item->ctime.nsec = cpu_to_le64(ct.tv_nsec);
>> + spin_unlock(&root->root_times_lock);
>> +}
>> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
>> index b72b068..a21f308 100644
>> --- a/fs/btrfs/transaction.c
>> +++ b/fs/btrfs/transaction.c
>> @@ -22,6 +22,7 @@
>> #include <linux/writeback.h>
>> #include <linux/pagemap.h>
>> #include <linux/blkdev.h>
>> +#include <linux/uuid.h>
>> #include "ctree.h"
>> #include "disk-io.h"
>> #include "transaction.h"
>> @@ -926,11 +927,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
>> struct dentry *dentry;
>> struct extent_buffer *tmp;
>> struct extent_buffer *old;
>> + struct timespec cur_time = CURRENT_TIME;
>> int ret;
>> u64 to_reserve = 0;
>> u64 index = 0;
>> u64 objectid;
>> u64 root_flags;
>> + uuid_le new_uuid;
>>
>> rsv = trans->block_rsv;
>>
>> @@ -1016,6 +1019,20 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
>> root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
>> btrfs_set_root_flags(new_root_item, root_flags);
>>
>> + btrfs_set_root_generation_v2(new_root_item,
>> + trans->transid);
>> + uuid_le_gen(&new_uuid);
>> + memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
>> + memcpy(new_root_item->parent_uuid, root->root_item.uuid,
>> + BTRFS_UUID_SIZE);
>> + new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
>> + new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec);
>> + btrfs_set_root_otransid(new_root_item, trans->transid);
>> + memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
>> + memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
>> + btrfs_set_root_stransid(new_root_item, 0);
>> + btrfs_set_root_rtransid(new_root_item, 0);
>> +
>> old = btrfs_lock_root_node(root);
>> ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
>> if (ret) {
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html