On Thu, Dec 03, 2015 at 12:59:50PM +0100, Christoph Hellwig wrote:
> The btrfs clone ioctls are now adopted by other file systems, with NFS
> and CIFS already having support for them, and XFS being under active
> development. To avoid growth of various slightly incompatible
> implementations, add one to the VFS. Note that clones are different from
> file copies in several ways:
>
> - they are atomic vs other writers
> - they support whole file clones
> - they support 64-bit length clones
> - they do not allow partial success (aka short writes)
> - clones are expected to be a fast metadata operation
>
> Because of that it would be rather cumbersome to try to piggyback them on
> top of the recent copy_file_range infrastructure. The converse isn't
> true and the copy_file_range system call could try clone file range as
> a first attempt to copy, something that further patches will enable.
>
> Based on earlier work from Peng Tao.
<snip>
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 6c1aa73..9e3dd8f 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1451,3 +1451,75 @@ out1:
> out2:
> return ret;
> }
> +
> +static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
> +{
> + struct inode *inode = file_inode(file);
> +
> + if (unlikely(pos < 0))
> + return -EINVAL;
> +
> + if (unlikely((loff_t) (pos + len) < 0))
> + return -EINVAL;
> +
> + if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
> + loff_t end = len ? pos + len - 1 : OFFSET_MAX;
> + int retval;
> +
> + retval = locks_mandatory_area(file, pos, end,
> + write ? F_WRLCK : F_RDLCK);
> + if (retval < 0)
> + return retval;
> + }
> +
> + return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
> +}
> +
> +int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out, u64 len)
> +{
> + struct inode *inode_in = file_inode(file_in);
> + struct inode *inode_out = file_inode(file_out);
> + int ret;
> +
> + if (inode_in->i_sb != inode_out->i_sb ||
> + file_in->f_path.mnt != file_out->f_path.mnt)
> + return -EXDEV;
> +
> + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
> + return -EISDIR;
> + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
> + return -EOPNOTSUPP;
I thought we were moving to -EINVAL for wrong file types?
Though, perhaps "I've also prepared a btrfs patch for this and clone" from the
earlier thread about generic/157 wasn't referring to /this/ patch. :)
In any case, I'm ok with EINVAL, and I haven't heard any objections to
changing -EOPNOTSUPP -> -EINVAL when trying to reflink/dedupe/whatever
non-file non-dir fds.
<shrug> Anyone object?
--D
> +
> + if (!(file_in->f_mode & FMODE_READ) ||
> + !(file_out->f_mode & FMODE_WRITE) ||
> + (file_out->f_flags & O_APPEND) ||
> + !file_in->f_op->clone_file_range)
> + return -EBADF;
> +
> + ret = clone_verify_area(file_in, pos_in, len, false);
> + if (ret)
> + return ret;
> +
> + ret = clone_verify_area(file_out, pos_out, len, true);
> + if (ret)
> + return ret;
> +
> + if (pos_in + len > i_size_read(inode_in))
> + return -EINVAL;
> +
> + ret = mnt_want_write_file(file_out);
> + if (ret)
> + return ret;
> +
> + ret = file_in->f_op->clone_file_range(file_in, pos_in,
> + file_out, pos_out, len);
> + if (!ret) {
> + fsnotify_access(file_in);
> + fsnotify_modify(file_out);
> + }
> +
> + mnt_drop_write_file(file_out);
> + return ret;
> +}
> +EXPORT_SYMBOL(vfs_clone_file_range);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index af559ac..59bf96d 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1629,7 +1629,10 @@ struct file_operations {
> #ifndef CONFIG_MMU
> unsigned (*mmap_capabilities)(struct file *);
> #endif
> - ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
> + ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
> + loff_t, size_t, unsigned int);
> + int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
> + u64);
> };
>
> struct inode_operations {
> @@ -1683,6 +1686,8 @@ extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
> unsigned long, loff_t *);
> extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
> loff_t, size_t, unsigned int);
> +extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
> + struct file *file_out, loff_t pos_out, u64 len);
>
> struct super_operations {
> struct inode *(*alloc_inode)(struct super_block *sb);
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index f15d980..cd5db7f 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -39,6 +39,13 @@
> #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */
> #define RENAME_WHITEOUT (1 << 2) /* Whiteout source */
>
> +struct file_clone_range {
> + __s64 src_fd;
> + __u64 src_offset;
> + __u64 src_length;
> + __u64 dest_offset;
> +};
> +
> struct fstrim_range {
> __u64 start;
> __u64 len;
> @@ -159,6 +166,8 @@ struct inodes_stat_t {
> #define FIFREEZE _IOWR('X', 119, int) /* Freeze */
> #define FITHAW _IOWR('X', 120, int) /* Thaw */
> #define FITRIM _IOWR('X', 121, struct fstrim_range) /* Trim */
> +#define FICLONE _IOW(0x94, 9, int)
> +#define FICLONERANGE _IOW(0x94, 13, struct file_clone_range)
>
> #define FS_IOC_GETFLAGS _IOR('f', 1, long)
> #define FS_IOC_SETFLAGS _IOW('f', 2, long)
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html