Index: include/unistd.h
===================================================================
--- include/unistd.h
+++ include/unistd.h
@@ -494,6 +494,7 @@
 int	 acct(const char *);
 int	 async_daemon(void);
 int	 check_utility_compat(const char *);
+ssize_t	 copy_file_range(int, off_t *, int, off_t *, size_t, u_int);
 const char *
 	 crypt_get_format(void);
 char	*crypt_r(const char *, const char *, struct crypt_data *);
Index: lib/libc/sys/Makefile.inc
===================================================================
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -175,6 +175,7 @@
 	closefrom.2 \
 	connect.2 \
 	connectat.2 \
+	copy_file_range.2 \
 	cpuset.2 \
 	cpuset_getaffinity.2 \
 	cpuset_getdomain.2 \
Index: lib/libc/sys/Symbol.map
===================================================================
--- lib/libc/sys/Symbol.map
+++ lib/libc/sys/Symbol.map
@@ -402,6 +402,7 @@
 };
 
 FBSD_1.6 {
+	copy_file_range;
 	fhlink;
 	fhlinkat;
 	fhreadlink;
Index: lib/libc/sys/copy_file_range.2
===================================================================
--- lib/libc/sys/copy_file_range.2
+++ lib/libc/sys/copy_file_range.2
@@ -0,0 +1,153 @@
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2019 Rick Macklem
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd June 9, 2019
+.Dt COPY_FILE_RANGE 2
+.Os
+.Sh NAME
+.Nm copy_file_range
+.Nd kernel copy of a byte range from one file to another
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In sys/types.h
+.In unistd.h
+.Ft ssize_t
+.Fo copy_file_range
+.Fa "int infd"
+.Fa "off_t *inoffp"
+.Fa "int outfd"
+.Fa "off_t *outoffp"
+.Fa "size_t len"
+.Fa "u_int flags"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn copy_file_range
+system call
+copies
+.Fa len
+bytes from
+.Fa infd
+to
+.Fa outfd
+in the kernel.
+It may do this using a file system specific technique if
+.Fa infd
+and
+.Fa outfd
+are on the same file system.
+The
+.Fa infd
+argument must be opened for reading and the
+.Fa outfd
+argument must be opened for writing, but not
+.Dv O_APPEND .
+If
+.Fa inoffp
+or
+.Fa outoffp
+is
+.Dv NULL ,
+the file offset for
+.Fa infd
+or
+.Fa outfd
+respectively will be used and updated by
+the number of bytes copied.
+If
+.Fa inoffp
+or
+.Fa outoffp
+is not
+.Dv NULL ,
+the byte offset pointed to by
+.Fa inoffp
+or
+.Fa outoffp
+respectively will be used/updated and the file offset for
+.Fa infd
+or
+.Fa outfd
+respectively will not be affected.
+The
+.Fa flags
+argument is currently ignored and should be set to 0.
+.Sh RETURN VALUES
+If it succeeds, the call returns the number of bytes copied, which can be less
+than
+.Fa len .
+.Fn copy_file_range
+should be used in a loop until copying of the desired byte range has been
+completed.
+If an error has occurred, a \-1 is returned and the error code is placed in
+the global variable
+.Va errno .
+.Sh ERRORS
+The
+.Fn copy_file_range
+system call
+will fail if:
+.Bl -tag -width Er
+.It Bq Er EBADF
+If
+.Fa infd
+is not open for reading or
+.Fa outfd
+is not open for writing, or opened for writing with
+.Dv O_APPEND .
+.It Bq Er EFBIG
+If the copy exceeds the process's file size limit or the maximum file size
+for the file system
+.Fa outfd
+resides on.
+.It Bq Er EINVAL
+If the initial offset for
+.Fa infd
+plus
+.Fa len
+exceeds EOF for
+.Fa infd .
+.It Bq Er EIO
+An I/O error occurred while reading/writing the files.
+.It Bq Er EISDIR
+If either
+.Fa infd
+or
+.Fa outfd
+refers to a directory.
+.El
+.Sh STANDARDS
+The
+.Fn copy_file_range
+system call is expected to be compatible with the Linux system call of
+the same name.
+.Sh HISTORY
+The
+.Fn copy_file_range
+system call appeared in
+.Fx 13.0 .
Index: sys/kern/syscalls.master
===================================================================
--- sys/kern/syscalls.master
+++ sys/kern/syscalls.master
@@ -3175,6 +3175,16 @@
 		    int flag
 		);
 	}
+569	AUE_NULL	STD {
+		ssize_t copy_file_range(
+		    _In_ int infd,
+		    _Inout_opt_ off_t *inoffp,
+		    _In_ int outfd,
+		    _Inout_opt_ off_t *outoffp,
+		    _In_ size_t len,
+		    _In_ u_int flags
+		);
+	}
 
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -140,6 +140,7 @@
 	.vop_set_text =		vop_stdset_text,
 	.vop_unset_text =	vop_stdunset_text,
 	.vop_add_writecount =	vop_stdadd_writecount,
+	.vop_copy_file_range =	VOP_EOPNOTSUPP,
 };
 
 /*
Index: sys/kern/vfs_syscalls.c
===================================================================
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -4814,3 +4814,130 @@
 	    uap->advice);
 	return (kern_posix_error(td, error));
 }
+
+/*
+ * Kernel back end for copy_file_range(2): copy up to len bytes from the file
+ * open on infd to the file open on outfd.
+ *
+ * A NULL inoffp or outoffp selects that descriptor's own file offset, which
+ * is then advanced by the copy; otherwise the off_t pointed to is used and
+ * updated instead.  The number of bytes copied is stored in
+ * td->td_retval[0] and 0 or an errno value is returned.
+ */
+int
+kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
+    off_t *outoffp, size_t len, u_int flags)
+{
+	struct file *infp, *outfp;
+	struct vnode *invp, *outvp;
+	int error, lock_flags;
+	size_t retlen;
+
+	infp = outfp = NULL;
+	retlen = 0;
+
+	/* Get the file structures for the file descriptors. */
+	error = fget_read(td, infd, &cap_read_rights, &infp);
+	if (error != 0)
+		goto out;
+	error = fget_write(td, outfd, &cap_write_rights, &outfp);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Set the offset pointers to the correct place.
+	 * NOTE(review): f_offset is read and updated here without any
+	 * serialization against other users of the descriptor; confirm
+	 * whether the file offset lock should be held.
+	 */
+	if (inoffp == NULL)
+		inoffp = &infp->f_offset;
+	if (outoffp == NULL)
+		outoffp = &outfp->f_offset;
+
+	/* Sanity check the f_flag bits. */
+	if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
+	    (infp->f_flag & FREAD) == 0) {
+		error = EBADF;
+		goto out;
+	}
+
+	/* Descriptors that are not backed by a vnode cannot be copied. */
+	invp = infp->f_vnode;
+	outvp = outfp->f_vnode;
+	if (invp == NULL || outvp == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Lock the vnodes.  When both descriptors refer to the same vnode
+	 * it is locked only once, so that this thread never waits for a
+	 * lock that it already holds.
+	 * NOTE(review): overlapping ranges within a single file are not
+	 * rejected; confirm the intended semantics.
+	 */
+	if (MNT_SHARED_WRITES(outvp->v_mount))
+		lock_flags = LK_SHARED;
+	else
+		lock_flags = LK_EXCLUSIVE;
+	if (invp != outvp) {
+		error = vn_lock(invp, LK_SHARED);
+		if (error != 0)
+			goto out;
+	}
+	error = vn_lock(outvp, lock_flags);
+	if (error != 0) {
+		/* outvp is not locked here; only invp has to be released. */
+		if (invp != outvp)
+			VOP_UNLOCK(invp, 0);
+		goto out;
+	}
+
+	retlen = len;
+	error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
+	    flags);
+	if (invp != outvp)
+		VOP_UNLOCK(invp, 0);
+	VOP_UNLOCK(outvp, 0);
+out:
+	if (infp != NULL)
+		fdrop(infp, td);
+	if (outfp != NULL)
+		fdrop(outfp, td);
+	td->td_retval[0] = retlen;
+	return (error);
+}
+
+/*
+ * copy_file_range(2) system call: copy the optional offsets in from user
+ * space, perform the copy and, if it succeeded, copy the updated offsets
+ * back out.
+ */
+int
+sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
+{
+	off_t inoff, outoff, *inoffp, *outoffp;
+	int error;
+
+	inoffp = outoffp = NULL;
+	if (uap->inoffp != NULL) {
+		error = copyin(uap->inoffp, &inoff, sizeof(off_t));
+		if (error != 0)
+			return (error);
+		inoffp = &inoff;
+	}
+	if (uap->outoffp != NULL) {
+		error = copyin(uap->outoffp, &outoff, sizeof(off_t));
+		if (error != 0)
+			return (error);
+		outoffp = &outoff;
+	}
+	error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
+	    outoffp, uap->len, uap->flags);
+	if (error == 0 && uap->inoffp != NULL)
+		error = copyout(inoffp, uap->inoffp, sizeof(off_t));
+	if (error == 0 && uap->outoffp != NULL)
+		error = copyout(outoffp, uap->outoffp, sizeof(off_t));
+	return (error);
+}
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -61,6 +61,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -2494,6 +2495,176 @@
 	va->va_fsid += (uint32_t)f->val[0];
 }
 
+/*
+ * Return non-zero if the first len bytes of dat are all zero.
+ */
+static int
+vn_block_is_zero(const char *dat, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		if (dat[i] != 0)
+			return (0);
+	}
+	return (1);
+}
+
+/*
+ * Copy *lenp bytes from invp, starting at *inoffp, to outvp, starting at
+ * *outoffp.  The caller is expected to hold both vnodes locked.
+ *
+ * VOP_COPY_FILE_RANGE() is tried first when both vnodes are on the same
+ * mount.  If it returns EOPNOTSUPP, or the mounts differ, the data is copied
+ * here one block at a time, leaving blocks of zeros beyond the end of the
+ * output file unwritten so that they become holes.
+ *
+ * On return *lenp holds the number of bytes copied; the generic copy also
+ * advances *inoffp and *outoffp by that amount.  Returns 0 or an errno
+ * value, including EISDIR for a directory and EINVAL for a bad offset, a
+ * non-regular file or a range that extends past the end of invp.
+ */
+int
+vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
+    off_t *outoffp, size_t *lenp, u_int flags)
+{
+	struct statfs *sfp;
+	struct vattr va;
+	struct mount *mp;
+	u_long blksize;
+	int error, xfer;
+	ssize_t aresid;
+	size_t len;
+	off_t outsize;
+	char *dat;
+	struct thread *td = curthread;
+
+	len = *lenp;
+	error = 0;
+	mp = NULL;
+	/* Do some sanity checks on the arguments. */
+	if (invp->v_type == VDIR || outvp->v_type == VDIR)
+		error = EISDIR;
+	else if (*inoffp < 0 || (*inoffp + len) < *inoffp || *outoffp < 0 ||
+	    (*outoffp + len) < *outoffp || invp->v_type != VREG ||
+	    outvp->v_type != VREG)
+		error = EINVAL;
+	/* Check that the offset + len does not go past EOF of invp. */
+	if (error == 0)
+		error = VOP_GETATTR(invp, &va, curthread->td_ucred);
+	if (error == 0 && va.va_size < (*inoffp + len))
+		error = EINVAL;
+	if (error != 0) {
+		*lenp = 0;
+		return (error);
+	}
+
+	/*
+	 * If the two vnodes are for the same file system, try the
+	 * VOP_COPY_FILE_RANGE() call first.  Only EOPNOTSUPP means that the
+	 * file system cannot do the copy; any other failure is returned as
+	 * is, since the VOP may already have advanced the offsets and the
+	 * generic copy below would then start from the wrong place.
+	 */
+	if (invp->v_mount == outvp->v_mount) {
+		error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
+		    lenp, flags);
+		if (error != EOPNOTSUPP)
+			return (error);
+	}
+
+	/*
+	 * Get the current size of the output file.  A block of zeros can
+	 * only be skipped when it lies at or beyond this size; anywhere
+	 * else the output file holds data that has to be overwritten.
+	 */
+	error = VOP_GETATTR(outvp, &va, td->td_ucred);
+	if (error != 0) {
+		*lenp = 0;
+		return (error);
+	}
+	outsize = va.va_size;
+
+	/*
+	 * Copy blocks of the size preferred by the input file, with a
+	 * minimum of 16Kbytes and a maximum of 1Mbytes.
+	 */
+	sfp = malloc(sizeof(*sfp), M_STATFS, M_WAITOK);
+	error = VFS_STATFS(invp->v_mount, sfp);
+	if (error != 0) {
+		free(sfp, M_STATFS);
+		*lenp = 0;
+		return (error);
+	}
+	if (sfp->f_iosize < 16384)
+		blksize = 16384;
+	else if (sfp->f_iosize > 1048576)
+		blksize = 1048576;
+	else
+		blksize = sfp->f_iosize;
+	free(sfp, M_STATFS);
+
+	/* Start write for outvp. */
+	error = vn_start_write(outvp, &mp, V_WAIT | PCATCH);
+	if (error != 0) {
+		*lenp = 0;
+		return (error);
+	}
+
+	dat = malloc(blksize, M_TEMP, M_WAITOK);
+	/*
+	 * It would be nice to use VOP_IOCTL() to find holes, but that
+	 * requires that invp be unlocked/relocked for each block read.
+	 * I am not sure we want to do that here, since it would open
+	 * up a window where another thread could write to the file while
+	 * the copy is in progress.
+	 * In the meantime, just scan for a read block of all 0s.
+	 */
+	while (error == 0 && len > 0) {
+		if (len > blksize)
+			xfer = blksize;
+		else
+			xfer = len;
+		error = vn_rdwr(UIO_READ, invp, dat, xfer, *inoffp,
+		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NULL, &aresid,
+		    td);
+		/* Linux considers a range that exceeds EOF to be an error. */
+		if (error == 0 && aresid > 0)
+			error = EINVAL;
+		if (error != 0)
+			break;
+		if (*outoffp < outsize || !vn_block_is_zero(dat, xfer)) {
+			error = vn_rdwr(UIO_WRITE, outvp, dat, xfer,
+			    *outoffp, UIO_SYSSPACE, IO_NODELOCKED,
+			    td->td_ucred, NULL, NULL, td);
+			if (error == 0 && *outoffp + xfer > outsize)
+				outsize = *outoffp + xfer;
+		} else if (xfer == len) {
+			/*
+			 * Hole at the end of the copy.  It starts at or
+			 * beyond the end of the output file, so setting
+			 * the size here can only grow the file.
+			 * NOTE(review): outvp may only be share locked
+			 * at this point; confirm that is sufficient for
+			 * VOP_SETATTR().
+			 */
+			VATTR_NULL(&va);
+			va.va_size = *outoffp + len;
+			error = VOP_SETATTR(outvp, &va, td->td_ucred);
+		}
+		if (error == 0) {
+			*inoffp += xfer;
+			*outoffp += xfer;
+			len -= xfer;
+		}
+	}
+	*lenp -= len;
+	if (mp != NULL)
+		vn_finished_write(mp);
+	free(dat, M_TEMP);
+	return (error);
+}
+
 int
 vn_fsync_buf(struct vnode *vp, int waitfor)
 {
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -718,6 +718,19 @@
 };
 
 
+%% copy_file_range	invp	L L L
+%% copy_file_range	outvp	L L L
+
+vop_copy_file_range {
+	IN struct vnode *invp;
+	INOUT off_t *inoffp;
+	IN struct vnode *outvp;
+	INOUT off_t *outoffp;
+	INOUT size_t *lenp;
+	IN u_int flags;
+};
+
+
 # The VOPs below are spares at the end of the table to allow new VOPs to be
 # added in stable branches without breaking the KBI.  New VOPs in HEAD should
 # be added above these spares.  When merging a new VOP to a stable branch,
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h
+++ sys/sys/syscallsubr.h
@@ -94,6 +94,8 @@
 int	kern_close(struct thread *td, int fd);
 int	kern_connectat(struct thread *td, int dirfd, int fd,
 	    struct sockaddr *sa);
+int	kern_copy_file_range(struct thread *td, int infd, off_t *inoffp,
+	    int outfd, off_t *outoffp, size_t len, u_int flags);
 int	kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
 	    cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp);
 int	kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -667,6 +667,8 @@
 	    struct ucred *cred);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
+int	vn_copy_file_range(struct vnode *invp, off_t *inoffp,
+	    struct vnode *outvp, off_t *outoffp, size_t *lenp, u_int flags);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_fsync_buf(struct vnode *vp, int waitfor);