Index: include/unistd.h =================================================================== --- include/unistd.h +++ include/unistd.h @@ -494,6 +494,7 @@ int acct(const char *); int async_daemon(void); int check_utility_compat(const char *); +ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int); const char * crypt_get_format(void); char *crypt_r(const char *, const char *, struct crypt_data *); Index: lib/libc/sys/Makefile.inc =================================================================== --- lib/libc/sys/Makefile.inc +++ lib/libc/sys/Makefile.inc @@ -175,6 +175,7 @@ closefrom.2 \ connect.2 \ connectat.2 \ + copy_file_range.2 \ cpuset.2 \ cpuset_getaffinity.2 \ cpuset_getdomain.2 \ Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -402,6 +402,7 @@ }; FBSD_1.6 { + copy_file_range; fhlink; fhlinkat; fhreadlink; Index: lib/libc/sys/copy_file_range.2 =================================================================== --- lib/libc/sys/copy_file_range.2 +++ lib/libc/sys/copy_file_range.2 @@ -0,0 +1,188 @@ +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2019 Rick Macklem +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd June 9, 2019 +.Dt COPY_FILE_RANGE 2 +.Os +.Sh NAME +.Nm copy_file_range +.Nd kernel copy of a byte range from one file to another +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/types.h +.In unistd.h +.Ft ssize_t +.Fo copy_file_range +.Fa "int infd" +.Fa "off_t *inoffp" +.Fa "int outfd" +.Fa "off_t *outoffp" +.Fa "size_t len" +.Fa "unsigned int flags" +.Fc +.Sh DESCRIPTION +The +.Fn copy_file_range +system call +copies up to +.Fa len +bytes from +.Fa infd +to +.Fa outfd +in the kernel. +It may do this using a file system specific technique if +.Fa infd +and +.Fa outfd +are on the same file system. +The +.Fa infd +argument must be opened for reading and the +.Fa outfd +argument must be opened for writing, but not +.Dv O_APPEND . +If +.Fa inoffp +or +.Fa outoffp +is +.Dv NULL , +the file offset for +.Fa infd +or +.Fa outfd +respectively will be used and updated by +the number of bytes copied. +If +.Fa inoffp +or +.Fa outoffp +is not +.Dv NULL , +the byte offset pointed to by +.Fa inoffp +or +.Fa outoffp +respectively will be used/updated and the file offset for +.Fa infd +or +.Fa outfd +respectively will not be affected. +The +.Fa flags +argument must be 0. 
+.Pp +This system call attempts to maintain holes in the output file for +the byte range being copied. +However, this does not always work well. +It is recommended that sparse files be copied in a loop using +.Xr lseek 2 +with +.Dv SEEK_HOLE +and +.Dv SEEK_DATA +arguments and this system call for the +data ranges found. +.Sh RETURN VALUES +If it succeeds, the call returns the number of bytes copied, which can be less +than +.Fa len . +.Fn copy_file_range +should be used in a loop until copying of the desired byte range has been +completed. +If an error has occurred, a \-1 is returned and the error code is placed in +the global variable +.Va errno . +.Sh ERRORS +The +.Fn copy_file_range +system call +will fail if: +.Bl -tag -width Er +.It Bq Er EBADF +If +.Fa infd +is not open for reading or +.Fa outfd +is not open for writing, or opened for writing with +.Dv O_APPEND , +or if +.Fa infd +and +.Fa outfd +refer to the same file. +.It Bq Er EFBIG +If the copy exceeds the process's file size limit or the maximum file size +for the file system +.Fa outfd +resides on. +.It Bq Er EINTR +A signal interrupted the system call +before it could be completed. +This may happen for files on some NFS mounts. +When this happens, the values pointed to by +.Fa inoffp +and +.Fa outoffp +are reset to the initial values for the system call. +.It Bq Er EINVAL +If the initial offset for +.Fa infd +plus +.Fa len +exceeds EOF for +.Fa infd , +or +.Fa flags +is not zero. +.It Bq Er EIO +An I/O error occurred while reading/writing the files. +.It Bq Er EISDIR +If either +.Fa infd +or +.Fa outfd +refers to a directory. +.It Bq Er ENOSPC +The file system that stores +.Fa outfd +is full. +.El +.Sh SEE ALSO +.Xr lseek 2 +.Sh STANDARDS +The +.Fn copy_file_range +system call is expected to be compatible with the Linux system call of +the same name. +.Sh HISTORY +The +.Fn copy_file_range +function appeared in +.Fx 13.0 .
Index: share/man/man9/Makefile =================================================================== --- share/man/man9/Makefile +++ share/man/man9/Makefile @@ -400,6 +400,7 @@ VOP_ATTRIB.9 \ VOP_BMAP.9 \ VOP_BWRITE.9 \ + VOP_COPY_FILE_RANGE.9 \ VOP_CREATE.9 \ VOP_FSYNC.9 \ VOP_GETACL.9 \ Index: share/man/man9/VOP_COPY_FILE_RANGE.9 =================================================================== --- share/man/man9/VOP_COPY_FILE_RANGE.9 +++ share/man/man9/VOP_COPY_FILE_RANGE.9 @@ -0,0 +1,121 @@ +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2019 Rick Macklem +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd June 20, 2019 +.Dt VOP_COPY_FILE_RANGE 9 +.Os +.Sh NAME +.Nm VOP_COPY_FILE_RANGE +.Nd copy a byte range from one regular file to another within a file system +.Sh SYNOPSIS +.In sys/param.h +.In sys/vnode.h +.Ft int +.Fo VOP_COPY_FILE_RANGE +.Fa "struct vnode *invp" +.Fa "off_t *inoff" +.Fa "struct vnode *outvp" +.Fa "off_t *outoff" +.Fa "size_t *len" +.Fa "unsigned int flags" +.Fa "struct ucred *incred" +.Fa "struct ucred *outcred" +.Fa "struct thread *fsize_td" +.Fc +.Sh DESCRIPTION +This entry point copies a byte range from one regular file to another within a +file system. +.Pp +The arguments are: +.Bl -tag -width ioflag +.It Fa invp +The vnode of the input file. +.It Fa inoff +A pointer to the file offset for the input file. +.It Fa outvp +The vnode of the output file. +.It Fa outoff +A pointer to the file offset for the output file. +.It Fa len +A pointer to the number of bytes to be copied. +.It Fa flags +Flags, should be set to 0 for now. +.It Fa incred +The credentials used to read +.Fa invp . +.It Fa outcred +The credentials used to write +.Fa outvp . +.It Fa fsize_td +The thread pointer to be passed to vn_rlimit_fsize(). +This will be NULL for a server thread without limits, such as the NFS +server, and curthread otherwise. +.El +.Pp +When the call is made, the +.Fa inoff +and +.Fa outoff +arguments point to the locations of the file offsets. +These file offsets should be updated by the number of bytes copied. +The +.Fa len +argument points to the location that stores the number of bytes +to be copied. +On return, the value pointed to by +.Fa len +should be set to the number of bytes actually copied. +A copy of fewer bytes than requested is permitted. +.Sh LOCKS +The vnodes are unlocked when the call is made and should be unlocked +when the call returns. +The byte ranges for both +.Fa invp +and +.Fa outvp +should be range locked when this call is made.
+.Sh RETURN VALUES +Zero is returned on success, otherwise an error code is returned. +.Sh ERRORS +.Bl -tag -width Er +.It Bq Er EFBIG +If the copy exceeds the process's file size limit or the maximum file size +for the file system +.Fa invp +and +.Fa outvp +reside on. +.It Bq Er EINTR +A signal interrupted the VOP call before it could be completed. +.It Bq Er EIO +An I/O error occurred while reading/writing the files. +.It Bq Er ENOSPC +The file system is full. +.El +.Sh SEE ALSO +.Xr vn_rdwr 9 , +.Xr vnode 9 Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -3175,6 +3175,16 @@ int flag ); } +569 AUE_NULL STD { + ssize_t copy_file_range( + int infd, + _Inout_opt_ off_t *inoffp, + int outfd, + _Inout_opt_ off_t *outoffp, + size_t len, + unsigned int flags + ); + } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -83,6 +83,7 @@ static int vop_stdis_text(struct vop_is_text_args *ap); static int vop_stdunset_text(struct vop_unset_text_args *ap); static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); +static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap); static int vop_stdfdatasync(struct vop_fdatasync_args *ap); static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); @@ -140,6 +141,7 @@ .vop_set_text = vop_stdset_text, .vop_unset_text = vop_stdunset_text, .vop_add_writecount = vop_stdadd_writecount, + .vop_copy_file_range = vop_stdcopy_file_range, }; /* @@ -1206,6 +1208,17 @@ return (0); } +static int +vop_stdcopy_file_range(struct vop_copy_file_range_args *ap) +{ + int error; + + error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp, + ap->a_outvp, ap->a_outoffp, ap->a_lenp, 
ap->a_flags, ap->a_incred, + ap->a_outcred, ap->a_fsizetd); + return (error); +} + int vfs_stdvget (mp, ino, flags, vpp) struct mount *mp; Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -80,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -5657,3 +5659,311 @@ mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_active(mvp, mp); } + +/* + * Test len bytes of data starting at dat for all bytes == 0. + * Return true if all bytes are zero, false otherwise. + * Expects dat to be aligned for u_int access; it is read a u_int at a time. + */ +static bool +mem_iszero(void *dat, int len) +{ + int i; + const u_int *p; + const char *cp; + + for (p = dat; len > 0; len -= sizeof(*p), p++) { + if (len >= sizeof(*p)) { + if (*p != 0) + return (false); + } else { + cp = (const char *)p; /* tail shorter than a u_int */ + for (i = 0; i < len; i++, cp++) + if (*cp != '\0') + return (false); + } + } + return (true); +} + +/* + * Write an xfer sized chunk to outvp in blksize blocks from dat. + * dat is a maximum of blksize in length and can be written repeatedly in + * the chunk. + * If growfile == true, just grow the file via vn_truncate_locked() instead + * of doing actual writes.
+ */ +static int +vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer, + u_long blksize, bool growfile, struct ucred *cred) +{ + struct mount *mp; + off_t xfer2; + int error, lckf; + + mp = NULL; + error = vn_start_write(outvp, &mp, V_WAIT); + if (error == 0) { + if (MNT_SHARED_WRITES(mp)) + lckf = LK_SHARED; + else + lckf = LK_EXCLUSIVE; + error = vn_lock(outvp, lckf); + } + if (error == 0) { + if (growfile) + error = vn_truncate_locked(outvp, outoff + xfer, false, + cred); /* new size is outoff + xfer */ + else + do { + xfer2 = xfer; + if (xfer2 > blksize) + xfer2 = blksize; /* dat holds at most blksize bytes */ + error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2, + outoff, UIO_SYSSPACE, IO_NODELOCKED, + curthread->td_ucred, cred, NULL, curthread); + outoff += xfer2; + xfer -= xfer2; + } while (xfer > 0 && error == 0); + VOP_UNLOCK(outvp, 0); + } + if (mp != NULL) + vn_finished_write(mp); + return (error); +} + +/* + * Copy a byte range of one file to another. This function can handle the + * case where invp and outvp are on different file systems. + * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there + * is no better file system specific way to do it.
+ */ +int +vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, + struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, + struct ucred *incred, struct ucred *outcred, struct thread *fsize_td) +{ + struct vattr va; + struct mount *mp; + struct uio io; + off_t startoff, endoff, xfer, xfer2; + u_long blksize; + int error; + bool cantseek, readzeros; + ssize_t aresid; + size_t copylen, len, savlen; + char *dat; + long holein, holeout; + + savlen = len = *lenp; + error = 0; + dat = NULL; /* free() at out: sees NULL on early exit */ + + error = vn_lock(invp, LK_SHARED); + if (error != 0) + goto out; + if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) + holein = 0; /* 0 disables FIOSEEKDATA use below */ + VOP_UNLOCK(invp, 0); + if (error != 0) /* NOTE(review): redundant; error checked above */ + goto out; + + mp = NULL; + error = vn_start_write(outvp, &mp, V_WAIT); + if (error == 0) + error = vn_lock(outvp, LK_EXCLUSIVE); + if (error == 0) { + /* + * If fsize_td != NULL, do a vn_rlimit_fsize() call, + * now that outvp is locked. + */ + if (fsize_td != NULL) { + io.uio_offset = *outoffp; + io.uio_resid = len; + error = vn_rlimit_fsize(outvp, &io, fsize_td); + if (error != 0) + error = EFBIG; /* any failure is reported as EFBIG */ + } + if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) + holeout = 0; + /* + * Holes that are past EOF do not need to be written as a block + * of zero bytes. So, truncate the output file as far as + * possible and then use va.va_size to decide if writing 0 + * bytes is necessary in the loop below. + */ + if (error == 0) + error = VOP_GETATTR(outvp, &va, outcred); + if (error == 0 && va.va_size > *outoffp && va.va_size <= + *outoffp + len) { +#ifdef MAC + error = mac_vnode_check_write(curthread->td_ucred, + outcred, outvp); +#endif + if (error == 0) + error = vn_truncate_locked(outvp, *outoffp, + false, outcred); + if (error == 0) + va.va_size = *outoffp; + } + VOP_UNLOCK(outvp, 0); + } + if (mp != NULL) + vn_finished_write(mp); + if (error != 0) + goto out; + + /* + * Set the blksize to the larger of the hole sizes for invp and outvp.
+ * If hole sizes aren't available, set the blksize to the larger + * f_iosize of invp and outvp. + * This code expects the hole sizes and f_iosizes to be powers of 2. + * The result is clamped to between 4Kbytes and 1Mbyte. + */ + if (holein > 0 && holeout > 0) + if (holein > holeout) + blksize = holein; + else + blksize = holeout; + else if (invp->v_mount->mnt_stat.f_iosize > + outvp->v_mount->mnt_stat.f_iosize) + blksize = invp->v_mount->mnt_stat.f_iosize; + else + blksize = outvp->v_mount->mnt_stat.f_iosize; + if (blksize < 4096) + blksize = 4096; + else if (blksize > 1048576) + blksize = 1048576; + dat = malloc(blksize, M_TEMP, M_WAITOK); + + /* + * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA + * to find holes. Otherwise, just scan the read block for all 0s + * in the inner loop where the data copying is done. + * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may + * support holes on the server, but do not support FIOSEEKHOLE. + */ + while (len > 0 && error == 0) { + endoff = 0; /* To shut up compilers. */ + cantseek = true; + startoff = *inoffp; + copylen = len; + + /* + * Find the next data area. If there is just a hole to EOF, + * FIOSEEKDATA should fail and then we drop down into the + * inner loop and create the hole on the outvp file. + * (I do not know if any file system will report a hole to + * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA + * will fail for those file systems.) + * + * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, + * the code just falls through to the inner copy loop. + */ + error = EINVAL; /* forces the no-seek path if holein == 0 */ + if (holein > 0) + error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, + incred, curthread); + if (error == 0) { + endoff = startoff; + error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, + incred, curthread); + } + if (error == 0) { + if (startoff > *inoffp) { + /* Found hole before data block.
*/ + xfer = startoff - *inoffp; + if (xfer > len) + xfer = len; + if (*outoffp < va.va_size) { + /* Must write 0s to punch hole. */ + xfer2 = va.va_size - *outoffp; + if (xfer2 > xfer) + xfer2 = xfer; + memset(dat, 0, (xfer2 > blksize) ? + blksize : xfer2); + error = vn_write_outvp(outvp, dat, + *outoffp, xfer2, blksize, false, + outcred); + } + + if (error == 0 && *outoffp + xfer > + va.va_size && xfer == len) + /* Grow last block. */ + error = vn_write_outvp(outvp, dat, + *outoffp, xfer, blksize, true, + outcred); + if (error == 0) { + *inoffp += xfer; + *outoffp += xfer; + len -= xfer; + } + } + copylen = len; + if (copylen > endoff - startoff) + copylen = endoff - startoff; + cantseek = false; + } else { + cantseek = true; + startoff = *inoffp; + copylen = len; + error = 0; /* a failed seek is not an error */ + } + + xfer = blksize; + if (cantseek) { + /* + * Set first xfer to end at a block boundary, so that + * holes are more likely detected in the loop below via + * the all-bytes-zero check done by mem_iszero(). + */ + xfer -= (*inoffp % blksize); + } + /* Loop copying the data block. */ + while (copylen > 0 && error == 0) { + if (copylen < xfer) + xfer = copylen; + error = vn_lock(invp, LK_SHARED); + if (error != 0) + goto out; + error = vn_rdwr(UIO_READ, invp, dat, xfer, + startoff, UIO_SYSSPACE, IO_NODELOCKED, + curthread->td_ucred, incred, &aresid, + curthread); + VOP_UNLOCK(invp, 0); + /* + * Linux considers a range that exceeds EOF to + * be an error, so we will too. + */ + if (error == 0 && aresid > 0) + error = EINVAL; + if (error == 0) { + /* + * Skip the write for holes past the initial EOF + * of the output file, unless this is the last + * write of the output file at EOF. + */ + readzeros = (cantseek) ?
mem_iszero(dat, xfer) : + false; + if (!cantseek || *outoffp < va.va_size || + xfer == len || !readzeros) + error = vn_write_outvp(outvp, dat, + *outoffp, xfer, blksize, + readzeros && xfer == len && + *outoffp >= va.va_size, outcred); + if (error == 0) { + *inoffp += xfer; + startoff += xfer; + *outoffp += xfer; + copylen -= xfer; + len -= xfer; + } + } + xfer = blksize; /* later transfers are full blocks */ + } + } +out: + *lenp = savlen - len; /* report bytes actually copied */ + free(dat, M_TEMP); + return (error); +} Index: sys/kern/vfs_syscalls.c =================================================================== --- sys/kern/vfs_syscalls.c +++ sys/kern/vfs_syscalls.c @@ -4814,3 +4814,122 @@ uap->advice); return (kern_posix_error(td, error)); } + +int +kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, + off_t *outoffp, size_t len, unsigned int flags) +{ + struct file *infp, *outfp; + struct vnode *invp, *outvp; + int error; + size_t retlen; + void *rl_rcookie, *rl_wcookie; + off_t savinoff, savoutoff; + + infp = outfp = NULL; + rl_rcookie = rl_wcookie = NULL; + savinoff = -1; /* -1: no offsets saved yet */ + error = 0; + retlen = 0; + + if (flags != 0) { + error = EINVAL; + goto out; + } + if (len > SSIZE_MAX) + /* + * Although the len argument is size_t, the return argument + * is ssize_t (which is signed). Therefore a size that won't + * fit in ssize_t can't be returned. + */ + len = SSIZE_MAX; + + /* Get the file structures for the file descriptors. */ + error = fget_read(td, infd, &cap_read_rights, &infp); + if (error != 0) + goto out; + error = fget_write(td, outfd, &cap_write_rights, &outfp); + if (error != 0) + goto out; + + /* Set the offset pointers to the correct place. */ + if (inoffp == NULL) + inoffp = &infp->f_offset; + if (outoffp == NULL) + outoffp = &outfp->f_offset; + savinoff = *inoffp; + savoutoff = *outoffp; + + invp = infp->f_vnode; + outvp = outfp->f_vnode; + /* Sanity check the f_flag bits.
*/ + if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || + (infp->f_flag & FREAD) == 0 || invp == outvp) { + error = EBADF; + goto out; + } + + /* If len == 0, just return 0. */ + if (len == 0) + goto out; + + /* Range lock both byte ranges, retrying if invp is busy. */ + for (;;) { + rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp + + len); + rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp + + len); + if (rl_rcookie != NULL) + break; + vn_rangelock_unlock(outvp, rl_wcookie); /* back off */ + rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len); + vn_rangelock_unlock(invp, rl_rcookie); + } + + retlen = len; + error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, + flags, infp->f_cred, outfp->f_cred, td); +out: + if (rl_rcookie != NULL) + vn_rangelock_unlock(invp, rl_rcookie); + if (rl_wcookie != NULL) + vn_rangelock_unlock(outvp, rl_wcookie); + if (savinoff != -1 && (error == EINTR || error == ERESTART)) { + *inoffp = savinoff; + *outoffp = savoutoff; + } + if (outfp != NULL) + fdrop(outfp, td); + if (infp != NULL) + fdrop(infp, td); + td->td_retval[0] = retlen; + return (error); +} + +int +sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) +{ + off_t inoff, outoff, *inoffp, *outoffp; + int error; + + inoffp = outoffp = NULL; + if (uap->inoffp != NULL) { + error = copyin(uap->inoffp, &inoff, sizeof(off_t)); + if (error != 0) + return (error); + inoffp = &inoff; /* kernel copy of the user offset */ + } + if (uap->outoffp != NULL) { + error = copyin(uap->outoffp, &outoff, sizeof(off_t)); + if (error != 0) + return (error); + outoffp = &outoff; /* kernel copy of the user offset */ + } + error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, + outoffp, uap->len, uap->flags); + if (error == 0 && uap->inoffp != NULL) + error = copyout(inoffp, uap->inoffp, sizeof(off_t)); + if (error == 0 && uap->outoffp != NULL) + error = copyout(outoffp, uap->outoffp, sizeof(off_t)); + return (error); +} Index: sys/kern/vfs_vnops.c =================================================================== ---
sys/kern/vfs_vnops.c +++ sys/kern/vfs_vnops.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -89,6 +90,8 @@ #include #include +#include + #ifdef HWPMC_HOOKS #include #endif @@ -2508,6 +2511,69 @@ va->va_fsid += (uint32_t)f->val[0]; } +/* + * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE() + * or vn_generic_copy_file_range() to do the actual copy. No range locks + * are taken here; kern_copy_file_range() takes them before calling this. + * vn_generic_copy_file_range() is factored out, so it can be called + * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from + * different file systems. + */ +int +vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, + off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, + struct ucred *outcred, struct thread *fsize_td) +{ + struct vattr va; + int error; + size_t len; + uint64_t uvalin, uvalout; + + len = *lenp; + error = 0; + + /* Sanity check the arguments, including offset + len overflow. */ + uvalin = *inoffp; + uvalin += len; + uvalout = *outoffp; + uvalout += len; + if (invp->v_type == VDIR || outvp->v_type == VDIR) + error = EISDIR; + else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin < + (uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX || + uvalout < (uint64_t)*outoffp || invp->v_type != VREG || + outvp->v_type != VREG || invp == outvp) + error = EINVAL; + if (error != 0) + goto out; + + error = vn_lock(invp, LK_SHARED); + if (error != 0) + goto out; + /* Check that the offset + len does not go past EOF of invp. */ + error = VOP_GETATTR(invp, &va, incred); + if (error == 0 && va.va_size < (*inoffp + len)) + error = EINVAL; + VOP_UNLOCK(invp, 0); + if (error != 0) + goto out; + bwillwrite(); + + /* + * If the two vnodes are for the same file system, call + * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range() + * which can handle copies across multiple file systems.
+ */ + if (invp->v_mount == outvp->v_mount) + error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp, + lenp, flags, incred, outcred, fsize_td); + else + error = vn_generic_copy_file_range(invp, inoffp, outvp, + outoffp, lenp, flags, incred, outcred, fsize_td); +out: + return (error); +} + int vn_fsync_buf(struct vnode *vp, int waitfor) { Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -718,6 +718,22 @@ }; +%% copy_file_range invp U U U +%% copy_file_range outvp U U U + +vop_copy_file_range { + IN struct vnode *invp; + INOUT off_t *inoffp; + IN struct vnode *outvp; + INOUT off_t *outoffp; + INOUT size_t *lenp; + IN unsigned int flags; + IN struct ucred *incred; + IN struct ucred *outcred; + IN struct thread *fsizetd; +}; + + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares.
When merging a new VOP to a stable branch, Index: sys/sys/syscallsubr.h =================================================================== --- sys/sys/syscallsubr.h +++ sys/sys/syscallsubr.h @@ -94,6 +94,8 @@ int kern_close(struct thread *td, int fd); int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa); +int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, + int outfd, off_t *outoffp, size_t len, unsigned int flags); int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp); int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -667,9 +667,17 @@ struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); +int vn_copy_file_range(struct vnode *invp, off_t *inoffp, + struct vnode *outvp, off_t *outoffp, size_t *lenp, + unsigned int flags, struct ucred *incred, struct ucred *outcred, + struct thread *fsize_td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor); +int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, + struct vnode *outvp, off_t *outoffp, size_t *lenp, + unsigned int flags, struct ucred *incred, struct ucred *outcred, + struct thread *fsize_td); int vn_isdisk(struct vnode *vp, int *errp); int _vn_lock(struct vnode *vp, int flags, char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)