Index: include/unistd.h =================================================================== --- include/unistd.h +++ include/unistd.h @@ -494,6 +494,7 @@ int acct(const char *); int async_daemon(void); int check_utility_compat(const char *); +ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int); const char * crypt_get_format(void); char *crypt_r(const char *, const char *, struct crypt_data *); Index: lib/libc/sys/Makefile.inc =================================================================== --- lib/libc/sys/Makefile.inc +++ lib/libc/sys/Makefile.inc @@ -175,6 +175,7 @@ closefrom.2 \ connect.2 \ connectat.2 \ + copy_file_range.2 \ cpuset.2 \ cpuset_getaffinity.2 \ cpuset_getdomain.2 \ Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -402,6 +402,7 @@ }; FBSD_1.6 { + copy_file_range; fhlink; fhlinkat; fhreadlink; Index: lib/libc/sys/copy_file_range.2 =================================================================== --- lib/libc/sys/copy_file_range.2 +++ lib/libc/sys/copy_file_range.2 @@ -0,0 +1,150 @@ +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2019 Rick Macklem +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd June 9, 2019 +.Dt COPY_FILE_RANGE 2 +.Os +.Sh NAME +.Nm copy_file_range +.Nd kernel copy of a byte range from one file to another +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In sys/types.h +.In unistd.h +.Ft ssize_t +.Fn copy_file_range "int infd" "off_t *inoffp" "int outfd" "off_t *outoffp" "size_t len" "unsigned int flags" +.Fc +.Sh DESCRIPTION +The +.Fn copy_file_range +system call +copies +.Fa len +bytes from +.Fa infd +to +.Fa outfd +in the kernel. +It may do this using a file system specific technique if +.Fa infd +and +.Fa outfd +are on the same file system. +The +.Fa infd +argument must be opened for reading and the +.Fa outfd +argument must be opened for writing, but not +.Dv O_APPEND . +If +.Fa inoffp +or +.Fa outoffp +is +.Dv NULL , +the file offset for +.Fa infd +or +.Fa outfd +respectively will be used and updated by +the number of bytes copied. 
+If +.Fa inoffp +or +.Fa outoffp +is not +.Dv NULL , +the byte offset pointed to by +.Fa inoffp +or +.Fa outoffp +respectively will be used/updated and the file offset for +.Fa infd +or +.Fa outfd +respectively will not be affected. +The +.Fa flags +argument is currently ignored and should be set to 0. +.Pp +.Sh RETURN VALUES +If it succeeds, the call returns the number of bytes copied, which can be less +than +.Fa len . +.Fn copy_file_range +should be used in a loop until copying of the desired byte range has been +completed. +If an error has occurred, a \-1 is returned and the error code is placed in +the global variable +.Va errno . +.Sh ERRORS +The +.Fn copy_file_range +system call +will fail if: +.Bl -tag -width Er +.It Bq Er EBADF +If +.Fa +infd +is not open for reading or +.Fa +outfd +is not open for writing, or opened for writing with +.Dv O_APPEND . +.It Bq Er EFBIG +If the copy exceeds the process's file size limit or the maximum file size +for the file system +.Fa outfd +resides on. +.It Bq Er EINVAL +If the initial offset for +.Fa infd +plus +.Fa len +exceeds EOF for +.Fa infd . +.It Bq Er EIO +An I/O error occurred while reading/writing the files. +.It Bq Er EISDIR +If either +.Fa infd +or +.Fa outfd +refers to a directory. +.El +.Sh STANDARDS +The +.Fn copy_file_range +system call is expected to be compatible with the Linux system call of +the same name. +.Sh HISTORY +The +.Fn copy_file_range +function appeared in +.Fx 13.0 . Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -3175,6 +3175,16 @@ int flag ); } +568 AUE_NULL STD { + ssize_t copy_file_range( + int infd, + _Inout_opt_ off_t *inoffp, + int outfd, + _Inout_opt_ off_t *outoffp, + size_t len, + unsigned int flags + ); + } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -140,6 +140,7 @@ .vop_set_text = vop_stdset_text, .vop_unset_text = vop_stdunset_text, .vop_add_writecount = vop_stdadd_writecount, + .vop_copy_file_range = VOP_EOPNOTSUPP, }; /* Index: sys/kern/vfs_syscalls.c =================================================================== --- sys/kern/vfs_syscalls.c +++ sys/kern/vfs_syscalls.c @@ -4814,3 +4814,78 @@ uap->advice); return (kern_posix_error(td, error)); } + +int +kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd, + off_t *outoffp, size_t len, unsigned int flags) +{ + struct file *infp, *outfp; + struct vnode *invp, *outvp; + int error; + size_t retlen; + + infp = outfp = NULL; + retlen = 0; + + /* Get the file structures for the file descriptors. */ + error = fget_read(td, infd, &cap_read_rights, &infp); + if (error != 0) + goto out; + error = fget_write(td, outfd, &cap_write_rights, &outfp); + if (error != 0) + goto out; + + /* Set the offset pointers to the correct place. */ + if (inoffp == NULL) + inoffp = &infp->f_offset; + if (outoffp == NULL) + outoffp = &outfp->f_offset; + + invp = infp->f_vnode; + outvp = outfp->f_vnode; + /* Sanity check the f_flag bits. 
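 * The FAPPEND check rejects descriptors opened with O_APPEND, since the
 * copy writes at an explicit offset rather than appending, and the
 * invp == outvp check rejects copying a range within the same file,
 * which this implementation does not support.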
*/ + if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE || + (infp->f_flag & FREAD) == 0 || invp == outvp) { + error = EBADF; + goto out; + } + + retlen = len; + error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen, + flags); +out: + if (outfp != NULL) + fdrop(outfp, td); + if (infp != NULL) + fdrop(infp, td); + td->td_retval[0] = retlen; + return (error); +} + +int +sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap) +{ + off_t inoff, outoff, *inoffp, *outoffp; + int error; + + inoffp = outoffp = NULL; + if (uap->inoffp != NULL) { + error = copyin(uap->inoffp, &inoff, sizeof(off_t)); + if (error != 0) + return (error); + inoffp = &inoff; + } + if (uap->outoffp != NULL) { + error = copyin(uap->outoffp, &outoff, sizeof(off_t)); + if (error != 0) + return (error); + outoffp = &outoff; + } + error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd, + outoffp, uap->len, uap->flags); + if (error == 0 && uap->inoffp != NULL) + error = copyout(inoffp, uap->inoffp, sizeof(off_t)); + if (error == 0 && uap->outoffp != NULL) + error = copyout(outoffp, uap->outoffp, sizeof(off_t)); + return (error); +} Index: sys/kern/vfs_vnops.c =================================================================== --- sys/kern/vfs_vnops.c +++ sys/kern/vfs_vnops.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -89,6 +90,8 @@ #include #include +#include + #ifdef HWPMC_HOOKS #include #endif @@ -2494,6 +2497,315 @@ va->va_fsid += (uint32_t)f->val[0]; } +/* + * Test len bytes of data starting at addr for all bytes == 0. + * Return 0 if all bytes are zero, non-zero otherwise. + * Expects dat to be well aligned. + */ +static int +mem_iszero(void *dat, int len) +{ + int i; + const u_int *p; + const char *cp; + + for (p = (const u_int *)dat; len > 0; len -= sizeof(*p), p++) { + if (len >= sizeof(u_int)) { + if (*p != 0) + return (1); + } else { + cp = (const char *)p; + for (i = 0; i < len; i++, cp++) + if (*cp != '\0') + return (1); + return (0); + } + } + return (0); +} + +int +vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, + off_t *outoffp, size_t *lenp, unsigned int flags) +{ + struct vattr va; + struct mount *mp; + off_t startoff, endoff; + u_long blksize; + int cantseek, error, lckf; + ssize_t aresid; + size_t copylen, len, savlen, xfer, xfer2; + char *dat; + uint64_t uvalin, uvalout; + long holein, holeout; + void *rl_rcookie, *rl_wcookie; + struct thread *td = curthread; + + savlen = len = *lenp; + *lenp = 0; /* Return 0 len for errors. */ + error = 0; + dat = NULL; + rl_rcookie = rl_wcookie = NULL; + + /* Do some sanity checks on the arguments. */ + uvalin = *inoffp; + uvalin += len; + uvalout = *outoffp; + uvalout += len; + if (invp->v_type == VDIR || outvp->v_type == VDIR) + error = EISDIR; + else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin < + (uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX || + uvalout < (uint64_t)*outoffp || invp->v_type != VREG || + outvp->v_type != VREG || invp == outvp) + error = EINVAL; + if (error != 0) + goto out; + + error = vn_lock(invp, LK_SHARED); + if (error != 0) + goto out; + if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0) + holein = 0; + + /* Check that the offset + len does not go past EOF of invp. */ + if (error == 0) + error = VOP_GETATTR(invp, &va, td->td_ucred); + if (error == 0 && va.va_size < (*inoffp + len)) + error = EINVAL; + VOP_UNLOCK(invp, 0); + if (error != 0) + goto out; + + /* Range lock the byte ranges for both invp and outvp. 
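 * The output range is write-locked first and the input range is only
 * try-locked.  If the try-lock fails, both locks are dropped, the input
 * range lock is waited for and then released, and the sequence is
 * retried, so that two copies running in opposite directions between
 * the same pair of files cannot deadlock against each other.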
*/ + for (;;) { + rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp + + len); + rl_rcookie = vn_rangelock_rlock_trylock(invp, *inoffp, + *inoffp + len); + if (rl_rcookie != NULL) + break; + vn_rangelock_unlock(outvp, rl_wcookie); + rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len); + vn_rangelock_unlock(invp, rl_rcookie); + } + + /* + * If the two vnodes are for the same file system, try the + * VOP_COPY_FILE_RANGE() call first, but do it here if the VOP + * call fails. + */ + if (invp->v_mount == outvp->v_mount) { + error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp, + lenp, flags); + if (error == 0) + goto out; + } + + mp = NULL; + error = vn_start_write(outvp, &mp, V_WAIT); + if (error == 0) + error = vn_lock(outvp, LK_EXCLUSIVE); + if (error == 0) { + if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0) + holeout = 0; + /* + * Holes that are past EOF do not need to be written as a block + * of zero bytes. So, truncate the output file as far as + * possible and then use va.va_size to decide if writing 0 + * bytes is necessary in the loop below. + */ + error = VOP_GETATTR(outvp, &va, td->td_ucred); + if (error == 0 && va.va_size > *outoffp && va.va_size <= + *outoffp + len) { + VATTR_NULL(&va); + va.va_size = *outoffp; + error = VOP_SETATTR(outvp, &va, td->td_ucred); + } + VOP_UNLOCK(outvp, 0); + } + if (mp != NULL) + vn_finished_write(mp); + if (error != 0) + goto out; + + /* + * Set the blksize to the larger of the hole sizes for invp and outvp. + * If hole sizes aren't available, set the blksize to the larger + * f_iosize of invp and outvp. + * This code expects the hole sizes and f_iosizes to be powers of 2. + * This value is clipped at 4Kbytes and 1Mbyte. + */ + if (holein > 0 && holeout > 0) + if (holein > holeout) + blksize = holein; + else + blksize = holeout; + else if (invp->v_mount->mnt_stat.f_iosize > + outvp->v_mount->mnt_stat.f_iosize) + blksize = invp->v_mount->mnt_stat.f_iosize; + else + blksize = outvp->v_mount->mnt_stat.f_iosize; + if (blksize < 4096) + blksize = 4096; + else if (blksize > 1048576) + blksize = 1048576; + dat = malloc(blksize, M_TEMP, M_WAITOK); + + /* + * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA + * to find holes. Otherwise, just scan the read block for all 0s + * in the inner loop where the data copying is done. + * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may + * support holes on the server, but do not support FIOSEEKHOLE. + */ + while (len > 0 && error == 0) { + endoff = 0; /* To shut up compilers. */ + + /* + * Find the next data area. If there is just a hole to EOF, + * FOISEEKDATA should fail and then we drop down into the + * inner loop and create the hole on the outvp file. + * (I do not know if any file system will report a hole to + * EOF via FOISEEKHOLE, but I am pretty sure FIOSEEKDATA + * will fail for those file systems.) + * + * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE, + * the code just falls through to the inner copy loop. + */ + cantseek = 1; + startoff = *inoffp; + copylen = len; + error = EINVAL; + if (holein > 0) + error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0, + td->td_ucred, td); + if (error == 0) { + endoff = startoff; + error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0, + td->td_ucred, td); + } + if (error == 0) { + if (startoff > *inoffp) { + /* Found hole before data block. */ + xfer = startoff - *inoffp; + if (*inoffp < va.va_size) { + /* Must write 0s to punch hole. 
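 * Zero bytes only need to be written for the part of the hole that
 * lies below the EOF recorded in va.va_size; beyond that the write is
 * skipped, since the hole reappears when the output file is extended
 * by a later write.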
*/ + xfer2 = va.va_size - *inoffp; + if (xfer2 > xfer) + xfer2 = xfer; + memset(dat, 0, xfer2); + mp = NULL; + error = vn_start_write(outvp, &mp, + V_WAIT); + if (error == 0) { + if (MNT_SHARED_WRITES(mp)) + lckf = LK_SHARED; + else + lckf = LK_EXCLUSIVE; + error = vn_lock(outvp, lckf); + } + if (error == 0) { + error = vn_rdwr(UIO_WRITE, + outvp, dat, xfer2, *outoffp, + UIO_SYSSPACE, IO_NODELOCKED, + td->td_ucred, NULL, NULL, + td); + VOP_UNLOCK(outvp, 0); + } + if (mp != NULL) + vn_finished_write(mp); + } + if (error == 0) { + *inoffp += xfer; + *outoffp += xfer; + len -= xfer; + } + } + copylen = len; + if (copylen > endoff - startoff) + copylen = endoff - startoff; + cantseek = 0; + } else + error = 0; + + xfer = blksize; + if (cantseek == 1) { + /* + * Set first xfer to end at a block boundary, so that + * holes are more likely detected in the loop below via + * the for all bytes 0 method. + */ + xfer -= (*inoffp % blksize); + } + /* Loop copying the data block. */ + while (copylen > 0 && error == 0) { + if (copylen < xfer) + xfer = copylen; + error = vn_lock(invp, LK_SHARED); + if (error != 0) + goto out; + error = vn_rdwr(UIO_READ, invp, dat, xfer, startoff, + UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NULL, + &aresid, td); + VOP_UNLOCK(invp, 0); + /* + * Linux considers a range that exceeds EOF to be an + * error, so we will too. + */ + if (error == 0 && aresid > 0) + error = EINVAL; + if (error == 0) { + /* + * Skip the write for holes past the initial EOF + * of the output file, unless this is the last + * write of the output file at EOF. + */ + if (cantseek == 0 || *outoffp < va.va_size || + xfer == len || mem_iszero(dat, xfer) != 0) { + mp = NULL; + error = vn_start_write(outvp, &mp, + V_WAIT); + if (error == 0) { + if (MNT_SHARED_WRITES(mp)) + lckf = LK_SHARED; + else + lckf = LK_EXCLUSIVE; + error = vn_lock(outvp, lckf); + } + if (error == 0) { + error = vn_rdwr(UIO_WRITE, + outvp, dat, xfer, *outoffp, + UIO_SYSSPACE, IO_NODELOCKED, + td->td_ucred, NULL, NULL, + td); + VOP_UNLOCK(outvp, 0); + } + if (mp != NULL) + vn_finished_write(mp); + } + if (error == 0) { + *inoffp += xfer; + startoff += xfer; + *outoffp += xfer; + copylen -= xfer; + len -= xfer; + } + } + xfer = blksize; + } + } + if (error == 0) + *lenp = savlen - len; +out: + if (rl_rcookie != NULL) + vn_rangelock_unlock(invp, rl_rcookie); + if (rl_wcookie != NULL) + vn_rangelock_unlock(outvp, rl_wcookie); + free(dat, M_TEMP); + return (error); +} + int vn_fsync_buf(struct vnode *vp, int waitfor) { Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -718,6 +718,19 @@ }; +%% copy_file_range invp U U U +%% copy_file_range outvp U U U + +vop_copy_file_range { + IN struct vnode *invp; + INOUT off_t *inoffp; + IN struct vnode *outvp; + INOUT off_t *outoffp; + INOUT size_t *lenp; + IN u_int flags; +}; + + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. 
When merging a new VOP to a stable branch, the new VOP should replace one of the spares. Index: sys/sys/syscallsubr.h =================================================================== --- sys/sys/syscallsubr.h +++ sys/sys/syscallsubr.h @@ -94,6 +94,8 @@ int kern_close(struct thread *td, int fd); int kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa); +int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, + int outfd, off_t *outoffp, size_t len, unsigned int flags); int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp); int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -667,6 +667,9 @@ struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); +int vn_copy_file_range(struct vnode *invp, off_t *inoffp, + struct vnode *outvp, off_t *outoffp, size_t *lenp, + unsigned int flags); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor);
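
The program below is a minimal sketch, not part of the patch, of the userland copy loop that the RETURN VALUES section of the manual page recommends.  The file names and error handling are illustrative only, and it assumes the patched <unistd.h> declaration and libc system call stub are installed.  It requests exactly the number of bytes remaining in the input file, because this version of the system call returns EINVAL when the requested range extends past the input file's EOF.

/*
 * copytest.c - copy infile to outfile with copy_file_range(2).
 * Illustrative sketch only; assumes the patched headers and libc.
 */
#include <sys/types.h>
#include <sys/stat.h>

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	struct stat sb;
	off_t remaining;
	ssize_t copied;
	int infd, outfd;

	if (argc != 3)
		errx(1, "usage: %s infile outfile", argv[0]);
	if ((infd = open(argv[1], O_RDONLY)) == -1)
		err(1, "open %s", argv[1]);
	if ((outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644)) == -1)
		err(1, "open %s", argv[2]);
	if (fstat(infd, &sb) == -1)
		err(1, "fstat");

	/*
	 * NULL offset pointers make the call use and update the
	 * descriptors' file offsets.  Loop until the whole file has
	 * been copied, since each call may copy fewer bytes than
	 * requested.
	 */
	remaining = sb.st_size;
	while (remaining > 0) {
		copied = copy_file_range(infd, NULL, outfd, NULL,
		    (size_t)remaining, 0);
		if (copied == -1)
			err(1, "copy_file_range");
		if (copied == 0)
			errx(1, "copy_file_range: premature end of file");
		remaining -= copied;
	}
	close(infd);
	close(outfd);
	return (0);
}

A caller that must also run on kernels without the system call would fall back to a read(2)/write(2) loop when copy_file_range() fails with ENOSYS.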