Index: head/sys/kern/syscalls.master
===================================================================
--- head/sys/kern/syscalls.master
+++ head/sys/kern/syscalls.master
@@ -3175,6 +3175,16 @@
 		    int flag
 		);
 	}
+569	AUE_NULL	STD {
+		ssize_t copy_file_range(
+		    int infd,
+		    _Inout_opt_ off_t *inoffp,
+		    int outfd,
+		    _Inout_opt_ off_t *outoffp,
+		    size_t len,
+		    unsigned int flags
+		);
+	}
 
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master
Index: head/sys/kern/vfs_default.c
===================================================================
--- head/sys/kern/vfs_default.c
+++ head/sys/kern/vfs_default.c
@@ -83,6 +83,7 @@
 static int vop_stdis_text(struct vop_is_text_args *ap);
 static int vop_stdunset_text(struct vop_unset_text_args *ap);
 static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
+static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap);
 static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
 static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
 
@@ -140,6 +141,7 @@
 	.vop_set_text =		vop_stdset_text,
 	.vop_unset_text =	vop_stdunset_text,
 	.vop_add_writecount =	vop_stdadd_writecount,
+	.vop_copy_file_range =	vop_stdcopy_file_range,
 };
 
 /*
@@ -1210,6 +1212,21 @@
 {
 
 	return (0);
+}
+
+/*
+ * Standard VOP_COPY_FILE_RANGE() handler: passes every argument through
+ * to vn_generic_copy_file_range() and returns its result.
+ */
+static int
+vop_stdcopy_file_range(struct vop_copy_file_range_args *ap)
+{
+	int error;
+
+	error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
+	    ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred,
+	    ap->a_outcred, ap->a_fsizetd);
+	return (error);
 }
 
 int
Index: head/sys/kern/vfs_syscalls.c
===================================================================
--- head/sys/kern/vfs_syscalls.c
+++ head/sys/kern/vfs_syscalls.c
@@ -4814,3 +4814,143 @@
 	    uap->advice);
 	return (kern_posix_error(td, error));
 }
+
+/*
+ * Kernel side of copy_file_range(2): copy up to len bytes from infd to
+ * outfd.  A NULL inoffp or outoffp selects that descriptor's f_offset;
+ * otherwise the caller's off_t is used.  Only flags == 0 is accepted.
+ * The byte count copied is stored in td->td_retval[0] and 0 or an errno
+ * is returned.  If the copy fails with EINTR or ERESTART, both offsets
+ * are restored to the values they had on entry.
+ */
+int
+kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
+    off_t *outoffp, size_t len, unsigned int flags)
+{
+	struct file *infp, *outfp;
+	struct vnode *invp, *outvp;
+	int error;
+	size_t retlen;
+	void *rl_rcookie, *rl_wcookie;
+	off_t savinoff, savoutoff;
+
+	infp = outfp = NULL;
+	rl_rcookie = rl_wcookie = NULL;
+	savinoff = savoutoff = -1;
+	error = 0;
+	retlen = 0;
+
+	if (flags != 0) {
+		error = EINVAL;
+		goto out;
+	}
+	/*
+	 * Although the len argument is size_t, the return argument
+	 * is ssize_t (which is signed). Therefore a size that won't
+	 * fit in ssize_t can't be returned.
+	 */
+	if (len > SSIZE_MAX)
+		len = SSIZE_MAX;
+
+	/* Get the file structures for the file descriptors. */
+	error = fget_read(td, infd, &cap_read_rights, &infp);
+	if (error != 0)
+		goto out;
+	error = fget_write(td, outfd, &cap_write_rights, &outfp);
+	if (error != 0)
+		goto out;
+
+	/* Set the offset pointers to the correct place. */
+	if (inoffp == NULL)
+		inoffp = &infp->f_offset;
+	if (outoffp == NULL)
+		outoffp = &outfp->f_offset;
+	savinoff = *inoffp;
+	savoutoff = *outoffp;
+
+	invp = infp->f_vnode;
+	outvp = outfp->f_vnode;
+	/* Sanity check the f_flag bits. */
+	if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
+	    (infp->f_flag & FREAD) == 0 || invp == outvp) {
+		error = EBADF;
+		goto out;
+	}
+	/*
+	 * A descriptor that is not backed by a vnode cannot be range locked
+	 * or copied; fail here instead of using a NULL vnode below.
+	 */
+	if (invp == NULL || outvp == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/* If len == 0, just return 0. */
+	if (len == 0)
+		goto out;
+
+	/* Range lock the byte ranges for both invp and outvp. */
+	for (;;) {
+		rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
+		    len);
+		rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp +
+		    len);
+		if (rl_rcookie != NULL)
+			break;
+		vn_rangelock_unlock(outvp, rl_wcookie);
+		rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
+		vn_rangelock_unlock(invp, rl_rcookie);
+	}
+
+	retlen = len;
+	error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
+	    flags, infp->f_cred, outfp->f_cred, td);
+out:
+	if (rl_rcookie != NULL)
+		vn_rangelock_unlock(invp, rl_rcookie);
+	if (rl_wcookie != NULL)
+		vn_rangelock_unlock(outvp, rl_wcookie);
+	if (savinoff != -1 && (error == EINTR || error == ERESTART)) {
+		*inoffp = savinoff;
+		*outoffp = savoutoff;
+	}
+	if (outfp != NULL)
+		fdrop(outfp, td);
+	if (infp != NULL)
+		fdrop(infp, td);
+	td->td_retval[0] = retlen;
+	return (error);
+}
+
+/*
+ * copy_file_range(2) system call entry.  Copies in whichever of the
+ * optional offsets the caller supplied, calls kern_copy_file_range(), and
+ * on success copies the updated offsets back out to user space.
+ */
+int
+sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
+{
+	off_t inoff, outoff, *inoffp, *outoffp;
+	int error;
+
+	inoffp = outoffp = NULL;
+	if (uap->inoffp != NULL) {
+		error = copyin(uap->inoffp, &inoff, sizeof(off_t));
+		if (error != 0)
+			return (error);
+		inoffp = &inoff;
+	}
+	if (uap->outoffp != NULL) {
+		error = copyin(uap->outoffp, &outoff, sizeof(off_t));
+		if (error != 0)
+			return (error);
+		outoffp = &outoff;
+	}
+	error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
+	    outoffp, uap->len, uap->flags);
+	if (error == 0 && uap->inoffp != NULL)
+		error = copyout(inoffp, uap->inoffp, sizeof(off_t));
+	if (error == 0 && uap->outoffp != NULL)
+		error = copyout(outoffp, uap->outoffp, sizeof(off_t));
+	return (error);
+}
Index: head/sys/kern/vfs_vnops.c
===================================================================
--- head/sys/kern/vfs_vnops.c
+++ head/sys/kern/vfs_vnops.c
@@ -2619,3 +2619,372 @@
 
 	return (error);
 }
+
+/*
+ * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE()
+ * or vn_generic_copy_file_range() after rangelocking the byte ranges,
+ * to do the actual copy.
+ * vn_generic_copy_file_range() is factored out, so it can be called
+ * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
+ * different file systems.
+ * On entry *lenp is the byte count requested.  It is set to 0 if one of
+ * the checks here fails; otherwise it is handed to the copy routine,
+ * which is expected to update it.
+ */
+int
+vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
+    off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
+    struct ucred *outcred, struct thread *fsize_td)
+{
+	struct vattr va;
+	int error;
+	size_t len;
+	uint64_t uvalin, uvalout;
+
+	len = *lenp;
+	*lenp = 0;		/* For error returns. */
+	error = 0;
+
+	/* Do some sanity checks on the arguments. */
+	uvalin = *inoffp;
+	uvalin += len;
+	uvalout = *outoffp;
+	uvalout += len;
+	if (invp->v_type == VDIR || outvp->v_type == VDIR)
+		error = EISDIR;
+	else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin <
+	    (uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX ||
+	    uvalout < (uint64_t)*outoffp || invp->v_type != VREG ||
+	    outvp->v_type != VREG)
+		error = EINVAL;
+	else if (invp == outvp)
+		error = EBADF;
+	if (error != 0)
+		goto out;
+
+	error = vn_lock(invp, LK_SHARED);
+	if (error != 0)
+		goto out;
+	/* Check that the offset + len does not go past EOF of invp. */
+	error = VOP_GETATTR(invp, &va, incred);
+	if (error == 0 && va.va_size < *inoffp + len)
+		error = EINVAL;
+	VOP_UNLOCK(invp, 0);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * If the two vnode are for the same file system, call
+	 * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
+	 * which can handle copies across multiple file systems.
+	 */
+	*lenp = len;
+	if (invp->v_mount == outvp->v_mount)
+		error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
+		    lenp, flags, incred, outcred, fsize_td);
+	else
+		error = vn_generic_copy_file_range(invp, inoffp, outvp,
+		    outoffp, lenp, flags, incred, outcred, fsize_td);
+out:
+	return (error);
+}
+
+/*
+ * Test len bytes of data starting at dat for all bytes == 0.
+ * Return true if all bytes are zero, false otherwise.
+ * Expects dat to be well aligned.
+ */
+static bool
+mem_iszero(void *dat, int len)
+{
+	const u_int *p;
+	const char *cp;
+
+	/* Compare a u_int at a time while a whole one remains. */
+	for (p = dat; len >= (int)sizeof(*p); len -= (int)sizeof(*p), p++)
+		if (*p != 0)
+			return (false);
+	/* Then check any trailing bytes that do not fill a u_int. */
+	for (cp = (const char *)p; len > 0; len--, cp++)
+		if (*cp != '\0')
+			return (false);
+	return (true);
+}
+
+/*
+ * Write an xfer sized chunk to outvp in blksize blocks from dat.
+ * dat is a maximum of blksize in length and can be written repeatedly in
+ * the chunk.
+ * If growfile == true, just grow the file via vn_truncate_locked() instead
+ * of doing actual writes.
+ */
+static int
+vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
+    u_long blksize, bool growfile, struct ucred *cred)
+{
+	struct mount *mp;
+	off_t xfer2;
+	int error, lckf;
+
+	/*
+	 * Loop around doing writes of blksize until write has been completed.
+	 * Lock/unlock on each loop iteration so that a bwillwrite() can be
+	 * done for each iteration, since the xfer argument can be very
+	 * large if there is a large hole to punch in the output file.
+	 */
+	do {
+		bwillwrite();
+		mp = NULL;
+		error = vn_start_write(outvp, &mp, V_WAIT);
+		if (error == 0) {
+			if (MNT_SHARED_WRITES(mp))
+				lckf = LK_SHARED;
+			else
+				lckf = LK_EXCLUSIVE;
+			error = vn_lock(outvp, lckf);
+		}
+		if (error == 0) {
+			if (growfile)
+				error = vn_truncate_locked(outvp, outoff + xfer,
+				    false, cred);
+			else {
+				xfer2 = MIN(xfer, blksize);
+				error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
+				    outoff, UIO_SYSSPACE, IO_NODELOCKED,
+				    curthread->td_ucred, cred, NULL, curthread);
+				outoff += xfer2;
+				xfer -= xfer2;
+			}
+			VOP_UNLOCK(outvp, 0);
+		}
+		if (mp != NULL)
+			vn_finished_write(mp);
+	} while (!growfile && xfer > 0 && error == 0);
+	return (error);
+}
+
+/*
+ * Copy a byte range of one file to another. This function can handle the
+ * case where invp and outvp are on different file systems.
+ * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
+ * is no better file system specific way to do it.
+ * *lenp is the byte count requested on entry and the byte count copied on
+ * return; *inoffp and *outoffp are advanced as the range is processed.
+ * If fsize_td is not NULL, vn_rlimit_fsize() is applied to the output range.
+ */
+int
+vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
+    struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
+    struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
+{
+	struct vattr va;
+	struct mount *mp;
+	struct uio io;
+	off_t startoff, endoff, xfer, xfer2;
+	u_long blksize;
+	int error;
+	bool cantseek, readzeros;
+	ssize_t aresid;
+	size_t copylen, len, savlen;
+	char *dat;
+	long holein, holeout;
+
+	holein = holeout = 0;
+	savlen = len = *lenp;
+	error = 0;
+	dat = NULL;
+
+	error = vn_lock(invp, LK_SHARED);
+	if (error != 0)
+		goto out;
+	if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
+		holein = 0;
+	VOP_UNLOCK(invp, 0);
+
+	mp = NULL;
+	error = vn_start_write(outvp, &mp, V_WAIT);
+	if (error == 0)
+		error = vn_lock(outvp, LK_EXCLUSIVE);
+	if (error == 0) {
+		/*
+		 * If fsize_td != NULL, do a vn_rlimit_fsize() call,
+		 * now that outvp is locked.
+		 */
+		if (fsize_td != NULL) {
+			io.uio_offset = *outoffp;
+			io.uio_resid = len;
+			error = vn_rlimit_fsize(outvp, &io, fsize_td);
+			if (error != 0)
+				error = EFBIG;
+		}
+		if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
+			holeout = 0;
+		/*
+		 * Holes that are past EOF do not need to be written as a block
+		 * of zero bytes. So, truncate the output file as far as
+		 * possible and then use va.va_size to decide if writing 0
+		 * bytes is necessary in the loop below.
+		 */
+		if (error == 0)
+			error = VOP_GETATTR(outvp, &va, outcred);
+		if (error == 0 && va.va_size > *outoffp && va.va_size <=
+		    *outoffp + len) {
+#ifdef MAC
+			error = mac_vnode_check_write(curthread->td_ucred,
+			    outcred, outvp);
+			if (error == 0)
+#endif
+				error = vn_truncate_locked(outvp, *outoffp,
+				    false, outcred);
+			if (error == 0)
+				va.va_size = *outoffp;
+		}
+		VOP_UNLOCK(outvp, 0);
+	}
+	if (mp != NULL)
+		vn_finished_write(mp);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Set the blksize to the larger of the hole sizes for invp and outvp.
+	 * If hole sizes aren't available, set the blksize to the larger
+	 * f_iosize of invp and outvp.
+	 * This code expects the hole sizes and f_iosizes to be powers of 2.
+	 * This value is clipped at 4Kbytes and 1Mbyte.
+	 */
+	blksize = MAX(holein, holeout);
+	if (blksize == 0)
+		blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
+		    outvp->v_mount->mnt_stat.f_iosize);
+	if (blksize < 4096)
+		blksize = 4096;
+	else if (blksize > 1024 * 1024)
+		blksize = 1024 * 1024;
+	dat = malloc(blksize, M_TEMP, M_WAITOK);
+
+	/*
+	 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
+	 * to find holes. Otherwise, just scan the read block for all 0s
+	 * in the inner loop where the data copying is done.
+	 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
+	 * support holes on the server, but do not support FIOSEEKHOLE.
+	 */
+	while (len > 0 && error == 0) {
+		endoff = 0;			/* To shut up compilers. */
+		cantseek = true;
+		startoff = *inoffp;
+		copylen = len;
+
+		/*
+		 * Find the next data area. If there is just a hole to EOF,
+		 * FIOSEEKDATA should fail and then we drop down into the
+		 * inner loop and create the hole on the outvp file.
+		 * (I do not know if any file system will report a hole to
+		 * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
+		 * will fail for those file systems.)
+		 *
+		 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
+		 * the code just falls through to the inner copy loop.
+		 */
+		error = EINVAL;
+		if (holein > 0)
+			error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
+			    incred, curthread);
+		if (error == 0) {
+			endoff = startoff;
+			error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
+			    incred, curthread);
+		}
+		if (error == 0) {
+			if (startoff > *inoffp) {
+				/* Found hole before data block. */
+				xfer = MIN(startoff - *inoffp, len);
+				if (*outoffp < va.va_size) {
+					/* Must write 0s to punch hole. */
+					xfer2 = MIN(va.va_size - *outoffp,
+					    xfer);
+					memset(dat, 0, MIN(xfer2, blksize));
+					error = vn_write_outvp(outvp, dat,
+					    *outoffp, xfer2, blksize, false,
+					    outcred);
+				}
+
+				if (error == 0 && *outoffp + xfer >
+				    va.va_size && xfer == len)
+					/* Grow last block. */
+					error = vn_write_outvp(outvp, dat,
+					    *outoffp, xfer, blksize, true,
+					    outcred);
+				if (error == 0) {
+					*inoffp += xfer;
+					*outoffp += xfer;
+					len -= xfer;
+				}
+			}
+			copylen = MIN(len, endoff - startoff);
+			cantseek = false;
+		} else {
+			cantseek = true;
+			startoff = *inoffp;
+			copylen = len;
+			error = 0;
+		}
+
+		xfer = blksize;
+		if (cantseek) {
+			/*
+			 * Set first xfer to end at a block boundary, so that
+			 * holes are more likely detected in the loop below via
+			 * the for all bytes 0 method.
+			 */
+			xfer -= (*inoffp % blksize);
+		}
+		/* Loop copying the data block. */
+		while (copylen > 0 && error == 0) {
+			if (copylen < xfer)
+				xfer = copylen;
+			error = vn_lock(invp, LK_SHARED);
+			if (error != 0)
+				goto out;
+			error = vn_rdwr(UIO_READ, invp, dat, xfer,
+			    startoff, UIO_SYSSPACE, IO_NODELOCKED,
+			    curthread->td_ucred, incred, &aresid,
+			    curthread);
+			VOP_UNLOCK(invp, 0);
+			/*
+			 * Linux considers a range that exceeds EOF to
+			 * be an error, so we will too.
+			 */
+			if (error == 0 && aresid > 0)
+				error = EINVAL;
+			if (error == 0) {
+				/*
+				 * Skip the write for holes past the initial EOF
+				 * of the output file, unless this is the last
+				 * write of the output file at EOF.
+				 */
+				readzeros = cantseek ? mem_iszero(dat, xfer) :
+				    false;
+				if (!cantseek || *outoffp < va.va_size ||
+				    xfer == len || !readzeros)
+					error = vn_write_outvp(outvp, dat,
+					    *outoffp, xfer, blksize,
+					    readzeros && xfer == len &&
+					    *outoffp >= va.va_size, outcred);
+				if (error == 0) {
+					*inoffp += xfer;
+					startoff += xfer;
+					*outoffp += xfer;
+					copylen -= xfer;
+					len -= xfer;
+				}
+			}
+			xfer = blksize;
+		}
+	}
+out:
+	*lenp = savlen - len;
+	free(dat, M_TEMP);
+	return (error);
+}
Index: head/sys/kern/vnode_if.src
===================================================================
--- head/sys/kern/vnode_if.src
+++ head/sys/kern/vnode_if.src
@@ -718,6 +718,22 @@
 };
 
 
+%% copy_file_range	invp	U U U
+%% copy_file_range	outvp	U U U
+
+vop_copy_file_range {
+	IN struct vnode *invp;
+	INOUT off_t *inoffp;
+	IN struct vnode *outvp;
+	INOUT off_t *outoffp;
+	INOUT size_t *lenp;
+	IN unsigned int flags;
+	IN struct ucred *incred;
+	IN struct ucred *outcred;
+	IN struct thread *fsizetd;
+};
+
+
 # The VOPs below are spares at the end of the table to allow new VOPs to be
 # added in stable branches without breaking the KBI. New VOPs in HEAD should
 # be added above these spares. When merging a new VOP to a stable branch,
Index: head/sys/sys/syscallsubr.h
===================================================================
--- head/sys/sys/syscallsubr.h
+++ head/sys/sys/syscallsubr.h
@@ -94,6 +94,8 @@
 int	kern_close(struct thread *td, int fd);
 int	kern_connectat(struct thread *td, int dirfd, int fd,
 	    struct sockaddr *sa);
+int	kern_copy_file_range(struct thread *td, int infd, off_t *inoffp,
+	    int outfd, off_t *outoffp, size_t len, unsigned int flags);
 int	kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
 	    cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp);
 int	kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
Index: head/sys/sys/vnode.h
===================================================================
--- head/sys/sys/vnode.h
+++ head/sys/sys/vnode.h
@@ -667,9 +667,17 @@
 	    struct ucred *cred);
 int	vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
 	    struct thread *td);
+int	vn_copy_file_range(struct vnode *invp, off_t *inoffp,
+	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
+	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
+	    struct thread *fsize_td);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_fsync_buf(struct vnode *vp, int waitfor);
+int	vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
+	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
+	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
+	    struct thread *fsize_td);
 int	vn_isdisk(struct vnode *vp, int *errp);
 int	_vn_lock(struct vnode *vp, int flags, char *file, int line);
 #define	vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)