Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F142260711
D20584.id60007.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
28 KB
Referenced Files
None
Subscribers
None
D20584.id60007.diff
View Options
Index: include/unistd.h
===================================================================
--- include/unistd.h
+++ include/unistd.h
@@ -494,6 +494,7 @@
int acct(const char *);
int async_daemon(void);
int check_utility_compat(const char *);
+ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int);
const char *
crypt_get_format(void);
char *crypt_r(const char *, const char *, struct crypt_data *);
Index: lib/libc/sys/Makefile.inc
===================================================================
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -175,6 +175,7 @@
closefrom.2 \
connect.2 \
connectat.2 \
+ copy_file_range.2 \
cpuset.2 \
cpuset_getaffinity.2 \
cpuset_getdomain.2 \
Index: lib/libc/sys/Symbol.map
===================================================================
--- lib/libc/sys/Symbol.map
+++ lib/libc/sys/Symbol.map
@@ -402,6 +402,7 @@
};
FBSD_1.6 {
+ copy_file_range;
fhlink;
fhlinkat;
fhreadlink;
Index: lib/libc/sys/copy_file_range.2
===================================================================
--- lib/libc/sys/copy_file_range.2
+++ lib/libc/sys/copy_file_range.2
@@ -0,0 +1,191 @@
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2019 Rick Macklem
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd June 9, 2019
+.Dt COPY_FILE_RANGE 2
+.Os
+.Sh NAME
+.Nm copy_file_range
+.Nd kernel copy of a byte range from one file to another
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In sys/types.h
+.In unistd.h
+.Ft ssize_t
+.Fo copy_file_range
+.Fa "int infd"
+.Fa "off_t *inoffp"
+.Fa "int outfd"
+.Fa "off_t *outoffp"
+.Fa "size_t len"
+.Fa "unsigned int flags"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn copy_file_range
+system call
+copies up to
+.Fa len
+bytes from
+.Fa infd
+to
+.Fa outfd
+in the kernel.
+It may do this using a file system specific technique if
+.Fa infd
+and
+.Fa outfd
+are on the same file system.
+The
+.Fa infd
+argument must be opened for reading and the
+.Fa outfd
+argument must be opened for writing, but not
+.Dv O_APPEND .
+If
+.Fa inoffp
+or
+.Fa outoffp
+is
+.Dv NULL ,
+the file offset for
+.Fa infd
+or
+.Fa outfd
+respectively will be used and updated by
+the number of bytes copied.
+If
+.Fa inoffp
+or
+.Fa outoffp
+is not
+.Dv NULL ,
+the byte offset pointed to by
+.Fa inoffp
+or
+.Fa outoffp
+respectively will be used/updated and the file offset for
+.Fa infd
+or
+.Fa outfd
+respectively will not be affected.
+The
+.Fa flags
+argument must be 0.
+.Pp
+This system call attempts to maintain holes in the output file for
+the byte range being copied.
+However, this does not always work well.
+It is recommended that sparse files be copied in a loop using
+.Xr lseek 2
+with
+.Dv SEEK_HOLE ,
+.Dv SEEK_DATA
+arguments and this system call for the
+data ranges found.
+.Pp
+.Sh RETURN VALUES
+If it succeeds, the call returns the number of bytes copied, which can be less
+than
+.Fa len .
+.Fn copy_file_range
+should be used in a loop until copying of the desired byte range has been
+completed.
+If an error has occurred, a \-1 is returned and the error code is placed in
+the global variable
+.Va errno .
+.Sh ERRORS
+The
+.Fn copy_file_range
+system call
+will fail if:
+.Bl -tag -width Er
+.It Bq Er EBADF
+If
+.Fa infd
+is not open for reading,
+or if
+.Fa outfd
+is not open for writing
+or is opened for writing with
+.Dv O_APPEND ,
+or if
+.Fa infd
+and
+.Fa outfd
+refer to the same file.
+.It Bq Er EFBIG
+If the copy exceeds the process's file size limit or the maximum file size
+for the file system
+.Fa outfd
+resides on.
+.It Bq Er EINTR
+A signal interrupted the system call
+before it could be completed.
+This may happen for files on some NFS mounts.
+When this happens, the values pointed to by
+.Fa inoffp
+and
+.Fa outoffp
+are reset to the initial values for the system call.
+.It Bq Er EINVAL
+If the initial offset for
+.Fa infd
+plus
+.Fa len
+exceeds EOF for
+.Fa infd
+or if the
+.Fa flags
+argument
+is not zero.
+.It Bq Er EIO
+An I/O error occurred while reading/writing the files.
+.It Bq Er EISDIR
+If either
+.Fa infd
+or
+.Fa outfd
+refers to a directory.
+.It Bq Er ENOSPC
+File system that stores
+.Fa outfd
+is full.
+.El
+.Sh SEE ALSO
+.Xr lseek 2
+.Sh STANDARDS
+The
+.Fn copy_file_range
+system call is expected to be compatible with the Linux system call of
+the same name.
+.Sh HISTORY
+The
+.Fn copy_file_range
+function appeared in
+.Fx 13.0 .
Index: share/man/man9/Makefile
===================================================================
--- share/man/man9/Makefile
+++ share/man/man9/Makefile
@@ -400,6 +400,7 @@
VOP_ATTRIB.9 \
VOP_BMAP.9 \
VOP_BWRITE.9 \
+ VOP_COPY_FILE_RANGE.9 \
VOP_CREATE.9 \
VOP_FSYNC.9 \
VOP_GETACL.9 \
Index: share/man/man9/VOP_COPY_FILE_RANGE.9
===================================================================
--- share/man/man9/VOP_COPY_FILE_RANGE.9
+++ share/man/man9/VOP_COPY_FILE_RANGE.9
@@ -0,0 +1,124 @@
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2019 Rick Macklem
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd June 20, 2019
+.Dt VOP_COPY_FILE_RANGE 9
+.Os
+.Sh NAME
+.Nm VOP_COPY_FILE_RANGE
+.Nd copy a byte range from one regular file to another within a file system
+.Sh SYNOPSIS
+.In sys/param.h
+.In sys/vnode.h
+.Ft int
+.Fo VOP_COPY_FILE_RANGE
+.Fa "struct vnode *invp"
+.Fa "off_t *inoff"
+.Fa "struct vnode *outvp"
+.Fa "off_t *outoff"
+.Fa "size_t *len"
+.Fa "unsigned int flags"
+.Fa "struct ucred *incred" "struct ucred *outcred"
+.Fa "struct thread *fsize_td"
+.Fc
+.Sh DESCRIPTION
+This entry point copies a byte range from one regular file to another within a
+file system.
+.Pp
+The arguments are:
+.Bl -tag -width ioflag
+.It Fa invp
+The vnode of the input file.
+.It Fa inoff
+A pointer to the file offset for the input file.
+.It Fa outvp
+The vnode of the output file.
+.It Fa outoff
+A pointer to the file offset for the output file.
+.It Fa len
+A pointer to the number of bytes to be copied.
+.It Fa flags
+Flags, should be set to 0 for now.
+.It Fa incred
+The credentials used to read
+.Fa invp .
+.It Fa outcred
+The credentials used to write
+.Fa outvp .
+.It Fa fsize_td
+The thread pointer to be passed to vn_rlimit_fsize().
+This will be
+.Dv NULL
+for a server thread without limits, such as for the NFS
+server or
+.Dv curthread
+otherwise.
+.El
+.Pp
+When the call is done, the
+.Fa inoff
+and
+.Fa outoff
+arguments point to the locations of the file offsets.
+These file offsets should be updated by the number of bytes copied.
+The
+.Fa len
+argument points to the location that stores the number of bytes
+to be copied.
+It should be reduced by the number of bytes copied, which implies that
+the value pointed to by
+.Fa len
+will normally be zero for a non-error return.
+However, a copy of fewer bytes than requested is permitted.
+.Sh LOCKS
+The vnodes are unlocked on entry and must be unlocked on return.
+The byte ranges for both
+.Fa invp
+and
+.Fa outvp
+should be range locked when this call is done.
+.Sh RETURN VALUES
+Zero is returned on success, otherwise an error code is returned.
+.Sh ERRORS
+.Bl -tag -width Er
+.It Bq Er EFBIG
+If the copy exceeds the process's file size limit or the maximum file size
+for the file system
+.Fa invp
+and
+.Fa outvp
+reside on.
+.It Bq Er EINTR
+A signal interrupted the VOP call before it could be completed.
+.It Bq Er EIO
+An I/O error occurred while reading/writing the files.
+.It Bq Er ENOSPC
+The file system is full.
+.El
+.Sh SEE ALSO
+.Xr vn_rdwr 9 ,
+.Xr vnode 9
Index: sys/kern/syscalls.master
===================================================================
--- sys/kern/syscalls.master
+++ sys/kern/syscalls.master
@@ -3175,6 +3175,16 @@
int flag
);
}
+569 AUE_NULL STD {
+ ssize_t copy_file_range(
+ int infd,
+ _Inout_opt_ off_t *inoffp,
+ int outfd,
+ _Inout_opt_ off_t *outoffp,
+ size_t len,
+ unsigned int flags
+ );
+ }
; Please copy any additions and changes to the following compatability tables:
; sys/compat/freebsd32/syscalls.master
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -83,6 +83,7 @@
static int vop_stdis_text(struct vop_is_text_args *ap);
static int vop_stdunset_text(struct vop_unset_text_args *ap);
static int vop_stdadd_writecount(struct vop_add_writecount_args *ap);
+static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap);
static int vop_stdfdatasync(struct vop_fdatasync_args *ap);
static int vop_stdgetpages_async(struct vop_getpages_async_args *ap);
@@ -140,6 +141,7 @@
.vop_set_text = vop_stdset_text,
.vop_unset_text = vop_stdunset_text,
.vop_add_writecount = vop_stdadd_writecount,
+ .vop_copy_file_range = vop_stdcopy_file_range,
};
/*
@@ -1206,6 +1208,17 @@
return (0);
}
+static int
+vop_stdcopy_file_range(struct vop_copy_file_range_args *ap)
+{
+ int error;
+
+ error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp,
+ ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred,
+ ap->a_outcred, ap->a_fsizetd);
+ return (error);
+}
+
int
vfs_stdvget (mp, ino, flags, vpp)
struct mount *mp;
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -60,6 +60,7 @@
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
+#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
@@ -80,6 +81,7 @@
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
+#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>
@@ -5657,3 +5659,305 @@
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_active(mvp, mp);
}
+
+/*
+ * Test len bytes of data starting at dat for all bytes == 0.
+ * Return true if all bytes are zero, false otherwise.
+ * Expects dat to be well aligned.
+ */
+static bool
+mem_iszero(void *dat, int len)
+{
+ int i;
+ const u_int *p;
+ const char *cp;
+
+ for (p = dat; len > 0; len -= sizeof(*p), p++) {
+ if (len >= sizeof(*p)) {
+ if (*p != 0)
+ return (false);
+ } else {
+ cp = (const char *)p;
+ for (i = 0; i < len; i++, cp++)
+ if (*cp != '\0')
+ return (false);
+ }
+ }
+ return (true);
+}
+
+/*
+ * Write an xfer sized chunk to outvp in blksize blocks from dat.
+ * dat is a maximum of blksize in length and can be written repeatedly in
+ * the chunk.
+ * If growfile == true, just grow the file via vn_truncate_locked() instead
+ * of doing actual writes.
+ */
+static int
+vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
+ u_long blksize, bool growfile, struct ucred *cred)
+{
+ struct mount *mp;
+ off_t xfer2;
+ int error, lckf;
+
+ /*
+ * Loop around doing writes of blksize until write has been completed.
+ * Lock/unlock on each loop iteration so that a bwillwrite() can be
+ * done for each iteration, since the xfer argument can be very
+ * large if there is a large hole to punch in the output file.
+ */
+ do {
+ bwillwrite();
+ mp = NULL;
+ error = vn_start_write(outvp, &mp, V_WAIT);
+ if (error == 0) {
+ if (MNT_SHARED_WRITES(mp))
+ lckf = LK_SHARED;
+ else
+ lckf = LK_EXCLUSIVE;
+ error = vn_lock(outvp, lckf);
+ }
+ if (error == 0) {
+ if (growfile)
+ error = vn_truncate_locked(outvp, outoff + xfer,
+ false, cred);
+ else {
+ xfer2 = MIN(xfer, blksize);
+ error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
+ outoff, UIO_SYSSPACE, IO_NODELOCKED,
+ curthread->td_ucred, cred, NULL, curthread);
+ outoff += xfer2;
+ xfer -= xfer2;
+ }
+ VOP_UNLOCK(outvp, 0);
+ }
+ if (mp != NULL)
+ vn_finished_write(mp);
+ } while (!growfile && xfer > 0 && error == 0);
+ return (error);
+}
+
+/*
+ * Copy a byte range of one file to another. This function can handle the
+ * case where invp and outvp are on different file systems.
+ * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
+ * is no better file system specific way to do it.
+ */
+int
+vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
+ struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
+ struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
+{
+ struct vattr va;
+ struct mount *mp;
+ struct uio io;
+ off_t startoff, endoff, xfer, xfer2;
+ u_long blksize;
+ int error;
+ bool cantseek, readzeros;
+ ssize_t aresid;
+ size_t copylen, len, savlen;
+ char *dat;
+ long holein, holeout;
+
+ savlen = len = *lenp;
+ error = 0;
+ dat = NULL;
+
+ error = vn_lock(invp, LK_SHARED);
+ if (error != 0)
+ goto out;
+ if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
+ holein = 0;
+ VOP_UNLOCK(invp, 0);
+ if (error != 0)
+ goto out;
+
+ mp = NULL;
+ error = vn_start_write(outvp, &mp, V_WAIT);
+ if (error == 0)
+ error = vn_lock(outvp, LK_EXCLUSIVE);
+ if (error == 0) {
+ /*
+ * If fsize_td != NULL, do a vn_rlimit_fsize() call,
+ * now that outvp is locked.
+ */
+ if (fsize_td != NULL) {
+ io.uio_offset = *outoffp;
+ io.uio_resid = len;
+ error = vn_rlimit_fsize(outvp, &io, fsize_td);
+ if (error != 0)
+ error = EFBIG;
+ }
+ if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
+ holeout = 0;
+ /*
+ * Holes that are past EOF do not need to be written as a block
+ * of zero bytes. So, truncate the output file as far as
+ * possible and then use va.va_size to decide if writing 0
+ * bytes is necessary in the loop below.
+ */
+ if (error == 0)
+ error = VOP_GETATTR(outvp, &va, outcred);
+ if (error == 0 && va.va_size > *outoffp && va.va_size <=
+ *outoffp + len) {
+#ifdef MAC
+ error = mac_vnode_check_write(curthread->td_ucred,
+ outcred, outvp);
+ if (error == 0)
+#endif
+ error = vn_truncate_locked(outvp, *outoffp,
+ false, outcred);
+ if (error == 0)
+ va.va_size = *outoffp;
+ }
+ VOP_UNLOCK(outvp, 0);
+ }
+ if (mp != NULL)
+ vn_finished_write(mp);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Set the blksize to the larger of the hole sizes for invp and outvp.
+ * If hole sizes aren't available, set the blksize to the larger
+ * f_iosize of invp and outvp.
+ * This code expects the hole sizes and f_iosizes to be powers of 2.
+ * This value is clipped at 4Kbytes and 1Mbyte.
+ */
+ blksize = MAX(holein, holeout);
+ if (blksize == 0)
+ blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
+ outvp->v_mount->mnt_stat.f_iosize);
+ if (blksize < 4096)
+ blksize = 4096;
+ else if (blksize > 1048576)
+ blksize = 1048576;
+ dat = malloc(blksize, M_TEMP, M_WAITOK);
+
+ /*
+ * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
+ * to find holes. Otherwise, just scan the read block for all 0s
+ * in the inner loop where the data copying is done.
+ * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
+ * support holes on the server, but do not support FIOSEEKHOLE.
+ */
+ while (len > 0 && error == 0) {
+ endoff = 0; /* To shut up compilers. */
+ cantseek = true;
+ startoff = *inoffp;
+ copylen = len;
+
+ /*
+ * Find the next data area. If there is just a hole to EOF,
+ * FIOSEEKDATA should fail and then we drop down into the
+ * inner loop and create the hole on the outvp file.
+ * (I do not know if any file system will report a hole to
+ * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
+ * will fail for those file systems.)
+ *
+ * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
+ * the code just falls through to the inner copy loop.
+ */
+ error = EINVAL;
+ if (holein > 0)
+ error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
+ incred, curthread);
+ if (error == 0) {
+ endoff = startoff;
+ error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
+ incred, curthread);
+ }
+ if (error == 0) {
+ if (startoff > *inoffp) {
+ /* Found hole before data block. */
+ xfer = MIN(startoff - *inoffp, len);
+ if (*outoffp < va.va_size) {
+ /* Must write 0s to punch hole. */
+ xfer2 = MIN(va.va_size - *outoffp,
+ xfer);
+ memset(dat, 0, MIN(xfer2, blksize));
+ error = vn_write_outvp(outvp, dat,
+ *outoffp, xfer2, blksize, false,
+ outcred);
+ }
+
+ if (error == 0 && *outoffp + xfer >
+ va.va_size && xfer == len)
+ /* Grow last block. */
+ error = vn_write_outvp(outvp, dat,
+ *outoffp, xfer, blksize, true,
+ outcred);
+ if (error == 0) {
+ *inoffp += xfer;
+ *outoffp += xfer;
+ len -= xfer;
+ }
+ }
+ copylen = MIN(len, endoff - startoff);
+ cantseek = false;
+ } else {
+ cantseek = true;
+ startoff = *inoffp;
+ copylen = len;
+ error = 0;
+ }
+
+ xfer = blksize;
+ if (cantseek) {
+ /*
+ * Set first xfer to end at a block boundary, so that
+ * holes are more likely detected in the loop below via
+ * the all-bytes-zero check.
+ */
+ xfer -= (*inoffp % blksize);
+ }
+ /* Loop copying the data block. */
+ while (copylen > 0 && error == 0) {
+ if (copylen < xfer)
+ xfer = copylen;
+ error = vn_lock(invp, LK_SHARED);
+ if (error != 0)
+ goto out;
+ error = vn_rdwr(UIO_READ, invp, dat, xfer,
+ startoff, UIO_SYSSPACE, IO_NODELOCKED,
+ curthread->td_ucred, incred, &aresid,
+ curthread);
+ VOP_UNLOCK(invp, 0);
+ /*
+ * Linux considers a range that exceeds EOF to
+ * be an error, so we will too.
+ */
+ if (error == 0 && aresid > 0)
+ error = EINVAL;
+ if (error == 0) {
+ /*
+ * Skip the write for holes past the initial EOF
+ * of the output file, unless this is the last
+ * write of the output file at EOF.
+ */
+ readzeros = (cantseek) ? mem_iszero(dat, xfer) :
+ false;
+ if (!cantseek || *outoffp < va.va_size ||
+ xfer == len || !readzeros)
+ error = vn_write_outvp(outvp, dat,
+ *outoffp, xfer, blksize,
+ readzeros && xfer == len &&
+ *outoffp >= va.va_size, outcred);
+ if (error == 0) {
+ *inoffp += xfer;
+ startoff += xfer;
+ *outoffp += xfer;
+ copylen -= xfer;
+ len -= xfer;
+ }
+ }
+ xfer = blksize;
+ }
+ }
+out:
+ *lenp = savlen - len;
+ free(dat, M_TEMP);
+ return (error);
+}
Index: sys/kern/vfs_syscalls.c
===================================================================
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -4814,3 +4814,122 @@
uap->advice);
return (kern_posix_error(td, error));
}
+
+int
+kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
+ off_t *outoffp, size_t len, unsigned int flags)
+{
+ struct file *infp, *outfp;
+ struct vnode *invp, *outvp;
+ int error;
+ size_t retlen;
+ void *rl_rcookie, *rl_wcookie;
+ off_t savinoff, savoutoff;
+
+ infp = outfp = NULL;
+ rl_rcookie = rl_wcookie = NULL;
+ savinoff = -1;
+ error = 0;
+ retlen = 0;
+
+ if (flags != 0) {
+ error = EINVAL;
+ goto out;
+ }
+ if (len > SSIZE_MAX)
+ /*
+ * Although the len argument is size_t, the return argument
+ * is ssize_t (which is signed). Therefore a size that won't
+ * fit in ssize_t can't be returned.
+ */
+ len = SSIZE_MAX;
+
+ /* Get the file structures for the file descriptors. */
+ error = fget_read(td, infd, &cap_read_rights, &infp);
+ if (error != 0)
+ goto out;
+ error = fget_write(td, outfd, &cap_write_rights, &outfp);
+ if (error != 0)
+ goto out;
+
+ /* Set the offset pointers to the correct place. */
+ if (inoffp == NULL)
+ inoffp = &infp->f_offset;
+ if (outoffp == NULL)
+ outoffp = &outfp->f_offset;
+ savinoff = *inoffp;
+ savoutoff = *outoffp;
+
+ invp = infp->f_vnode;
+ outvp = outfp->f_vnode;
+ /* Sanity check the f_flag bits. */
+ if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
+ (infp->f_flag & FREAD) == 0 || invp == outvp) {
+ error = EBADF;
+ goto out;
+ }
+
+ /* If len == 0, just return 0. */
+ if (len == 0)
+ goto out;
+
+ /* Range lock the byte ranges for both invp and outvp. */
+ for (;;) {
+ rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
+ len);
+ rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp +
+ len);
+ if (rl_rcookie != NULL)
+ break;
+ vn_rangelock_unlock(outvp, rl_wcookie);
+ rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
+ vn_rangelock_unlock(invp, rl_rcookie);
+ }
+
+ retlen = len;
+ error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
+ flags, infp->f_cred, outfp->f_cred, td);
+out:
+ if (rl_rcookie != NULL)
+ vn_rangelock_unlock(invp, rl_rcookie);
+ if (rl_wcookie != NULL)
+ vn_rangelock_unlock(outvp, rl_wcookie);
+ if (savinoff != -1 && (error == EINTR || error == ERESTART)) {
+ *inoffp = savinoff;
+ *outoffp = savoutoff;
+ }
+ if (outfp != NULL)
+ fdrop(outfp, td);
+ if (infp != NULL)
+ fdrop(infp, td);
+ td->td_retval[0] = retlen;
+ return (error);
+}
+
+int
+sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
+{
+ off_t inoff, outoff, *inoffp, *outoffp;
+ int error;
+
+ inoffp = outoffp = NULL;
+ if (uap->inoffp != NULL) {
+ error = copyin(uap->inoffp, &inoff, sizeof(off_t));
+ if (error != 0)
+ return (error);
+ inoffp = &inoff;
+ }
+ if (uap->outoffp != NULL) {
+ error = copyin(uap->outoffp, &outoff, sizeof(off_t));
+ if (error != 0)
+ return (error);
+ outoffp = &outoff;
+ }
+ error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
+ outoffp, uap->len, uap->flags);
+ if (error == 0 && uap->inoffp != NULL)
+ error = copyout(inoffp, uap->inoffp, sizeof(off_t));
+ if (error == 0 && uap->outoffp != NULL)
+ error = copyout(outoffp, uap->outoffp, sizeof(off_t));
+ return (error);
+}
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -61,6 +61,7 @@
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mman.h>
+#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
@@ -89,6 +90,8 @@
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>
+#include <machine/vmparam.h>
+
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
@@ -2508,6 +2511,72 @@
va->va_fsid += (uint32_t)f->val[0];
}
+/*
+ * Copies a byte range from invp to outvp. Calls VOP_COPY_FILE_RANGE()
+ * or vn_generic_copy_file_range() after rangelocking the byte ranges,
+ * to do the actual copy.
+ * vn_generic_copy_file_range() is factored out, so it can be called
+ * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
+ * different file systems.
+ */
+int
+vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
+ off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
+ struct ucred *outcred, struct thread *fsize_td)
+{
+ struct vattr va;
+ int error;
+ size_t len;
+ uint64_t uvalin, uvalout;
+
+ len = *lenp;
+ *lenp = 0; /* For error returns. */
+ error = 0;
+
+ /* Do some sanity checks on the arguments. */
+ uvalin = *inoffp;
+ uvalin += len;
+ uvalout = *outoffp;
+ uvalout += len;
+ if (invp->v_type == VDIR || outvp->v_type == VDIR)
+ error = EISDIR;
+ else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin <
+ (uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX ||
+ uvalout < (uint64_t)*outoffp || invp->v_type != VREG ||
+ outvp->v_type != VREG)
+ error = EINVAL;
+ else if (invp == outvp)
+ error = EBADF;
+ if (error != 0)
+ goto out;
+
+ error = vn_lock(invp, LK_SHARED);
+ if (error != 0)
+ goto out;
+ /* Check that the offset + len does not go past EOF of invp. */
+ error = VOP_GETATTR(invp, &va, incred);
+ if (error == 0 && va.va_size < *inoffp + len)
+ error = EINVAL;
+ VOP_UNLOCK(invp, 0);
+ if (error != 0)
+ goto out;
+
+ /*
+ * If the two vnodes are for the same file system, call
+ * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
+ * which can handle copies across multiple file systems.
+ */
+ *lenp = len;
+ if (invp->v_mount == outvp->v_mount)
+ error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
+ lenp, flags, incred, outcred, fsize_td);
+ else
+ error = vn_generic_copy_file_range(invp, inoffp, outvp,
+ outoffp, lenp, flags, incred, outcred, fsize_td);
+out:
+ return (error);
+}
+
int
vn_fsync_buf(struct vnode *vp, int waitfor)
{
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -718,6 +718,22 @@
};
+%% copy_file_range invp U U U
+%% copy_file_range outvp U U U
+
+vop_copy_file_range {
+ IN struct vnode *invp;
+ INOUT off_t *inoffp;
+ IN struct vnode *outvp;
+ INOUT off_t *outoffp;
+ INOUT size_t *lenp;
+ IN unsigned int flags;
+ IN struct ucred *incred;
+ IN struct ucred *outcred;
+ IN struct thread *fsizetd;
+};
+
+
# The VOPs below are spares at the end of the table to allow new VOPs to be
# added in stable branches without breaking the KBI. New VOPs in HEAD should
# be added above these spares. When merging a new VOP to a stable branch,
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h
+++ sys/sys/syscallsubr.h
@@ -94,6 +94,8 @@
int kern_close(struct thread *td, int fd);
int kern_connectat(struct thread *td, int dirfd, int fd,
struct sockaddr *sa);
+int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp,
+ int outfd, off_t *outoffp, size_t len, unsigned int flags);
int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp);
int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -667,9 +667,17 @@
struct ucred *cred);
int vn_close(struct vnode *vp,
int flags, struct ucred *file_cred, struct thread *td);
+int vn_copy_file_range(struct vnode *invp, off_t *inoffp,
+ struct vnode *outvp, off_t *outoffp, size_t *lenp,
+ unsigned int flags, struct ucred *incred, struct ucred *outcred,
+ struct thread *fsize_td);
void vn_finished_write(struct mount *mp);
void vn_finished_secondary_write(struct mount *mp);
int vn_fsync_buf(struct vnode *vp, int waitfor);
+int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
+ struct vnode *outvp, off_t *outoffp, size_t *lenp,
+ unsigned int flags, struct ucred *incred, struct ucred *outcred,
+ struct thread *fsize_td);
int vn_isdisk(struct vnode *vp, int *errp);
int _vn_lock(struct vnode *vp, int flags, char *file, int line);
#define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Jan 18, 10:11 PM (20 h, 30 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27720887
Default Alt Text
D20584.id60007.diff (28 KB)
Attached To
Mode
D20584: add a linux compatible copy_file_range(2) syscall
Attached
Detach File
Event Timeline
Log In to Comment