Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F132631312
D20584.id58701.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
18 KB
Referenced Files
None
Subscribers
None
D20584.id58701.diff
View Options
Index: include/unistd.h
===================================================================
--- include/unistd.h
+++ include/unistd.h
@@ -494,6 +494,7 @@
int acct(const char *);
int async_daemon(void);
int check_utility_compat(const char *);
+ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int);
const char *
crypt_get_format(void);
char *crypt_r(const char *, const char *, struct crypt_data *);
Index: lib/libc/sys/Makefile.inc
===================================================================
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -175,6 +175,7 @@
closefrom.2 \
connect.2 \
connectat.2 \
+ copy_file_range.2 \
cpuset.2 \
cpuset_getaffinity.2 \
cpuset_getdomain.2 \
Index: lib/libc/sys/Symbol.map
===================================================================
--- lib/libc/sys/Symbol.map
+++ lib/libc/sys/Symbol.map
@@ -402,6 +402,7 @@
};
FBSD_1.6 {
+ copy_file_range;
fhlink;
fhlinkat;
fhreadlink;
Index: lib/libc/sys/copy_file_range.2
===================================================================
--- lib/libc/sys/copy_file_range.2
+++ lib/libc/sys/copy_file_range.2
@@ -0,0 +1,150 @@
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2019 Rick Macklem
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd June 9, 2019
+.Dt COPY_FILE_RANGE 2
+.Os
+.Sh NAME
+.Nm copy_file_range
+.Nd kernel copy of a byte range from one file to another
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In sys/types.h
+.In unistd.h
+.Ft ssize_t
+.Fo copy_file_range
+.Fa "int infd"
+.Fa "off_t *inoffp"
+.Fa "int outfd"
+.Fa "off_t *outoffp"
+.Fa "size_t len"
+.Fa "unsigned int flags"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn copy_file_range
+system call
+copies
+.Fa len
+bytes from
+.Fa infd
+to
+.Fa outfd
+in the kernel.
+It may do this using a file system specific technique if
+.Fa infd
+and
+.Fa outfd
+are on the same file system.
+The
+.Fa infd
+argument must be opened for reading and the
+.Fa outfd
+argument must be opened for writing, but not
+.Dv O_APPEND .
+If
+.Fa inoffp
+or
+.Fa outoffp
+is
+.Dv NULL ,
+the file offset for
+.Fa infd
+or
+.Fa outfd
+respectively will be used and updated by
+the number of bytes copied.
+If
+.Fa inoffp
+or
+.Fa outoffp
+is not
+.Dv NULL ,
+the byte offset pointed to by
+.Fa inoffp
+or
+.Fa outoffp
+respectively will be used/updated and the file offset for
+.Fa infd
+or
+.Fa outfd
+respectively will not be affected.
+The
+.Fa flags
+argument is currently ignored and should be set to 0.
+.Pp
+.Sh RETURN VALUES
+If it succeeds, the call returns the number of bytes copied, which can be less
+than
+.Fa len .
+.Fn copy_file_range
+should be used in a loop until copying of the desired byte range has been
+completed.
+If an error has occurred, a \-1 is returned and the error code is placed in
+the global variable
+.Va errno .
+.Sh ERRORS
+The
+.Fn copy_file_range
+system call
+will fail if:
+.Bl -tag -width Er
+.It Bq Er EBADF
+If
+.Fa infd
+is not open for reading
+or
+.Fa outfd
+is not open for writing,
+or opened for writing with
+.Dv O_APPEND .
+.It Bq Er EFBIG
+If the copy exceeds the process's file size limit or the maximum file size
+for the file system
+.Fa outfd
+resides on.
+.It Bq Er EINVAL
+If the initial offset for
+.Fa infd
+plus
+.Fa len
+exceeds EOF for
+.Fa infd .
+.It Bq Er EIO
+An I/O error occurred while reading/writing the files.
+.It Bq Er EISDIR
+If either
+.Fa infd
+or
+.Fa outfd
+refers to a directory.
+.El
+.Sh STANDARDS
+The
+.Fn copy_file_range
+system call is expected to be compatible with the Linux system call of
+the same name.
+.Sh HISTORY
+The
+.Fn copy_file_range
+function appeared in
+.Fx 13.0 .
Index: sys/kern/syscalls.master
===================================================================
--- sys/kern/syscalls.master
+++ sys/kern/syscalls.master
@@ -3175,6 +3175,16 @@
int flag
);
}
+568 AUE_NULL STD {
+ ssize_t copy_file_range(
+ int infd,
+ _Inout_opt_ off_t *inoffp,
+ int outfd,
+ _Inout_opt_ off_t *outoffp,
+ size_t len,
+ unsigned int flags
+ );
+ }
; Please copy any additions and changes to the following compatability tables:
; sys/compat/freebsd32/syscalls.master
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -140,6 +140,7 @@
.vop_set_text = vop_stdset_text,
.vop_unset_text = vop_stdunset_text,
.vop_add_writecount = vop_stdadd_writecount,
+ .vop_copy_file_range = VOP_EOPNOTSUPP,
};
/*
Index: sys/kern/vfs_syscalls.c
===================================================================
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -4814,3 +4814,78 @@
uap->advice);
return (kern_posix_error(td, error));
}
+
+/*
+ * Kernel side of copy_file_range(2): copy up to len bytes from the file
+ * open on infd to the file open on outfd.
+ *
+ * inoffp/outoffp point at kernel copies of the caller's offsets; when one
+ * is NULL the corresponding descriptor's own f_offset is used (and
+ * advanced) instead.
+ *
+ * The number of bytes copied is stored in td->td_retval[0]; it is 0
+ * whenever an error is returned.
+ *
+ * Returns 0 on success or an errno value:
+ *   - whatever fget_read()/fget_write() report for a bad descriptor,
+ *   - EINVAL if either descriptor is not backed by a vnode,
+ *   - EBADF if outfd is open with O_APPEND, infd is not readable, or both
+ *     descriptors refer to the same vnode,
+ *   - any error from vn_copy_file_range().
+ */
+int
+kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
+    off_t *outoffp, size_t len, unsigned int flags)
+{
+	struct file *infp, *outfp;
+	struct vnode *invp, *outvp;
+	int error;
+	size_t retlen;
+
+	infp = outfp = NULL;
+	retlen = 0;
+
+	/* Get the file structures for the file descriptors. */
+	error = fget_read(td, infd, &cap_read_rights, &infp);
+	if (error != 0)
+		goto out;
+	error = fget_write(td, outfd, &cap_write_rights, &outfp);
+	if (error != 0)
+		goto out;
+
+	/* Set the offset pointers to the correct place. */
+	if (inoffp == NULL)
+		inoffp = &infp->f_offset;
+	if (outoffp == NULL)
+		outoffp = &outfp->f_offset;
+
+	invp = infp->f_vnode;
+	outvp = outfp->f_vnode;
+	/*
+	 * Descriptors that are not backed by a vnode cannot be copied.
+	 * Reject them here, since vn_copy_file_range() dereferences both
+	 * vnodes unconditionally.
+	 */
+	if (invp == NULL || outvp == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+	/* Sanity check the f_flag bits. */
+	if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
+	    (infp->f_flag & FREAD) == 0 || invp == outvp) {
+		error = EBADF;
+		goto out;
+	}
+
+	/* retlen is in/out: requested length in, bytes copied out. */
+	retlen = len;
+	error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
+	    flags);
+out:
+	if (outfp != NULL)
+		fdrop(outfp, td);
+	if (infp != NULL)
+		fdrop(infp, td);
+	td->td_retval[0] = retlen;
+	return (error);
+}
+
+/*
+ * copy_file_range(2) system call entry.
+ *
+ * Each non-NULL user offset pointer is copied into a kernel off_t, the
+ * copy is done by kern_copy_file_range(), and on success the updated
+ * offsets are copied back out.  A NULL user pointer is passed through as
+ * NULL.
+ *
+ * Returns 0 or the first error from copyin(), kern_copy_file_range() or
+ * copyout().
+ */
+int
+sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
+{
+	off_t kin, kout;
+	off_t *kinp, *koutp;
+	int error;
+
+	kinp = NULL;
+	koutp = NULL;
+
+	if (uap->inoffp != NULL) {
+		error = copyin(uap->inoffp, &kin, sizeof(off_t));
+		if (error != 0)
+			return (error);
+		kinp = &kin;
+	}
+	if (uap->outoffp != NULL) {
+		error = copyin(uap->outoffp, &kout, sizeof(off_t));
+		if (error != 0)
+			return (error);
+		koutp = &kout;
+	}
+
+	error = kern_copy_file_range(td, uap->infd, kinp, uap->outfd,
+	    koutp, uap->len, uap->flags);
+	if (error != 0)
+		return (error);
+
+	/* Hand the advanced offsets back to the caller. */
+	if (uap->inoffp != NULL) {
+		error = copyout(kinp, uap->inoffp, sizeof(off_t));
+		if (error != 0)
+			return (error);
+	}
+	if (uap->outoffp != NULL)
+		error = copyout(koutp, uap->outoffp, sizeof(off_t));
+	return (error);
+}
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -61,6 +61,7 @@
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mman.h>
+#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
@@ -89,6 +90,8 @@
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>
+#include <machine/vmparam.h>
+
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
@@ -2494,6 +2497,316 @@
va->va_fsid += (uint32_t)f->val[0];
}
+/*
+ * Check whether the len bytes starting at dat are all zero.
+ *
+ * Note the sense of the result: 0 means every byte is zero (also when
+ * len <= 0), non-zero means at least one byte is set.
+ *
+ * The buffer is scanned a u_int at a time, so dat is expected to be
+ * suitably aligned; any trailing bytes are checked individually.
+ */
+static int
+mem_iszero(void *dat, int len)
+{
+	const u_int *wp;
+	const char *bp;
+	int left;
+
+	wp = (const u_int *)dat;
+	left = len;
+
+	/* Whole words first. */
+	while (left >= (int)sizeof(*wp)) {
+		if (*wp != 0)
+			return (1);
+		wp++;
+		left -= (int)sizeof(*wp);
+	}
+
+	/* Then whatever is left over, byte by byte. */
+	bp = (const char *)wp;
+	while (left > 0) {
+		if (*bp != '\0')
+			return (1);
+		bp++;
+		left--;
+	}
+	return (0);
+}
+
+/*
+ * Copy *lenp bytes from invp, starting at *inoffp, to outvp, starting at
+ * *outoffp.
+ *
+ * Both vnodes must be distinct regular files, and the input range must
+ * lie entirely within invp.  When both vnodes are on the same mount,
+ * VOP_COPY_FILE_RANGE() is tried first; if it fails, the generic
+ * read/write loop below is used.  The loop tries to preserve holes,
+ * either via FIOSEEKDATA/FIOSEEKHOLE on invp or by detecting all-zero
+ * blocks.
+ *
+ * On success *lenp is set to the number of bytes copied and *inoffp and
+ * *outoffp have been advanced by that amount.  On failure *lenp is 0;
+ * the offsets may already have been advanced past data that was copied
+ * before the error occurred.
+ *
+ * Returns 0 or an errno value: EISDIR if either vnode is a directory,
+ * EINVAL for a negative or overflowing offset, a vnode that is not a
+ * regular file, invp == outvp or a range that extends past EOF of invp,
+ * or any error from the underlying VOPs.
+ */
+int
+vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
+    off_t *outoffp, size_t *lenp, unsigned int flags)
+{
+	struct vattr va;
+	struct mount *mp;
+	off_t startoff, endoff, zoff;
+	u_long blksize;
+	int error, lckf;
+	bool cantseek;
+	ssize_t aresid;
+	size_t copylen, len, savlen, xfer, xfer2, zlen;
+	char *dat;
+	uint64_t uvalin, uvalout;
+	long holein, holeout;
+	void *rl_rcookie, *rl_wcookie;
+	struct thread *td = curthread;
+
+	savlen = len = *lenp;
+	*lenp = 0;		/* Return 0 len for errors. */
+	error = 0;
+	dat = NULL;
+	rl_rcookie = rl_wcookie = NULL;
+
+	/* Do some sanity checks on the arguments. */
+	uvalin = *inoffp;
+	uvalin += len;
+	uvalout = *outoffp;
+	uvalout += len;
+	if (invp->v_type == VDIR || outvp->v_type == VDIR)
+		error = EISDIR;
+	else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin <
+	    (uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX ||
+	    uvalout < (uint64_t)*outoffp || invp->v_type != VREG ||
+	    outvp->v_type != VREG || invp == outvp)
+		error = EINVAL;
+	if (error != 0)
+		goto out;
+
+	error = vn_lock(invp, LK_SHARED);
+	if (error != 0)
+		goto out;
+	if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
+		holein = 0;
+
+	/* Check that the offset + len does not go past EOF of invp. */
+	if (error == 0)
+		error = VOP_GETATTR(invp, &va, td->td_ucred);
+	if (error == 0 && va.va_size < (*inoffp + len))
+		error = EINVAL;
+	VOP_UNLOCK(invp, 0);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Range lock the byte ranges for both invp and outvp.
+	 * The write lock is taken first and the read lock is only tried;
+	 * if the try fails, both are dropped, the read lock is waited for
+	 * and released, and the whole sequence is retried.
+	 */
+	for (;;) {
+		rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
+		    len);
+		rl_rcookie = vn_rangelock_rlock_trylock(invp, *inoffp,
+		    *inoffp + len);
+		if (rl_rcookie != NULL)
+			break;
+		vn_rangelock_unlock(outvp, rl_wcookie);
+		rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
+		vn_rangelock_unlock(invp, rl_rcookie);
+	}
+
+	/*
+	 * If the two vnodes are for the same file system, try the
+	 * VOP_COPY_FILE_RANGE() call first, but do it here if the VOP
+	 * call fails.
+	 */
+	if (invp->v_mount == outvp->v_mount) {
+		/*
+		 * lenp is in/out for the VOP, and *lenp was zeroed above,
+		 * so restore the requested length for the call.
+		 */
+		*lenp = len;
+		error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
+		    lenp, flags);
+		if (error == 0)
+			goto out;
+		*lenp = 0;
+	}
+
+	mp = NULL;
+	error = vn_start_write(outvp, &mp, V_WAIT);
+	if (error == 0)
+		error = vn_lock(outvp, LK_EXCLUSIVE);
+	if (error == 0) {
+		if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
+			holeout = 0;
+		/*
+		 * Holes that are past EOF do not need to be written as a block
+		 * of zero bytes.  So, truncate the output file as far as
+		 * possible and then use va.va_size to decide if writing 0
+		 * bytes is necessary in the loop below.
+		 */
+		error = VOP_GETATTR(outvp, &va, td->td_ucred);
+		if (error == 0 && va.va_size > *outoffp && va.va_size <=
+		    *outoffp + len) {
+			VATTR_NULL(&va);
+			va.va_size = *outoffp;
+			error = VOP_SETATTR(outvp, &va, td->td_ucred);
+		}
+		VOP_UNLOCK(outvp, 0);
+	}
+	if (mp != NULL)
+		vn_finished_write(mp);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Set the blksize to the larger of the hole sizes for invp and outvp.
+	 * If hole sizes aren't available, set the blksize to the larger
+	 * f_iosize of invp and outvp.
+	 * This code expects the hole sizes and f_iosizes to be powers of 2.
+	 * This value is clipped at 4Kbytes and 1Mbyte.
+	 */
+	if (holein > 0 && holeout > 0)
+		if (holein > holeout)
+			blksize = holein;
+		else
+			blksize = holeout;
+	else if (invp->v_mount->mnt_stat.f_iosize >
+	    outvp->v_mount->mnt_stat.f_iosize)
+		blksize = invp->v_mount->mnt_stat.f_iosize;
+	else
+		blksize = outvp->v_mount->mnt_stat.f_iosize;
+	if (blksize < 4096)
+		blksize = 4096;
+	else if (blksize > 1048576)
+		blksize = 1048576;
+	dat = malloc(blksize, M_TEMP, M_WAITOK);
+
+	/*
+	 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
+	 * to find holes.  Otherwise, just scan the read block for all 0s
+	 * in the inner loop where the data copying is done.
+	 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
+	 * support holes on the server, but do not support FIOSEEKHOLE.
+	 */
+	while (len > 0 && error == 0) {
+		endoff = 0;			/* To shut up compilers. */
+
+		/*
+		 * Find the next data area.  If there is just a hole to EOF,
+		 * FIOSEEKDATA should fail and then we drop down into the
+		 * inner loop and create the hole on the outvp file.
+		 * (I do not know if any file system will report a hole to
+		 *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
+		 *  will fail for those file systems.)
+		 *
+		 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
+		 * the code just falls through to the inner copy loop.
+		 */
+		cantseek = true;
+		startoff = *inoffp;
+		copylen = len;
+		error = EINVAL;
+		if (holein > 0)
+			error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
+			    td->td_ucred, td);
+		if (error == 0) {
+			endoff = startoff;
+			error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
+			    td->td_ucred, td);
+		}
+		if (error == 0) {
+			if (startoff > *inoffp) {
+				/* Found hole before data block. */
+				xfer = startoff - *inoffp;
+				/*
+				 * The hole may run past the end of the
+				 * requested range; never account for more
+				 * than len bytes.
+				 */
+				if (xfer > len)
+					xfer = len;
+				/* va holds the attributes of outvp here. */
+				if (*outoffp < va.va_size) {
+					/* Must write 0s to punch hole. */
+					xfer2 = va.va_size - *outoffp;
+					if (xfer2 > xfer)
+						xfer2 = xfer;
+					/*
+					 * dat is only blksize bytes, so the
+					 * zeros are written in chunks of at
+					 * most blksize.
+					 */
+					zlen = blksize;
+					if (zlen > xfer2)
+						zlen = xfer2;
+					memset(dat, 0, zlen);
+					zoff = *outoffp;
+					while (xfer2 > 0 && error == 0) {
+						if (zlen > xfer2)
+							zlen = xfer2;
+						mp = NULL;
+						error = vn_start_write(outvp,
+						    &mp, V_WAIT);
+						if (error == 0) {
+							if (MNT_SHARED_WRITES(mp))
+								lckf = LK_SHARED;
+							else
+								lckf = LK_EXCLUSIVE;
+							error = vn_lock(outvp,
+							    lckf);
+						}
+						if (error == 0) {
+							error = vn_rdwr(
+							    UIO_WRITE, outvp,
+							    dat, zlen, zoff,
+							    UIO_SYSSPACE,
+							    IO_NODELOCKED,
+							    td->td_ucred, NULL,
+							    NULL, td);
+							VOP_UNLOCK(outvp, 0);
+						}
+						if (mp != NULL)
+							vn_finished_write(mp);
+						zoff += zlen;
+						xfer2 -= zlen;
+					}
+				}
+				if (error == 0) {
+					*inoffp += xfer;
+					*outoffp += xfer;
+					len -= xfer;
+				}
+			}
+			copylen = len;
+			if (copylen > endoff - startoff)
+				copylen = endoff - startoff;
+			cantseek = false;
+		} else
+			error = 0;
+
+		xfer = blksize;
+		if (cantseek) {
+			/*
+			 * Set first xfer to end at a block boundary, so that
+			 * holes are more likely detected in the loop below via
+			 * the for all bytes 0 method.
+			 */
+			xfer -= (*inoffp % blksize);
+		}
+		/* Loop copying the data block. */
+		while (copylen > 0 && error == 0) {
+			if (copylen < xfer)
+				xfer = copylen;
+			error = vn_lock(invp, LK_SHARED);
+			if (error != 0)
+				goto out;
+			error = vn_rdwr(UIO_READ, invp, dat, xfer, startoff,
+			    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NULL,
+			    &aresid, td);
+			VOP_UNLOCK(invp, 0);
+			/*
+			 * Linux considers a range that exceeds EOF to be an
+			 * error, so we will too.
+			 */
+			if (error == 0 && aresid > 0)
+				error = EINVAL;
+			if (error == 0) {
+				/*
+				 * Skip the write for holes past the initial EOF
+				 * of the output file, unless this is the last
+				 * write of the output file at EOF.
+				 * (mem_iszero() returns non-zero when the
+				 *  block contains data.)
+				 */
+				if (!cantseek || *outoffp < va.va_size ||
+				    xfer == len || mem_iszero(dat, xfer) != 0) {
+					mp = NULL;
+					error = vn_start_write(outvp, &mp,
+					    V_WAIT);
+					if (error == 0) {
+						if (MNT_SHARED_WRITES(mp))
+							lckf = LK_SHARED;
+						else
+							lckf = LK_EXCLUSIVE;
+						error = vn_lock(outvp, lckf);
+					}
+					if (error == 0) {
+						error = vn_rdwr(UIO_WRITE,
+						    outvp, dat, xfer, *outoffp,
+						    UIO_SYSSPACE, IO_NODELOCKED,
+						    td->td_ucred, NULL, NULL,
+						    td);
+						VOP_UNLOCK(outvp, 0);
+					}
+					if (mp != NULL)
+						vn_finished_write(mp);
+				}
+				if (error == 0) {
+					*inoffp += xfer;
+					startoff += xfer;
+					*outoffp += xfer;
+					copylen -= xfer;
+					len -= xfer;
+				}
+			}
+			xfer = blksize;
+		}
+	}
+	if (error == 0)
+		*lenp = savlen - len;
+out:
+	if (rl_rcookie != NULL)
+		vn_rangelock_unlock(invp, rl_rcookie);
+	if (rl_wcookie != NULL)
+		vn_rangelock_unlock(outvp, rl_wcookie);
+	free(dat, M_TEMP);
+	return (error);
+}
+
int
vn_fsync_buf(struct vnode *vp, int waitfor)
{
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -718,6 +718,19 @@
};
+%% copy_file_range invp U U U
+%% copy_file_range outvp U U U
+
+vop_copy_file_range {
+ IN struct vnode *invp;
+ INOUT off_t *inoffp;
+ IN struct vnode *outvp;
+ INOUT off_t *outoffp;
+ INOUT size_t *lenp;
+ IN u_int flags;
+};
+
+
# The VOPs below are spares at the end of the table to allow new VOPs to be
# added in stable branches without breaking the KBI. New VOPs in HEAD should
# be added above these spares. When merging a new VOP to a stable branch,
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h
+++ sys/sys/syscallsubr.h
@@ -94,6 +94,8 @@
int kern_close(struct thread *td, int fd);
int kern_connectat(struct thread *td, int dirfd, int fd,
struct sockaddr *sa);
+int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp,
+ int outfd, off_t *outoffp, size_t len, unsigned int flags);
int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp);
int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -667,6 +667,9 @@
struct ucred *cred);
int vn_close(struct vnode *vp,
int flags, struct ucred *file_cred, struct thread *td);
+int vn_copy_file_range(struct vnode *invp, off_t *inoffp,
+ struct vnode *outvp, off_t *outoffp, size_t *lenp,
+ unsigned int flags);
void vn_finished_write(struct mount *mp);
void vn_finished_secondary_write(struct mount *mp);
int vn_fsync_buf(struct vnode *vp, int waitfor);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sun, Oct 19, 2:32 PM (10 h, 51 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
23928818
Default Alt Text
D20584.id58701.diff (18 KB)
Attached To
Mode
D20584: add a linux compatible copy_file_range(2) syscall
Attached
Detach File
Event Timeline
Log In to Comment