Page MenuHomeFreeBSD

D20584.id58701.diff
No OneTemporary

D20584.id58701.diff

Index: include/unistd.h
===================================================================
--- include/unistd.h
+++ include/unistd.h
@@ -494,6 +494,7 @@
int acct(const char *);
int async_daemon(void);
int check_utility_compat(const char *);
+ssize_t copy_file_range(int, off_t *, int, off_t *, size_t, unsigned int);
const char *
crypt_get_format(void);
char *crypt_r(const char *, const char *, struct crypt_data *);
Index: lib/libc/sys/Makefile.inc
===================================================================
--- lib/libc/sys/Makefile.inc
+++ lib/libc/sys/Makefile.inc
@@ -175,6 +175,7 @@
closefrom.2 \
connect.2 \
connectat.2 \
+ copy_file_range.2 \
cpuset.2 \
cpuset_getaffinity.2 \
cpuset_getdomain.2 \
Index: lib/libc/sys/Symbol.map
===================================================================
--- lib/libc/sys/Symbol.map
+++ lib/libc/sys/Symbol.map
@@ -402,6 +402,7 @@
};
FBSD_1.6 {
+ copy_file_range;
fhlink;
fhlinkat;
fhreadlink;
Index: lib/libc/sys/copy_file_range.2
===================================================================
--- lib/libc/sys/copy_file_range.2
+++ lib/libc/sys/copy_file_range.2
@@ -0,0 +1,150 @@
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2019 Rick Macklem
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd June 9, 2019
+.Dt COPY_FILE_RANGE 2
+.Os
+.Sh NAME
+.Nm copy_file_range
+.Nd kernel copy of a byte range from one file to another
+.Sh LIBRARY
+.Lb libc
+.Sh SYNOPSIS
+.In sys/types.h
+.In unistd.h
+.Ft ssize_t
+.Fo copy_file_range
+.Fa "int infd"
+.Fa "off_t *inoffp"
+.Fa "int outfd"
+.Fa "off_t *outoffp"
+.Fa "size_t len"
+.Fa "unsigned int flags"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn copy_file_range
+system call
+copies
+.Fa len
+bytes from
+.Fa infd
+to
+.Fa outfd
+in the kernel.
+It may do this using a file system specific technique if
+.Fa infd
+and
+.Fa outfd
+are on the same file system.
+The
+.Fa infd
+argument must be opened for reading and the
+.Fa outfd
+argument must be opened for writing, but not
+.Dv O_APPEND .
+If
+.Fa inoffp
+or
+.Fa outoffp
+is
+.Dv NULL ,
+the file offset for
+.Fa infd
+or
+.Fa outfd
+respectively will be used and updated by
+the number of bytes copied.
+If
+.Fa inoffp
+or
+.Fa outoffp
+is not
+.Dv NULL ,
+the byte offset pointed to by
+.Fa inoffp
+or
+.Fa outoffp
+respectively will be used/updated and the file offset for
+.Fa infd
+or
+.Fa outfd
+respectively will not be affected.
+The
+.Fa flags
+argument is currently ignored and should be set to 0.
+.Sh RETURN VALUES
+If it succeeds, the call returns the number of bytes copied, which can be less
+than
+.Fa len .
+Therefore,
+.Fn copy_file_range
+should be called in a loop until the desired byte range has been copied.
+If an error occurs, the value \-1 is returned and the error code is stored in
+the global variable
+.Va errno .
+.Sh ERRORS
+The
+.Fn copy_file_range
+system call
+will fail if:
+.Bl -tag -width Er
+.It Bq Er EBADF
+The
+.Fa infd
+argument is not open for reading, the
+.Fa outfd
+argument is not open for writing or is open for writing with
+.Dv O_APPEND ,
+or
+.Fa infd
+and
+.Fa outfd
+refer to the same file.
+.It Bq Er EFBIG
+The copy exceeds the process's file size limit or the maximum file size
+for the file system
+.Fa outfd
+resides on.
+.It Bq Er EINVAL
+The initial offset for
+.Fa infd
+plus
+.Fa len
+exceeds the end of file for
+.Fa infd .
+.It Bq Er EIO
+An I/O error occurred while reading/writing the files.
+.It Bq Er EISDIR
+Either
+.Fa infd
+or
+.Fa outfd
+refers to a directory.
+.El
+.Sh STANDARDS
+The
+.Fn copy_file_range
+system call is expected to be compatible with the Linux system call of
+the same name.
+.Sh HISTORY
+The
+.Fn copy_file_range
+system call first appeared in
+.Fx 13.0 .
Index: sys/kern/syscalls.master
===================================================================
--- sys/kern/syscalls.master
+++ sys/kern/syscalls.master
@@ -3175,6 +3175,16 @@
int flag
);
}
+; copy_file_range(2): copy a byte range from infd to outfd in the kernel.
+; Either offset pointer may be NULL (hence _Inout_opt_), in which case the
+; descriptor's own file offset is used and updated instead.
+568 AUE_NULL STD {
+ ssize_t copy_file_range(
+ int infd,
+ _Inout_opt_ off_t *inoffp,
+ int outfd,
+ _Inout_opt_ off_t *outoffp,
+ size_t len,
+ unsigned int flags
+ );
+ }
; Please copy any additions and changes to the following compatability tables:
; sys/compat/freebsd32/syscalls.master
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -140,6 +140,7 @@
.vop_set_text = vop_stdset_text,
.vop_unset_text = vop_stdunset_text,
.vop_add_writecount = vop_stdadd_writecount,
+ .vop_copy_file_range = VOP_EOPNOTSUPP,
};
/*
Index: sys/kern/vfs_syscalls.c
===================================================================
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -4814,3 +4814,78 @@
uap->advice);
return (kern_posix_error(td, error));
}
+
+/*
+ * Kernel entry point for copy_file_range(2).
+ *
+ * Copies up to len bytes from the file referenced by infd to the file
+ * referenced by outfd.  inoffp and outoffp point at kernel copies of the
+ * starting offsets; when either is NULL the corresponding descriptor's own
+ * f_offset is used and advanced instead.  flags is passed through to
+ * vn_copy_file_range() unmodified.
+ *
+ * On return td->td_retval[0] holds the number of bytes copied (0 on error).
+ * Returns 0 on success or an errno value:
+ *   - whatever fget_read()/fget_write() report for a bad descriptor,
+ *   - EINVAL when either descriptor is not backed by a vnode,
+ *   - EBADF when outfd is not open for plain (non-append) writing, infd is
+ *     not open for reading, or both descriptors refer to the same vnode,
+ *   - any error returned by vn_copy_file_range().
+ */
+int
+kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
+    off_t *outoffp, size_t len, unsigned int flags)
+{
+	struct file *infp, *outfp;
+	struct vnode *invp, *outvp;
+	int error;
+	size_t retlen;
+
+	infp = outfp = NULL;
+	retlen = 0;
+
+	/* Get the file structures for the file descriptors. */
+	error = fget_read(td, infd, &cap_read_rights, &infp);
+	if (error != 0)
+		goto out;
+	error = fget_write(td, outfd, &cap_write_rights, &outfp);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Descriptors that are not backed by a vnode (sockets, pipes, ...)
+	 * have a NULL f_vnode.  Reject them here, since vn_copy_file_range()
+	 * dereferences both vnodes unconditionally.
+	 */
+	invp = infp->f_vnode;
+	outvp = outfp->f_vnode;
+	if (invp == NULL || outvp == NULL) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/* Set the offset pointers to the correct place. */
+	if (inoffp == NULL)
+		inoffp = &infp->f_offset;
+	if (outoffp == NULL)
+		outoffp = &outfp->f_offset;
+
+	/* Sanity check the f_flag bits. */
+	if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
+	    (infp->f_flag & FREAD) == 0 || invp == outvp) {
+		error = EBADF;
+		goto out;
+	}
+
+	/* retlen is in/out: requested length in, bytes copied out. */
+	retlen = len;
+	error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
+	    flags);
+out:
+	if (outfp != NULL)
+		fdrop(outfp, td);
+	if (infp != NULL)
+		fdrop(infp, td);
+	td->td_retval[0] = retlen;
+	return (error);
+}
+
+/*
+ * System call wrapper for copy_file_range(2).
+ *
+ * Brings the optional user-space offsets into the kernel, hands the request
+ * to kern_copy_file_range(), and, on success, writes the updated offsets
+ * back to user space.  A NULL user offset pointer is passed on as NULL.
+ * Returns 0 or the first error reported by copyin(), kern_copy_file_range()
+ * or copyout().
+ */
+int
+sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
+{
+	off_t kinoff, koutoff;
+	off_t *kinoffp, *koutoffp;
+	int error;
+
+	kinoffp = NULL;
+	koutoffp = NULL;
+
+	/* Fetch whichever starting offsets the caller supplied. */
+	if (uap->inoffp != NULL) {
+		error = copyin(uap->inoffp, &kinoff, sizeof(off_t));
+		if (error != 0)
+			return (error);
+		kinoffp = &kinoff;
+	}
+	if (uap->outoffp != NULL) {
+		error = copyin(uap->outoffp, &koutoff, sizeof(off_t));
+		if (error != 0)
+			return (error);
+		koutoffp = &koutoff;
+	}
+
+	error = kern_copy_file_range(td, uap->infd, kinoffp, uap->outfd,
+	    koutoffp, uap->len, uap->flags);
+	if (error != 0)
+		return (error);
+
+	/* Report the advanced offsets back to the caller. */
+	if (uap->inoffp != NULL) {
+		error = copyout(kinoffp, uap->inoffp, sizeof(off_t));
+		if (error != 0)
+			return (error);
+	}
+	if (uap->outoffp != NULL)
+		error = copyout(koutoffp, uap->outoffp, sizeof(off_t));
+	return (error);
+}
Index: sys/kern/vfs_vnops.c
===================================================================
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -61,6 +61,7 @@
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mman.h>
+#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
@@ -89,6 +90,8 @@
#include <vm/vm_page.h>
#include <vm/vnode_pager.h>
+#include <machine/vmparam.h>
+
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
@@ -2494,6 +2497,316 @@
va->va_fsid += (uint32_t)f->val[0];
}
+/*
+ * Check whether the len bytes starting at dat are all zero.
+ * Returns 0 when every byte is zero (or len is not positive) and 1 as soon
+ * as a non-zero byte is found.
+ * The buffer is scanned a u_int at a time, so dat is expected to be
+ * suitably aligned for u_int access; any tail shorter than a u_int is
+ * checked byte by byte.
+ */
+static int
+mem_iszero(void *dat, int len)
+{
+	const u_int *wordp;
+	const char *bytep;
+	int left;
+
+	wordp = (const u_int *)dat;
+	left = len;
+
+	/* Whole words first. */
+	while (left >= (int)sizeof(u_int)) {
+		if (*wordp != 0)
+			return (1);
+		wordp++;
+		left -= (int)sizeof(u_int);
+	}
+
+	/* Then whatever is left over, one byte at a time. */
+	bytep = (const char *)wordp;
+	while (left > 0) {
+		if (*bytep != '\0')
+			return (1);
+		bytep++;
+		left--;
+	}
+	return (0);
+}
+
+/*
+ * Write xfer bytes from the kernel buffer dat to outvp at byte offset
+ * outoff, on behalf of vn_copy_file_range().
+ * Brackets the write with vn_start_write()/vn_finished_write() and takes
+ * the vnode lock (shared when the mount allows shared writes, exclusive
+ * otherwise) around vn_rdwr().
+ * Returns 0 or the first error encountered.
+ */
+static int
+vn_copy_write_outvp(struct vnode *outvp, char *dat, off_t outoff, size_t xfer,
+    struct thread *td)
+{
+	struct mount *mp;
+	int error, lckf;
+
+	mp = NULL;
+	error = vn_start_write(outvp, &mp, V_WAIT);
+	if (error == 0) {
+		if (MNT_SHARED_WRITES(mp))
+			lckf = LK_SHARED;
+		else
+			lckf = LK_EXCLUSIVE;
+		error = vn_lock(outvp, lckf);
+	}
+	if (error == 0) {
+		error = vn_rdwr(UIO_WRITE, outvp, dat, xfer, outoff,
+		    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NULL, NULL,
+		    td);
+		VOP_UNLOCK(outvp, 0);
+	}
+	if (mp != NULL)
+		vn_finished_write(mp);
+	return (error);
+}
+
+/*
+ * Copy a byte range from invp to outvp.
+ *
+ * invp/outvp:     source and destination vnodes; both must be regular
+ *                 files and must differ.
+ * inoffp/outoffp: in/out byte offsets; advanced by the amount copied.
+ * lenp:           in: number of bytes requested; out: number of bytes
+ *                 copied (0 when an error is returned).
+ * flags:          passed to VOP_COPY_FILE_RANGE(); otherwise unused here.
+ *
+ * When both vnodes are on the same mount, VOP_COPY_FILE_RANGE() is tried
+ * first.  If that fails (or the mounts differ) the copy is done here with a
+ * bounce buffer, trying to preserve holes: FIOSEEKDATA/FIOSEEKHOLE are used
+ * when the input file system supports them, otherwise blocks that are all
+ * zero and lie past the output file's initial EOF are skipped.
+ *
+ * Returns 0 on success, EISDIR if either vnode is a directory, EINVAL for
+ * bad offsets/lengths, non-regular files, or a range that extends past EOF
+ * of invp, or any error from the underlying VOP/vn_* calls.
+ */
+int
+vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
+    off_t *outoffp, size_t *lenp, unsigned int flags)
+{
+	struct vattr va;
+	struct mount *mp;
+	off_t startoff, endoff, zerooff;
+	u_long blksize;
+	int error;
+	bool cantseek;
+	ssize_t aresid;
+	size_t copylen, len, savlen, xfer, xfer2, zxfer;
+	char *dat;
+	uint64_t uvalin, uvalout;
+	long holein, holeout;
+	void *rl_rcookie, *rl_wcookie;
+	struct thread *td = curthread;
+
+	savlen = len = *lenp;
+	*lenp = 0;		/* Return 0 len for errors. */
+	error = 0;
+	dat = NULL;
+	rl_rcookie = rl_wcookie = NULL;
+
+	/* Do some sanity checks on the arguments. */
+	uvalin = *inoffp;
+	uvalin += len;
+	uvalout = *outoffp;
+	uvalout += len;
+	if (invp->v_type == VDIR || outvp->v_type == VDIR)
+		error = EISDIR;
+	else if (*inoffp < 0 || uvalin > INT64_MAX || uvalin <
+	    (uint64_t)*inoffp || *outoffp < 0 || uvalout > INT64_MAX ||
+	    uvalout < (uint64_t)*outoffp || invp->v_type != VREG ||
+	    outvp->v_type != VREG || invp == outvp)
+		error = EINVAL;
+	if (error != 0)
+		goto out;
+
+	error = vn_lock(invp, LK_SHARED);
+	if (error != 0)
+		goto out;
+	if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
+		holein = 0;
+
+	/* Check that the offset + len does not go past EOF of invp. */
+	if (error == 0)
+		error = VOP_GETATTR(invp, &va, td->td_ucred);
+	if (error == 0 && va.va_size < (*inoffp + len))
+		error = EINVAL;
+	VOP_UNLOCK(invp, 0);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Range lock the byte ranges for both invp and outvp.
+	 * The read lock is only tried while the write lock is held; on
+	 * failure both are dropped, the read lock is waited for and then
+	 * released, and the whole sequence is retried.
+	 */
+	for (;;) {
+		rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
+		    len);
+		rl_rcookie = vn_rangelock_rlock_trylock(invp, *inoffp,
+		    *inoffp + len);
+		if (rl_rcookie != NULL)
+			break;
+		vn_rangelock_unlock(outvp, rl_wcookie);
+		rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
+		vn_rangelock_unlock(invp, rl_rcookie);
+	}
+
+	/*
+	 * If the two vnodes are for the same file system, try the
+	 * VOP_COPY_FILE_RANGE() call first, but do it here if the VOP
+	 * call fails.
+	 */
+	if (invp->v_mount == outvp->v_mount) {
+		/*
+		 * *lenp was zeroed above for the error paths; restore the
+		 * requested length so the VOP is not asked to copy 0 bytes.
+		 */
+		*lenp = len;
+		error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
+		    lenp, flags);
+		if (error == 0)
+			goto out;
+		*lenp = 0;
+	}
+
+	mp = NULL;
+	error = vn_start_write(outvp, &mp, V_WAIT);
+	if (error == 0)
+		error = vn_lock(outvp, LK_EXCLUSIVE);
+	if (error == 0) {
+		if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
+			holeout = 0;
+		/*
+		 * Holes that are past EOF do not need to be written as a block
+		 * of zero bytes.  So, truncate the output file as far as
+		 * possible and then use va.va_size to decide if writing 0
+		 * bytes is necessary in the loop below.
+		 * From here on va describes outvp, not invp.
+		 */
+		error = VOP_GETATTR(outvp, &va, td->td_ucred);
+		if (error == 0 && va.va_size > *outoffp && va.va_size <=
+		    *outoffp + len) {
+			VATTR_NULL(&va);
+			va.va_size = *outoffp;
+			error = VOP_SETATTR(outvp, &va, td->td_ucred);
+		}
+		VOP_UNLOCK(outvp, 0);
+	}
+	if (mp != NULL)
+		vn_finished_write(mp);
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Set the blksize to the larger of the hole sizes for invp and outvp.
+	 * If hole sizes aren't available, set the blksize to the larger
+	 * f_iosize of invp and outvp.
+	 * This code expects the hole sizes and f_iosizes to be powers of 2.
+	 * This value is clipped at 4Kbytes and 1Mbyte.
+	 */
+	if (holein > 0 && holeout > 0) {
+		if (holein > holeout)
+			blksize = holein;
+		else
+			blksize = holeout;
+	} else if (invp->v_mount->mnt_stat.f_iosize >
+	    outvp->v_mount->mnt_stat.f_iosize)
+		blksize = invp->v_mount->mnt_stat.f_iosize;
+	else
+		blksize = outvp->v_mount->mnt_stat.f_iosize;
+	if (blksize < 4096)
+		blksize = 4096;
+	else if (blksize > 1048576)
+		blksize = 1048576;
+	dat = malloc(blksize, M_TEMP, M_WAITOK);
+
+	/*
+	 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
+	 * to find holes.  Otherwise, just scan the read block for all 0s
+	 * in the inner loop where the data copying is done.
+	 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
+	 * support holes on the server, but do not support FIOSEEKHOLE.
+	 */
+	while (len > 0 && error == 0) {
+		endoff = 0;	/* To shut up compilers. */
+
+		/*
+		 * Find the next data area.  If there is just a hole to EOF,
+		 * FIOSEEKDATA should fail and then we drop down into the
+		 * inner loop and create the hole on the outvp file.
+		 * (I do not know if any file system will report a hole to
+		 * EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
+		 * will fail for those file systems.)
+		 *
+		 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
+		 * the code just falls through to the inner copy loop.
+		 */
+		cantseek = true;
+		startoff = *inoffp;
+		copylen = len;
+		error = EINVAL;
+		if (holein > 0)
+			error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
+			    td->td_ucred, td);
+		if (error == 0) {
+			endoff = startoff;
+			error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
+			    td->td_ucred, td);
+			/*
+			 * An empty data area would leave the loop below
+			 * without any progress; treat it like "cannot seek".
+			 */
+			if (error == 0 && endoff <= startoff)
+				error = EINVAL;
+		}
+		if (error == 0) {
+			if (startoff > *inoffp) {
+				/*
+				 * Found hole before data block.  Never skip
+				 * past the end of the requested range.
+				 */
+				if ((uint64_t)(startoff - *inoffp) > len)
+					xfer = len;
+				else
+					xfer = startoff - *inoffp;
+				if (*outoffp < va.va_size) {
+					/*
+					 * Must write 0s to punch hole.  The
+					 * bounce buffer is only blksize bytes,
+					 * so write the zeros in chunks.
+					 */
+					if (va.va_size - *outoffp < xfer)
+						xfer2 = va.va_size - *outoffp;
+					else
+						xfer2 = xfer;
+					zxfer = blksize;
+					if (zxfer > xfer2)
+						zxfer = xfer2;
+					memset(dat, 0, zxfer);
+					zerooff = *outoffp;
+					while (xfer2 > 0 && error == 0) {
+						if (zxfer > xfer2)
+							zxfer = xfer2;
+						error = vn_copy_write_outvp(
+						    outvp, dat, zerooff, zxfer,
+						    td);
+						zerooff += zxfer;
+						xfer2 -= zxfer;
+					}
+				} else if (xfer == len) {
+					/*
+					 * The hole ends the requested range
+					 * and lies past the output file's EOF.
+					 * Write the final byte so the output
+					 * file grows to the full length while
+					 * the rest stays a hole.
+					 */
+					dat[0] = '\0';
+					error = vn_copy_write_outvp(outvp, dat,
+					    *outoffp + xfer - 1, 1, td);
+				}
+				if (error == 0) {
+					*inoffp += xfer;
+					*outoffp += xfer;
+					len -= xfer;
+				}
+			}
+			copylen = len;
+			if (copylen > endoff - startoff)
+				copylen = endoff - startoff;
+			cantseek = false;
+		} else {
+			/*
+			 * FIOSEEKDATA may have moved startoff before
+			 * FIOSEEKHOLE failed; restart from the current
+			 * input offset so reads and writes stay in step.
+			 */
+			startoff = *inoffp;
+			error = 0;
+		}
+
+		xfer = blksize;
+		if (cantseek) {
+			/*
+			 * Set first xfer to end at a block boundary, so that
+			 * holes are more likely detected in the loop below via
+			 * the for all bytes 0 method.
+			 */
+			xfer -= (*inoffp % blksize);
+		}
+		/* Loop copying the data block. */
+		while (copylen > 0 && error == 0) {
+			if (copylen < xfer)
+				xfer = copylen;
+			error = vn_lock(invp, LK_SHARED);
+			if (error != 0)
+				goto out;
+			error = vn_rdwr(UIO_READ, invp, dat, xfer, startoff,
+			    UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NULL,
+			    &aresid, td);
+			VOP_UNLOCK(invp, 0);
+			/*
+			 * Linux considers a range that exceeds EOF to be an
+			 * error, so we will too.
+			 */
+			if (error == 0 && aresid > 0)
+				error = EINVAL;
+			if (error == 0) {
+				/*
+				 * Skip the write for holes past the initial EOF
+				 * of the output file, unless this is the last
+				 * write of the output file at EOF.
+				 */
+				if (!cantseek || *outoffp < va.va_size ||
+				    xfer == len || mem_iszero(dat, xfer) != 0)
+					error = vn_copy_write_outvp(outvp, dat,
+					    *outoffp, xfer, td);
+				if (error == 0) {
+					*inoffp += xfer;
+					startoff += xfer;
+					*outoffp += xfer;
+					copylen -= xfer;
+					len -= xfer;
+				}
+			}
+			xfer = blksize;
+		}
+	}
+	if (error == 0)
+		*lenp = savlen - len;
+out:
+	if (rl_rcookie != NULL)
+		vn_rangelock_unlock(invp, rl_rcookie);
+	if (rl_wcookie != NULL)
+		vn_rangelock_unlock(outvp, rl_wcookie);
+	free(dat, M_TEMP);
+	return (error);
+}
+
int
vn_fsync_buf(struct vnode *vp, int waitfor)
{
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -718,6 +718,19 @@
};
+# Copy a byte range from invp to outvp.  The two offsets and the length are
+# passed by reference as INOUT arguments.
+# NOTE(review): the "U U U" annotations presumably declare that each vnode
+# is unlocked on entry, on exit and on error -- confirm against the key at
+# the top of this file.
+%% copy_file_range invp U U U
+%% copy_file_range outvp U U U
+
+vop_copy_file_range {
+ IN struct vnode *invp;
+ INOUT off_t *inoffp;
+ IN struct vnode *outvp;
+ INOUT off_t *outoffp;
+ INOUT size_t *lenp;
+ IN u_int flags;
+};
+
+
# The VOPs below are spares at the end of the table to allow new VOPs to be
# added in stable branches without breaking the KBI. New VOPs in HEAD should
# be added above these spares. When merging a new VOP to a stable branch,
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h
+++ sys/sys/syscallsubr.h
@@ -94,6 +94,8 @@
int kern_close(struct thread *td, int fd);
int kern_connectat(struct thread *td, int dirfd, int fd,
struct sockaddr *sa);
+int kern_copy_file_range(struct thread *td, int infd, off_t *inoffp,
+ int outfd, off_t *outoffp, size_t len, unsigned int flags);
int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level,
cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp);
int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level,
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -667,6 +667,9 @@
struct ucred *cred);
int vn_close(struct vnode *vp,
int flags, struct ucred *file_cred, struct thread *td);
+int vn_copy_file_range(struct vnode *invp, off_t *inoffp,
+ struct vnode *outvp, off_t *outoffp, size_t *lenp,
+ unsigned int flags);
void vn_finished_write(struct mount *mp);
void vn_finished_secondary_write(struct mount *mp);
int vn_fsync_buf(struct vnode *vp, int waitfor);

File Metadata

Mime Type
text/plain
Expires
Sun, Oct 19, 2:32 PM (10 h, 51 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
23928818
Default Alt Text
D20584.id58701.diff (18 KB)

Event Timeline