Index: lib/libc/sys/Makefile.inc =================================================================== --- lib/libc/sys/Makefile.inc +++ lib/libc/sys/Makefile.inc @@ -189,6 +189,7 @@ fhreadlink.2 \ flock.2 \ fork.2 \ + fspacectl.2 \ fsync.2 \ getdirentries.2 \ getdtablesize.2 \ Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -417,6 +417,10 @@ shm_rename; }; +FBSD_1.7 { + fspacectl; +}; + FBSDprivate_1.0 { ___acl_aclcheck_fd; __sys___acl_aclcheck_fd; Index: lib/libc/sys/fspacectl.2 =================================================================== --- /dev/null +++ lib/libc/sys/fspacectl.2 @@ -0,0 +1,189 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD +.\" +.\" Copyright (c) 2021 The FreeBSD Foundation +.\" +.\" This manual page was written by Ka Ho Ng under sponsorship from +.\" the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd August 4, 2021 +.Dt FSPACECTL 2 +.Os +.Sh NAME +.Nm fspacectl +.Nd space management in a file +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In fcntl.h +.Ft int +.Fo fspacectl +.Fa "int fd" +.Fa "int cmd" +.Fa "const struct spacectl_range *rqsr" +.Fa "int flags" +.Fa "struct spacectl_range *rmsr" +.Fc +.Sh DESCRIPTION +.Nm +is a system call performing space management over a file. +The +.Fa fd +argument specifies the file descriptor to be operated on by the +.Fa cmd +argument. +The +.Fa rqsr +argument points to a +.Fa spacectl_range +structure that contains the requested operation range. +The +.Fa flags +argument controls the behavior of the operation to take place. +If the +.Fa rmsr +argument is non-NULL, the +.Fa spacectl_range +structure it points to is updated to contain the unprocessed operation range +after the system call returns. +Both +.Fa rqsr +and +.Fa rmsr +arguments can point to the same structure. +.Pp +The +.Fa spacectl_range +structure is defined as: +.Bd -literal +struct spacectl_range { + off_t r_offset; + off_t r_len; +}; +.Ed +.Pp +The operation specified by the +.Fa cmd +argument may be one of: +.Bl -tag -width SPACECTL_DEALLOC +.It Dv SPACECTL_DEALLOC +Zero a region in the file specified by the +.Fa rqsr +argument. +The +.Va "rqsr->r_offset" +has to be a value greater than or equal to 0, and the +.Va "rqsr->r_len" +has to be a value greater than 0. 
+.Pp +If the file system supports hole-punching, +file system space deallocation may be performed in the given region. +.El +.Pp +The +.Fa flags +argument needs to be the value 0 currently. +.Sh RETURN VALUES +Upon successful completion, the value 0 is returned; +otherwise the value -1 is returned and +.Va errno +is set to indicate the error. +.Sh ERRORS +Possible failure conditions: +.Bl -tag -width Er +.It Bq Er EBADF +The +.Fa fd +argument is not a valid file descriptor. +.It Bq Er EBADF +The +.Fa fd +argument references a file that was opened without write permission. +.It Bq Er EINTR +A signal was caught during execution. +.It Bq Er EINVAL +The +.Fa cmd +argument is not valid. +.It Bq Er EINVAL +If the +.Fa cmd +argument is +.Dv SPACECTL_DEALLOC , +either the +.Fa "rqsr->r_offset" +argument was less than zero, or the +.Fa "rqsr->r_len" +argument was less than or equal to zero. +.It Bq Er EINVAL +An invalid or unsupported flag is included in +.Fa flags . +.It Bq Er EINVAL +A flag included in +.Fa flags +is not supported by the operation specified by the +.Fa cmd +argument. +.It Bq Er EFAULT +The +.Fa rqsr +or +.Fa rmsr +argument points outside the process' allocated address space. +.It Bq Er EIO +An I/O error occurred while reading from or writing to a file system. +.It Bq Er EINTEGRITY +Corrupted data was detected while reading from the file system. +.It Bq Er ENODEV +The +.Fa fd +argument does not refer to a file that supports +.Nm . +.It Bq Er ENOSPC +There is insufficient free space remaining on the file system storage +media. +.It Bq Er ENOTCAPABLE +The file descriptor +.Fa fd +has insufficient rights. +.It Bq Er ESPIPE +The +.Fa fd +argument is associated with a pipe or FIFO. +.El +.Sh SEE ALSO +.Xr creat 2 , +.Xr ftruncate 2 , +.Xr open 2 , +.Xr unlink 2 +.Sh HISTORY +The +.Nm +system call appeared in +.Fx 14.0 . +.Sh AUTHORS +.Nm +and this manual page were written by +.An Ka Ho Ng Aq Mt khng@FreeBSD.org +under sponsorship from the FreeBSD Foundation. 
Index: lib/libc/sys/pathconf.2 =================================================================== --- lib/libc/sys/pathconf.2 +++ lib/libc/sys/pathconf.2 @@ -166,6 +166,10 @@ +.It Li _PC_DEALLOC_PRESENT +Return 1 if a file system supports hole-punching (see +.Xr fspacectl 2 ) , +otherwise 0. .It Li _PC_MIN_HOLE_SIZE If a file system supports the reporting of holes (see .Xr lseek 2 ) , .Fn pathconf and .Fn fpathconf Index: share/man/man9/Makefile =================================================================== --- share/man/man9/Makefile +++ share/man/man9/Makefile @@ -404,6 +404,7 @@ vm_page_wire.9 \ vm_set_page_size.9 \ vmem.9 \ + vn_deallocate.9 \ vn_fullpath.9 \ vn_isdisk.9 \ vnet.9 \ @@ -420,6 +421,7 @@ VOP_BWRITE.9 \ VOP_COPY_FILE_RANGE.9 \ VOP_CREATE.9 \ + VOP_DEALLOCATE.9 \ VOP_FSYNC.9 \ VOP_GETACL.9 \ VOP_GETEXTATTR.9 \ Index: share/man/man9/VOP_DEALLOCATE.9 =================================================================== --- /dev/null +++ share/man/man9/VOP_DEALLOCATE.9 @@ -0,0 +1,101 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD +.\" +.\" Copyright (c) 2021 The FreeBSD Foundation +.\" +.\" This manual page was written by Ka Ho Ng under sponsorship from +.\" the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd May 11, 2021 +.Dt VOP_DEALLOCATE 9 +.Os +.Sh NAME +.Nm VOP_DEALLOCATE +.Nd zero and/or deallocate storage from a file +.Sh SYNOPSIS +.In sys/param.h +.In sys/vnode.h +.Ft int +.Fo VOP_DEALLOCATE +.Fa "struct vnode *vp" +.Fa "off_t *offset" +.Fa "off_t *len" +.Fa "int flags" +.Fa "struct ucred *cred" +.Fc +.Sh DESCRIPTION +This VOP call zeroes/deallocates storage for an offset range in a file. +It is used to implement the +.Xr fspacectl 2 +system call. +.Pp +Its arguments are: +.Bl -tag -width offset +.It Fa vp +The vnode of the file. +.It Fa offset +The start of the range to deallocate storage in the file. +.It Fa len +The length of the range to deallocate storage in the file. +.It Fa flags +The flags of this call. +This should be set to 0 for now. +.It Fa cred +The credentials of the caller. +.El +.Pp +.Fa *offset +and +.Fa *len +are updated to reflect the portion of the range that +still needs to be zeroed/deallocated on return. +Partial result is considered a successful operation. +.Sh LOCKS +The vnode should be locked on entry and will still be locked on exit. +.Sh RETURN VALUES +Zero is returned if the call is successful, otherwise an appropriate +error code is returned. +.Sh ERRORS +.Bl -tag -width Er +.It Bq Er EINVAL +Invalid +.Fa offset , len +or +.Fa flags +parameters are passed into this VOP call. +.It Bq Er ENODEV +The vnode type is not supported by this VOP call. +.It Bq Er ENOSPC +The file system is full. 
+.It Bq Er EPERM +An append-only flag is set on the file, but the caller is attempting to +zero before the current end of file. +.El +.Sh SEE ALSO +.Xr vnode 9 +.Sh AUTHORS +.Nm +and this manual page were written by +.An Ka Ho Ng Aq Mt khng@FreeBSD.org +under sponsorship from the FreeBSD Foundation. Index: share/man/man9/vn_deallocate.9 =================================================================== --- /dev/null +++ share/man/man9/vn_deallocate.9 @@ -0,0 +1,103 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause-FreeBSD +.\" +.\" Copyright (c) 2021 The FreeBSD Foundation +.\" +.\" This manual page was written by Ka Ho Ng under sponsorship from +.\" the FreeBSD Foundation. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.Dd July 30, 2021 +.Dt VN_DEALLOCATE 9 +.Os +.Sh NAME +.Nm vn_deallocate +.Nd zero and/or deallocate storage from a file +.Sh SYNOPSIS +.In sys/param.h +.In sys/vnode.h +.Ft int +.Fo vn_deallocate +.Fa "struct vnode *vp" +.Fa "off_t *offset" +.Fa "off_t *length" +.Fa "int flags" +.Fa "int ioflg" +.Fa "struct ucred *active_cred" +.Fa "struct ucred *file_cred" +.Fc +.Sh DESCRIPTION +The +.Fn vn_deallocate +function zeros and/or deallocates backing storage space from a file. +This function only works on vnodes with +.Dv VREG +type. +.Pp +The arguments are: +.Bl -tag -width active_cred +.It Fa vp +The vnode of the file. +.It Fa offset +The starting offset of the operation range. +.It Fa length +The length of the operation range. +This must be greater than 0. +.It Fa flags +The control flags of the operation. +This should be set to 0 for now. +.It Fa ioflg +The control flags of vnode locking. +.It Fa active_cred +The user credentials of the calling thread. +.It Fa file_cred +The credentials installed on the file description pointing to the vnode or NOCRED. +.El +.Pp +The +.Fa ioflg +argument may be one or more of the following flags: +.Bl -tag -width IO_RANGELOCKED +.It Dv IO_NODELOCKED +The vnode was locked before the call. +.It Dv IO_RANGELOCKED +Rangelock was owned around the call. +.It Dv IO_NOMACCHECK +Skip MAC checking in the call. 
+.El +.Pp +.Fa *offset +and +.Fa *length +are updated to reflect the unprocessed operation range of the call. +.Sh RETURN VALUES +Upon successful completion, the value 0 is returned; otherwise the +appropriate error is returned. +.Sh SEE ALSO +.Xr vnode 9 , +.Xr VOP_DEALLOCATE 9 +.Sh AUTHORS +.Nm +and this manual page were written by +.An Ka Ho Ng Aq Mt khng@FreeBSD.org +under sponsorship from the FreeBSD Foundation. Index: sys/bsm/audit_kevents.h =================================================================== --- sys/bsm/audit_kevents.h +++ sys/bsm/audit_kevents.h @@ -662,6 +662,7 @@ #define AUE_SPECIALFD 43266 /* FreeBSD-specific. */ #define AUE_AIO_WRITEV 43267 /* FreeBSD-specific. */ #define AUE_AIO_READV 43268 /* FreeBSD-specific. */ +#define AUE_FSPACECTL 43269 /* FreeBSD-specific. */ /* * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the Index: sys/compat/freebsd32/freebsd32.h =================================================================== --- sys/compat/freebsd32/freebsd32.h +++ sys/compat/freebsd32/freebsd32.h @@ -435,5 +435,9 @@ uint32_t pc_limit1, pc_limit2; }; +struct spacectl_range32 { + uint32_t r_offset1, r_offset2; + uint32_t r_len1, r_len2; +}; #endif /* !_COMPAT_FREEBSD32_FREEBSD32_H_ */ Index: sys/compat/freebsd32/freebsd32_misc.c =================================================================== --- sys/compat/freebsd32/freebsd32_misc.c +++ sys/compat/freebsd32/freebsd32_misc.c @@ -3857,3 +3857,37 @@ } return (error); } + +int +freebsd32_fspacectl(struct thread *td, struct freebsd32_fspacectl_args *uap) +{ + struct spacectl_range rqsr, rmsr; + struct spacectl_range32 rqsr32, rmsr32; + int error, cerror; + + error = copyin(uap->rqsr, &rqsr32, sizeof(rqsr32)); + if (error != 0) + return (error); + rqsr.r_offset = PAIR32TO64(off_t, rqsr32.r_offset); + rqsr.r_len = PAIR32TO64(off_t, rqsr32.r_len); + + error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags, + &rmsr); + if (uap->rmsr != NULL) { +#if BYTE_ORDER 
== LITTLE_ENDIAN + rmsr32.r_offset1 = rmsr.r_offset; + rmsr32.r_offset2 = rmsr.r_offset >> 32; + rmsr32.r_len1 = rmsr.r_len; + rmsr32.r_len2 = rmsr.r_len >> 32; +#else + rmsr32.r_offset1 = rmsr.r_offset >> 32; + rmsr32.r_offset2 = rmsr.r_offset; + rmsr32.r_len1 = rmsr.r_len >> 32; + rmsr32.r_len2 = rmsr.r_len; +#endif + cerror = copyout(&rmsr32, uap->rmsr, sizeof(rmsr32)); + if (cerror != 0) + error = cerror; + } + return (error); +} Index: sys/compat/freebsd32/syscalls.master =================================================================== --- sys/compat/freebsd32/syscalls.master +++ sys/compat/freebsd32/syscalls.master @@ -1176,5 +1176,10 @@ struct aiocb32 *aiocbp); } 579 AUE_AIO_READV STD { int freebsd32_aio_readv( \ struct aiocb32 *aiocbp); } +580 AUE_FSPACECTL STD { int freebsd32_fspacectl(int fd, \ + int cmd, \ + const struct spacectl_range32 *rqsr, \ + int flags, \ + struct spacectl_range32 *rmsr); } ; vim: syntax=off Index: sys/kern/capabilities.conf =================================================================== --- sys/kern/capabilities.conf +++ sys/kern/capabilities.conf @@ -228,6 +228,11 @@ freebsd6_pread freebsd6_pwrite +## +## Allow I/O-related file operations, subject to capability rights. +## +fspacectl + ## ## Allow querying file and file system state with fstat(2) and fstatfs(2), ## subject to capability rights. 
Index: sys/kern/sys_generic.c =================================================================== --- sys/kern/sys_generic.c +++ sys/kern/sys_generic.c @@ -861,6 +861,76 @@ return (error); } +int +sys_fspacectl(struct thread *td, struct fspacectl_args *uap) +{ + struct spacectl_range rqsr, rmsr; + int error, cerror; + + error = copyin(uap->rqsr, &rqsr, sizeof(rqsr)); + if (error != 0) + return (error); + + error = kern_fspacectl(td, uap->fd, uap->cmd, &rqsr, uap->flags, + &rmsr); + if (uap->rmsr != NULL) { + cerror = copyout(&rmsr, uap->rmsr, sizeof(rmsr)); + if (cerror != 0) + error = cerror; + } + return (error); +} + +int +kern_fspacectl(struct thread *td, int fd, int cmd, + const struct spacectl_range *rqsr, int flags, struct spacectl_range *rmsrp) +{ + struct file *fp; + struct spacectl_range rmsr; + int error; + + AUDIT_ARG_FD(fd); + AUDIT_ARG_CMD(cmd); + AUDIT_ARG_FFLAGS(flags); + + if (rqsr == NULL) + return (EINVAL); + rmsr = *rqsr; + if (rmsrp != NULL) + *rmsrp = rmsr; + + if (cmd != SPACECTL_DEALLOC || + rqsr->r_offset < 0 || rqsr->r_len <= 0 || + rqsr->r_offset > OFF_MAX - rqsr->r_len || + (flags & ~SPACECTL_F_SUPPORTED) != 0) + return (EINVAL); + + error = fget_write(td, fd, &cap_pwrite_rights, &fp); + if (error != 0) + return (error); + AUDIT_ARG_FILE(td->td_proc, fp); + if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { + error = ESPIPE; + goto out; + } + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + goto out; + } + + error = fo_fspacectl(fp, cmd, &rmsr.r_offset, &rmsr.r_len, flags, + td->td_ucred, td); + /* fspacectl is not restarted after signals if the file is modified. 
*/ + if (rmsr.r_len != rqsr->r_len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (rmsrp != NULL) + *rmsrp = rmsr; +out: + fdrop(fp, td); + return (error); +} + int kern_specialfd(struct thread *td, int type, void *arg) { Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -3250,6 +3250,15 @@ _Inout_ struct aiocb *aiocbp ); } +580 AUE_FSPACECTL STD { + int fspacectl( + int fd, + int cmd, + _In_ const struct spacectl_range *rqsr, + int flags, + _Out_opt_ struct spacectl_range *rmsr, + ); + } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -93,6 +93,7 @@ static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap); static int vop_stdstat(struct vop_stat_args *ap); static int vop_stdvput_pair(struct vop_vput_pair_args *ap); +static int vop_stddeallocate(struct vop_deallocate_args *ap); /* * This vnode table stores what we want to do if the filesystem doesn't @@ -117,6 +118,7 @@ .vop_advlockasync = vop_stdadvlockasync, .vop_advlockpurge = vop_stdadvlockpurge, .vop_allocate = vop_stdallocate, + .vop_deallocate = vop_stddeallocate, .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, @@ -518,6 +520,7 @@ case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_CAP_PRESENT: + case _PC_DEALLOC_PRESENT: case _PC_INF_PRESENT: case _PC_MAC_PRESENT: *ap->a_retval = 0; @@ -1069,6 +1072,125 @@ return (error); } +static int +vp_zerofill(struct vnode *vp, struct vattr *vap, off_t *offsetp, off_t *lenp, + struct ucred *cred) +{ + int iosize; + int error = 0; + struct iovec aiov; + struct uio auio; + struct thread *td; + off_t offset, len; + + iosize = vap->va_blocksize; + td = curthread; + offset = 
*offsetp; + len = *lenp; + + if (iosize == 0) + iosize = BLKDEV_IOSIZE; + /* If va_blocksize is 512 bytes, iosize will be 4 kilobytes */ + iosize = min(iosize * 8, ZERO_REGION_SIZE); + + while (len > 0) { + int xfersize = iosize; + if (offset % iosize != 0) + xfersize -= offset % iosize; + if (xfersize > len) + xfersize = len; + + aiov.iov_base = __DECONST(void *, zero_region); + aiov.iov_len = xfersize; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset; + auio.uio_resid = xfersize; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + + error = VOP_WRITE(vp, &auio, 0, cred); + if (error != 0) { + len -= xfersize - auio.uio_resid; + offset += xfersize - auio.uio_resid; + break; + } + + len -= xfersize; + offset += xfersize; + } + + *offsetp = offset; + *lenp = len; + return (error); +} + +static int +vop_stddeallocate(struct vop_deallocate_args *ap) +{ + struct vnode *vp; + off_t offset, len; + struct ucred *cred; + int error; + struct vattr va; + off_t noff, xfersize, rem; + + vp = ap->a_vp; + offset = *ap->a_offset; + len = *ap->a_len; + cred = ap->a_cred; + + error = VOP_GETATTR(vp, &va, cred); + if (error) + return (error); + + len = omin(OFF_MAX - offset, *ap->a_len); + while (len > 0) { + noff = offset; + error = vn_bmap_seekhole_locked(vp, FIOSEEKDATA, &noff, cred); + if (error) { + if (error != ENXIO) + /* XXX: Is it okay to fallback further? 
*/ + goto out; + + /* + * No more data region to be filled + */ + len = 0; + error = 0; + break; + } + KASSERT(noff >= offset, ("FIOSEEKDATA going backward")); + if (noff != offset) { + xfersize = omin(noff - offset, len); + len -= xfersize; + offset += xfersize; + if (len == 0) + break; + } + error = vn_bmap_seekhole_locked(vp, FIOSEEKHOLE, &noff, cred); + if (error) + goto out; + + /* Fill zeroes */ + xfersize = rem = omin(noff - offset, len); + error = vp_zerofill(vp, &va, &offset, &rem, cred); + if (error) { + len -= xfersize - rem; + goto out; + } + + len -= xfersize; + if (should_yield()) + break; + } +out: + *ap->a_offset = offset; + *ap->a_len = len; + return (error); +} + int vop_stdadvise(struct vop_advise_args *ap) { Index: sys/kern/vfs_vnops.c =================================================================== --- sys/kern/vfs_vnops.c +++ sys/kern/vfs_vnops.c @@ -106,6 +106,7 @@ static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; static fo_fallocate_t vn_fallocate; +static fo_fspacectl_t vn_fspacectl; struct fileops vnops = { .fo_read = vn_io_fault, @@ -123,6 +124,7 @@ .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = vn_mmap, .fo_fallocate = vn_fallocate, + .fo_fspacectl = vn_fspacectl, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; @@ -3439,6 +3441,114 @@ return (error); } +static int +vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags, + int ioflg, struct ucred *active_cred, struct ucred *file_cred) +{ + struct mount *mp; + void *rl_cookie; + off_t off, len; + int error; +#ifdef AUDIT + bool audited_vnode1 = false; +#endif + + rl_cookie = NULL; + error = 0; + mp = NULL; + off = *offset; + len = *length; + + if ((ioflg & (IO_NODELOCKED|IO_RANGELOCKED)) == 0) + rl_cookie = vn_rangelock_wlock(vp, off, off + len); + while (len > 0 && error == 0) { + /* + * Try to deallocate the longest range in one pass. + * In case a pass takes too long to be executed, it returns + * partial result. 
The residue will be proceeded in the next + * pass. + */ + + if ((ioflg & IO_NODELOCKED) == 0) { + bwillwrite(); + if ((error = vn_start_write(vp, &mp, + V_WAIT | PCATCH)) != 0) + goto out; + vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY); + } +#ifdef AUDIT + if (!audited_vnode1) { + AUDIT_ARG_VNODE1(vp); + audited_vnode1 = true; + } +#endif + +#ifdef MAC + if ((ioflg & IO_NOMACCHECK) == 0) + error = mac_vnode_check_write(active_cred, file_cred, + vp); +#endif + if (error == 0) + error = VOP_DEALLOCATE(vp, &off, &len, flags, + active_cred); + + if ((ioflg & IO_NODELOCKED) == 0) { + VOP_UNLOCK(vp); + if (mp != NULL) { + vn_finished_write(mp); + mp = NULL; + } + } + } +out: + if (rl_cookie != NULL) + vn_rangelock_unlock(vp, rl_cookie); + *offset = off; + *length = len; + return (error); +} + +int +vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, + int ioflg, struct ucred *active_cred, struct ucred *file_cred) +{ + if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset || + flags != 0) + return (EINVAL); + if (vp->v_type != VREG) + return (ENODEV); + + return (vn_deallocate_impl(vp, offset, length, flags, ioflg, + active_cred, file_cred)); +} + +static int +vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags, + struct ucred *active_cred, struct thread *td) +{ + int error; + struct vnode *vp; + + vp = fp->f_vnode; + + if (cmd != SPACECTL_DEALLOC || *offset < 0 || *length <= 0 || + *length > OFF_MAX - *offset || flags != 0) + return (EINVAL); + if (vp->v_type != VREG) + return (ENODEV); + + switch (cmd) { + case SPACECTL_DEALLOC: + error = vn_deallocate_impl(vp, offset, length, flags, 0, + active_cred, fp->f_cred); + break; + default: + panic("vn_fspacectl: unknown cmd %d", cmd); + } + + return (error); +} + static u_long vn_lock_pair_pause_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD, &vn_lock_pair_pause_cnt, 0, Index: sys/kern/vnode_if.src 
=================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -801,6 +801,17 @@ }; +%% deallocate vp L L L + +vop_deallocate { + IN struct vnode *vp; + INOUT off_t *offset; + INOUT off_t *len; + IN int flags; + IN struct ucred *cred; +}; + + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. When merging a new VOP to a stable branch, Index: sys/security/audit/audit_bsm.c =================================================================== --- sys/security/audit/audit_bsm.c +++ sys/security/audit/audit_bsm.c @@ -1091,6 +1091,18 @@ FD_VNODE1_TOKENS; break; + case AUE_FSPACECTL: + if (ARG_IS_VALID(kar, ARG_CMD)) { + tok = au_to_arg32(2, "operation", ar->ar_arg_cmd); + kau_write(rec, tok); + } + if (ARG_IS_VALID(kar, ARG_FFLAGS)) { + tok = au_to_arg32(4, "flags", ar->ar_arg_fflags); + kau_write(rec, tok); + } + FD_VNODE1_TOKENS; + break; + case AUE_RFORK: if (ARG_IS_VALID(kar, ARG_FFLAGS)) { tok = au_to_arg32(1, "flags", ar->ar_arg_fflags); Index: sys/sys/fcntl.h =================================================================== --- sys/sys/fcntl.h +++ sys/sys/fcntl.h @@ -323,6 +323,14 @@ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ }; + +/* + * Space control offset/length description + */ +struct spacectl_range { + off_t r_offset; /* starting offset */ + off_t r_len; /* length */ +}; #endif #if __BSD_VISIBLE @@ -352,6 +360,16 @@ * similar syscalls. 
*/ #define FD_NONE -200 + +/* + * Commands for fspacectl(2) + */ +#define SPACECTL_DEALLOC 1 /* deallocate space */ + +/* + * fspacectl(2) flags + */ +#define SPACECTL_F_SUPPORTED 0 #endif #ifndef _KERNEL @@ -361,6 +379,8 @@ int fcntl(int, int, ...); #if __BSD_VISIBLE int flock(int, int); +int fspacectl(int, int, const struct spacectl_range *, int, + struct spacectl_range *); #endif #if __POSIX_VISIBLE >= 200809 int openat(int, const char *, int, ...); Index: sys/sys/file.h =================================================================== --- sys/sys/file.h +++ sys/sys/file.h @@ -129,6 +129,9 @@ typedef int fo_get_seals_t(struct file *fp, int *flags); typedef int fo_fallocate_t(struct file *fp, off_t offset, off_t len, struct thread *td); +typedef int fo_fspacectl_t(struct file *fp, int cmd, + off_t *offset, off_t *length, int flags, + struct ucred *active_cred, struct thread *td); typedef int fo_flags_t; struct fileops { @@ -150,6 +153,7 @@ fo_add_seals_t *fo_add_seals; fo_get_seals_t *fo_get_seals; fo_fallocate_t *fo_fallocate; + fo_fspacectl_t *fo_fspacectl; fo_flags_t fo_flags; /* DFLAG_* below */ }; @@ -472,6 +476,17 @@ return ((*fp->f_ops->fo_fallocate)(fp, offset, len, td)); } +static __inline int fo_fspacectl(struct file *fp, int cmd, off_t *offset, + off_t *length, int flags, struct ucred *active_cred, struct thread *td) +{ + + if (fp->f_ops->fo_fspacectl == NULL) + return (ENODEV); + return ((*fp->f_ops->fo_fspacectl)(fp, cmd, offset, length, flags, + active_cred, td)); +} + + #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ Index: sys/sys/syscallsubr.h =================================================================== --- sys/sys/syscallsubr.h +++ sys/sys/syscallsubr.h @@ -59,6 +59,7 @@ struct sched_param; union semun; struct sockaddr; +struct spacectl_range; struct stat; struct thr_param; struct timex; @@ -233,6 +234,8 @@ int advice); int kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len); +int kern_fspacectl(struct thread *td, int 
fd, int cmd, + const struct spacectl_range *, int flags, struct spacectl_range *); int kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com, void *data); int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, Index: sys/sys/unistd.h =================================================================== --- sys/sys/unistd.h +++ sys/sys/unistd.h @@ -156,6 +156,7 @@ #define _PC_INF_PRESENT 62 #define _PC_MAC_PRESENT 63 #define _PC_ACL_NFS4 64 +#define _PC_DEALLOC_PRESENT 65 #endif /* From OpenSolaris, used by SEEK_DATA/SEEK_HOLE. */ Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -741,6 +741,8 @@ struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); +int vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags, + int ioflg, struct ucred *active_cred, struct ucred *file_cred); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor); Index: tests/sys/file/Makefile =================================================================== --- tests/sys/file/Makefile +++ tests/sys/file/Makefile @@ -11,6 +11,7 @@ TAP_TESTS_SH+= flock_test PLAIN_TESTS_C+= ftruncate_test PLAIN_TESTS_C+= newfileops_on_fork_test +ATF_TESTS_C+= fspacectl_test PROGS+= flock_helper Index: tests/sys/file/fspacectl_test.c =================================================================== --- /dev/null +++ tests/sys/file/fspacectl_test.c @@ -0,0 +1,338 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2021 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Ka Ho Ng under sponsorship from + * the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include +#include +#include + +static off_t file_max_blocks = 32; +static const char byte_to_fill = 0x5f; + +static int +fill(int fd, off_t offset, off_t len) +{ + int error; + size_t blen; + char *buf; + struct stat statbuf; + blksize_t blocksize; + + if (fstat(fd, &statbuf) == -1) + return (1); + blocksize = statbuf.st_blksize; + error = 0; + buf = malloc(blocksize); + if (buf == NULL) + return (1); + + while (len > 0) { + blen = len < (off_t)blocksize ? 
len : blocksize; + memset(buf, byte_to_fill, blen); + if (pwrite(fd, buf, blen, offset) != (ssize_t)blen) { + error = 1; + break; + } + len -= blen; + offset += blen; + } + + free(buf); + return (error); +} + +static blksize_t +fd_get_blksize(void) +{ + struct statfs statfsbuf; + + if (statfs(".", &statfsbuf) == -1) + return (-1); + return statfsbuf.f_iosize; +} + +static int +check_content_dealloc(int fd, off_t hole_start, off_t hole_len, off_t file_sz) +{ + int error; + size_t blen; + off_t offset, resid; + struct stat statbuf; + char *buf, *sblk; + blksize_t blocksize; + + blocksize = fd_get_blksize(); + if (blocksize == -1) + return (1); + error = 0; + buf = malloc(blocksize * 2); + if (buf == NULL) + return (1); + sblk = buf + blocksize; + + memset(sblk, 0, blocksize); + + if ((uint64_t)hole_start + hole_len > (uint64_t)file_sz) + hole_len = file_sz - hole_start; + + /* + * Check hole is zeroed. + */ + offset = hole_start; + resid = hole_len; + while (resid > 0) { + blen = resid < (off_t)blocksize ? resid : blocksize; + if (pread(fd, buf, blen, offset) != (ssize_t)blen) { + error = 1; + break; + } + if (memcmp(buf, sblk, blen) != 0) { + error = 1; + break; + } + resid -= blen; + offset += blen; + } + + memset(sblk, byte_to_fill, blocksize); + + /* + * Check file region before hole is zeroed. + */ + offset = 0; + resid = hole_start; + while (resid > 0) { + blen = resid < (off_t)blocksize ? resid : blocksize; + if (pread(fd, buf, blen, offset) != (ssize_t)blen) { + error = 1; + break; + } + if (memcmp(buf, sblk, blen) != 0) { + error = 1; + break; + } + resid -= blen; + offset += blen; + } + + /* + * Check file region after hole is zeroed. + */ + offset = hole_start + hole_len; + resid = file_sz - offset; + while (resid > 0) { + blen = resid < (off_t)blocksize ? 
resid : blocksize; + if (pread(fd, buf, blen, offset) != (ssize_t)blen) { + error = 1; + break; + } + if (memcmp(buf, sblk, blen) != 0) { + error = 1; + break; + } + resid -= blen; + offset += blen; + } + + /* + * Check file size matches with expected file size. + */ + if (fstat(fd, &statbuf) == -1) + error = -1; + if (statbuf.st_size != file_sz) + error = -1; + + free(buf); + return (error); +} + +/* + * Check aligned deallocation + */ +ATF_TC_WITHOUT_HEAD(aligned_dealloc); +ATF_TC_BODY(aligned_dealloc, tc) +{ + struct spacectl_range range; + off_t offset, length; + blksize_t blocksize; + int fd; + + ATF_REQUIRE((blocksize = fd_get_blksize()) != -1); + range.r_offset = offset = blocksize; + range.r_len = length = (file_max_blocks - 1) * blocksize - + range.r_offset; + + ATF_REQUIRE((fd = open("sys_fspacectl_testfile", + O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1); + ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0); + ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0); + ATF_CHECK(check_content_dealloc(fd, offset, length, + file_max_blocks * blocksize) == 0); + ATF_REQUIRE(close(fd) == 0); +} + +/* + * Check unaligned deallocation + */ +ATF_TC_WITHOUT_HEAD(unaligned_dealloc); +ATF_TC_BODY(unaligned_dealloc, tc) +{ + struct spacectl_range range; + off_t offset, length; + blksize_t blocksize; + int fd; + + ATF_REQUIRE((blocksize = fd_get_blksize()) != -1); + range.r_offset = offset = blocksize / 2; + range.r_len = length = (file_max_blocks - 1) * blocksize + + blocksize / 2 - offset; + + ATF_REQUIRE((fd = open("sys_fspacectl_testfile", + O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1); + ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0); + ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0); + ATF_CHECK(check_content_dealloc(fd, offset, length, + file_max_blocks * blocksize) == 0); + ATF_REQUIRE(close(fd) == 0); +} + +/* + * Check aligned deallocation from certain offset to OFF_MAX + */ 
+ATF_TC_WITHOUT_HEAD(aligned_dealloc_offmax); +ATF_TC_BODY(aligned_dealloc_offmax, tc) +{ + struct spacectl_range range; + off_t offset, length; + blksize_t blocksize; + int fd; + + ATF_REQUIRE((blocksize = fd_get_blksize()) != -1); + range.r_offset = offset = blocksize; + range.r_len = length = OFF_MAX - offset; + + ATF_REQUIRE((fd = open("sys_fspacectl_testfile", + O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1); + ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0); + ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0); + ATF_CHECK(check_content_dealloc(fd, offset, length, + file_max_blocks * blocksize) == 0); + ATF_REQUIRE(close(fd) == 0); +} + +/* + * Check unaligned deallocation from certain offset to OFF_MAX + */ +ATF_TC_WITHOUT_HEAD(unaligned_dealloc_offmax); +ATF_TC_BODY(unaligned_dealloc_offmax, tc) +{ + struct spacectl_range range; + off_t offset, length; + blksize_t blocksize; + int fd; + + ATF_REQUIRE((blocksize = fd_get_blksize()) != -1); + range.r_offset = offset = blocksize / 2; + range.r_len = length = OFF_MAX - offset; + + ATF_REQUIRE((fd = open("sys_fspacectl_testfile", + O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1); + ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0); + ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0); + ATF_CHECK(check_content_dealloc(fd, offset, length, + file_max_blocks * blocksize) == 0); + ATF_REQUIRE(close(fd) == 0); +} + +/* + * Check aligned deallocation around EOF + */ +ATF_TC_WITHOUT_HEAD(aligned_dealloc_eof); +ATF_TC_BODY(aligned_dealloc_eof, tc) +{ + struct spacectl_range range; + off_t offset, length; + blksize_t blocksize; + int fd; + + ATF_REQUIRE((blocksize = fd_get_blksize()) != -1); + range.r_offset = offset = blocksize; + range.r_len = length = (file_max_blocks + 1) * blocksize - + range.r_offset; + + ATF_REQUIRE((fd = open("sys_fspacectl_testfile", + O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1); + ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0); + 
ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0); + ATF_CHECK(check_content_dealloc(fd, offset, length, + file_max_blocks * blocksize) == 0); + ATF_REQUIRE(close(fd) == 0); +} + +/* + * Check unaligned deallocation around EOF + */ +ATF_TC_WITHOUT_HEAD(unaligned_dealloc_eof); +ATF_TC_BODY(unaligned_dealloc_eof, tc) +{ + struct spacectl_range range; + off_t offset, length; + blksize_t blocksize; + int fd; + + ATF_REQUIRE((blocksize = fd_get_blksize()) != -1); + range.r_offset = offset = blocksize / 2; + range.r_len = length = file_max_blocks * blocksize + blocksize / 2 - + range.r_offset; + + ATF_REQUIRE((fd = open("sys_fspacectl_testfile", + O_CREAT | O_RDWR | O_TRUNC, 0600)) != -1); + ATF_REQUIRE(fill(fd, 0, file_max_blocks * blocksize) == 0); + ATF_CHECK(fspacectl(fd, SPACECTL_DEALLOC, &range, 0, &range) == 0); + ATF_CHECK(check_content_dealloc(fd, offset, length, + file_max_blocks * blocksize) == 0); + ATF_REQUIRE(close(fd) == 0); +} + +ATF_TP_ADD_TCS(tp) +{ + ATF_TP_ADD_TC(tp, aligned_dealloc); + ATF_TP_ADD_TC(tp, unaligned_dealloc); + ATF_TP_ADD_TC(tp, aligned_dealloc_eof); + ATF_TP_ADD_TC(tp, unaligned_dealloc_eof); + ATF_TP_ADD_TC(tp, aligned_dealloc_offmax); + ATF_TP_ADD_TC(tp, unaligned_dealloc_offmax); + + return atf_no_error(); +}