Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -410,6 +410,7 @@ fhlink; fhlinkat; fhreadlink; + fspacectl; getfhat; funlinkat; memfd_create; Index: sys/bsm/audit_kevents.h =================================================================== --- sys/bsm/audit_kevents.h +++ sys/bsm/audit_kevents.h @@ -662,6 +662,7 @@ #define AUE_SPECIALFD 43266 /* FreeBSD-specific. */ #define AUE_AIO_WRITEV 43267 /* FreeBSD-specific. */ #define AUE_AIO_READV 43268 /* FreeBSD-specific. */ +#define AUE_FSPACECTL 43269 /* FreeBSD-specific. */ /* * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the Index: sys/compat/freebsd32/freebsd32_misc.c =================================================================== --- sys/compat/freebsd32/freebsd32_misc.c +++ sys/compat/freebsd32/freebsd32_misc.c @@ -3569,6 +3569,20 @@ return (kern_posix_error(td, error)); } +int +freebsd32_fspacectl(struct thread *td, struct freebsd32_fspacectl_args *uap) +{ + int error; + struct spacectl_range range; + + error = copyin(uap->range, &range, sizeof(range)); + if (error != 0) + return (error); + /* Pass the kernel copy; uap->range is a user-space pointer. */ + error = kern_fspacectl(td, uap->fd, uap->cmd, &range, uap->flags); + return (error); +} + int freebsd32_posix_fadvise(struct thread *td, struct freebsd32_posix_fadvise_args *uap) Index: sys/compat/freebsd32/freebsd32_proto.h =================================================================== --- sys/compat/freebsd32/freebsd32_proto.h +++ sys/compat/freebsd32/freebsd32_proto.h @@ -751,6 +751,12 @@ struct freebsd32_aio_readv_args { char aiocbp_l_[PADL_(struct aiocb32 *)]; struct aiocb32 * aiocbp; char aiocbp_r_[PADR_(struct aiocb32 *)]; }; +struct freebsd32_fspacectl_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)]; + char range_l_[PADL_(struct spacectl_range *)]; struct spacectl_range * range; char 
range_r_[PADR_(struct spacectl_range *)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; #if !defined(PAD64_REQUIRED) && !defined(__amd64__) #define PAD64_REQUIRED #endif @@ -893,6 +899,7 @@ int freebsd32___sysctlbyname(struct thread *, struct freebsd32___sysctlbyname_args *); int freebsd32_aio_writev(struct thread *, struct freebsd32_aio_writev_args *); int freebsd32_aio_readv(struct thread *, struct freebsd32_aio_readv_args *); +int freebsd32_fspacectl(struct thread *, struct freebsd32_fspacectl_args *); #ifdef COMPAT_43 @@ -1476,6 +1483,7 @@ #define FREEBSD32_SYS_AUE_freebsd32___sysctlbyname AUE_SYSCTL #define FREEBSD32_SYS_AUE_freebsd32_aio_writev AUE_AIO_WRITEV #define FREEBSD32_SYS_AUE_freebsd32_aio_readv AUE_AIO_READV +#define FREEBSD32_SYS_AUE_freebsd32_fspacectl AUE_FSPACECTL #undef PAD_ #undef PADL_ Index: sys/compat/freebsd32/freebsd32_syscall.h =================================================================== --- sys/compat/freebsd32/freebsd32_syscall.h +++ sys/compat/freebsd32/freebsd32_syscall.h @@ -506,4 +506,5 @@ #define FREEBSD32_SYS___specialfd 577 #define FREEBSD32_SYS_freebsd32_aio_writev 578 #define FREEBSD32_SYS_freebsd32_aio_readv 579 -#define FREEBSD32_SYS_MAXSYSCALL 580 +#define FREEBSD32_SYS_freebsd32_fspacectl 580 +#define FREEBSD32_SYS_MAXSYSCALL 581 Index: sys/compat/freebsd32/freebsd32_syscalls.c =================================================================== --- sys/compat/freebsd32/freebsd32_syscalls.c +++ sys/compat/freebsd32/freebsd32_syscalls.c @@ -616,4 +616,5 @@ "__specialfd", /* 577 = __specialfd */ "freebsd32_aio_writev", /* 578 = freebsd32_aio_writev */ "freebsd32_aio_readv", /* 579 = freebsd32_aio_readv */ + "freebsd32_fspacectl", /* 580 = freebsd32_fspacectl */ }; Index: sys/compat/freebsd32/freebsd32_sysent.c =================================================================== --- sys/compat/freebsd32/freebsd32_sysent.c +++ sys/compat/freebsd32/freebsd32_sysent.c @@ -669,4 +669,5 @@ { .sy_narg 
= AS(__specialfd_args), .sy_call = (sy_call_t *)sys___specialfd, .sy_auevent = AUE_SPECIALFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 577 = __specialfd */ { .sy_narg = AS(freebsd32_aio_writev_args), .sy_call = (sy_call_t *)freebsd32_aio_writev, .sy_auevent = AUE_AIO_WRITEV, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 578 = freebsd32_aio_writev */ { .sy_narg = AS(freebsd32_aio_readv_args), .sy_call = (sy_call_t *)freebsd32_aio_readv, .sy_auevent = AUE_AIO_READV, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 579 = freebsd32_aio_readv */ + { .sy_narg = AS(freebsd32_fspacectl_args), .sy_call = (sy_call_t *)freebsd32_fspacectl, .sy_auevent = AUE_FSPACECTL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 580 = freebsd32_fspacectl */ }; Index: sys/compat/freebsd32/freebsd32_systrace_args.c =================================================================== --- sys/compat/freebsd32/freebsd32_systrace_args.c +++ sys/compat/freebsd32/freebsd32_systrace_args.c @@ -3407,6 +3407,16 @@ *n_args = 1; break; } + /* freebsd32_fspacectl */ + case 580: { + struct freebsd32_fspacectl_args *p = params; + iarg[0] = p->fd; /* int */ + iarg[1] = p->cmd; /* int */ + uarg[2] = (intptr_t) p->range; /* struct spacectl_range * */ + iarg[3] = p->flags; /* int */ + *n_args = 4; + break; + } default: *n_args = 0; break; @@ -9183,6 +9193,25 @@ break; }; break; + /* freebsd32_fspacectl */ + case 580: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + case 2: + p = "userland struct spacectl_range *"; + break; + case 3: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11102,6 +11131,11 @@ if (ndx == 0 || ndx == 1) p = "int"; break; + /* freebsd32_fspacectl */ + case 580: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; Index: sys/compat/freebsd32/syscalls.master =================================================================== --- sys/compat/freebsd32/syscalls.master 
+++ sys/compat/freebsd32/syscalls.master @@ -1174,5 +1174,9 @@ struct aiocb32 *aiocbp); } 579 AUE_AIO_READV STD { int freebsd32_aio_readv( \ struct aiocb32 *aiocbp); } +580 AUE_FSPACECTL STD { int freebsd32_fspacectl(int fd, \ + int cmd, \ + struct spacectl_range *range,\ + int flags); } ; vim: syntax=off Index: sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c =================================================================== --- sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -3797,6 +3797,49 @@ return (error); } +/* + * Common code for zfs_space-related operations. + * + * This function is called from zfs_space and zfs_deallocate. + */ +static int +zfs_space_common(znode_t *zp, int cmd, off_t off, off_t len, int flag, + cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t uoff, ulen; + int error; + + if (cmd != F_FREESP) + return (SET_ERROR(EINVAL)); + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) + return (SET_ERROR(EROFS)); + + if (len < 0) + return (SET_ERROR(EINVAL)); + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. + */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) + return (error); + + uoff = off; + ulen = len; + + error = zfs_freesp(zp, uoff, ulen, flag, TRUE); + + return (error); +} + /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. 
However, this command is somewhat @@ -3821,47 +3864,16 @@ offset_t offset, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); - uint64_t off, len; + off_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); - if (cmd != F_FREESP) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - if (bfp->l_len < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Permissions aren't checked on Solaris because on this OS - * zfs_space() can only be called with an opened file handle. - * On Linux we can get here through truncate_range() which - * operates directly on inodes, so we need to check access rights. - */ - if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { - ZFS_EXIT(zfsvfs); - return (error); - } - off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ - error = zfs_freesp(zp, off, len, flag, TRUE); - + error = zfs_space_common(zp, cmd, off, len, flag, cr); ZFS_EXIT(zfsvfs); return (error); } @@ -5189,6 +5201,9 @@ case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); + case _PC_FDEALLOC_PRESENT: + *ap->a_retval = 1; + return (0); case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; @@ -5781,6 +5796,51 @@ return (error); } +/* + * This is mostly the same as zfs_space except it limits the range of operation + * to zp->z_size, so the log record of zfs_space during replay would be reused. + * + * File size is supposed to grow only under vnode shared lock, even though we do + * not grab rangelock on the znode. 
+ */ +static int +zfs_deallocate(struct vop_deallocate_args *ap) +{ + vnode_t *vp; + znode_t *zp; + zfsvfs_t *zfsvfs; + off_t offset, len; + uint64_t file_sz; + int error; + + vp = ap->a_vp; + zp = VTOZ(vp); + zfsvfs = ZTOZSB(zp); + /* Validate the values passed in, not the pointers themselves. */ + if (*ap->a_offset < 0 || *ap->a_len < 0) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + offset = *ap->a_offset; + len = *ap->a_len; + + file_sz = zp->z_size; + if ((uint64_t)offset + len > file_sz) + len = file_sz - offset; + if (len <= 0) { + error = 0; + goto out; + } + error = zfs_space_common(zp, F_FREESP, offset, len, O_RDWR, ap->a_cred); +out: + if (__predict_true(error == 0)) + *ap->a_len = 0; + ZFS_EXIT(zfsvfs); + return (error); +} + struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; @@ -5798,6 +5858,7 @@ .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, .vop_access = zfs_freebsd_access, .vop_allocate = VOP_EINVAL, + .vop_deallocate = zfs_deallocate, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_cachedlookup, .vop_getattr = zfs_freebsd_getattr, Index: sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c =================================================================== --- sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -1449,14 +1449,8 @@ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); - if (error == 0) { - /* - * In FreeBSD we cannot free block in the middle of a file, - * but only at the end of a file, so this code path should - * never happen. 
- */ - vnode_pager_setsize(ZTOV(zp), off); - } + if (error == 0) + vnode_pager_purge_range(ZTOV(zp), off, off + len); zfs_rangelock_exit(lr); Index: sys/kern/init_sysent.c =================================================================== --- sys/kern/init_sysent.c +++ sys/kern/init_sysent.c @@ -635,4 +635,5 @@ { .sy_narg = AS(__specialfd_args), .sy_call = (sy_call_t *)sys___specialfd, .sy_auevent = AUE_SPECIALFD, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 577 = __specialfd */ { .sy_narg = AS(aio_writev_args), .sy_call = (sy_call_t *)sys_aio_writev, .sy_auevent = AUE_AIO_WRITEV, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 578 = aio_writev */ { .sy_narg = AS(aio_readv_args), .sy_call = (sy_call_t *)sys_aio_readv, .sy_auevent = AUE_AIO_READV, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 579 = aio_readv */ + { .sy_narg = AS(fspacectl_args), .sy_call = (sy_call_t *)sys_fspacectl, .sy_auevent = AUE_FSPACECTL, .sy_flags = 0, .sy_thrcnt = SY_THR_STATIC }, /* 580 = fspacectl */ }; Index: sys/kern/sys_generic.c =================================================================== --- sys/kern/sys_generic.c +++ sys/kern/sys_generic.c @@ -861,6 +861,59 @@ return (error); } +int +sys_fspacectl(struct thread *td, struct fspacectl_args *uap) +{ + struct spacectl_range range; + int error; + + error = copyin(uap->range, &range, sizeof(range)); + if (error != 0) + return (error); + + error = kern_fspacectl(td, uap->fd, uap->cmd, &range, uap->flags); + return (error); +} + +/* + * Check the fspacectl(2) arguments and dispatch to the file's fo_fspacectl. + */ +int +kern_fspacectl(struct thread *td, int fd, int cmd, struct spacectl_range *range, + int flags) +{ + struct file *fp; + int error; + + AUDIT_ARG_FD(fd); + AUDIT_ARG_CMD(cmd); + AUDIT_ARG_FFLAGS(flags); + + if ((cmd != SPACECTL_DEALLOC) || + (range->r_offset < 0 || range->r_resid < 0) || + (flags & ~SPACECTL_F_SUPPORTED)) + return (EINVAL); + + error = fget(td, fd, &cap_pwrite_rights, &fp); + if (error != 0) + return 
(error); + AUDIT_ARG_FILE(td->td_proc, fp); + if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) { + error = ESPIPE; + goto out; + } + if ((fp->f_flag & FWRITE) == 0) { + error = EBADF; + goto out; + } + + error = fo_fspacectl(fp, cmd, range->r_offset, range->r_resid, flags, + td->td_ucred, td); +out: + fdrop(fp, td); + return (error); +} + int kern_specialfd(struct thread *td, int type, void *arg) { Index: sys/kern/syscalls.c =================================================================== --- sys/kern/syscalls.c +++ sys/kern/syscalls.c @@ -586,4 +586,5 @@ "__specialfd", /* 577 = __specialfd */ "aio_writev", /* 578 = aio_writev */ "aio_readv", /* 579 = aio_readv */ + "fspacectl", /* 580 = fspacectl */ }; Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -3238,6 +3238,14 @@ _Inout_ struct aiocb *aiocbp ); } +580 AUE_FSPACECTL STD { + int fspacectl( + int fd, + int cmd, + _In_ struct spacectl_range *range, + int flags + ); + } ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master Index: sys/kern/systrace_args.c =================================================================== --- sys/kern/systrace_args.c +++ sys/kern/systrace_args.c @@ -3399,6 +3399,16 @@ *n_args = 1; break; } + /* fspacectl */ + case 580: { + struct fspacectl_args *p = params; + iarg[0] = p->fd; /* int */ + iarg[1] = p->cmd; /* int */ + uarg[2] = (intptr_t) p->range; /* struct spacectl_range * */ + iarg[3] = p->flags; /* int */ + *n_args = 4; + break; + } default: *n_args = 0; break; @@ -9088,6 +9098,25 @@ break; }; break; + /* fspacectl */ + case 580: + switch(ndx) { + case 0: + p = "int"; + break; + case 1: + p = "int"; + break; + case 2: + p = "userland struct spacectl_range *"; + break; + case 3: + p = "int"; + break; + default: + break; + }; + break; default: break; }; @@ -11034,6 +11063,11 @@ if (ndx == 0 || ndx == 1) 
p = "int"; break; + /* fspacectl */ + case 580: + if (ndx == 0 || ndx == 1) + p = "int"; + break; default: break; }; Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -92,6 +92,7 @@ static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); static int vop_stdread_pgcache(struct vop_read_pgcache_args *ap); static int vop_stdstat(struct vop_stat_args *ap); +static int vop_stddeallocate(struct vop_deallocate_args *ap); /* * This vnode table stores what we want to do if the filesystem doesn't @@ -116,6 +117,7 @@ .vop_advlockasync = vop_stdadvlockasync, .vop_advlockpurge = vop_stdadvlockpurge, .vop_allocate = vop_stdallocate, + .vop_deallocate = vop_stddeallocate, .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, @@ -501,6 +503,7 @@ case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_CAP_PRESENT: + case _PC_FDEALLOC_PRESENT: case _PC_INF_PRESENT: case _PC_MAC_PRESENT: *ap->a_retval = 0; @@ -1075,6 +1078,145 @@ return (error); } +/* + * Write zeroes over [offset, offset + len); *residp is the unwritten length. + */ +static int +vp_zerofill(struct vnode *vp, struct vattr *vap, off_t offset, off_t len, + off_t *residp, struct ucred *cred) +{ + int iosize; + int error = 0; + void *buf = NULL; + struct iovec aiov; + struct uio auio; + struct thread *td; + + iosize = vap->va_blocksize; + td = curthread; + + if (iosize == 0) + iosize = BLKDEV_IOSIZE; + if (iosize > MAXPHYS) + iosize = MAXPHYS; + buf = malloc(iosize, M_TEMP, M_ZERO | M_WAITOK); + + while (len > 0) { + int xfersize = iosize; + if (offset % iosize != 0) + xfersize -= offset % iosize; + if (xfersize > len) + xfersize = len; + + aiov.iov_base = buf; + aiov.iov_len = xfersize; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset; + auio.uio_resid = xfersize; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + + error = VOP_WRITE(vp, &auio, 0, cred); + if (error != 0) { + len -= 
xfersize - auio.uio_resid; + break; + } + + len -= xfersize; + offset += xfersize; + } + + free(buf, M_TEMP); + *residp = len; + return (error); +} + +static int +vop_stddeallocate(struct vop_deallocate_args *ap) +{ + struct vnode *vp; + off_t offset, resid; + struct ucred *cred; + int error; + struct vattr va; + + vp = ap->a_vp; + offset = *ap->a_offset; + resid = *ap->a_len; + cred = ap->a_cred; + + if ((offset < 0 || resid < 0) || + (ap->a_flags & ~SPACECTL_F_SUPPORTED)) + return (EINVAL); + if (ap->a_flags & SPACECTL_F_ATOMIC) + return (EOPNOTSUPP); + + error = VOP_GETATTR(vp, &va, cred); + if (error) + return (error); + + if ((uint64_t)offset + resid > va.va_size) + resid = va.va_size - offset; + + while (resid > 0) { + off_t noff; + off_t xfersize; + off_t rem; + + noff = offset; + + error = vn_bmap_seekhole_locked(vp, FIOSEEKDATA, &noff, cred); + if (error) { + if (error == ENXIO) { + /* No more data region to be filled */ + error = vn_truncate_locked( + vp, offset + resid, false, cred); + if (error) + goto out; + offset += resid; + resid = 0; + break; + } + /* XXX: Is it okay to fallback further? 
*/ + goto out; + } + KASSERT(noff >= offset, ("FIOSEEKDATA going backward")); + if (noff != offset) { + xfersize = omin(noff - offset, resid); + resid -= xfersize; + offset += xfersize; + continue; + } + error = vn_bmap_seekhole_locked(vp, FIOSEEKHOLE, &noff, cred); + if (error) + goto out; + + xfersize = noff - offset; + if (xfersize > resid) + xfersize = resid; + + /* Fill zeroes */ + error = vp_zerofill(vp, &va, offset, xfersize, &rem, cred); + if (error) { + resid -= xfersize - rem; + offset += xfersize - rem; + goto out; + } + + resid -= xfersize; + offset += xfersize; + } + +out: + if (*ap->a_offset != offset) { + *ap->a_offset = offset; + *ap->a_len = resid; + } + return (error); +} + int vop_stdadvise(struct vop_advise_args *ap) { Index: sys/kern/vfs_vnops.c =================================================================== --- sys/kern/vfs_vnops.c +++ sys/kern/vfs_vnops.c @@ -106,6 +106,7 @@ static fo_close_t vn_closefile; static fo_mmap_t vn_mmap; static fo_fallocate_t vn_fallocate; +static fo_fspacectl_t vn_fspacectl; struct fileops vnops = { .fo_read = vn_io_fault, @@ -123,6 +124,7 @@ .fo_fill_kinfo = vn_fill_kinfo, .fo_mmap = vn_mmap, .fo_fallocate = vn_fallocate, + .fo_fspacectl = vn_fspacectl, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; @@ -2345,7 +2347,8 @@ } int -vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) +vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, + struct ucred *cred) { struct vattr va; daddr_t bn, bnp; @@ -2353,22 +2356,17 @@ off_t noff; int error; - KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, - ("Wrong command %lu", cmd)); - - if (vn_lock(vp, LK_SHARED) != 0) - return (EBADF); if (vp->v_type != VREG) { error = ENOTTY; - goto unlock; + goto out; } error = VOP_GETATTR(vp, &va, cred); if (error != 0) - goto unlock; + goto out; noff = *off; if (noff >= va.va_size) { error = ENXIO; - goto unlock; + goto out; } bsize = vp->v_mount->mnt_stat.f_iosize; for (bn = noff / bsize; noff < 
va.va_size; bn++, noff += bsize - @@ -2376,14 +2374,14 @@ error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL); if (error == EOPNOTSUPP) { error = ENOTTY; - goto unlock; + goto out; } if ((bnp == -1 && cmd == FIOSEEKHOLE) || (bnp != -1 && cmd == FIOSEEKDATA)) { noff = bn * bsize; if (noff < *off) noff = *off; - goto unlock; + goto out; } } if (noff > va.va_size) @@ -2391,13 +2389,28 @@ /* noff == va.va_size. There is an implicit hole at the end of file. */ if (cmd == FIOSEEKDATA) error = ENXIO; -unlock: - VOP_UNLOCK(vp); +out: + /* The vnode lock is owned by the caller and stays held. */ if (error == 0) *off = noff; return (error); } +int +vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred) +{ + int error; + + KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA, + ("Wrong command %lu", cmd)); + + if (vn_lock(vp, LK_SHARED) != 0) + return (EBADF); + error = vn_bmap_seekhole_locked(vp, cmd, off, cred); + VOP_UNLOCK(vp); + return (error); +} + int vn_seek(struct file *fp, off_t offset, int whence, struct thread *td) { @@ -3339,6 +3352,74 @@ return (error); } +static int +vn_deallocate(struct vnode *vp, off_t off, off_t resid, int flags, + struct ucred *active_cred, struct thread *td) +{ + struct mount *mp; + int lock_flags; + void *rl_cookie; + int error; + + rl_cookie = vn_rangelock_wlock(vp, off, off + resid); + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto out; + + if (MNT_SHARED_WRITES(mp) || + (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) { + lock_flags = LK_SHARED; + } else { + lock_flags = LK_EXCLUSIVE; + } + vn_lock(vp, lock_flags | LK_RETRY); + + error = VOP_DEALLOCATE(vp, &off, &resid, flags, active_cred); + VOP_UNLOCK(vp); + vn_finished_write(mp); + +out: + vn_rangelock_unlock(vp, rl_cookie); + return (error); +} + +static int +vn_fspacectl(struct file *fp, int cmd, off_t offset, off_t len, int flags, + struct ucred *active_cred, struct thread *td) +{ + int error; + struct vnode *vp; + off_t off, resid; + + vp = fp->f_vnode; + off = offset; + resid = len; + + if ((cmd != 
SPACECTL_DEALLOC) || (offset < 0 || resid < 0) || + (flags & ~SPACECTL_F_SUPPORTED)) + return (EINVAL); + if (vp->v_type != VREG) + return (ENODEV); + + /* Take the maximum range if end offset overflows */ + if (resid > OFF_MAX - off) + resid = OFF_MAX - off; + if (resid == 0) { + /* Degenerated case */ + return (0); + } + + switch (cmd) { + case SPACECTL_DEALLOC: + error = vn_deallocate(vp, off, resid, flags, active_cred, + td); + break; + default: + panic("vn_fspacectl: unknown cmd %d", cmd); + } + + return (error); +} + static u_long vn_lock_pair_pause_cnt; SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD, &vn_lock_pair_pause_cnt, 0, Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -792,6 +792,17 @@ }; +%% deallocate vp L L L + +vop_deallocate { + IN struct vnode *vp; + INOUT off_t *offset; + INOUT off_t *len; + IN int flags; + IN struct ucred *cred; +}; + + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. 
When merging a new VOP to a stable branch, Index: sys/security/audit/audit_bsm.c =================================================================== --- sys/security/audit/audit_bsm.c +++ sys/security/audit/audit_bsm.c @@ -1076,6 +1076,18 @@ FD_VNODE1_TOKENS; break; + case AUE_FSPACECTL: + if (ARG_IS_VALID(kar, ARG_CMD)) { + tok = au_to_arg32(2, "operation", ar->ar_arg_cmd); + kau_write(rec, tok); + } + if (ARG_IS_VALID(kar, ARG_FFLAGS)) { + tok = au_to_arg32(4, "flags", ar->ar_arg_fflags); + kau_write(rec, tok); + } + FD_VNODE1_TOKENS; + break; + case AUE_RFORK: if (ARG_IS_VALID(kar, ARG_FFLAGS)) { tok = au_to_arg32(1, "flags", ar->ar_arg_fflags); Index: sys/sys/fcntl.h =================================================================== --- sys/sys/fcntl.h +++ sys/sys/fcntl.h @@ -314,6 +314,14 @@ short l_type; /* lock type: read/write, etc. */ short l_whence; /* type of l_start */ }; + +/* + * Space control offset/length description + */ +struct spacectl_range { + off_t r_offset; /* starting offset */ + off_t r_resid; /* length */ +}; #endif #if __BSD_VISIBLE @@ -343,6 +351,20 @@ * similar syscalls. 
*/ #define FD_NONE -200 + +/* + * Commands for fspacectl(2) + */ +#define SPACECTL_ALLOC 0 /* allocate space */ +#define SPACECTL_DEALLOC 1 /* deallocate space */ + +/* + * fspacectl(2) flags + */ +#define SPACECTL_F_ATOMIC 0x01 /* the operation is atomic with respect + to other FS operations if operation + range overlaps */ +#define SPACECTL_F_SUPPORTED (SPACECTL_F_ATOMIC) #endif #ifndef _KERNEL @@ -360,6 +382,9 @@ int posix_fadvise(int, off_t, off_t, int); int posix_fallocate(int, off_t, off_t); #endif +#if __BSD_VISIBLE +int fspacectl(int, int, struct spacectl_range *, int); +#endif __END_DECLS #endif Index: sys/sys/file.h =================================================================== --- sys/sys/file.h +++ sys/sys/file.h @@ -35,6 +35,7 @@ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ +#include "sys/fcntl.h" #ifndef _KERNEL #include /* XXX */ #include @@ -129,6 +130,9 @@ typedef int fo_get_seals_t(struct file *fp, int *flags); typedef int fo_fallocate_t(struct file *fp, off_t offset, off_t len, struct thread *td); +typedef int fo_fspacectl_t(struct file *fp, int cmd, + off_t offset, off_t len, int flags, + struct ucred *active_cred, struct thread *td); typedef int fo_flags_t; struct fileops { @@ -150,6 +154,7 @@ fo_add_seals_t *fo_add_seals; fo_get_seals_t *fo_get_seals; fo_fallocate_t *fo_fallocate; + fo_fspacectl_t *fo_fspacectl; fo_flags_t fo_flags; /* DFLAG_* below */ }; @@ -470,6 +475,17 @@ return ((*fp->f_ops->fo_fallocate)(fp, offset, len, td)); } +static __inline int fo_fspacectl(struct file *fp, int cmd, off_t offset, + off_t len, int flags, struct ucred *active_cred, struct thread *td) +{ + + if (fp->f_ops->fo_fspacectl == NULL) + return (ENODEV); + return ((*fp->f_ops->fo_fspacectl)(fp, cmd, offset, len, flags, + active_cred, td)); +} + + #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ Index: sys/sys/syscall.h =================================================================== --- sys/sys/syscall.h +++ sys/sys/syscall.h @@ -515,4 +515,5 @@ #define 
SYS___specialfd 577 #define SYS_aio_writev 578 #define SYS_aio_readv 579 -#define SYS_MAXSYSCALL 580 +#define SYS_fspacectl 580 +#define SYS_MAXSYSCALL 581 Index: sys/sys/syscall.mk =================================================================== --- sys/sys/syscall.mk +++ sys/sys/syscall.mk @@ -420,4 +420,5 @@ rpctls_syscall.o \ __specialfd.o \ aio_writev.o \ - aio_readv.o + aio_readv.o \ + fspacectl.o Index: sys/sys/syscallsubr.h =================================================================== --- sys/sys/syscallsubr.h +++ sys/sys/syscallsubr.h @@ -59,6 +59,7 @@ struct sched_param; union semun; struct sockaddr; +struct spacectl_range; struct stat; struct thr_param; struct timex; @@ -230,6 +231,10 @@ int advice); int kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len); +int kern_fdeallocate(struct thread *td, int fd, off_t offset, off_t len); +int kern_fzerorange(struct thread *td, int fd, off_t offset, off_t len); +int kern_fspacectl(struct thread *td, int fd, int cmd, + struct spacectl_range *, int flags); int kern_procctl(struct thread *td, enum idtype idtype, id_t id, int com, void *data); int kern_pread(struct thread *td, int fd, void *buf, size_t nbyte, Index: sys/sys/sysproto.h =================================================================== --- sys/sys/sysproto.h +++ sys/sys/sysproto.h @@ -1847,6 +1847,12 @@ struct aio_readv_args { char aiocbp_l_[PADL_(struct aiocb *)]; struct aiocb * aiocbp; char aiocbp_r_[PADR_(struct aiocb *)]; }; +struct fspacectl_args { + char fd_l_[PADL_(int)]; int fd; char fd_r_[PADR_(int)]; + char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)]; + char range_l_[PADL_(struct spacectl_range *)]; struct spacectl_range * range; char range_r_[PADR_(struct spacectl_range *)]; + char flags_l_[PADL_(int)]; int flags; char flags_r_[PADR_(int)]; +}; int nosys(struct thread *, struct nosys_args *); void sys_sys_exit(struct thread *, struct sys_exit_args *); int sys_fork(struct thread *, struct fork_args *); 
@@ -2241,6 +2247,7 @@ int sys___specialfd(struct thread *, struct __specialfd_args *); int sys_aio_writev(struct thread *, struct aio_writev_args *); int sys_aio_readv(struct thread *, struct aio_readv_args *); +int sys_fspacectl(struct thread *, struct fspacectl_args *); #ifdef COMPAT_43 @@ -3175,6 +3182,7 @@ #define SYS_AUE___specialfd AUE_SPECIALFD #define SYS_AUE_aio_writev AUE_AIO_WRITEV #define SYS_AUE_aio_readv AUE_AIO_READV +#define SYS_AUE_fspacectl AUE_FSPACECTL #undef PAD_ #undef PADL_ Index: sys/sys/unistd.h =================================================================== --- sys/sys/unistd.h +++ sys/sys/unistd.h @@ -156,6 +156,7 @@ #define _PC_INF_PRESENT 62 #define _PC_MAC_PRESENT 63 #define _PC_ACL_NFS4 64 +#define _PC_FDEALLOC_PRESENT 65 #endif /* From OpenSolaris, used by SEEK_DATA/SEEK_HOLE. */ Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -719,6 +719,9 @@ void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); +/* vn_bmap_seekhole_locked is not public KPI */ +int vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off, + struct ucred *cred); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp,