Index: sys/kern/vfs_extattr.c
===================================================================
--- sys/kern/vfs_extattr.c
+++ sys/kern/vfs_extattr.c
@@ -49,6 +49,14 @@
 #include
 #include
 
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
 /*
  * Syscall to push extended attribute configuration information into the VFS.
  * Accepts a path, which it converts to a mountpoint, as well as a command
@@ -147,45 +155,177 @@
 	return (error);
 }
 
-/*-
- * Set a named extended attribute on a file or directory
- *
- * Arguments:	unlocked vnode "vp", attribute namespace "attrnamespace",
- *		kernelspace string pointer "attrname", userspace buffer
- *		pointer "data", buffer length "nbytes", thread "td".
- * Returns:	0 on success, an error number otherwise
- * Locks:	none
- * References:	vp must be a valid reference for the duration of the call
- */
+static bool
+do_vn_extattr_io_fault(struct vnode *vp, struct uio *uio, struct thread *td)
+{
+	struct mount *mp;
+	size_t size;
+	int res, error;
+
+	/*
+	 * If we cannot get the sysctl value, assume that io_fault is enabled.
+	 */
+	size = sizeof(res);
+	error = kernel_sysctlbyname(td, "debug.vn_io_fault_enable", &res,
+	    &size, 0, 0, 0, 0);
+	if (error)
+		res = 1;
+
+	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
+	    (mp = vp->v_mount) != NULL &&
+	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && res);
+}
+
+struct vn_extattr_io_fault_args {
+	struct ucred *cred;
+	struct vop_args_tag {
+		struct vnode *vp;
+		int attrnamespace;
+		char attrname[EXTATTR_MAXNAMELEN];
+		enum extattr_dio_t {
+			listextattr_t,
+			getextattr_t,
+			setextattr_t
+		} doio;
+	} vop_args;
+};
+
 static int
-extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
-    void *data, size_t nbytes, struct thread *td)
+vn_extattr_io_fault_touch(char *base, const struct uio *uio)
+{
+	int r;
+
+	r = fubyte(base);
+	if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
+		return (EFAULT);
+	return (0);
+}
+
+static int
+vn_extattr_io_fault_prefault_user(const struct uio *uio)
+{
+	char *base;
+	const struct iovec *iov;
+	size_t len;
+	ssize_t resid;
+	int error, i;
+
+	KASSERT(uio->uio_segflg == UIO_USERSPACE,
+	    ("vn_io_fault_prefault_user userspace"));
+
+	error = i = 0;
+	iov = uio->uio_iov;
+	resid = uio->uio_resid;
+	base = iov->iov_base;
+	len = iov->iov_len;
+	while (resid > 0) {
+		error = vn_extattr_io_fault_touch(base, uio);
+		if (error != 0)
+			break;
+		if (len < PAGE_SIZE) {
+			if (len != 0) {
+				error = vn_extattr_io_fault_touch(
+				    base + len - 1, uio);
+				if (error != 0)
+					break;
+				resid -= len;
+			}
+			if (++i >= uio->uio_iovcnt)
+				break;
+			iov = uio->uio_iov + i;
+			base = iov->iov_base;
+			len = iov->iov_len;
+		} else {
+			len -= PAGE_SIZE;
+			base += PAGE_SIZE;
+			resid -= PAGE_SIZE;
+		}
+	}
+	return (error);
+}
+
+static int
+extattr_list_vp_helper(struct vnode *vp, int attrnamespace, struct uio *uio,
+    size_t *sizep, struct thread *td)
+{
+	ssize_t cnt;
+	int error;
+
+	if (uio != NULL)
+		cnt = uio->uio_resid;
+
+	vn_lock(vp, LK_SHARED | LK_RETRY);
+
+#ifdef MAC
+	error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace);
+	if (error)
+		goto done;
+#endif
+
+	error = VOP_LISTEXTATTR(vp, attrnamespace, uio, sizep,
+	    td->td_ucred, td);
+
+	if (uio != NULL) {
+		cnt -= uio->uio_resid;
+		td->td_retval[0] = cnt;
+	} else
+		td->td_retval[0] = *sizep;
+#ifdef MAC
+done:
+#endif
+	VOP_UNLOCK(vp, 0);
+	return (error);
+}
+
+static int
+extattr_get_vp_helper(struct vnode *vp, int attrnamespace,
+    const char *attrname, struct uio *uio, size_t *sizep, struct thread *td)
+{
+	ssize_t cnt;
+	int error;
+
+	if (uio != NULL)
+		cnt = uio->uio_resid;
+
+	vn_lock(vp, LK_SHARED | LK_RETRY);
+
+#ifdef MAC
+	error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace,
+	    attrname);
+	if (error)
+		goto done;
+#endif
+
+	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, uio, sizep,
+	    td->td_ucred, td);
+
+	if (uio != NULL) {
+		cnt -= uio->uio_resid;
+		td->td_retval[0] = cnt;
+	} else
+		td->td_retval[0] = *sizep;
+#ifdef MAC
+done:
+#endif
+	VOP_UNLOCK(vp, 0);
+	return (error);
+}
+
+static int
+extattr_set_vp_helper(struct vnode *vp, int attrnamespace,
+    const char *attrname, struct uio *uio, struct thread *td)
 {
 	struct mount *mp;
-	struct uio auio;
-	struct iovec aiov;
 	ssize_t cnt;
 	int error;
 
-	if (nbytes > IOSIZE_MAX)
-		return (EINVAL);
+	cnt = uio->uio_resid;
 
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
-	aiov.iov_base = data;
-	aiov.iov_len = nbytes;
-	auio.uio_iov = &aiov;
-	auio.uio_iovcnt = 1;
-	auio.uio_offset = 0;
-	auio.uio_resid = nbytes;
-	auio.uio_rw = UIO_WRITE;
-	auio.uio_segflg = UIO_USERSPACE;
-	auio.uio_td = td;
-	cnt = nbytes;
-
 #ifdef MAC
 	error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace,
 	    attrname);
@@ -193,9 +333,9 @@
 		goto done;
 #endif
 
-	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
+	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, uio,
 	    td->td_ucred, td);
-	cnt -= auio.uio_resid;
+	cnt -= uio->uio_resid;
 	td->td_retval[0] = cnt;
 
 #ifdef MAC
@@ -206,6 +346,217 @@
 	return (error);
 }
 
+static int
+vn_extattr_io_fault_doio(struct vn_extattr_io_fault_args *args,
+    struct uio *uio, struct thread *td)
+{
+	size_t size;
+
+	KASSERT(uio != NULL, ("vn_extattr_io_fault_doio bad uio"));
+
+	switch (args->vop_args.doio) {
+	case listextattr_t:
+		size = uio->uio_resid;
+		return (extattr_list_vp_helper(args->vop_args.vp,
+		    args->vop_args.attrnamespace, uio, &size, td));
+
+	case getextattr_t:
+		size = uio->uio_resid;
+		return (extattr_get_vp_helper(args->vop_args.vp,
+		    args->vop_args.attrnamespace, args->vop_args.attrname,
+		    uio, &size, td));
+
+	case setextattr_t:
+		return (extattr_set_vp_helper(args->vop_args.vp,
+		    args->vop_args.attrnamespace, args->vop_args.attrname,
+		    uio, td));
+
+	default:
+		panic("vn_extattr_io_fault_doio: unknown kind of io %d %d",
+		    args->vop_args.doio, uio->uio_rw);
+	}
+
+	return (0);
+}
+
+static const int io_hold_cnt = 16;
+
+static int
+vn_extattr_io_fault1(struct vnode *vp, struct uio *uio,
+    struct vn_extattr_io_fault_args *args, struct thread *td)
+{
+	vm_page_t ma[io_hold_cnt + 2];
+	struct uio *uio_clone, short_uio;
+	struct iovec short_iovec[1];
+	vm_page_t *prev_td_ma;
+	vm_prot_t prot;
+	vm_offset_t addr, end;
+	size_t len, resid, size;
+	ssize_t adv;
+	int do_io_prefault, error, cnt, save, saveheld, prev_td_ma_cnt;
+
+	/*
+	 * It is impossible to operate on extattrs at a non-zero offset.
+	 */
+	KASSERT(uio->uio_offset == 0,
+	    ("vn_extattr_io_fault1 bad uio_offset"));
+
+	/*
+	 * Disable prefaulting if we cannot get the sysctl value.
+	 */
+	size = sizeof(do_io_prefault);
+	error = kernel_sysctlbyname(td, "debug.vn_io_fault_prefault",
+	    &do_io_prefault, &size, 0, 0, 0, 0);
+	if (error)
+		do_io_prefault = 0;
+
+	if (do_io_prefault) {
+		error = vn_extattr_io_fault_prefault_user(uio);
+		if (error != 0)
+			return (error);	/* Or ignore? */
+	}
+
+	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
+
+	/*
+	 * UFS follows the IO_UNIT directive and rolls back both
+	 * uio_offset and uio_resid if an error is encountered during the
+	 * operation.  But, since the iovec may be already advanced,
+	 * uio is still in an inconsistent state.
+	 *
+	 * Cache a copy of the original uio, which is advanced to the redo
+	 * point using UIO_NOCOPY below.
+	 */
+	uio_clone = cloneuio(uio);
+	resid = uio->uio_resid;
+
+	short_uio.uio_segflg = UIO_USERSPACE;
+	short_uio.uio_rw = uio->uio_rw;
+	short_uio.uio_td = uio->uio_td;
+
+	save = vm_fault_disable_pagefaults();
+	error = vn_extattr_io_fault_doio(args, uio, td);
+	if (error != EFAULT)
+		goto out;
+
+	/* Count iofaults here. */
+	uio_clone->uio_segflg = UIO_NOCOPY;
+	uiomove(NULL, resid - uio->uio_resid, uio_clone);
+	uio_clone->uio_segflg = uio->uio_segflg;
+
+	saveheld = curthread_pflags_set(TDP_UIOHELD);
+	prev_td_ma = td->td_ma;
+	prev_td_ma_cnt = td->td_ma_cnt;
+
+	while (uio_clone->uio_resid != 0) {
+		len = uio_clone->uio_iov->iov_len;
+		if (len == 0) {
+			KASSERT(uio_clone->uio_iovcnt >= 1,
+			    ("iovcnt underflow"));
+			uio_clone->uio_iov++;
+			uio_clone->uio_iovcnt--;
+			continue;
+		}
+		if (len > io_hold_cnt * PAGE_SIZE)
+			len = io_hold_cnt * PAGE_SIZE;
+		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
+		end = round_page(addr + len);
+		if (end < addr) {
+			error = EFAULT;
+			break;
+		}
+		cnt = atop(end - trunc_page(addr));
+		/*
+		 * A perfectly misaligned address and length could cause
+		 * both the start and the end of the chunk to use a partial
+		 * page.  +2 accounts for such a situation.
+		 */
+		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
+		    addr, len, prot, ma, io_hold_cnt + 2);
+		if (cnt == -1) {
+			error = EFAULT;
+			break;
+		}
+		short_uio.uio_iov = &short_iovec[0];
+		short_iovec[0].iov_base = (void *)addr;
+		short_uio.uio_iovcnt = 1;
+		short_uio.uio_resid = short_iovec[0].iov_len = len;
+		short_uio.uio_offset = uio_clone->uio_offset;
+		td->td_ma = ma;
+		td->td_ma_cnt = cnt;
+
+		error = vn_extattr_io_fault_doio(args, &short_uio, td);
+		vm_page_unhold_pages(ma, cnt);
+		adv = len - short_uio.uio_resid;
+
+		uio_clone->uio_iov->iov_base =
+		    (char *)uio_clone->uio_iov->iov_base + adv;
+		uio_clone->uio_iov->iov_len -= adv;
+		uio_clone->uio_resid -= adv;
+		uio_clone->uio_offset += adv;
+
+		uio->uio_resid -= adv;
+		uio->uio_offset += adv;
+
+		if (error != 0 || adv == 0)
+			break;
+	}
+	td->td_ma = prev_td_ma;
+	td->td_ma_cnt = prev_td_ma_cnt;
+	curthread_pflags_restore(saveheld);
+out:
+	vm_fault_enable_pagefaults(save);
+	free(uio_clone, M_IOV);
+	return (error);
+}
+
+/*-
+ * Set a named extended attribute on a file or directory
+ *
+ * Arguments:	unlocked vnode "vp", attribute namespace "attrnamespace",
+ *		kernelspace string pointer "attrname", userspace buffer
+ *		pointer "data", buffer length "nbytes", thread "td".
+ * Returns:	0 on success, an error number otherwise
+ * Locks:	none
+ * References:	vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+    void *data, size_t nbytes, struct thread *td)
+{
+	struct uio uio;
+	struct iovec aiov;
+	void *rl_cookie;
+	struct vn_extattr_io_fault_args args;
+	int error;
+
+	if (nbytes > IOSIZE_MAX)
+		return (EINVAL);
+
+	aiov.iov_base = data;
+	aiov.iov_len = nbytes;
+	uio.uio_iov = &aiov;
+	uio.uio_iovcnt = 1;
+	uio.uio_offset = 0;
+	uio.uio_resid = nbytes;
+	uio.uio_rw = UIO_WRITE;
+	uio.uio_segflg = UIO_USERSPACE;
+	uio.uio_td = td;
+
+	if (do_vn_extattr_io_fault(vp, &uio, td)) {
+		args.cred = td->td_ucred;
+		args.vop_args.vp = vp;
+		args.vop_args.attrnamespace = attrnamespace;
+		memcpy(args.vop_args.attrname, attrname, EXTATTR_MAXNAMELEN);
+		args.vop_args.doio = setextattr_t;
+		rl_cookie = vn_rangelock_wlock(vp, 0, uio.uio_resid);
+		error = vn_extattr_io_fault1(vp, &uio, &args, td);
+		vn_rangelock_unlock(vp, rl_cookie);
+		return (error);
+	}
+
+	return (extattr_set_vp_helper(vp, attrnamespace, attrname, &uio, td));
+}
+
 int
 sys_extattr_set_fd(td, uap)
 	struct thread *td;
@@ -327,15 +678,14 @@
 {
 	struct uio auio, *auiop;
 	struct iovec aiov;
-	ssize_t cnt;
 	size_t size, *sizep;
+	void *rl_cookie;
+	struct vn_extattr_io_fault_args args;
 	int error;
 
 	if (nbytes > IOSIZE_MAX)
		return (EINVAL);
 
-	vn_lock(vp, LK_SHARED | LK_RETRY);
-
 	/*
 	 * Slightly unusual semantics: if the user provides a NULL data
 	 * pointer, they don't want to receive the data, just the maximum
@@ -343,7 +693,6 @@
 	 */
 	auiop = NULL;
 	sizep = NULL;
-	cnt = 0;
 	if (data != NULL) {
 		aiov.iov_base = data;
 		aiov.iov_len = nbytes;
@@ -355,30 +704,22 @@
 		auio.uio_segflg = UIO_USERSPACE;
 		auio.uio_td = td;
 		auiop = &auio;
-		cnt = nbytes;
+		if (do_vn_extattr_io_fault(vp, auiop, td)) {
+			args.cred = td->td_ucred;
+			args.vop_args.vp = vp;
+			args.vop_args.attrnamespace = attrnamespace;
+			memcpy(args.vop_args.attrname, attrname,
+			    EXTATTR_MAXNAMELEN);
+			args.vop_args.doio = getextattr_t;
+			rl_cookie = vn_rangelock_rlock(vp, 0, auiop->uio_resid);
+			error = vn_extattr_io_fault1(vp, auiop, &args, td);
+			vn_rangelock_unlock(vp, rl_cookie);
+			return (error);
+		}
 	} else
 		sizep = &size;
 
-#ifdef MAC
-	error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace,
-	    attrname);
-	if (error)
-		goto done;
-#endif
-
-	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
-	    td->td_ucred, td);
-
-	if (auiop != NULL) {
-		cnt -= auio.uio_resid;
-		td->td_retval[0] = cnt;
-	} else
-		td->td_retval[0] = size;
-#ifdef MAC
-done:
-#endif
-	VOP_UNLOCK(vp, 0);
-	return (error);
+	return (extattr_get_vp_helper(vp, attrnamespace, attrname, auiop,
+	    sizep, td));
 }
 
 int
@@ -636,17 +977,15 @@
 {
 	struct uio auio, *auiop;
 	size_t size, *sizep;
 	struct iovec aiov;
-	ssize_t cnt;
+	void *rl_cookie;
+	struct vn_extattr_io_fault_args args;
 	int error;
 
 	if (nbytes > IOSIZE_MAX)
		return (EINVAL);
 
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-
 	auiop = NULL;
 	sizep = NULL;
-	cnt = 0;
 	if (data != NULL) {
 		aiov.iov_base = data;
 		aiov.iov_len = nbytes;
@@ -658,29 +997,20 @@
 		auio.uio_segflg = UIO_USERSPACE;
 		auio.uio_td = td;
 		auiop = &auio;
-		cnt = nbytes;
+		if (do_vn_extattr_io_fault(vp, auiop, td)) {
+			args.cred = td->td_ucred;
+			args.vop_args.vp = vp;
+			args.vop_args.attrnamespace = attrnamespace;
+			args.vop_args.doio = listextattr_t;
+			rl_cookie = vn_rangelock_rlock(vp, 0, auiop->uio_resid);
+			error = vn_extattr_io_fault1(vp, auiop, &args, td);
+			vn_rangelock_unlock(vp, rl_cookie);
+			return (error);
+		}
 	} else
 		sizep = &size;
 
-#ifdef MAC
-	error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace);
-	if (error)
-		goto done;
-#endif
-
-	error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
-	    td->td_ucred, td);
-
-	if (auiop != NULL) {
-		cnt -= auio.uio_resid;
-		td->td_retval[0] = cnt;
-	} else
-		td->td_retval[0] = size;
-#ifdef MAC
-done:
-#endif
-	VOP_UNLOCK(vp, 0);
-	return (error);
+	return (extattr_list_vp_helper(vp, attrnamespace, auiop, sizep, td));
 }
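
For reference, here is a minimal userland sketch (not part of the patch) of the kind of request the new vn_extattr_io_fault path is meant to handle: an extattr syscall whose user buffer can page-fault while the vnode is locked, for example a buffer living in an mmap()ed region of a file on the same filesystem. The path "/mnt/test/file", the attribute name "sketch", and the assumption that the filesystem sets MNTK_NO_IOPF (as FFS does) are illustrative only.

#include <sys/types.h>
#include <sys/extattr.h>
#include <sys/mman.h>

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	char *buf;
	ssize_t n;
	int fd;

	fd = open("/mnt/test/file", O_RDWR);
	if (fd == -1)
		err(1, "open");

	/* A buffer backed by the file itself; copying it in may fault. */
	buf = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd, 0);
	if (buf == MAP_FAILED)
		err(1, "mmap");

	/*
	 * The attribute value is copied from user memory while the vnode
	 * is locked for VOP_SETEXTATTR(); with the patch applied,
	 * extattr_set_vp() services a fault here vn_io_fault-style, with
	 * pagefaults disabled and the user pages held.
	 */
	n = extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, "sketch", buf, 16);
	if (n == -1)
		err(1, "extattr_set_fd");

	munmap(buf, getpagesize());
	close(fd);
	return (0);
}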