Index: share/man/man9/VOP_UBOP.9 =================================================================== --- /dev/null +++ share/man/man9/VOP_UBOP.9 @@ -0,0 +1,71 @@ +.\" -*- nroff -*- +.\" +.\" Copyright (c) 2020 Matthew Macy +.\" +.\" All rights reserved. +.\" +.\" This program is free software. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR +.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +.\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT, +.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +.\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd October 13, 2020 +.Dt VOP_UBOP 9 +.Os +.Sh NAME +.Nm VOP_UBOP +.Nd read or write a file system buffer +.Sh SYNOPSIS +.In sys/param.h +.In sys/vnode.h +.In sys/uio.h +.Ft int +.Fn VOP_UBOP "struct vnode *vp" "struct uio_bio *uio" "int io_flags" +.Sh DESCRIPTION +The arguments are: +.Bl -tag -width 2n +.It Fa vp +The vnode that the operation is for. 
+.It Fa uio
+The context for the operation.
+.It Fa io_flags
+The flags for the operation:
+.Bl -tag -width ".Dv FOF_OFFSET"
+.It Dv FOF_OFFSET
+uio_offset is valid.
+.It Dv FAPPEND
+Write to the end of the file.
+.El
+.El
+.Pp
+This call either reads from, writes to, or syncs a file, depending on the value of
+.Fa uio->uio_cmd .
+.Sh RETURN VALUES
+Zero if completed immediately, EINPROGRESS otherwise.
+Errors should be signalled by setting UIO_BIO_ERROR in the uio_flags field of struct uio_bio,
+and setting uio_error to the appropriate errno value.
+.Sh SEE ALSO
+.Xr uio_bio 9 ,
+.Xr vnode 9
+.Sh AUTHORS
+This manual page was written by
+.An Matthew Macy .
Index: share/man/man9/uio_bio.9
===================================================================
--- /dev/null
+++ share/man/man9/uio_bio.9
@@ -0,0 +1,230 @@
+.\"
+.\" Copyright (c) 2020 Matthew Macy
+.\"
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR
+.\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+.\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+.\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT,
+.\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+.\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+.\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+.\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+.\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 11, 2020
+.Dt UIO_BIO 9
+.Os
+.Sh NAME
+.Nm uio_bio ,
+.Nm uiobiomove
+.Nd asynchronous file system I/O routines
+.Sh SYNOPSIS
+.In sys/types.h
+.In sys/uio.h
+.Bd -literal
+struct uio_bio {
+	uint8_t	uio_cmd;	/* operation */
+	uint8_t	uio_error;	/* Errno for UIO_BIO_ERROR. */
+	uint16_t uio_ma_cnt;	/* length in pages of scatter/gather list */
+	uint16_t uio_flags;	/* General flags */
+	off_t	uio_ma_offset;	/* offset into page list */
+	off_t	uio_offset;	/* offset in target object */
+	uint32_t uio_resid;	/* remaining bytes to process */
+	struct thread *uio_td;	/* owner */
+	void	(*uio_bio_done)(struct uio_bio *);	/* I/O completion routine */
+	void	*uio_arg;	/* argument to completion routine */
+	struct vm_page **uio_ma;	/* user buffer's pages */
+};
+.Ed
+.Ft int
+.Fn uiobiomove "void *buf" "int howmuch" "struct uio_bio *uiop"
+.Sh DESCRIPTION
+The function
+.Fn uiobiomove
+is used to transfer data between buffers and pages that might
+possibly cross the user/kernel space boundary.
+.Pp
+As a result of any
+.Xr aio_read 2 ,
+.Xr aio_write 2 ,
+or
+.Xr lio_listio 2
+system call that is being passed to a supporting file system,
+.Xr VOP_UBOP 9
+will be called with a pointer to a
+.Vt "struct uio_bio"
+being passed.
+The transfer request is encoded in this structure.
+The file system itself should use
+.Fn uiobiomove
+to get at the data in this structure.
+.Pp
+The fields in the
+.Vt uio_bio
+structure are:
+.Bl -tag -width ".Va uio_ma_offset"
+.It Va uio_cmd
+The operation to be performed:
+.Bl -tag -width ".Dv UIO_BIO_WRITE"
+.It Dv UIO_BIO_READ
+Read from a file.
+.It Dv UIO_BIO_WRITE
+Write to a file.
+.It Dv UIO_BIO_SYNC
+Sync a file to backing storage.
+.El
+.It Va uio_error
+The error code if the operation was not successful.
+.It Va uio_ma_cnt
+The number of entries in the passed page array.
+.It Va uio_flags
+.Bl -tag -width ".Dv UIO_BIO_SPARSE"
+.It Dv UIO_BIO_ERROR
+The uio_error field is valid.
+.It Dv UIO_BIO_SPARSE
+The page array is not completely populated.
+.El
+.It Va uio_ma_offset
+The starting byte offset into the page list.
+.It Va uio_offset
+The offset into the file.
+.It Va uio_resid
+The remaining number of bytes to process,
+updated after transfer.
+.It Va uio_td
+The pointer to a
+.Vt "struct thread"
+for the associated thread.
+.It Va uio_bio_done
+The I/O completion routine.
+.It Va uio_arg
+The argument to pass to the I/O completion routine.
+.It Va uio_ma
+A pointer to the caller's pages.
+.El
+.Pp
+.Sh RETURN VALUES
+On success
+.Fn uiobiomove
+will return 0; on error it will return an appropriate error code.
+.Sh EXAMPLES
+The idea is that the file system maintains private buffers for its data,
+and processes the request in chunks of at most the size of these
+buffers.
+.Bd -literal
+#include <sys/param.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+
+static uint64_t
+dmu_physmove(dmu_buf_set_t *dbs, dmu_buf_t *db, uint64_t off, uint64_t sz)
+{
+	struct uio_bio *uio = (struct uio_bio *)dbs->dbs_dc->dc_data_buf;
+	uint64_t adv = uio->uio_resid;
+	int err;
+
+	err = uiobiomove((char *)db->db_data + off, sz, uio);
+	if (err)
+		dbs->dbs_err = err;
+	adv -= uio->uio_resid;
+
+	return (adv);
+}
+
+static int
+aio_queue_vfs(struct kaiocb *job)
+{
+	struct aiocb *cb;
+	struct file *fp;
+	struct vnode *vp;
+	struct uio_bio *ubio, ubio_local;
+	vm_prot_t prot;
+	uint32_t io_size, bio_size;
+	int error, cmd;
+	vm_offset_t page_offset;
+
+	cb = &job->uaiocb;
+	fp = job->fd_file;
+	...
+	vp = fp->f_vnode;
+
+	/*
+	 * Zero length read should always succeed
+	 * if supported.
+	 */
+	bzero(&ubio_local, sizeof(ubio_local));
+	ubio_local.uio_cmd = UIO_BIO_READ;
+	if (VOP_UBOP(vp, &ubio_local, FOF_OFFSET) == EOPNOTSUPP)
+		return (-1);
+	...
+	page_offset = ((vm_offset_t)cb->aio_buf) & PAGE_MASK;
+	cmd = cb->aio_lio_opcode == LIO_WRITE ?
+	    UIO_BIO_WRITE : UIO_BIO_READ;
+	io_size = cb->aio_nbytes + page_offset + PAGE_MASK;
+	io_size &= ~PAGE_MASK;
+	bio_size = sizeof(*ubio);
+	if (io_size <= MAXPHYS) {
+		ubio = malloc(bio_size, M_AIOS, M_WAITOK);
+		ubio->uio_ma = job->pages;
+	} else {
+		bio_size += sizeof(vm_page_t) * btoc(io_size);
+		ubio = malloc(bio_size, M_AIOS, M_WAITOK);
+		ubio->uio_ma = (vm_page_t *)(ubio + 1);
+	}
+	ubio->uio_cmd = cmd;
+	ubio->uio_error = 0;
+	ubio->uio_flags = 0;
+	ubio->uio_ma_offset = page_offset;
+	ubio->uio_offset = cb->aio_offset;
+	ubio->uio_resid = cb->aio_nbytes;
+	ubio->uio_td = curthread;
+	ubio->uio_bio_done = aio_ubiowakeup;
+	ubio->uio_arg = job;
+
+	prot = VM_PROT_READ;
+	if (cb->aio_lio_opcode == LIO_READ)
+		prot |= VM_PROT_WRITE;	/* Reading from disk writes to memory */
+	ubio->uio_ma_cnt = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+	    (vm_offset_t)cb->aio_buf, cb->aio_nbytes, prot, ubio->uio_ma,
+	    btoc(MAX(io_size, MAXPHYS)));
+	if (ubio->uio_ma_cnt == (uint16_t)-1) {
+		error = EFAULT;
+		goto err;
+	}
+
+	error = VOP_UBOP(vp, ubio, FOF_OFFSET);
+	if (error == EINPROGRESS || error == 0)
+		return (0);
+err:
+	free(ubio, M_AIOS);
+	return (error);
+}
+
+.Ed
+.\"
+.Sh SEE ALSO
+.Xr aio_read 2 ,
+.Xr aio_write 2 ,
+.Xr lio_listio 2 ,
+.Xr VOP_UBOP 9
+.Sh HISTORY
+The
+.Nm
+mechanism was introduced to support asynchronous file system requests in OpenZFS.
+.Sh AUTHORS
+This manual page was written by
+.An Matthew Macy .
Index: sys/kern/subr_uio.c =================================================================== --- sys/kern/subr_uio.c +++ sys/kern/subr_uio.c @@ -279,6 +279,72 @@ return (error); } +int +uiobiomove(void *cp, int n, struct uio_bio *uio) +{ + vm_paddr_t paddr; + int rc; + struct iovec iov[1]; + struct uio auio; + ssize_t resid, size; + int pageoff, pageidx; + + + KASSERT(uio->uio_ma_offset + n <= (uio->uio_ma_cnt << PAGE_SHIFT), + ("byte count too large offset: %ju count: %d size: %d", + uio->uio_ma_offset, n, uio->uio_ma_cnt << PAGE_SHIFT)); + + pageoff = uio->uio_ma_offset & PAGE_MASK; + pageidx = uio->uio_ma_offset >> PAGE_SHIFT; + if (pageoff) { + size = min(PAGE_SIZE - pageoff, n); + paddr = VM_PAGE_TO_PHYS(uio->uio_ma[pageidx]) + pageoff; + /* + * This is confusing because the user addresses are physical + * and the kernel addresses are virtual, so the naming is + * reversed. + */ + if (uio->uio_cmd == UIO_BIO_READ) + rc = physcopyin(cp, paddr, size); + else if (uio->uio_cmd == UIO_BIO_WRITE) + rc = physcopyout(paddr, cp, size); + else + panic("invalid command to uiobiomove: %d", uio->uio_cmd); + n -= size; + if (rc != 0) + return (rc); + pageidx++; + uio->uio_ma_offset += size; + uio->uio_offset += size; + uio->uio_resid -= size; + cp = ((char *)cp) + size; + } + iov[0].iov_base = cp; + iov[0].iov_len = n; + auio.uio_iov = iov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + resid = auio.uio_resid = uio->uio_resid; + auio.uio_segflg = UIO_SYSSPACE; + /* + * Again reversed because user is physical + */ + if (uio->uio_cmd == UIO_BIO_READ) + auio.uio_rw = UIO_WRITE; + else if (uio->uio_cmd == UIO_BIO_WRITE) + auio.uio_rw = UIO_READ; + else + panic("invalid command to uiobiomove: %d", uio->uio_cmd); + rc = uiomove_fromphys(uio->uio_ma + pageidx, 0, n, &auio); + if (rc == 0) { + size = (resid - auio.uio_resid); + uio->uio_resid -= size; + uio->uio_offset += size; + uio->uio_ma_offset += size; + } + return (rc); +} + /* * Wrapper for uiomove() that validates the 
arguments against a known-good * kernel buffer. Currently, uiomove accepts a signed (n) argument, which Index: sys/kern/vfs_aio.c =================================================================== --- sys/kern/vfs_aio.c +++ sys/kern/vfs_aio.c @@ -101,6 +101,10 @@ #define MAX_BUF_AIO 16 #endif +#ifndef MAX_VFS_XFER +#define MAX_VFS_XFER (32 * 1024 * 1024) /* 32MB - DMU_MAX_ACCESS/2 */ +#endif + FEATURE(aio, "Asynchronous I/O"); SYSCTL_DECL(_p1003_1b); @@ -319,9 +323,11 @@ struct aioliojob *lio, int type, struct aiocb_ops *ops); static int aio_queue_file(struct file *fp, struct kaiocb *job); static void aio_biowakeup(struct bio *bp); +static void aio_ubiowakeup(struct uio_bio *ubio); static void aio_proc_rundown(void *arg, struct proc *p); static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp); +static int aio_queue_vfs(struct kaiocb *job); static int aio_qbio(struct proc *p, struct kaiocb *job); static void aio_daemon(void *param); static void aio_bio_done_notify(struct proc *userp, struct kaiocb *job); @@ -1284,8 +1290,9 @@ bp->bio_caller1 = (void *)job; prot = VM_PROT_READ; + /* Reading from disk means writing to memory */ if (cb->aio_lio_opcode == LIO_READ) - prot |= VM_PROT_WRITE; /* Less backwards than it looks */ + prot |= VM_PROT_WRITE; job->npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, (vm_offset_t)cb->aio_buf, bp->bio_length, prot, job->pages, nitems(job->pages)); @@ -1327,6 +1334,95 @@ return (error); } +/* + * aio_queue_vfs works similarly to aio_qbio. It checks + * that it supports the aio operation in question and + * then if the vnode's file system support asynchronous + * requests. It then sets up the request by holding the + * user's pages with the appropriate permissions. If that + * succeeds it call VOP_UBOP. The uio_bio callback + * aio_ubiowakeup will be called when the operation completes. 
+ */ +static int +aio_queue_vfs(struct kaiocb *job) +{ + struct aiocb *cb; + struct file *fp; + struct vnode *vp; + struct uio_bio *ubio, ubio_local; + vm_prot_t prot; + uint32_t io_size, bio_size; + int error, cmd; + vm_offset_t page_offset; + + cb = &job->uaiocb; + fp = job->fd_file; + + if (!(cb->aio_lio_opcode == LIO_WRITE || + cb->aio_lio_opcode == LIO_READ)) + return (-1); + if (fp == NULL || fp->f_type != DTYPE_VNODE) + return (-1); + + vp = fp->f_vnode; + + /* + * Zero length read should always succeed + * if supported. + */ + bzero(&ubio_local, sizeof(ubio_local)); + ubio_local.uio_cmd = UIO_BIO_READ; + if (VOP_UBOP(vp, &ubio_local, FOF_OFFSET) == EOPNOTSUPP) + return (-1); + /* + * Don't punt here - XXX + */ + if (cb->aio_nbytes > MAX_VFS_XFER) + return (-1); + + page_offset = ((vm_offset_t)cb->aio_buf) & PAGE_MASK; + cmd = cb->aio_lio_opcode == LIO_WRITE ? UIO_BIO_WRITE : UIO_BIO_READ; + io_size = cb->aio_nbytes + page_offset + PAGE_MASK; + io_size &= ~PAGE_MASK; + bio_size = sizeof(*ubio); + if (io_size <= MAXPHYS) { + ubio = malloc(bio_size, M_AIOS, M_WAITOK); + ubio->uio_ma = job->pages; + } else { + bio_size += sizeof(vm_page_t )*btoc(io_size); + ubio = malloc(bio_size, M_AIOS, M_WAITOK); + ubio->uio_ma = (vm_page_t*)(ubio + 1); + } + ubio->uio_cmd = cmd; + ubio->uio_error = 0; + ubio->uio_flags = 0; + ubio->uio_ma_offset = page_offset; + ubio->uio_offset = cb->aio_offset; + ubio->uio_resid = cb->aio_nbytes; + ubio->uio_td = curthread; + ubio->uio_bio_done = aio_ubiowakeup; + ubio->uio_arg = job; + + prot = VM_PROT_READ; + /* Reading from disk means writing to memory */ + if (cb->aio_lio_opcode == LIO_READ) + prot |= VM_PROT_WRITE; + ubio->uio_ma_cnt = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + (vm_offset_t)cb->aio_buf, cb->aio_nbytes, prot, ubio->uio_ma, + btoc(MAX(io_size, MAXPHYS))); + if (ubio->uio_ma_cnt < 0) { + error = EFAULT; + goto err; + } + + error = VOP_UBOP(vp, ubio, FOF_OFFSET); + if (error == EINPROGRESS || error == 0) + 
return (0); +err: + free(ubio, M_AIOS); + return (error); +} + #ifdef COMPAT_FREEBSD6 static int convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig) @@ -1695,6 +1791,9 @@ int error; bool safe; + error = aio_queue_vfs(job); + if (error >= 0) + return (error); ki = job->userproc->p_aioinfo; error = aio_qbio(job->userproc, job); if (error >= 0) @@ -2329,6 +2428,33 @@ return (error); } +/* + * aio_ubiowakeup is the uio_bio completion callback for + * aio_queue_vfs. It just drops the hold on the pages + * from aio_queue_vfs and marks the aio as completed. + */ +static void +aio_ubiowakeup(struct uio_bio *ubio) +{ + struct kaiocb *job = (struct kaiocb *)ubio->uio_arg; + size_t nbytes; + int error; + + vm_page_unhold_pages(ubio->uio_ma, ubio->uio_ma_cnt); + + nbytes = job->uaiocb.aio_nbytes - ubio->uio_resid; + error = 0; + if (ubio->uio_flags & UIO_BIO_ERROR) + error = ubio->uio_error; + + if (error) + aio_complete(job, -1, error); + else + aio_complete(job, nbytes, 0); + + free(ubio, M_AIOS); +} + static void aio_biowakeup(struct bio *bp) { Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -783,6 +783,14 @@ }; +%% ubop vp U U U + +vop_ubop { + IN struct vnode *vp; + INOUT struct uio_bio *uio; + IN int ioflag; +}; + # The VOPs below are spares at the end of the table to allow new VOPs to be # added in stable branches without breaking the KBI. New VOPs in HEAD should # be added above these spares. When merging a new VOP to a stable branch, Index: sys/sys/_uio.h =================================================================== --- sys/sys/_uio.h +++ sys/sys/_uio.h @@ -41,6 +41,12 @@ UIO_WRITE }; +enum uio_bio_cmd { + UIO_BIO_READ, + UIO_BIO_WRITE, + UIO_BIO_SYNC +}; + /* Segment flag values. 
*/ enum uio_seg { UIO_USERSPACE, /* from user data space */ Index: sys/sys/uio.h =================================================================== --- sys/sys/uio.h +++ sys/sys/uio.h @@ -62,6 +62,26 @@ struct thread *uio_td; /* owner */ }; +#define HAVE_UBOP +enum uio_bio_flags { + UIO_BIO_ERROR = 1 << 0, + UIO_BIO_SPARSE = 2 << 0, +}; + +struct uio_bio { + uint8_t uio_cmd; /* operation */ + uint8_t uio_error; /* Errno for UIO_ERROR. */ + uint16_t uio_ma_cnt; /* length of scatter/gather list */ + uint16_t uio_flags; /* General flags */ + off_t uio_ma_offset; /* offset in to page list */ + off_t uio_offset; /* offset in target object */ + uint32_t uio_resid; /* remaining bytes to process */ + struct thread *uio_td; /* owner */ + void (*uio_bio_done)(struct uio_bio *); + void *uio_arg; + struct vm_page **uio_ma; /* user buffer's pages */ +}; + /* * Limits * @@ -92,6 +112,7 @@ int physcopyout_vlist(vm_paddr_t src, struct bus_dma_segment *dst, off_t offset, size_t len); int uiomove(void *cp, int n, struct uio *uio); +int uiobiomove(void *cp, int n, struct uio_bio *uio); int uiomove_frombuf(void *buf, int buflen, struct uio *uio); int uiomove_fromphys(struct vm_page *ma[], vm_offset_t offset, int n, struct uio *uio);