Index: head/sys/kern/sys_socket.c
===================================================================
--- head/sys/kern/sys_socket.c	(revision 300555)
+++ head/sys/kern/sys_socket.c	(revision 300556)
@@ -1,769 +1,781 @@
/*-
 * Copyright (c) 1982, 1986, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD, NULL, "socket AIO stats"); static int empty_results; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results, 0, "socket operation returned EAGAIN"); static int empty_retries; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries, 0, "socket operation retries"); static fo_rdwr_t soo_read; static fo_rdwr_t soo_write; static fo_ioctl_t soo_ioctl; static fo_poll_t soo_poll; extern fo_kqfilter_t soo_kqfilter; static fo_stat_t soo_stat; static fo_close_t soo_close; static fo_fill_kinfo_t soo_fill_kinfo; static fo_aio_queue_t soo_aio_queue; static void soo_aio_cancel(struct kaiocb *job); struct fileops socketops = { .fo_read = soo_read, .fo_write = soo_write, .fo_truncate = invfo_truncate, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = soo_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = soo_fill_kinfo, .fo_aio_queue = soo_aio_queue, .fo_flags = DFLAG_PASSABLE }; static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_receive(active_cred, so); if (error) return (error); #endif error = soreceive(so, 0, uio, 0, 0, 0); return (error); } static int soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_send(active_cred, so); if (error) return (error); #endif error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td); if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(uio->uio_td->td_proc); tdsignal(uio->uio_td, SIGPIPE); PROC_UNLOCK(uio->uio_td->td_proc); } return (error); } static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error = 0; switch (cmd) { case FIONBIO: SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; SOCK_UNLOCK(so); break; case FIOASYNC: /* * XXXRW: This code separately acquires SOCK_LOCK(so) and * SOCKBUF_LOCK(&so->so_rcv) even though they are the same * mutex to avoid introducing the assumption that they are * the same. */ if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_ASYNC; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_ASYNC; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_ASYNC; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_ASYNC; SOCKBUF_UNLOCK(&so->so_snd); } break; case FIONREAD: /* Unlocked read. */ *(int *)data = sbavail(&so->so_rcv); break; case FIONWRITE: /* Unlocked read. */ *(int *)data = sbavail(&so->so_snd); break; case FIONSPACE: /* Unlocked read. 
*/ if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) *(int *)data = 0; else *(int *)data = sbspace(&so->so_snd); break; case FIOSETOWN: error = fsetown(*(int *)data, &so->so_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&so->so_sigio); break; case SIOCSPGRP: error = fsetown(-(*(int *)data), &so->so_sigio); break; case SIOCGPGRP: *(int *)data = -fgetown(&so->so_sigio); break; case SIOCATMARK: /* Unlocked read. */ *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; break; default: /* * Interface/routing/protocol specific ioctls: interface and * routing ioctls should have a different entry since a * socket is unnecessary. */ if (IOCGROUP(cmd) == 'i') error = ifioctl(so, cmd, data, td); else if (IOCGROUP(cmd) == 'r') { CURVNET_SET(so->so_vnet); error = rtioctl_fib(cmd, data, so->so_fibnum); CURVNET_RESTORE(); } else { CURVNET_SET(so->so_vnet); error = ((*so->so_proto->pr_usrreqs->pru_control) (so, cmd, data, 0, td)); CURVNET_RESTORE(); } break; } return (error); } static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; #ifdef MAC int error; error = mac_socket_check_poll(active_cred, so); if (error) return (error); #endif return (sopoll(so, events, fp->f_cred, td)); } static int soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; struct sockbuf *sb; #ifdef MAC int error; #endif bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; #ifdef MAC error = mac_socket_check_stat(active_cred, so); if (error) return (error); #endif /* * If SBS_CANTRCVMORE is set, but there's still data left in the * receive buffer, the socket is still readable. */ sb = &so->so_rcv; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; ub->st_size = sbavail(sb) - sb->sb_ctl; SOCKBUF_UNLOCK(sb); sb = &so->so_snd; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; SOCKBUF_UNLOCK(sb); ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub); } /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the * file reference but the actual socket will not go away until the socket's * ref count hits 0. 
*/ static int soo_close(struct file *fp, struct thread *td) { int error = 0; struct socket *so; so = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; if (so) error = soclose(so); return (error); } static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr *sa; struct inpcb *inpcb; struct unpcb *unpcb; struct socket *so; int error; kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; kif->kf_sock_domain = so->so_proto->pr_domain->dom_family; kif->kf_sock_type = so->so_type; kif->kf_sock_protocol = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch (kif->kf_sock_domain) { case AF_INET: case AF_INET6: if (kif->kf_sock_protocol == IPPROTO_TCP) { if (so->so_pcb != NULL) { inpcb = (struct inpcb *)(so->so_pcb); kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)inpcb->inp_ppcb; } } break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; } } break; } error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { bcopy(sa, &kif->kf_sa_local, sa->sa_len); free(sa, M_SONAME); } error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { bcopy(sa, &kif->kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); return (0); } static STAILQ_HEAD(, task) soaio_jobs; static struct mtx soaio_jobs_lock; static struct task soaio_kproc_task; static int soaio_starting, soaio_idle, soaio_queued; static struct unrhdr *soaio_kproc_unr; static int soaio_max_procs = MAX_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0, "Maximum number of kernel processes to use for async socket IO"); static int soaio_num_procs; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0, "Number of active kernel processes for async socket IO"); static int soaio_target_procs = TARGET_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD, &soaio_target_procs, 0, "Preferred number of ready kernel processes for async socket IO"); static int soaio_lifetime; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0, "Maximum lifetime for idle aiod"); static void soaio_kproc_loop(void *arg) { struct proc *p; struct vmspace *myvm; struct task *task; int error, id, pending; id = (intptr_t)arg; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. 
	 */
	p = curproc;
	myvm = vmspace_acquire_ref(p);

	mtx_lock(&soaio_jobs_lock);
	MPASS(soaio_starting > 0);
	soaio_starting--;
	for (;;) {
		while (!STAILQ_EMPTY(&soaio_jobs)) {
			task = STAILQ_FIRST(&soaio_jobs);
			STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link);
			soaio_queued--;
			pending = task->ta_pending;
			task->ta_pending = 0;
			mtx_unlock(&soaio_jobs_lock);

			task->ta_func(task->ta_context, pending);

			mtx_lock(&soaio_jobs_lock);
		}
		MPASS(soaio_queued == 0);

		if (p->p_vmspace != myvm) {
			mtx_unlock(&soaio_jobs_lock);
			vmspace_switch_aio(myvm);
			mtx_lock(&soaio_jobs_lock);
			continue;
		}

		soaio_idle++;
		error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-",
		    soaio_lifetime);
		soaio_idle--;
		if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) &&
		    soaio_num_procs > soaio_target_procs)
			break;
	}
	soaio_num_procs--;
	mtx_unlock(&soaio_jobs_lock);
	free_unr(soaio_kproc_unr, id);
	kproc_exit(0);
}

static void
soaio_kproc_create(void *context, int pending)
{
	struct proc *p;
	int error, id;

	mtx_lock(&soaio_jobs_lock);
	for (;;) {
		if (soaio_num_procs < soaio_target_procs) {
			/* Must create */
		} else if (soaio_num_procs >= soaio_max_procs) {
			/*
			 * Hit the limit on kernel processes, don't
			 * create another one.
			 */
			break;
		} else if (soaio_queued <= soaio_idle + soaio_starting) {
			/*
			 * No more AIO jobs waiting for a process to be
			 * created, so stop.
			 */
			break;
		}
		soaio_starting++;
		mtx_unlock(&soaio_jobs_lock);

		id = alloc_unr(soaio_kproc_unr);
		error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id,
		    &p, 0, 0, "soaiod%d", id);
		if (error != 0) {
			free_unr(soaio_kproc_unr, id);
			mtx_lock(&soaio_jobs_lock);
			soaio_starting--;
			break;
		}

		mtx_lock(&soaio_jobs_lock);
		soaio_num_procs++;
	}
	mtx_unlock(&soaio_jobs_lock);
}

void
soaio_enqueue(struct task *task)
{

	mtx_lock(&soaio_jobs_lock);
	MPASS(task->ta_pending == 0);
	task->ta_pending++;
	STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link);
	soaio_queued++;
	if (soaio_queued <= soaio_idle)
		wakeup_one(&soaio_idle);
	else if (soaio_num_procs < soaio_max_procs)
		taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task);
	mtx_unlock(&soaio_jobs_lock);
}

static void
soaio_init(void)
{

	soaio_lifetime = AIOD_LIFETIME_DEFAULT;
	STAILQ_INIT(&soaio_jobs);
	mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF);
	soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL);
	TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL);
	if (soaio_target_procs > 0)
		taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task);
}
SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL);

static __inline int
soaio_ready(struct socket *so, struct sockbuf *sb)
{
	return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so));
}

static void
soaio_process_job(struct socket *so, struct sockbuf *sb, struct kaiocb *job)
{
	struct ucred *td_savedcred;
	struct thread *td;
	struct file *fp;
	struct uio uio;
	struct iovec iov;
-	size_t cnt;
+	size_t cnt, done;
	int error, flags;

	SOCKBUF_UNLOCK(sb);
	aio_switch_vmspace(job);
	td = curthread;
	fp = job->fd_file;
retry:
	td_savedcred = td->td_ucred;
	td->td_ucred = job->cred;

-	cnt = job->uaiocb.aio_nbytes;
-	iov.iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf;
+	done = job->uaiocb._aiocb_private.status;
+	cnt = job->uaiocb.aio_nbytes - done;
+	iov.iov_base = (void *)((uintptr_t)job->uaiocb.aio_buf + done);
	iov.iov_len = cnt;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = cnt;
	uio.uio_segflg = UIO_USERSPACE;
	uio.uio_td = td;
	flags = MSG_NBIO;

	/* TODO: Charge ru_msg* to job. */

	if (sb == &so->so_rcv) {
		uio.uio_rw = UIO_READ;
#ifdef MAC
		error = mac_socket_check_receive(fp->f_cred, so);
		if (error == 0)
#endif
			error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
	} else {
		uio.uio_rw = UIO_WRITE;
#ifdef MAC
		error = mac_socket_check_send(fp->f_cred, so);
		if (error == 0)
#endif
			error = sosend(so, NULL, &uio, NULL, NULL, flags, td);
		if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
	}
-	cnt -= uio.uio_resid;
+	done += cnt - uio.uio_resid;
+	job->uaiocb._aiocb_private.status = done;
	td->td_ucred = td_savedcred;

-	if (cnt != 0 && (error == ERESTART || error == EINTR ||
-	    error == EWOULDBLOCK))
-		error = 0;
	if (error == EWOULDBLOCK) {
		/*
-		 * A read() or write() on the socket raced with this
-		 * request.  If the socket is now ready, try again.
-		 * If it is not, place this request at the head of the
+		 * The request was either partially completed or not
+		 * completed at all due to racing with a read() or
+		 * write() on the socket.  If the socket is
+		 * non-blocking, return with any partial completion.
+		 * If the socket is blocking or if no progress has
+		 * been made, requeue this request at the head of the
		 * queue to try again when the socket is ready.
		 */
-		SOCKBUF_LOCK(sb);
-		empty_results++;
-		if (soaio_ready(so, sb)) {
-			empty_retries++;
-			SOCKBUF_UNLOCK(sb);
-			goto retry;
-		}
-
-		if (!aio_set_cancel_function(job, soo_aio_cancel)) {
-			MPASS(cnt == 0);
-			SOCKBUF_UNLOCK(sb);
-			aio_cancel(job);
-			SOCKBUF_LOCK(sb);
-		} else {
-			TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list);
-		}
-	} else {
-		if (error)
-			aio_complete(job, -1, error);
-		else
-			aio_complete(job, cnt, 0);
+		MPASS(done != job->uaiocb.aio_nbytes);
		SOCKBUF_LOCK(sb);
-	}
+		if (done == 0 || !(so->so_state & SS_NBIO)) {
+			empty_results++;
+			if (soaio_ready(so, sb)) {
+				empty_retries++;
+				SOCKBUF_UNLOCK(sb);
+				goto retry;
+			}
+
+			if (!aio_set_cancel_function(job, soo_aio_cancel)) {
+				SOCKBUF_UNLOCK(sb);
+				if (done != 0)
+					aio_complete(job, done, 0);
+				else
+					aio_cancel(job);
+				SOCKBUF_LOCK(sb);
+			} else {
+				TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list);
+			}
+			return;
+		}
+		SOCKBUF_UNLOCK(sb);
+	}
+	if (done != 0 && (error == ERESTART || error == EINTR ||
+	    error == EWOULDBLOCK))
+		error = 0;
+	if (error)
+		aio_complete(job, -1, error);
+	else
+		aio_complete(job, done, 0);
+	SOCKBUF_LOCK(sb);
}

static void
soaio_process_sb(struct socket *so, struct sockbuf *sb)
{
	struct kaiocb *job;

	SOCKBUF_LOCK(sb);
	while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) {
		job = TAILQ_FIRST(&sb->sb_aiojobq);
		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;
		soaio_process_job(so, sb, job);
	}

	/*
	 * If there are still pending requests, the socket must not be
	 * ready so set SB_AIO to request a wakeup when the socket
	 * becomes ready.
	 */
	if (!TAILQ_EMPTY(&sb->sb_aiojobq))
		sb->sb_flags |= SB_AIO;
	sb->sb_flags &= ~SB_AIO_RUNNING;
	SOCKBUF_UNLOCK(sb);

	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sorele(so);
}

void
soaio_rcv(void *context, int pending)
{
	struct socket *so;

	so = context;
	soaio_process_sb(so, &so->so_rcv);
}

void
soaio_snd(void *context, int pending)
{
	struct socket *so;

	so = context;
	soaio_process_sb(so, &so->so_snd);
}

void
sowakeup_aio(struct socket *so, struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);
	sb->sb_flags &= ~SB_AIO;
	if (sb->sb_flags & SB_AIO_RUNNING)
		return;
	sb->sb_flags |= SB_AIO_RUNNING;
	if (sb == &so->so_snd)
		SOCK_LOCK(so);
	soref(so);
	if (sb == &so->so_snd)
		SOCK_UNLOCK(so);
	soaio_enqueue(&sb->sb_aiotask);
}

static void
soo_aio_cancel(struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	int opcode;

	so = job->fd_file->f_data;
	opcode = job->uaiocb.aio_lio_opcode;
	if (opcode == LIO_READ)
		sb = &so->so_rcv;
	else {
		MPASS(opcode == LIO_WRITE);
		sb = &so->so_snd;
	}

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
	if (TAILQ_EMPTY(&sb->sb_aiojobq))
		sb->sb_flags &= ~SB_AIO;
	SOCKBUF_UNLOCK(sb);

	aio_cancel(job);
}

static int
soo_aio_queue(struct file *fp, struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	int error;

	so = fp->f_data;
	error = (*so->so_proto->pr_usrreqs->pru_aio_queue)(so, job);
	if (error == 0)
		return (0);

	switch (job->uaiocb.aio_lio_opcode) {
	case LIO_READ:
		sb = &so->so_rcv;
		break;
	case LIO_WRITE:
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	if (!aio_set_cancel_function(job, soo_aio_cancel))
		panic("new job was cancelled");
	TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list);
+	job->uaiocb._aiocb_private.status = 0;
	if (!(sb->sb_flags & SB_AIO_RUNNING)) {
		if (soaio_ready(so, sb))
			sowakeup_aio(so, sb);
		else
			sb->sb_flags |= SB_AIO;
	}
	SOCKBUF_UNLOCK(sb);
	return (0);
}

Index: head/tests/sys/aio/aio_test.c
===================================================================
--- head/tests/sys/aio/aio_test.c	(revision 300555)
+++ head/tests/sys/aio/aio_test.c	(revision 300556)
@@ -1,797 +1,862 @@
/*-
 * Copyright (c) 2004 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Regression test to do some very basic AIO exercising on several types of
 * file descriptors.
 * Currently, the tests consist of initializing a fixed size buffer with
 * pseudo-random data, writing it to one fd using AIO, then reading it from
 * a second descriptor using AIO.  For some targets, the same fd is used for
 * write and read (i.e., file, md device), but for others the operation is
 * performed on a peer (pty, socket, fifo, etc).  A timeout is initiated to
 * detect undue blocking.  This test does not attempt to exercise error
 * cases or more subtle asynchronous behavior, just to make sure that the
 * basic operations work on some basic object types.
 */

#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
#include "freebsd_test_suite/macros.h"
#include "local.h"

#define PATH_TEMPLATE	"aio.XXXXXXXXXX"

/*
 * GLOBAL_MAX sets the largest usable buffer size to be read and written, as
 * it sizes ac_buffer in the aio_context structure.  It is also the default
 * size for file I/O.  For other types, we use smaller blocks or we risk
 * blocking (and we run in a single process/thread so that would be bad).
 */
#define GLOBAL_MAX	16384

#define BUFFER_MAX	GLOBAL_MAX
struct aio_context {
	int	ac_read_fd, ac_write_fd;
	long	ac_seed;
	char	ac_buffer[GLOBAL_MAX];
	int	ac_buflen;
	int	ac_seconds;
	void	(*ac_cleanup)(void *arg);
	void	*ac_cleanup_arg;
};

static int	aio_timedout;

/*
 * Each test run specifies a timeout in seconds.  Use the somewhat obsoleted
 * signal(3) and alarm(3) APIs to set this up.
 */
static void
aio_timeout_signal(int sig __unused)
{

	aio_timedout = 1;
}

static void
aio_timeout_start(int seconds)
{

	aio_timedout = 0;
	ATF_REQUIRE_MSG(signal(SIGALRM, aio_timeout_signal) != SIG_ERR,
	    "failed to set SIGALRM handler: %s", strerror(errno));
	alarm(seconds);
}

static void
aio_timeout_stop(void)
{

	ATF_REQUIRE_MSG(signal(SIGALRM, NULL) != SIG_ERR,
	    "failed to reset SIGALRM handler to default: %s", strerror(errno));
	alarm(0);
}

/*
 * Fill a buffer given a seed that can be fed into srandom() to initialize
 * the PRNG in a repeatable manner.
 */
static void
aio_fill_buffer(char *buffer, int len, long seed)
{
	char ch;
	int i;

	srandom(seed);
	for (i = 0; i < len; i++) {
		ch = random() & 0xff;
		buffer[i] = ch;
	}
}

/*
 * Test that a buffer matches a given seed.  See aio_fill_buffer().  Return
 * (1) on a match, (0) on a mismatch.
 */
static int
aio_test_buffer(char *buffer, int len, long seed)
{
	char ch;
	int i;

	srandom(seed);
	for (i = 0; i < len; i++) {
		ch = random() & 0xff;
		if (buffer[i] != ch)
			return (0);
	}
	return (1);
}

/*
 * Initialize a testing context given the file descriptors provided by the
 * test setup.
 */
static void
aio_context_init(struct aio_context *ac, int read_fd,
    int write_fd, int buflen, int seconds, void (*cleanup)(void *),
    void *cleanup_arg)
{

	ATF_REQUIRE_MSG(buflen <= BUFFER_MAX,
	    "aio_context_init: buffer too large (%d > %d)",
	    buflen, BUFFER_MAX);
	bzero(ac, sizeof(*ac));
	ac->ac_read_fd = read_fd;
	ac->ac_write_fd = write_fd;
	ac->ac_buflen = buflen;
	srandomdev();
	ac->ac_seed = random();
	aio_fill_buffer(ac->ac_buffer, buflen, ac->ac_seed);
	ATF_REQUIRE_MSG(aio_test_buffer(ac->ac_buffer, buflen,
	    ac->ac_seed) != 0, "aio_test_buffer: internal error");
	ac->ac_seconds = seconds;
	ac->ac_cleanup = cleanup;
	ac->ac_cleanup_arg = cleanup_arg;
}

/*
 * Each tester can register a callback to clean up in the event the test
 * fails.  Preserve the value of errno so that subsequent calls to errx()
 * work properly.
 */
static void
aio_cleanup(struct aio_context *ac)
{
	int error;

	if (ac->ac_cleanup == NULL)
		return;
	error = errno;
	(ac->ac_cleanup)(ac->ac_cleanup_arg);
	errno = error;
}

/*
 * Perform a simple write test of our initialized data buffer to the provided
 * file descriptor.
 */
static void
aio_write_test(struct aio_context *ac)
{
	struct aiocb aio, *aiop;
	ssize_t len;

	ATF_REQUIRE_KERNEL_MODULE("aio");

	bzero(&aio, sizeof(aio));
	aio.aio_buf = ac->ac_buffer;
	aio.aio_nbytes = ac->ac_buflen;
	aio.aio_fildes = ac->ac_write_fd;
	aio.aio_offset = 0;

	aio_timeout_start(ac->ac_seconds);

	if (aio_write(&aio) < 0) {
		if (errno == EINTR) {
			if (aio_timedout) {
				aio_cleanup(ac);
				atf_tc_fail("aio_write timed out");
			}
		}
		aio_cleanup(ac);
		atf_tc_fail("aio_write failed: %s", strerror(errno));
	}

	len = aio_waitcomplete(&aiop, NULL);
	if (len < 0) {
		if (errno == EINTR) {
			if (aio_timedout) {
				aio_cleanup(ac);
				atf_tc_fail("aio_waitcomplete timed out");
			}
		}
		aio_cleanup(ac);
		atf_tc_fail("aio_waitcomplete failed: %s", strerror(errno));
	}

	aio_timeout_stop();

	if (len != ac->ac_buflen) {
		aio_cleanup(ac);
		atf_tc_fail("aio_waitcomplete short write (%jd)",
		    (intmax_t)len);
	}
}

/*
 * Perform a simple read test of our initialized data buffer from the
 * provided file descriptor.
 */
static void
aio_read_test(struct aio_context *ac)
{
	struct aiocb aio, *aiop;
	ssize_t len;

	ATF_REQUIRE_KERNEL_MODULE("aio");

	bzero(ac->ac_buffer, ac->ac_buflen);
	bzero(&aio, sizeof(aio));
	aio.aio_buf = ac->ac_buffer;
	aio.aio_nbytes = ac->ac_buflen;
	aio.aio_fildes = ac->ac_read_fd;
	aio.aio_offset = 0;

	aio_timeout_start(ac->ac_seconds);

	if (aio_read(&aio) < 0) {
		if (errno == EINTR) {
			if (aio_timedout) {
				aio_cleanup(ac);
				atf_tc_fail("aio_read timed out");
			}
		}
		aio_cleanup(ac);
		atf_tc_fail("aio_read failed: %s", strerror(errno));
	}

	len = aio_waitcomplete(&aiop, NULL);
	if (len < 0) {
		if (errno == EINTR) {
			if (aio_timedout) {
				aio_cleanup(ac);
				atf_tc_fail("aio_waitcomplete timed out");
			}
		}
		aio_cleanup(ac);
		atf_tc_fail("aio_waitcomplete failed: %s", strerror(errno));
	}

	aio_timeout_stop();

	if (len != ac->ac_buflen) {
		aio_cleanup(ac);
		atf_tc_fail("aio_waitcomplete short read (%jd)",
		    (intmax_t)len);
	}

	if (aio_test_buffer(ac->ac_buffer, ac->ac_buflen, ac->ac_seed) == 0) {
		aio_cleanup(ac);
		atf_tc_fail("buffer mismatched");
	}
}

/*
 * Series of type-specific tests for AIO.  For now, we just make sure we can
 * issue a write and then a read to each type.  We assume that once a write
 * is issued, a read can follow.  (A hypothetical sketch of wiring up one
 * more type with these helpers appears further below, after the pty
 * defines.)
 */

/*
 * Test with a classic file.  Assumes we can create a moderate size temporary
 * file.
 */
*/ struct aio_file_arg { int afa_fd; char *afa_pathname; }; static void aio_file_cleanup(void *arg) { struct aio_file_arg *afa; afa = arg; close(afa->afa_fd); unlink(afa->afa_pathname); } #define FILE_LEN GLOBAL_MAX #define FILE_TIMEOUT 30 ATF_TC_WITHOUT_HEAD(aio_file_test); ATF_TC_BODY(aio_file_test, tc) { char pathname[PATH_MAX]; struct aio_file_arg arg; struct aio_context ac; int fd; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); strcpy(pathname, PATH_TEMPLATE); fd = mkstemp(pathname); ATF_REQUIRE_MSG(fd != -1, "mkstemp failed: %s", strerror(errno)); arg.afa_fd = fd; arg.afa_pathname = pathname; aio_context_init(&ac, fd, fd, FILE_LEN, FILE_TIMEOUT, aio_file_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_file_cleanup(&arg); } struct aio_fifo_arg { int afa_read_fd; int afa_write_fd; char *afa_pathname; }; static void aio_fifo_cleanup(void *arg) { struct aio_fifo_arg *afa; afa = arg; if (afa->afa_read_fd != -1) close(afa->afa_read_fd); if (afa->afa_write_fd != -1) close(afa->afa_write_fd); unlink(afa->afa_pathname); } #define FIFO_LEN 256 #define FIFO_TIMEOUT 30 ATF_TC_WITHOUT_HEAD(aio_fifo_test); ATF_TC_BODY(aio_fifo_test, tc) { int error, read_fd = -1, write_fd = -1; struct aio_fifo_arg arg; char pathname[PATH_MAX]; struct aio_context ac; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); /* * In theory, mkstemp() can return a name that is then collided with. * Because this is a regression test, we treat that as a test failure * rather than retrying. */ strcpy(pathname, PATH_TEMPLATE); ATF_REQUIRE_MSG(mkstemp(pathname) != -1, "mkstemp failed: %s", strerror(errno)); ATF_REQUIRE_MSG(unlink(pathname) == 0, "unlink failed: %s", strerror(errno)); ATF_REQUIRE_MSG(mkfifo(pathname, 0600) != -1, "mkfifo failed: %s", strerror(errno)); arg.afa_pathname = pathname; arg.afa_read_fd = -1; arg.afa_write_fd = -1; read_fd = open(pathname, O_RDONLY | O_NONBLOCK); if (read_fd == -1) { error = errno; aio_fifo_cleanup(&arg); errno = error; atf_tc_fail("read_fd open failed: %s", strerror(errno)); } arg.afa_read_fd = read_fd; write_fd = open(pathname, O_WRONLY); if (write_fd == -1) { error = errno; aio_fifo_cleanup(&arg); errno = error; atf_tc_fail("write_fd open failed: %s", strerror(errno)); } arg.afa_write_fd = write_fd; aio_context_init(&ac, read_fd, write_fd, FIFO_LEN, FIFO_TIMEOUT, aio_fifo_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_fifo_cleanup(&arg); } struct aio_unix_socketpair_arg { int asa_sockets[2]; }; static void aio_unix_socketpair_cleanup(void *arg) { struct aio_unix_socketpair_arg *asa; asa = arg; close(asa->asa_sockets[0]); close(asa->asa_sockets[1]); } #define UNIX_SOCKETPAIR_LEN 256 #define UNIX_SOCKETPAIR_TIMEOUT 30 ATF_TC_WITHOUT_HEAD(aio_unix_socketpair_test); ATF_TC_BODY(aio_unix_socketpair_test, tc) { struct aio_unix_socketpair_arg arg; struct aio_context ac; int sockets[2]; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_MSG(socketpair(PF_UNIX, SOCK_STREAM, 0, sockets) != -1, "socketpair failed: %s", strerror(errno)); arg.asa_sockets[0] = sockets[0]; arg.asa_sockets[1] = sockets[1]; aio_context_init(&ac, sockets[0], sockets[1], UNIX_SOCKETPAIR_LEN, UNIX_SOCKETPAIR_TIMEOUT, aio_unix_socketpair_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_unix_socketpair_cleanup(&arg); } struct aio_pty_arg { int apa_read_fd; int apa_write_fd; }; static void aio_pty_cleanup(void *arg) { struct aio_pty_arg *apa; apa = arg; close(apa->apa_read_fd); close(apa->apa_write_fd); }; #define PTY_LEN 256 #define PTY_TIMEOUT 30 
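/*
 * Editor's sketch (not part of this change): the helpers above reduce a new
 * descriptor-type test to a few lines.  The example below is hypothetical:
 * the test name is invented, it reuses struct aio_unix_socketpair_arg and
 * aio_unix_socketpair_cleanup from the stream-socket test, and it assumes
 * that a single UNIX_SOCKETPAIR_LEN-byte datagram round-trips through a
 * SOCK_DGRAM pair the same way the stream data does.  It would also need
 * its own ATF_TP_ADD_TC() entry before it would run.
 */
ATF_TC_WITHOUT_HEAD(aio_unix_dgram_socketpair_test);
ATF_TC_BODY(aio_unix_dgram_socketpair_test, tc)
{
	struct aio_unix_socketpair_arg arg;
	struct aio_context ac;
	int sockets[2];

	ATF_REQUIRE_KERNEL_MODULE("aio");

	/* A connected datagram pair; each write becomes one datagram. */
	ATF_REQUIRE_MSG(socketpair(PF_UNIX, SOCK_DGRAM, 0, sockets) != -1,
	    "socketpair failed: %s", strerror(errno));
	arg.asa_sockets[0] = sockets[0];
	arg.asa_sockets[1] = sockets[1];
	aio_context_init(&ac, sockets[0], sockets[1], UNIX_SOCKETPAIR_LEN,
	    UNIX_SOCKETPAIR_TIMEOUT, aio_unix_socketpair_cleanup, &arg);
	aio_write_test(&ac);
	aio_read_test(&ac);

	aio_unix_socketpair_cleanup(&arg);
}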
ATF_TC_WITHOUT_HEAD(aio_pty_test); ATF_TC_BODY(aio_pty_test, tc) { struct aio_pty_arg arg; struct aio_context ac; int read_fd, write_fd; struct termios ts; int error; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); ATF_REQUIRE_MSG(openpty(&read_fd, &write_fd, NULL, NULL, NULL) == 0, "openpty failed: %s", strerror(errno)); arg.apa_read_fd = read_fd; arg.apa_write_fd = write_fd; if (tcgetattr(write_fd, &ts) < 0) { error = errno; aio_pty_cleanup(&arg); errno = error; atf_tc_fail("tcgetattr failed: %s", strerror(errno)); } cfmakeraw(&ts); if (tcsetattr(write_fd, TCSANOW, &ts) < 0) { error = errno; aio_pty_cleanup(&arg); errno = error; atf_tc_fail("tcsetattr failed: %s", strerror(errno)); } aio_context_init(&ac, read_fd, write_fd, PTY_LEN, PTY_TIMEOUT, aio_pty_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_pty_cleanup(&arg); } static void aio_pipe_cleanup(void *arg) { int *pipes = arg; close(pipes[0]); close(pipes[1]); } #define PIPE_LEN 256 #define PIPE_TIMEOUT 30 ATF_TC_WITHOUT_HEAD(aio_pipe_test); ATF_TC_BODY(aio_pipe_test, tc) { struct aio_context ac; int pipes[2]; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); ATF_REQUIRE_MSG(pipe(pipes) != -1, "pipe failed: %s", strerror(errno)); aio_context_init(&ac, pipes[0], pipes[1], PIPE_LEN, PIPE_TIMEOUT, aio_pipe_cleanup, pipes); aio_write_test(&ac); aio_read_test(&ac); aio_pipe_cleanup(pipes); } struct aio_md_arg { int ama_mdctl_fd; int ama_unit; int ama_fd; }; static void aio_md_cleanup(void *arg) { struct aio_md_arg *ama; struct md_ioctl mdio; int error; ama = arg; if (ama->ama_fd != -1) close(ama->ama_fd); if (ama->ama_unit != -1) { bzero(&mdio, sizeof(mdio)); mdio.md_version = MDIOVERSION; mdio.md_unit = ama->ama_unit; if (ioctl(ama->ama_mdctl_fd, MDIOCDETACH, &mdio) == -1) { error = errno; close(ama->ama_mdctl_fd); errno = error; atf_tc_fail("ioctl MDIOCDETACH failed: %s", strerror(errno)); } } close(ama->ama_mdctl_fd); } #define MD_LEN GLOBAL_MAX #define MD_TIMEOUT 30 ATF_TC(aio_md_test); ATF_TC_HEAD(aio_md_test, tc) { atf_tc_set_md_var(tc, "require.user", "root"); } ATF_TC_BODY(aio_md_test, tc) { int error, fd, mdctl_fd, unit; char pathname[PATH_MAX]; struct aio_md_arg arg; struct aio_context ac; struct md_ioctl mdio; ATF_REQUIRE_KERNEL_MODULE("aio"); mdctl_fd = open("/dev/" MDCTL_NAME, O_RDWR, 0); ATF_REQUIRE_MSG(mdctl_fd != -1, "opening /dev/%s failed: %s", MDCTL_NAME, strerror(errno)); bzero(&mdio, sizeof(mdio)); mdio.md_version = MDIOVERSION; mdio.md_type = MD_MALLOC; mdio.md_options = MD_AUTOUNIT | MD_COMPRESS; mdio.md_mediasize = GLOBAL_MAX; mdio.md_sectorsize = 512; arg.ama_mdctl_fd = mdctl_fd; arg.ama_unit = -1; arg.ama_fd = -1; if (ioctl(mdctl_fd, MDIOCATTACH, &mdio) < 0) { error = errno; aio_md_cleanup(&arg); errno = error; atf_tc_fail("ioctl MDIOCATTACH failed: %s", strerror(errno)); } arg.ama_unit = unit = mdio.md_unit; snprintf(pathname, PATH_MAX, "/dev/md%d", unit); fd = open(pathname, O_RDWR); ATF_REQUIRE_MSG(fd != -1, "opening %s failed: %s", pathname, strerror(errno)); arg.ama_fd = fd; aio_context_init(&ac, fd, fd, MD_LEN, MD_TIMEOUT, aio_md_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_md_cleanup(&arg); } ATF_TC_WITHOUT_HEAD(aio_large_read_test); ATF_TC_BODY(aio_large_read_test, tc) { char pathname[PATH_MAX]; struct aiocb cb, *cbp; ssize_t nread; size_t len; int fd; #ifdef __LP64__ int clamped; #endif ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); #ifdef __LP64__ len = sizeof(clamped); if (sysctlbyname("debug.iosize_max_clamp", &clamped, &len, NULL, 0) == 
-1)
		atf_libc_error(errno,
		    "Failed to read debug.iosize_max_clamp");
#endif

	/* Determine the maximum supported read(2) size. */
	len = SSIZE_MAX;
#ifdef __LP64__
	if (clamped)
		len = INT_MAX;
#endif

	strcpy(pathname, PATH_TEMPLATE);
	fd = mkstemp(pathname);
	ATF_REQUIRE_MSG(fd != -1, "mkstemp failed: %s", strerror(errno));

	unlink(pathname);

	memset(&cb, 0, sizeof(cb));
	cb.aio_nbytes = len;
	cb.aio_fildes = fd;
	cb.aio_buf = NULL;
	if (aio_read(&cb) == -1)
		atf_tc_fail("aio_read() of maximum read size failed: %s",
		    strerror(errno));

	nread = aio_waitcomplete(&cbp, NULL);
	if (nread == -1)
		atf_tc_fail("aio_waitcomplete() failed: %s", strerror(errno));
	if (nread != 0)
		atf_tc_fail("aio_read() from empty file returned data: %zd",
		    nread);

	memset(&cb, 0, sizeof(cb));
	cb.aio_nbytes = len + 1;
	cb.aio_fildes = fd;
	cb.aio_buf = NULL;
	if (aio_read(&cb) == -1) {
		if (errno == EINVAL)
			goto finished;
		atf_tc_fail("aio_read() of too large read size failed: %s",
		    strerror(errno));
	}

	nread = aio_waitcomplete(&cbp, NULL);
	if (nread == -1) {
		if (errno == EINVAL)
			goto finished;
		atf_tc_fail("aio_waitcomplete() failed: %s", strerror(errno));
	}
	atf_tc_fail("aio_read() of too large read size returned: %zd",
	    nread);

finished:
	close(fd);
}

/*
 * This tests for a bug where arriving socket data can wake up multiple
 * AIO read requests, resulting in an uncancellable request.
 */
ATF_TC_WITHOUT_HEAD(aio_socket_two_reads);
ATF_TC_BODY(aio_socket_two_reads, tc)
{
	struct ioreq {
		struct aiocb iocb;
		char buffer[1024];
	} ioreq[2];
	struct aiocb *iocb;
	unsigned i;
	int s[2];
	char c;

	ATF_REQUIRE_KERNEL_MODULE("aio");
#if __FreeBSD_version < 1100101
	atf_tc_skip("kernel version %d is too old (%d required)",
	    __FreeBSD_version, 1100101);
#endif

	ATF_REQUIRE(socketpair(PF_UNIX, SOCK_STREAM, 0, s) != -1);

	/* Queue two read requests. */
	memset(&ioreq, 0, sizeof(ioreq));
	for (i = 0; i < nitems(ioreq); i++) {
		ioreq[i].iocb.aio_nbytes = sizeof(ioreq[i].buffer);
		ioreq[i].iocb.aio_fildes = s[0];
		ioreq[i].iocb.aio_buf = ioreq[i].buffer;
		ATF_REQUIRE(aio_read(&ioreq[i].iocb) == 0);
	}

	/* Send a single byte.  This should complete one request. */
	c = 0xc3;
	ATF_REQUIRE(write(s[1], &c, sizeof(c)) == 1);

	ATF_REQUIRE(aio_waitcomplete(&iocb, NULL) == 1);

	/* Determine which request completed and verify the data was read. */
	if (iocb == &ioreq[0].iocb)
		i = 0;
	else
		i = 1;
	ATF_REQUIRE(ioreq[i].buffer[0] == c);

	i ^= 1;

	/*
	 * Try to cancel the other request.  On broken systems this
	 * will fail and the process will hang on exit.
	 */
	ATF_REQUIRE(aio_error(&ioreq[i].iocb) == EINPROGRESS);
	ATF_REQUIRE(aio_cancel(s[0], &ioreq[i].iocb) == AIO_CANCELED);

	close(s[1]);
	close(s[0]);
}

+/*
+ * This test ensures that aio_write() on a blocking socket of a "large"
+ * buffer does not return a short completion.
+ */
+ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write);
+ATF_TC_BODY(aio_socket_blocking_short_write, tc)
+{
+	struct aiocb iocb, *iocbp;
+	char *buffer[2];
+	ssize_t done;
+	int buffer_size, sb_size;
+	socklen_t len;
+	int s[2];
+
+	ATF_REQUIRE_KERNEL_MODULE("aio");
+
+	ATF_REQUIRE(socketpair(PF_UNIX, SOCK_STREAM, 0, s) != -1);
+
+	len = sizeof(sb_size);
+	ATF_REQUIRE(getsockopt(s[0], SOL_SOCKET, SO_RCVBUF, &sb_size, &len) !=
+	    -1);
+	ATF_REQUIRE(len == sizeof(sb_size));
+	buffer_size = sb_size;
+
+	ATF_REQUIRE(getsockopt(s[1], SOL_SOCKET, SO_SNDBUF, &sb_size, &len) !=
+	    -1);
+	ATF_REQUIRE(len == sizeof(sb_size));
+	if (sb_size > buffer_size)
+		buffer_size = sb_size;
+
+	/*
+	 * Use twice the size of the MAX(receive buffer, send buffer)
+	 * to ensure that the write is split up into multiple writes
+	 * internally.
+	 */
+	buffer_size *= 2;
+
+	buffer[0] = malloc(buffer_size);
+	ATF_REQUIRE(buffer[0] != NULL);
+	buffer[1] = malloc(buffer_size);
+	ATF_REQUIRE(buffer[1] != NULL);
+
+	srandomdev();
+	aio_fill_buffer(buffer[1], buffer_size, random());
+
+	memset(&iocb, 0, sizeof(iocb));
+	iocb.aio_fildes = s[1];
+	iocb.aio_buf = buffer[1];
+	iocb.aio_nbytes = buffer_size;
+	ATF_REQUIRE(aio_write(&iocb) == 0);
+
+	done = recv(s[0], buffer[0], buffer_size, MSG_WAITALL);
+	ATF_REQUIRE(done == buffer_size);
+
+	done = aio_waitcomplete(&iocbp, NULL);
+	ATF_REQUIRE(iocbp == &iocb);
+	ATF_REQUIRE(done == buffer_size);
+
+	ATF_REQUIRE(memcmp(buffer[0], buffer[1], buffer_size) == 0);
+
+	close(s[1]);
+	close(s[0]);
+}
+
 ATF_TP_ADD_TCS(tp)
 {

	ATF_TP_ADD_TC(tp, aio_file_test);
	ATF_TP_ADD_TC(tp, aio_fifo_test);
	ATF_TP_ADD_TC(tp, aio_unix_socketpair_test);
	ATF_TP_ADD_TC(tp, aio_pty_test);
	ATF_TP_ADD_TC(tp, aio_pipe_test);
	ATF_TP_ADD_TC(tp, aio_md_test);
	ATF_TP_ADD_TC(tp, aio_large_read_test);
	ATF_TP_ADD_TC(tp, aio_socket_two_reads);
+	ATF_TP_ADD_TC(tp, aio_socket_blocking_short_write);

	return (atf_no_error());
}
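The new test above exercises the blocking-socket half of this change. For the
non-blocking half, the updated soaio_process_job() now completes a request
short once some progress has been made and the socket would block, instead of
requeueing it until the whole transfer drains. The following standalone
program is an editor's sketch of that behavior, not part of this commit: the
exact completion size depends on socket-buffer sizing and timing, and
aio_waitcomplete() is FreeBSD-specific.

#include <sys/types.h>
#include <sys/socket.h>

#include <aio.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	struct aiocb iocb, *iocbp;
	char *buf;
	ssize_t done;
	socklen_t optlen;
	int flags, sb_size, s[2];

	if (socketpair(PF_UNIX, SOCK_STREAM, 0, s) == -1)
		err(1, "socketpair");

	/* Mark the writing side non-blocking so the kernel sets SS_NBIO. */
	flags = fcntl(s[1], F_GETFL);
	if (flags == -1 || fcntl(s[1], F_SETFL, flags | O_NONBLOCK) == -1)
		err(1, "fcntl");

	optlen = sizeof(sb_size);
	if (getsockopt(s[1], SOL_SOCKET, SO_SNDBUF, &sb_size, &optlen) == -1)
		err(1, "getsockopt");

	/* Queue a write of twice the send buffer; the peer never reads. */
	buf = calloc(2, sb_size);
	if (buf == NULL)
		err(1, "calloc");
	memset(&iocb, 0, sizeof(iocb));
	iocb.aio_fildes = s[1];
	iocb.aio_buf = buf;
	iocb.aio_nbytes = (size_t)sb_size * 2;
	if (aio_write(&iocb) == -1)
		err(1, "aio_write");

	/*
	 * With this change, the request is expected to complete short
	 * (roughly sb_size bytes) once the socket would block, because
	 * progress was made (done != 0) and SS_NBIO is set; previously
	 * it stayed queued until the peer drained the socket buffer.
	 */
	done = aio_waitcomplete(&iocbp, NULL);
	if (done == -1)
		err(1, "aio_waitcomplete");
	printf("completed %zd of %zd bytes\n", done, (ssize_t)sb_size * 2);

	free(buf);
	return (0);
}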