Index: head/sys/kern/sys_socket.c
===================================================================
--- head/sys/kern/sys_socket.c	(revision 300555)
+++ head/sys/kern/sys_socket.c	(revision 300556)
@@ -1,769 +1,781 @@
/*-
 * Copyright (c) 1982, 1986, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD, NULL, "socket AIO stats"); static int empty_results; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results, 0, "socket operation returned EAGAIN"); static int empty_retries; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries, 0, "socket operation retries"); static fo_rdwr_t soo_read; static fo_rdwr_t soo_write; static fo_ioctl_t soo_ioctl; static fo_poll_t soo_poll; extern fo_kqfilter_t soo_kqfilter; static fo_stat_t soo_stat; static fo_close_t soo_close; static fo_fill_kinfo_t soo_fill_kinfo; static fo_aio_queue_t soo_aio_queue; static void soo_aio_cancel(struct kaiocb *job); struct fileops socketops = { .fo_read = soo_read, .fo_write = soo_write, .fo_truncate = invfo_truncate, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = soo_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = soo_fill_kinfo, .fo_aio_queue = soo_aio_queue, .fo_flags = DFLAG_PASSABLE }; static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_receive(active_cred, so); if (error) return (error); #endif error = soreceive(so, 0, uio, 0, 0, 0); return (error); } static int soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_send(active_cred, so); if (error) return (error); #endif error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td); if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { PROC_LOCK(uio->uio_td->td_proc); tdsignal(uio->uio_td, SIGPIPE); PROC_UNLOCK(uio->uio_td->td_proc); } return (error); } static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error = 0; switch (cmd) { case FIONBIO: SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; SOCK_UNLOCK(so); break; case FIOASYNC: /* * XXXRW: This code separately acquires SOCK_LOCK(so) and * SOCKBUF_LOCK(&so->so_rcv) even though they are the same * mutex to avoid introducing the assumption that they are * the same. */ if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_ASYNC; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_ASYNC; SOCKBUF_UNLOCK(&so->so_snd); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; SOCK_UNLOCK(so); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_ASYNC; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_ASYNC; SOCKBUF_UNLOCK(&so->so_snd); } break; case FIONREAD: /* Unlocked read. */ *(int *)data = sbavail(&so->so_rcv); break; case FIONWRITE: /* Unlocked read. */ *(int *)data = sbavail(&so->so_snd); break; case FIONSPACE: /* Unlocked read. 
*/ if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) *(int *)data = 0; else *(int *)data = sbspace(&so->so_snd); break; case FIOSETOWN: error = fsetown(*(int *)data, &so->so_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&so->so_sigio); break; case SIOCSPGRP: error = fsetown(-(*(int *)data), &so->so_sigio); break; case SIOCGPGRP: *(int *)data = -fgetown(&so->so_sigio); break; case SIOCATMARK: /* Unlocked read. */ *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; break; default: /* * Interface/routing/protocol specific ioctls: interface and * routing ioctls should have a different entry since a * socket is unnecessary. */ if (IOCGROUP(cmd) == 'i') error = ifioctl(so, cmd, data, td); else if (IOCGROUP(cmd) == 'r') { CURVNET_SET(so->so_vnet); error = rtioctl_fib(cmd, data, so->so_fibnum); CURVNET_RESTORE(); } else { CURVNET_SET(so->so_vnet); error = ((*so->so_proto->pr_usrreqs->pru_control) (so, cmd, data, 0, td)); CURVNET_RESTORE(); } break; } return (error); } static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; #ifdef MAC int error; error = mac_socket_check_poll(active_cred, so); if (error) return (error); #endif return (sopoll(so, events, fp->f_cred, td)); } static int soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; struct sockbuf *sb; #ifdef MAC int error; #endif bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; #ifdef MAC error = mac_socket_check_stat(active_cred, so); if (error) return (error); #endif /* * If SBS_CANTRCVMORE is set, but there's still data left in the * receive buffer, the socket is still readable. */ sb = &so->so_rcv; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; ub->st_size = sbavail(sb) - sb->sb_ctl; SOCKBUF_UNLOCK(sb); sb = &so->so_snd; SOCKBUF_LOCK(sb); if ((sb->sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; SOCKBUF_UNLOCK(sb); ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub); } /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the * file reference but the actual socket will not go away until the socket's * ref count hits 0. 
*/ static int soo_close(struct file *fp, struct thread *td) { int error = 0; struct socket *so; so = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; if (so) error = soclose(so); return (error); } static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr *sa; struct inpcb *inpcb; struct unpcb *unpcb; struct socket *so; int error; kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; kif->kf_sock_domain = so->so_proto->pr_domain->dom_family; kif->kf_sock_type = so->so_type; kif->kf_sock_protocol = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch (kif->kf_sock_domain) { case AF_INET: case AF_INET6: if (kif->kf_sock_protocol == IPPROTO_TCP) { if (so->so_pcb != NULL) { inpcb = (struct inpcb *)(so->so_pcb); kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)inpcb->inp_ppcb; } } break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; } } break; } error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) { bcopy(sa, &kif->kf_sa_local, sa->sa_len); free(sa, M_SONAME); } error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa); if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) { bcopy(sa, &kif->kf_sa_peer, sa->sa_len); free(sa, M_SONAME); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); return (0); } static STAILQ_HEAD(, task) soaio_jobs; static struct mtx soaio_jobs_lock; static struct task soaio_kproc_task; static int soaio_starting, soaio_idle, soaio_queued; static struct unrhdr *soaio_kproc_unr; static int soaio_max_procs = MAX_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0, "Maximum number of kernel processes to use for async socket IO"); static int soaio_num_procs; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0, "Number of active kernel processes for async socket IO"); static int soaio_target_procs = TARGET_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD, &soaio_target_procs, 0, "Preferred number of ready kernel processes for async socket IO"); static int soaio_lifetime; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0, "Maximum lifetime for idle aiod"); static void soaio_kproc_loop(void *arg) { struct proc *p; struct vmspace *myvm; struct task *task; int error, id, pending; id = (intptr_t)arg; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. 
	 */
	p = curproc;
	myvm = vmspace_acquire_ref(p);

	mtx_lock(&soaio_jobs_lock);
	MPASS(soaio_starting > 0);
	soaio_starting--;
	for (;;) {
		while (!STAILQ_EMPTY(&soaio_jobs)) {
			task = STAILQ_FIRST(&soaio_jobs);
			STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link);
			soaio_queued--;
			pending = task->ta_pending;
			task->ta_pending = 0;
			mtx_unlock(&soaio_jobs_lock);

			task->ta_func(task->ta_context, pending);

			mtx_lock(&soaio_jobs_lock);
		}
		MPASS(soaio_queued == 0);

		if (p->p_vmspace != myvm) {
			mtx_unlock(&soaio_jobs_lock);
			vmspace_switch_aio(myvm);
			mtx_lock(&soaio_jobs_lock);
			continue;
		}

		soaio_idle++;
		error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-",
		    soaio_lifetime);
		soaio_idle--;
		if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) &&
		    soaio_num_procs > soaio_target_procs)
			break;
	}
	soaio_num_procs--;
	mtx_unlock(&soaio_jobs_lock);
	free_unr(soaio_kproc_unr, id);
	kproc_exit(0);
}

static void
soaio_kproc_create(void *context, int pending)
{
	struct proc *p;
	int error, id;

	mtx_lock(&soaio_jobs_lock);
	for (;;) {
		if (soaio_num_procs < soaio_target_procs) {
			/* Must create */
		} else if (soaio_num_procs >= soaio_max_procs) {
			/*
			 * Hit the limit on kernel processes, don't
			 * create another one.
			 */
			break;
		} else if (soaio_queued <= soaio_idle + soaio_starting) {
			/*
			 * No more AIO jobs waiting for a process to be
			 * created, so stop.
			 */
			break;
		}
		soaio_starting++;
		mtx_unlock(&soaio_jobs_lock);

		id = alloc_unr(soaio_kproc_unr);
		error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id,
		    &p, 0, 0, "soaiod%d", id);
		if (error != 0) {
			free_unr(soaio_kproc_unr, id);
			mtx_lock(&soaio_jobs_lock);
			soaio_starting--;
			break;
		}

		mtx_lock(&soaio_jobs_lock);
		soaio_num_procs++;
	}
	mtx_unlock(&soaio_jobs_lock);
}

void
soaio_enqueue(struct task *task)
{

	mtx_lock(&soaio_jobs_lock);
	MPASS(task->ta_pending == 0);
	task->ta_pending++;
	STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link);
	soaio_queued++;
	if (soaio_queued <= soaio_idle)
		wakeup_one(&soaio_idle);
	else if (soaio_num_procs < soaio_max_procs)
		taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task);
	mtx_unlock(&soaio_jobs_lock);
}

static void
soaio_init(void)
{

	soaio_lifetime = AIOD_LIFETIME_DEFAULT;
	STAILQ_INIT(&soaio_jobs);
	mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF);
	soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL);
	TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL);
	if (soaio_target_procs > 0)
		taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task);
}
SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL);

static __inline int
soaio_ready(struct socket *so, struct sockbuf *sb)
{
	return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so));
}

static void
soaio_process_job(struct socket *so, struct sockbuf *sb, struct kaiocb *job)
{
	struct ucred *td_savedcred;
	struct thread *td;
	struct file *fp;
	struct uio uio;
	struct iovec iov;
-	size_t cnt;
+	size_t cnt, done;
	int error, flags;

	SOCKBUF_UNLOCK(sb);
	aio_switch_vmspace(job);
	td = curthread;
	fp = job->fd_file;
retry:
	td_savedcred = td->td_ucred;
	td->td_ucred = job->cred;

-	cnt = job->uaiocb.aio_nbytes;
-	iov.iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf;
+	done = job->uaiocb._aiocb_private.status;
+	cnt = job->uaiocb.aio_nbytes - done;
+	iov.iov_base = (void *)((uintptr_t)job->uaiocb.aio_buf + done);
	iov.iov_len = cnt;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = 0;
	uio.uio_resid = cnt;
	uio.uio_segflg = UIO_USERSPACE;
	uio.uio_td = td;
	flags = MSG_NBIO;

	/* TODO: Charge ru_msg* to job. */

	if (sb == &so->so_rcv) {
		uio.uio_rw = UIO_READ;
#ifdef MAC
		error = mac_socket_check_receive(fp->f_cred, so);
		if (error == 0)
#endif
			error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
	} else {
		uio.uio_rw = UIO_WRITE;
#ifdef MAC
		error = mac_socket_check_send(fp->f_cred, so);
		if (error == 0)
#endif
			error = sosend(so, NULL, &uio, NULL, NULL, flags, td);
		if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
			PROC_LOCK(job->userproc);
			kern_psignal(job->userproc, SIGPIPE);
			PROC_UNLOCK(job->userproc);
		}
	}
-	cnt -= uio.uio_resid;
+	done += cnt - uio.uio_resid;
+	job->uaiocb._aiocb_private.status = done;
	td->td_ucred = td_savedcred;

-	if (cnt != 0 && (error == ERESTART || error == EINTR ||
-	    error == EWOULDBLOCK))
-		error = 0;
	if (error == EWOULDBLOCK) {
		/*
-		 * A read() or write() on the socket raced with this
-		 * request.  If the socket is now ready, try again.
-		 * If it is not, place this request at the head of the
+		 * The request was either partially completed or not
+		 * completed at all due to racing with a read() or
+		 * write() on the socket.  If the socket is
+		 * non-blocking, return with any partial completion.
+		 * If the socket is blocking or if no progress has
+		 * been made, requeue this request at the head of the
		 * queue to try again when the socket is ready.
		 */
-		SOCKBUF_LOCK(sb);
-		empty_results++;
-		if (soaio_ready(so, sb)) {
-			empty_retries++;
-			SOCKBUF_UNLOCK(sb);
-			goto retry;
-		}
-
-		if (!aio_set_cancel_function(job, soo_aio_cancel)) {
-			MPASS(cnt == 0);
-			SOCKBUF_UNLOCK(sb);
-			aio_cancel(job);
-			SOCKBUF_LOCK(sb);
-		} else {
-			TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list);
-		}
-	} else {
-		if (error)
-			aio_complete(job, -1, error);
-		else
-			aio_complete(job, cnt, 0);
+		MPASS(done != job->uaiocb.aio_nbytes);
		SOCKBUF_LOCK(sb);
-	}
+		if (done == 0 || !(so->so_state & SS_NBIO)) {
+			empty_results++;
+			if (soaio_ready(so, sb)) {
+				empty_retries++;
+				SOCKBUF_UNLOCK(sb);
+				goto retry;
+			}
+
+			if (!aio_set_cancel_function(job, soo_aio_cancel)) {
+				SOCKBUF_UNLOCK(sb);
+				if (done != 0)
+					aio_complete(job, done, 0);
+				else
+					aio_cancel(job);
+				SOCKBUF_LOCK(sb);
+			} else {
+				TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list);
+			}
+			return;
+		}
+		SOCKBUF_UNLOCK(sb);
+	}
+	if (done != 0 && (error == ERESTART || error == EINTR ||
+	    error == EWOULDBLOCK))
+		error = 0;
+	if (error)
+		aio_complete(job, -1, error);
+	else
+		aio_complete(job, done, 0);
+	SOCKBUF_LOCK(sb);
}

static void
soaio_process_sb(struct socket *so, struct sockbuf *sb)
{
	struct kaiocb *job;

	SOCKBUF_LOCK(sb);
	while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) {
		job = TAILQ_FIRST(&sb->sb_aiojobq);
		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
		if (!aio_clear_cancel_function(job))
			continue;
		soaio_process_job(so, sb, job);
	}

	/*
	 * If there are still pending requests, the socket must not be
	 * ready so set SB_AIO to request a wakeup when the socket
	 * becomes ready.
	 */
	if (!TAILQ_EMPTY(&sb->sb_aiojobq))
		sb->sb_flags |= SB_AIO;
	sb->sb_flags &= ~SB_AIO_RUNNING;
	SOCKBUF_UNLOCK(sb);

	ACCEPT_LOCK();
	SOCK_LOCK(so);
	sorele(so);
}

void
soaio_rcv(void *context, int pending)
{
	struct socket *so;

	so = context;
	soaio_process_sb(so, &so->so_rcv);
}

void
soaio_snd(void *context, int pending)
{
	struct socket *so;

	so = context;
	soaio_process_sb(so, &so->so_snd);
}

void
sowakeup_aio(struct socket *so, struct sockbuf *sb)
{

	SOCKBUF_LOCK_ASSERT(sb);
	sb->sb_flags &= ~SB_AIO;
	if (sb->sb_flags & SB_AIO_RUNNING)
		return;
	sb->sb_flags |= SB_AIO_RUNNING;
	if (sb == &so->so_snd)
		SOCK_LOCK(so);
	soref(so);
	if (sb == &so->so_snd)
		SOCK_UNLOCK(so);
	soaio_enqueue(&sb->sb_aiotask);
}

static void
soo_aio_cancel(struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	int opcode;

	so = job->fd_file->f_data;
	opcode = job->uaiocb.aio_lio_opcode;
	if (opcode == LIO_READ)
		sb = &so->so_rcv;
	else {
		MPASS(opcode == LIO_WRITE);
		sb = &so->so_snd;
	}

	SOCKBUF_LOCK(sb);
	if (!aio_cancel_cleared(job))
		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
	if (TAILQ_EMPTY(&sb->sb_aiojobq))
		sb->sb_flags &= ~SB_AIO;
	SOCKBUF_UNLOCK(sb);

	aio_cancel(job);
}

static int
soo_aio_queue(struct file *fp, struct kaiocb *job)
{
	struct socket *so;
	struct sockbuf *sb;
	int error;

	so = fp->f_data;
	error = (*so->so_proto->pr_usrreqs->pru_aio_queue)(so, job);
	if (error == 0)
		return (0);

	switch (job->uaiocb.aio_lio_opcode) {
	case LIO_READ:
		sb = &so->so_rcv;
		break;
	case LIO_WRITE:
		sb = &so->so_snd;
		break;
	default:
		return (EINVAL);
	}

	SOCKBUF_LOCK(sb);
	if (!aio_set_cancel_function(job, soo_aio_cancel))
		panic("new job was cancelled");
	TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list);
+	job->uaiocb._aiocb_private.status = 0;
	if (!(sb->sb_flags & SB_AIO_RUNNING)) {
		if (soaio_ready(so, sb))
			sowakeup_aio(so, sb);
		else
			sb->sb_flags |= SB_AIO;
	}
	SOCKBUF_UNLOCK(sb);
	return (0);
}

Index: head/tests/sys/aio/aio_test.c
===================================================================
--- head/tests/sys/aio/aio_test.c	(revision 300555)
+++ head/tests/sys/aio/aio_test.c	(revision 300556)
@@ -1,797 +1,862 @@
/*-
 * Copyright (c) 2004 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Regression test to do some very basic AIO exercising on several types of
 * file descriptors.
 * Currently, the tests consist of initializing a fixed size buffer with
 * pseudo-random data, writing it to one fd using AIO, then reading it from
 * a second descriptor using AIO.  For some targets, the same fd is used for
 * write and read (i.e., file, md device), but for others the operation is
 * performed on a peer (pty, socket, fifo, etc).  A timeout is initiated to
 * detect undue blocking.  This test does not attempt to exercise error
 * cases or more subtle asynchronous behavior, just to make sure that the
 * basic operations work on some basic object types.
 */

#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
#include "freebsd_test_suite/macros.h"
#include "local.h"

#define PATH_TEMPLATE	"aio.XXXXXXXXXX"

/*
 * GLOBAL_MAX sets the largest usable buffer size to be read and written, as
 * it sizes ac_buffer in the aio_context structure.  It is also the default
 * size for file I/O.  For other types, we use smaller blocks or we risk
 * blocking (and we run in a single process/thread so that would be bad).
 */
#define GLOBAL_MAX	16384

#define BUFFER_MAX	GLOBAL_MAX
struct aio_context {
	int	ac_read_fd, ac_write_fd;
	long	ac_seed;
	char	ac_buffer[GLOBAL_MAX];
	int	ac_buflen;
	int	ac_seconds;
	void	(*ac_cleanup)(void *arg);
	void	*ac_cleanup_arg;
};

static int	aio_timedout;

/*
 * Each test run specifies a timeout in seconds.  Use the somewhat obsoleted
 * signal(3) and alarm(3) APIs to set this up.
 */
static void
aio_timeout_signal(int sig __unused)
{

	aio_timedout = 1;
}

static void
aio_timeout_start(int seconds)
{

	aio_timedout = 0;
	ATF_REQUIRE_MSG(signal(SIGALRM, aio_timeout_signal) != SIG_ERR,
	    "failed to set SIGALRM handler: %s", strerror(errno));
	alarm(seconds);
}

static void
aio_timeout_stop(void)
{

	ATF_REQUIRE_MSG(signal(SIGALRM, NULL) != SIG_ERR,
	    "failed to reset SIGALRM handler to default: %s", strerror(errno));
	alarm(0);
}

/*
 * Fill a buffer given a seed that can be fed into srandom() to initialize
 * the PRNG in a repeatable manner.
 */
static void
aio_fill_buffer(char *buffer, int len, long seed)
{
	char ch;
	int i;

	srandom(seed);
	for (i = 0; i < len; i++) {
		ch = random() & 0xff;
		buffer[i] = ch;
	}
}

/*
 * Test that a buffer matches a given seed.  See aio_fill_buffer().  Return
 * (1) on a match, (0) on a mismatch.
 */
static int
aio_test_buffer(char *buffer, int len, long seed)
{
	char ch;
	int i;

	srandom(seed);
	for (i = 0; i < len; i++) {
		ch = random() & 0xff;
		if (buffer[i] != ch)
			return (0);
	}
	return (1);
}

/*
 * Initialize a testing context given the file descriptors provided by the
 * test setup.
 */
static void
aio_context_init(struct aio_context *ac, int read_fd,
    int write_fd, int buflen, int seconds, void (*cleanup)(void *),
    void *cleanup_arg)
{

	ATF_REQUIRE_MSG(buflen <= BUFFER_MAX,
	    "aio_context_init: buffer too large (%d > %d)",
	    buflen, BUFFER_MAX);
	bzero(ac, sizeof(*ac));
	ac->ac_read_fd = read_fd;
	ac->ac_write_fd = write_fd;
	ac->ac_buflen = buflen;
	srandomdev();
	ac->ac_seed = random();
	aio_fill_buffer(ac->ac_buffer, buflen, ac->ac_seed);
	ATF_REQUIRE_MSG(aio_test_buffer(ac->ac_buffer, buflen,
	    ac->ac_seed) != 0, "aio_test_buffer: internal error");
	ac->ac_seconds = seconds;
	ac->ac_cleanup = cleanup;
	ac->ac_cleanup_arg = cleanup_arg;
}

/*
 * Each tester can register a callback to clean up in the event the test
 * fails.  Preserve the value of errno so that subsequent calls to errx()
 * work properly.
 */
static void
aio_cleanup(struct aio_context *ac)
{
	int error;

	if (ac->ac_cleanup == NULL)
		return;
	error = errno;
	(ac->ac_cleanup)(ac->ac_cleanup_arg);
	errno = error;
}

/*
 * Perform a simple write test of our initialized data buffer to the provided
 * file descriptor.
 */
static void
aio_write_test(struct aio_context *ac)
{
	struct aiocb aio, *aiop;
	ssize_t len;

	ATF_REQUIRE_KERNEL_MODULE("aio");

	bzero(&aio, sizeof(aio));
	aio.aio_buf = ac->ac_buffer;
	aio.aio_nbytes = ac->ac_buflen;
	aio.aio_fildes = ac->ac_write_fd;
	aio.aio_offset = 0;

	aio_timeout_start(ac->ac_seconds);

	if (aio_write(&aio) < 0) {
		if (errno == EINTR) {
			if (aio_timedout) {
				aio_cleanup(ac);
				atf_tc_fail("aio_write timed out");
			}
		}
		aio_cleanup(ac);
		atf_tc_fail("aio_write failed: %s", strerror(errno));
	}

	len = aio_waitcomplete(&aiop, NULL);
	if (len < 0) {
		if (errno == EINTR) {
			if (aio_timedout) {
				aio_cleanup(ac);
				atf_tc_fail("aio_waitcomplete timed out");
			}
		}
		aio_cleanup(ac);
		atf_tc_fail("aio_waitcomplete failed: %s", strerror(errno));
	}

	aio_timeout_stop();

	if (len != ac->ac_buflen) {
		aio_cleanup(ac);
		atf_tc_fail("aio_waitcomplete short write (%jd)",
		    (intmax_t)len);
	}
}

/*
 * Perform a simple read test of our initialized data buffer from the
 * provided file descriptor.
 */
static void
aio_read_test(struct aio_context *ac)
{
	struct aiocb aio, *aiop;
	ssize_t len;

	ATF_REQUIRE_KERNEL_MODULE("aio");

	bzero(ac->ac_buffer, ac->ac_buflen);
	bzero(&aio, sizeof(aio));
	aio.aio_buf = ac->ac_buffer;
	aio.aio_nbytes = ac->ac_buflen;
	aio.aio_fildes = ac->ac_read_fd;
	aio.aio_offset = 0;

	aio_timeout_start(ac->ac_seconds);

	if (aio_read(&aio) < 0) {
		if (errno == EINTR) {
			if (aio_timedout) {
				aio_cleanup(ac);
				atf_tc_fail("aio_read timed out");
			}
		}
		aio_cleanup(ac);
		atf_tc_fail("aio_read failed: %s", strerror(errno));
	}

	len = aio_waitcomplete(&aiop, NULL);
	if (len < 0) {
		if (errno == EINTR) {
			if (aio_timedout) {
				aio_cleanup(ac);
				atf_tc_fail("aio_waitcomplete timed out");
			}
		}
		aio_cleanup(ac);
		atf_tc_fail("aio_waitcomplete failed: %s", strerror(errno));
	}

	aio_timeout_stop();

	if (len != ac->ac_buflen) {
		aio_cleanup(ac);
		atf_tc_fail("aio_waitcomplete short read (%jd)",
		    (intmax_t)len);
	}

	if (aio_test_buffer(ac->ac_buffer, ac->ac_buflen, ac->ac_seed) == 0) {
		aio_cleanup(ac);
		atf_tc_fail("buffer mismatched");
	}
}

/*
 * Series of type-specific tests for AIO.  For now, we just make sure we can
 * issue a write and then a read to each type.  We assume that once a write
 * is issued, a read can follow.  (A hypothetical sketch of wiring up one
 * more type with these helpers appears further below, after the pty
 * defines.)
 */

/*
 * Test with a classic file.  Assumes we can create a moderate size temporary
 * file.
 */
*/ struct aio_file_arg { int afa_fd; char *afa_pathname; }; static void aio_file_cleanup(void *arg) { struct aio_file_arg *afa; afa = arg; close(afa->afa_fd); unlink(afa->afa_pathname); } #define FILE_LEN GLOBAL_MAX #define FILE_TIMEOUT 30 ATF_TC_WITHOUT_HEAD(aio_file_test); ATF_TC_BODY(aio_file_test, tc) { char pathname[PATH_MAX]; struct aio_file_arg arg; struct aio_context ac; int fd; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); strcpy(pathname, PATH_TEMPLATE); fd = mkstemp(pathname); ATF_REQUIRE_MSG(fd != -1, "mkstemp failed: %s", strerror(errno)); arg.afa_fd = fd; arg.afa_pathname = pathname; aio_context_init(&ac, fd, fd, FILE_LEN, FILE_TIMEOUT, aio_file_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_file_cleanup(&arg); } struct aio_fifo_arg { int afa_read_fd; int afa_write_fd; char *afa_pathname; }; static void aio_fifo_cleanup(void *arg) { struct aio_fifo_arg *afa; afa = arg; if (afa->afa_read_fd != -1) close(afa->afa_read_fd); if (afa->afa_write_fd != -1) close(afa->afa_write_fd); unlink(afa->afa_pathname); } #define FIFO_LEN 256 #define FIFO_TIMEOUT 30 ATF_TC_WITHOUT_HEAD(aio_fifo_test); ATF_TC_BODY(aio_fifo_test, tc) { int error, read_fd = -1, write_fd = -1; struct aio_fifo_arg arg; char pathname[PATH_MAX]; struct aio_context ac; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); /* * In theory, mkstemp() can return a name that is then collided with. * Because this is a regression test, we treat that as a test failure * rather than retrying. */ strcpy(pathname, PATH_TEMPLATE); ATF_REQUIRE_MSG(mkstemp(pathname) != -1, "mkstemp failed: %s", strerror(errno)); ATF_REQUIRE_MSG(unlink(pathname) == 0, "unlink failed: %s", strerror(errno)); ATF_REQUIRE_MSG(mkfifo(pathname, 0600) != -1, "mkfifo failed: %s", strerror(errno)); arg.afa_pathname = pathname; arg.afa_read_fd = -1; arg.afa_write_fd = -1; read_fd = open(pathname, O_RDONLY | O_NONBLOCK); if (read_fd == -1) { error = errno; aio_fifo_cleanup(&arg); errno = error; atf_tc_fail("read_fd open failed: %s", strerror(errno)); } arg.afa_read_fd = read_fd; write_fd = open(pathname, O_WRONLY); if (write_fd == -1) { error = errno; aio_fifo_cleanup(&arg); errno = error; atf_tc_fail("write_fd open failed: %s", strerror(errno)); } arg.afa_write_fd = write_fd; aio_context_init(&ac, read_fd, write_fd, FIFO_LEN, FIFO_TIMEOUT, aio_fifo_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_fifo_cleanup(&arg); } struct aio_unix_socketpair_arg { int asa_sockets[2]; }; static void aio_unix_socketpair_cleanup(void *arg) { struct aio_unix_socketpair_arg *asa; asa = arg; close(asa->asa_sockets[0]); close(asa->asa_sockets[1]); } #define UNIX_SOCKETPAIR_LEN 256 #define UNIX_SOCKETPAIR_TIMEOUT 30 ATF_TC_WITHOUT_HEAD(aio_unix_socketpair_test); ATF_TC_BODY(aio_unix_socketpair_test, tc) { struct aio_unix_socketpair_arg arg; struct aio_context ac; int sockets[2]; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_MSG(socketpair(PF_UNIX, SOCK_STREAM, 0, sockets) != -1, "socketpair failed: %s", strerror(errno)); arg.asa_sockets[0] = sockets[0]; arg.asa_sockets[1] = sockets[1]; aio_context_init(&ac, sockets[0], sockets[1], UNIX_SOCKETPAIR_LEN, UNIX_SOCKETPAIR_TIMEOUT, aio_unix_socketpair_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_unix_socketpair_cleanup(&arg); } struct aio_pty_arg { int apa_read_fd; int apa_write_fd; }; static void aio_pty_cleanup(void *arg) { struct aio_pty_arg *apa; apa = arg; close(apa->apa_read_fd); close(apa->apa_write_fd); }; #define PTY_LEN 256 #define PTY_TIMEOUT 30 
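/*
 * Editor's sketch (not part of this change): the helpers above reduce a new
 * descriptor-type test to a few lines.  The example below is hypothetical:
 * the test name is invented, it reuses struct aio_unix_socketpair_arg and
 * aio_unix_socketpair_cleanup from the stream-socket test, and it assumes
 * that a single UNIX_SOCKETPAIR_LEN-byte datagram round-trips through a
 * SOCK_DGRAM pair the same way the stream data does.  It would also need
 * its own ATF_TP_ADD_TC() entry before it would run.
 */
ATF_TC_WITHOUT_HEAD(aio_unix_dgram_socketpair_test);
ATF_TC_BODY(aio_unix_dgram_socketpair_test, tc)
{
	struct aio_unix_socketpair_arg arg;
	struct aio_context ac;
	int sockets[2];

	ATF_REQUIRE_KERNEL_MODULE("aio");

	/* A connected datagram pair; each write becomes one datagram. */
	ATF_REQUIRE_MSG(socketpair(PF_UNIX, SOCK_DGRAM, 0, sockets) != -1,
	    "socketpair failed: %s", strerror(errno));
	arg.asa_sockets[0] = sockets[0];
	arg.asa_sockets[1] = sockets[1];
	aio_context_init(&ac, sockets[0], sockets[1], UNIX_SOCKETPAIR_LEN,
	    UNIX_SOCKETPAIR_TIMEOUT, aio_unix_socketpair_cleanup, &arg);
	aio_write_test(&ac);
	aio_read_test(&ac);

	aio_unix_socketpair_cleanup(&arg);
}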
ATF_TC_WITHOUT_HEAD(aio_pty_test); ATF_TC_BODY(aio_pty_test, tc) { struct aio_pty_arg arg; struct aio_context ac; int read_fd, write_fd; struct termios ts; int error; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); ATF_REQUIRE_MSG(openpty(&read_fd, &write_fd, NULL, NULL, NULL) == 0, "openpty failed: %s", strerror(errno)); arg.apa_read_fd = read_fd; arg.apa_write_fd = write_fd; if (tcgetattr(write_fd, &ts) < 0) { error = errno; aio_pty_cleanup(&arg); errno = error; atf_tc_fail("tcgetattr failed: %s", strerror(errno)); } cfmakeraw(&ts); if (tcsetattr(write_fd, TCSANOW, &ts) < 0) { error = errno; aio_pty_cleanup(&arg); errno = error; atf_tc_fail("tcsetattr failed: %s", strerror(errno)); } aio_context_init(&ac, read_fd, write_fd, PTY_LEN, PTY_TIMEOUT, aio_pty_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_pty_cleanup(&arg); } static void aio_pipe_cleanup(void *arg) { int *pipes = arg; close(pipes[0]); close(pipes[1]); } #define PIPE_LEN 256 #define PIPE_TIMEOUT 30 ATF_TC_WITHOUT_HEAD(aio_pipe_test); ATF_TC_BODY(aio_pipe_test, tc) { struct aio_context ac; int pipes[2]; ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); ATF_REQUIRE_MSG(pipe(pipes) != -1, "pipe failed: %s", strerror(errno)); aio_context_init(&ac, pipes[0], pipes[1], PIPE_LEN, PIPE_TIMEOUT, aio_pipe_cleanup, pipes); aio_write_test(&ac); aio_read_test(&ac); aio_pipe_cleanup(pipes); } struct aio_md_arg { int ama_mdctl_fd; int ama_unit; int ama_fd; }; static void aio_md_cleanup(void *arg) { struct aio_md_arg *ama; struct md_ioctl mdio; int error; ama = arg; if (ama->ama_fd != -1) close(ama->ama_fd); if (ama->ama_unit != -1) { bzero(&mdio, sizeof(mdio)); mdio.md_version = MDIOVERSION; mdio.md_unit = ama->ama_unit; if (ioctl(ama->ama_mdctl_fd, MDIOCDETACH, &mdio) == -1) { error = errno; close(ama->ama_mdctl_fd); errno = error; atf_tc_fail("ioctl MDIOCDETACH failed: %s", strerror(errno)); } } close(ama->ama_mdctl_fd); } #define MD_LEN GLOBAL_MAX #define MD_TIMEOUT 30 ATF_TC(aio_md_test); ATF_TC_HEAD(aio_md_test, tc) { atf_tc_set_md_var(tc, "require.user", "root"); } ATF_TC_BODY(aio_md_test, tc) { int error, fd, mdctl_fd, unit; char pathname[PATH_MAX]; struct aio_md_arg arg; struct aio_context ac; struct md_ioctl mdio; ATF_REQUIRE_KERNEL_MODULE("aio"); mdctl_fd = open("/dev/" MDCTL_NAME, O_RDWR, 0); ATF_REQUIRE_MSG(mdctl_fd != -1, "opening /dev/%s failed: %s", MDCTL_NAME, strerror(errno)); bzero(&mdio, sizeof(mdio)); mdio.md_version = MDIOVERSION; mdio.md_type = MD_MALLOC; mdio.md_options = MD_AUTOUNIT | MD_COMPRESS; mdio.md_mediasize = GLOBAL_MAX; mdio.md_sectorsize = 512; arg.ama_mdctl_fd = mdctl_fd; arg.ama_unit = -1; arg.ama_fd = -1; if (ioctl(mdctl_fd, MDIOCATTACH, &mdio) < 0) { error = errno; aio_md_cleanup(&arg); errno = error; atf_tc_fail("ioctl MDIOCATTACH failed: %s", strerror(errno)); } arg.ama_unit = unit = mdio.md_unit; snprintf(pathname, PATH_MAX, "/dev/md%d", unit); fd = open(pathname, O_RDWR); ATF_REQUIRE_MSG(fd != -1, "opening %s failed: %s", pathname, strerror(errno)); arg.ama_fd = fd; aio_context_init(&ac, fd, fd, MD_LEN, MD_TIMEOUT, aio_md_cleanup, &arg); aio_write_test(&ac); aio_read_test(&ac); aio_md_cleanup(&arg); } ATF_TC_WITHOUT_HEAD(aio_large_read_test); ATF_TC_BODY(aio_large_read_test, tc) { char pathname[PATH_MAX]; struct aiocb cb, *cbp; ssize_t nread; size_t len; int fd; #ifdef __LP64__ int clamped; #endif ATF_REQUIRE_KERNEL_MODULE("aio"); ATF_REQUIRE_UNSAFE_AIO(); #ifdef __LP64__ len = sizeof(clamped); if (sysctlbyname("debug.iosize_max_clamp", &clamped, &len, NULL, 0) == 
-1)
		atf_libc_error(errno,
		    "Failed to read debug.iosize_max_clamp");
#endif

	/* Determine the maximum supported read(2) size. */
	len = SSIZE_MAX;
#ifdef __LP64__
	if (clamped)
		len = INT_MAX;
#endif

	strcpy(pathname, PATH_TEMPLATE);
	fd = mkstemp(pathname);
	ATF_REQUIRE_MSG(fd != -1, "mkstemp failed: %s", strerror(errno));

	unlink(pathname);

	memset(&cb, 0, sizeof(cb));
	cb.aio_nbytes = len;
	cb.aio_fildes = fd;
	cb.aio_buf = NULL;
	if (aio_read(&cb) == -1)
		atf_tc_fail("aio_read() of maximum read size failed: %s",
		    strerror(errno));

	nread = aio_waitcomplete(&cbp, NULL);
	if (nread == -1)
		atf_tc_fail("aio_waitcomplete() failed: %s", strerror(errno));
	if (nread != 0)
		atf_tc_fail("aio_read() from empty file returned data: %zd",
		    nread);

	memset(&cb, 0, sizeof(cb));
	cb.aio_nbytes = len + 1;
	cb.aio_fildes = fd;
	cb.aio_buf = NULL;
	if (aio_read(&cb) == -1) {
		if (errno == EINVAL)
			goto finished;
		atf_tc_fail("aio_read() of too large read size failed: %s",
		    strerror(errno));
	}

	nread = aio_waitcomplete(&cbp, NULL);
	if (nread == -1) {
		if (errno == EINVAL)
			goto finished;
		atf_tc_fail("aio_waitcomplete() failed: %s", strerror(errno));
	}
	atf_tc_fail("aio_read() of too large read size returned: %zd",
	    nread);

finished:
	close(fd);
}

/*
 * This tests for a bug where arriving socket data can wake up multiple
 * AIO read requests, resulting in an uncancellable request.
 */
ATF_TC_WITHOUT_HEAD(aio_socket_two_reads);
ATF_TC_BODY(aio_socket_two_reads, tc)
{
	struct ioreq {
		struct aiocb iocb;
		char buffer[1024];
	} ioreq[2];
	struct aiocb *iocb;
	unsigned i;
	int s[2];
	char c;

	ATF_REQUIRE_KERNEL_MODULE("aio");
#if __FreeBSD_version < 1100101
	atf_tc_skip("kernel version %d is too old (%d required)",
	    __FreeBSD_version, 1100101);
#endif

	ATF_REQUIRE(socketpair(PF_UNIX, SOCK_STREAM, 0, s) != -1);

	/* Queue two read requests. */
	memset(&ioreq, 0, sizeof(ioreq));
	for (i = 0; i < nitems(ioreq); i++) {
		ioreq[i].iocb.aio_nbytes = sizeof(ioreq[i].buffer);
		ioreq[i].iocb.aio_fildes = s[0];
		ioreq[i].iocb.aio_buf = ioreq[i].buffer;
		ATF_REQUIRE(aio_read(&ioreq[i].iocb) == 0);
	}

	/* Send a single byte.  This should complete one request. */
	c = 0xc3;
	ATF_REQUIRE(write(s[1], &c, sizeof(c)) == 1);

	ATF_REQUIRE(aio_waitcomplete(&iocb, NULL) == 1);

	/* Determine which request completed and verify the data was read. */
	if (iocb == &ioreq[0].iocb)
		i = 0;
	else
		i = 1;
	ATF_REQUIRE(ioreq[i].buffer[0] == c);

	i ^= 1;

	/*
	 * Try to cancel the other request.  On broken systems this
	 * will fail and the process will hang on exit.
	 */
	ATF_REQUIRE(aio_error(&ioreq[i].iocb) == EINPROGRESS);
	ATF_REQUIRE(aio_cancel(s[0], &ioreq[i].iocb) == AIO_CANCELED);

	close(s[1]);
	close(s[0]);
}

+/*
+ * This test ensures that aio_write() on a blocking socket of a "large"
+ * buffer does not return a short completion.
+ */
+ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write);
+ATF_TC_BODY(aio_socket_blocking_short_write, tc)
+{
+	struct aiocb iocb, *iocbp;
+	char *buffer[2];
+	ssize_t done;
+	int buffer_size, sb_size;
+	socklen_t len;
+	int s[2];
+
+	ATF_REQUIRE_KERNEL_MODULE("aio");
+
+	ATF_REQUIRE(socketpair(PF_UNIX, SOCK_STREAM, 0, s) != -1);
+
+	len = sizeof(sb_size);
+	ATF_REQUIRE(getsockopt(s[0], SOL_SOCKET, SO_RCVBUF, &sb_size, &len) !=
+	    -1);
+	ATF_REQUIRE(len == sizeof(sb_size));
+	buffer_size = sb_size;
+
+	ATF_REQUIRE(getsockopt(s[1], SOL_SOCKET, SO_SNDBUF, &sb_size, &len) !=
+	    -1);
+	ATF_REQUIRE(len == sizeof(sb_size));
+	if (sb_size > buffer_size)
+		buffer_size = sb_size;
+
+	/*
+	 * Use twice the size of the MAX(receive buffer, send buffer)
+	 * to ensure that the write is split up into multiple writes
+	 * internally.
+	 */
+	buffer_size *= 2;
+
+	buffer[0] = malloc(buffer_size);
+	ATF_REQUIRE(buffer[0] != NULL);
+	buffer[1] = malloc(buffer_size);
+	ATF_REQUIRE(buffer[1] != NULL);
+
+	srandomdev();
+	aio_fill_buffer(buffer[1], buffer_size, random());
+
+	memset(&iocb, 0, sizeof(iocb));
+	iocb.aio_fildes = s[1];
+	iocb.aio_buf = buffer[1];
+	iocb.aio_nbytes = buffer_size;
+	ATF_REQUIRE(aio_write(&iocb) == 0);
+
+	done = recv(s[0], buffer[0], buffer_size, MSG_WAITALL);
+	ATF_REQUIRE(done == buffer_size);
+
+	done = aio_waitcomplete(&iocbp, NULL);
+	ATF_REQUIRE(iocbp == &iocb);
+	ATF_REQUIRE(done == buffer_size);
+
+	ATF_REQUIRE(memcmp(buffer[0], buffer[1], buffer_size) == 0);
+
+	close(s[1]);
+	close(s[0]);
+}
+
 ATF_TP_ADD_TCS(tp)
 {

	ATF_TP_ADD_TC(tp, aio_file_test);
	ATF_TP_ADD_TC(tp, aio_fifo_test);
	ATF_TP_ADD_TC(tp, aio_unix_socketpair_test);
	ATF_TP_ADD_TC(tp, aio_pty_test);
	ATF_TP_ADD_TC(tp, aio_pipe_test);
	ATF_TP_ADD_TC(tp, aio_md_test);
	ATF_TP_ADD_TC(tp, aio_large_read_test);
	ATF_TP_ADD_TC(tp, aio_socket_two_reads);
+	ATF_TP_ADD_TC(tp, aio_socket_blocking_short_write);

	return (atf_no_error());
}
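The new test above exercises the blocking-socket half of this change. For the
non-blocking half, the updated soaio_process_job() now completes a request
short once some progress has been made and the socket would block, instead of
requeueing it until the whole transfer drains. The following standalone
program is an editor's sketch of that behavior, not part of this commit: the
exact completion size depends on socket-buffer sizing and timing, and
aio_waitcomplete() is FreeBSD-specific.

#include <sys/types.h>
#include <sys/socket.h>

#include <aio.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	struct aiocb iocb, *iocbp;
	char *buf;
	ssize_t done;
	socklen_t optlen;
	int flags, sb_size, s[2];

	if (socketpair(PF_UNIX, SOCK_STREAM, 0, s) == -1)
		err(1, "socketpair");

	/* Mark the writing side non-blocking so the kernel sets SS_NBIO. */
	flags = fcntl(s[1], F_GETFL);
	if (flags == -1 || fcntl(s[1], F_SETFL, flags | O_NONBLOCK) == -1)
		err(1, "fcntl");

	optlen = sizeof(sb_size);
	if (getsockopt(s[1], SOL_SOCKET, SO_SNDBUF, &sb_size, &optlen) == -1)
		err(1, "getsockopt");

	/* Queue a write of twice the send buffer; the peer never reads. */
	buf = calloc(2, sb_size);
	if (buf == NULL)
		err(1, "calloc");
	memset(&iocb, 0, sizeof(iocb));
	iocb.aio_fildes = s[1];
	iocb.aio_buf = buf;
	iocb.aio_nbytes = (size_t)sb_size * 2;
	if (aio_write(&iocb) == -1)
		err(1, "aio_write");

	/*
	 * With this change, the request is expected to complete short
	 * (roughly sb_size bytes) once the socket would block, because
	 * progress was made (done != 0) and SS_NBIO is set; previously
	 * it stayed queued until the peer drained the socket buffer.
	 */
	done = aio_waitcomplete(&iocbp, NULL);
	if (done == -1)
		err(1, "aio_waitcomplete");
	printf("completed %zd of %zd bytes\n", done, (ssize_t)sb_size * 2);

	free(buf);
	return (0);
}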