Index: lib/libc/sys/Makefile.inc =================================================================== --- lib/libc/sys/Makefile.inc +++ lib/libc/sys/Makefile.inc @@ -353,6 +353,8 @@ write.2 \ _umtx_op.2 +MLINKS+=aio_read.2 aio_readv.2 +MLINKS+=aio_write.2 aio_writev.2 MLINKS+=accept.2 accept4.2 MLINKS+=access.2 eaccess.2 \ access.2 faccessat.2 Index: lib/libc/sys/Symbol.map =================================================================== --- lib/libc/sys/Symbol.map +++ lib/libc/sys/Symbol.map @@ -403,6 +403,8 @@ FBSD_1.6 { __sysctlbyname; + aio_readv; + aio_writev; close_range; copy_file_range; fhlink; Index: lib/libc/sys/aio_error.2 =================================================================== --- lib/libc/sys/aio_error.2 +++ lib/libc/sys/aio_error.2 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd June 2, 1999 +.Dd December 15, 2020 .Dt AIO_ERROR 2 .Os .Sh NAME @@ -52,7 +52,9 @@ If the request has completed unsuccessfully the error status is returned as described in .Xr read 2 , +.Xr readv 2 , .Xr write 2 , +.Xr writev 2 , or .Xr fsync 2 . 
On failure, @@ -76,9 +78,11 @@ .Sh SEE ALSO .Xr aio_cancel 2 , .Xr aio_read 2 , +.Xr aio_readv 2 , .Xr aio_return 2 , .Xr aio_suspend 2 , .Xr aio_write 2 , +.Xr aio_writev 2 , .Xr fsync 2 , .Xr read 2 , .Xr write 2 , Index: lib/libc/sys/aio_read.2 =================================================================== --- lib/libc/sys/aio_read.2 +++ lib/libc/sys/aio_read.2 @@ -24,11 +24,12 @@ .\" .\" $FreeBSD$ .\" -.Dd August 19, 2016 +.Dd December 13, 2020 .Dt AIO_READ 2 .Os .Sh NAME -.Nm aio_read +.Nm aio_read , +.Nm aio_readv .Nd asynchronous read from a file (REALTIME) .Sh LIBRARY .Lb libc @@ -36,21 +37,41 @@ .In aio.h .Ft int .Fn aio_read "struct aiocb *iocb" +.Ft int +.Fn aio_readv "struct aiocb *iocb" .Sh DESCRIPTION The .Fn aio_read -system call allows the calling process to read -.Fa iocb->aio_nbytes +and +.Fn aio_readv +system calls allow the calling process to read from the descriptor .Fa iocb->aio_fildes beginning at the offset -.Fa iocb->aio_offset -into the buffer pointed to by -.Fa iocb->aio_buf . -The call returns immediately after the read request has +.Fa iocb->aio_offset . +.Fn aio_read +will read +.Fa iocb->aio_nbytes +into the buffer pointed to by +.Fa iocb->aio_buf , +whereas +.Fn aio_readv +reads the data into the +.Fa iocb->aio_iovcnt +buffers specified by the members of the +.Fa iocb->aio_iov +array. +Both syscalls return immediately after the read request has been enqueued to the descriptor; the read may or may not have completed at the time the call returns. .Pp +For +.Fn aio_readv +the +.Fa iovec +structure is defined in +.Xr readv 2 . +.Pp If _POSIX_PRIORITIZED_IO is defined, and the descriptor supports it, then the enqueued operation is submitted at a priority equal to that of the calling process minus @@ -61,7 +82,9 @@ argument is ignored by the .Fn aio_read -system call. +and +.Fn aio_readv +system calls. .Pp The .Fa iocb @@ -108,13 +131,15 @@ .Fa iocb->aio_fildes , no I/O will occur. 
.Sh RETURN VALUES -.Rv -std aio_read +.Rv -std aio_read aio_readv .Sh DIAGNOSTICS None. .Sh ERRORS The .Fn aio_read -system call will fail if: +and +.Fn aio_readv +system calls will fail if: .Bl -tag -width Er .It Bq Er EAGAIN The request was not queued because of system resource limitations. @@ -130,10 +155,14 @@ .Pp The following conditions may be synchronously detected when the .Fn aio_read +or +.Fn aio_readv system call is made, or asynchronously, at any time thereafter. If they are detected at call time, .Fn aio_read +or +.Fn aio_readv returns -1 and sets .Va errno appropriately; otherwise the @@ -207,11 +236,18 @@ system call is expected to conform to the .St -p1003.1 standard. +The +.Fn aio_readv +system call is a FreeBSD extension, and should not be used in portable code. .Sh HISTORY The .Fn aio_read system call first appeared in .Fx 3.0 . +The +.Fn aio_readv +system call first appeared in +.Fx 13.0 . .Sh AUTHORS This manual page was written by Index: lib/libc/sys/aio_return.2 =================================================================== --- lib/libc/sys/aio_return.2 +++ lib/libc/sys/aio_return.2 @@ -55,7 +55,9 @@ If the asynchronous I/O request has completed, the status is returned as described in .Xr read 2 , +.Xr readv 2 , .Xr write 2 , +.Xr writev 2 , or .Xr fsync 2 . 
Otherwise, Index: lib/libc/sys/aio_write.2 =================================================================== --- lib/libc/sys/aio_write.2 +++ lib/libc/sys/aio_write.2 @@ -24,11 +24,12 @@ .\" .\" $FreeBSD$ .\" -.Dd August 19, 2016 +.Dd November 29, 2020 .Dt AIO_WRITE 2 .Os .Sh NAME -.Nm aio_write +.Nm aio_write , +.Nm aio_writev .Nd asynchronous write to a file (REALTIME) .Sh LIBRARY .Lb libc @@ -36,28 +37,47 @@ .In aio.h .Ft int .Fn aio_write "struct aiocb *iocb" +.Ft int +.Fn aio_writev "struct aiocb *iocb" .Sh DESCRIPTION The .Fn aio_write -system call allows the calling process to write -.Fa iocb->aio_nbytes -from the buffer pointed to by -.Fa iocb->aio_buf +and +.Fn aio_writev +system calls allow the calling process to write to the descriptor .Fa iocb->aio_fildes . -The call returns immediately after the write request has been enqueued +.Fn aio_write +will write +.Fa iocb->aio_nbytes +from the buffer pointed to by +.Fa iocb->aio_buf , +whereas +.Fn aio_writev +gathers the data from the +.Fa iocb->aio_iovcnt +buffers specified by the members of the +.Fa iocb->aio_iov +array. +Both syscalls return immediately after the write request has been enqueued to the descriptor; the write may or may not have completed at the time the call returns. If the request could not be enqueued, generally due to invalid arguments, the call returns without having enqueued the request. .Pp +For +.Fn aio_writev +the +.Fa iovec +structure is defined in +.Xr writev 2 . +.Pp If .Dv O_APPEND is set for .Fa iocb->aio_fildes , -.Fn aio_write -operations append to the file in the same order as the calls were +write operations append to the file in the same order as the calls were made. If .Dv O_APPEND @@ -103,6 +123,8 @@ .Fa iocb should be zeroed before the .Fn aio_write +or +.Fn aio_writev system call to avoid passing bogus context information to the kernel. 
.Pp Modifications of the Asynchronous I/O Control Block structure or the @@ -114,11 +136,13 @@ .Fa iocb->aio_fildes , no I/O will occur. .Sh RETURN VALUES -.Rv -std aio_write +.Rv -std aio_write aio_writev .Sh ERRORS The .Fn aio_write -system call will fail if: +and +.Fn aio_writev +system calls will fail if: .Bl -tag -width Er .It Bq Er EAGAIN The request was not queued because of system resource limitations. @@ -134,10 +158,14 @@ .Pp The following conditions may be synchronously detected when the .Fn aio_write +or +.Fn aio_writev system call is made, or asynchronously, at any time thereafter. If they are detected at call time, .Fn aio_write +or +.Fn aio_writev returns -1 and sets .Va errno appropriately; otherwise the @@ -203,11 +231,19 @@ is expected to conform to the .St -p1003.1 standard. +.Pp +The +.Fn aio_writev +system call is a FreeBSD extension, and should not be used in portable code. .Sh HISTORY The .Fn aio_write system call first appeared in .Fx 3.0 . +The +.Fn aio_writev +system call first appeared in +.Fx 13.0 . .Sh AUTHORS This manual page was written by .An Wes Peters Aq Mt wes@softweyr.com . Index: share/man/man4/aio.4 =================================================================== --- share/man/man4/aio.4 +++ share/man/man4/aio.4 @@ -27,7 +27,7 @@ .\" .\" $FreeBSD$ .\" -.Dd June 22, 2017 +.Dd December 15, 2020 .Dt AIO 4 .Os .Sh NAME @@ -215,10 +215,12 @@ .Xr aio_cancel 2 , .Xr aio_error 2 , .Xr aio_read 2 , +.Xr aio_readv 2 , .Xr aio_return 2 , .Xr aio_suspend 2 , .Xr aio_waitcomplete 2 , .Xr aio_write 2 , +.Xr aio_writev 2 , .Xr lio_listio 2 , .Xr sigevent 3 , .Xr sysctl 8 Index: sys/bsm/audit_kevents.h =================================================================== --- sys/bsm/audit_kevents.h +++ sys/bsm/audit_kevents.h @@ -659,6 +659,8 @@ #define AUE_SHMRENAME 43263 /* FreeBSD-specific. */ #define AUE_REALPATHAT 43264 /* FreeBSD-specific. */ #define AUE_CLOSERANGE 43265 /* FreeBSD-specific. 
*/ +#define AUE_AIO_WRITEV 43266 /* FreeBSD-specific. */ +#define AUE_AIO_READV 43267 /* FreeBSD-specific. */ /* * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the Index: sys/compat/freebsd32/freebsd32_proto.h =================================================================== --- sys/compat/freebsd32/freebsd32_proto.h +++ sys/compat/freebsd32/freebsd32_proto.h @@ -251,6 +251,12 @@ char nent_l_[PADL_(int)]; int nent; char nent_r_[PADR_(int)]; char sig_l_[PADL_(struct sigevent32 *)]; struct sigevent32 * sig; char sig_r_[PADR_(struct sigevent32 *)]; }; +struct freebsd32_aio_writev_args { + char aiocbp_l_[PADL_(struct aiocb32 *)]; struct aiocb32 * aiocbp; char aiocbp_r_[PADR_(struct aiocb32 *)]; +}; +struct freebsd32_aio_readv_args { + char aiocbp_l_[PADL_(struct aiocb32 *)]; struct aiocb32 * aiocbp; char aiocbp_r_[PADR_(struct aiocb32 *)]; +}; struct freebsd32_lutimes_args { char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; char tptr_l_[PADL_(struct timeval32 *)]; struct timeval32 * tptr; char tptr_r_[PADR_(struct timeval32 *)]; @@ -789,6 +795,8 @@ int freebsd32_aio_read(struct thread *, struct freebsd32_aio_read_args *); int freebsd32_aio_write(struct thread *, struct freebsd32_aio_write_args *); int freebsd32_lio_listio(struct thread *, struct freebsd32_lio_listio_args *); +int freebsd32_aio_writev(struct thread *, struct freebsd32_aio_writev_args *); +int freebsd32_aio_readv(struct thread *, struct freebsd32_aio_readv_args *); int freebsd32_lutimes(struct thread *, struct freebsd32_lutimes_args *); int freebsd32_preadv(struct thread *, struct freebsd32_preadv_args *); int freebsd32_pwritev(struct thread *, struct freebsd32_pwritev_args *); @@ -1374,6 +1382,8 @@ #define FREEBSD32_SYS_AUE_freebsd32_aio_read AUE_AIO_READ #define FREEBSD32_SYS_AUE_freebsd32_aio_write AUE_AIO_WRITE #define FREEBSD32_SYS_AUE_freebsd32_lio_listio AUE_LIO_LISTIO +#define FREEBSD32_SYS_AUE_freebsd32_aio_writev AUE_AIO_WRITEV 
+#define FREEBSD32_SYS_AUE_freebsd32_aio_readv AUE_AIO_READV #define FREEBSD32_SYS_AUE_freebsd11_freebsd32_getdents AUE_O_GETDENTS #define FREEBSD32_SYS_AUE_freebsd32_lutimes AUE_LUTIMES #define FREEBSD32_SYS_AUE_freebsd32_preadv AUE_PREADV Index: sys/compat/freebsd32/freebsd32_syscall.h =================================================================== --- sys/compat/freebsd32/freebsd32_syscall.h +++ sys/compat/freebsd32/freebsd32_syscall.h @@ -229,6 +229,8 @@ #define FREEBSD32_SYS_freebsd32_aio_read 255 #define FREEBSD32_SYS_freebsd32_aio_write 256 #define FREEBSD32_SYS_freebsd32_lio_listio 257 +#define FREEBSD32_SYS_freebsd32_aio_writev 258 +#define FREEBSD32_SYS_freebsd32_aio_readv 259 #define FREEBSD32_SYS_freebsd11_freebsd32_getdents 272 #define FREEBSD32_SYS_lchmod 274 /* 275 is obsolete netbsd_lchown */ Index: sys/compat/freebsd32/freebsd32_syscalls.c =================================================================== --- sys/compat/freebsd32/freebsd32_syscalls.c +++ sys/compat/freebsd32/freebsd32_syscalls.c @@ -267,8 +267,8 @@ "freebsd32_aio_read", /* 255 = freebsd32_aio_read */ "freebsd32_aio_write", /* 256 = freebsd32_aio_write */ "freebsd32_lio_listio", /* 257 = freebsd32_lio_listio */ - "#258", /* 258 = nosys */ - "#259", /* 259 = nosys */ + "freebsd32_aio_writev", /* 258 = freebsd32_aio_writev */ + "freebsd32_aio_readv", /* 259 = freebsd32_aio_readv */ "#260", /* 260 = nosys */ "#261", /* 261 = nosys */ "#262", /* 262 = nosys */ Index: sys/compat/freebsd32/freebsd32_sysent.c =================================================================== --- sys/compat/freebsd32/freebsd32_sysent.c +++ sys/compat/freebsd32/freebsd32_sysent.c @@ -320,8 +320,8 @@ { .sy_narg = AS(freebsd32_aio_read_args), .sy_call = (sy_call_t *)freebsd32_aio_read, .sy_auevent = AUE_AIO_READ, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 255 = freebsd32_aio_read */ { .sy_narg = AS(freebsd32_aio_write_args), .sy_call = (sy_call_t *)freebsd32_aio_write, .sy_auevent = 
AUE_AIO_WRITE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 256 = freebsd32_aio_write */ { .sy_narg = AS(freebsd32_lio_listio_args), .sy_call = (sy_call_t *)freebsd32_lio_listio, .sy_auevent = AUE_LIO_LISTIO, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 257 = freebsd32_lio_listio */ - { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 258 = nosys */ - { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 259 = nosys */ + { .sy_narg = AS(freebsd32_aio_writev_args), .sy_call = (sy_call_t *)freebsd32_aio_writev, .sy_auevent = AUE_AIO_WRITEV, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 258 = freebsd32_aio_writev */ + { .sy_narg = AS(freebsd32_aio_readv_args), .sy_call = (sy_call_t *)freebsd32_aio_readv, .sy_auevent = AUE_AIO_READV, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 259 = freebsd32_aio_readv */ { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 260 = nosys */ { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 261 = nosys */ { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 262 = nosys */ Index: sys/compat/freebsd32/freebsd32_systrace_args.c =================================================================== --- sys/compat/freebsd32/freebsd32_systrace_args.c +++ sys/compat/freebsd32/freebsd32_systrace_args.c @@ -1284,6 +1284,20 @@ *n_args = 4; break; } + /* freebsd32_aio_writev */ + case 258: { + struct freebsd32_aio_writev_args *p = params; + uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb32 * */ + *n_args = 1; + break; + } + /* freebsd32_aio_readv */ + case 259: { + struct freebsd32_aio_readv_args *p = params; + uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb32 * 
*/ + *n_args = 1; + break; + } /* lchmod */ case 274: { struct lchmod_args *p = params; @@ -5411,6 +5425,26 @@ break; }; break; + /* freebsd32_aio_writev */ + case 258: + switch(ndx) { + case 0: + p = "userland struct aiocb32 *"; + break; + default: + break; + }; + break; + /* freebsd32_aio_readv */ + case 259: + switch(ndx) { + case 0: + p = "userland struct aiocb32 *"; + break; + default: + break; + }; + break; /* lchmod */ case 274: switch(ndx) { @@ -9868,6 +9902,16 @@ if (ndx == 0 || ndx == 1) p = "int"; break; + /* freebsd32_aio_writev */ + case 258: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* freebsd32_aio_readv */ + case 259: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* lchmod */ case 274: if (ndx == 0 || ndx == 1) Index: sys/compat/freebsd32/syscalls.master =================================================================== --- sys/compat/freebsd32/syscalls.master +++ sys/compat/freebsd32/syscalls.master @@ -493,8 +493,10 @@ 257 AUE_LIO_LISTIO STD { int freebsd32_lio_listio(int mode, \ struct aiocb32 * const *acb_list, \ int nent, struct sigevent32 *sig); } -258 AUE_NULL UNIMPL nosys -259 AUE_NULL UNIMPL nosys +258 AUE_AIO_WRITEV STD { int freebsd32_aio_writev( \ + struct aiocb32 *aiocbp); } +259 AUE_AIO_READV STD { int freebsd32_aio_readv( \ + struct aiocb32 *aiocbp); } 260 AUE_NULL UNIMPL nosys 261 AUE_NULL UNIMPL nosys 262 AUE_NULL UNIMPL nosys Index: sys/kern/capabilities.conf =================================================================== --- sys/kern/capabilities.conf +++ sys/kern/capabilities.conf @@ -95,6 +95,8 @@ aio_suspend aio_waitcomplete aio_write +aio_writev +aio_readv ## ## audit(2) is a global operation, submitting to the global trail, but it is Index: sys/kern/init_sysent.c =================================================================== --- sys/kern/init_sysent.c +++ sys/kern/init_sysent.c @@ -313,8 +313,8 @@ { .sy_narg = AS(aio_read_args), .sy_call = (sy_call_t *)sys_aio_read, .sy_auevent = AUE_AIO_READ, 
.sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 255 = aio_read */ { .sy_narg = AS(aio_write_args), .sy_call = (sy_call_t *)sys_aio_write, .sy_auevent = AUE_AIO_WRITE, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 256 = aio_write */ { .sy_narg = AS(lio_listio_args), .sy_call = (sy_call_t *)sys_lio_listio, .sy_auevent = AUE_LIO_LISTIO, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 257 = lio_listio */ - { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 258 = nosys */ - { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 259 = nosys */ + { .sy_narg = AS(aio_writev_args), .sy_call = (sy_call_t *)sys_aio_writev, .sy_auevent = AUE_AIO_WRITEV, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 258 = aio_writev */ + { .sy_narg = AS(aio_readv_args), .sy_call = (sy_call_t *)sys_aio_readv, .sy_auevent = AUE_AIO_READV, .sy_flags = SYF_CAPENABLED, .sy_thrcnt = SY_THR_STATIC }, /* 259 = aio_readv */ { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 260 = nosys */ { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 261 = nosys */ { .sy_narg = 0, .sy_call = (sy_call_t *)nosys, .sy_auevent = AUE_NULL, .sy_flags = 0, .sy_thrcnt = SY_THR_ABSENT }, /* 262 = nosys */ Index: sys/kern/sys_socket.c =================================================================== --- sys/kern/sys_socket.c +++ sys/kern/sys_socket.c @@ -600,9 +600,7 @@ struct ucred *td_savedcred; struct thread *td; struct file *fp; - struct uio uio; - struct iovec iov; - size_t cnt, done; + size_t cnt, done, job_total_nbytes; long ru_before; int error, flags; @@ -614,16 +612,11 @@ td_savedcred = td->td_ucred; td->td_ucred = job->cred; + job_total_nbytes = job->uiop->uio_resid + job->aio_done; done = 
job->aio_done; - cnt = job->uaiocb.aio_nbytes - done; - iov.iov_base = (void *)((uintptr_t)job->uaiocb.aio_buf + done); - iov.iov_len = cnt; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = 0; - uio.uio_resid = cnt; - uio.uio_segflg = UIO_USERSPACE; - uio.uio_td = td; + cnt = job->uiop->uio_resid; + job->uiop->uio_offset = 0; + job->uiop->uio_td = td; flags = MSG_NBIO; /* @@ -633,26 +626,26 @@ */ if (sb == &so->so_rcv) { - uio.uio_rw = UIO_READ; ru_before = td->td_ru.ru_msgrcv; #ifdef MAC error = mac_socket_check_receive(fp->f_cred, so); if (error == 0) #endif - error = soreceive(so, NULL, &uio, NULL, NULL, &flags); + error = soreceive(so, NULL, job->uiop, NULL, NULL, + &flags); if (td->td_ru.ru_msgrcv != ru_before) job->msgrcv = 1; } else { if (!TAILQ_EMPTY(&sb->sb_aiojobq)) flags |= MSG_MORETOCOME; - uio.uio_rw = UIO_WRITE; ru_before = td->td_ru.ru_msgsnd; #ifdef MAC error = mac_socket_check_send(fp->f_cred, so); if (error == 0) #endif - error = sosend(so, NULL, &uio, NULL, NULL, flags, td); + error = sosend(so, NULL, job->uiop, NULL, NULL, flags, + td); if (td->td_ru.ru_msgsnd != ru_before) job->msgsnd = 1; if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) { @@ -662,7 +655,7 @@ } } - done += cnt - uio.uio_resid; + done += cnt - job->uiop->uio_resid; job->aio_done = done; td->td_ucred = td_savedcred; @@ -676,7 +669,7 @@ * been made, requeue this request at the head of the * queue to try again when the socket is ready. 
*/ - MPASS(done != job->uaiocb.aio_nbytes); + MPASS(done != job_total_nbytes); SOCKBUF_LOCK(sb); if (done == 0 || !(so->so_state & SS_NBIO)) { empty_results++; @@ -782,10 +775,10 @@ so = job->fd_file->f_data; opcode = job->uaiocb.aio_lio_opcode; - if (opcode == LIO_READ) + if (opcode == LIO_READ || opcode == LIO_READV) sb = &so->so_rcv; else { - MPASS(opcode == LIO_WRITE); + MPASS(opcode == LIO_WRITE || opcode == LIO_WRITEV); sb = &so->so_snd; } @@ -817,9 +810,11 @@ switch (job->uaiocb.aio_lio_opcode) { case LIO_READ: + case LIO_READV: sb = &so->so_rcv; break; case LIO_WRITE: + case LIO_WRITEV: sb = &so->so_snd; break; default: Index: sys/kern/syscalls.c =================================================================== --- sys/kern/syscalls.c +++ sys/kern/syscalls.c @@ -264,8 +264,8 @@ "aio_read", /* 255 = aio_read */ "aio_write", /* 256 = aio_write */ "lio_listio", /* 257 = lio_listio */ - "#258", /* 258 = nosys */ - "#259", /* 259 = nosys */ + "aio_writev", /* 258 = aio_writev */ + "aio_readv", /* 259 = aio_readv */ "#260", /* 260 = nosys */ "#261", /* 261 = nosys */ "#262", /* 262 = nosys */ Index: sys/kern/syscalls.master =================================================================== --- sys/kern/syscalls.master +++ sys/kern/syscalls.master @@ -1477,7 +1477,17 @@ _In_opt_ struct sigevent *sig ); } -258-271 AUE_NULL UNIMPL nosys +258 AUE_AIO_WRITEV STD { + int aio_writev( + _Inout_ struct aiocb *aiocbp + ); + } +259 AUE_AIO_READV STD { + int aio_readv( + _Inout_ struct aiocb *aiocbp + ); + } +260-271 AUE_NULL UNIMPL nosys 272 AUE_O_GETDENTS COMPAT11 { int getdents( int fd, Index: sys/kern/systrace_args.c =================================================================== --- sys/kern/systrace_args.c +++ sys/kern/systrace_args.c @@ -1320,6 +1320,20 @@ *n_args = 4; break; } + /* aio_writev */ + case 258: { + struct aio_writev_args *p = params; + uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */ + *n_args = 1; + break; + } + /* aio_readv */ + case 259: { 
+ struct aio_readv_args *p = params; + uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */ + *n_args = 1; + break; + } /* lchmod */ case 274: { struct lchmod_args *p = params; @@ -5462,6 +5476,26 @@ break; }; break; + /* aio_writev */ + case 258: + switch(ndx) { + case 0: + p = "userland struct aiocb *"; + break; + default: + break; + }; + break; + /* aio_readv */ + case 259: + switch(ndx) { + case 0: + p = "userland struct aiocb *"; + break; + default: + break; + }; + break; /* lchmod */ case 274: switch(ndx) { @@ -9795,6 +9829,16 @@ if (ndx == 0 || ndx == 1) p = "int"; break; + /* aio_writev */ + case 258: + if (ndx == 0 || ndx == 1) + p = "int"; + break; + /* aio_readv */ + case 259: + if (ndx == 0 || ndx == 1) + p = "int"; + break; /* lchmod */ case 274: if (ndx == 0 || ndx == 1) Index: sys/kern/vfs_aio.c =================================================================== --- sys/kern/vfs_aio.c +++ sys/kern/vfs_aio.c @@ -307,6 +307,7 @@ static TAILQ_HEAD(,kaiocb) aio_jobs; /* (c) Async job list */ static struct unrhdr *aiod_unr; +static void aio_biocleanup(struct bio *bp); void aio_init_aioinfo(struct proc *p); static int aio_onceonly(void); static int aio_free_entry(struct kaiocb *job); @@ -559,6 +560,8 @@ if (job->fd_file) fdrop(job->fd_file, curthread); crfree(job->cred); + if (job->uiop != &job->uio) + free(job->uiop, M_IOV); uma_zfree(aiocb_zone, job); AIO_LOCK(ki); @@ -754,36 +757,29 @@ struct thread *td; struct aiocb *cb; struct file *fp; - struct uio auio; - struct iovec aiov; ssize_t cnt; long msgsnd_st, msgsnd_end; long msgrcv_st, msgrcv_end; long oublock_st, oublock_end; long inblock_st, inblock_end; - int error; + int error, opcode; KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ || - job->uaiocb.aio_lio_opcode == LIO_WRITE, + job->uaiocb.aio_lio_opcode == LIO_READV || + job->uaiocb.aio_lio_opcode == LIO_WRITE || + job->uaiocb.aio_lio_opcode == LIO_WRITEV, ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode)); aio_switch_vmspace(job); td = 
curthread; td_savedcred = td->td_ucred; td->td_ucred = job->cred; + job->uiop->uio_td = td; cb = &job->uaiocb; fp = job->fd_file; - aiov.iov_base = (void *)(uintptr_t)cb->aio_buf; - aiov.iov_len = cb->aio_nbytes; - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = cb->aio_offset; - auio.uio_resid = cb->aio_nbytes; - cnt = cb->aio_nbytes; - auio.uio_segflg = UIO_USERSPACE; - auio.uio_td = td; + opcode = job->uaiocb.aio_lio_opcode; + cnt = job->uiop->uio_resid; msgrcv_st = td->td_ru.ru_msgrcv; msgsnd_st = td->td_ru.ru_msgsnd; @@ -794,17 +790,16 @@ * aio_aqueue() acquires a reference to the file that is * released in aio_free_entry(). */ - if (cb->aio_lio_opcode == LIO_READ) { - auio.uio_rw = UIO_READ; - if (auio.uio_resid == 0) + if (opcode == LIO_READ || opcode == LIO_READV) { + if (job->uiop->uio_resid == 0) error = 0; else - error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td); + error = fo_read(fp, job->uiop, fp->f_cred, FOF_OFFSET, + td); } else { if (fp->f_type == DTYPE_VNODE) bwillwrite(); - auio.uio_rw = UIO_WRITE; - error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td); + error = fo_write(fp, job->uiop, fp->f_cred, FOF_OFFSET, td); } msgrcv_end = td->td_ru.ru_msgrcv; msgsnd_end = td->td_ru.ru_msgsnd; @@ -816,17 +811,18 @@ job->inblock = inblock_end - inblock_st; job->outblock = oublock_end - oublock_st; - if ((error) && (auio.uio_resid != cnt)) { + if (error != 0 && job->uiop->uio_resid != cnt) { if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) error = 0; - if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) { + if (error == EPIPE && + (opcode == LIO_WRITE || opcode == LIO_WRITEV)) { PROC_LOCK(job->userproc); kern_psignal(job->userproc, SIGPIPE); PROC_UNLOCK(job->userproc); } } - cnt -= auio.uio_resid; + cnt -= job->uiop->uio_resid; td->td_ucred = td_savedcred; if (error) aio_complete(job, -1, error); @@ -1210,21 +1206,23 @@ { struct aiocb *cb; struct file *fp; - struct bio *bp; struct buf *pbuf; struct vnode *vp; 
struct cdevsw *csw; struct cdev *dev; struct kaioinfo *ki; - struct vm_page **pages; - int error, npages, poff, ref; + struct bio **bios = NULL; + off_t offset; + int bio_cmd, error, i, iovcnt, opcode, poff, ref; vm_prot_t prot; + bool use_unmapped; cb = &job->uaiocb; fp = job->fd_file; + opcode = cb->aio_lio_opcode; - if (!(cb->aio_lio_opcode == LIO_WRITE || - cb->aio_lio_opcode == LIO_READ)) + if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV || + opcode == LIO_READ || opcode == LIO_READV)) return (-1); if (fp == NULL || fp->f_type != DTYPE_VNODE) return (-1); @@ -1234,8 +1232,21 @@ return (-1); if (vp->v_bufobj.bo_bsize == 0) return (-1); - if (cb->aio_nbytes % vp->v_bufobj.bo_bsize) + + bio_cmd = opcode == LIO_WRITE || opcode == LIO_WRITEV ? BIO_WRITE : + BIO_READ; + iovcnt = job->uiop->uio_iovcnt; + if (iovcnt > max_buf_aio) return (-1); + for (i = 0; i < iovcnt; i++) { + if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0) + return (-1); + if (job->uiop->uio_iov[i].iov_len > maxphys) { + error = -1; + return (-1); + } + } + offset = cb->aio_offset; ref = 0; csw = devvn_refthread(vp, &dev, &ref); @@ -1246,89 +1257,106 @@ error = -1; goto unref; } - if (cb->aio_nbytes > dev->si_iosize_max) { + if (job->uiop->uio_resid > dev->si_iosize_max) { error = -1; goto unref; } ki = p->p_aioinfo; - poff = (vm_offset_t)cb->aio_buf & PAGE_MASK; - if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) { - if (cb->aio_nbytes > maxphys) { - error = -1; - goto unref; - } + job->error = 0; - pbuf = NULL; - pages = malloc(sizeof(vm_page_t) * (atop(round_page( - cb->aio_nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO); - } else { - if (cb->aio_nbytes > maxphys) { - error = -1; - goto unref; - } - if (ki->kaio_buffer_count >= max_buf_aio) { + use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed; + if (!use_unmapped) { + AIO_LOCK(ki); + if (ki->kaio_buffer_count + iovcnt > max_buf_aio) { + AIO_UNLOCK(ki); error = EAGAIN; goto unref; } - - pbuf = 
uma_zalloc(pbuf_zone, M_WAITOK); - BUF_KERNPROC(pbuf); - AIO_LOCK(ki); - ki->kaio_buffer_count++; + ki->kaio_buffer_count += iovcnt; AIO_UNLOCK(ki); - pages = pbuf->b_pages; - } - bp = g_alloc_bio(); - - bp->bio_length = cb->aio_nbytes; - bp->bio_bcount = cb->aio_nbytes; - bp->bio_done = aio_biowakeup; - bp->bio_offset = cb->aio_offset; - bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ; - bp->bio_dev = dev; - bp->bio_caller1 = job; - bp->bio_caller2 = pbuf; - - prot = VM_PROT_READ; - if (cb->aio_lio_opcode == LIO_READ) - prot |= VM_PROT_WRITE; /* Less backwards than it looks */ - npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, - (vm_offset_t)cb->aio_buf, bp->bio_length, prot, pages, - atop(maxphys) + 1); - if (npages < 0) { - error = EFAULT; - goto doerror; } - if (pbuf != NULL) { - pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages); - bp->bio_data = pbuf->b_data + poff; - atomic_add_int(&num_buf_aio, 1); - pbuf->b_npages = npages; - } else { - bp->bio_ma = pages; - bp->bio_ma_n = npages; - bp->bio_ma_offset = poff; - bp->bio_data = unmapped_buf; - bp->bio_flags |= BIO_UNMAPPED; - atomic_add_int(&num_unmapped_aio, 1); + + bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK); + atomic_store_int(&job->nbio, iovcnt); + for (i = 0; i < iovcnt; i++) { + struct vm_page** pages; + struct bio *bp; + void *buf; + size_t nbytes; + int npages; + + buf = job->uiop->uio_iov[i].iov_base; + nbytes = job->uiop->uio_iov[i].iov_len; + + bios[i] = g_alloc_bio(); + bp = bios[i]; + + poff = (vm_offset_t)buf & PAGE_MASK; + if (use_unmapped) { + pbuf = NULL; + pages = malloc(sizeof(vm_page_t) * (atop(round_page( + nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO); + } else { + pbuf = uma_zalloc(pbuf_zone, M_WAITOK); + BUF_KERNPROC(pbuf); + pages = pbuf->b_pages; + } + + bp->bio_length = nbytes; + bp->bio_bcount = nbytes; + bp->bio_done = aio_biowakeup; + bp->bio_offset = offset; + bp->bio_cmd = bio_cmd; + bp->bio_dev = dev; + bp->bio_caller1 = 
job; + bp->bio_caller2 = pbuf; + + prot = VM_PROT_READ; + if (opcode == LIO_READ || opcode == LIO_READV) + prot |= VM_PROT_WRITE; /* Less backwards than it looks */ + npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map, + (vm_offset_t)buf, bp->bio_length, prot, pages, + atop(maxphys) + 1); + if (npages < 0) { + if (pbuf != NULL) + uma_zfree(pbuf_zone, pbuf); + else + free(pages, M_TEMP); + error = EFAULT; + g_destroy_bio(bp); + i--; + goto destroy_bios; + } + if (pbuf != NULL) { + pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages); + bp->bio_data = pbuf->b_data + poff; + pbuf->b_npages = npages; + atomic_add_int(&num_buf_aio, 1); + } else { + bp->bio_ma = pages; + bp->bio_ma_n = npages; + bp->bio_ma_offset = poff; + bp->bio_data = unmapped_buf; + bp->bio_flags |= BIO_UNMAPPED; + atomic_add_int(&num_unmapped_aio, 1); + } + + offset += nbytes; } /* Perform transfer. */ - csw->d_strategy(bp); + for (i = 0; i < iovcnt; i++) + csw->d_strategy(bios[i]); + free(bios, M_TEMP); + dev_relthread(dev, ref); return (0); -doerror: - if (pbuf != NULL) { - AIO_LOCK(ki); - ki->kaio_buffer_count--; - AIO_UNLOCK(ki); - uma_zfree(pbuf_zone, pbuf); - } else { - free(pages, M_TEMP); - } - g_destroy_bio(bp); +destroy_bios: + for (; i >= 0; i--) + aio_biocleanup(bios[i]); + free(bios, M_TEMP); unref: dev_relthread(dev, ref); return (error); @@ -1533,9 +1561,11 @@ fd = job->uaiocb.aio_fildes; switch (opcode) { case LIO_WRITE: + case LIO_WRITEV: error = fget_write(td, fd, &cap_pwrite_rights, &fp); break; case LIO_READ: + case LIO_READV: error = fget_read(td, fd, &cap_pread_rights, &fp); break; case LIO_SYNC: @@ -1561,7 +1591,8 @@ goto aqueue_fail; } - if ((opcode == LIO_READ || opcode == LIO_WRITE) && + if ((opcode == LIO_READ || opcode == LIO_READV || + opcode == LIO_WRITE || opcode == LIO_WRITEV) && job->uaiocb.aio_offset < 0 && (fp->f_vnode == NULL || fp->f_vnode->v_type != VCHR)) { error = EINVAL; @@ -1614,6 +1645,42 @@ job->jobflags = KAIOCB_QUEUEING; job->lio = lj; + 
switch (opcode) { + case LIO_READV: + case LIO_WRITEV: + /* malloc a uio */ + error = copyinuio(job->uaiocb.aio_iov, job->uaiocb.aio_iovcnt, + &job->uiop); + if (error) + goto aqueue_fail; + break; + case LIO_READ: + case LIO_WRITE: + /* Use the inline uio */ + job->iov[0].iov_base = (void *)(uintptr_t)job->uaiocb.aio_buf; + job->iov[0].iov_len = job->uaiocb.aio_nbytes; + job->uio.uio_iov = job->iov; + job->uio.uio_iovcnt = 1; + job->uio.uio_resid = job->uaiocb.aio_nbytes; + job->uio.uio_segflg = UIO_USERSPACE; + /* FALLTHROUGH */ + default: + job->uiop = &job->uio; + break; + } + switch (opcode) { + case LIO_READ: + case LIO_READV: + job->uiop->uio_rw = UIO_READ; + break; + case LIO_WRITE: + case LIO_WRITEV: + job->uiop->uio_rw = UIO_WRITE; + break; + } + job->uiop->uio_offset = job->uaiocb.aio_offset; + job->uiop->uio_td = td; + if (opcode == LIO_MLOCK) { aio_schedule(job, aio_process_mlock); error = 0; @@ -1644,6 +1711,8 @@ return (0); aqueue_fail: + if (job->uiop != &job->uio) + free(job->uiop, M_IOV); knlist_delete(&job->klist, curthread, 0); if (fp) fdrop(fp, td); @@ -1723,7 +1792,9 @@ switch (job->uaiocb.aio_lio_opcode) { case LIO_READ: + case LIO_READV: case LIO_WRITE: + case LIO_WRITEV: aio_schedule(job, aio_process_rw); error = 0; break; @@ -2097,6 +2168,13 @@ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops)); } +int +sys_aio_readv(struct thread *td, struct aio_readv_args *uap) +{ + + return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READV, &aiocb_ops)); +} + /* syscall - asynchronous write to a file (REALTIME) */ #ifdef COMPAT_FREEBSD6 int @@ -2115,6 +2193,13 @@ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops)); } +int +sys_aio_writev(struct thread *td, struct aio_writev_args *uap) +{ + + return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITEV, &aiocb_ops)); +} + int sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap) { @@ -2337,13 +2422,11 @@ } static void -aio_biowakeup(struct bio *bp) +aio_biocleanup(struct bio *bp) { 
struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; struct kaioinfo *ki; struct buf *pbuf = (struct buf*)bp->bio_caller2; - size_t nbytes; - int error, nblks; /* Release mapping into kernel space. */ if (pbuf != NULL) { @@ -2362,23 +2445,47 @@ free(bp->bio_ma, M_TEMP); atomic_subtract_int(&num_unmapped_aio, 1); } + g_destroy_bio(bp); +} - nbytes = job->uaiocb.aio_nbytes - bp->bio_resid; - error = 0; - if (bp->bio_flags & BIO_ERROR) - error = bp->bio_error; +static void +aio_biowakeup(struct bio *bp) +{ + struct kaiocb *job = (struct kaiocb *)bp->bio_caller1; + size_t nbytes; + long bcount = bp->bio_bcount; + long resid = bp->bio_resid; + int error, opcode, nblks; + int bio_error = bp->bio_error; + uint16_t flags = bp->bio_flags; + + opcode = job->uaiocb.aio_lio_opcode; + + aio_biocleanup(bp); + + nbytes =bcount - resid; + atomic_add_acq_long(&job->nbytes, nbytes); nblks = btodb(nbytes); - if (job->uaiocb.aio_lio_opcode == LIO_WRITE) - job->outblock += nblks; + error = 0; + /* + * If multiple bios experienced an error, the job will reflect the + * error of whichever failed bio completed last. 
+ */ + if (flags & BIO_ERROR) + atomic_set_int(&job->error, bio_error); + if (opcode == LIO_WRITE || opcode == LIO_WRITEV) + atomic_add_int(&job->outblock, nblks); else - job->inblock += nblks; + atomic_add_int(&job->inblock, nblks); + atomic_subtract_int(&job->nbio, 1); - if (error) - aio_complete(job, -1, error); - else - aio_complete(job, nbytes, 0); - g_destroy_bio(bp); + if (atomic_load_int(&job->nbio) == 0) { + if (atomic_load_int(&job->error)) + aio_complete(job, -1, job->error); + else + aio_complete(job, atomic_load_int(&job->nbytes), 0); + } } /* syscall - wait for the next completion of an aio request */ @@ -2840,6 +2947,14 @@ &aiocb32_ops)); } +int +freebsd32_aio_readv(struct thread *td, struct freebsd32_aio_readv_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READV, + &aiocb32_ops)); +} + #ifdef COMPAT_FREEBSD6 int freebsd6_freebsd32_aio_write(struct thread *td, @@ -2859,6 +2974,14 @@ &aiocb32_ops)); } +int +freebsd32_aio_writev(struct thread *td, struct freebsd32_aio_writev_args *uap) +{ + + return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITEV, + &aiocb32_ops)); +} + int freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap) { Index: sys/sys/aio.h =================================================================== --- sys/sys/aio.h +++ sys/sys/aio.h @@ -27,6 +27,7 @@ #include #include #include +#include #endif /* @@ -45,6 +46,8 @@ #ifdef _KERNEL #define LIO_SYNC 0x3 #define LIO_MLOCK 0x4 +#define LIO_WRITEV 0x5 +#define LIO_READV 0x6 #endif /* @@ -92,8 +95,14 @@ typedef struct aiocb { int aio_fildes; /* File descriptor */ off_t aio_offset; /* File offset for I/O */ - volatile void *aio_buf; /* I/O buffer in process space */ - size_t aio_nbytes; /* Number of bytes for I/O */ + union { + volatile void *aio_buf; /* I/O buffer in process space */ + struct iovec *aio_iov; /* I/O scatter/gather list */ + }; + union { + size_t aio_nbytes; /* Number of bytes for I/O */ + int aio_iovcnt; /* 
Length of aio_iov */ + }; int __spare__[2]; void *__spare2__; int aio_lio_opcode; /* LIO opcode */ @@ -132,11 +141,19 @@ struct aiocb *ujob; /* (*) pointer in userspace of aiocb */ struct knlist klist; /* (a) list of knotes */ struct aiocb uaiocb; /* (*) copy of user I/O control block */ + struct uio uio; /* (*) storage for non-vectored uio */ + struct iovec iov[1]; /* (*) storage for non-vectored uio */ + struct uio *uiop; /* (*) Possibly malloced uio */ ksiginfo_t ksi; /* (a) realtime signal info */ uint64_t seqno; /* (*) job number */ aio_cancel_fn_t *cancel_fn; /* (a) backend cancel function */ aio_handle_fn_t *handle_fn; /* (c) backend handle function */ union { /* Backend-specific data fields */ + struct { /* BIO backend */ + int nbio; /* Number of remaining bios */ + int error; /* Worst error of all bios */ + long nbytes; /* Bytes completed so far */ + }; struct { /* fsync() requests */ int pending; /* (a) number of pending I/O */ }; @@ -202,11 +219,17 @@ * Asynchronously read from a file */ int aio_read(struct aiocb *); +#if __BSD_VISIBLE +int aio_readv(struct aiocb *); +#endif /* * Asynchronously write to file */ int aio_write(struct aiocb *); +#if __BSD_VISIBLE +int aio_writev(struct aiocb *); +#endif /* * List I/O Asynchronously/synchronously read/write to/from file Index: sys/sys/syscall.h =================================================================== --- sys/sys/syscall.h +++ sys/sys/syscall.h @@ -234,6 +234,8 @@ #define SYS_aio_read 255 #define SYS_aio_write 256 #define SYS_lio_listio 257 +#define SYS_aio_writev 258 +#define SYS_aio_readv 259 #define SYS_freebsd11_getdents 272 #define SYS_lchmod 274 /* 275 is obsolete netbsd_lchown */ Index: sys/sys/syscall.mk =================================================================== --- sys/sys/syscall.mk +++ sys/sys/syscall.mk @@ -168,6 +168,8 @@ aio_read.o \ aio_write.o \ lio_listio.o \ + aio_writev.o \ + aio_readv.o \ freebsd11_getdents.o \ lchmod.o \ lutimes.o \ Index: sys/sys/sysproto.h 
=================================================================== --- sys/sys/sysproto.h +++ sys/sys/sysproto.h @@ -714,6 +714,12 @@ char nent_l_[PADL_(int)]; int nent; char nent_r_[PADR_(int)]; char sig_l_[PADL_(struct sigevent *)]; struct sigevent * sig; char sig_r_[PADR_(struct sigevent *)]; }; +struct aio_writev_args { + char aiocbp_l_[PADL_(struct aiocb *)]; struct aiocb * aiocbp; char aiocbp_r_[PADR_(struct aiocb *)]; +}; +struct aio_readv_args { + char aiocbp_l_[PADL_(struct aiocb *)]; struct aiocb * aiocbp; char aiocbp_r_[PADR_(struct aiocb *)]; +}; struct lchmod_args { char path_l_[PADL_(const char *)]; const char * path; char path_r_[PADR_(const char *)]; char mode_l_[PADL_(mode_t)]; mode_t mode; char mode_r_[PADR_(mode_t)]; @@ -1991,6 +1997,8 @@ int sys_aio_read(struct thread *, struct aio_read_args *); int sys_aio_write(struct thread *, struct aio_write_args *); int sys_lio_listio(struct thread *, struct lio_listio_args *); +int sys_aio_writev(struct thread *, struct aio_writev_args *); +int sys_aio_readv(struct thread *, struct aio_readv_args *); int sys_lchmod(struct thread *, struct lchmod_args *); int sys_lutimes(struct thread *, struct lutimes_args *); int sys_preadv(struct thread *, struct preadv_args *); @@ -2901,6 +2909,8 @@ #define SYS_AUE_aio_read AUE_AIO_READ #define SYS_AUE_aio_write AUE_AIO_WRITE #define SYS_AUE_lio_listio AUE_LIO_LISTIO +#define SYS_AUE_aio_writev AUE_AIO_WRITEV +#define SYS_AUE_aio_readv AUE_AIO_READV #define SYS_AUE_freebsd11_getdents AUE_O_GETDENTS #define SYS_AUE_lchmod AUE_LCHMOD #define SYS_AUE_lutimes AUE_LUTIMES Index: tests/sys/aio/aio_test.c =================================================================== --- tests/sys/aio/aio_test.c +++ tests/sys/aio/aio_test.c @@ -281,6 +281,47 @@ atf_tc_fail("aio short write (%jd)", (intmax_t)len); } +/* + * Perform a vectored I/O test of our initialized data buffer to the provided + * file descriptor. 
+ * + * To vectorize the linear buffer, chop it up into two pieces of dissimilar + * size, and swap their offsets. + */ +static void +aio_writev_test(struct aio_context *ac, completion comp, struct sigevent *sev) +{ + struct aiocb aio; + struct iovec iov[2]; + size_t len0, len1; + ssize_t len; + + bzero(&aio, sizeof(aio)); + + aio.aio_fildes = ac->ac_write_fd; + aio.aio_offset = 0; + len0 = ac->ac_buflen * 3 / 4; + len1 = ac->ac_buflen / 4; + iov[0].iov_base = ac->ac_buffer + len1; + iov[0].iov_len = len0; + iov[1].iov_base = ac->ac_buffer; + iov[1].iov_len = len1; + aio.aio_iov = iov; + aio.aio_iovcnt = 2; + if (sev) + aio.aio_sigevent = *sev; + + if (aio_writev(&aio) < 0) + atf_tc_fail("aio_writev failed: %s", strerror(errno)); + + len = comp(&aio); + if (len < 0) + atf_tc_fail("aio failed: %s", strerror(errno)); + + if (len != ac->ac_buflen) + atf_tc_fail("aio short write (%jd)", (intmax_t)len); +} + /* * Perform a simple read test of our initialized data buffer from the * provided file descriptor. 
@@ -314,6 +355,43 @@ atf_tc_fail("buffer mismatched"); } +static void +aio_readv_test(struct aio_context *ac, completion comp, struct sigevent *sev) +{ + struct aiocb aio; + struct iovec iov[2]; + size_t len0, len1; + ssize_t len; + + bzero(ac->ac_buffer, ac->ac_buflen); + bzero(&aio, sizeof(aio)); + aio.aio_fildes = ac->ac_read_fd; + aio.aio_offset = 0; + len0 = ac->ac_buflen * 3 / 4; + len1 = ac->ac_buflen / 4; + iov[0].iov_base = ac->ac_buffer + len1; + iov[0].iov_len = len0; + iov[1].iov_base = ac->ac_buffer; + iov[1].iov_len = len1; + aio.aio_iov = iov; + aio.aio_iovcnt = 2; + if (sev) + aio.aio_sigevent = *sev; + + if (aio_readv(&aio) < 0) + atf_tc_fail("aio_read failed: %s", strerror(errno)); + + len = comp(&aio); + if (len < 0) + atf_tc_fail("aio failed: %s", strerror(errno)); + + ATF_REQUIRE_EQ_MSG(len, ac->ac_buflen, + "aio short read (%jd)", (intmax_t)len); + + if (aio_test_buffer(ac->ac_buffer, ac->ac_buflen, ac->ac_seed) == 0) + atf_tc_fail("buffer mismatched"); +} + /* * Series of type-specific tests for AIO. For now, we just make sure we can * issue a write and then a read to each type. 
We assume that once a write @@ -328,7 +406,7 @@ #define FILE_PATHNAME "testfile" static void -aio_file_test(completion comp, struct sigevent *sev) +aio_file_test(completion comp, struct sigevent *sev, bool vectored) { struct aio_context ac; int fd; @@ -340,39 +418,44 @@ ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); aio_context_init(&ac, fd, fd, FILE_LEN); - aio_write_test(&ac, comp, sev); - aio_read_test(&ac, comp, sev); + if (vectored) { + aio_writev_test(&ac, comp, sev); + aio_readv_test(&ac, comp, sev); + } else { + aio_write_test(&ac, comp, sev); + aio_read_test(&ac, comp, sev); + } close(fd); } ATF_TC_WITHOUT_HEAD(file_poll); ATF_TC_BODY(file_poll, tc) { - aio_file_test(poll, NULL); + aio_file_test(poll, NULL, false); } ATF_TC_WITHOUT_HEAD(file_signal); ATF_TC_BODY(file_signal, tc) { - aio_file_test(poll_signaled, setup_signal()); + aio_file_test(poll_signaled, setup_signal(), false); } ATF_TC_WITHOUT_HEAD(file_suspend); ATF_TC_BODY(file_suspend, tc) { - aio_file_test(suspend, NULL); + aio_file_test(suspend, NULL, false); } ATF_TC_WITHOUT_HEAD(file_thread); ATF_TC_BODY(file_thread, tc) { - aio_file_test(poll_signaled, setup_thread()); + aio_file_test(poll_signaled, setup_thread(), false); } ATF_TC_WITHOUT_HEAD(file_waitcomplete); ATF_TC_BODY(file_waitcomplete, tc) { - aio_file_test(waitcomplete, NULL); + aio_file_test(waitcomplete, NULL, false); } #define FIFO_LEN 256 @@ -446,7 +529,7 @@ #define UNIX_SOCKETPAIR_LEN 256 static void -aio_unix_socketpair_test(completion comp, struct sigevent *sev) +aio_unix_socketpair_test(completion comp, struct sigevent *sev, bool vectored) { struct aio_context ac; struct rusage ru_before, ru_after; @@ -460,14 +543,16 @@ aio_context_init(&ac, sockets[0], sockets[1], UNIX_SOCKETPAIR_LEN); ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_before) != -1, "getrusage failed: %s", strerror(errno)); - aio_write_test(&ac, comp, sev); + if (vectored) { + aio_writev_test(&ac, comp, sev); + aio_readv_test(&ac, comp, sev); + } 
else { + aio_write_test(&ac, comp, sev); + aio_read_test(&ac, comp, sev); + } ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_after) != -1, "getrusage failed: %s", strerror(errno)); ATF_REQUIRE(ru_after.ru_msgsnd == ru_before.ru_msgsnd + 1); - ru_before = ru_after; - aio_read_test(&ac, comp, sev); - ATF_REQUIRE_MSG(getrusage(RUSAGE_SELF, &ru_after) != -1, - "getrusage failed: %s", strerror(errno)); ATF_REQUIRE(ru_after.ru_msgrcv == ru_before.ru_msgrcv + 1); close(sockets[0]); @@ -477,31 +562,31 @@ ATF_TC_WITHOUT_HEAD(socket_poll); ATF_TC_BODY(socket_poll, tc) { - aio_unix_socketpair_test(poll, NULL); + aio_unix_socketpair_test(poll, NULL, false); } ATF_TC_WITHOUT_HEAD(socket_signal); ATF_TC_BODY(socket_signal, tc) { - aio_unix_socketpair_test(poll_signaled, setup_signal()); + aio_unix_socketpair_test(poll_signaled, setup_signal(), false); } ATF_TC_WITHOUT_HEAD(socket_suspend); ATF_TC_BODY(socket_suspend, tc) { - aio_unix_socketpair_test(suspend, NULL); + aio_unix_socketpair_test(suspend, NULL, false); } ATF_TC_WITHOUT_HEAD(socket_thread); ATF_TC_BODY(socket_thread, tc) { - aio_unix_socketpair_test(poll_signaled, setup_thread()); + aio_unix_socketpair_test(poll_signaled, setup_thread(), false); } ATF_TC_WITHOUT_HEAD(socket_waitcomplete); ATF_TC_BODY(socket_waitcomplete, tc) { - aio_unix_socketpair_test(waitcomplete, NULL); + aio_unix_socketpair_test(waitcomplete, NULL, false); } struct aio_pty_arg { @@ -629,40 +714,11 @@ #define MD_LEN GLOBAL_MAX #define MDUNIT_LINK "mdunit_link" -static void -aio_md_cleanup(void) -{ - struct md_ioctl mdio; - int mdctl_fd, error, n, unit; - char buf[80]; - - mdctl_fd = open("/dev/" MDCTL_NAME, O_RDWR, 0); - ATF_REQUIRE(mdctl_fd >= 0); - n = readlink(MDUNIT_LINK, buf, sizeof(buf)); - if (n > 0) { - if (sscanf(buf, "%d", &unit) == 1 && unit >= 0) { - bzero(&mdio, sizeof(mdio)); - mdio.md_version = MDIOVERSION; - mdio.md_unit = unit; - if (ioctl(mdctl_fd, MDIOCDETACH, &mdio) == -1) { - error = errno; - close(mdctl_fd); - errno = error; - 
atf_tc_fail("ioctl MDIOCDETACH failed: %s", - strerror(errno)); - } - } - } - - close(mdctl_fd); -} - -static void -aio_md_test(completion comp, struct sigevent *sev) +static int +aio_md_setup(void) { int error, fd, mdctl_fd, unit; char pathname[PATH_MAX]; - struct aio_context ac; struct md_ioctl mdio; char buf[80]; @@ -694,10 +750,53 @@ fd = open(pathname, O_RDWR); ATF_REQUIRE_MSG(fd != -1, "opening %s failed: %s", pathname, strerror(errno)); + + return (fd); +} + +static void +aio_md_cleanup(void) +{ + struct md_ioctl mdio; + int mdctl_fd, error, n, unit; + char buf[80]; + mdctl_fd = open("/dev/" MDCTL_NAME, O_RDWR, 0); + ATF_REQUIRE(mdctl_fd >= 0); + n = readlink(MDUNIT_LINK, buf, sizeof(buf)); + if (n > 0) { + if (sscanf(buf, "%d", &unit) == 1 && unit >= 0) { + bzero(&mdio, sizeof(mdio)); + mdio.md_version = MDIOVERSION; + mdio.md_unit = unit; + if (ioctl(mdctl_fd, MDIOCDETACH, &mdio) == -1) { + error = errno; + close(mdctl_fd); + errno = error; + atf_tc_fail("ioctl MDIOCDETACH failed: %s", + strerror(errno)); + } + } + } + + close(mdctl_fd); +} + +static void +aio_md_test(completion comp, struct sigevent *sev, bool vectored) +{ + struct aio_context ac; + int fd; + + fd = aio_md_setup(); aio_context_init(&ac, fd, fd, MD_LEN); - aio_write_test(&ac, comp, sev); - aio_read_test(&ac, comp, sev); + if (vectored) { + aio_writev_test(&ac, comp, sev); + aio_readv_test(&ac, comp, sev); + } else { + aio_write_test(&ac, comp, sev); + aio_read_test(&ac, comp, sev); + } close(fd); } @@ -710,7 +809,7 @@ } ATF_TC_BODY(md_poll, tc) { - aio_md_test(poll, NULL); + aio_md_test(poll, NULL, false); } ATF_TC_CLEANUP(md_poll, tc) { @@ -725,7 +824,7 @@ } ATF_TC_BODY(md_signal, tc) { - aio_md_test(poll_signaled, setup_signal()); + aio_md_test(poll_signaled, setup_signal(), false); } ATF_TC_CLEANUP(md_signal, tc) { @@ -740,7 +839,7 @@ } ATF_TC_BODY(md_suspend, tc) { - aio_md_test(suspend, NULL); + aio_md_test(suspend, NULL, false); } ATF_TC_CLEANUP(md_suspend, tc) { @@ -755,7 +854,7 @@ 
} ATF_TC_BODY(md_thread, tc) { - aio_md_test(poll_signaled, setup_thread()); + aio_md_test(poll_signaled, setup_thread(), false); } ATF_TC_CLEANUP(md_thread, tc) { @@ -770,13 +869,60 @@ } ATF_TC_BODY(md_waitcomplete, tc) { - aio_md_test(waitcomplete, NULL); + aio_md_test(waitcomplete, NULL, false); } ATF_TC_CLEANUP(md_waitcomplete, tc) { aio_md_cleanup(); } +#define ZVOL_VDEV_PATHNAME "test_vdev" +#define POOL_SIZE (1 << 28) /* 256 MB */ +#define ZVOL_SIZE "64m" +#define POOL_NAME "aio_testpool" +#define ZVOL_NAME "aio_testvol" + +static int +aio_zvol_setup(void) +{ + int fd; + + ATF_REQUIRE_KERNEL_MODULE("aio"); + ATF_REQUIRE_KERNEL_MODULE("zfs"); + + fd = open(ZVOL_VDEV_PATHNAME, O_RDWR | O_CREAT, 0600); + ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + ATF_REQUIRE_EQ_MSG(0, + ftruncate(fd, POOL_SIZE), "ftruncate failed: %s", strerror(errno)); + close(fd); + + ATF_REQUIRE_EQ_MSG(0, + system("zpool create " POOL_NAME " $PWD/" ZVOL_VDEV_PATHNAME), + "zpool create failed: %s", strerror(errno)); + ATF_REQUIRE_EQ_MSG(0, + system("zfs create -o volblocksize=8192 -o volmode=dev -V " + ZVOL_SIZE " " POOL_NAME "/" ZVOL_NAME), + "zfs create failed: %s", strerror(errno)); + /* + * XXX Due to bug 251828, we need an extra "zfs set" here + * https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=251828 + */ + ATF_REQUIRE_EQ_MSG(0, + system("zfs set volmode=dev " POOL_NAME "/" ZVOL_NAME), + "zfs set failed: %s", strerror(errno)); + + fd = open("/dev/zvol/" POOL_NAME "/" ZVOL_NAME, O_RDWR); + ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + return (fd); +} + +static void +aio_zvol_cleanup(void) +{ + system("zpool destroy " POOL_NAME); +} + + ATF_TC_WITHOUT_HEAD(aio_large_read_test); ATF_TC_BODY(aio_large_read_test, tc) { @@ -907,14 +1053,11 @@ close(s[0]); } -/* - * This test ensures that aio_write() on a blocking socket of a "large" - * buffer does not return a short completion. 
- */ -ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write); -ATF_TC_BODY(aio_socket_blocking_short_write, tc) +static void +aio_socket_blocking_short_write_test(bool vectored) { struct aiocb iocb, *iocbp; + struct iovec iov[2]; char *buffer[2]; ssize_t done; int buffer_size, sb_size; @@ -954,9 +1097,19 @@ memset(&iocb, 0, sizeof(iocb)); iocb.aio_fildes = s[1]; - iocb.aio_buf = buffer[1]; - iocb.aio_nbytes = buffer_size; - ATF_REQUIRE(aio_write(&iocb) == 0); + if (vectored) { + iov[0].iov_base = buffer[1]; + iov[0].iov_len = buffer_size / 2 + 1; + iov[1].iov_base = buffer[1] + buffer_size / 2 + 1; + iov[1].iov_len = buffer_size / 2 - 1; + iocb.aio_iov = iov; + iocb.aio_iovcnt = 2; + ATF_REQUIRE(aio_writev(&iocb) == 0); + } else { + iocb.aio_buf = buffer[1]; + iocb.aio_nbytes = buffer_size; + ATF_REQUIRE(aio_write(&iocb) == 0); + } done = recv(s[0], buffer[0], buffer_size, MSG_WAITALL); ATF_REQUIRE(done == buffer_size); @@ -971,6 +1124,26 @@ close(s[0]); } +/* + * This test ensures that aio_write() on a blocking socket of a "large" + * buffer does not return a short completion. + */ +ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write); +ATF_TC_BODY(aio_socket_blocking_short_write, tc) +{ + aio_socket_blocking_short_write_test(false); +} + +/* + * Like aio_socket_blocking_short_write, but also tests that partially + * completed vectored sends can be retried correctly. + */ +ATF_TC_WITHOUT_HEAD(aio_socket_blocking_short_write_vectored); +ATF_TC_BODY(aio_socket_blocking_short_write_vectored, tc) +{ + aio_socket_blocking_short_write_test(true); +} + /* * This test verifies that cancelling a partially completed socket write * returns a short write rather than ECANCELED. 
@@ -1155,6 +1328,395 @@ close(fd); } +/* + * We shouldn't be able to DoS the system by setting iov_len to an insane + * value + */ +ATF_TC_WITHOUT_HEAD(aio_writev_dos_iov_len); +ATF_TC_BODY(aio_writev_dos_iov_len, tc) +{ + struct aiocb aio; + const struct aiocb *const iocbs[] = {&aio}; + const char *wbuf = "Hello, world!"; + struct iovec iov[1]; + ssize_t len, r; + int fd; + + ATF_REQUIRE_KERNEL_MODULE("aio"); + ATF_REQUIRE_UNSAFE_AIO(); + + fd = open("testfile", O_RDWR | O_CREAT, 0600); + ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + + len = strlen(wbuf); + iov[0].iov_base = __DECONST(void*, wbuf); + iov[0].iov_len = 1 << 30; + bzero(&aio, sizeof(aio)); + aio.aio_fildes = fd; + aio.aio_offset = 0; + aio.aio_iov = iov; + aio.aio_iovcnt = 1; + + r = aio_writev(&aio); + ATF_CHECK_EQ_MSG(0, r, "aio_writev returned %ld", r); + ATF_REQUIRE_EQ(0, aio_suspend(iocbs, 1, NULL)); + r = aio_return(&aio); + ATF_CHECK_EQ_MSG(-1, r, "aio_return returned %ld", r); + ATF_CHECK_MSG(errno == EFAULT || errno == EINVAL, + "aio_writev: %s", strerror(errno)); + + close(fd); +} + +/* + * We shouldn't be able to DoS the system by setting aio_iovcnt to an insane + * value + */ +ATF_TC_WITHOUT_HEAD(aio_writev_dos_iovcnt); +ATF_TC_BODY(aio_writev_dos_iovcnt, tc) +{ + struct aiocb aio; + const char *wbuf = "Hello, world!"; + struct iovec iov[1]; + ssize_t len; + int fd; + + ATF_REQUIRE_KERNEL_MODULE("aio"); + ATF_REQUIRE_UNSAFE_AIO(); + + fd = open("testfile", O_RDWR | O_CREAT, 0600); + ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + + len = strlen(wbuf); + iov[0].iov_base = __DECONST(void*, wbuf); + iov[0].iov_len = len; + bzero(&aio, sizeof(aio)); + aio.aio_fildes = fd; + aio.aio_offset = 0; + aio.aio_iov = iov; + aio.aio_iovcnt = 1 << 30; + + ATF_REQUIRE_EQ(-1, aio_writev(&aio)); + ATF_CHECK_EQ(EINVAL, errno); + + close(fd); +} + +ATF_TC_WITH_CLEANUP(aio_writev_efault); +ATF_TC_HEAD(aio_writev_efault, tc) +{ + atf_tc_set_md_var(tc, "descr", + "Vectored 
AIO should gracefully handle invalid addresses"); + atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(aio_writev_efault, tc) +{ + struct aiocb aio; + ssize_t buflen; + char *buffer; + struct iovec iov[2]; + long seed; + int fd; + + ATF_REQUIRE_KERNEL_MODULE("aio"); + ATF_REQUIRE_UNSAFE_AIO(); + + fd = aio_md_setup(); + + seed = random(); + buflen = 4096; + buffer = malloc(buflen); + aio_fill_buffer(buffer, buflen, seed); + iov[0].iov_base = buffer; + iov[0].iov_len = buflen; + iov[1].iov_base = (void*)-1; /* Invalid! */ + iov[1].iov_len = buflen; + bzero(&aio, sizeof(aio)); + aio.aio_fildes = fd; + aio.aio_offset = 0; + aio.aio_iov = iov; + aio.aio_iovcnt = nitems(iov); + + ATF_REQUIRE_EQ(-1, aio_writev(&aio)); + ATF_CHECK_EQ(EFAULT, errno); + + close(fd); +} +ATF_TC_CLEANUP(aio_writev_efault, tc) +{ + aio_md_cleanup(); +} + +ATF_TC_WITHOUT_HEAD(aio_writev_empty_file_poll); +ATF_TC_BODY(aio_writev_empty_file_poll, tc) +{ + struct aiocb aio; + int fd; + + ATF_REQUIRE_KERNEL_MODULE("aio"); + ATF_REQUIRE_UNSAFE_AIO(); + + fd = open("testfile", O_RDWR | O_CREAT, 0600); + ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + + bzero(&aio, sizeof(aio)); + aio.aio_fildes = fd; + aio.aio_offset = 0; + aio.aio_iovcnt = 0; + + ATF_REQUIRE_EQ(0, aio_writev(&aio)); + ATF_REQUIRE_EQ(0, suspend(&aio)); + + close(fd); +} + +ATF_TC_WITHOUT_HEAD(aio_writev_empty_file_signal); +ATF_TC_BODY(aio_writev_empty_file_signal, tc) +{ + struct aiocb aio; + int fd; + + ATF_REQUIRE_KERNEL_MODULE("aio"); + ATF_REQUIRE_UNSAFE_AIO(); + + fd = open("testfile", O_RDWR | O_CREAT, 0600); + ATF_REQUIRE_MSG(fd != -1, "open failed: %s", strerror(errno)); + + bzero(&aio, sizeof(aio)); + aio.aio_fildes = fd; + aio.aio_offset = 0; + aio.aio_iovcnt = 0; + aio.aio_sigevent = *setup_signal(); + + ATF_REQUIRE_EQ(0, aio_writev(&aio)); + ATF_REQUIRE_EQ(0, poll_signaled(&aio)); + + close(fd); +} + +// aio_writev and aio_readv should still work even if the iovcnt is greater +// than the 
number of buffered AIO operations permitted per process. +ATF_TC_WITH_CLEANUP(vectored_big_iovcnt); +ATF_TC_HEAD(vectored_big_iovcnt, tc) +{ + atf_tc_set_md_var(tc, "descr", + "Vectored AIO should still work even if the iovcnt is greater than " + "the number of buffered AIO operations permitted by the process"); + atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(vectored_big_iovcnt, tc) +{ + struct aiocb aio; + struct iovec *iov; + ssize_t len, buflen; + char *buffer; + const char *oid = "vfs.aio.max_buf_aio"; + long seed; + int max_buf_aio; + int fd, i; + ssize_t sysctl_len = sizeof(max_buf_aio); + + ATF_REQUIRE_KERNEL_MODULE("aio"); + ATF_REQUIRE_UNSAFE_AIO(); + + if (sysctlbyname(oid, &max_buf_aio, &sysctl_len, NULL, 0) == -1) + atf_libc_error(errno, "Failed to read %s", oid); + + seed = random(); + buflen = 512 * (max_buf_aio + 1); + buffer = malloc(buflen); + aio_fill_buffer(buffer, buflen, seed); + iov = calloc(max_buf_aio + 1, sizeof(struct iovec)); + + fd = aio_md_setup(); + + bzero(&aio, sizeof(aio)); + aio.aio_fildes = fd; + aio.aio_offset = 0; + for (i = 0; i < max_buf_aio + 1; i++) { + iov[i].iov_base = &buffer[i * 512]; + iov[i].iov_len = 512; + } + aio.aio_iov = iov; + aio.aio_iovcnt = max_buf_aio + 1; + + if (aio_writev(&aio) < 0) + atf_tc_fail("aio_writev failed: %s", strerror(errno)); + + len = poll(&aio); + if (len < 0) + atf_tc_fail("aio failed: %s", strerror(errno)); + + if (len != buflen) + atf_tc_fail("aio short write (%jd)", (intmax_t)len); + + bzero(&aio, sizeof(aio)); + aio.aio_fildes = fd; + aio.aio_offset = 0; + aio.aio_iov = iov; + aio.aio_iovcnt = max_buf_aio + 1; + + if (aio_readv(&aio) < 0) + atf_tc_fail("aio_readv failed: %s", strerror(errno)); + + len = poll(&aio); + if (len < 0) + atf_tc_fail("aio failed: %s", strerror(errno)); + + if (len != buflen) + atf_tc_fail("aio short read (%jd)", (intmax_t)len); + + if (aio_test_buffer(buffer, buflen, seed) == 0) + atf_tc_fail("buffer mismatched"); + + close(fd); +} 
+ATF_TC_CLEANUP(vectored_big_iovcnt, tc) +{ + aio_md_cleanup(); +} + +ATF_TC_WITHOUT_HEAD(vectored_file_poll); +ATF_TC_BODY(vectored_file_poll, tc) +{ + aio_file_test(poll, NULL, true); +} + +ATF_TC_WITH_CLEANUP(vectored_md_poll); +ATF_TC_HEAD(vectored_md_poll, tc) +{ + atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(vectored_md_poll, tc) +{ + aio_md_test(poll, NULL, true); +} +ATF_TC_CLEANUP(vectored_md_poll, tc) +{ + aio_md_cleanup(); +} + +ATF_TC_WITHOUT_HEAD(vectored_socket_poll); +ATF_TC_BODY(vectored_socket_poll, tc) +{ + aio_unix_socketpair_test(poll, NULL, true); +} + +// aio_writev and aio_readv should still work even if the iov contains elements +// that aren't a multiple of the device's sector size, and even if the total +// amount of I/O _is_ a multiple of the device's sector size. +ATF_TC_WITH_CLEANUP(vectored_unaligned); +ATF_TC_HEAD(vectored_unaligned, tc) +{ + atf_tc_set_md_var(tc, "descr", + "Vectored AIO should still work even if the iov contains elements " + "that aren't a multiple of the sector size."); + atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(vectored_unaligned, tc) +{ + struct aio_context ac; + struct aiocb aio; + struct iovec iov[3]; + ssize_t len, total_len; + int fd; + + ATF_REQUIRE_KERNEL_MODULE("aio"); + ATF_REQUIRE_UNSAFE_AIO(); + + /* + * Use a zvol with volmode=dev, so it will allow .d_write with + * unaligned uio. geom devices use physio, which doesn't allow that. 
+ */ + fd = aio_zvol_setup(); + aio_context_init(&ac, fd, fd, FILE_LEN); + + /* Break the buffer into 3 parts: + * * A 4kB part, aligned to 4kB + * * Two other parts that add up to 4kB: + * - 256B + * - 4kB - 256B + */ + iov[0].iov_base = ac.ac_buffer; + iov[0].iov_len = 4096; + iov[1].iov_base = (void*)((uintptr_t)iov[0].iov_base + iov[0].iov_len); + iov[1].iov_len = 256; + iov[2].iov_base = (void*)((uintptr_t)iov[1].iov_base + iov[1].iov_len); + iov[2].iov_len = 4096 - iov[1].iov_len; + total_len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; + bzero(&aio, sizeof(aio)); + aio.aio_fildes = ac.ac_write_fd; + aio.aio_offset = 0; + aio.aio_iov = iov; + aio.aio_iovcnt = 3; + + if (aio_writev(&aio) < 0) + atf_tc_fail("aio_writev failed: %s", strerror(errno)); + + len = poll(&aio); + if (len < 0) + atf_tc_fail("aio failed: %s", strerror(errno)); + + if (len != total_len) + atf_tc_fail("aio short write (%jd)", (intmax_t)len); + + bzero(&aio, sizeof(aio)); + aio.aio_fildes = ac.ac_read_fd; + aio.aio_offset = 0; + aio.aio_iov = iov; + aio.aio_iovcnt = 3; + + if (aio_readv(&aio) < 0) + atf_tc_fail("aio_readv failed: %s", strerror(errno)); + len = poll(&aio); + + ATF_REQUIRE_MSG(aio_test_buffer(ac.ac_buffer, total_len, + ac.ac_seed) != 0, "aio_test_buffer: internal error"); + + close(fd); +} +ATF_TC_CLEANUP(vectored_unaligned, tc) +{ + aio_zvol_cleanup(); +} + +static void +aio_zvol_test(completion comp, struct sigevent *sev, bool vectored) +{ + struct aio_context ac; + int fd; + + fd = aio_zvol_setup(); + aio_context_init(&ac, fd, fd, MD_LEN); + if (vectored) { + aio_writev_test(&ac, comp, sev); + aio_readv_test(&ac, comp, sev); + } else { + aio_write_test(&ac, comp, sev); + aio_read_test(&ac, comp, sev); + } + + close(fd); +} + +/* + * Note that unlike md, the zvol is not a geom device, does not allow unmapped + * buffers, and does not use physio. 
+ */ +ATF_TC_WITH_CLEANUP(vectored_zvol_poll); +ATF_TC_HEAD(vectored_zvol_poll, tc) +{ + atf_tc_set_md_var(tc, "require.user", "root"); +} +ATF_TC_BODY(vectored_zvol_poll, tc) +{ + aio_zvol_test(poll, NULL, true); +} +ATF_TC_CLEANUP(vectored_zvol_poll, tc) +{ + aio_zvol_cleanup(); +} + ATF_TP_ADD_TCS(tp) { @@ -1193,7 +1755,19 @@ ATF_TP_ADD_TC(tp, aio_large_read_test); ATF_TP_ADD_TC(tp, aio_socket_two_reads); ATF_TP_ADD_TC(tp, aio_socket_blocking_short_write); + ATF_TP_ADD_TC(tp, aio_socket_blocking_short_write_vectored); ATF_TP_ADD_TC(tp, aio_socket_short_write_cancel); + ATF_TP_ADD_TC(tp, aio_writev_dos_iov_len); + ATF_TP_ADD_TC(tp, aio_writev_dos_iovcnt); + ATF_TP_ADD_TC(tp, aio_writev_efault); + ATF_TP_ADD_TC(tp, aio_writev_empty_file_poll); + ATF_TP_ADD_TC(tp, aio_writev_empty_file_signal); + ATF_TP_ADD_TC(tp, vectored_big_iovcnt); + ATF_TP_ADD_TC(tp, vectored_file_poll); + ATF_TP_ADD_TC(tp, vectored_md_poll); + ATF_TP_ADD_TC(tp, vectored_zvol_poll); + ATF_TP_ADD_TC(tp, vectored_unaligned); + ATF_TP_ADD_TC(tp, vectored_socket_poll); return (atf_no_error()); }