Index: sys/kern/kern_sendfile.c
===================================================================
--- sys/kern/kern_sendfile.c
+++ sys/kern/kern_sendfile.c
@@ -36,16 +36,14 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
-#include
 #include
-#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -53,9 +51,11 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
+#include
 
 #include
 #include
@@ -65,6 +65,8 @@
 #include
 #include
 
+static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile dynamic memory");
+
 #define	EXT_FLAG_SYNC		EXT_FLAG_VENDOR1
 #define	EXT_FLAG_NOCACHE	EXT_FLAG_VENDOR2
 #define	EXT_FLAG_CACHE_LAST	EXT_FLAG_VENDOR3
@@ -90,6 +92,7 @@
 	struct socket	*so;
 	struct mbuf	*m;
 	vm_object_t	obj;
+	vm_pindex_t	pindex0;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
@@ -261,13 +264,37 @@
 {
 	struct sf_io *sfio = arg;
 	struct socket *so;
+	int i;
 
-	for (int i = 0; i < count; i++)
-		if (pg[i] != bogus_page)
-			vm_page_xunbusy_unchecked(pg[i]);
-
-	if (error)
+	if (error != 0) {
 		sfio->error = error;
+		/*
+		 * Restoration of the pg[] elements is done by
+		 * sendfile_swapin().
+		 */
+	} else if (pg != NULL) {
+		/*
+		 * Restore the valid page pointers.  They are already
+		 * unbusied, but still wired.  For the error != 0 case,
+		 * sendfile_swapin() handles the unbusying.
+		 *
+		 * XXXKIB since pages are only wired, and we do not own
+		 * the object lock, other users might have invalidated
+		 * them in the meantime.  Similarly, after we unbusied
+		 * the swapped-in pages, they can become invalid under us.
+		 */
+		for (i = 0; i < count; i++) {
+			if (pg[i] == bogus_page) {
+				pg[i] = vm_page_relookup(sfio->obj,
+				    sfio->pindex0 + i + (pg - sfio->pa));
+				KASSERT(pg[i] != NULL,
+				    ("%s: page %p[%d] disappeared",
+				    __func__, pg, i));
+			} else {
+				vm_page_xunbusy_unchecked(pg[i]);
+			}
+		}
+	}
 
 	if (!refcount_release(&sfio->nios))
 		return;
@@ -283,7 +310,7 @@
 	 * to the socket yet.
 	 */
 	MPASS((curthread->td_pflags & TDP_KTHREAD) == 0);
-	free(sfio, M_TEMP);
+	free(sfio, M_SENDFILE);
 	return;
 }
 
@@ -338,7 +365,7 @@
 out_with_ref:
#endif
 	CURVNET_RESTORE();
-	free(sfio, M_TEMP);
+	free(sfio, M_SENDFILE);
 }
 
 /*
@@ -348,11 +375,13 @@
 sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off,
     off_t len, int npages, int rhpages, int flags)
 {
-	vm_page_t *pa = sfio->pa;
-	int grabbed;
+	vm_page_t *pa;
+	int a, count, count1, grabbed, i, j, rv;
 
+	pa = sfio->pa;
 	*nios = 0;
 	flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;
+	sfio->pindex0 = OFF_TO_IDX(off);
 
 	/*
 	 * First grab all the pages and wire them.  Note that we grab
@@ -367,9 +396,7 @@
 		rhpages = 0;
 	}
 
-	for (int i = 0; i < npages;) {
-		int j, a, count, rv;
-
+	for (i = 0; i < npages;) {
 		/* Skip valid pages. */
 		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
 		    xfsize(i, npages, off, len))) {
@@ -409,19 +436,41 @@
 		count = min(a + 1, npages - i);
 
 		/*
-		 * We should not pagein into a valid page, thus we first trim
-		 * any valid pages off the end of request, and substitute
-		 * to bogus_page those, that are in the middle.
+		 * We should not pagein into a valid page because
+		 * there might still be an unfinished write tracked by,
+		 * e.g., a buffer; thus we substitute any valid pages
+		 * with the bogus one.
+		 *
+		 * We must not leave around xbusy pages which are not
+		 * part of the run passed to vm_pager_getpages();
+		 * otherwise the pager might deadlock waiting for the
+		 * busy state of the page, e.g. if it constitutes the
+		 * buffer needed to validate another page.
+		 *
+		 * First trim the end of the run consisting of the
+		 * valid pages, then replace the rest of the valid
+		 * pages with bogus_page.
 		 */
+		count1 = count;
 		for (j = i + count - 1; j > i; j--) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
+				vm_page_xunbusy(pa[j]);
+				SFSTAT_INC(sf_pages_valid);
 				count--;
-				rhpages = 0;
-			} else
+			} else {
 				break;
+			}
 		}
-		for (j = i + 1; j < i + count - 1; j++)
+
+		/*
+		 * The last page in the run, pa[i + count - 1], is
+		 * guaranteed to be invalid by the trim above, so it
+		 * is not replaced with bogus_page, hence the -1 in
+		 * the loop end condition.
+		 */
+		MPASS(pa[i + count - 1]->valid != VM_PAGE_BITS_ALL);
+		for (j = i + 1; j < i + count - 1; j++) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
@@ -429,21 +478,33 @@
 				SFSTAT_INC(sf_pages_bogus);
 				pa[j] = bogus_page;
 			}
+		}
 
 		refcount_acquire(&sfio->nios);
 		rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
 		    i + count == npages ? &rhpages : NULL,
 		    &sendfile_iodone, sfio);
 		if (__predict_false(rv != VM_PAGER_OK)) {
+			/*
+			 * Wait for all in-flight I/Os to complete; we
+			 * must not unwire pages under them.
+			 */
+			while (atomic_load_int(&sfio->nios) != 1)
+				pause("sferrio", 1);
+
 			/*
 			 * Perform full pages recovery before returning EIO.
 			 * Pages from 0 to npages are wired.
-			 * Pages from i to npages are also busied.
 			 * Pages from (i + 1) to (i + count - 1) may be
 			 * substituted to bogus page, and not busied.
+			 * Pages from (i + count) to (i + count1 - 1) are
+			 * not busied.
+			 * The rest of the pages from i to npages are busied.
 			 */
 			for (j = 0; j < npages; j++) {
-				if (j > i && j < i + count - 1 &&
+				if (j >= i + count && j < i + count1)
+					;
+				else if (j > i && j < i + count - 1 &&
 				    pa[j] == bogus_page)
					pa[j] = vm_page_relookup(obj,
					    OFF_TO_IDX(vmoff(j, off)));
@@ -454,7 +515,6 @@
 					__func__, pa, j));
 				vm_page_unwire(pa[j], PQ_INACTIVE);
 			}
-			refcount_release(&sfio->nios);
 			return (EIO);
 		}
 
@@ -463,19 +523,7 @@
 		if (i + count == npages)
 			SFSTAT_ADD(sf_rhpages_read, rhpages);
 
-		/*
-		 * Restore the valid page pointers.  They are already
-		 * unbusied, but still wired.
-		 */
-		for (j = i + 1; j < i + count - 1; j++)
-			if (pa[j] == bogus_page) {
-				pa[j] = vm_page_relookup(obj,
-				    OFF_TO_IDX(vmoff(j, off)));
-				KASSERT(pa[j], ("%s: page %p[%d] disappeared",
-				    __func__, pa, j));
-
-			}
-		i += count;
+		i += count1;
 
 		(*nios)++;
 	}
@@ -640,7 +688,7 @@
 	SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
 
 	if (flags & SF_SYNC) {
-		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
+		sfs = malloc(sizeof(*sfs), M_SENDFILE, M_WAITOK | M_ZERO);
 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 		cv_init(&sfs->cv, "sendfile");
 	}
@@ -826,7 +874,7 @@
 		    npages, rhpages);
 
 		sfio = malloc(sizeof(struct sf_io) +
-		    npages * sizeof(vm_page_t), M_TEMP, M_WAITOK);
+		    npages * sizeof(vm_page_t), M_SENDFILE, M_WAITOK);
 		refcount_init(&sfio->nios, 1);
 		sfio->obj = obj;
 		sfio->error = 0;
@@ -1135,7 +1183,7 @@
 		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
 		cv_destroy(&sfs->cv);
 		mtx_destroy(&sfs->mtx);
-		free(sfs, M_TEMP);
+		free(sfs, M_SENDFILE);
 	}
 #ifdef KERN_TLS
 	if (tls != NULL)
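The index bookkeeping in sendfile_swapin() above is the subtle part: count is the length of the run actually handed to vm_pager_get_pages_async(), while count1 also covers the trimmed trailing valid pages, and both are needed by the error-recovery path and by the "i += count1" advance. The following userspace sketch models only that arithmetic; valid[], BOGUS and the printed labels are illustrative stand-ins, not kernel code or API.

#include <stdbool.h>
#include <stdio.h>

#define	BOGUS	(-1)

int
main(void)
{
	/*
	 * A run of 8 busied pages starting at i = 0; pages 2, 5, 6
	 * and 7 are already valid.  valid[] stands in for
	 * vm_page_is_valid(), pa[] for the page pointer array.
	 */
	bool valid[] = { false, false, true, false, false, true, true, true };
	int pa[] = { 0, 1, 2, 3, 4, 5, 6, 7 };
	int count = 8, count1, i = 0, j;

	/*
	 * Step 1: trim trailing valid pages off the run (the kernel
	 * also unbusies them here).  count1 remembers the untrimmed
	 * length so the caller can advance past, and on error account
	 * for, the trimmed pages.
	 */
	count1 = count;
	for (j = i + count - 1; j > i; j--) {
		if (valid[j])
			count--;
		else
			break;
	}

	/*
	 * Step 2: interior valid pages must not be paged into (a
	 * buffer may still track an unfinished write), so they are
	 * replaced with the bogus page.  The last page of the trimmed
	 * run is invalid by construction, hence the "- 1".
	 */
	for (j = i + 1; j < i + count - 1; j++) {
		if (valid[j])
			pa[j] = BOGUS;
	}

	printf("count=%d count1=%d:", count, count1);
	for (j = i; j < i + count1; j++)
		printf(" %s", pa[j] == BOGUS ? "bogus" :
		    (j < i + count ? "io" : "trimmed"));
	printf("\n");
	/* Prints: count=5 count1=8: io io bogus io io trimmed trimmed trimmed */
	return (0);
}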
Index: sys/kern/vfs_bio.c
===================================================================
--- sys/kern/vfs_bio.c
+++ sys/kern/vfs_bio.c
@@ -5154,12 +5154,16 @@
 	br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS)
 	    != 0) ? GB_UNMAPPED : 0;
 again:
-	for (i = 0; i < count; i++)
-		vm_page_busy_downgrade(ma[i]);
+	for (i = 0; i < count; i++) {
+		if (ma[i] != bogus_page)
+			vm_page_busy_downgrade(ma[i]);
+	}
 
 	lbnp = -1;
 	for (i = 0; i < count; i++) {
 		m = ma[i];
+		if (m == bogus_page)
+			continue;
 
 		/*
 		 * Pages are shared busy and the object lock is not
@@ -5228,6 +5232,8 @@
 
 	redo = false;
 	for (i = 0; i < count; i++) {
+		if (ma[i] == bogus_page)
+			continue;
 		if (vm_page_busy_tryupgrade(ma[i]) == 0) {
 			vm_page_sunbusy(ma[i]);
 			ma[i] = vm_page_grab_unlocked(object, ma[i]->pindex,
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -765,7 +765,8 @@
 
 	error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead);
-	ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
+	if (ap->a_iodone != NULL)
+		ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
 	return (error);
 }
 
Index: sys/ufs/ffs/ffs_vnops.c
===================================================================
--- sys/ufs/ffs/ffs_vnops.c
+++ sys/ufs/ffs/ffs_vnops.c
@@ -1780,18 +1780,25 @@
 {
 	struct vnode *vp;
 	struct ufsmount *um;
+	bool do_iodone;
 	int error;
 
 	vp = ap->a_vp;
 	um = VFSTOUFS(vp->v_mount);
+	do_iodone = true;
 
-	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
-		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
-		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
-
-	error = vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
-	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz);
-	ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
+	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
+		error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
+		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
+		if (error == 0)
+			do_iodone = false;
+	} else {
+		error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
+		    ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
+		    ffs_gbp_getblksz);
+	}
+	if (do_iodone && ap->a_iodone != NULL)
+		ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
 	return (error);
 }
 
Index: sys/vm/vnode_pager.c
===================================================================
--- sys/vm/vnode_pager.c
+++ sys/vm/vnode_pager.c
@@ -776,9 +776,13 @@
 int
 vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap)
 {
+	int error;
 
-	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
-	    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg));
+	error = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
+	    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
+	if (error != 0 && ap->a_iodone != NULL)
+		ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
+	return (error);
 }
 
 /*
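The last four hunks all tighten the same convention: the a_iodone completion callback of VOP_GETPAGES_ASYNC() must fire exactly once, whether the synchronous part of the call fails or the I/O completes later, and it may legitimately be NULL. Below is a minimal userspace model of that contract, shaped after the do_iodone logic in ffs_getpages_async() above; getpages_model, count_completions and small_blocks are hypothetical stand-ins, not the kernel API.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef void iodone_t(void *arg, int error);

/*
 * Models the patched ffs_getpages_async(): on the generic async path
 * a successful call hands completion off to the pager layer (modeled
 * here by invoking iodone inline), so the wrapper must not call it a
 * second time; on every other path the wrapper calls iodone exactly
 * once itself, provided the caller supplied one.
 */
static int
getpages_model(bool small_blocks, bool fail, iodone_t *iodone, void *arg)
{
	bool do_iodone = true;
	int error;

	error = fail ? 5 : 0;		/* 5 models EIO */
	if (small_blocks && error == 0) {
		iodone(arg, 0);		/* models the later async completion */
		do_iodone = false;
	}
	if (do_iodone && iodone != NULL)
		iodone(arg, error);
	return (error);
}

static void
count_completions(void *arg, int error)
{
	int *n = arg;

	(*n)++;
	printf("iodone: error=%d calls=%d\n", error, *n);
}

int
main(void)
{
	int calls;

	/* Every path combination must produce exactly one completion. */
	for (int sb = 0; sb <= 1; sb++)
		for (int f = 0; f <= 1; f++) {
			calls = 0;
			getpages_model(sb, f, count_completions, &calls);
			if (calls != 1)
				return (1);
		}
	/* A NULL iodone is simply skipped, as in the patched callers. */
	getpages_model(false, true, NULL, NULL);
	return (0);
}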