Index: head/sys/kern/kern_mbuf.c
===================================================================
--- head/sys/kern/kern_mbuf.c
+++ head/sys/kern/kern_mbuf.c
@@ -112,6 +112,11 @@
 int nmbjumbo9;			/* limits number of 9k jumbo clusters */
 int nmbjumbo16;			/* limits number of 16k jumbo clusters */
 
+bool mb_use_ext_pgs;		/* use EXT_PGS mbufs for sendfile */
+SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN,
+    &mb_use_ext_pgs, 0,
+    "Use unmapped mbufs for sendfile(2)");
+
 static quad_t maxmbufmem;	/* overall real memory limit for all mbufs */
 SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &maxmbufmem, 0,
Index: head/sys/kern/kern_sendfile.c
===================================================================
--- head/sys/kern/kern_sendfile.c
+++ head/sys/kern/kern_sendfile.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -62,6 +63,7 @@
 
 #define	EXT_FLAG_SYNC		EXT_FLAG_VENDOR1
 #define	EXT_FLAG_NOCACHE	EXT_FLAG_VENDOR2
+#define	EXT_FLAG_CACHE_LAST	EXT_FLAG_VENDOR3
 
 /*
  * Structure describing a single sendfile(2) I/O, which may consist of
@@ -201,6 +203,39 @@
 	}
 }
 
+static void
+sendfile_free_mext_pg(struct mbuf *m)
+{
+	struct mbuf_ext_pgs *ext_pgs;
+	vm_page_t pg;
+	int i;
+	bool nocache, cache_last;
+
+	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS,
+	    ("%s: m %p !M_EXT or !EXT_PGS", __func__, m));
+
+	nocache = m->m_ext.ext_flags & EXT_FLAG_NOCACHE;
+	cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST;
+	ext_pgs = m->m_ext.ext_pgs;
+
+	for (i = 0; i < ext_pgs->npgs; i++) {
+		if (cache_last && i == ext_pgs->npgs - 1)
+			nocache = false;
+		pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+		sendfile_free_page(pg, nocache);
+	}
+
+	if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
+		struct sendfile_sync *sfs = m->m_ext.ext_arg2;
+
+		mtx_lock(&sfs->mtx);
+		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
+		if (--sfs->count == 0)
+			cv_signal(&sfs->cv);
+		mtx_unlock(&sfs->mtx);
+	}
+}
+
 /*
  * Helper function to calculate how much data to put into page i of n.
  * Only first and last pages are special.
@@ -283,8 +318,6 @@
 
 	CURVNET_SET(so->so_vnet);
 	if (sfio->error) {
-		struct mbuf *m;
-
 		/*
 		 * I/O operation failed.  The state of data in the socket
 		 * is now inconsistent, and all what we can do is to tear
@@ -299,9 +332,7 @@
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 
-		m = sfio->m;
-		for (int i = 0; i < sfio->npages; i++)
-			m = m_free(m);
+		mb_free_notready(sfio->m, sfio->npages);
 	} else
 		(void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
 		    sfio->npages);
@@ -540,13 +571,15 @@
 	struct vnode *vp;
 	struct vm_object *obj;
 	struct socket *so;
+	struct mbuf_ext_pgs *ext_pgs;
 	struct mbuf *m, *mh, *mhtail;
 	struct sf_buf *sf;
 	struct shmfd *shmfd;
 	struct sendfile_sync *sfs;
 	struct vattr va;
 	off_t off, sbytes, rem, obj_size;
-	int error, softerr, bsize, hdrlen;
+	int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
+	bool use_ext_pgs;
 
 	obj = NULL;
 	so = NULL;
@@ -554,6 +587,7 @@
 	sfs = NULL;
 	hdrlen = sbytes = 0;
 	softerr = 0;
+	use_ext_pgs = false;
 
 	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 	if (error != 0)
@@ -714,6 +748,17 @@
 
 		if (space > rem)
 			space = rem;
+		else if (space > PAGE_SIZE) {
+			/*
+			 * Use page boundaries when possible for large
+			 * requests.
+			 */
+			if (off & PAGE_MASK)
+				space -= (PAGE_SIZE - (off & PAGE_MASK));
+			space = trunc_page(space);
+			if (off & PAGE_MASK)
+				space += (PAGE_SIZE - (off & PAGE_MASK));
+		}
 
 		npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
@@ -751,6 +796,22 @@
 		 * dumped into socket buffer.
 		 */
 		pa = sfio->pa;
+
+		/*
+		 * Use unmapped mbufs if enabled for TCP.  Unmapped
+		 * mbufs are restricted to TCP as that is what has been
+		 * tested.  In particular, unmapped mbufs have not
+		 * been tested with UNIX-domain sockets.
+		 */
+		if (mb_use_ext_pgs &&
+		    so->so_proto->pr_protocol == IPPROTO_TCP) {
+			use_ext_pgs = true;
+			max_pgs = MBUF_PEXT_MAX_PGS;
+
+			/* Start at last index, to wrap on first use. */
+			ext_pgs_idx = max_pgs - 1;
+		}
+
 		for (int i = 0; i < npages; i++) {
 			struct mbuf *m0;
 
@@ -764,6 +825,66 @@
 				npages = i;
 				softerr = EBUSY;
 				break;
+			}
+
+			if (use_ext_pgs) {
+				off_t xfs;
+
+				ext_pgs_idx++;
+				if (ext_pgs_idx == max_pgs) {
+					m0 = mb_alloc_ext_pgs(M_WAITOK, false,
+					    sendfile_free_mext_pg);
+
+					if (flags & SF_NOCACHE) {
+						m0->m_ext.ext_flags |=
+						    EXT_FLAG_NOCACHE;
+
+						/*
+						 * See comment below regarding
+						 * ignoring SF_NOCACHE for the
+						 * last page.
+						 */
+						if ((npages - i <= max_pgs) &&
+						    ((off + space) & PAGE_MASK) &&
+						    (rem > space || rhpages > 0))
+							m0->m_ext.ext_flags |=
+							    EXT_FLAG_CACHE_LAST;
+					}
+					if (sfs != NULL) {
+						m0->m_ext.ext_flags |=
+						    EXT_FLAG_SYNC;
+						m0->m_ext.ext_arg2 = sfs;
+						mtx_lock(&sfs->mtx);
+						sfs->count++;
+						mtx_unlock(&sfs->mtx);
+					}
+					ext_pgs = m0->m_ext.ext_pgs;
+					if (i == 0)
+						sfio->m = m0;
+					ext_pgs_idx = 0;
+
+					/* Append to mbuf chain. */
+					if (mtail != NULL)
+						mtail->m_next = m0;
+					else
+						m = m0;
+					mtail = m0;
+					ext_pgs->first_pg_off =
+					    vmoff(i, off) & PAGE_MASK;
+				}
+				if (nios) {
+					mtail->m_flags |= M_NOTREADY;
+					ext_pgs->nrdy++;
+				}
+
+				ext_pgs->pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pa[i]);
+				ext_pgs->npgs++;
+				xfs = xfsize(i, npages, off, space);
+				ext_pgs->last_pg_len = xfs;
+				MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs);
+				mtail->m_len += xfs;
+				mtail->m_ext.ext_size += PAGE_SIZE;
+				continue;
 			}
 
 			/*
Index: head/sys/sys/mbuf.h
===================================================================
--- head/sys/sys/mbuf.h
+++ head/sys/sys/mbuf.h
@@ -1129,6 +1129,7 @@
 extern int max_linkhdr;		/* Largest link-level header */
 extern int max_protohdr;	/* Largest protocol header */
 extern int nmbclusters;		/* Maximum number of clusters */
+extern bool mb_use_ext_pgs;	/* Use ext_pgs for sendfile */
 
 /*-
  * Network packets may have annotations attached by affixing a list of
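
For reference, the kern.ipc.mb_use_ext_pgs knob added in kern_mbuf.c is declared
CTLFLAG_RWTUN, so it can be set as a loader tunable or flipped at runtime.  The
sketch below is not part of the patch; it is a minimal userland example, assuming
a kernel built with this change, of enabling the feature through sysctlbyname(3),
equivalent to running "sysctl kern.ipc.mb_use_ext_pgs=1".

/*
 * Illustrative only: enable unmapped (EXT_PGS) mbufs for sendfile(2)
 * by writing the kern.ipc.mb_use_ext_pgs sysctl from userland.
 */
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	bool enable = true;

	/* SYSCTL_BOOL nodes accept a bool-sized write. */
	if (sysctlbyname("kern.ipc.mb_use_ext_pgs", NULL, NULL, &enable,
	    sizeof(enable)) != 0) {
		perror("sysctlbyname(kern.ipc.mb_use_ext_pgs)");
		return (1);
	}
	printf("unmapped mbufs enabled for sendfile(2)\n");
	return (0);
}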