Index: sbin/ifconfig/ifconfig.8 =================================================================== --- sbin/ifconfig/ifconfig.8 +++ sbin/ifconfig/ifconfig.8 @@ -538,6 +538,12 @@ If the driver supports .Xr tcp 4 large receive offloading, disable LRO on the interface. +.It Cm nomap +If the driver supports unmapped network buffers, +enable them on the interface. +.It Fl nomap +If the driver supports unmapped network buffers, +disable them on the interface. .It Cm wol , wol_ucast , wol_mcast , wol_magic Enable Wake On Lan (WOL) support, if available. WOL is a facility whereby a machine in a low power state may be woken Index: sbin/ifconfig/ifconfig.c =================================================================== --- sbin/ifconfig/ifconfig.c +++ sbin/ifconfig/ifconfig.c @@ -1257,7 +1257,7 @@ "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \ "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \ "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \ -"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP" +"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP" /* * Print the status of the interface. If an address family was @@ -1557,6 +1557,8 @@ DEF_CMD("-link2", -IFF_LINK2, setifflags), DEF_CMD("monitor", IFF_MONITOR, setifflags), DEF_CMD("-monitor", -IFF_MONITOR, setifflags), + DEF_CMD("nomap", IFCAP_NOMAP, setifcap), + DEF_CMD("-nomap", -IFCAP_NOMAP, setifcap), DEF_CMD("staticarp", IFF_STATICARP, setifflags), DEF_CMD("-staticarp", -IFF_STATICARP, setifflags), DEF_CMD("rxcsum6", IFCAP_RXCSUM_IPV6, setifcap), Index: share/man/man9/Makefile =================================================================== --- share/man/man9/Makefile +++ share/man/man9/Makefile @@ -1824,6 +1824,8 @@ MLINKS+=sglist.9 sglist_alloc.9 \ sglist.9 sglist_append.9 \ sglist.9 sglist_append_bio.9 \ + sglist.9 sglist_append_ext_pgs.9 \ + sglist.9 sglist_append_mb_ext_pgs.9 \ sglist.9 sglist_append_mbuf.9 \ sglist.9 sglist_append_phys.9 \ sglist.9 sglist_append_sglist.9 \ @@ -1834,6 +1836,8 @@ sglist.9 sglist_clone.9 \ sglist.9 sglist_consume_uio.9 \ sglist.9 sglist_count.9 \ + sglist.9 sglist_count_ext_pgs.9 \ + sglist.9 sglist_count_mb_ext_pgs.9 \ sglist.9 sglist_count_vmpages.9 \ sglist.9 sglist_free.9 \ sglist.9 sglist_hold.9 \ Index: share/man/man9/mbuf.9 =================================================================== --- share/man/man9/mbuf.9 +++ share/man/man9/mbuf.9 @@ -254,6 +254,8 @@ #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference */ +#define EXT_RXRING 8 /* data in NIC receive ring */ +#define EXT_PGS 9 /* array of unmapped pages */ #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */ Index: share/man/man9/sglist.9 =================================================================== --- share/man/man9/sglist.9 +++ share/man/man9/sglist.9 @@ -34,6 +34,8 @@ .Nm sglist_alloc , .Nm sglist_append , .Nm sglist_append_bio , +.Nm sglist_append_ext_pgs, +.Nm sglist_append_mb_ext_pgs, .Nm sglist_append_mbuf , .Nm sglist_append_phys , .Nm sglist_append_sglist , @@ -44,6 +46,8 @@ .Nm sglist_clone , .Nm sglist_consume_uio , .Nm sglist_count , +.Nm sglist_count_ext_pgs , +.Nm sglist_count_mb_ext_pgs , .Nm sglist_count_vmpages , .Nm sglist_free , .Nm sglist_hold , @@ -64,6 +68,10 @@ .Ft int 
.Fn sglist_append_bio "struct sglist *sg" "struct bio *bp" .Ft int +.Fn sglist_append_ext_pgs "struct sglist *sg" "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len" +.Ft int +.Fn sglist_append_mb_ext_pgs "struct sglist *sg" "struct mbuf *m" +.Ft int .Fn sglist_append_mbuf "struct sglist *sg" "struct mbuf *m" .Ft int .Fn sglist_append_phys "struct sglist *sg" "vm_paddr_t paddr" "size_t len" @@ -84,6 +92,10 @@ .Ft int .Fn sglist_count "void *buf" "size_t len" .Ft int +.Fn sglist_count_ext_pgs "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len" +.Ft int +.Fn sglist_count_mb_ext_pgs "struct mbuf *m" +.Ft int .Fn sglist_count_vmpages "vm_page_t *m" "size_t pgoff" "size_t len" .Ft void .Fn sglist_free "struct sglist *sg" @@ -146,6 +158,22 @@ bytes long. .Pp The +.Nm sglist_count_ext_pgs +function returns the number of scatter/gather list elements needed to describe +the unmapped external mbuf buffer +.Fa ext_pgs . +The ranges start at an offset of +.Fa offset +relative to the start of the buffer and is +.Fa len +bytes long. +The +.Nm sglist_count_mb_ext_pgs +function returns the number of scatter/gather list elements needed to describe +the physical address ranges of a single unmapped mbuf +.Fa m . +.Pp +The .Nm sglist_count_vmpages function returns the number of scatter/gather list elements needed to describe the physical address ranges of a buffer backed by an array of virtual memory @@ -237,6 +265,34 @@ .Fa sg . .Pp The +.Nm sglist_append_ext_pgs +function appends the physical address ranges described by the unmapped +external mbuf buffer +.Fa ext_pgs +to the scatter/gather list +.Fa sg . +The physical address ranges start at offset +.Fa offset +within +.Fa ext_pgs +and continue for +.Fa len +bytes. +.Pp +The +.Nm sglist_append_mb_ext_pgs +function appends the physical address ranges described by the unmapped +mbuf +.Fa m +to the scatter/gather list +.Fa sg . +Note that unlike +.Nm sglist_append_mbuf , +.Nm sglist_append_mb_ext_pgs +only adds ranges for a single mbuf, +not an entire mbuf chain. +.Pp +The .Nm sglist_append_mbuf function appends the physical address ranges described by an entire mbuf chain @@ -467,8 +523,7 @@ .Pp The .Nm sglist_count -and -.Nm sglist_count_vmpages +family of functions return a count of scatter/gather list elements. .Pp The Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4268,7 +4268,8 @@ netinet/tcp_output.c optional inet | inet6 netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6 netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6 -netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap +netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \ + compile-with "${NORMAL_C} ${NO_WNONNULL}" netinet/tcp_reass.c optional inet | inet6 netinet/tcp_sack.c optional inet | inet6 netinet/tcp_subr.c optional inet | inet6 Index: sys/conf/kern.mk =================================================================== --- sys/conf/kern.mk +++ sys/conf/kern.mk @@ -76,6 +76,7 @@ # GCC 4.2 doesn't have -Wno-error=cast-qual, so just disable the warning for # the few files that are already known to generate cast-qual warnings. 
NO_WCAST_QUAL= -Wno-cast-qual +NO_WNONNULL= -Wno-nonnull .endif .endif Index: sys/dev/cxgbe/t4_main.c =================================================================== --- sys/dev/cxgbe/t4_main.c +++ sys/dev/cxgbe/t4_main.c @@ -1623,7 +1623,7 @@ #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \ - IFCAP_HWRXTSTMP) + IFCAP_HWRXTSTMP | IFCAP_NOMAP) #define T4_CAP_ENABLE (T4_CAP) static int @@ -1986,6 +1986,8 @@ rxq->iq.flags &= ~IQ_RX_TIMESTAMP; } } + if (mask & IFCAP_NOMAP) + ifp->if_capenable ^= IFCAP_NOMAP; #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); Index: sys/dev/cxgbe/t4_sge.c =================================================================== --- sys/dev/cxgbe/t4_sge.c +++ sys/dev/cxgbe/t4_sge.c @@ -83,6 +83,7 @@ #endif /* Internal mbuf flags stored in PH_loc.eight[1]. */ +#define MC_NOMAP 0x01 #define MC_RAW_WR 0x02 /* @@ -2434,15 +2435,78 @@ return ((void *)p); } +static inline int +count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr) +{ + struct mbuf_ext_pgs *ext_pgs; + vm_paddr_t paddr; + int i, len, off, pglen, pgoff, seglen, segoff; + int nsegs = 0; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + off = mtod(m, vm_offset_t); + len = m->m_len; + off += skip; + len -= skip; + + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = min(seglen, len); + off = 0; + len -= seglen; + paddr = pmap_kextract( + (vm_offset_t)&ext_pgs->hdr[segoff]); + if (*nextaddr != paddr) + nsegs++; + *nextaddr = paddr + seglen; + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = min(seglen, len); + len -= seglen; + paddr = ext_pgs->pa[i] + segoff; + if (*nextaddr != paddr) + nsegs++; + *nextaddr = paddr + seglen; + pgoff = 0; + }; + if (len != 0) { + seglen = min(len, ext_pgs->trail_len - off); + len -= seglen; + paddr = pmap_kextract((vm_offset_t)&ext_pgs->trail[off]); + if (*nextaddr != paddr) + nsegs++; + *nextaddr = paddr + seglen; + } + + return (nsegs); +} + + /* * Can deal with empty mbufs in the chain that have m_len = 0, but the chain * must have at least one mbuf that's not empty. It is possible for this * routine to return 0 if skip accounts for all the contents of the mbuf chain. 
*/ static inline int -count_mbuf_nsegs(struct mbuf *m, int skip) +count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags) { - vm_paddr_t lastb, next; + vm_paddr_t nextaddr, paddr; vm_offset_t va; int len, nsegs; @@ -2451,9 +2515,8 @@ MPASS(m->m_pkthdr.len >= skip); nsegs = 0; - lastb = 0; + nextaddr = 0; for (; m; m = m->m_next) { - len = m->m_len; if (__predict_false(len == 0)) continue; @@ -2461,14 +2524,20 @@ skip -= len; continue; } + if ((m->m_flags & M_NOMAP) != 0) { + *cflags |= MC_NOMAP; + nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr); + skip = 0; + continue; + } va = mtod(m, vm_offset_t) + skip; len -= skip; skip = 0; - next = pmap_kextract(va); + paddr = pmap_kextract(va); nsegs += sglist_count((void *)(uintptr_t)va, len); - if (lastb + 1 == next) + if (paddr == nextaddr) nsegs--; - lastb = pmap_kextract(va + len - 1); + nextaddr = pmap_kextract(va + len - 1) + 1; } return (nsegs); @@ -2490,7 +2559,9 @@ struct tcphdr *tcp; #endif uint16_t eh_type; + uint8_t cflags; + cflags = 0; M_ASSERTPKTHDR(m0); if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { rc = EINVAL; @@ -2506,7 +2577,7 @@ */ M_ASSERTPKTHDR(m0); MPASS(m0->m_pkthdr.len > 0); - nsegs = count_mbuf_nsegs(m0, 0); + nsegs = count_mbuf_nsegs(m0, 0, &cflags); if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) { if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) { rc = EFBIG; @@ -2516,7 +2587,8 @@ goto restart; } - if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) { + if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN && + !(cflags & MC_NOMAP))) { m0 = m_pullup(m0, m0->m_pkthdr.len); if (m0 == NULL) { /* Should have left well enough alone. */ @@ -2527,7 +2599,7 @@ goto restart; } set_mbuf_nsegs(m0, nsegs); - set_mbuf_cflags(m0, 0); + set_mbuf_cflags(m0, cflags); if (sc->flags & IS_VF) set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0))); else @@ -2616,7 +2688,9 @@ /* EO WRs have the headers in the WR and not the GL. */ immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen; - nsegs = count_mbuf_nsegs(m0, immhdrs); + cflags = 0; + nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags); + MPASS(cflags == mbuf_cflags(m0)); set_mbuf_eo_nsegs(m0, nsegs); set_mbuf_eo_len16(m0, txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0))); @@ -4723,7 +4797,8 @@ ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); - else if (pktlen <= imm_payload(2) && available >= 2) { + else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) && + available >= 2) { /* Immediate data. Recalculate len16 and set nsegs to 0. 
*/ ctrl += pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + Index: sys/dev/cxgbe/tom/t4_cpl_io.c =================================================================== --- sys/dev/cxgbe/tom/t4_cpl_io.c +++ sys/dev/cxgbe/tom/t4_cpl_io.c @@ -666,6 +666,8 @@ if (IS_AIOTX_MBUF(m)) rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m), aiotx_mbuf_pgoff(m), m->m_len); + else if (m->m_flags & M_NOMAP) + rc = sglist_append_mb_ext_pgs(&sg, m); else rc = sglist_append(&sg, mtod(m, void *), m->m_len); if (__predict_false(rc != 0)) @@ -787,6 +789,8 @@ if (IS_AIOTX_MBUF(m)) n = sglist_count_vmpages(aiotx_mbuf_pages(m), aiotx_mbuf_pgoff(m), m->m_len); + else if (m->m_flags & M_NOMAP) + n = sglist_count_mb_ext_pgs(m); else n = sglist_count(mtod(m, void *), m->m_len); Index: sys/dev/mlx5/mlx5_en/mlx5_en_main.c =================================================================== --- sys/dev/mlx5/mlx5_en/mlx5_en_main.c +++ sys/dev/mlx5/mlx5_en/mlx5_en_main.c @@ -3279,6 +3279,8 @@ "tso6 disabled due to -txcsum6.\n"); } } + if (mask & IFCAP_NOMAP) + ifp->if_capenable ^= IFCAP_NOMAP; if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) @@ -4145,6 +4147,7 @@ ifp->if_capabilities |= IFCAP_LRO; ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO; ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP; + ifp->if_capabilities |= IFCAP_NOMAP; ifp->if_capabilities |= IFCAP_TXRTLMT; ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc; ifp->if_snd_tag_free = mlx5e_snd_tag_free; Index: sys/kern/kern_mbuf.c =================================================================== --- sys/kern/kern_mbuf.c +++ sys/kern/kern_mbuf.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +112,11 @@ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ +bool mb_use_ext_pgs; /* use EXT_PGS mbufs for sendfile */ +SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN, + &mb_use_ext_pgs, 0, + "Use unmapped mbufs for sendfile(2)"); + static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, @@ -281,6 +287,7 @@ uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; +uma_zone_t zone_extpgs; /* * Local prototypes. @@ -298,6 +305,9 @@ /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); +_Static_assert(sizeof(struct mbuf_ext_pgs) == 256, + "mbuf_ext_pgs size mismatch"); + /* * Initialize FreeBSD Network buffer allocation. */ @@ -379,6 +389,15 @@ uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); + zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME, + sizeof(struct mbuf_ext_pgs), +#ifdef INVARIANTS + trash_ctor, trash_dtor, trash_init, trash_fini, +#else + NULL, NULL, NULL, NULL, +#endif + UMA_ALIGN_CACHE, 0); + /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA @@ -823,6 +842,390 @@ (*pr->pr_drain)(); } +/* + * Free "count" units of I/O from an mbuf chain. They could be held + * in EXT_PGS or just as a normal mbuf. This code is intended to be + * called in an error path (I/O error, closed connection, etc). 
+ */
+void
+mb_free_notready(struct mbuf *m, int count)
+{
+	int i;
+
+	for (i = 0; i < count && m != NULL; i++) {
+		if ((m->m_flags & M_EXT) != 0 &&
+		    m->m_ext.ext_type == EXT_PGS) {
+			m->m_ext.ext_pgs->nrdy--;
+			if (m->m_ext.ext_pgs->nrdy != 0)
+				continue;
+		}
+		m = m_free(m);
+	}
+	KASSERT(i == count, ("Removed only %d items from %p", i, m));
+}
+
+/*
+ * Ensure it is possible to downgrade an EXT_PGS mbuf
+ * to a normal mbuf.
+ *
+ * XXXJHB: I think this is no longer needed? The callers of
+ * mb_unmapped_compress all check the length against MLEN, and
+ * mb_unmapped_compress allows data to be stored in unmapped pages.
+ */
+CTASSERT(MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN < MLEN);
+
+/*
+ * Compress an unmapped mbuf into a simple mbuf when it holds a small
+ * amount of data. This is used as a DOS defense to avoid having
+ * small packets tie up wired pages, an ext_pgs structure, and an
+ * mbuf. Since this converts the existing mbuf in place, it can only
+ * be used if there are no other references to 'm'.
+ */
+int
+mb_unmapped_compress(struct mbuf *m)
+{
+	volatile u_int *refcnt;
+	struct mbuf m_temp;
+
+	/*
+	 * Assert that 'm' does not have a packet header. If 'm' had
+	 * a packet header, it would only be able to hold MHLEN bytes
+	 * and m_data would have to be initialized differently.
+	 */
+	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
+	    m->m_ext.ext_type == EXT_PGS,
+	    ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
+	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
+
+	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+		refcnt = &m->m_ext.ext_count;
+	} else {
+		KASSERT(m->m_ext.ext_cnt != NULL,
+		    ("%s: no refcounting pointer on %p", __func__, m));
+		refcnt = m->m_ext.ext_cnt;
+	}
+
+	if (*refcnt != 1)
+		return (EBUSY);
+
+	/*
+	 * Copy mbuf header and m_ext portion of 'm' to 'm_temp' to
+	 * create a "fake" EXT_PGS mbuf that can be used with
+	 * m_copydata() as well as the ext_free callback.
+	 */
+	memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext));
+	m_temp.m_next = NULL;
+	m_temp.m_nextpkt = NULL;
+
+	/* Turn 'm' into a "normal" mbuf. */
+	m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
+	m->m_data = m->m_dat;
+
+	/* Copy data from template's ext_pgs. */
+	m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t));
+
+	/* Free the backing pages. */
+	m_temp.m_ext.ext_free(&m_temp);
+
+	/* Finally, free the ext_pgs struct. */
+	uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs);
+	return (0);
+}
+
+/*
+ * These next few routines are used to permit downgrading an unmapped
+ * mbuf to a chain of mapped mbufs. This is used when an interface
+ * doesn't support unmapped mbufs or if checksums need to be
+ * computed in software.
+ *
+ * Each unmapped mbuf is converted to a chain of mbufs. First, any
+ * TLS header data is stored in a regular mbuf. Second, each page of
+ * unmapped data is stored in an mbuf with an EXT_SFBUF external
+ * cluster. These mbufs use an sf_buf to provide a valid KVA for the
+ * associated physical page. They also hold a reference on the
+ * original EXT_PGS mbuf to ensure the physical page doesn't go away.
+ * Finally, any TLS trailer data is stored in a regular mbuf.
+ *
+ * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
+ * mbufs. It frees the associated sf_buf and releases its reference
+ * on the original EXT_PGS mbuf.
+ *
+ * _mb_unmapped_to_ext() is a helper function that converts a single
+ * unmapped mbuf into a chain of mbufs.
+ * + * mb_unmapped_to_ext() is the public function that walks an mbuf + * chain converting any unmapped mbufs to mapped mbufs. It returns + * the new chain of unmapped mbufs on success. On failure it frees + * the original mbuf chain and returns NULL. + */ +static void +mb_unmapped_free_mext(struct mbuf *m) +{ + struct sf_buf *sf; + struct mbuf *old_m; + + sf = m->m_ext.ext_arg1; + sf_buf_free(sf); + + /* Drop the reference on the backing EXT_PGS mbuf. */ + old_m = m->m_ext.ext_arg2; + mb_free_ext(old_m); +} + +static struct mbuf * +_mb_unmapped_to_ext(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + struct mbuf *m_new, *top, *prev, *mref; + struct sf_buf *sf; + vm_page_t pg; + int i, len, off, pglen, pgoff, seglen, segoff; + volatile u_int *refcnt; + u_int ref_inc = 0; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + len = m->m_len; + KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p", + __func__, m)); + + /* See if this is the mbuf that holds the embedded refcount. */ + if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { + refcnt = &m->m_ext.ext_count; + mref = m; + } else { + KASSERT(m->m_ext.ext_cnt != NULL, + ("%s: no refcounting pointer on %p", __func__, m)); + refcnt = m->m_ext.ext_cnt; + mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); + } + + /* Skip over any data removed from the front. */ + off = mtod(m, vm_offset_t); + + top = NULL; + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = min(seglen, len); + off = 0; + len -= seglen; + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + m_new->m_len = seglen; + prev = top = m_new; + memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff], + seglen); + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = min(seglen, len); + len -= seglen; + + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + if (top == NULL) { + top = prev = m_new; + } else { + prev->m_next = m_new; + prev = m_new; + } + sf = sf_buf_alloc(pg, SFB_NOWAIT); + if (sf == NULL) + goto fail; + + ref_inc++; + m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE, + mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF); + m_new->m_data += segoff; + m_new->m_len = seglen; + + pgoff = 0; + }; + if (len != 0) { + KASSERT((off + len) <= ext_pgs->trail_len, + ("off + len > trail (%d + %d > %d)", off, len, + ext_pgs->trail_len)); + m_new = m_get(M_NOWAIT, MT_DATA); + if (m_new == NULL) + goto fail; + if (top == NULL) + top = m_new; + else + prev->m_next = m_new; + m_new->m_len = len; + memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len); + } + + if (ref_inc != 0) { + /* + * Obtain an additional reference on the old mbuf for + * each created EXT_SFBUF mbuf. They will be dropped + * in mb_unmapped_free_mext(). + */ + if (*refcnt == 1) + *refcnt += ref_inc; + else + atomic_add_int(refcnt, ref_inc); + } + m_free(m); + return (top); + +fail: + if (ref_inc != 0) { + /* + * Obtain an additional reference on the old mbuf for + * each created EXT_SFBUF mbuf. They will be + * immediately dropped when these mbufs are freed + * below. 
+ */ + if (*refcnt == 1) + *refcnt += ref_inc; + else + atomic_add_int(refcnt, ref_inc); + } + m_free(m); + m_freem(top); + return (NULL); +} + +struct mbuf * +mb_unmapped_to_ext(struct mbuf *top) +{ + struct mbuf *m, *next, *prev = NULL; + + prev = NULL; + for (m = top; m != NULL; m = next) { + /* m might be freed, so cache the next pointer. */ + next = m->m_next; + if (m->m_flags & M_NOMAP) { + if (prev != NULL) { + /* + * Remove 'm' from the new chain so + * that the 'top' chain terminates + * before 'm' in case 'top' is freed + * due to an error. + */ + prev->m_next = NULL; + } + m = _mb_unmapped_to_ext(m); + if (m == NULL) { + m_freem(top); + m_freem(next); + return (NULL); + } + if (prev == NULL) { + top = m; + } else { + prev->m_next = m; + } + + /* + * Replaced one mbuf with a chain, so we must + * find the end of chain. + */ + prev = m_last(m); + } else { + if (prev != NULL) { + prev->m_next = m; + } + prev = m; + } + } + return (top); +} + +/* + * Allocate an empty EXT_PGS mbuf. The ext_free routine is + * responsible for freeing any pages backing this mbuf when it is + * freed. + */ +struct mbuf * +mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free) +{ + struct mbuf *m; + struct mbuf_ext_pgs *ext_pgs; + + if (pkthdr) + m = m_gethdr(how, MT_DATA); + else + m = m_get(how, MT_DATA); + if (m == NULL) + return (NULL); + + ext_pgs = uma_zalloc(zone_extpgs, how); + if (ext_pgs == NULL) { + m_free(m); + return (NULL); + } + ext_pgs->npgs = 0; + ext_pgs->nrdy = 0; + ext_pgs->first_pg_off = 0; + ext_pgs->last_pg_len = 0; + ext_pgs->hdr_len = 0; + ext_pgs->trail_len = 0; + ext_pgs->tls = NULL; + ext_pgs->so = NULL; + m->m_data = NULL; + m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP); + m->m_ext.ext_type = EXT_PGS; + m->m_ext.ext_flags = EXT_FLAG_EMBREF; + m->m_ext.ext_count = 1; + m->m_ext.ext_pgs = ext_pgs; + m->m_ext.ext_size = 0; + m->m_ext.ext_free = ext_free; + return (m); +} + +#ifdef INVARIANT_SUPPORT +void +mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs) +{ + + /* + * NB: This expects a non-empty buffer (npgs > 0 and + * last_pg_len > 0). + */ + KASSERT(ext_pgs->npgs > 0, + ("ext_pgs with no valid pages: %p", ext_pgs)); + KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa), + ("ext_pgs with too many pages: %p", ext_pgs)); + KASSERT(ext_pgs->nrdy <= ext_pgs->npgs, + ("ext_pgs with too many ready pages: %p", ext_pgs)); + KASSERT(ext_pgs->first_pg_off < PAGE_SIZE, + ("ext_pgs with too large page offset: %p", ext_pgs)); + KASSERT(ext_pgs->last_pg_len > 0, + ("ext_pgs with zero last page length: %p", ext_pgs)); + KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE, + ("ext_pgs with too large last page length: %p", ext_pgs)); + if (ext_pgs->npgs == 1) { + KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <= + PAGE_SIZE, ("ext_pgs with single page too large: %p", + ext_pgs)); + } + KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr), + ("ext_pgs with too large header length: %p", ext_pgs)); + KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail), + ("ext_pgs with too large header length: %p", ext_pgs)); +} +#endif + /* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. 
@@ -888,6 +1291,10 @@ uma_zfree(zone_jumbo16, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; + case EXT_PGS: + uma_zfree(zone_extpgs, mref->m_ext.ext_pgs); + uma_zfree(zone_mbuf, mref); + break; case EXT_SFBUF: case EXT_NET_DRV: case EXT_MOD_TYPE: Index: sys/kern/kern_sendfile.c =================================================================== --- sys/kern/kern_sendfile.c +++ sys/kern/kern_sendfile.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,7 @@ #define EXT_FLAG_SYNC EXT_FLAG_VENDOR1 #define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2 +#define EXT_FLAG_CACHE_LAST EXT_FLAG_VENDOR3 /* * Structure describing a single sendfile(2) I/O, which may consist of @@ -201,6 +203,39 @@ } } +static void +sendfile_free_mext_pg(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + vm_page_t pg; + int i; + bool nocache, cache_last; + + KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS, + ("%s: m %p !M_EXT or !EXT_PGS", __func__, m)); + + nocache = m->m_ext.ext_flags & EXT_FLAG_NOCACHE; + cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST; + ext_pgs = m->m_ext.ext_pgs; + + for (i = 0; i < ext_pgs->npgs; i++) { + if (cache_last && i == ext_pgs->npgs - 1) + nocache = false; + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + sendfile_free_page(pg, nocache); + } + + if (m->m_ext.ext_flags & EXT_FLAG_SYNC) { + struct sendfile_sync *sfs = m->m_ext.ext_arg2; + + mtx_lock(&sfs->mtx); + KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0")); + if (--sfs->count == 0) + cv_signal(&sfs->cv); + mtx_unlock(&sfs->mtx); + } +} + /* * Helper function to calculate how much data to put into page i of n. * Only first and last pages are special. @@ -283,8 +318,6 @@ CURVNET_SET(so->so_vnet); if (sfio->error) { - struct mbuf *m; - /* * I/O operation failed. The state of data in the socket * is now inconsistent, and all what we can do is to tear @@ -299,11 +332,9 @@ so->so_proto->pr_usrreqs->pru_abort(so); so->so_error = EIO; - m = sfio->m; - for (int i = 0; i < sfio->npages; i++) - m = m_free(m); + mb_free_notready(sfio->m, sfio->npages); } else - (void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, + (void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m, sfio->npages); SOCK_LOCK(so); @@ -540,13 +571,15 @@ struct vnode *vp; struct vm_object *obj; struct socket *so; + struct mbuf_ext_pgs *ext_pgs; struct mbuf *m, *mh, *mhtail; struct sf_buf *sf; struct shmfd *shmfd; struct sendfile_sync *sfs; struct vattr va; off_t off, sbytes, rem, obj_size; - int error, softerr, bsize, hdrlen; + int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr; + bool use_ext_pgs; obj = NULL; so = NULL; @@ -554,6 +587,7 @@ sfs = NULL; hdrlen = sbytes = 0; softerr = 0; + use_ext_pgs = false; error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize); if (error != 0) @@ -714,6 +748,17 @@ if (space > rem) space = rem; + else if (space > PAGE_SIZE) { + /* + * Use page boundaries when possible for large + * requests. + */ + if (off & PAGE_MASK) + space -= (PAGE_SIZE - (off & PAGE_MASK)); + space = trunc_page(space); + if (off & PAGE_MASK) + space += (PAGE_SIZE - (off & PAGE_MASK)); + } npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE); @@ -751,6 +796,22 @@ * dumped into socket buffer. */ pa = sfio->pa; + + /* + * Use unmapped mbufs if enabled for TCP. Unmapped + * bufs are restricted to TCP as that is what has been + * tested. In particular, unmapped mbufs have not + * been tested with UNIX-domain sockets. 
+ */ + if (mb_use_ext_pgs && + so->so_proto->pr_protocol == IPPROTO_TCP) { + use_ext_pgs = true; + max_pgs = MBUF_PEXT_MAX_PGS; + + /* Start at last index, to wrap on first use. */ + ext_pgs_idx = max_pgs - 1; + } + for (int i = 0; i < npages; i++) { struct mbuf *m0; @@ -766,6 +827,66 @@ break; } + if (use_ext_pgs) { + off_t xfs; + + ext_pgs_idx++; + if (ext_pgs_idx == max_pgs) { + m0 = mb_alloc_ext_pgs(M_WAITOK, false, + sendfile_free_mext_pg); + + if (flags & SF_NOCACHE) { + m0->m_ext.ext_flags |= + EXT_FLAG_NOCACHE; + + /* + * See comment below regarding + * ignoring SF_NOCACHE for the + * last page. + */ + if ((npages - i <= max_pgs) && + ((off + space) & PAGE_MASK) && + (rem > space || rhpages > 0)) + m0->m_ext.ext_flags |= + EXT_FLAG_CACHE_LAST; + } + if (sfs != NULL) { + m0->m_ext.ext_flags |= + EXT_FLAG_SYNC; + m0->m_ext.ext_arg2 = sfs; + mtx_lock(&sfs->mtx); + sfs->count++; + mtx_unlock(&sfs->mtx); + } + ext_pgs = m0->m_ext.ext_pgs; + if (i == 0) + sfio->m = m0; + ext_pgs_idx = 0; + + /* Append to mbuf chain. */ + if (mtail != NULL) + mtail->m_next = m0; + else + m = m0; + mtail = m0; + ext_pgs->first_pg_off = + vmoff(i, off) & PAGE_MASK; + } + if (nios) { + mtail->m_flags |= M_NOTREADY; + ext_pgs->nrdy++; + } + + ext_pgs->pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pa[i]); + ext_pgs->npgs++; + xfs = xfsize(i, npages, off, space); + ext_pgs->last_pg_len = xfs; + MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs); + mtail->m_len += xfs; + mtail->m_ext.ext_size += PAGE_SIZE; + continue; + } + /* * Get a sendfile buf. When allocating the * first buffer for mbuf chain, we usually Index: sys/kern/subr_bus_dma.c =================================================================== --- sys/kern/subr_bus_dma.c +++ sys/kern/subr_bus_dma.c @@ -110,6 +110,67 @@ return (error); } +/* + * Load an unmapped mbuf + */ +static int +_bus_dmamap_load_unmapped_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, + struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags) +{ + struct mbuf_ext_pgs *ext_pgs; + int error, i, off, len, pglen, pgoff, seglen, segoff; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + + len = m->m_len; + error = 0; + + /* Skip over any data removed from the front. */ + off = mtod(m, vm_offset_t); + + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = min(seglen, len); + off = 0; + len -= seglen; + error = _bus_dmamap_load_buffer(dmat, map, + &ext_pgs->hdr[segoff], seglen, kernel_pmap, + flags, segs, nsegs); + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = min(seglen, len); + len -= seglen; + error = _bus_dmamap_load_phys(dmat, map, + ext_pgs->pa[i] + segoff, seglen, flags, segs, nsegs); + pgoff = 0; + }; + if (len != 0 && error == 0) { + KASSERT((off + len) <= ext_pgs->trail_len, + ("off + len > trail (%d + %d > %d)", off, len, + ext_pgs->trail_len)); + error = _bus_dmamap_load_buffer(dmat, map, + &ext_pgs->trail[off], len, kernel_pmap, flags, segs, + nsegs); + } + return (error); +} + /* * Load an mbuf chain. 
*/ @@ -123,9 +184,13 @@ error = 0; for (m = m0; m != NULL && error == 0; m = m->m_next) { if (m->m_len > 0) { - error = _bus_dmamap_load_buffer(dmat, map, m->m_data, - m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF, - segs, nsegs); + if ((m->m_flags & M_NOMAP) != 0) + error = _bus_dmamap_load_unmapped_mbuf_sg(dmat, + map, m, segs, nsegs, flags); + else + error = _bus_dmamap_load_buffer(dmat, map, + m->m_data, m->m_len, kernel_pmap, + flags | BUS_DMA_LOAD_MBUF, segs, nsegs); } } CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d", Index: sys/kern/subr_sglist.c =================================================================== --- sys/kern/subr_sglist.c +++ sys/kern/subr_sglist.c @@ -218,6 +218,75 @@ return (nsegs); } +/* + * Determine the number of scatter/gather list elements needed to + * describe an EXT_PGS buffer. + */ +int +sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, size_t len) +{ + vm_paddr_t nextaddr, paddr; + size_t seglen, segoff; + int i, nsegs, pglen, pgoff; + + if (len == 0) + return (0); + + nsegs = 0; + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = MIN(seglen, len); + off = 0; + len -= seglen; + nsegs += sglist_count(&ext_pgs->hdr[segoff], seglen); + } + } + nextaddr = 0; + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = MIN(seglen, len); + len -= seglen; + paddr = ext_pgs->pa[i] + segoff; + if (paddr != nextaddr) + nsegs++; + nextaddr = paddr + seglen; + pgoff = 0; + }; + if (len != 0) { + seglen = MIN(len, ext_pgs->trail_len - off); + len -= seglen; + nsegs += sglist_count(&ext_pgs->trail[off], seglen); + } + KASSERT(len == 0, ("len != 0")); + return (nsegs); +} + +/* + * Determine the number of scatter/gather list elements needed to + * describe an EXT_PGS mbuf. + */ +int +sglist_count_mb_ext_pgs(struct mbuf *m) +{ + + MBUF_EXT_PGS_ASSERT(m); + return (sglist_count_ext_pgs(m->m_ext.ext_pgs, mtod(m, vm_offset_t), + m->m_len)); +} + /* * Allocate a scatter/gather list along with 'nsegs' segments. The * 'mflags' parameters are the same as passed to malloc(9). The caller @@ -319,6 +388,76 @@ return (error); } +/* + * Append the segments to describe an EXT_PGS buffer to a + * scatter/gather list. If there are insufficient segments, then this + * fails with EFBIG. 
+ */ +int +sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs, + size_t off, size_t len) +{ + size_t seglen, segoff; + vm_paddr_t paddr; + int error, i, pglen, pgoff; + + error = 0; + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = MIN(seglen, len); + off = 0; + len -= seglen; + error = sglist_append(sg, + &ext_pgs->hdr[segoff], seglen); + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = MIN(seglen, len); + len -= seglen; + paddr = ext_pgs->pa[i] + segoff; + error = sglist_append_phys(sg, paddr, seglen); + pgoff = 0; + }; + if (error == 0 && len > 0) { + seglen = MIN(len, ext_pgs->trail_len - off); + len -= seglen; + error = sglist_append(sg, + &ext_pgs->trail[off], seglen); + } + if (error == 0) + KASSERT(len == 0, ("len != 0")); + return (error); +} + +/* + * Append the segments to describe an EXT_PGS mbuf to a scatter/gather + * list. If there are insufficient segments, then this fails with + * EFBIG. + */ +int +sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m) +{ + + /* for now, all unmapped mbufs are assumed to be EXT_PGS */ + MBUF_EXT_PGS_ASSERT(m); + return (sglist_append_ext_pgs(sg, m->m_ext.ext_pgs, + mtod(m, vm_offset_t), m->m_len)); +} + /* * Append the segments that describe a single mbuf chain to a * scatter/gather list. If there are insufficient segments, then this @@ -338,7 +477,11 @@ SGLIST_SAVE(sg, save); for (m = m0; m != NULL; m = m->m_next) { if (m->m_len > 0) { - error = sglist_append(sg, m->m_data, m->m_len); + if ((m->m_flags & M_NOMAP) != 0) + error = sglist_append_mb_ext_pgs(sg, m); + else + error = sglist_append(sg, m->m_data, + m->m_len); if (error) { SGLIST_RESTORE(sg, save); return (error); Index: sys/kern/uipc_mbuf.c =================================================================== --- sys/kern/uipc_mbuf.c +++ sys/kern/uipc_mbuf.c @@ -50,6 +50,10 @@ #include #include #include +#include +#include +#include +#include SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, "struct mbuf *", "mbufinfo_t *", @@ -202,7 +206,7 @@ else bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); n->m_flags |= M_EXT; - n->m_flags |= m->m_flags & M_RDONLY; + n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP); /* See if this is the mbuf that holds the embedded refcount. 
*/ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { @@ -246,7 +250,8 @@ __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); - m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags); + m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | + M_NOMAP | flags); } } @@ -376,7 +381,8 @@ if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif - to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); + to->m_flags = (from->m_flags & M_COPYFLAGS) | + (to->m_flags & (M_EXT | M_NOMAP)); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ @@ -414,7 +420,8 @@ if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif - to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); + to->m_flags = (from->m_flags & M_COPYFLAGS) | + (to->m_flags & (M_EXT | M_NOMAP)); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; @@ -579,6 +586,30 @@ return (NULL); } +static void +m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp) +{ + struct iovec iov; + struct uio uio; + int error; + + KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off)); + KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len)); + KASSERT(off < m->m_len, + ("m_copyfromunmapped: len exceeds mbuf length")); + iov.iov_base = cp; + iov.iov_len = len; + uio.uio_resid = len; + uio.uio_iov = &iov; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_rw = UIO_READ; + error = m_unmappedtouio(m, off, &uio, len); + KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off, + len)); +} + /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. @@ -600,7 +631,10 @@ while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); - bcopy(mtod(m, caddr_t) + off, cp, count); + if ((m->m_flags & M_NOMAP) != 0) + m_copyfromunmapped(m, off, count, cp); + else + bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; @@ -695,6 +729,7 @@ m = m->m_next; while (n) { if (!M_WRITABLE(m) || + (n->m_flags & M_NOMAP) != 0 || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; @@ -812,6 +847,9 @@ int count; int space; + KASSERT((n->m_flags & M_NOMAP) == 0, + ("%s: unmapped mbuf %p", __func__, n)); + /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, @@ -1364,6 +1402,41 @@ return (NULL); } +/* + * Return the number of fragments an mbuf will use. This is usually + * used as a proxy for the number of scatter/gather elements needed by + * a DMA engine to access an mbuf. In general mapped mbufs are + * assumed to be backed by physically contiguous buffers that only + * need a single fragment. Unmapped mbufs, on the other hand, can + * span disjoint physical pages. + */ +static int +frags_per_mbuf(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + int frags; + + if ((m->m_flags & M_NOMAP) == 0) + return (1); + + /* + * The header and trailer are counted as a single fragment + * each when present. + * + * XXX: This overestimates the number of fragments by assuming + * all the backing physical pages are disjoint. 
+ */ + ext_pgs = m->m_ext.ext_pgs; + frags = 0; + if (ext_pgs->hdr_len != 0) + frags++; + frags += ext_pgs->npgs; + if (ext_pgs->trail_len != 0) + frags++; + + return (frags); +} + /* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible NULL is returned and @@ -1384,7 +1457,7 @@ */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) - curfrags++; + curfrags += frags_per_mbuf(m); /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the @@ -1399,12 +1472,13 @@ break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { - bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, - n->m_len); + m_copydata(n, 0, n->m_len, + mtod(m, char *) + m->m_len); m->m_len += n->m_len; m->m_next = n->m_next; + curfrags -= frags_per_mbuf(n); m_free(n); - if (--curfrags <= maxfrags) + if (curfrags <= maxfrags) return m0; } else m = n; @@ -1421,15 +1495,18 @@ m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; - bcopy(mtod(n, void *), mtod(m, void *), n->m_len); - bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, - n2->m_len); + m_copydata(n, 0, n->m_len, mtod(m, char *)); + m_copydata(n2, 0, n2->m_len, + mtod(m, char *) + n->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; + curfrags += 1; /* For the new cluster */ + curfrags -= frags_per_mbuf(n); + curfrags -= frags_per_mbuf(n2); m_free(n); m_free(n2); - if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ + if (curfrags <= maxfrags) return m0; /* * Still not there, try the normal collapse @@ -1529,6 +1606,111 @@ #endif +/* + * Free pages from mbuf_ext_pgs, assuming they were allocated via + * vm_page_alloc() and aren't associated with any object. Complement + * to allocator from m_uiotombuf_nomap(). + */ +void +mb_free_mext_pgs(struct mbuf *m) +{ + struct mbuf_ext_pgs *ext_pgs; + vm_page_t pg; + int wire_adj; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + wire_adj = 0; + for (int i = 0; i < ext_pgs->npgs; i++) { + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + /* + * Note: page is not locked, as it has no + * object and is not on any queues. + */ + vm_page_free_toq(pg); + wire_adj++; + } + if (wire_adj) + vm_wire_sub(wire_adj); +} + +static struct mbuf * +m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags) +{ + struct mbuf *m, *mb, *prev; + struct mbuf_ext_pgs *pgs; + vm_page_t pg_array[MBUF_PEXT_MAX_PGS]; + int error, length, i, needed, wire_adj = 0; + ssize_t total; + int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP; + + /* + * len can be zero or an arbitrary large value bound by + * the total data supplied by the uio. 
+ */ + if (len > 0) + total = MIN(uio->uio_resid, len); + else + total = uio->uio_resid; + + if (maxseg == 0) + maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE; + + /* + * Allocate the pages + */ + m = NULL; + while (total > 0) { + mb = mb_alloc_ext_pgs(how, (flags & M_PKTHDR), + mb_free_mext_pgs); + if (mb == NULL) + goto failed; + if (m == NULL) + m = mb; + else + prev->m_next = mb; + prev = mb; + pgs = mb->m_ext.ext_pgs; + needed = length = MIN(maxseg, total); + for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) { +retry_page: + pg_array[i] = vm_page_alloc(NULL, 0, pflags); + if (pg_array[i] == NULL) { + if (wire_adj) + vm_wire_add(wire_adj); + wire_adj = 0; + if (how & M_NOWAIT) { + goto failed; + } else { + vm_wait(NULL); + goto retry_page; + } + } + wire_adj++; + pg_array[i]->flags &= ~PG_ZERO; + pgs->pa[i] = VM_PAGE_TO_PHYS(pg_array[i]); + pgs->npgs++; + } + pgs->last_pg_len = length - PAGE_SIZE * (pgs->npgs - 1); + MBUF_EXT_PGS_ASSERT_SANITY(pgs); + vm_wire_add(wire_adj); + wire_adj = 0; + total -= length; + error = uiomove_fromphys(pg_array, 0, length, uio); + if (error != 0) + goto failed; + mb->m_len = length; + mb->m_ext.ext_size += PAGE_SIZE * pgs->npgs; + if (flags & M_PKTHDR) + m->m_pkthdr.len += length; + } + return (m); + +failed: + m_freem(m); + return (NULL); +} + /* * Copy the contents of uio into a properly sized mbuf chain. */ @@ -1540,6 +1722,9 @@ ssize_t total; int progress = 0; + if (flags & M_NOMAP) + return (m_uiotombuf_nomap(uio, how, len, align, flags)); + /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. @@ -1585,6 +1770,62 @@ return (m); } +/* + * Copy data from an unmapped mbuf into a uio limited by len if set. + */ +int +m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len) +{ + struct mbuf_ext_pgs *ext_pgs; + vm_page_t pg; + int error, i, off, pglen, pgoff, seglen, segoff; + + MBUF_EXT_PGS_ASSERT(m); + ext_pgs = m->m_ext.ext_pgs; + error = 0; + + /* Skip over any data removed from the front. */ + off = mtod(m, vm_offset_t); + + off += m_off; + if (ext_pgs->hdr_len != 0) { + if (off >= ext_pgs->hdr_len) { + off -= ext_pgs->hdr_len; + } else { + seglen = ext_pgs->hdr_len - off; + segoff = off; + seglen = min(seglen, len); + off = 0; + len -= seglen; + error = uiomove(&ext_pgs->hdr[segoff], seglen, uio); + } + } + pgoff = ext_pgs->first_pg_off; + for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) { + pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff); + if (off >= pglen) { + off -= pglen; + pgoff = 0; + continue; + } + seglen = pglen - off; + segoff = pgoff + off; + off = 0; + seglen = min(seglen, len); + len -= seglen; + pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]); + error = uiomove_fromphys(&pg, segoff, seglen, uio); + pgoff = 0; + }; + if (len != 0 && error == 0) { + KASSERT((off + len) <= ext_pgs->trail_len, + ("off + len > trail (%d + %d > %d, m_off = %d)", off, len, + ext_pgs->trail_len, m_off)); + error = uiomove(&ext_pgs->trail[off], len, uio); + } + return (error); +} + /* * Copy an mbuf chain into a uio limited by len if set. 
*/ @@ -1603,7 +1844,10 @@ for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); - error = uiomove(mtod(m, void *), length, uio); + if ((m->m_flags & M_NOMAP) != 0) + error = m_unmappedtouio(m, 0, uio, length); + else + error = uiomove(mtod(m, void *), length, uio); if (error) return (error); Index: sys/kern/uipc_sockbuf.c =================================================================== --- sys/kern/uipc_sockbuf.c +++ sys/kern/uipc_sockbuf.c @@ -89,28 +89,130 @@ } /* - * Mark ready "count" mbufs starting with "m". + * Compress M_NOTREADY mbufs after they have been readied by sbready(). + * + * sbcompress() skips M_NOTREADY mbufs since the data is not available to + * be copied at the time of sbcompress(). This function combines small + * mbufs similar to sbcompress() once mbufs are ready. 'm0' is the first + * mbuf sbready() marked ready, and 'end' is the first mbuf still not + * ready. + */ +static void +sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end) +{ + struct mbuf *m, *n; + int ext_size; + + SOCKBUF_LOCK_ASSERT(sb); + + if ((sb->sb_flags & SB_NOCOALESCE) != 0) + return; + + for (m = m0; m != end; m = m->m_next) { + MPASS((m->m_flags & M_NOTREADY) == 0); + + /* Compress small unmapped mbufs into plain mbufs. */ + if ((m->m_flags & M_NOMAP) && m->m_len <= MLEN) { + MPASS(m->m_flags & M_EXT); + ext_size = m->m_ext.ext_size; + if (mb_unmapped_compress(m) == 0) { + sb->sb_mbcnt -= ext_size; + sb->sb_ccnt -= 1; + } + } + + /* + * NB: In sbcompress(), 'n' is the last mbuf in the + * socket buffer and 'm' is the new mbuf being copied + * into the trailing space of 'n'. Here, the roles + * are reversed and 'n' is the next mbuf after 'm' + * that is being copied into the trailing space of + * 'm'. + */ + n = m->m_next; + while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 && + M_WRITABLE(m) && + (m->m_flags & M_NOMAP) == 0 && + n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ + n->m_len <= M_TRAILINGSPACE(m) && + m->m_type == n->m_type) { + KASSERT(sb->sb_lastrecord != n, + ("%s: merging start of record (%p) into previous mbuf (%p)", + __func__, n, m)); + m_copydata(n, 0, n->m_len, mtodo(m, m->m_len)); + m->m_len += n->m_len; + m->m_next = n->m_next; + m->m_flags |= n->m_flags & M_EOR; + if (sb->sb_mbtail == n) + sb->sb_mbtail = m; + + sb->sb_mbcnt -= MSIZE; + sb->sb_mcnt -= 1; + if (n->m_flags & M_EXT) { + sb->sb_mbcnt -= n->m_ext.ext_size; + sb->sb_ccnt -= 1; + } + m_free(n); + n = m->m_next; + } + } + SBLASTRECORDCHK(sb); + SBLASTMBUFCHK(sb); +} + +/* + * Mark ready "count" units of I/O starting with "m". Most mbufs + * count as a single unit of I/O except for EXT_PGS-backed mbufs which + * can be backed by multiple pages. */ int -sbready(struct sockbuf *sb, struct mbuf *m, int count) +sbready(struct sockbuf *sb, struct mbuf *m0, int count) { + struct mbuf *m; u_int blocker; SOCKBUF_LOCK_ASSERT(sb); KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb)); + KASSERT(count > 0, ("%s: invalid count %d", __func__, count)); + m = m0; blocker = (sb->sb_fnrdy == m) ? 
M_BLOCKED : 0; - for (int i = 0; i < count; i++, m = m->m_next) { + while (count > 0) { KASSERT(m->m_flags & M_NOTREADY, ("%s: m %p !M_NOTREADY", __func__, m)); + if ((m->m_flags & M_EXT) != 0 && + m->m_ext.ext_type == EXT_PGS) { + if (count < m->m_ext.ext_pgs->nrdy) { + m->m_ext.ext_pgs->nrdy -= count; + count = 0; + break; + } + count -= m->m_ext.ext_pgs->nrdy; + m->m_ext.ext_pgs->nrdy = 0; + } else + count--; + m->m_flags &= ~(M_NOTREADY | blocker); if (blocker) sb->sb_acc += m->m_len; + m = m->m_next; } - if (!blocker) + /* + * If the first mbuf is still not fully ready because only + * some of its backing pages were readied, no further progress + * can be made. + */ + if (m0 == m) { + MPASS(m->m_flags & M_NOTREADY); return (EINPROGRESS); + } + + if (!blocker) { + sbready_compress(sb, m0, m); + return (EINPROGRESS); + } /* This one was blocking all the queue. */ for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) { @@ -121,6 +223,7 @@ } sb->sb_fnrdy = m; + sbready_compress(sb, m0, m); return (0); } @@ -1030,12 +1133,11 @@ M_WRITABLE(n) && ((sb->sb_flags & SB_NOCOALESCE) == 0) && !(m->m_flags & M_NOTREADY) && - !(n->m_flags & M_NOTREADY) && + !(n->m_flags & (M_NOTREADY | M_NOMAP)) && m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */ m->m_len <= M_TRAILINGSPACE(n) && n->m_type == m->m_type) { - bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, - (unsigned)m->m_len); + m_copydata(m, 0, m->m_len, mtodo(n, n->m_len)); n->m_len += m->m_len; sb->sb_ccc += m->m_len; if (sb->sb_fnrdy == NULL) @@ -1046,6 +1148,9 @@ m = m_free(m); continue; } + if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) && + (m->m_flags & M_NOTREADY) == 0) + (void)mb_unmapped_compress(m); if (n) n->m_next = m; else Index: sys/kern/uipc_socket.c =================================================================== --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -1044,7 +1044,7 @@ * * We used to do a lot of socket buffer and socket locking here, as * well as invoke sorflush() and perform wakeups. The direct call to - * dom_dispose() and sbrelease_internal() are an inlining of what was + * dom_dispose() and sbdestroy() are an inlining of what was * necessary from sorflush(). * * Notice that the socket buffer and kqueue state are torn down @@ -1982,7 +1982,11 @@ SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); SOCKBUF_UNLOCK(&so->so_rcv); - error = uiomove(mtod(m, char *) + moff, (int)len, uio); + if ((m->m_flags & M_NOMAP) != 0) + error = m_unmappedtouio(m, moff, uio, (int)len); + else + error = uiomove(mtod(m, char *) + moff, + (int)len, uio); SOCKBUF_LOCK(&so->so_rcv); if (error) { /* Index: sys/net/bpf.c =================================================================== --- sys/net/bpf.c +++ sys/net/bpf.c @@ -2369,6 +2369,7 @@ * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. 
*/ + mb.m_flags = 0; mb.m_next = m; mb.m_data = data; mb.m_len = dlen; Index: sys/net/bpf_buffer.c =================================================================== --- sys/net/bpf_buffer.c +++ sys/net/bpf_buffer.c @@ -119,19 +119,10 @@ { const struct mbuf *m; u_char *dst; - u_int count; m = (struct mbuf *)src; dst = (u_char *)buf + offset; - while (len > 0) { - if (m == NULL) - panic("bpf_mcopy"); - count = min(m->m_len, len); - bcopy(mtod(m, void *), dst, count); - m = m->m_next; - dst += count; - len -= count; - } + m_copydata(m, 0, len, dst); } /* Index: sys/net/if.h =================================================================== --- sys/net/if.h +++ sys/net/if.h @@ -246,6 +246,7 @@ #define IFCAP_HWSTATS 0x800000 /* manages counters internally */ #define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */ #define IFCAP_HWRXTSTMP 0x2000000 /* hardware rx timestamping */ +#define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */ #define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6) Index: sys/net/if_vlan.c =================================================================== --- sys/net/if_vlan.c +++ sys/net/if_vlan.c @@ -1731,6 +1731,16 @@ ena |= (mena & IFCAP_TXRTLMT); #endif + /* + * If the parent interface supports unmapped mbufs, so does + * the VLAN interface. Note that this should be fine even for + * interfaces that don't support hardware tagging as headers + * are prepended in normal mbufs to unmapped mbufs holding + * payload data. + */ + cap |= (p->if_capabilities & IFCAP_NOMAP); + ena |= (mena & IFCAP_NOMAP); + ifp->if_capabilities = cap; ifp->if_capenable = ena; ifp->if_hwassist = hwa; Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -35,13 +35,13 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" -#include "opt_ratelimit.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" +#include "opt_ratelimit.h" #include "opt_route.h" -#include "opt_sctp.h" #include "opt_rss.h" +#include "opt_sctp.h" #include #include @@ -283,6 +283,7 @@ #if defined(IPSEC) || defined(IPSEC_SUPPORT) int no_route_but_check_spd = 0; #endif + M_ASSERTPKTHDR(m); if (inp != NULL) { @@ -685,11 +686,30 @@ m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + IPSTAT_INC(ips_odropped); + error = ENOBUFS; + goto bad; + } in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + IPSTAT_INC(ips_odropped); + error = ENOBUFS; + goto bad; + } } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + IPSTAT_INC(ips_odropped); + error = ENOBUFS; + goto bad; + } sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } @@ -825,11 +845,23 @@ * fragmented packets, then do it here. 
*/ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { + m0 = mb_unmapped_to_ext(m0); + if (m0 == NULL) { + error = ENOBUFS; + IPSTAT_INC(ips_odropped); + goto done; + } in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { + m0 = mb_unmapped_to_ext(m0); + if (m0 == NULL) { + error = ENOBUFS; + IPSTAT_INC(ips_odropped); + goto done; + } sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } Index: sys/netinet/tcp_pcap.c =================================================================== --- sys/netinet/tcp_pcap.c +++ sys/netinet/tcp_pcap.c @@ -311,6 +311,7 @@ if (mhead->m_flags & M_EXT) { switch (mhead->m_ext.ext_type) { case EXT_SFBUF: + case EXT_PGS: /* Don't mess around with these. */ tcp_pcap_m_freem(mhead); continue; @@ -383,8 +384,11 @@ __func__, n->m_flags)); n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m); n->m_len = m->m_len; - bcopy(M_START(m), n->m_dat, - m->m_len + M_LEADINGSPACE_NOWRITE(m)); + if (m->m_flags & M_NOMAP) + m_copydata(m, 0, m->m_len, n->m_data); + else + bcopy(M_START(m), n->m_dat, + m->m_len + M_LEADINGSPACE_NOWRITE(m)); } else { /* Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -798,8 +798,12 @@ } } + if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { + *num_names = 0; + return (EINVAL); + } + refcount_init(&blk->tfb_refcnt, 0); - blk->tfb_flags = 0; blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -1190,8 +1190,7 @@ INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); - for (int i = 0; i < count; i++) - m = m_free(m); + mb_free_notready(m, count); return (ECONNRESET); } tp = intotcpcb(inp); Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -67,11 +67,11 @@ #include "opt_inet.h" #include "opt_inet6.h" -#include "opt_ratelimit.h" #include "opt_ipsec.h" -#include "opt_sctp.h" +#include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" +#include "opt_sctp.h" #include #include @@ -963,11 +963,30 @@ */ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; + m = mb_unmapped_to_ext(m); + if (m == NULL) { + error = ENOBUFS; + IP6STAT_INC(ip6s_odropped); + goto bad; + } in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); + } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + error = ENOBUFS; + IP6STAT_INC(ip6s_odropped); + goto bad; + } } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; + m = mb_unmapped_to_ext(m); + if (m == NULL) { + error = ENOBUFS; + IP6STAT_INC(ip6s_odropped); + goto bad; + } sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif @@ -1055,11 +1074,23 @@ * XXX-BZ handle the hw offloading case. Need flags. 
*/ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + in6_ifstat_inc(ifp, ifs6_out_fragfail); + error = ENOBUFS; + goto bad; + } in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { + m = mb_unmapped_to_ext(m); + if (m == NULL) { + in6_ifstat_inc(ifp, ifs6_out_fragfail); + error = ENOBUFS; + goto bad; + } sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } Index: sys/sys/mbuf.h =================================================================== --- sys/sys/mbuf.h +++ sys/sys/mbuf.h @@ -227,7 +227,15 @@ volatile u_int ext_count; volatile u_int *ext_cnt; }; - char *ext_buf; /* start of buffer */ + union { + /* + * If ext_type == EXT_PGS, 'ext_pgs' points to a + * structure describing the buffer. Otherwise, + * 'ext_buf' points to the start of the buffer. + */ + struct mbuf_ext_pgs *ext_pgs; + char *ext_buf; + }; uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ @@ -293,6 +301,92 @@ }; }; +struct socket; + +/* + * TLS records for TLS 1.0-1.2 can have the following header lengths: + * - 5 (AES-CBC with implicit IV) + * - 21 (AES-CBC with explicit IV) + * - 13 (AES-GCM with 8 byte explicit IV) + */ +#define MBUF_PEXT_HDR_LEN 24 + +/* + * TLS records for TLS 1.0-1.2 can have the following maximum trailer + * lengths: + * - 16 (AES-GCM) + * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding) + * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding) + * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding) + */ +#define MBUF_PEXT_TRAIL_LEN 64 + +#ifdef __LP64__ +#define MBUF_PEXT_MAX_PGS (152 / sizeof(vm_paddr_t)) +#else +#define MBUF_PEXT_MAX_PGS (156 / sizeof(vm_paddr_t)) +#endif + +#define MBUF_PEXT_MAX_BYTES \ + (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN) + +/* + * This struct is 256 bytes in size and is arranged so that the most + * common case (accessing the first 4 pages of a 16KB TLS record) will + * fit in a single 64 byte cacheline. + */ +struct mbuf_ext_pgs { + uint8_t npgs; /* Number of attached pages */ + uint8_t nrdy; /* Pages with I/O pending */ + uint8_t hdr_len; /* TLS header length */ + uint8_t trail_len; /* TLS trailer length */ + uint16_t first_pg_off; /* Offset into 1st page */ + uint16_t last_pg_len; /* Length of last page */ + vm_paddr_t pa[MBUF_PEXT_MAX_PGS]; /* phys addrs of pages */ + char hdr[MBUF_PEXT_HDR_LEN]; /* TLS header */ + void *tls; /* TLS session */ +#if defined(__i386__) || \ + (defined(__powerpc__) && !defined(__powerpc64__) && defined(BOOKE)) + /* + * i386 and Book-E PowerPC have 64-bit vm_paddr_t, so there is + * a 4 byte remainder from the space allocated for pa[]. 
+ */ + uint32_t pad; +#endif + union { + char trail[MBUF_PEXT_TRAIL_LEN]; /* TLS trailer */ + struct { + struct socket *so; + void *mbuf; + uint64_t seqno; + STAILQ_ENTRY(mbuf_ext_pgs) stailq; + }; + }; +}; + +#ifdef _KERNEL +static inline int +mbuf_ext_pg_len(struct mbuf_ext_pgs *ext_pgs, int pidx, int pgoff) +{ + KASSERT(pgoff == 0 || pidx == 0, + ("page %d with non-zero offset %d in %p", pidx, pgoff, ext_pgs)); + if (pidx == ext_pgs->npgs - 1) { + return (ext_pgs->last_pg_len); + } else { + return (PAGE_SIZE - pgoff); + } +} + +#ifdef INVARIANT_SUPPORT +void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs); +#endif +#ifdef INVARIANTS +#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) mb_ext_pgs_check((ext_pgs)) +#else +#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) +#endif +#endif + /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped @@ -307,7 +401,7 @@ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ -#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */ +#define M_NOMAP 0x00000100 /* mbuf data is unmapped */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically @@ -348,7 +442,7 @@ */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ - "\7M_PROMISC\10M_VLANTAG\13M_TSTMP\14M_TSTMP_HPREC" + "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC" #define M_FLAG_PROTOBITS \ "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ @@ -420,6 +514,7 @@ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference */ #define EXT_RXRING 8 /* data in NIC receive ring */ +#define EXT_PGS 9 /* array of unmapped pages */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ @@ -464,6 +559,11 @@ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" +#define MBUF_EXT_PGS_ASSERT(m) \ + KASSERT((((m)->m_flags & M_EXT) != 0) && \ + ((m)->m_ext.ext_type == EXT_PGS), \ + ("%s: m %p !M_EXT or !EXT_PGS", __func__, m)) + /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. 
It is split into @@ -566,6 +666,7 @@ #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" +#define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs" #ifdef _KERNEL @@ -590,9 +691,15 @@ extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; +extern uma_zone_t zone_extpgs; void mb_dupcl(struct mbuf *, struct mbuf *); void mb_free_ext(struct mbuf *); +void mb_free_mext_pgs(struct mbuf *); +struct mbuf *mb_alloc_ext_pgs(int, bool, m_ext_free_t); +int mb_unmapped_compress(struct mbuf *m); +struct mbuf *mb_unmapped_to_ext(struct mbuf *m); +void mb_free_notready(struct mbuf *m, int count); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); @@ -627,6 +734,7 @@ struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, const struct mbuf *, int); +int m_unmappedtouio(const struct mbuf *, int, struct uio *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); int m_pkthdr_init(struct mbuf *, int); struct mbuf *m_prepend(struct mbuf *, int, int); @@ -881,7 +989,7 @@ * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). */ -#define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \ +#define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 && \ (!(((m)->m_flags & M_EXT)) || \ (m_extrefcnt(m) == 1))) @@ -904,7 +1012,8 @@ * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ - (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ + (((m)->m_flags & M_NOMAP) ? NULL : \ + ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) @@ -1020,6 +1129,7 @@ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ +extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */ /*- * Network packets may have annotations attached by affixing a list of Index: sys/sys/sglist.h =================================================================== --- sys/sys/sglist.h +++ sys/sys/sglist.h @@ -57,6 +57,7 @@ struct bio; struct mbuf; +struct mbuf_ext_pgs; struct uio; static __inline void @@ -87,6 +88,9 @@ struct sglist *sglist_alloc(int nsegs, int mflags); int sglist_append(struct sglist *sg, void *buf, size_t len); int sglist_append_bio(struct sglist *sg, struct bio *bp); +int sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs, + size_t off, size_t len); +int sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m); int sglist_append_mbuf(struct sglist *sg, struct mbuf *m0); int sglist_append_phys(struct sglist *sg, vm_paddr_t paddr, size_t len); @@ -101,6 +105,9 @@ struct sglist *sglist_clone(struct sglist *sg, int mflags); int sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid); int sglist_count(void *buf, size_t len); +int sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, + size_t len); +int sglist_count_mb_ext_pgs(struct mbuf *m); int sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len); void sglist_free(struct sglist *sg); int sglist_join(struct sglist *first, struct sglist *second);
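
Note on the output-path changes above: the pattern added to ip_output() and ip6_output() generalizes to any path that may hand a chain to code which reads the payload through mtod(), such as a software checksum routine. If the outgoing interface does not advertise IFCAP_NOMAP, or the stack itself must touch the data, the chain has to be converted back to mapped external buffers first; on failure the callers in this diff simply drop the packet and bump ips_odropped / ip6s_odropped. The helper below is a minimal illustrative sketch, not part of the diff; only mb_unmapped_to_ext() and IFCAP_NOMAP come from this patch, the helper name and its 'need_sw_cksum' argument are hypothetical.

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>

/*
 * Illustrative sketch only; not part of the diff.  Make a chain headed
 * for 'ifp' safe to transmit: a driver that sets IFCAP_NOMAP can take
 * unmapped mbufs directly, otherwise (or when a software checksum must
 * be computed) convert the chain to mapped external buffers.
 */
static struct mbuf *
chain_for_ifp(struct mbuf *m, struct ifnet *ifp, int need_sw_cksum)
{

	if (need_sw_cksum || (ifp->if_capenable & IFCAP_NOMAP) == 0)
		m = mb_unmapped_to_ext(m);
	return (m);	/* NULL means the packet should be dropped. */
}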
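
Note on struct mbuf_ext_pgs: the layout packs a TLS header, an array of physical page addresses and a TLS trailer into one 256-byte buffer. On LP64 platforms MBUF_PEXT_MAX_PGS works out to 152 / 8 = 19 pages, so with 4 KB pages MBUF_PEXT_MAX_BYTES is 19 * 4096 + 24 + 64 = 77,912 bytes per unmapped mbuf. The sketch below is a hypothetical helper (not part of the diff) showing how the fields combine using the mbuf_ext_pg_len() accessor added above: only the first page may start at first_pg_off, and the last page holds last_pg_len bytes.

#include <sys/param.h>
#include <sys/mbuf.h>

/*
 * Hypothetical helper, not part of the diff: total payload bytes
 * described by one EXT_PGS buffer (TLS header + pages + TLS trailer).
 */
static int
ext_pgs_total_len(struct mbuf_ext_pgs *ext_pgs)
{
	int i, len, pgoff;

	len = ext_pgs->hdr_len + ext_pgs->trail_len;
	pgoff = ext_pgs->first_pg_off;
	for (i = 0; i < ext_pgs->npgs; i++) {
		/* Only the first page may start mid-page. */
		len += mbuf_ext_pg_len(ext_pgs, i, pgoff);
		pgoff = 0;
	}
	return (len);
}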
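
Note on the new sglist entry points: sglist_count_mb_ext_pgs() and sglist_append_mb_ext_pgs() give drivers that set IFCAP_NOMAP a way to build DMA scatter/gather lists without ever mapping the pages. A transmit path might use them roughly as in the sketch below, which assumes the driver walks the chain one mbuf at a time; the helper name is made up, while M_NOMAP, sglist_append() and sglist_append_mb_ext_pgs() are from this diff.

#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/sglist.h>

/*
 * Sketch of a hypothetical driver helper (not part of the diff):
 * translate an outbound chain, possibly containing EXT_PGS mbufs,
 * into scatter/gather entries for DMA.
 */
static int
txq_load_mbuf_chain(struct sglist *sg, struct mbuf *m0)
{
	struct mbuf *m;
	int error;

	for (m = m0; m != NULL; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		if (m->m_flags & M_NOMAP) {
			/* Adds entries for this single unmapped mbuf. */
			error = sglist_append_mb_ext_pgs(sg, m);
		} else {
			error = sglist_append(sg, mtod(m, void *),
			    m->m_len);
		}
		if (error != 0)
			return (error);
	}
	return (0);
}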