Page MenuHomeFreeBSD

D20616.id58545.diff
No OneTemporary

D20616.id58545.diff

Index: sbin/ifconfig/ifconfig.8
===================================================================
--- sbin/ifconfig/ifconfig.8
+++ sbin/ifconfig/ifconfig.8
@@ -538,6 +538,12 @@
If the driver supports
.Xr tcp 4
large receive offloading, disable LRO on the interface.
+.It Cm nomap
+If the driver supports unmapped network buffers,
+enable them on the interface.
+.It Fl nomap
+If the driver supports unmapped network buffers,
+disable them on the interface.
.It Cm wol , wol_ucast , wol_mcast , wol_magic
Enable Wake On Lan (WOL) support, if available.
WOL is a facility whereby a machine in a low power state may be woken
Index: sbin/ifconfig/ifconfig.c
===================================================================
--- sbin/ifconfig/ifconfig.c
+++ sbin/ifconfig/ifconfig.c
@@ -1257,7 +1257,7 @@
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP"
/*
* Print the status of the interface. If an address family was
@@ -1557,6 +1557,8 @@
DEF_CMD("-link2", -IFF_LINK2, setifflags),
DEF_CMD("monitor", IFF_MONITOR, setifflags),
DEF_CMD("-monitor", -IFF_MONITOR, setifflags),
+ DEF_CMD("nomap", IFCAP_NOMAP, setifcap),
+ DEF_CMD("-nomap", -IFCAP_NOMAP, setifcap),
DEF_CMD("staticarp", IFF_STATICARP, setifflags),
DEF_CMD("-staticarp", -IFF_STATICARP, setifflags),
DEF_CMD("rxcsum6", IFCAP_RXCSUM_IPV6, setifcap),
Index: share/man/man9/Makefile
===================================================================
--- share/man/man9/Makefile
+++ share/man/man9/Makefile
@@ -1824,6 +1824,8 @@
MLINKS+=sglist.9 sglist_alloc.9 \
sglist.9 sglist_append.9 \
sglist.9 sglist_append_bio.9 \
+ sglist.9 sglist_append_ext_pgs.9 \
+ sglist.9 sglist_append_mb_ext_pgs.9 \
sglist.9 sglist_append_mbuf.9 \
sglist.9 sglist_append_phys.9 \
sglist.9 sglist_append_sglist.9 \
@@ -1834,6 +1836,8 @@
sglist.9 sglist_clone.9 \
sglist.9 sglist_consume_uio.9 \
sglist.9 sglist_count.9 \
+ sglist.9 sglist_count_ext_pgs.9 \
+ sglist.9 sglist_count_mb_ext_pgs.9 \
sglist.9 sglist_count_vmpages.9 \
sglist.9 sglist_free.9 \
sglist.9 sglist_hold.9 \
Index: share/man/man9/mbuf.9
===================================================================
--- share/man/man9/mbuf.9
+++ share/man/man9/mbuf.9
@@ -254,6 +254,8 @@
#define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
+#define EXT_RXRING 8 /* data in NIC receive ring */
+#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */
#define EXT_MOD_TYPE 253 /* custom module's ext_buf type */
#define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */
Index: share/man/man9/sglist.9
===================================================================
--- share/man/man9/sglist.9
+++ share/man/man9/sglist.9
@@ -34,6 +34,8 @@
.Nm sglist_alloc ,
.Nm sglist_append ,
.Nm sglist_append_bio ,
+.Nm sglist_append_ext_pgs ,
+.Nm sglist_append_mb_ext_pgs ,
.Nm sglist_append_mbuf ,
.Nm sglist_append_phys ,
.Nm sglist_append_sglist ,
@@ -44,6 +46,8 @@
.Nm sglist_clone ,
.Nm sglist_consume_uio ,
.Nm sglist_count ,
+.Nm sglist_count_ext_pgs ,
+.Nm sglist_count_mb_ext_pgs ,
.Nm sglist_count_vmpages ,
.Nm sglist_free ,
.Nm sglist_hold ,
@@ -64,6 +68,10 @@
.Ft int
.Fn sglist_append_bio "struct sglist *sg" "struct bio *bp"
.Ft int
+.Fn sglist_append_ext_pgs "struct sglist *sg" "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
+.Ft int
+.Fn sglist_append_mb_ext_pgs "struct sglist *sg" "struct mbuf *m"
+.Ft int
.Fn sglist_append_mbuf "struct sglist *sg" "struct mbuf *m"
.Ft int
.Fn sglist_append_phys "struct sglist *sg" "vm_paddr_t paddr" "size_t len"
@@ -84,6 +92,10 @@
.Ft int
.Fn sglist_count "void *buf" "size_t len"
.Ft int
+.Fn sglist_count_ext_pgs "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
+.Ft int
+.Fn sglist_count_mb_ext_pgs "struct mbuf *m"
+.Ft int
.Fn sglist_count_vmpages "vm_page_t *m" "size_t pgoff" "size_t len"
.Ft void
.Fn sglist_free "struct sglist *sg"
@@ -146,6 +158,22 @@
bytes long.
.Pp
The
+.Nm sglist_count_ext_pgs
+function returns the number of scatter/gather list elements needed to describe
+the unmapped external mbuf buffer
+.Fa ext_pgs .
+The ranges start at an offset of
+.Fa offset
+relative to the start of the buffer and is
+.Fa len
+bytes long.
+The
+.Nm sglist_count_mb_ext_pgs
+function returns the number of scatter/gather list elements needed to describe
+the physical address ranges of a single unmapped mbuf
+.Fa m .
+.Pp
+The
.Nm sglist_count_vmpages
function returns the number of scatter/gather list elements needed to describe
the physical address ranges of a buffer backed by an array of virtual memory
@@ -237,6 +265,34 @@
.Fa sg .
.Pp
The
+.Nm sglist_append_ext_pgs
+function appends the physical address ranges described by the unmapped
+external mbuf buffer
+.Fa ext_pgs
+to the scatter/gather list
+.Fa sg .
+The physical address ranges start at offset
+.Fa offset
+within
+.Fa ext_pgs
+and continue for
+.Fa len
+bytes.
+.Pp
+The
+.Nm sglist_append_mb_ext_pgs
+function appends the physical address ranges described by the unmapped
+mbuf
+.Fa m
+to the scatter/gather list
+.Fa sg .
+Note that unlike
+.Nm sglist_append_mbuf ,
+.Nm sglist_append_mb_ext_pgs
+only adds ranges for a single mbuf,
+not an entire mbuf chain.
+.Pp
+The
.Nm sglist_append_mbuf
function appends the physical address ranges described by an entire mbuf
chain
@@ -467,8 +523,7 @@
.Pp
The
.Nm sglist_count
-and
-.Nm sglist_count_vmpages
+family of
functions return a count of scatter/gather list elements.
.Pp
The
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4268,7 +4268,8 @@
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
-netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
+netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
+ compile-with "${NORMAL_C} ${NO_WNONNULL}"
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
netinet/tcp_subr.c optional inet | inet6
Index: sys/conf/kern.mk
===================================================================
--- sys/conf/kern.mk
+++ sys/conf/kern.mk
@@ -76,6 +76,7 @@
# GCC 4.2 doesn't have -Wno-error=cast-qual, so just disable the warning for
# the few files that are already known to generate cast-qual warnings.
NO_WCAST_QUAL= -Wno-cast-qual
+NO_WNONNULL= -Wno-nonnull
.endif
.endif
Index: sys/dev/cxgbe/t4_main.c
===================================================================
--- sys/dev/cxgbe/t4_main.c
+++ sys/dev/cxgbe/t4_main.c
@@ -1623,7 +1623,7 @@
#define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \
IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \
IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \
- IFCAP_HWRXTSTMP)
+ IFCAP_HWRXTSTMP | IFCAP_NOMAP)
#define T4_CAP_ENABLE (T4_CAP)
static int
@@ -1986,6 +1986,8 @@
rxq->iq.flags &= ~IQ_RX_TIMESTAMP;
}
}
+ if (mask & IFCAP_NOMAP)
+ ifp->if_capenable ^= IFCAP_NOMAP;
#ifdef VLAN_CAPABILITIES
VLAN_CAPABILITIES(ifp);
Index: sys/dev/cxgbe/t4_sge.c
===================================================================
--- sys/dev/cxgbe/t4_sge.c
+++ sys/dev/cxgbe/t4_sge.c
@@ -83,6 +83,7 @@
#endif
/* Internal mbuf flags stored in PH_loc.eight[1]. */
+#define MC_NOMAP 0x01
#define MC_RAW_WR 0x02
/*
@@ -2434,15 +2435,78 @@
return ((void *)p);
}
+static inline int
+count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_paddr_t paddr;
+ int i, len, off, pglen, pgoff, seglen, segoff;
+ int nsegs = 0;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ off = mtod(m, vm_offset_t);
+ len = m->m_len;
+ off += skip;
+ len -= skip;
+
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ paddr = pmap_kextract(
+ (vm_offset_t)&ext_pgs->hdr[segoff]);
+ if (*nextaddr != paddr)
+ nsegs++;
+ *nextaddr = paddr + seglen;
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+ paddr = ext_pgs->pa[i] + segoff;
+ if (*nextaddr != paddr)
+ nsegs++;
+ *nextaddr = paddr + seglen;
+ pgoff = 0;
+	}
+ if (len != 0) {
+ seglen = min(len, ext_pgs->trail_len - off);
+ len -= seglen;
+ paddr = pmap_kextract((vm_offset_t)&ext_pgs->trail[off]);
+ if (*nextaddr != paddr)
+ nsegs++;
+ *nextaddr = paddr + seglen;
+ }
+
+ return (nsegs);
+}
+
+
/*
* Can deal with empty mbufs in the chain that have m_len = 0, but the chain
* must have at least one mbuf that's not empty. It is possible for this
* routine to return 0 if skip accounts for all the contents of the mbuf chain.
*/
static inline int
-count_mbuf_nsegs(struct mbuf *m, int skip)
+count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags)
{
- vm_paddr_t lastb, next;
+ vm_paddr_t nextaddr, paddr;
vm_offset_t va;
int len, nsegs;
@@ -2451,9 +2515,8 @@
MPASS(m->m_pkthdr.len >= skip);
nsegs = 0;
- lastb = 0;
+ nextaddr = 0;
for (; m; m = m->m_next) {
-
len = m->m_len;
if (__predict_false(len == 0))
continue;
@@ -2461,14 +2524,20 @@
skip -= len;
continue;
}
+ if ((m->m_flags & M_NOMAP) != 0) {
+ *cflags |= MC_NOMAP;
+ nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr);
+ skip = 0;
+ continue;
+ }
va = mtod(m, vm_offset_t) + skip;
len -= skip;
skip = 0;
- next = pmap_kextract(va);
+ paddr = pmap_kextract(va);
nsegs += sglist_count((void *)(uintptr_t)va, len);
- if (lastb + 1 == next)
+ if (paddr == nextaddr)
nsegs--;
- lastb = pmap_kextract(va + len - 1);
+ nextaddr = pmap_kextract(va + len - 1) + 1;
}
return (nsegs);
@@ -2490,7 +2559,9 @@
struct tcphdr *tcp;
#endif
uint16_t eh_type;
+ uint8_t cflags;
+ cflags = 0;
M_ASSERTPKTHDR(m0);
if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
rc = EINVAL;
@@ -2506,7 +2577,7 @@
*/
M_ASSERTPKTHDR(m0);
MPASS(m0->m_pkthdr.len > 0);
- nsegs = count_mbuf_nsegs(m0, 0);
+ nsegs = count_mbuf_nsegs(m0, 0, &cflags);
if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
rc = EFBIG;
@@ -2516,7 +2587,8 @@
goto restart;
}
- if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
+ if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN &&
+ !(cflags & MC_NOMAP))) {
m0 = m_pullup(m0, m0->m_pkthdr.len);
if (m0 == NULL) {
/* Should have left well enough alone. */
@@ -2527,7 +2599,7 @@
goto restart;
}
set_mbuf_nsegs(m0, nsegs);
- set_mbuf_cflags(m0, 0);
+ set_mbuf_cflags(m0, cflags);
if (sc->flags & IS_VF)
set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
else
@@ -2616,7 +2688,9 @@
/* EO WRs have the headers in the WR and not the GL. */
immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
m0->m_pkthdr.l4hlen;
- nsegs = count_mbuf_nsegs(m0, immhdrs);
+ cflags = 0;
+ nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags);
+ MPASS(cflags == mbuf_cflags(m0));
set_mbuf_eo_nsegs(m0, nsegs);
set_mbuf_eo_len16(m0,
txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
@@ -4723,7 +4797,8 @@
ctrl = sizeof(struct cpl_tx_pkt_core);
if (needs_tso(m0))
ctrl += sizeof(struct cpl_tx_pkt_lso_core);
- else if (pktlen <= imm_payload(2) && available >= 2) {
+ else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
+ available >= 2) {
/* Immediate data. Recalculate len16 and set nsegs to 0. */
ctrl += pktlen;
len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
Index: sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- sys/dev/cxgbe/tom/t4_cpl_io.c
+++ sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -666,6 +666,8 @@
if (IS_AIOTX_MBUF(m))
rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
aiotx_mbuf_pgoff(m), m->m_len);
+ else if (m->m_flags & M_NOMAP)
+ rc = sglist_append_mb_ext_pgs(&sg, m);
else
rc = sglist_append(&sg, mtod(m, void *), m->m_len);
if (__predict_false(rc != 0))
@@ -787,6 +789,8 @@
if (IS_AIOTX_MBUF(m))
n = sglist_count_vmpages(aiotx_mbuf_pages(m),
aiotx_mbuf_pgoff(m), m->m_len);
+ else if (m->m_flags & M_NOMAP)
+ n = sglist_count_mb_ext_pgs(m);
else
n = sglist_count(mtod(m, void *), m->m_len);
Index: sys/dev/mlx5/mlx5_en/mlx5_en_main.c
===================================================================
--- sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -3279,6 +3279,8 @@
"tso6 disabled due to -txcsum6.\n");
}
}
+ if (mask & IFCAP_NOMAP)
+ ifp->if_capenable ^= IFCAP_NOMAP;
if (mask & IFCAP_RXCSUM)
ifp->if_capenable ^= IFCAP_RXCSUM;
if (mask & IFCAP_RXCSUM_IPV6)
@@ -4145,6 +4147,7 @@
ifp->if_capabilities |= IFCAP_LRO;
ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
+ ifp->if_capabilities |= IFCAP_NOMAP;
ifp->if_capabilities |= IFCAP_TXRTLMT;
ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc;
ifp->if_snd_tag_free = mlx5e_snd_tag_free;
Index: sys/kern/kern_mbuf.c
===================================================================
--- sys/kern/kern_mbuf.c
+++ sys/kern/kern_mbuf.c
@@ -45,6 +45,7 @@
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
+#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@@ -111,6 +112,11 @@
int nmbjumbo9; /* limits number of 9k jumbo clusters */
int nmbjumbo16; /* limits number of 16k jumbo clusters */
+bool mb_use_ext_pgs; /* use EXT_PGS mbufs for sendfile */
+SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN,
+ &mb_use_ext_pgs, 0,
+ "Use unmapped mbufs for sendfile(2)");
+
static quad_t maxmbufmem; /* overall real memory limit for all mbufs */
SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
@@ -281,6 +287,7 @@
uma_zone_t zone_jumbop;
uma_zone_t zone_jumbo9;
uma_zone_t zone_jumbo16;
+uma_zone_t zone_extpgs;
/*
* Local prototypes.
@@ -298,6 +305,9 @@
/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
+_Static_assert(sizeof(struct mbuf_ext_pgs) == 256,
+ "mbuf_ext_pgs size mismatch");
+
/*
* Initialize FreeBSD Network buffer allocation.
*/
@@ -379,6 +389,15 @@
uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
+ zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME,
+ sizeof(struct mbuf_ext_pgs),
+#ifdef INVARIANTS
+ trash_ctor, trash_dtor, trash_init, trash_fini,
+#else
+ NULL, NULL, NULL, NULL,
+#endif
+ UMA_ALIGN_CACHE, 0);
+
/*
* Hook event handler for low-memory situation, used to
* drain protocols and push data back to the caches (UMA
@@ -823,6 +842,391 @@
(*pr->pr_drain)();
}
+/*
+ * Free "count" units of I/O from an mbuf chain. They could be held
+ * in EXT_PGS or just as a normal mbuf. This code is intended to be
+ * called in an error path (I/O error, closed connection, etc).
+ */
+void
+mb_free_notready(struct mbuf *m, int count)
+{
+ int i;
+
+ for (i = 0; i < count && m != NULL; i++) {
+ if ((m->m_flags & M_EXT) != 0 &&
+ m->m_ext.ext_type == EXT_PGS) {
+ m->m_ext.ext_pgs->nrdy--;
+ if (m->m_ext.ext_pgs->nrdy != 0)
+ continue;
+ }
+ m = m_free(m);
+ }
+ KASSERT(i == count, ("Removed only %d items from %p", i, m));
+}
+
+/*
+ * Ensure it is possible to downgrade an EXT_PGS mbuf
+ * to a normal mbuf.
+ *
+ * XXXJHB: I think this is no longer needed? The callers of
+ * mb_unmapped_compress all check the length against MLEN, and
+ * mb_unmapped_compress allows data to be stored in unmapped pages.
+ */
+CTASSERT(MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN < MLEN);
+
+/*
+ * Compress an unmapped mbuf into a simple mbuf when it holds a small
+ * amount of data. This is used as a DOS defense to avoid having
+ * small packets tie up wired pages, an ext_pgs structure, and an
+ * mbuf. Since this converts the existing mbuf in place, it can only
+ * be used if there are no other references to 'm'.
+ */
+int
+mb_unmapped_compress(struct mbuf *m)
+{
+ volatile u_int *refcnt;
+ struct mbuf m_temp;
+
+ /*
+ * Assert that 'm' does not have a packet header. If 'm' had
+ * a packet header, it would only be able to hold MHLEN bytes
+ * and m_data would have to be initialized differently.
+ */
+ KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
+ m->m_ext.ext_type == EXT_PGS,
+ ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
+ KASSERT(m->m_data == 0, ("m_data != 0 %p", m));
+ KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
+
+ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+ refcnt = &m->m_ext.ext_count;
+ } else {
+ KASSERT(m->m_ext.ext_cnt != NULL,
+ ("%s: no refcounting pointer on %p", __func__, m));
+ refcnt = m->m_ext.ext_cnt;
+ }
+
+ if (*refcnt != 1)
+ return (EBUSY);
+
+ /*
+ * Copy m_ext portion of 'm' to 'm_temp' to create a "fake"
+ * EXT_PGS mbuf that can be used with m_copydata() as well as
+ * the ext_free callback.
+ */
+ memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext));
+ m_temp.m_next = NULL;
+ m_temp.m_nextpkt = NULL;
+
+ /* Turn 'm' into a "normal" mbuf. */
+ m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
+ m->m_data = m->m_dat;
+
+ /* Copy data from template's ext_pgs. */
+ m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t));
+
+ /* Free the backing pages. */
+ m_temp.m_ext.ext_free(&m_temp);
+
+ /* Finally, free the ext_pgs struct. */
+ uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs);
+ return (0);
+}
+
+/*
+ * These next few routines are used to permit downgrading an unmapped
+ * mbuf to a chain of mapped mbufs. This is used when an interface
+ * doesn't supported unmapped mbufs or if checksums need to be
+ * computed in software.
+ *
+ * Each unmapped mbuf is converted to a chain of mbufs. First, any
+ * TLS header data is stored in a regular mbuf. Second, each page of
+ * unmapped data is stored in an mbuf with an EXT_SFBUF external
+ * cluster. These mbufs use an sf_buf to provide a valid KVA for the
+ * associated physical page. They also hold a reference on the
+ * original EXT_PGS mbuf to ensure the physical page doesn't go away.
+ * Finally, any TLS trailer data is stored in a regular mbuf.
+ *
+ * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
+ * mbufs. It frees the associated sf_buf and releases its reference
+ * on the original EXT_PGS mbuf.
+ *
+ * _mb_unmapped_to_ext() is a helper function that converts a single
+ * unmapped mbuf into a chain of mbufs.
+ *
+ * mb_unmapped_to_ext() is the public function that walks an mbuf
+ * chain converting any unmapped mbufs to mapped mbufs. It returns
+ * the new chain of unmapped mbufs on success. On failure it frees
+ * the original mbuf chain and returns NULL.
+ */
+static void
+mb_unmapped_free_mext(struct mbuf *m)
+{
+ struct sf_buf *sf;
+ struct mbuf *old_m;
+
+ sf = m->m_ext.ext_arg1;
+ sf_buf_free(sf);
+
+ /* Drop the reference on the backing EXT_PGS mbuf. */
+ old_m = m->m_ext.ext_arg2;
+ mb_free_ext(old_m);
+}
+
+static struct mbuf *
+_mb_unmapped_to_ext(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ struct mbuf *m_new, *top, *prev, *mref;
+ struct sf_buf *sf;
+ vm_page_t pg;
+ int i, len, off, pglen, pgoff, seglen, segoff;
+ volatile u_int *refcnt;
+ u_int ref_inc = 0;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ len = m->m_len;
+ KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p",
+ __func__, m));
+
+ /* See if this is the mbuf that holds the embedded refcount. */
+ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+ refcnt = &m->m_ext.ext_count;
+ mref = m;
+ } else {
+ KASSERT(m->m_ext.ext_cnt != NULL,
+ ("%s: no refcounting pointer on %p", __func__, m));
+ refcnt = m->m_ext.ext_cnt;
+ mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
+ }
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ top = NULL;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ m_new->m_len = seglen;
+ prev = top = m_new;
+ memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff],
+ seglen);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ if (top == NULL) {
+ top = prev = m_new;
+ } else {
+ prev->m_next = m_new;
+ prev = m_new;
+ }
+ sf = sf_buf_alloc(pg, SFB_NOWAIT);
+ if (sf == NULL)
+ goto fail;
+
+ ref_inc++;
+ m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
+ mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
+ m_new->m_data += segoff;
+ m_new->m_len = seglen;
+
+ pgoff = 0;
+	}
+ if (len != 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d)", off, len,
+ ext_pgs->trail_len));
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ if (top == NULL)
+ top = m_new;
+ else
+ prev->m_next = m_new;
+ m_new->m_len = len;
+ memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len);
+ }
+
+ if (ref_inc != 0) {
+ /*
+ * Obtain an additional reference on the old mbuf for
+ * each created EXT_SFBUF mbuf. They will be dropped
+ * in mb_unmapped_free_mext().
+ */
+ if (*refcnt == 1)
+ *refcnt += ref_inc;
+ else
+ atomic_add_int(refcnt, ref_inc);
+ }
+ m_free(m);
+ return (top);
+
+fail:
+ if (ref_inc != 0) {
+ /*
+ * Obtain an additional reference on the old mbuf for
+ * each created EXT_SFBUF mbuf. They will be
+ * immediately dropped when these mbufs are freed
+ * below.
+ */
+ if (*refcnt == 1)
+ *refcnt += ref_inc;
+ else
+ atomic_add_int(refcnt, ref_inc);
+ }
+ m_free(m);
+ m_freem(top);
+ return (NULL);
+}
+
+struct mbuf *
+mb_unmapped_to_ext(struct mbuf *top)
+{
+ struct mbuf *m, *next, *prev = NULL;
+
+ prev = NULL;
+ for (m = top; m != NULL; m = next) {
+ /* m might be freed, so cache the next pointer. */
+ next = m->m_next;
+ if (m->m_flags & M_NOMAP) {
+ if (prev != NULL) {
+ /*
+ * Remove 'm' from the new chain so
+ * that the 'top' chain terminates
+ * before 'm' in case 'top' is freed
+ * due to an error.
+ */
+ prev->m_next = NULL;
+ }
+ m = _mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ m_freem(top);
+ m_freem(next);
+ return (NULL);
+ }
+ if (prev == NULL) {
+ top = m;
+ } else {
+ prev->m_next = m;
+ }
+
+ /*
+ * Replaced one mbuf with a chain, so we must
+ * find the end of chain.
+ */
+ prev = m_last(m);
+ } else {
+ if (prev != NULL) {
+ prev->m_next = m;
+ }
+ prev = m;
+ }
+ }
+ return (top);
+}
+
+/*
+ * Allocate an empty EXT_PGS mbuf. The ext_free routine is
+ * responsible for freeing any pages backing this mbuf when it is
+ * freed.
+ */
+struct mbuf *
+mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free)
+{
+ struct mbuf *m;
+ struct mbuf_ext_pgs *ext_pgs;
+
+ if (pkthdr)
+ m = m_gethdr(how, MT_DATA);
+ else
+ m = m_get(how, MT_DATA);
+ if (m == NULL)
+ return (NULL);
+
+ ext_pgs = uma_zalloc(zone_extpgs, how);
+ if (ext_pgs == NULL) {
+ m_free(m);
+ return (NULL);
+ }
+ ext_pgs->npgs = 0;
+ ext_pgs->nrdy = 0;
+ ext_pgs->first_pg_off = 0;
+ ext_pgs->last_pg_len = 0;
+ ext_pgs->hdr_len = 0;
+ ext_pgs->trail_len = 0;
+ ext_pgs->tls = NULL;
+ ext_pgs->so = NULL;
+ m->m_data = NULL;
+ m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP);
+ m->m_ext.ext_type = EXT_PGS;
+ m->m_ext.ext_flags = EXT_FLAG_EMBREF;
+ m->m_ext.ext_count = 1;
+ m->m_ext.ext_pgs = ext_pgs;
+ m->m_ext.ext_size = 0;
+ m->m_ext.ext_free = ext_free;
+ return (m);
+}
+
+#ifdef INVARIANT_SUPPORT
+void
+mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs)
+{
+
+ /*
+ * NB: This expects a non-empty buffer (npgs > 0 and
+ * last_pg_len > 0).
+ */
+ KASSERT(ext_pgs->npgs > 0,
+ ("ext_pgs with no valid pages: %p", ext_pgs));
+ KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa),
+ ("ext_pgs with too many pages: %p", ext_pgs));
+ KASSERT(ext_pgs->nrdy <= ext_pgs->npgs,
+ ("ext_pgs with too many ready pages: %p", ext_pgs));
+ KASSERT(ext_pgs->first_pg_off < PAGE_SIZE,
+ ("ext_pgs with too large page offset: %p", ext_pgs));
+ KASSERT(ext_pgs->last_pg_len > 0,
+ ("ext_pgs with zero last page length: %p", ext_pgs));
+ KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE,
+ ("ext_pgs with too large last page length: %p", ext_pgs));
+ if (ext_pgs->npgs == 1) {
+ KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <=
+ PAGE_SIZE, ("ext_pgs with single page too large: %p",
+ ext_pgs));
+ }
+ KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr),
+ ("ext_pgs with too large header length: %p", ext_pgs));
+ KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail),
+	    ("ext_pgs with too large trailer length: %p", ext_pgs));
+}
+#endif
+
/*
* Clean up after mbufs with M_EXT storage attached to them if the
* reference count hits 1.
@@ -888,6 +1292,10 @@
uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
uma_zfree(zone_mbuf, mref);
break;
+ case EXT_PGS:
+ uma_zfree(zone_extpgs, mref->m_ext.ext_pgs);
+ uma_zfree(zone_mbuf, mref);
+ break;
case EXT_SFBUF:
case EXT_NET_DRV:
case EXT_MOD_TYPE:
Index: sys/kern/kern_sendfile.c
===================================================================
--- sys/kern/kern_sendfile.c
+++ sys/kern/kern_sendfile.c
@@ -34,6 +34,7 @@
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
+#include <netinet/in.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
@@ -62,6 +63,7 @@
#define EXT_FLAG_SYNC EXT_FLAG_VENDOR1
#define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2
+#define EXT_FLAG_CACHE_LAST EXT_FLAG_VENDOR3
/*
* Structure describing a single sendfile(2) I/O, which may consist of
@@ -201,6 +203,39 @@
}
}
+static void
+sendfile_free_mext_pg(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_page_t pg;
+ int i;
+ bool nocache, cache_last;
+
+ KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS,
+ ("%s: m %p !M_EXT or !EXT_PGS", __func__, m));
+
+ nocache = m->m_ext.ext_flags & EXT_FLAG_NOCACHE;
+ cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST;
+ ext_pgs = m->m_ext.ext_pgs;
+
+ for (i = 0; i < ext_pgs->npgs; i++) {
+ if (cache_last && i == ext_pgs->npgs - 1)
+ nocache = false;
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ sendfile_free_page(pg, nocache);
+ }
+
+ if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
+ struct sendfile_sync *sfs = m->m_ext.ext_arg2;
+
+ mtx_lock(&sfs->mtx);
+ KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
+ if (--sfs->count == 0)
+ cv_signal(&sfs->cv);
+ mtx_unlock(&sfs->mtx);
+ }
+}
+
/*
* Helper function to calculate how much data to put into page i of n.
* Only first and last pages are special.
@@ -283,8 +318,6 @@
CURVNET_SET(so->so_vnet);
if (sfio->error) {
- struct mbuf *m;
-
/*
* I/O operation failed. The state of data in the socket
* is now inconsistent, and all what we can do is to tear
@@ -299,11 +332,9 @@
so->so_proto->pr_usrreqs->pru_abort(so);
so->so_error = EIO;
- m = sfio->m;
- for (int i = 0; i < sfio->npages; i++)
- m = m_free(m);
+ mb_free_notready(sfio->m, sfio->npages);
} else
- (void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
+ (void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
sfio->npages);
SOCK_LOCK(so);
@@ -540,13 +571,15 @@
struct vnode *vp;
struct vm_object *obj;
struct socket *so;
+ struct mbuf_ext_pgs *ext_pgs;
struct mbuf *m, *mh, *mhtail;
struct sf_buf *sf;
struct shmfd *shmfd;
struct sendfile_sync *sfs;
struct vattr va;
off_t off, sbytes, rem, obj_size;
- int error, softerr, bsize, hdrlen;
+ int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
+ bool use_ext_pgs;
obj = NULL;
so = NULL;
@@ -554,6 +587,7 @@
sfs = NULL;
hdrlen = sbytes = 0;
softerr = 0;
+ use_ext_pgs = false;
error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
if (error != 0)
@@ -714,6 +748,17 @@
if (space > rem)
space = rem;
+ else if (space > PAGE_SIZE) {
+ /*
+ * Use page boundaries when possible for large
+ * requests.
+ */
+ if (off & PAGE_MASK)
+ space -= (PAGE_SIZE - (off & PAGE_MASK));
+ space = trunc_page(space);
+ if (off & PAGE_MASK)
+ space += (PAGE_SIZE - (off & PAGE_MASK));
+ }
npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
@@ -751,6 +796,22 @@
* dumped into socket buffer.
*/
pa = sfio->pa;
+
+ /*
+ * Use unmapped mbufs if enabled for TCP. Unmapped
+ * bufs are restricted to TCP as that is what has been
+ * tested. In particular, unmapped mbufs have not
+ * been tested with UNIX-domain sockets.
+ */
+ if (mb_use_ext_pgs &&
+ so->so_proto->pr_protocol == IPPROTO_TCP) {
+ use_ext_pgs = true;
+ max_pgs = MBUF_PEXT_MAX_PGS;
+
+ /* Start at last index, to wrap on first use. */
+ ext_pgs_idx = max_pgs - 1;
+ }
+
for (int i = 0; i < npages; i++) {
struct mbuf *m0;
@@ -766,6 +827,66 @@
break;
}
+ if (use_ext_pgs) {
+ off_t xfs;
+
+ ext_pgs_idx++;
+ if (ext_pgs_idx == max_pgs) {
+ m0 = mb_alloc_ext_pgs(M_WAITOK, false,
+ sendfile_free_mext_pg);
+
+ if (flags & SF_NOCACHE) {
+ m0->m_ext.ext_flags |=
+ EXT_FLAG_NOCACHE;
+
+ /*
+ * See comment below regarding
+ * ignoring SF_NOCACHE for the
+ * last page.
+ */
+ if ((npages - i <= max_pgs) &&
+ ((off + space) & PAGE_MASK) &&
+ (rem > space || rhpages > 0))
+ m0->m_ext.ext_flags |=
+ EXT_FLAG_CACHE_LAST;
+ }
+ if (sfs != NULL) {
+ m0->m_ext.ext_flags |=
+ EXT_FLAG_SYNC;
+ m0->m_ext.ext_arg2 = sfs;
+ mtx_lock(&sfs->mtx);
+ sfs->count++;
+ mtx_unlock(&sfs->mtx);
+ }
+ ext_pgs = m0->m_ext.ext_pgs;
+ if (i == 0)
+ sfio->m = m0;
+ ext_pgs_idx = 0;
+
+ /* Append to mbuf chain. */
+ if (mtail != NULL)
+ mtail->m_next = m0;
+ else
+ m = m0;
+ mtail = m0;
+ ext_pgs->first_pg_off =
+ vmoff(i, off) & PAGE_MASK;
+ }
+ if (nios) {
+ mtail->m_flags |= M_NOTREADY;
+ ext_pgs->nrdy++;
+ }
+
+ ext_pgs->pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pa[i]);
+ ext_pgs->npgs++;
+ xfs = xfsize(i, npages, off, space);
+ ext_pgs->last_pg_len = xfs;
+ MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs);
+ mtail->m_len += xfs;
+ mtail->m_ext.ext_size += PAGE_SIZE;
+ continue;
+ }
+
/*
* Get a sendfile buf. When allocating the
* first buffer for mbuf chain, we usually
Index: sys/kern/subr_bus_dma.c
===================================================================
--- sys/kern/subr_bus_dma.c
+++ sys/kern/subr_bus_dma.c
@@ -110,6 +110,67 @@
return (error);
}
+/*
+ * Load an unmapped mbuf
+ */
+static int
+_bus_dmamap_load_unmapped_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ int error, i, off, len, pglen, pgoff, seglen, segoff;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+
+ len = m->m_len;
+ error = 0;
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = _bus_dmamap_load_buffer(dmat, map,
+ &ext_pgs->hdr[segoff], seglen, kernel_pmap,
+ flags, segs, nsegs);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+ error = _bus_dmamap_load_phys(dmat, map,
+ ext_pgs->pa[i] + segoff, seglen, flags, segs, nsegs);
+ pgoff = 0;
+	}
+ if (len != 0 && error == 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d)", off, len,
+ ext_pgs->trail_len));
+ error = _bus_dmamap_load_buffer(dmat, map,
+ &ext_pgs->trail[off], len, kernel_pmap, flags, segs,
+ nsegs);
+ }
+ return (error);
+}
+
/*
* Load an mbuf chain.
*/
@@ -123,9 +184,13 @@
error = 0;
for (m = m0; m != NULL && error == 0; m = m->m_next) {
if (m->m_len > 0) {
- error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
- m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
- segs, nsegs);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = _bus_dmamap_load_unmapped_mbuf_sg(dmat,
+ map, m, segs, nsegs, flags);
+ else
+ error = _bus_dmamap_load_buffer(dmat, map,
+ m->m_data, m->m_len, kernel_pmap,
+ flags | BUS_DMA_LOAD_MBUF, segs, nsegs);
}
}
CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
Index: sys/kern/subr_sglist.c
===================================================================
--- sys/kern/subr_sglist.c
+++ sys/kern/subr_sglist.c
@@ -218,6 +218,75 @@
return (nsegs);
}
+/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe an EXT_PGS buffer.
+ */
+int
+sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, size_t len)
+{
+ vm_paddr_t nextaddr, paddr;
+ size_t seglen, segoff;
+ int i, nsegs, pglen, pgoff;
+
+ if (len == 0)
+ return (0);
+
+ nsegs = 0;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = MIN(seglen, len);
+ off = 0;
+ len -= seglen;
+ nsegs += sglist_count(&ext_pgs->hdr[segoff], seglen);
+ }
+ }
+ nextaddr = 0;
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = MIN(seglen, len);
+ len -= seglen;
+ paddr = ext_pgs->pa[i] + segoff;
+ if (paddr != nextaddr)
+ nsegs++;
+ nextaddr = paddr + seglen;
+ pgoff = 0;
+	}
+ if (len != 0) {
+ seglen = MIN(len, ext_pgs->trail_len - off);
+ len -= seglen;
+ nsegs += sglist_count(&ext_pgs->trail[off], seglen);
+ }
+ KASSERT(len == 0, ("len != 0"));
+ return (nsegs);
+}
+
+/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe an EXT_PGS mbuf.
+ */
+int
+sglist_count_mb_ext_pgs(struct mbuf *m)
+{
+
+ MBUF_EXT_PGS_ASSERT(m);
+ return (sglist_count_ext_pgs(m->m_ext.ext_pgs, mtod(m, vm_offset_t),
+ m->m_len));
+}
+
/*
* Allocate a scatter/gather list along with 'nsegs' segments. The
* 'mflags' parameters are the same as passed to malloc(9). The caller
@@ -319,6 +388,76 @@
return (error);
}
+/*
+ * Append the segments to describe an EXT_PGS buffer to a
+ * scatter/gather list. If there are insufficient segments, then this
+ * fails with EFBIG.
+ */
+int
+sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
+ size_t off, size_t len)
+{
+ size_t seglen, segoff;
+ vm_paddr_t paddr;
+ int error, i, pglen, pgoff;
+
+ error = 0;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = MIN(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = sglist_append(sg,
+ &ext_pgs->hdr[segoff], seglen);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = MIN(seglen, len);
+ len -= seglen;
+ paddr = ext_pgs->pa[i] + segoff;
+ error = sglist_append_phys(sg, paddr, seglen);
+ pgoff = 0;
+	}
+ if (error == 0 && len > 0) {
+ seglen = MIN(len, ext_pgs->trail_len - off);
+ len -= seglen;
+ error = sglist_append(sg,
+ &ext_pgs->trail[off], seglen);
+ }
+ if (error == 0)
+ KASSERT(len == 0, ("len != 0"));
+ return (error);
+}
+
+/*
+ * Append the segments to describe an EXT_PGS mbuf to a scatter/gather
+ * list. If there are insufficient segments, then this fails with
+ * EFBIG.
+ */
+int
+sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m)
+{
+
+ /* for now, all unmapped mbufs are assumed to be EXT_PGS */
+ MBUF_EXT_PGS_ASSERT(m);
+ return (sglist_append_ext_pgs(sg, m->m_ext.ext_pgs,
+ mtod(m, vm_offset_t), m->m_len));
+}
+
/*
* Append the segments that describe a single mbuf chain to a
* scatter/gather list. If there are insufficient segments, then this
@@ -338,7 +477,11 @@
SGLIST_SAVE(sg, save);
for (m = m0; m != NULL; m = m->m_next) {
if (m->m_len > 0) {
- error = sglist_append(sg, m->m_data, m->m_len);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = sglist_append_mb_ext_pgs(sg, m);
+ else
+ error = sglist_append(sg, m->m_data,
+ m->m_len);
if (error) {
SGLIST_RESTORE(sg, save);
return (error);
Index: sys/kern/uipc_mbuf.c
===================================================================
--- sys/kern/uipc_mbuf.c
+++ sys/kern/uipc_mbuf.c
@@ -50,6 +50,10 @@
#include <sys/protosw.h>
#include <sys/uio.h>
#include <sys/sdt.h>
+#include <vm/vm.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
+#include <sys/vmmeter.h>
SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
"struct mbuf *", "mbufinfo_t *",
@@ -202,7 +206,7 @@
else
bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
n->m_flags |= M_EXT;
- n->m_flags |= m->m_flags & M_RDONLY;
+ n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP);
/* See if this is the mbuf that holds the embedded refcount. */
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
@@ -246,7 +250,8 @@
__func__, m, m0));
if (m->m_flags & M_PKTHDR)
m_demote_pkthdr(m);
- m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
+ m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE |
+ M_NOMAP | flags);
}
}
@@ -376,7 +381,8 @@
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
- to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ to->m_flags = (from->m_flags & M_COPYFLAGS) |
+ (to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr; /* especially tags */
@@ -414,7 +420,8 @@
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
- to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ to->m_flags = (from->m_flags & M_COPYFLAGS) |
+ (to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr;
@@ -579,6 +586,30 @@
return (NULL);
}
+static void
+m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
+{
+ struct iovec iov;
+ struct uio uio;
+ int error;
+
+ KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
+ KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
+ KASSERT(off < m->m_len,
+	    ("m_copyfromunmapped: off exceeds mbuf length"));
+ iov.iov_base = cp;
+ iov.iov_len = len;
+ uio.uio_resid = len;
+ uio.uio_iov = &iov;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_rw = UIO_READ;
+ error = m_unmappedtouio(m, off, &uio, len);
+ KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off,
+ len));
+}
+
/*
* Copy data from an mbuf chain starting "off" bytes from the beginning,
* continuing for "len" bytes, into the indicated buffer.
@@ -600,7 +631,10 @@
while (len > 0) {
KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
count = min(m->m_len - off, len);
- bcopy(mtod(m, caddr_t) + off, cp, count);
+ if ((m->m_flags & M_NOMAP) != 0)
+ m_copyfromunmapped(m, off, count, cp);
+ else
+ bcopy(mtod(m, caddr_t) + off, cp, count);
len -= count;
cp += count;
off = 0;
@@ -695,6 +729,7 @@
m = m->m_next;
while (n) {
if (!M_WRITABLE(m) ||
+ (n->m_flags & M_NOMAP) != 0 ||
M_TRAILINGSPACE(m) < n->m_len) {
/* just join the two chains */
m->m_next = n;
@@ -812,6 +847,9 @@
int count;
int space;
+ KASSERT((n->m_flags & M_NOMAP) == 0,
+ ("%s: unmapped mbuf %p", __func__, n));
+
/*
* If first mbuf has no cluster, and has room for len bytes
* without shifting current data, pullup into it,
@@ -1364,6 +1402,41 @@
return (NULL);
}
+/*
+ * Return the number of fragments an mbuf will use. This is usually
+ * used as a proxy for the number of scatter/gather elements needed by
+ * a DMA engine to access an mbuf. In general mapped mbufs are
+ * assumed to be backed by physically contiguous buffers that only
+ * need a single fragment. Unmapped mbufs, on the other hand, can
+ * span disjoint physical pages.
+ */
+static int
+frags_per_mbuf(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ int frags;
+
+ if ((m->m_flags & M_NOMAP) == 0)
+ return (1);
+
+ /*
+ * The header and trailer are counted as a single fragment
+ * each when present.
+ *
+ * XXX: This overestimates the number of fragments by assuming
+ * all the backing physical pages are disjoint.
+ */
+ ext_pgs = m->m_ext.ext_pgs;
+ frags = 0;
+ if (ext_pgs->hdr_len != 0)
+ frags++;
+ frags += ext_pgs->npgs;
+ if (ext_pgs->trail_len != 0)
+ frags++;
+
+ return (frags);
+}
+
/*
* Defragment an mbuf chain, returning at most maxfrags separate
* mbufs+clusters. If this is not possible NULL is returned and
@@ -1384,7 +1457,7 @@
*/
curfrags = 0;
for (m = m0; m != NULL; m = m->m_next)
- curfrags++;
+ curfrags += frags_per_mbuf(m);
/*
* First, try to collapse mbufs. Note that we always collapse
* towards the front so we don't need to deal with moving the
@@ -1399,12 +1472,13 @@
break;
if (M_WRITABLE(m) &&
n->m_len < M_TRAILINGSPACE(m)) {
- bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
- n->m_len);
+ m_copydata(n, 0, n->m_len,
+ mtod(m, char *) + m->m_len);
m->m_len += n->m_len;
m->m_next = n->m_next;
+ curfrags -= frags_per_mbuf(n);
m_free(n);
- if (--curfrags <= maxfrags)
+ if (curfrags <= maxfrags)
return m0;
} else
m = n;
@@ -1421,15 +1495,18 @@
m = m_getcl(how, MT_DATA, 0);
if (m == NULL)
goto bad;
- bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
- bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
- n2->m_len);
+ m_copydata(n, 0, n->m_len, mtod(m, char *));
+ m_copydata(n2, 0, n2->m_len,
+ mtod(m, char *) + n->m_len);
m->m_len = n->m_len + n2->m_len;
m->m_next = n2->m_next;
*prev = m;
+ curfrags += 1; /* For the new cluster */
+ curfrags -= frags_per_mbuf(n);
+ curfrags -= frags_per_mbuf(n2);
m_free(n);
m_free(n2);
- if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */
+ if (curfrags <= maxfrags)
return m0;
/*
* Still not there, try the normal collapse
@@ -1529,6 +1606,111 @@
#endif
+/*
+ * Free pages from mbuf_ext_pgs, assuming they were allocated via
+ * vm_page_alloc() and aren't associated with any object. Complement
+ * to allocator from m_uiotombuf_nomap().
+ */
+void
+mb_free_mext_pgs(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_page_t pg;
+ int wire_adj;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ wire_adj = 0;
+ for (int i = 0; i < ext_pgs->npgs; i++) {
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ /*
+ * Note: page is not locked, as it has no
+ * object and is not on any queues.
+ */
+ vm_page_free_toq(pg);
+ wire_adj++;
+ }
+ if (wire_adj)
+ vm_wire_sub(wire_adj);
+}
+
+static struct mbuf *
+m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
+{
+ struct mbuf *m, *mb, *prev;
+ struct mbuf_ext_pgs *pgs;
+ vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
+ int error, length, i, needed, wire_adj = 0;
+ ssize_t total;
+ int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP;
+
+ /*
+ * len can be zero or an arbitrary large value bound by
+ * the total data supplied by the uio.
+ */
+ if (len > 0)
+ total = MIN(uio->uio_resid, len);
+ else
+ total = uio->uio_resid;
+
+ if (maxseg == 0)
+ maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
+
+ /*
+ * Allocate the pages
+ */
+ m = NULL;
+ while (total > 0) {
+ mb = mb_alloc_ext_pgs(how, (flags & M_PKTHDR),
+ mb_free_mext_pgs);
+ if (mb == NULL)
+ goto failed;
+ if (m == NULL)
+ m = mb;
+ else
+ prev->m_next = mb;
+ prev = mb;
+ pgs = mb->m_ext.ext_pgs;
+ needed = length = MIN(maxseg, total);
+ for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
+retry_page:
+ pg_array[i] = vm_page_alloc(NULL, 0, pflags);
+ if (pg_array[i] == NULL) {
+ if (wire_adj)
+ vm_wire_add(wire_adj);
+ wire_adj = 0;
+ if (how & M_NOWAIT) {
+ goto failed;
+ } else {
+ vm_wait(NULL);
+ goto retry_page;
+ }
+ }
+ wire_adj++;
+ pg_array[i]->flags &= ~PG_ZERO;
+ pgs->pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
+ pgs->npgs++;
+ }
+ pgs->last_pg_len = length - PAGE_SIZE * (pgs->npgs - 1);
+ MBUF_EXT_PGS_ASSERT_SANITY(pgs);
+ vm_wire_add(wire_adj);
+ wire_adj = 0;
+ total -= length;
+ error = uiomove_fromphys(pg_array, 0, length, uio);
+ if (error != 0)
+ goto failed;
+ mb->m_len = length;
+ mb->m_ext.ext_size += PAGE_SIZE * pgs->npgs;
+ if (flags & M_PKTHDR)
+ m->m_pkthdr.len += length;
+ }
+ return (m);
+
+failed:
+ m_freem(m);
+ return (NULL);
+}
+
/*
* Copy the contents of uio into a properly sized mbuf chain.
*/
@@ -1540,6 +1722,9 @@
ssize_t total;
int progress = 0;
+ if (flags & M_NOMAP)
+ return (m_uiotombuf_nomap(uio, how, len, align, flags));
+
/*
* len can be zero or an arbitrary large value bound by
* the total data supplied by the uio.
@@ -1585,6 +1770,62 @@
return (m);
}
+/*
+ * Copy data from an unmapped mbuf into a uio limited by len if set.
+ */
+int
+m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_page_t pg;
+ int error, i, off, pglen, pgoff, seglen, segoff;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ error = 0;
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ off += m_off;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = uiomove(&ext_pgs->hdr[segoff], seglen, uio);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ error = uiomove_fromphys(&pg, segoff, seglen, uio);
+ pgoff = 0;
+	}
+ if (len != 0 && error == 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
+ ext_pgs->trail_len, m_off));
+ error = uiomove(&ext_pgs->trail[off], len, uio);
+ }
+ return (error);
+}
+
/*
* Copy an mbuf chain into a uio limited by len if set.
*/
@@ -1603,7 +1844,10 @@
for (; m != NULL; m = m->m_next) {
length = min(m->m_len, total - progress);
- error = uiomove(mtod(m, void *), length, uio);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = m_unmappedtouio(m, 0, uio, length);
+ else
+ error = uiomove(mtod(m, void *), length, uio);
if (error)
return (error);
Index: sys/kern/uipc_sockbuf.c
===================================================================
--- sys/kern/uipc_sockbuf.c
+++ sys/kern/uipc_sockbuf.c
@@ -89,28 +89,130 @@
}
/*
- * Mark ready "count" mbufs starting with "m".
+ * Compress M_NOTREADY mbufs after they have been readied by sbready().
+ *
+ * sbcompress() skips M_NOTREADY mbufs since the data is not available to
+ * be copied at the time of sbcompress(). This function combines small
+ * mbufs similar to sbcompress() once mbufs are ready. 'm0' is the first
+ * mbuf sbready() marked ready, and 'end' is the first mbuf still not
+ * ready.
+ */
+static void
+sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end)
+{
+ struct mbuf *m, *n;
+ int ext_size;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if ((sb->sb_flags & SB_NOCOALESCE) != 0)
+ return;
+
+ for (m = m0; m != end; m = m->m_next) {
+ MPASS((m->m_flags & M_NOTREADY) == 0);
+
+ /* Compress small unmapped mbufs into plain mbufs. */
+ if ((m->m_flags & M_NOMAP) && m->m_len <= MLEN) {
+ MPASS(m->m_flags & M_EXT);
+ ext_size = m->m_ext.ext_size;
+ if (mb_unmapped_compress(m) == 0) {
+ sb->sb_mbcnt -= ext_size;
+ sb->sb_ccnt -= 1;
+ }
+ }
+
+ /*
+ * NB: In sbcompress(), 'n' is the last mbuf in the
+ * socket buffer and 'm' is the new mbuf being copied
+ * into the trailing space of 'n'. Here, the roles
+ * are reversed and 'n' is the next mbuf after 'm'
+ * that is being copied into the trailing space of
+ * 'm'.
+ */
+ n = m->m_next;
+ while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 &&
+ M_WRITABLE(m) &&
+ (m->m_flags & M_NOMAP) == 0 &&
+ n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+ n->m_len <= M_TRAILINGSPACE(m) &&
+ m->m_type == n->m_type) {
+ KASSERT(sb->sb_lastrecord != n,
+ ("%s: merging start of record (%p) into previous mbuf (%p)",
+ __func__, n, m));
+ m_copydata(n, 0, n->m_len, mtodo(m, m->m_len));
+ m->m_len += n->m_len;
+ m->m_next = n->m_next;
+ m->m_flags |= n->m_flags & M_EOR;
+ if (sb->sb_mbtail == n)
+ sb->sb_mbtail = m;
+
+ sb->sb_mbcnt -= MSIZE;
+ sb->sb_mcnt -= 1;
+ if (n->m_flags & M_EXT) {
+ sb->sb_mbcnt -= n->m_ext.ext_size;
+ sb->sb_ccnt -= 1;
+ }
+ m_free(n);
+ n = m->m_next;
+ }
+ }
+ SBLASTRECORDCHK(sb);
+ SBLASTMBUFCHK(sb);
+}
+
+/*
+ * Mark ready "count" units of I/O starting with "m". Most mbufs
+ * count as a single unit of I/O except for EXT_PGS-backed mbufs which
+ * can be backed by multiple pages.
*/
int
-sbready(struct sockbuf *sb, struct mbuf *m, int count)
+sbready(struct sockbuf *sb, struct mbuf *m0, int count)
{
+ struct mbuf *m;
u_int blocker;
SOCKBUF_LOCK_ASSERT(sb);
KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
+ KASSERT(count > 0, ("%s: invalid count %d", __func__, count));
+ m = m0;
blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
- for (int i = 0; i < count; i++, m = m->m_next) {
+ while (count > 0) {
KASSERT(m->m_flags & M_NOTREADY,
("%s: m %p !M_NOTREADY", __func__, m));
+ if ((m->m_flags & M_EXT) != 0 &&
+ m->m_ext.ext_type == EXT_PGS) {
+ if (count < m->m_ext.ext_pgs->nrdy) {
+ m->m_ext.ext_pgs->nrdy -= count;
+ count = 0;
+ break;
+ }
+ count -= m->m_ext.ext_pgs->nrdy;
+ m->m_ext.ext_pgs->nrdy = 0;
+ } else
+ count--;
+
m->m_flags &= ~(M_NOTREADY | blocker);
if (blocker)
sb->sb_acc += m->m_len;
+ m = m->m_next;
}
- if (!blocker)
+ /*
+ * If the first mbuf is still not fully ready because only
+ * some of its backing pages were readied, no further progress
+ * can be made.
+ */
+ if (m0 == m) {
+ MPASS(m->m_flags & M_NOTREADY);
return (EINPROGRESS);
+ }
+
+ if (!blocker) {
+ sbready_compress(sb, m0, m);
+ return (EINPROGRESS);
+ }
/* This one was blocking all the queue. */
for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
@@ -121,6 +223,7 @@
}
sb->sb_fnrdy = m;
+ sbready_compress(sb, m0, m);
return (0);
}
@@ -1030,12 +1133,11 @@
M_WRITABLE(n) &&
((sb->sb_flags & SB_NOCOALESCE) == 0) &&
!(m->m_flags & M_NOTREADY) &&
- !(n->m_flags & M_NOTREADY) &&
+ !(n->m_flags & (M_NOTREADY | M_NOMAP)) &&
m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
- bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
- (unsigned)m->m_len);
+ m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
n->m_len += m->m_len;
sb->sb_ccc += m->m_len;
if (sb->sb_fnrdy == NULL)
@@ -1046,6 +1148,9 @@
m = m_free(m);
continue;
}
+ if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) &&
+ (m->m_flags & M_NOTREADY) == 0)
+ (void)mb_unmapped_compress(m);
if (n)
n->m_next = m;
else
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -1044,7 +1044,7 @@
*
* We used to do a lot of socket buffer and socket locking here, as
* well as invoke sorflush() and perform wakeups. The direct call to
- * dom_dispose() and sbrelease_internal() are an inlining of what was
+ * dom_dispose() and sbdestroy() are an inlining of what was
* necessary from sorflush().
*
* Notice that the socket buffer and kqueue state are torn down
@@ -1982,7 +1982,11 @@
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
SOCKBUF_UNLOCK(&so->so_rcv);
- error = uiomove(mtod(m, char *) + moff, (int)len, uio);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = m_unmappedtouio(m, moff, uio, (int)len);
+ else
+ error = uiomove(mtod(m, char *) + moff,
+ (int)len, uio);
SOCKBUF_LOCK(&so->so_rcv);
if (error) {
/*
Index: sys/net/bpf.c
===================================================================
--- sys/net/bpf.c
+++ sys/net/bpf.c
@@ -2369,6 +2369,7 @@
* Note that we cut corners here; we only setup what's
* absolutely needed--this mbuf should never go anywhere else.
*/
+ mb.m_flags = 0;
mb.m_next = m;
mb.m_data = data;
mb.m_len = dlen;
Index: sys/net/bpf_buffer.c
===================================================================
--- sys/net/bpf_buffer.c
+++ sys/net/bpf_buffer.c
@@ -119,19 +119,10 @@
{
const struct mbuf *m;
u_char *dst;
- u_int count;
m = (struct mbuf *)src;
dst = (u_char *)buf + offset;
- while (len > 0) {
- if (m == NULL)
- panic("bpf_mcopy");
- count = min(m->m_len, len);
- bcopy(mtod(m, void *), dst, count);
- m = m->m_next;
- dst += count;
- len -= count;
- }
+ m_copydata(m, 0, len, dst);
}
/*
Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -246,6 +246,7 @@
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWRXTSTMP 0x2000000 /* hardware rx timestamping */
+#define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
Index: sys/net/if_vlan.c
===================================================================
--- sys/net/if_vlan.c
+++ sys/net/if_vlan.c
@@ -1731,6 +1731,16 @@
ena |= (mena & IFCAP_TXRTLMT);
#endif
+ /*
+ * If the parent interface supports unmapped mbufs, so does
+ * the VLAN interface. Note that this should be fine even for
+ * interfaces that don't support hardware tagging as headers
+ * are prepended in normal mbufs to unmapped mbufs holding
+ * payload data.
+ */
+ cap |= (p->if_capabilities & IFCAP_NOMAP);
+ ena |= (mena & IFCAP_NOMAP);
+
ifp->if_capabilities = cap;
ifp->if_capenable = ena;
ifp->if_hwassist = hwa;
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -35,13 +35,13 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
-#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
+#include "opt_ratelimit.h"
#include "opt_route.h"
-#include "opt_sctp.h"
#include "opt_rss.h"
+#include "opt_sctp.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -283,6 +283,7 @@
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
int no_route_but_check_spd = 0;
#endif
+
M_ASSERTPKTHDR(m);
if (inp != NULL) {
@@ -685,11 +686,30 @@
m->m_pkthdr.csum_flags |= CSUM_IP;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
@@ -825,11 +845,23 @@
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m0 = mb_unmapped_to_ext(m0);
+ if (m0 == NULL) {
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
+ m0 = mb_unmapped_to_ext(m0);
+ if (m0 == NULL) {
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
sctp_delayed_cksum(m0, hlen);
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
Index: sys/netinet/tcp_pcap.c
===================================================================
--- sys/netinet/tcp_pcap.c
+++ sys/netinet/tcp_pcap.c
@@ -311,6 +311,7 @@
if (mhead->m_flags & M_EXT) {
switch (mhead->m_ext.ext_type) {
case EXT_SFBUF:
+ case EXT_PGS:
/* Don't mess around with these. */
tcp_pcap_m_freem(mhead);
continue;
@@ -383,8 +384,11 @@
__func__, n->m_flags));
n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
n->m_len = m->m_len;
- bcopy(M_START(m), n->m_dat,
- m->m_len + M_LEADINGSPACE_NOWRITE(m));
+ if (m->m_flags & M_NOMAP)
+ m_copydata(m, 0, m->m_len, n->m_data);
+ else
+ bcopy(M_START(m), n->m_dat,
+ m->m_len + M_LEADINGSPACE_NOWRITE(m));
}
else {
/*
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -798,8 +798,12 @@
}
}
+ if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+ *num_names = 0;
+ return (EINVAL);
+ }
+
refcount_init(&blk->tfb_refcnt, 0);
- blk->tfb_flags = 0;
blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
for (i = 0; i < *num_names; i++) {
n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1190,8 +1190,7 @@
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
- for (int i = 0; i < count; i++)
- m = m_free(m);
+ mb_free_notready(m, count);
return (ECONNRESET);
}
tp = intotcpcb(inp);
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -67,11 +67,11 @@
#include "opt_inet.h"
#include "opt_inet6.h"
-#include "opt_ratelimit.h"
#include "opt_ipsec.h"
-#include "opt_sctp.h"
+#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"
+#include "opt_sctp.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -963,11 +963,30 @@
*/
if (sw_csum & CSUM_DELAY_DATA_IPV6) {
sw_csum &= ~CSUM_DELAY_DATA_IPV6;
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
+ } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
}
#ifdef SCTP
if (sw_csum & CSUM_SCTP_IPV6) {
sw_csum &= ~CSUM_SCTP_IPV6;
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
}
#endif
@@ -1055,11 +1074,23 @@
* XXX-BZ handle the hw offloading case. Need flags.
*/
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ in6_ifstat_inc(ifp, ifs6_out_fragfail);
+ error = ENOBUFS;
+ goto bad;
+ }
in6_delayed_cksum(m, plen, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ in6_ifstat_inc(ifp, ifs6_out_fragfail);
+ error = ENOBUFS;
+ goto bad;
+ }
sctp_delayed_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
}
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -227,7 +227,15 @@
volatile u_int ext_count;
volatile u_int *ext_cnt;
};
- char *ext_buf; /* start of buffer */
+ union {
+ /*
+ * If ext_type == EXT_PGS, 'ext_pgs' points to a
+ * structure describing the buffer. Otherwise,
+ * 'ext_buf' points to the start of the buffer.
+ */
+ struct mbuf_ext_pgs *ext_pgs;
+ char *ext_buf;
+ };
uint32_t ext_size; /* size of buffer, for ext_free */
uint32_t ext_type:8, /* type of external storage */
ext_flags:24; /* external storage mbuf flags */
@@ -293,6 +301,92 @@
};
};
+struct socket;
+
+/*
+ * TLS records for TLS 1.0-1.2 can have the following header lengths:
+ * - 5 (AES-CBC with implicit IV)
+ * - 21 (AES-CBC with explicit IV)
+ * - 13 (AES-GCM with 8 byte explicit IV)
+ */
+#define MBUF_PEXT_HDR_LEN 24
+
+/*
+ * TLS records for TLS 1.0-1.2 can have the following maximum trailer
+ * lengths:
+ * - 16 (AES-GCM)
+ * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding)
+ * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding)
+ * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding)
+ */
+#define MBUF_PEXT_TRAIL_LEN 64
+
+#ifdef __LP64__
+#define MBUF_PEXT_MAX_PGS (152 / sizeof(vm_paddr_t))
+#else
+#define MBUF_PEXT_MAX_PGS (156 / sizeof(vm_paddr_t))
+#endif
+
+#define MBUF_PEXT_MAX_BYTES \
+ (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
+
+/*
+ * This struct is 256 bytes in size and is arranged so that the most
+ * common case (accessing the first 4 pages of a 16KB TLS record) will
+ * fit in a single 64 byte cacheline.
+ */
+struct mbuf_ext_pgs {
+ uint8_t npgs; /* Number of attached pages */
+ uint8_t nrdy; /* Pages with I/O pending */
+ uint8_t hdr_len; /* TLS header length */
+ uint8_t trail_len; /* TLS trailer length */
+ uint16_t first_pg_off; /* Offset into 1st page */
+ uint16_t last_pg_len; /* Length of last page */
+ vm_paddr_t pa[MBUF_PEXT_MAX_PGS]; /* phys addrs of pages */
+ char hdr[MBUF_PEXT_HDR_LEN]; /* TLS header */
+ void *tls; /* TLS session */
+#if defined(__i386__) || \
+ (defined(__powerpc__) && !defined(__powerpc64__) && defined(BOOKE))
+ /*
+ * i386 and Book-E PowerPC have 64-bit vm_paddr_t, so there is
+ * a 4 byte remainder from the space allocated for pa[].
+ */
+ uint32_t pad;
+#endif
+ union {
+ char trail[MBUF_PEXT_TRAIL_LEN]; /* TLS trailer */
+ struct {
+ struct socket *so;
+ void *mbuf;
+ uint64_t seqno;
+ STAILQ_ENTRY(mbuf_ext_pgs) stailq;
+ };
+ };
+};
+
+#ifdef _KERNEL
+static inline int
+mbuf_ext_pg_len(struct mbuf_ext_pgs *ext_pgs, int pidx, int pgoff)
+{
+ KASSERT(pgoff == 0 || pidx == 0,
+ ("page %d with non-zero offset %d in %p", pidx, pgoff, ext_pgs));
+ if (pidx == ext_pgs->npgs - 1) {
+ return (ext_pgs->last_pg_len);
+ } else {
+ return (PAGE_SIZE - pgoff);
+ }
+}
+
+#ifdef INVARIANT_SUPPORT
+void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
+#endif
+#ifdef INVARIANTS
+#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) mb_ext_pgs_check((ext_pgs))
+#else
+#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs)
+#endif
+#endif
+
/*
* mbuf flags of global significance and layer crossing.
* Those of only protocol/layer specific significance are to be mapped
@@ -307,7 +401,7 @@
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
#define M_PROMISC 0x00000040 /* packet was not for us */
#define M_VLANTAG 0x00000080 /* ether_vtag is valid */
-#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
+#define M_NOMAP 0x00000100 /* mbuf data is unmapped */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
@@ -348,7 +442,7 @@
*/
#define M_FLAG_BITS \
"\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
- "\7M_PROMISC\10M_VLANTAG\13M_TSTMP\14M_TSTMP_HPREC"
+ "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC"
#define M_FLAG_PROTOBITS \
"\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
"\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
@@ -420,6 +514,7 @@
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
#define EXT_RXRING 8 /* data in NIC receive ring */
+#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_VENDOR1 224 /* for vendor-internal use */
#define EXT_VENDOR2 225 /* for vendor-internal use */
@@ -464,6 +559,11 @@
"\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
"\30EXT_FLAG_EXP4"
+#define MBUF_EXT_PGS_ASSERT(m) \
+ KASSERT((((m)->m_flags & M_EXT) != 0) && \
+ ((m)->m_ext.ext_type == EXT_PGS), \
+ ("%s: m %p !M_EXT or !EXT_PGS", __func__, m))
+
/*
* Flags indicating checksum, segmentation and other offload work to be
* done, or already done, by hardware or lower layers. It is split into
@@ -566,6 +666,7 @@
#define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k"
#define MBUF_TAG_MEM_NAME "mbuf_tag"
#define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt"
+#define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs"
#ifdef _KERNEL
@@ -590,9 +691,15 @@
extern uma_zone_t zone_jumbop;
extern uma_zone_t zone_jumbo9;
extern uma_zone_t zone_jumbo16;
+extern uma_zone_t zone_extpgs;
void mb_dupcl(struct mbuf *, struct mbuf *);
void mb_free_ext(struct mbuf *);
+void mb_free_mext_pgs(struct mbuf *);
+struct mbuf *mb_alloc_ext_pgs(int, bool, m_ext_free_t);
+int mb_unmapped_compress(struct mbuf *m);
+struct mbuf *mb_unmapped_to_ext(struct mbuf *m);
+void mb_free_notready(struct mbuf *m, int count);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
@@ -627,6 +734,7 @@
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
int m_mbuftouio(struct uio *, const struct mbuf *, int);
+int m_unmappedtouio(const struct mbuf *, int, struct uio *, int);
void m_move_pkthdr(struct mbuf *, struct mbuf *);
int m_pkthdr_init(struct mbuf *, int);
struct mbuf *m_prepend(struct mbuf *, int, int);
@@ -881,7 +989,7 @@
* be both the local data payload, or an external buffer area, depending on
* whether M_EXT is set).
*/
-#define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \
+#define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 && \
(!(((m)->m_flags & M_EXT)) || \
(m_extrefcnt(m) == 1)))
@@ -904,7 +1012,8 @@
* handling external storage, packet-header mbufs, and regular data mbufs.
*/
#define M_START(m) \
- (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
+ (((m)->m_flags & M_NOMAP) ? NULL : \
+ ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \
&(m)->m_dat[0])
@@ -1020,6 +1129,7 @@
extern int max_linkhdr; /* Largest link-level header */
extern int max_protohdr; /* Largest protocol header */
extern int nmbclusters; /* Maximum number of clusters */
+extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */
/*-
* Network packets may have annotations attached by affixing a list of
Index: sys/sys/sglist.h
===================================================================
--- sys/sys/sglist.h
+++ sys/sys/sglist.h
@@ -57,6 +57,7 @@
struct bio;
struct mbuf;
+struct mbuf_ext_pgs;
struct uio;
static __inline void
@@ -87,6 +88,9 @@
struct sglist *sglist_alloc(int nsegs, int mflags);
int sglist_append(struct sglist *sg, void *buf, size_t len);
int sglist_append_bio(struct sglist *sg, struct bio *bp);
+int sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
+ size_t off, size_t len);
+int sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m);
int sglist_append_mbuf(struct sglist *sg, struct mbuf *m0);
int sglist_append_phys(struct sglist *sg, vm_paddr_t paddr,
size_t len);
@@ -101,6 +105,9 @@
struct sglist *sglist_clone(struct sglist *sg, int mflags);
int sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid);
int sglist_count(void *buf, size_t len);
+int sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off,
+ size_t len);
+int sglist_count_mb_ext_pgs(struct mbuf *m);
int sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len);
void sglist_free(struct sglist *sg);
int sglist_join(struct sglist *first, struct sglist *second);

File Metadata

Mime Type
text/plain
Expires
Mon, Mar 23, 1:21 PM (5 h, 48 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
30143268
Default Alt Text
D20616.id58545.diff (69 KB)

Event Timeline