Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F149144015
D20616.id58545.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
69 KB
Referenced Files
None
Subscribers
None
D20616.id58545.diff
View Options
Index: sbin/ifconfig/ifconfig.8
===================================================================
--- sbin/ifconfig/ifconfig.8
+++ sbin/ifconfig/ifconfig.8
@@ -538,6 +538,12 @@
If the driver supports
.Xr tcp 4
large receive offloading, disable LRO on the interface.
+.It Cm nomap
+If the driver supports unmapped network buffers,
+enable them on the interface.
+.It Fl nomap
+If the driver supports unmapped network buffers,
+disable them on the interface.
.It Cm wol , wol_ucast , wol_mcast , wol_magic
Enable Wake On Lan (WOL) support, if available.
WOL is a facility whereby a machine in a low power state may be woken
Index: sbin/ifconfig/ifconfig.c
===================================================================
--- sbin/ifconfig/ifconfig.c
+++ sbin/ifconfig/ifconfig.c
@@ -1257,7 +1257,7 @@
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP"
/*
* Print the status of the interface. If an address family was
@@ -1557,6 +1557,8 @@
DEF_CMD("-link2", -IFF_LINK2, setifflags),
DEF_CMD("monitor", IFF_MONITOR, setifflags),
DEF_CMD("-monitor", -IFF_MONITOR, setifflags),
+ DEF_CMD("nomap", IFCAP_NOMAP, setifcap),
+ DEF_CMD("-nomap", -IFCAP_NOMAP, setifcap),
DEF_CMD("staticarp", IFF_STATICARP, setifflags),
DEF_CMD("-staticarp", -IFF_STATICARP, setifflags),
DEF_CMD("rxcsum6", IFCAP_RXCSUM_IPV6, setifcap),
Index: share/man/man9/Makefile
===================================================================
--- share/man/man9/Makefile
+++ share/man/man9/Makefile
@@ -1824,6 +1824,8 @@
MLINKS+=sglist.9 sglist_alloc.9 \
sglist.9 sglist_append.9 \
sglist.9 sglist_append_bio.9 \
+ sglist.9 sglist_append_ext_pgs.9 \
+ sglist.9 sglist_append_mb_ext_pgs.9 \
sglist.9 sglist_append_mbuf.9 \
sglist.9 sglist_append_phys.9 \
sglist.9 sglist_append_sglist.9 \
@@ -1834,6 +1836,8 @@
sglist.9 sglist_clone.9 \
sglist.9 sglist_consume_uio.9 \
sglist.9 sglist_count.9 \
+ sglist.9 sglist_count_ext_pgs.9 \
+ sglist.9 sglist_count_mb_ext_pgs.9 \
sglist.9 sglist_count_vmpages.9 \
sglist.9 sglist_free.9 \
sglist.9 sglist_hold.9 \
Index: share/man/man9/mbuf.9
===================================================================
--- share/man/man9/mbuf.9
+++ share/man/man9/mbuf.9
@@ -254,6 +254,8 @@
#define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
+#define EXT_RXRING 8 /* data in NIC receive ring */
+#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */
#define EXT_MOD_TYPE 253 /* custom module's ext_buf type */
#define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */
Index: share/man/man9/sglist.9
===================================================================
--- share/man/man9/sglist.9
+++ share/man/man9/sglist.9
@@ -34,6 +34,8 @@
.Nm sglist_alloc ,
.Nm sglist_append ,
.Nm sglist_append_bio ,
+.Nm sglist_append_ext_pgs ,
+.Nm sglist_append_mb_ext_pgs ,
.Nm sglist_append_mbuf ,
.Nm sglist_append_phys ,
.Nm sglist_append_sglist ,
@@ -44,6 +46,8 @@
.Nm sglist_clone ,
.Nm sglist_consume_uio ,
.Nm sglist_count ,
+.Nm sglist_count_ext_pgs ,
+.Nm sglist_count_mb_ext_pgs ,
.Nm sglist_count_vmpages ,
.Nm sglist_free ,
.Nm sglist_hold ,
@@ -64,6 +68,10 @@
.Ft int
.Fn sglist_append_bio "struct sglist *sg" "struct bio *bp"
.Ft int
+.Fn sglist_append_ext_pgs "struct sglist *sg" "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
+.Ft int
+.Fn sglist_append_mb_ext_pgs "struct sglist *sg" "struct mbuf *m"
+.Ft int
.Fn sglist_append_mbuf "struct sglist *sg" "struct mbuf *m"
.Ft int
.Fn sglist_append_phys "struct sglist *sg" "vm_paddr_t paddr" "size_t len"
@@ -84,6 +92,10 @@
.Ft int
.Fn sglist_count "void *buf" "size_t len"
.Ft int
+.Fn sglist_count_ext_pgs "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
+.Ft int
+.Fn sglist_count_mb_ext_pgs "struct mbuf *m"
+.Ft int
.Fn sglist_count_vmpages "vm_page_t *m" "size_t pgoff" "size_t len"
.Ft void
.Fn sglist_free "struct sglist *sg"
@@ -146,6 +158,22 @@
bytes long.
.Pp
The
+.Nm sglist_count_ext_pgs
+function returns the number of scatter/gather list elements needed to describe
+the unmapped external mbuf buffer
+.Fa ext_pgs .
+The ranges start at an offset of
+.Fa offset
+relative to the start of the buffer and are
+.Fa len
+bytes long.
+The
+.Nm sglist_count_mb_ext_pgs
+function returns the number of scatter/gather list elements needed to describe
+the physical address ranges of a single unmapped mbuf
+.Fa m .
+.Pp
+The
.Nm sglist_count_vmpages
function returns the number of scatter/gather list elements needed to describe
the physical address ranges of a buffer backed by an array of virtual memory
@@ -237,6 +265,34 @@
.Fa sg .
.Pp
The
+.Nm sglist_append_ext_pgs
+function appends the physical address ranges described by the unmapped
+external mbuf buffer
+.Fa ext_pgs
+to the scatter/gather list
+.Fa sg .
+The physical address ranges start at offset
+.Fa offset
+within
+.Fa ext_pgs
+and continue for
+.Fa len
+bytes.
+.Pp
+The
+.Nm sglist_append_mb_ext_pgs
+function appends the physical address ranges described by the unmapped
+mbuf
+.Fa m
+to the scatter/gather list
+.Fa sg .
+Note that unlike
+.Nm sglist_append_mbuf ,
+.Nm sglist_append_mb_ext_pgs
+only adds ranges for a single mbuf,
+not an entire mbuf chain.
+.Pp
+The
.Nm sglist_append_mbuf
function appends the physical address ranges described by an entire mbuf
chain
@@ -467,8 +523,7 @@
.Pp
The
.Nm sglist_count
-and
-.Nm sglist_count_vmpages
+family of
functions return a count of scatter/gather list elements.
.Pp
The
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4268,7 +4268,8 @@
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
-netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
+netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
+ compile-with "${NORMAL_C} ${NO_WNONNULL}"
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
netinet/tcp_subr.c optional inet | inet6
Index: sys/conf/kern.mk
===================================================================
--- sys/conf/kern.mk
+++ sys/conf/kern.mk
@@ -76,6 +76,7 @@
# GCC 4.2 doesn't have -Wno-error=cast-qual, so just disable the warning for
# the few files that are already known to generate cast-qual warnings.
NO_WCAST_QUAL= -Wno-cast-qual
+NO_WNONNULL= -Wno-nonnull
.endif
.endif
Index: sys/dev/cxgbe/t4_main.c
===================================================================
--- sys/dev/cxgbe/t4_main.c
+++ sys/dev/cxgbe/t4_main.c
@@ -1623,7 +1623,7 @@
#define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \
IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \
IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \
- IFCAP_HWRXTSTMP)
+ IFCAP_HWRXTSTMP | IFCAP_NOMAP)
#define T4_CAP_ENABLE (T4_CAP)
static int
@@ -1986,6 +1986,8 @@
rxq->iq.flags &= ~IQ_RX_TIMESTAMP;
}
}
+ if (mask & IFCAP_NOMAP)
+ ifp->if_capenable ^= IFCAP_NOMAP;
#ifdef VLAN_CAPABILITIES
VLAN_CAPABILITIES(ifp);
Index: sys/dev/cxgbe/t4_sge.c
===================================================================
--- sys/dev/cxgbe/t4_sge.c
+++ sys/dev/cxgbe/t4_sge.c
@@ -83,6 +83,7 @@
#endif
/* Internal mbuf flags stored in PH_loc.eight[1]. */
+#define MC_NOMAP 0x01
#define MC_RAW_WR 0x02
/*
@@ -2434,15 +2435,78 @@
return ((void *)p);
}
+static inline int
+count_mbuf_ext_pgs(struct mbuf *m, int skip, vm_paddr_t *nextaddr)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_paddr_t paddr;
+ int i, len, off, pglen, pgoff, seglen, segoff;
+ int nsegs = 0;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ off = mtod(m, vm_offset_t);
+ len = m->m_len;
+ off += skip;
+ len -= skip;
+
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ paddr = pmap_kextract(
+ (vm_offset_t)&ext_pgs->hdr[segoff]);
+ if (*nextaddr != paddr)
+ nsegs++;
+ *nextaddr = paddr + seglen;
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+ paddr = ext_pgs->pa[i] + segoff;
+ if (*nextaddr != paddr)
+ nsegs++;
+ *nextaddr = paddr + seglen;
+ pgoff = 0;
+ };
+ if (len != 0) {
+ seglen = min(len, ext_pgs->trail_len - off);
+ len -= seglen;
+ paddr = pmap_kextract((vm_offset_t)&ext_pgs->trail[off]);
+ if (*nextaddr != paddr)
+ nsegs++;
+ *nextaddr = paddr + seglen;
+ }
+
+ return (nsegs);
+}
+
+
/*
* Can deal with empty mbufs in the chain that have m_len = 0, but the chain
* must have at least one mbuf that's not empty. It is possible for this
* routine to return 0 if skip accounts for all the contents of the mbuf chain.
*/
static inline int
-count_mbuf_nsegs(struct mbuf *m, int skip)
+count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cflags)
{
- vm_paddr_t lastb, next;
+ vm_paddr_t nextaddr, paddr;
vm_offset_t va;
int len, nsegs;
@@ -2451,9 +2515,8 @@
MPASS(m->m_pkthdr.len >= skip);
nsegs = 0;
- lastb = 0;
+ nextaddr = 0;
for (; m; m = m->m_next) {
-
len = m->m_len;
if (__predict_false(len == 0))
continue;
@@ -2461,14 +2524,20 @@
skip -= len;
continue;
}
+ if ((m->m_flags & M_NOMAP) != 0) {
+ *cflags |= MC_NOMAP;
+ nsegs += count_mbuf_ext_pgs(m, skip, &nextaddr);
+ skip = 0;
+ continue;
+ }
va = mtod(m, vm_offset_t) + skip;
len -= skip;
skip = 0;
- next = pmap_kextract(va);
+ paddr = pmap_kextract(va);
nsegs += sglist_count((void *)(uintptr_t)va, len);
- if (lastb + 1 == next)
+ if (paddr == nextaddr)
nsegs--;
- lastb = pmap_kextract(va + len - 1);
+ nextaddr = pmap_kextract(va + len - 1) + 1;
}
return (nsegs);
@@ -2490,7 +2559,9 @@
struct tcphdr *tcp;
#endif
uint16_t eh_type;
+ uint8_t cflags;
+ cflags = 0;
M_ASSERTPKTHDR(m0);
if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
rc = EINVAL;
@@ -2506,7 +2577,7 @@
*/
M_ASSERTPKTHDR(m0);
MPASS(m0->m_pkthdr.len > 0);
- nsegs = count_mbuf_nsegs(m0, 0);
+ nsegs = count_mbuf_nsegs(m0, 0, &cflags);
if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
rc = EFBIG;
@@ -2516,7 +2587,8 @@
goto restart;
}
- if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
+ if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN &&
+ !(cflags & MC_NOMAP))) {
m0 = m_pullup(m0, m0->m_pkthdr.len);
if (m0 == NULL) {
/* Should have left well enough alone. */
@@ -2527,7 +2599,7 @@
goto restart;
}
set_mbuf_nsegs(m0, nsegs);
- set_mbuf_cflags(m0, 0);
+ set_mbuf_cflags(m0, cflags);
if (sc->flags & IS_VF)
set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
else
@@ -2616,7 +2688,9 @@
/* EO WRs have the headers in the WR and not the GL. */
immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen +
m0->m_pkthdr.l4hlen;
- nsegs = count_mbuf_nsegs(m0, immhdrs);
+ cflags = 0;
+ nsegs = count_mbuf_nsegs(m0, immhdrs, &cflags);
+ MPASS(cflags == mbuf_cflags(m0));
set_mbuf_eo_nsegs(m0, nsegs);
set_mbuf_eo_len16(m0,
txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0)));
@@ -4723,7 +4797,8 @@
ctrl = sizeof(struct cpl_tx_pkt_core);
if (needs_tso(m0))
ctrl += sizeof(struct cpl_tx_pkt_lso_core);
- else if (pktlen <= imm_payload(2) && available >= 2) {
+ else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
+ available >= 2) {
/* Immediate data. Recalculate len16 and set nsegs to 0. */
ctrl += pktlen;
len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
Index: sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- sys/dev/cxgbe/tom/t4_cpl_io.c
+++ sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -666,6 +666,8 @@
if (IS_AIOTX_MBUF(m))
rc = sglist_append_vmpages(&sg, aiotx_mbuf_pages(m),
aiotx_mbuf_pgoff(m), m->m_len);
+ else if (m->m_flags & M_NOMAP)
+ rc = sglist_append_mb_ext_pgs(&sg, m);
else
rc = sglist_append(&sg, mtod(m, void *), m->m_len);
if (__predict_false(rc != 0))
@@ -787,6 +789,8 @@
if (IS_AIOTX_MBUF(m))
n = sglist_count_vmpages(aiotx_mbuf_pages(m),
aiotx_mbuf_pgoff(m), m->m_len);
+ else if (m->m_flags & M_NOMAP)
+ n = sglist_count_mb_ext_pgs(m);
else
n = sglist_count(mtod(m, void *), m->m_len);
Index: sys/dev/mlx5/mlx5_en/mlx5_en_main.c
===================================================================
--- sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -3279,6 +3279,8 @@
"tso6 disabled due to -txcsum6.\n");
}
}
+ if (mask & IFCAP_NOMAP)
+ ifp->if_capenable ^= IFCAP_NOMAP;
if (mask & IFCAP_RXCSUM)
ifp->if_capenable ^= IFCAP_RXCSUM;
if (mask & IFCAP_RXCSUM_IPV6)
@@ -4145,6 +4147,7 @@
ifp->if_capabilities |= IFCAP_LRO;
ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
+ ifp->if_capabilities |= IFCAP_NOMAP;
ifp->if_capabilities |= IFCAP_TXRTLMT;
ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc;
ifp->if_snd_tag_free = mlx5e_snd_tag_free;
Index: sys/kern/kern_mbuf.c
===================================================================
--- sys/kern/kern_mbuf.c
+++ sys/kern/kern_mbuf.c
@@ -45,6 +45,7 @@
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
+#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@@ -111,6 +112,11 @@
int nmbjumbo9; /* limits number of 9k jumbo clusters */
int nmbjumbo16; /* limits number of 16k jumbo clusters */
+bool mb_use_ext_pgs; /* use EXT_PGS mbufs for sendfile */
+SYSCTL_BOOL(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLFLAG_RWTUN,
+ &mb_use_ext_pgs, 0,
+ "Use unmapped mbufs for sendfile(2)");
+
static quad_t maxmbufmem; /* overall real memory limit for all mbufs */
SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0,
@@ -281,6 +287,7 @@
uma_zone_t zone_jumbop;
uma_zone_t zone_jumbo9;
uma_zone_t zone_jumbo16;
+uma_zone_t zone_extpgs;
/*
* Local prototypes.
@@ -298,6 +305,9 @@
/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
+_Static_assert(sizeof(struct mbuf_ext_pgs) == 256,
+ "mbuf_ext_pgs size mismatch");
+
/*
* Initialize FreeBSD Network buffer allocation.
*/
@@ -379,6 +389,15 @@
uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
+ zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME,
+ sizeof(struct mbuf_ext_pgs),
+#ifdef INVARIANTS
+ trash_ctor, trash_dtor, trash_init, trash_fini,
+#else
+ NULL, NULL, NULL, NULL,
+#endif
+ UMA_ALIGN_CACHE, 0);
+
/*
* Hook event handler for low-memory situation, used to
* drain protocols and push data back to the caches (UMA
@@ -823,6 +842,391 @@
(*pr->pr_drain)();
}
+/*
+ * Free "count" units of I/O from an mbuf chain. They could be held
+ * in EXT_PGS or just as a normal mbuf. This code is intended to be
+ * called in an error path (I/O error, closed connection, etc).
+ */
+void
+mb_free_notready(struct mbuf *m, int count)
+{
+ int i;
+
+ for (i = 0; i < count && m != NULL; i++) {
+ if ((m->m_flags & M_EXT) != 0 &&
+ m->m_ext.ext_type == EXT_PGS) {
+ m->m_ext.ext_pgs->nrdy--;
+ if (m->m_ext.ext_pgs->nrdy != 0)
+ continue;
+ }
+ m = m_free(m);
+ }
+ KASSERT(i == count, ("Removed only %d items from %p", i, m));
+}
+
+/*
+ * Ensure it is possible to downgrade an EXT_PGS mbuf
+ * to a normal mbuf.
+ *
+ * XXXJHB: I think this is no longer needed? The callers of
+ * mb_unmapped_compress all check the length against MLEN, and
+ * mb_unmapped_compress allows data to be stored in unmapped pages.
+ */
+CTASSERT(MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN < MLEN);
+
+/*
+ * Compress an unmapped mbuf into a simple mbuf when it holds a small
+ * amount of data. This is used as a DOS defense to avoid having
+ * small packets tie up wired pages, an ext_pgs structure, and an
+ * mbuf. Since this converts the existing mbuf in place, it can only
+ * be used if there are no other references to 'm'.
+ */
+int
+mb_unmapped_compress(struct mbuf *m)
+{
+ volatile u_int *refcnt;
+ struct mbuf m_temp;
+
+ /*
+ * Assert that 'm' does not have a packet header. If 'm' had
+ * a packet header, it would only be able to hold MHLEN bytes
+ * and m_data would have to be initialized differently.
+ */
+ KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
+ m->m_ext.ext_type == EXT_PGS,
+ ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
+ KASSERT(m->m_data == 0, ("m_data != 0 %p", m));
+ KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
+
+ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+ refcnt = &m->m_ext.ext_count;
+ } else {
+ KASSERT(m->m_ext.ext_cnt != NULL,
+ ("%s: no refcounting pointer on %p", __func__, m));
+ refcnt = m->m_ext.ext_cnt;
+ }
+
+ if (*refcnt != 1)
+ return (EBUSY);
+
+ /*
+ * Copy m_ext portion of 'm' to 'm_temp' to create a "fake"
+ * EXT_PGS mbuf that can be used with m_copydata() as well as
+ * the ext_free callback.
+ */
+ memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext));
+ m_temp.m_next = NULL;
+ m_temp.m_nextpkt = NULL;
+
+ /* Turn 'm' into a "normal" mbuf. */
+ m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
+ m->m_data = m->m_dat;
+
+ /* Copy data from template's ext_pgs. */
+ m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t));
+
+ /* Free the backing pages. */
+ m_temp.m_ext.ext_free(&m_temp);
+
+ /* Finally, free the ext_pgs struct. */
+ uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs);
+ return (0);
+}
+
+/*
+ * These next few routines are used to permit downgrading an unmapped
+ * mbuf to a chain of mapped mbufs. This is used when an interface
+ * doesn't supported unmapped mbufs or if checksums need to be
+ * computed in software.
+ *
+ * Each unmapped mbuf is converted to a chain of mbufs. First, any
+ * TLS header data is stored in a regular mbuf. Second, each page of
+ * unmapped data is stored in an mbuf with an EXT_SFBUF external
+ * cluster. These mbufs use an sf_buf to provide a valid KVA for the
+ * associated physical page. They also hold a reference on the
+ * original EXT_PGS mbuf to ensure the physical page doesn't go away.
+ * Finally, any TLS trailer data is stored in a regular mbuf.
+ *
+ * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
+ * mbufs. It frees the associated sf_buf and releases its reference
+ * on the original EXT_PGS mbuf.
+ *
+ * _mb_unmapped_to_ext() is a helper function that converts a single
+ * unmapped mbuf into a chain of mbufs.
+ *
+ * mb_unmapped_to_ext() is the public function that walks an mbuf
+ * chain converting any unmapped mbufs to mapped mbufs. It returns
+ * the new chain of mapped mbufs on success. On failure it frees
+ * the original mbuf chain and returns NULL.
+ */
+static void
+mb_unmapped_free_mext(struct mbuf *m)
+{
+ struct sf_buf *sf;
+ struct mbuf *old_m;
+
+ sf = m->m_ext.ext_arg1;
+ sf_buf_free(sf);
+
+ /* Drop the reference on the backing EXT_PGS mbuf. */
+ old_m = m->m_ext.ext_arg2;
+ mb_free_ext(old_m);
+}
+
+static struct mbuf *
+_mb_unmapped_to_ext(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ struct mbuf *m_new, *top, *prev, *mref;
+ struct sf_buf *sf;
+ vm_page_t pg;
+ int i, len, off, pglen, pgoff, seglen, segoff;
+ volatile u_int *refcnt;
+ u_int ref_inc = 0;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ len = m->m_len;
+ KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p",
+ __func__, m));
+
+ /* See if this is the mbuf that holds the embedded refcount. */
+ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+ refcnt = &m->m_ext.ext_count;
+ mref = m;
+ } else {
+ KASSERT(m->m_ext.ext_cnt != NULL,
+ ("%s: no refcounting pointer on %p", __func__, m));
+ refcnt = m->m_ext.ext_cnt;
+ mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
+ }
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ top = NULL;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ m_new->m_len = seglen;
+ prev = top = m_new;
+ memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff],
+ seglen);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ if (top == NULL) {
+ top = prev = m_new;
+ } else {
+ prev->m_next = m_new;
+ prev = m_new;
+ }
+ sf = sf_buf_alloc(pg, SFB_NOWAIT);
+ if (sf == NULL)
+ goto fail;
+
+ ref_inc++;
+ m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
+ mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
+ m_new->m_data += segoff;
+ m_new->m_len = seglen;
+
+ pgoff = 0;
+ };
+ if (len != 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d)", off, len,
+ ext_pgs->trail_len));
+ m_new = m_get(M_NOWAIT, MT_DATA);
+ if (m_new == NULL)
+ goto fail;
+ if (top == NULL)
+ top = m_new;
+ else
+ prev->m_next = m_new;
+ m_new->m_len = len;
+ memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len);
+ }
+
+ if (ref_inc != 0) {
+ /*
+ * Obtain an additional reference on the old mbuf for
+ * each created EXT_SFBUF mbuf. They will be dropped
+ * in mb_unmapped_free_mext().
+ */
+ if (*refcnt == 1)
+ *refcnt += ref_inc;
+ else
+ atomic_add_int(refcnt, ref_inc);
+ }
+ m_free(m);
+ return (top);
+
+fail:
+ if (ref_inc != 0) {
+ /*
+ * Obtain an additional reference on the old mbuf for
+ * each created EXT_SFBUF mbuf. They will be
+ * immediately dropped when these mbufs are freed
+ * below.
+ */
+ if (*refcnt == 1)
+ *refcnt += ref_inc;
+ else
+ atomic_add_int(refcnt, ref_inc);
+ }
+ m_free(m);
+ m_freem(top);
+ return (NULL);
+}
+
+struct mbuf *
+mb_unmapped_to_ext(struct mbuf *top)
+{
+ struct mbuf *m, *next, *prev = NULL;
+
+ prev = NULL;
+ for (m = top; m != NULL; m = next) {
+ /* m might be freed, so cache the next pointer. */
+ next = m->m_next;
+ if (m->m_flags & M_NOMAP) {
+ if (prev != NULL) {
+ /*
+ * Remove 'm' from the new chain so
+ * that the 'top' chain terminates
+ * before 'm' in case 'top' is freed
+ * due to an error.
+ */
+ prev->m_next = NULL;
+ }
+ m = _mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ m_freem(top);
+ m_freem(next);
+ return (NULL);
+ }
+ if (prev == NULL) {
+ top = m;
+ } else {
+ prev->m_next = m;
+ }
+
+ /*
+ * Replaced one mbuf with a chain, so we must
+ * find the end of chain.
+ */
+ prev = m_last(m);
+ } else {
+ if (prev != NULL) {
+ prev->m_next = m;
+ }
+ prev = m;
+ }
+ }
+ return (top);
+}
+
+/*
+ * Allocate an empty EXT_PGS mbuf. The ext_free routine is
+ * responsible for freeing any pages backing this mbuf when it is
+ * freed.
+ */
+struct mbuf *
+mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free)
+{
+ struct mbuf *m;
+ struct mbuf_ext_pgs *ext_pgs;
+
+ if (pkthdr)
+ m = m_gethdr(how, MT_DATA);
+ else
+ m = m_get(how, MT_DATA);
+ if (m == NULL)
+ return (NULL);
+
+ ext_pgs = uma_zalloc(zone_extpgs, how);
+ if (ext_pgs == NULL) {
+ m_free(m);
+ return (NULL);
+ }
+ ext_pgs->npgs = 0;
+ ext_pgs->nrdy = 0;
+ ext_pgs->first_pg_off = 0;
+ ext_pgs->last_pg_len = 0;
+ ext_pgs->hdr_len = 0;
+ ext_pgs->trail_len = 0;
+ ext_pgs->tls = NULL;
+ ext_pgs->so = NULL;
+ m->m_data = NULL;
+ m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP);
+ m->m_ext.ext_type = EXT_PGS;
+ m->m_ext.ext_flags = EXT_FLAG_EMBREF;
+ m->m_ext.ext_count = 1;
+ m->m_ext.ext_pgs = ext_pgs;
+ m->m_ext.ext_size = 0;
+ m->m_ext.ext_free = ext_free;
+ return (m);
+}
+
+#ifdef INVARIANT_SUPPORT
+void
+mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs)
+{
+
+ /*
+ * NB: This expects a non-empty buffer (npgs > 0 and
+ * last_pg_len > 0).
+ */
+ KASSERT(ext_pgs->npgs > 0,
+ ("ext_pgs with no valid pages: %p", ext_pgs));
+ KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa),
+ ("ext_pgs with too many pages: %p", ext_pgs));
+ KASSERT(ext_pgs->nrdy <= ext_pgs->npgs,
+ ("ext_pgs with too many ready pages: %p", ext_pgs));
+ KASSERT(ext_pgs->first_pg_off < PAGE_SIZE,
+ ("ext_pgs with too large page offset: %p", ext_pgs));
+ KASSERT(ext_pgs->last_pg_len > 0,
+ ("ext_pgs with zero last page length: %p", ext_pgs));
+ KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE,
+ ("ext_pgs with too large last page length: %p", ext_pgs));
+ if (ext_pgs->npgs == 1) {
+ KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <=
+ PAGE_SIZE, ("ext_pgs with single page too large: %p",
+ ext_pgs));
+ }
+ KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr),
+ ("ext_pgs with too large header length: %p", ext_pgs));
+ KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail),
+	    ("ext_pgs with too large trailer length: %p", ext_pgs));
+}
+#endif
+
/*
* Clean up after mbufs with M_EXT storage attached to them if the
* reference count hits 1.
@@ -888,6 +1292,10 @@
uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
uma_zfree(zone_mbuf, mref);
break;
+ case EXT_PGS:
+ uma_zfree(zone_extpgs, mref->m_ext.ext_pgs);
+ uma_zfree(zone_mbuf, mref);
+ break;
case EXT_SFBUF:
case EXT_NET_DRV:
case EXT_MOD_TYPE:
Index: sys/kern/kern_sendfile.c
===================================================================
--- sys/kern/kern_sendfile.c
+++ sys/kern/kern_sendfile.c
@@ -34,6 +34,7 @@
#include <sys/systm.h>
#include <sys/capsicum.h>
#include <sys/kernel.h>
+#include <netinet/in.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
@@ -62,6 +63,7 @@
#define EXT_FLAG_SYNC EXT_FLAG_VENDOR1
#define EXT_FLAG_NOCACHE EXT_FLAG_VENDOR2
+#define EXT_FLAG_CACHE_LAST EXT_FLAG_VENDOR3
/*
* Structure describing a single sendfile(2) I/O, which may consist of
@@ -201,6 +203,39 @@
}
}
+static void
+sendfile_free_mext_pg(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_page_t pg;
+ int i;
+ bool nocache, cache_last;
+
+ KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS,
+ ("%s: m %p !M_EXT or !EXT_PGS", __func__, m));
+
+ nocache = m->m_ext.ext_flags & EXT_FLAG_NOCACHE;
+ cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST;
+ ext_pgs = m->m_ext.ext_pgs;
+
+ for (i = 0; i < ext_pgs->npgs; i++) {
+ if (cache_last && i == ext_pgs->npgs - 1)
+ nocache = false;
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ sendfile_free_page(pg, nocache);
+ }
+
+ if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
+ struct sendfile_sync *sfs = m->m_ext.ext_arg2;
+
+ mtx_lock(&sfs->mtx);
+ KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
+ if (--sfs->count == 0)
+ cv_signal(&sfs->cv);
+ mtx_unlock(&sfs->mtx);
+ }
+}
+
/*
* Helper function to calculate how much data to put into page i of n.
* Only first and last pages are special.
@@ -283,8 +318,6 @@
CURVNET_SET(so->so_vnet);
if (sfio->error) {
- struct mbuf *m;
-
/*
* I/O operation failed. The state of data in the socket
* is now inconsistent, and all what we can do is to tear
@@ -299,11 +332,9 @@
so->so_proto->pr_usrreqs->pru_abort(so);
so->so_error = EIO;
- m = sfio->m;
- for (int i = 0; i < sfio->npages; i++)
- m = m_free(m);
+ mb_free_notready(sfio->m, sfio->npages);
} else
- (void )(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
+ (void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
sfio->npages);
SOCK_LOCK(so);
@@ -540,13 +571,15 @@
struct vnode *vp;
struct vm_object *obj;
struct socket *so;
+ struct mbuf_ext_pgs *ext_pgs;
struct mbuf *m, *mh, *mhtail;
struct sf_buf *sf;
struct shmfd *shmfd;
struct sendfile_sync *sfs;
struct vattr va;
off_t off, sbytes, rem, obj_size;
- int error, softerr, bsize, hdrlen;
+ int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
+ bool use_ext_pgs;
obj = NULL;
so = NULL;
@@ -554,6 +587,7 @@
sfs = NULL;
hdrlen = sbytes = 0;
softerr = 0;
+ use_ext_pgs = false;
error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
if (error != 0)
@@ -714,6 +748,17 @@
if (space > rem)
space = rem;
+ else if (space > PAGE_SIZE) {
+ /*
+ * Use page boundaries when possible for large
+ * requests.
+ */
+ if (off & PAGE_MASK)
+ space -= (PAGE_SIZE - (off & PAGE_MASK));
+ space = trunc_page(space);
+ if (off & PAGE_MASK)
+ space += (PAGE_SIZE - (off & PAGE_MASK));
+ }
npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
@@ -751,6 +796,22 @@
* dumped into socket buffer.
*/
pa = sfio->pa;
+
+ /*
+ * Use unmapped mbufs if enabled for TCP. Unmapped
+ * bufs are restricted to TCP as that is what has been
+ * tested. In particular, unmapped mbufs have not
+ * been tested with UNIX-domain sockets.
+ */
+ if (mb_use_ext_pgs &&
+ so->so_proto->pr_protocol == IPPROTO_TCP) {
+ use_ext_pgs = true;
+ max_pgs = MBUF_PEXT_MAX_PGS;
+
+ /* Start at last index, to wrap on first use. */
+ ext_pgs_idx = max_pgs - 1;
+ }
+
for (int i = 0; i < npages; i++) {
struct mbuf *m0;
@@ -766,6 +827,66 @@
break;
}
+ if (use_ext_pgs) {
+ off_t xfs;
+
+ ext_pgs_idx++;
+ if (ext_pgs_idx == max_pgs) {
+ m0 = mb_alloc_ext_pgs(M_WAITOK, false,
+ sendfile_free_mext_pg);
+
+ if (flags & SF_NOCACHE) {
+ m0->m_ext.ext_flags |=
+ EXT_FLAG_NOCACHE;
+
+ /*
+ * See comment below regarding
+ * ignoring SF_NOCACHE for the
+ * last page.
+ */
+ if ((npages - i <= max_pgs) &&
+ ((off + space) & PAGE_MASK) &&
+ (rem > space || rhpages > 0))
+ m0->m_ext.ext_flags |=
+ EXT_FLAG_CACHE_LAST;
+ }
+ if (sfs != NULL) {
+ m0->m_ext.ext_flags |=
+ EXT_FLAG_SYNC;
+ m0->m_ext.ext_arg2 = sfs;
+ mtx_lock(&sfs->mtx);
+ sfs->count++;
+ mtx_unlock(&sfs->mtx);
+ }
+ ext_pgs = m0->m_ext.ext_pgs;
+ if (i == 0)
+ sfio->m = m0;
+ ext_pgs_idx = 0;
+
+ /* Append to mbuf chain. */
+ if (mtail != NULL)
+ mtail->m_next = m0;
+ else
+ m = m0;
+ mtail = m0;
+ ext_pgs->first_pg_off =
+ vmoff(i, off) & PAGE_MASK;
+ }
+ if (nios) {
+ mtail->m_flags |= M_NOTREADY;
+ ext_pgs->nrdy++;
+ }
+
+ ext_pgs->pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pa[i]);
+ ext_pgs->npgs++;
+ xfs = xfsize(i, npages, off, space);
+ ext_pgs->last_pg_len = xfs;
+ MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs);
+ mtail->m_len += xfs;
+ mtail->m_ext.ext_size += PAGE_SIZE;
+ continue;
+ }
+
/*
* Get a sendfile buf. When allocating the
* first buffer for mbuf chain, we usually
Index: sys/kern/subr_bus_dma.c
===================================================================
--- sys/kern/subr_bus_dma.c
+++ sys/kern/subr_bus_dma.c
@@ -110,6 +110,67 @@
return (error);
}
+/*
+ * Load an unmapped mbuf
+ */
+static int
+_bus_dmamap_load_unmapped_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+ struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ int error, i, off, len, pglen, pgoff, seglen, segoff;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+
+ len = m->m_len;
+ error = 0;
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = _bus_dmamap_load_buffer(dmat, map,
+ &ext_pgs->hdr[segoff], seglen, kernel_pmap,
+ flags, segs, nsegs);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+ error = _bus_dmamap_load_phys(dmat, map,
+ ext_pgs->pa[i] + segoff, seglen, flags, segs, nsegs);
+ pgoff = 0;
+ };
+ if (len != 0 && error == 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d)", off, len,
+ ext_pgs->trail_len));
+ error = _bus_dmamap_load_buffer(dmat, map,
+ &ext_pgs->trail[off], len, kernel_pmap, flags, segs,
+ nsegs);
+ }
+ return (error);
+}
+
/*
* Load an mbuf chain.
*/
@@ -123,9 +184,13 @@
error = 0;
for (m = m0; m != NULL && error == 0; m = m->m_next) {
if (m->m_len > 0) {
- error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
- m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
- segs, nsegs);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = _bus_dmamap_load_unmapped_mbuf_sg(dmat,
+ map, m, segs, nsegs, flags);
+ else
+ error = _bus_dmamap_load_buffer(dmat, map,
+ m->m_data, m->m_len, kernel_pmap,
+ flags | BUS_DMA_LOAD_MBUF, segs, nsegs);
}
}
CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
Index: sys/kern/subr_sglist.c
===================================================================
--- sys/kern/subr_sglist.c
+++ sys/kern/subr_sglist.c
@@ -218,6 +218,75 @@
return (nsegs);
}
+/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe an EXT_PGS buffer.
+ */
+int
+sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, size_t len)
+{
+ vm_paddr_t nextaddr, paddr;
+ size_t seglen, segoff;
+ int i, nsegs, pglen, pgoff;
+
+ if (len == 0)
+ return (0);
+
+ nsegs = 0;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = MIN(seglen, len);
+ off = 0;
+ len -= seglen;
+ nsegs += sglist_count(&ext_pgs->hdr[segoff], seglen);
+ }
+ }
+ nextaddr = 0;
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = MIN(seglen, len);
+ len -= seglen;
+ paddr = ext_pgs->pa[i] + segoff;
+ if (paddr != nextaddr)
+ nsegs++;
+ nextaddr = paddr + seglen;
+ pgoff = 0;
+ }
+ if (len != 0) {
+ seglen = MIN(len, ext_pgs->trail_len - off);
+ len -= seglen;
+ nsegs += sglist_count(&ext_pgs->trail[off], seglen);
+ }
+ KASSERT(len == 0, ("len != 0"));
+ return (nsegs);
+}
+
+/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe an EXT_PGS mbuf.
+ */
+int
+sglist_count_mb_ext_pgs(struct mbuf *m)
+{
+
+ MBUF_EXT_PGS_ASSERT(m);
+ return (sglist_count_ext_pgs(m->m_ext.ext_pgs, mtod(m, vm_offset_t),
+ m->m_len));
+}
+
/*
* Allocate a scatter/gather list along with 'nsegs' segments. The
* 'mflags' parameters are the same as passed to malloc(9). The caller
@@ -319,6 +388,76 @@
return (error);
}
+/*
+ * Append the segments to describe an EXT_PGS buffer to a
+ * scatter/gather list. If there are insufficient segments, then this
+ * fails with EFBIG.
+ */
+int
+sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
+ size_t off, size_t len)
+{
+ size_t seglen, segoff;
+ vm_paddr_t paddr;
+ int error, i, pglen, pgoff;
+
+ error = 0;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = MIN(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = sglist_append(sg,
+ &ext_pgs->hdr[segoff], seglen);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = MIN(seglen, len);
+ len -= seglen;
+ paddr = ext_pgs->pa[i] + segoff;
+ error = sglist_append_phys(sg, paddr, seglen);
+ pgoff = 0;
+ }
+ if (error == 0 && len > 0) {
+ seglen = MIN(len, ext_pgs->trail_len - off);
+ len -= seglen;
+ error = sglist_append(sg,
+ &ext_pgs->trail[off], seglen);
+ }
+ if (error == 0)
+ KASSERT(len == 0, ("len != 0"));
+ return (error);
+}
+
+/*
+ * Append the segments to describe an EXT_PGS mbuf to a scatter/gather
+ * list. If there are insufficient segments, then this fails with
+ * EFBIG.
+ */
+int
+sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m)
+{
+
+ /* for now, all unmapped mbufs are assumed to be EXT_PGS */
+ MBUF_EXT_PGS_ASSERT(m);
+ return (sglist_append_ext_pgs(sg, m->m_ext.ext_pgs,
+ mtod(m, vm_offset_t), m->m_len));
+}
+
/*
* Append the segments that describe a single mbuf chain to a
* scatter/gather list. If there are insufficient segments, then this
@@ -338,7 +477,11 @@
SGLIST_SAVE(sg, save);
for (m = m0; m != NULL; m = m->m_next) {
if (m->m_len > 0) {
- error = sglist_append(sg, m->m_data, m->m_len);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = sglist_append_mb_ext_pgs(sg, m);
+ else
+ error = sglist_append(sg, m->m_data,
+ m->m_len);
if (error) {
SGLIST_RESTORE(sg, save);
return (error);
Index: sys/kern/uipc_mbuf.c
===================================================================
--- sys/kern/uipc_mbuf.c
+++ sys/kern/uipc_mbuf.c
@@ -50,6 +50,10 @@
#include <sys/protosw.h>
#include <sys/uio.h>
#include <sys/sdt.h>
+#include <vm/vm.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
+#include <sys/vmmeter.h>
SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
"struct mbuf *", "mbufinfo_t *",
@@ -202,7 +206,7 @@
else
bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
n->m_flags |= M_EXT;
- n->m_flags |= m->m_flags & M_RDONLY;
+ n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP);
/* See if this is the mbuf that holds the embedded refcount. */
if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
@@ -246,7 +250,8 @@
__func__, m, m0));
if (m->m_flags & M_PKTHDR)
m_demote_pkthdr(m);
- m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
+ m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE |
+ M_NOMAP | flags);
}
}
@@ -376,7 +381,8 @@
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
- to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ to->m_flags = (from->m_flags & M_COPYFLAGS) |
+ (to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr; /* especially tags */
@@ -414,7 +420,8 @@
if (to->m_flags & M_PKTHDR)
m_tag_delete_chain(to, NULL);
#endif
- to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+ to->m_flags = (from->m_flags & M_COPYFLAGS) |
+ (to->m_flags & (M_EXT | M_NOMAP));
if ((to->m_flags & M_EXT) == 0)
to->m_data = to->m_pktdat;
to->m_pkthdr = from->m_pkthdr;
@@ -579,6 +586,30 @@
return (NULL);
}
+static void
+m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
+{
+ struct iovec iov;
+ struct uio uio;
+ int error;
+
+ KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
+ KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
+ KASSERT(off < m->m_len,
+ ("m_copyfromunmapped: off exceeds mbuf length"));
+ iov.iov_base = cp;
+ iov.iov_len = len;
+ uio.uio_resid = len;
+ uio.uio_iov = &iov;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_rw = UIO_READ;
+ error = m_unmappedtouio(m, off, &uio, len);
+ KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off,
+ len));
+}
+
/*
* Copy data from an mbuf chain starting "off" bytes from the beginning,
* continuing for "len" bytes, into the indicated buffer.
@@ -600,7 +631,10 @@
while (len > 0) {
KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain"));
count = min(m->m_len - off, len);
- bcopy(mtod(m, caddr_t) + off, cp, count);
+ if ((m->m_flags & M_NOMAP) != 0)
+ m_copyfromunmapped(m, off, count, cp);
+ else
+ bcopy(mtod(m, caddr_t) + off, cp, count);
len -= count;
cp += count;
off = 0;
@@ -695,6 +729,7 @@
m = m->m_next;
while (n) {
if (!M_WRITABLE(m) ||
+ (n->m_flags & M_NOMAP) != 0 ||
M_TRAILINGSPACE(m) < n->m_len) {
/* just join the two chains */
m->m_next = n;
@@ -812,6 +847,9 @@
int count;
int space;
+ KASSERT((n->m_flags & M_NOMAP) == 0,
+ ("%s: unmapped mbuf %p", __func__, n));
+
/*
* If first mbuf has no cluster, and has room for len bytes
* without shifting current data, pullup into it,
@@ -1364,6 +1402,41 @@
return (NULL);
}
+/*
+ * Return the number of fragments an mbuf will use. This is usually
+ * used as a proxy for the number of scatter/gather elements needed by
+ * a DMA engine to access an mbuf. In general mapped mbufs are
+ * assumed to be backed by physically contiguous buffers that only
+ * need a single fragment. Unmapped mbufs, on the other hand, can
+ * span disjoint physical pages.
+ */
+static int
+frags_per_mbuf(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ int frags;
+
+ if ((m->m_flags & M_NOMAP) == 0)
+ return (1);
+
+ /*
+ * The header and trailer are counted as a single fragment
+ * each when present.
+ *
+ * XXX: This overestimates the number of fragments by assuming
+ * all the backing physical pages are disjoint.
+ */
+ ext_pgs = m->m_ext.ext_pgs;
+ frags = 0;
+ if (ext_pgs->hdr_len != 0)
+ frags++;
+ frags += ext_pgs->npgs;
+ if (ext_pgs->trail_len != 0)
+ frags++;
+
+ return (frags);
+}
+
/*
* Defragment an mbuf chain, returning at most maxfrags separate
* mbufs+clusters. If this is not possible NULL is returned and
@@ -1384,7 +1457,7 @@
*/
curfrags = 0;
for (m = m0; m != NULL; m = m->m_next)
- curfrags++;
+ curfrags += frags_per_mbuf(m);
/*
* First, try to collapse mbufs. Note that we always collapse
* towards the front so we don't need to deal with moving the
@@ -1399,12 +1472,13 @@
break;
if (M_WRITABLE(m) &&
n->m_len < M_TRAILINGSPACE(m)) {
- bcopy(mtod(n, void *), mtod(m, char *) + m->m_len,
- n->m_len);
+ m_copydata(n, 0, n->m_len,
+ mtod(m, char *) + m->m_len);
m->m_len += n->m_len;
m->m_next = n->m_next;
+ curfrags -= frags_per_mbuf(n);
m_free(n);
- if (--curfrags <= maxfrags)
+ if (curfrags <= maxfrags)
return m0;
} else
m = n;
@@ -1421,15 +1495,18 @@
m = m_getcl(how, MT_DATA, 0);
if (m == NULL)
goto bad;
- bcopy(mtod(n, void *), mtod(m, void *), n->m_len);
- bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len,
- n2->m_len);
+ m_copydata(n, 0, n->m_len, mtod(m, char *));
+ m_copydata(n2, 0, n2->m_len,
+ mtod(m, char *) + n->m_len);
m->m_len = n->m_len + n2->m_len;
m->m_next = n2->m_next;
*prev = m;
+ curfrags += 1; /* For the new cluster */
+ curfrags -= frags_per_mbuf(n);
+ curfrags -= frags_per_mbuf(n2);
m_free(n);
m_free(n2);
- if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */
+ if (curfrags <= maxfrags)
return m0;
/*
* Still not there, try the normal collapse
@@ -1529,6 +1606,111 @@
#endif
+/*
+ * Free pages from mbuf_ext_pgs, assuming they were allocated via
+ * vm_page_alloc() and aren't associated with any object. Complement
+ * to allocator from m_uiotombuf_nomap().
+ */
+void
+mb_free_mext_pgs(struct mbuf *m)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_page_t pg;
+ int wire_adj;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ wire_adj = 0;
+ for (int i = 0; i < ext_pgs->npgs; i++) {
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ /*
+ * Note: page is not locked, as it has no
+ * object and is not on any queues.
+ */
+ vm_page_free_toq(pg);
+ wire_adj++;
+ }
+ if (wire_adj)
+ vm_wire_sub(wire_adj);
+}
+
+static struct mbuf *
+m_uiotombuf_nomap(struct uio *uio, int how, int len, int maxseg, int flags)
+{
+ struct mbuf *m, *mb, *prev;
+ struct mbuf_ext_pgs *pgs;
+ vm_page_t pg_array[MBUF_PEXT_MAX_PGS];
+ int error, length, i, needed, wire_adj = 0;
+ ssize_t total;
+ int pflags = malloc2vm_flags(how) | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP;
+
+ /*
+ * len can be zero or an arbitrary large value bound by
+ * the total data supplied by the uio.
+ */
+ if (len > 0)
+ total = MIN(uio->uio_resid, len);
+ else
+ total = uio->uio_resid;
+
+ if (maxseg == 0)
+ maxseg = MBUF_PEXT_MAX_PGS * PAGE_SIZE;
+
+ /*
+ * Allocate the pages
+ */
+ m = NULL;
+ while (total > 0) {
+ mb = mb_alloc_ext_pgs(how, (flags & M_PKTHDR),
+ mb_free_mext_pgs);
+ if (mb == NULL)
+ goto failed;
+ if (m == NULL)
+ m = mb;
+ else
+ prev->m_next = mb;
+ prev = mb;
+ pgs = mb->m_ext.ext_pgs;
+ needed = length = MIN(maxseg, total);
+ for (i = 0; needed > 0; i++, needed -= PAGE_SIZE) {
+retry_page:
+ pg_array[i] = vm_page_alloc(NULL, 0, pflags);
+ if (pg_array[i] == NULL) {
+ if (wire_adj)
+ vm_wire_add(wire_adj);
+ wire_adj = 0;
+ if (how & M_NOWAIT) {
+ goto failed;
+ } else {
+ vm_wait(NULL);
+ goto retry_page;
+ }
+ }
+ wire_adj++;
+ pg_array[i]->flags &= ~PG_ZERO;
+ pgs->pa[i] = VM_PAGE_TO_PHYS(pg_array[i]);
+ pgs->npgs++;
+ }
+ pgs->last_pg_len = length - PAGE_SIZE * (pgs->npgs - 1);
+ MBUF_EXT_PGS_ASSERT_SANITY(pgs);
+ vm_wire_add(wire_adj);
+ wire_adj = 0;
+ total -= length;
+ error = uiomove_fromphys(pg_array, 0, length, uio);
+ if (error != 0)
+ goto failed;
+ mb->m_len = length;
+ mb->m_ext.ext_size += PAGE_SIZE * pgs->npgs;
+ if (flags & M_PKTHDR)
+ m->m_pkthdr.len += length;
+ }
+ return (m);
+
+failed:
+ m_freem(m);
+ return (NULL);
+}
+
/*
* Copy the contents of uio into a properly sized mbuf chain.
*/
@@ -1540,6 +1722,9 @@
ssize_t total;
int progress = 0;
+ if (flags & M_NOMAP)
+ return (m_uiotombuf_nomap(uio, how, len, align, flags));
+
/*
* len can be zero or an arbitrary large value bound by
* the total data supplied by the uio.
@@ -1585,6 +1770,62 @@
return (m);
}
+/*
+ * Copy data from an unmapped mbuf into a uio limited by len if set.
+ */
+int
+m_unmappedtouio(const struct mbuf *m, int m_off, struct uio *uio, int len)
+{
+ struct mbuf_ext_pgs *ext_pgs;
+ vm_page_t pg;
+ int error, i, off, pglen, pgoff, seglen, segoff;
+
+ MBUF_EXT_PGS_ASSERT(m);
+ ext_pgs = m->m_ext.ext_pgs;
+ error = 0;
+
+ /* Skip over any data removed from the front. */
+ off = mtod(m, vm_offset_t);
+
+ off += m_off;
+ if (ext_pgs->hdr_len != 0) {
+ if (off >= ext_pgs->hdr_len) {
+ off -= ext_pgs->hdr_len;
+ } else {
+ seglen = ext_pgs->hdr_len - off;
+ segoff = off;
+ seglen = min(seglen, len);
+ off = 0;
+ len -= seglen;
+ error = uiomove(&ext_pgs->hdr[segoff], seglen, uio);
+ }
+ }
+ pgoff = ext_pgs->first_pg_off;
+ for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+ pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+ if (off >= pglen) {
+ off -= pglen;
+ pgoff = 0;
+ continue;
+ }
+ seglen = pglen - off;
+ segoff = pgoff + off;
+ off = 0;
+ seglen = min(seglen, len);
+ len -= seglen;
+ pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+ error = uiomove_fromphys(&pg, segoff, seglen, uio);
+ pgoff = 0;
+ }
+ if (len != 0 && error == 0) {
+ KASSERT((off + len) <= ext_pgs->trail_len,
+ ("off + len > trail (%d + %d > %d, m_off = %d)", off, len,
+ ext_pgs->trail_len, m_off));
+ error = uiomove(&ext_pgs->trail[off], len, uio);
+ }
+ return (error);
+}
+
/*
* Copy an mbuf chain into a uio limited by len if set.
*/
@@ -1603,7 +1844,10 @@
for (; m != NULL; m = m->m_next) {
length = min(m->m_len, total - progress);
- error = uiomove(mtod(m, void *), length, uio);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = m_unmappedtouio(m, 0, uio, length);
+ else
+ error = uiomove(mtod(m, void *), length, uio);
if (error)
return (error);
Index: sys/kern/uipc_sockbuf.c
===================================================================
--- sys/kern/uipc_sockbuf.c
+++ sys/kern/uipc_sockbuf.c
@@ -89,28 +89,130 @@
}
/*
- * Mark ready "count" mbufs starting with "m".
+ * Compress M_NOTREADY mbufs after they have been readied by sbready().
+ *
+ * sbcompress() skips M_NOTREADY mbufs since the data is not available to
+ * be copied at the time of sbcompress(). This function combines small
+ * mbufs similar to sbcompress() once mbufs are ready. 'm0' is the first
+ * mbuf sbready() marked ready, and 'end' is the first mbuf still not
+ * ready.
+ */
+static void
+sbready_compress(struct sockbuf *sb, struct mbuf *m0, struct mbuf *end)
+{
+ struct mbuf *m, *n;
+ int ext_size;
+
+ SOCKBUF_LOCK_ASSERT(sb);
+
+ if ((sb->sb_flags & SB_NOCOALESCE) != 0)
+ return;
+
+ for (m = m0; m != end; m = m->m_next) {
+ MPASS((m->m_flags & M_NOTREADY) == 0);
+
+ /* Compress small unmapped mbufs into plain mbufs. */
+ if ((m->m_flags & M_NOMAP) && m->m_len <= MLEN) {
+ MPASS(m->m_flags & M_EXT);
+ ext_size = m->m_ext.ext_size;
+ if (mb_unmapped_compress(m) == 0) {
+ sb->sb_mbcnt -= ext_size;
+ sb->sb_ccnt -= 1;
+ }
+ }
+
+ /*
+ * NB: In sbcompress(), 'n' is the last mbuf in the
+ * socket buffer and 'm' is the new mbuf being copied
+ * into the trailing space of 'n'. Here, the roles
+ * are reversed and 'n' is the next mbuf after 'm'
+ * that is being copied into the trailing space of
+ * 'm'.
+ */
+ n = m->m_next;
+ while ((n != NULL) && (n != end) && (m->m_flags & M_EOR) == 0 &&
+ M_WRITABLE(m) &&
+ (m->m_flags & M_NOMAP) == 0 &&
+ n->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+ n->m_len <= M_TRAILINGSPACE(m) &&
+ m->m_type == n->m_type) {
+ KASSERT(sb->sb_lastrecord != n,
+ ("%s: merging start of record (%p) into previous mbuf (%p)",
+ __func__, n, m));
+ m_copydata(n, 0, n->m_len, mtodo(m, m->m_len));
+ m->m_len += n->m_len;
+ m->m_next = n->m_next;
+ m->m_flags |= n->m_flags & M_EOR;
+ if (sb->sb_mbtail == n)
+ sb->sb_mbtail = m;
+
+ sb->sb_mbcnt -= MSIZE;
+ sb->sb_mcnt -= 1;
+ if (n->m_flags & M_EXT) {
+ sb->sb_mbcnt -= n->m_ext.ext_size;
+ sb->sb_ccnt -= 1;
+ }
+ m_free(n);
+ n = m->m_next;
+ }
+ }
+ SBLASTRECORDCHK(sb);
+ SBLASTMBUFCHK(sb);
+}
+
+/*
+ * Mark ready "count" units of I/O starting with "m". Most mbufs
+ * count as a single unit of I/O except for EXT_PGS-backed mbufs which
+ * can be backed by multiple pages.
*/
int
-sbready(struct sockbuf *sb, struct mbuf *m, int count)
+sbready(struct sockbuf *sb, struct mbuf *m0, int count)
{
+ struct mbuf *m;
u_int blocker;
SOCKBUF_LOCK_ASSERT(sb);
KASSERT(sb->sb_fnrdy != NULL, ("%s: sb %p NULL fnrdy", __func__, sb));
+ KASSERT(count > 0, ("%s: invalid count %d", __func__, count));
+ m = m0;
blocker = (sb->sb_fnrdy == m) ? M_BLOCKED : 0;
- for (int i = 0; i < count; i++, m = m->m_next) {
+ while (count > 0) {
KASSERT(m->m_flags & M_NOTREADY,
("%s: m %p !M_NOTREADY", __func__, m));
+ if ((m->m_flags & M_EXT) != 0 &&
+ m->m_ext.ext_type == EXT_PGS) {
+ if (count < m->m_ext.ext_pgs->nrdy) {
+ m->m_ext.ext_pgs->nrdy -= count;
+ count = 0;
+ break;
+ }
+ count -= m->m_ext.ext_pgs->nrdy;
+ m->m_ext.ext_pgs->nrdy = 0;
+ } else
+ count--;
+
m->m_flags &= ~(M_NOTREADY | blocker);
if (blocker)
sb->sb_acc += m->m_len;
+ m = m->m_next;
}
- if (!blocker)
+ /*
+ * If the first mbuf is still not fully ready because only
+ * some of its backing pages were readied, no further progress
+ * can be made.
+ */
+ if (m0 == m) {
+ MPASS(m->m_flags & M_NOTREADY);
return (EINPROGRESS);
+ }
+
+ if (!blocker) {
+ sbready_compress(sb, m0, m);
+ return (EINPROGRESS);
+ }
/* This one was blocking all the queue. */
for (; m && (m->m_flags & M_NOTREADY) == 0; m = m->m_next) {
@@ -121,6 +223,7 @@
}
sb->sb_fnrdy = m;
+ sbready_compress(sb, m0, m);
return (0);
}
@@ -1030,12 +1133,11 @@
M_WRITABLE(n) &&
((sb->sb_flags & SB_NOCOALESCE) == 0) &&
!(m->m_flags & M_NOTREADY) &&
- !(n->m_flags & M_NOTREADY) &&
+ !(n->m_flags & (M_NOTREADY | M_NOMAP)) &&
m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
- bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
- (unsigned)m->m_len);
+ m_copydata(m, 0, m->m_len, mtodo(n, n->m_len));
n->m_len += m->m_len;
sb->sb_ccc += m->m_len;
if (sb->sb_fnrdy == NULL)
@@ -1046,6 +1148,9 @@
m = m_free(m);
continue;
}
+ if (m->m_len <= MLEN && (m->m_flags & M_NOMAP) &&
+ (m->m_flags & M_NOTREADY) == 0)
+ (void)mb_unmapped_compress(m);
if (n)
n->m_next = m;
else
Index: sys/kern/uipc_socket.c
===================================================================
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -1044,7 +1044,7 @@
*
* We used to do a lot of socket buffer and socket locking here, as
* well as invoke sorflush() and perform wakeups. The direct call to
- * dom_dispose() and sbrelease_internal() are an inlining of what was
+ * dom_dispose() and sbdestroy() are an inlining of what was
* necessary from sorflush().
*
* Notice that the socket buffer and kqueue state are torn down
@@ -1982,7 +1982,11 @@
SBLASTRECORDCHK(&so->so_rcv);
SBLASTMBUFCHK(&so->so_rcv);
SOCKBUF_UNLOCK(&so->so_rcv);
- error = uiomove(mtod(m, char *) + moff, (int)len, uio);
+ if ((m->m_flags & M_NOMAP) != 0)
+ error = m_unmappedtouio(m, moff, uio, (int)len);
+ else
+ error = uiomove(mtod(m, char *) + moff,
+ (int)len, uio);
SOCKBUF_LOCK(&so->so_rcv);
if (error) {
/*
Index: sys/net/bpf.c
===================================================================
--- sys/net/bpf.c
+++ sys/net/bpf.c
@@ -2369,6 +2369,7 @@
* Note that we cut corners here; we only setup what's
* absolutely needed--this mbuf should never go anywhere else.
*/
+ mb.m_flags = 0;
mb.m_next = m;
mb.m_data = data;
mb.m_len = dlen;
Index: sys/net/bpf_buffer.c
===================================================================
--- sys/net/bpf_buffer.c
+++ sys/net/bpf_buffer.c
@@ -119,19 +119,10 @@
{
const struct mbuf *m;
u_char *dst;
- u_int count;
m = (struct mbuf *)src;
dst = (u_char *)buf + offset;
- while (len > 0) {
- if (m == NULL)
- panic("bpf_mcopy");
- count = min(m->m_len, len);
- bcopy(mtod(m, void *), dst, count);
- m = m->m_next;
- dst += count;
- len -= count;
- }
+ m_copydata(m, 0, len, dst);
}
/*
Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -246,6 +246,7 @@
#define IFCAP_HWSTATS 0x800000 /* manages counters internally */
#define IFCAP_TXRTLMT 0x1000000 /* hardware supports TX rate limiting */
#define IFCAP_HWRXTSTMP 0x2000000 /* hardware rx timestamping */
+#define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
Index: sys/net/if_vlan.c
===================================================================
--- sys/net/if_vlan.c
+++ sys/net/if_vlan.c
@@ -1731,6 +1731,16 @@
ena |= (mena & IFCAP_TXRTLMT);
#endif
+ /*
+ * If the parent interface supports unmapped mbufs, so does
+ * the VLAN interface. Note that this should be fine even for
+ * interfaces that don't support hardware tagging as headers
+ * are prepended in normal mbufs to unmapped mbufs holding
+ * payload data.
+ */
+ cap |= (p->if_capabilities & IFCAP_NOMAP);
+ ena |= (mena & IFCAP_NOMAP);
+
ifp->if_capabilities = cap;
ifp->if_capenable = ena;
ifp->if_hwassist = hwa;
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -35,13 +35,13 @@
__FBSDID("$FreeBSD$");
#include "opt_inet.h"
-#include "opt_ratelimit.h"
#include "opt_ipsec.h"
#include "opt_mbuf_stress_test.h"
#include "opt_mpath.h"
+#include "opt_ratelimit.h"
#include "opt_route.h"
-#include "opt_sctp.h"
#include "opt_rss.h"
+#include "opt_sctp.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -283,6 +283,7 @@
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
int no_route_but_check_spd = 0;
#endif
+
M_ASSERTPKTHDR(m);
if (inp != NULL) {
@@ -685,11 +686,30 @@
m->m_pkthdr.csum_flags |= CSUM_IP;
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
in_delayed_cksum(m);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
+ } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ IPSTAT_INC(ips_odropped);
+ error = ENOBUFS;
+ goto bad;
+ }
sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
@@ -825,11 +845,23 @@
* fragmented packets, then do it here.
*/
if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
+ m0 = mb_unmapped_to_ext(m0);
+ if (m0 == NULL) {
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
in_delayed_cksum(m0);
m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
}
#ifdef SCTP
if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
+ m0 = mb_unmapped_to_ext(m0);
+ if (m0 == NULL) {
+ error = ENOBUFS;
+ IPSTAT_INC(ips_odropped);
+ goto done;
+ }
sctp_delayed_cksum(m0, hlen);
m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
}
Index: sys/netinet/tcp_pcap.c
===================================================================
--- sys/netinet/tcp_pcap.c
+++ sys/netinet/tcp_pcap.c
@@ -311,6 +311,7 @@
if (mhead->m_flags & M_EXT) {
switch (mhead->m_ext.ext_type) {
case EXT_SFBUF:
+ case EXT_PGS:
/* Don't mess around with these. */
tcp_pcap_m_freem(mhead);
continue;
@@ -383,8 +384,11 @@
__func__, n->m_flags));
n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
n->m_len = m->m_len;
- bcopy(M_START(m), n->m_dat,
- m->m_len + M_LEADINGSPACE_NOWRITE(m));
+ if (m->m_flags & M_NOMAP)
+ m_copydata(m, 0, m->m_len, n->m_data);
+ else
+ bcopy(M_START(m), n->m_dat,
+ m->m_len + M_LEADINGSPACE_NOWRITE(m));
}
else {
/*
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -798,8 +798,12 @@
}
}
+ if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+ *num_names = 0;
+ return (EINVAL);
+ }
+
refcount_init(&blk->tfb_refcnt, 0);
- blk->tfb_flags = 0;
blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
for (i = 0; i < *num_names; i++) {
n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1190,8 +1190,7 @@
INP_WLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_WUNLOCK(inp);
- for (int i = 0; i < count; i++)
- m = m_free(m);
+ mb_free_notready(m, count);
return (ECONNRESET);
}
tp = intotcpcb(inp);
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -67,11 +67,11 @@
#include "opt_inet.h"
#include "opt_inet6.h"
-#include "opt_ratelimit.h"
#include "opt_ipsec.h"
-#include "opt_sctp.h"
+#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"
+#include "opt_sctp.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -963,11 +963,30 @@
*/
if (sw_csum & CSUM_DELAY_DATA_IPV6) {
sw_csum &= ~CSUM_DELAY_DATA_IPV6;
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr));
+ } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
}
#ifdef SCTP
if (sw_csum & CSUM_SCTP_IPV6) {
sw_csum &= ~CSUM_SCTP_IPV6;
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ error = ENOBUFS;
+ IP6STAT_INC(ip6s_odropped);
+ goto bad;
+ }
sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
}
#endif
@@ -1055,11 +1074,23 @@
* XXX-BZ handle the hw offloading case. Need flags.
*/
if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ in6_ifstat_inc(ifp, ifs6_out_fragfail);
+ error = ENOBUFS;
+ goto bad;
+ }
in6_delayed_cksum(m, plen, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
}
#ifdef SCTP
if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
+ m = mb_unmapped_to_ext(m);
+ if (m == NULL) {
+ in6_ifstat_inc(ifp, ifs6_out_fragfail);
+ error = ENOBUFS;
+ goto bad;
+ }
sctp_delayed_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
}
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -227,7 +227,15 @@
volatile u_int ext_count;
volatile u_int *ext_cnt;
};
- char *ext_buf; /* start of buffer */
+ union {
+ /*
+ * If ext_type == EXT_PGS, 'ext_pgs' points to a
+ * structure describing the buffer. Otherwise,
+ * 'ext_buf' points to the start of the buffer.
+ */
+ struct mbuf_ext_pgs *ext_pgs;
+ char *ext_buf;
+ };
uint32_t ext_size; /* size of buffer, for ext_free */
uint32_t ext_type:8, /* type of external storage */
ext_flags:24; /* external storage mbuf flags */
@@ -293,6 +301,92 @@
};
};
+struct socket;
+
+/*
+ * TLS records for TLS 1.0-1.2 can have the following header lengths:
+ * - 5 (AES-CBC with implicit IV)
+ * - 21 (AES-CBC with explicit IV)
+ * - 13 (AES-GCM with 8 byte explicit IV)
+ */
+#define MBUF_PEXT_HDR_LEN 24
+
+/*
+ * TLS records for TLS 1.0-1.2 can have the following maximum trailer
+ * lengths:
+ * - 16 (AES-GCM)
+ * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding)
+ * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding)
+ * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding)
+ */
+#define MBUF_PEXT_TRAIL_LEN 64
+
+#ifdef __LP64__
+#define MBUF_PEXT_MAX_PGS (152 / sizeof(vm_paddr_t))
+#else
+#define MBUF_PEXT_MAX_PGS (156 / sizeof(vm_paddr_t))
+#endif
+
+#define MBUF_PEXT_MAX_BYTES \
+ (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN)
+
+/*
+ * This struct is 256 bytes in size and is arranged so that the most
+ * common case (accessing the first 4 pages of a 16KB TLS record) will
+ * fit in a single 64 byte cacheline.
+ */
+struct mbuf_ext_pgs {
+ uint8_t npgs; /* Number of attached pages */
+ uint8_t nrdy; /* Pages with I/O pending */
+ uint8_t hdr_len; /* TLS header length */
+ uint8_t trail_len; /* TLS trailer length */
+ uint16_t first_pg_off; /* Offset into 1st page */
+ uint16_t last_pg_len; /* Length of last page */
+ vm_paddr_t pa[MBUF_PEXT_MAX_PGS]; /* phys addrs of pages */
+ char hdr[MBUF_PEXT_HDR_LEN]; /* TLS header */
+ void *tls; /* TLS session */
+#if defined(__i386__) || \
+ (defined(__powerpc__) && !defined(__powerpc64__) && defined(BOOKE))
+ /*
+ * i386 and Book-E PowerPC have 64-bit vm_paddr_t, so there is
+ * a 4 byte remainder from the space allocated for pa[].
+ */
+ uint32_t pad;
+#endif
+ union {
+ char trail[MBUF_PEXT_TRAIL_LEN]; /* TLS trailer */
+ struct {
+ struct socket *so;
+ void *mbuf;
+ uint64_t seqno;
+ STAILQ_ENTRY(mbuf_ext_pgs) stailq;
+ };
+ };
+};
+
+#ifdef _KERNEL
+static inline int
+mbuf_ext_pg_len(struct mbuf_ext_pgs *ext_pgs, int pidx, int pgoff)
+{
+ KASSERT(pgoff == 0 || pidx == 0,
+ ("page %d with non-zero offset %d in %p", pidx, pgoff, ext_pgs));
+ if (pidx == ext_pgs->npgs - 1) {
+ return (ext_pgs->last_pg_len);
+ } else {
+ return (PAGE_SIZE - pgoff);
+ }
+}
+
+#ifdef INVARIANT_SUPPORT
+void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs);
+#endif
+#ifdef INVARIANTS
+#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) mb_ext_pgs_check((ext_pgs))
+#else
+#define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs)
+#endif
+#endif
+
/*
* mbuf flags of global significance and layer crossing.
* Those of only protocol/layer specific significance are to be mapped
@@ -307,7 +401,7 @@
#define M_MCAST 0x00000020 /* send/received as link-level multicast */
#define M_PROMISC 0x00000040 /* packet was not for us */
#define M_VLANTAG 0x00000080 /* ether_vtag is valid */
-#define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */
+#define M_NOMAP 0x00000100 /* mbuf data is unmapped */
#define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */
#define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
@@ -348,7 +442,7 @@
*/
#define M_FLAG_BITS \
"\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
- "\7M_PROMISC\10M_VLANTAG\13M_TSTMP\14M_TSTMP_HPREC"
+ "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC"
#define M_FLAG_PROTOBITS \
"\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
"\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
@@ -420,6 +514,7 @@
#define EXT_PACKET 6 /* mbuf+cluster from packet zone */
#define EXT_MBUF 7 /* external mbuf reference */
#define EXT_RXRING 8 /* data in NIC receive ring */
+#define EXT_PGS 9 /* array of unmapped pages */
#define EXT_VENDOR1 224 /* for vendor-internal use */
#define EXT_VENDOR2 225 /* for vendor-internal use */
@@ -464,6 +559,11 @@
"\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \
"\30EXT_FLAG_EXP4"
+#define MBUF_EXT_PGS_ASSERT(m) \
+ KASSERT((((m)->m_flags & M_EXT) != 0) && \
+ ((m)->m_ext.ext_type == EXT_PGS), \
+ ("%s: m %p !M_EXT or !EXT_PGS", __func__, m))
+
/*
* Flags indicating checksum, segmentation and other offload work to be
* done, or already done, by hardware or lower layers. It is split into
@@ -566,6 +666,7 @@
#define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k"
#define MBUF_TAG_MEM_NAME "mbuf_tag"
#define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt"
+#define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs"
#ifdef _KERNEL
@@ -590,9 +691,15 @@
extern uma_zone_t zone_jumbop;
extern uma_zone_t zone_jumbo9;
extern uma_zone_t zone_jumbo16;
+extern uma_zone_t zone_extpgs;
void mb_dupcl(struct mbuf *, struct mbuf *);
void mb_free_ext(struct mbuf *);
+void mb_free_mext_pgs(struct mbuf *);
+struct mbuf *mb_alloc_ext_pgs(int, bool, m_ext_free_t);
+int mb_unmapped_compress(struct mbuf *m);
+struct mbuf *mb_unmapped_to_ext(struct mbuf *m);
+void mb_free_notready(struct mbuf *m, int count);
void m_adj(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, u_int), void *);
@@ -627,6 +734,7 @@
struct mbuf *m_getptr(struct mbuf *, int, int *);
u_int m_length(struct mbuf *, struct mbuf **);
int m_mbuftouio(struct uio *, const struct mbuf *, int);
+int m_unmappedtouio(const struct mbuf *, int, struct uio *, int);
void m_move_pkthdr(struct mbuf *, struct mbuf *);
int m_pkthdr_init(struct mbuf *, int);
struct mbuf *m_prepend(struct mbuf *, int, int);
@@ -881,7 +989,7 @@
* be both the local data payload, or an external buffer area, depending on
* whether M_EXT is set).
*/
-#define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \
+#define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 && \
(!(((m)->m_flags & M_EXT)) || \
(m_extrefcnt(m) == 1)))
@@ -904,7 +1012,8 @@
* handling external storage, packet-header mbufs, and regular data mbufs.
*/
#define M_START(m) \
- (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
+ (((m)->m_flags & M_NOMAP) ? NULL : \
+ ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \
&(m)->m_dat[0])
@@ -1020,6 +1129,7 @@
extern int max_linkhdr; /* Largest link-level header */
extern int max_protohdr; /* Largest protocol header */
extern int nmbclusters; /* Maximum number of clusters */
+extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */
/*-
* Network packets may have annotations attached by affixing a list of
Index: sys/sys/sglist.h
===================================================================
--- sys/sys/sglist.h
+++ sys/sys/sglist.h
@@ -57,6 +57,7 @@
struct bio;
struct mbuf;
+struct mbuf_ext_pgs;
struct uio;
static __inline void
@@ -87,6 +88,9 @@
struct sglist *sglist_alloc(int nsegs, int mflags);
int sglist_append(struct sglist *sg, void *buf, size_t len);
int sglist_append_bio(struct sglist *sg, struct bio *bp);
+int sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
+ size_t off, size_t len);
+int sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m);
int sglist_append_mbuf(struct sglist *sg, struct mbuf *m0);
int sglist_append_phys(struct sglist *sg, vm_paddr_t paddr,
size_t len);
@@ -101,6 +105,9 @@
struct sglist *sglist_clone(struct sglist *sg, int mflags);
int sglist_consume_uio(struct sglist *sg, struct uio *uio, size_t resid);
int sglist_count(void *buf, size_t len);
+int sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off,
+ size_t len);
+int sglist_count_mb_ext_pgs(struct mbuf *m);
int sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_t len);
void sglist_free(struct sglist *sg);
int sglist_join(struct sglist *first, struct sglist *second);
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Mar 23, 1:21 PM (5 h, 48 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
30143268
Default Alt Text
D20616.id58545.diff (69 KB)
Attached To
Mode
D20616: Add a new external mbuf type that holds multiple unmapped pages.
Attached
Detach File
Event Timeline
Log In to Comment