D21127.id61578.diff
No OneTemporary
Actions

Size

38 KB

Referenced Files

None

Subscribers

None

D21127.id61578.diff
View Options

	Index: netinet/tcp_lro.h
	===================================================================
	--- netinet/tcp_lro.h
	+++ netinet/tcp_lro.h
	@@ -45,6 +45,8 @@
	LIST_ENTRY(lro_entry) hash_next;
	struct mbuf *m_head;
	struct mbuf *m_tail;
	+ struct mbuf *m_last_mbuf;
	+ struct mbuf *m_prev_last;
	union {
	struct ip *ip4;
	struct ip6_hdr *ip6;
	@@ -67,10 +69,22 @@
	uint32_t ack_seq; /* tcp_seq */
	uint32_t tsval;
	uint32_t tsecr;
	+ uint32_t tcp_tot_p_len; /* TCP payload length of chain */
	uint16_t window;
	uint16_t timestamp; /* flag, not a TCP hdr field. */
	+ uint16_t need_wakeup;
	+ uint16_t mbuf_cnt; /* Count of mbufs collected see note */
	+ uint16_t mbuf_appended;
	struct timeval mtime;
	};
	+/*
	+ * Note: The mbuf_cnt field tracks our number of mbufs added to the m_next
	+ * list. Each mbuf counted can have data and of course it will
	+ * have an ack as well (by defintion any inbound tcp segment will
	+ * have an ack value. We use this count to tell us how many ACK's
	+ * are present for our ack-count threshold. If we exceed that or
	+ * the data threshold we will wake up the endpoint.
	+ */
	LIST_HEAD(lro_head, lro_entry);

	#define le_ip4 leip.ip4
	@@ -115,6 +129,8 @@
	void tcp_lro_flush_all(struct lro_ctrl *);
	int tcp_lro_rx(struct lro_ctrl , struct mbuf , uint32_t);
	void tcp_lro_queue_mbuf(struct lro_ctrl , struct mbuf );
	+void tcp_lro_reg_mbufq(void);
	+void tcp_lro_dereg_mbufq(void);

	#define TCP_LRO_NO_ENTRIES -2
	#define TCP_LRO_CANNOT -1
	Index: netinet/tcp_lro.c
	===================================================================
	--- netinet/tcp_lro.c
	+++ netinet/tcp_lro.c
	@@ -44,6 +44,8 @@
	#include <sys/malloc.h>
	#include <sys/mbuf.h>
	#include <sys/socket.h>
	+#include <sys/socketvar.h>
	+#include <sys/sockbuf.h>
	#include <sys/sysctl.h>

	#include <net/if.h>
	@@ -56,11 +58,14 @@
	#include <netinet/ip6.h>
	#include <netinet/ip.h>
	#include <netinet/ip_var.h>
	+#include <netinet/in_pcb.h>
	+#include <netinet6/in6_pcb.h>
	#include <netinet/tcp.h>
	#include <netinet/tcp_seq.h>
	#include <netinet/tcp_lro.h>
	#include <netinet/tcp_var.h>
	-
	+#include <netinet/tcp_hpts.h>
	+#include <netinet/tcp_log_buf.h>
	#include <netinet6/ip6_var.h>

	#include <machine/in_cksum.h>
	@@ -79,11 +84,47 @@
	SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW \| CTLFLAG_MPSAFE, 0,
	"TCP LRO");

	+static long tcplro_stacks_wanting_mbufq = 0;
	+counter_u64_t tcp_inp_lro_direct_queue;
	+counter_u64_t tcp_inp_lro_wokeup_queue;
	+counter_u64_t tcp_inp_lro_compressed;
	+counter_u64_t tcp_inp_lro_single_push;
	+counter_u64_t tcp_inp_lro_locks_taken;
	+counter_u64_t tcp_inp_lro_sack_wake;
	+
	static unsigned tcp_lro_entries = TCP_LRO_ENTRIES;
	+static int32_t hold_lock_over_compress = 0;
	+SYSCTL_INT(_net_inet_tcp_lro, OID_AUTO, hold_lock, CTLFLAG_RW,
	+ &hold_lock_over_compress, 0,
	+ "Do we hold the lock over the compress of mbufs?");
	SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
	CTLFLAG_RDTUN \| CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
	"default number of LRO entries");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
	+ &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
	+ &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD,
	+ &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, single, CTLFLAG_RD,
	+ &tcp_inp_lro_single_push, "Number of lro's sent with single segment");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD,
	+ &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, sackwakeups, CTLFLAG_RD,
	+ &tcp_inp_lro_sack_wake, "Number of wakeups caused by sack/fin");

	+void
	+tcp_lro_reg_mbufq()
	+{
	+ atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
	+}
	+
	+void
	+tcp_lro_dereg_mbufq(void)
	+{
	+ atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
	+}
	+
	static __inline void
	tcp_lro_active_insert(struct lro_ctrl lc, struct lro_head bucket,
	struct lro_entry *le)
	@@ -162,6 +203,36 @@
	return (0);
	}

	+static struct tcphdr *
	+tcp_lro_get_th(struct lro_entry le, struct mbuf m)
	+{
	+ struct ether_header *eh;
	+ struct tcphdr *th = NULL;
	+#ifdef INET6
	+ struct ip6_hdr ip6 = NULL; / Keep compiler happy. */
	+#endif
	+#ifdef INET
	+ struct ip ip4 = NULL; / Keep compiler happy. */
	+#endif
	+
	+ eh = mtod(m, struct ether_header *);
	+ switch (le->eh_type) {
	+#ifdef INET6
	+ case ETHERTYPE_IPV6:
	+ ip6 = (struct ip6_hdr *)(eh + 1);
	+ th = (struct tcphdr *)(ip6 + 1);
	+ break;
	+#endif
	+#ifdef INET
	+ case ETHERTYPE_IP:
	+ ip4 = (struct ip *)(eh + 1);
	+ th = (struct tcphdr *)(ip4 + 1);
	+ break;
	+#endif
	+ }
	+ return (th);
	+}
	+
	void
	tcp_lro_free(struct lro_ctrl *lc)
	{
	@@ -192,7 +263,6 @@
	lc->lro_mbuf_data = NULL;
	}

	-#ifdef TCP_LRO_UPDATE_CSUM
	static uint16_t
	tcp_lro_csum_th(struct tcphdr *th)
	{
	@@ -275,7 +345,6 @@

	return (c & 0xffff);
	}
	-#endif

	static void
	tcp_lro_rx_done(struct lro_ctrl *lc)
	@@ -297,7 +366,7 @@
	if (LIST_EMPTY(&lc->lro_active))
	return;

	- getmicrotime(&tv);
	+ getmicrouptime(&tv);
	timevalsub(&tv, timeout);
	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
	if (timevalcmp(&tv, &le->mtime, >=)) {
	@@ -307,11 +376,113 @@
	}
	}

	-void
	-tcp_lro_flush(struct lro_ctrl lc, struct lro_entry le)
	+#ifdef INET6
	+static int
	+tcp_lro_rx_ipv6(struct lro_ctrl lc, struct mbuf m, struct ip6_hdr *ip6,
	+ struct tcphdr **th)
	{

	- if (le->append_cnt > 0) {
	+ /* XXX-BZ we should check the flow-label. */
	+
	+ /* XXX-BZ We do not yet support ext. hdrs. */
	+ if (ip6->ip6_nxt != IPPROTO_TCP)
	+ return (TCP_LRO_NOT_SUPPORTED);
	+
	+ /* Find the TCP header. */
	+ th = (struct tcphdr )(ip6 + 1);
	+
	+ return (0);
	+}
	+#endif
	+
	+#ifdef INET
	+static int
	+tcp_lro_rx_ipv4(struct lro_ctrl lc, struct mbuf m, struct ip *ip4,
	+ struct tcphdr **th)
	+{
	+ int csum_flags;
	+ uint16_t csum;
	+
	+ if (ip4->ip_p != IPPROTO_TCP)
	+ return (TCP_LRO_NOT_SUPPORTED);
	+
	+ /* Ensure there are no options. */
	+ if ((ip4->ip_hl << 2) != sizeof (*ip4))
	+ return (TCP_LRO_CANNOT);
	+
	+ /* .. and the packet is not fragmented. */
	+ if (ip4->ip_off & htons(IP_MF\|IP_OFFMASK))
	+ return (TCP_LRO_CANNOT);
	+
	+ /* Legacy IP has a header checksum that needs to be correct. */
	+ csum_flags = m->m_pkthdr.csum_flags;
	+ if (csum_flags & CSUM_IP_CHECKED) {
	+ if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
	+ lc->lro_bad_csum++;
	+ return (TCP_LRO_CANNOT);
	+ }
	+ } else {
	+ csum = in_cksum_hdr(ip4);
	+ if (__predict_false((csum) != 0)) {
	+ lc->lro_bad_csum++;
	+ return (TCP_LRO_CANNOT);
	+ }
	+ }
	+ /* Find the TCP header (we assured there are no IP options). */
	+ th = (struct tcphdr )(ip4 + 1);
	+ return (0);
	+}
	+#endif
	+
	+static void
	+tcp_lro_log(struct tcpcb tp, struct lro_ctrl lc,
	+ struct lro_entry le, struct mbuf m, int frm, int32_t tcp_data_len,
	+ uint32_t th_seq , uint32_t th_ack, uint16_t th_win)
	+{
	+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
	+ union tcp_log_stackspecific log;
	+ struct timeval tv;
	+ uint32_t cts;
	+
	+ cts = tcp_get_usecs(&tv);
	+ memset(&log, 0, sizeof(union tcp_log_stackspecific));
	+ log.u_bbr.flex8 = frm;
	+ log.u_bbr.flex1 = tcp_data_len;
	+ if (m)
	+ log.u_bbr.flex2 = m->m_pkthdr.len;
	+ else
	+ log.u_bbr.flex2 = 0;
	+ log.u_bbr.flex3 = le->append_cnt;
	+ log.u_bbr.flex4 = le->p_len;
	+ log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
	+ log.u_bbr.delRate = le->m_head->m_flags;
	+ log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
	+ log.u_bbr.flex6 = lc->lro_length_lim;
	+ log.u_bbr.flex7 = lc->lro_ackcnt_lim;
	+ log.u_bbr.inflight = th_seq;
	+ log.u_bbr.timeStamp = cts;
	+ log.u_bbr.epoch = le->next_seq;
	+ log.u_bbr.delivered = th_ack;
	+ log.u_bbr.lt_epoch = le->ack_seq;
	+ log.u_bbr.pacing_gain = th_win;
	+ log.u_bbr.cwnd_gain = le->window;
	+ log.u_bbr.cur_del_rate = (uint64_t)m;
	+ log.u_bbr.bw_inuse = (uint64_t)le->m_head;
	+ log.u_bbr.pkts_out = le->mbuf_cnt; /* Total mbufs added */
	+ log.u_bbr.applimited = le->ulp_csum;
	+ log.u_bbr.lost = le->mbuf_appended;
	+ TCP_LOG_EVENTP(tp, NULL,
	+ &tp->t_inpcb->inp_socket->so_rcv,
	+ &tp->t_inpcb->inp_socket->so_snd,
	+ TCP_LOG_LRO, 0,
	+ 0, &log, false, &tv);
	+ }
	+}
	+
	+static void
	+tcp_flush_out_le(struct tcpcb tp, struct lro_ctrl lc, struct lro_entry *le, int locked)
	+{
	+ if (le->append_cnt > 1) {
	struct tcphdr *th;
	uint16_t p_len;

	@@ -335,13 +506,10 @@
	case ETHERTYPE_IP:
	{
	struct ip *ip4;
	-#ifdef TCP_LRO_UPDATE_CSUM
	uint32_t cl;
	uint16_t c;
	-#endif

	ip4 = le->le_ip4;
	-#ifdef TCP_LRO_UPDATE_CSUM
	/* Fix IP header checksum for new length. */
	c = ~ip4->ip_sum;
	cl = c;
	@@ -351,9 +519,6 @@
	cl = (cl >> 16) + (cl & 0xffff);
	c = cl;
	ip4->ip_sum = ~c;
	-#else
	- ip4->ip_sum = TCP_LRO_INVALID_CSUM;
	-#endif
	ip4->ip_len = p_len;
	th = (struct tcphdr *)(ip4 + 1);
	le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID \|
	@@ -379,7 +544,6 @@
	ts_ptr[1] = htonl(le->tsval);
	ts_ptr[2] = le->tsecr;
	}
	-#ifdef TCP_LRO_UPDATE_CSUM
	/* Update the TCP header checksum. */
	le->ulp_csum += p_len;
	le->ulp_csum += tcp_lro_csum_th(th);
	@@ -388,14 +552,429 @@
	(le->ulp_csum & 0xffff);
	th->th_sum = (le->ulp_csum & 0xffff);
	th->th_sum = ~th->th_sum;
	-#else
	- th->th_sum = TCP_LRO_INVALID_CSUM;
	+ if (tp && locked) {
	+ tcp_lro_log(tp, lc, le, NULL, 7, 0, 0, 0, 0);
	+ }
	+ }
	+ /*
	+ * Break any chain, this is not set to NULL on the singleton
	+ * case m_nextpkt points to m_head. Other case set them
	+ * m_nextpkt to NULL in push_and_replace.
	+ */
	+ le->m_head->m_nextpkt = NULL;
	+ le->m_head->m_pkthdr.lro_nsegs = le->append_cnt;
	+ if (tp && locked) {
	+ tcp_lro_log(tp, lc, le, le->m_head, 8, 0, 0, 0, 0);
	+ }
	+ (*lc->ifp->if_input)(lc->ifp, le->m_head);
	+ lc->lro_queued += le->append_cnt;
	+}
	+
	+static void
	+tcp_set_le_to_m(struct lro_ctrl lc, struct lro_entry le, struct mbuf *m)
	+{
	+ struct ether_header *eh;
	+ void l3hdr = NULL; / Keep compiler happy. */
	+ struct tcphdr *th;
	+#ifdef INET6
	+ struct ip6_hdr ip6 = NULL; / Keep compiler happy. */
	#endif
	+#ifdef INET
	+ struct ip ip4 = NULL; / Keep compiler happy. */
	+#endif
	+ uint32_t *ts_ptr;
	+ int error, l, ts_failed = 0;
	+ uint16_t tcp_data_len;
	+ uint16_t csum;
	+
	+ error = -1;
	+ eh = mtod(m, struct ether_header *);
	+ /*
	+ * We must reset the other pointers since the mbuf
	+ * we were pointing too is about to go away.
	+ */
	+ switch (le->eh_type) {
	+#ifdef INET6
	+ case ETHERTYPE_IPV6:
	+ l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
	+ error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
	+ le->le_ip6 = ip6;
	+ le->source_ip6 = ip6->ip6_src;
	+ le->dest_ip6 = ip6->ip6_dst;
	+ le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
	+ break;
	+#endif
	+#ifdef INET
	+ case ETHERTYPE_IP:
	+ l3hdr = ip4 = (struct ip *)(eh + 1);
	+ error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
	+ le->le_ip4 = ip4;
	+ le->source_ip4 = ip4->ip_src.s_addr;
	+ le->dest_ip4 = ip4->ip_dst.s_addr;
	+ le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
	+ break;
	+#endif
	}
	+ KASSERT(error == 0, ("%s: le=%p tcp_lro_rx_xxx failed\n",
	+ __func__, le));
	+ ts_ptr = (uint32_t *)(th + 1);
	+ l = (th->th_off << 2);
	+ l -= sizeof(*th);
	+ if (l != 0 &&
	+ (__predict_false(l != TCPOLEN_TSTAMP_APPA) \|\|
	+ (*ts_ptr != ntohl(TCPOPT_NOP<<24\|TCPOPT_NOP<<16\|
	+ TCPOPT_TIMESTAMP<<8\|TCPOLEN_TIMESTAMP)))) {
	+ /* We have failed to find a timestamp some other option? */
	+ ts_failed = 1;
	+ }
	+ if ((l != 0) && (ts_failed == 0)) {
	+ le->timestamp = 1;
	+ le->tsval = ntohl(*(ts_ptr + 1));
	+ le->tsecr = *(ts_ptr + 2);
	+ } else
	+ le->timestamp = 0;
	+ le->source_port = th->th_sport;
	+ le->dest_port = th->th_dport;
	+ /* Pull out the csum */
	+ tcp_data_len = m->m_pkthdr.lro_len;
	+ le->next_seq = ntohl(th->th_seq) + tcp_data_len;
	+ le->ack_seq = th->th_ack;
	+ le->window = th->th_win;
	+ csum = th->th_sum;
	+ /* Setup the data pointers */
	+ le->m_head = m;
	+ le->m_tail = m_last(m);
	+ le->append_cnt = 0;
	+ le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
	+ ~csum);
	+ le->append_cnt++;
	+ th->th_sum = csum; /* Restore checksum on first packet. */
	+}

	- le->m_head->m_pkthdr.lro_nsegs = le->append_cnt + 1;
	- (*lc->ifp->if_input)(lc->ifp, le->m_head);
	- lc->lro_queued += le->append_cnt + 1;
	+static void
	+tcp_push_and_replace(struct tcpcb tp, struct lro_ctrl lc, struct lro_entry le, struct mbuf m, int locked)
	+{
	+ /*
	+ * Push up the stack the current le and replace
	+ * it with m.
	+ */
	+ struct mbuf *msave;
	+
	+ /* Grab off the next and save it */
	+ msave = le->m_head->m_nextpkt;
	+ le->m_head->m_nextpkt = NULL;
	+ /* Now push out the old le entry */
	+ tcp_flush_out_le(tp, lc, le, locked);
	+ /*
	+ * Now to replace the data properly in the le
	+ * we have to reset the tcp header and
	+ * other fields.
	+ */
	+ tcp_set_le_to_m(lc, le, m);
	+ /* Restore the next list */
	+ m->m_nextpkt = msave;
	+}
	+
	+static void
	+tcp_lro_condense(struct tcpcb tp, struct lro_ctrl lc, struct lro_entry *le, int locked)
	+{
	+ /*
	+ * Walk through the mbuf chain we
	+ * have on tap and compress/condense
	+ * as required.
	+ */
	+ uint32_t *ts_ptr;
	+ struct mbuf *m;
	+ struct tcphdr *th;
	+ uint16_t tcp_data_len, csum_upd;
	+ int l;
	+
	+ /*
	+ * First we must check the lead (m_head)
	+ * we must make sure that it is not
	+ * something that should be sent up
	+ * right away (sack etc).
	+ */
	+again:
	+
	+ m = le->m_head->m_nextpkt;
	+ if (m == NULL) {
	+ /* Just the one left */
	+ return;
	+ }
	+ th = tcp_lro_get_th(le, le->m_head);
	+ KASSERT(th != NULL,
	+ ("le:%p m:%p th comes back NULL?", le, le->m_head));
	+ l = (th->th_off << 2);
	+ l -= sizeof(*th);
	+ ts_ptr = (uint32_t *)(th + 1);
	+ if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) \|\|
	+ (*ts_ptr != ntohl(TCPOPT_NOP<<24\|TCPOPT_NOP<<16\|
	+ TCPOPT_TIMESTAMP<<8\|TCPOLEN_TIMESTAMP)))) {
	+ /*
	+ * Its not the timestamp. We can't
	+ * use this guy as the head.
	+ */
	+ le->m_head->m_nextpkt = m->m_nextpkt;
	+ tcp_push_and_replace(tp, lc, le, m, locked);
	+ goto again;
	+ }
	+ if ((th->th_flags & ~(TH_ACK \| TH_PUSH)) != 0) {
	+ /*
	+ * Make sure that previously seen segements/ACKs are delivered
	+ * before this segment, e.g. FIN.
	+ */
	+ le->m_head->m_nextpkt = m->m_nextpkt;
	+ tcp_push_and_replace(tp, lc, le, m, locked);
	+ goto again;
	+ }
	+ while((m = le->m_head->m_nextpkt) != NULL) {
	+ /*
	+ * condense m into le, first
	+ * pull m out of the list.
	+ */
	+ le->m_head->m_nextpkt = m->m_nextpkt;
	+ m->m_nextpkt = NULL;
	+ /* Setup my data */
	+ tcp_data_len = m->m_pkthdr.lro_len;
	+ th = tcp_lro_get_th(le, m);
	+ KASSERT(th != NULL,
	+ ("le:%p m:%p th comes back NULL?", le, m));
	+ ts_ptr = (uint32_t *)(th + 1);
	+ l = (th->th_off << 2);
	+ l -= sizeof(*th);
	+ if (tp && locked) {
	+ tcp_lro_log(tp, lc, le, m, 1, 0, 0, 0, 0);
	+ }
	+ if (le->append_cnt >= lc->lro_ackcnt_lim) {
	+ if (tp && locked) {
	+ tcp_lro_log(tp, lc, le, m, 2, 0, 0, 0, 0);
	+ }
	+ tcp_push_and_replace(tp, lc, le, m, locked);
	+ goto again;
	+ }
	+ if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
	+ /* Flush now if appending will result in overflow. */
	+ if (tp && locked) {
	+ tcp_lro_log(tp, lc, le, m, 3, tcp_data_len, 0, 0, 0);
	+ }
	+ tcp_push_and_replace(tp, lc, le, m, locked);
	+ goto again;
	+ }
	+ if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) \|\|
	+ (*ts_ptr != ntohl(TCPOPT_NOP<<24\|TCPOPT_NOP<<16\|
	+ TCPOPT_TIMESTAMP<<8\|TCPOLEN_TIMESTAMP)))) {
	+ /*
	+ * Maybe a sack in the new one? We need to
	+ * start all over after flushing the
	+ * current le. We will go up to the beginning
	+ * and flush it (calling the replace again possibly
	+ * or just returning).
	+ */
	+ tcp_push_and_replace(tp, lc, le, m, locked);
	+ goto again;
	+ }
	+ if ((th->th_flags & ~(TH_ACK \| TH_PUSH)) != 0) {
	+ tcp_push_and_replace(tp, lc, le, m, locked);
	+ goto again;
	+ }
	+ if (l != 0) {
	+ uint32_t tsval = ntohl(*(ts_ptr + 1));
	+ /* Make sure timestamp values are increasing. */
	+ if (TSTMP_GT(le->tsval, tsval)) {
	+ tcp_push_and_replace(tp, lc, le, m, locked);
	+ goto again;
	+ }
	+ le->tsval = tsval;
	+ le->tsecr = *(ts_ptr + 2);
	+ }
	+ /* Try to append the new segment. */
	+ if (__predict_false(ntohl(th->th_seq) != le->next_seq \|\|
	+ (tcp_data_len == 0 &&
	+ le->ack_seq == th->th_ack &&
	+ le->window == th->th_win))) {
	+ /* Out of order packet or duplicate ACK. */
	+ if (tp && locked) {
	+ tcp_lro_log(tp, lc, le, m, 4, tcp_data_len,
	+ ntohl(th->th_seq),
	+ th->th_ack,
	+ th->th_win);
	+ }
	+ tcp_push_and_replace(tp, lc, le, m, locked);
	+ goto again;
	+ }
	+ if (tcp_data_len \|\| SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
	+ le->next_seq += tcp_data_len;
	+ le->ack_seq = th->th_ack;
	+ le->window = th->th_win;
	+ } else if (th->th_ack == le->ack_seq) {
	+ le->window = WIN_MAX(le->window, th->th_win);
	+ }
	+ csum_upd = m->m_pkthdr.lro_csum;
	+ le->ulp_csum += csum_upd;
	+ if (tcp_data_len == 0) {
	+ le->append_cnt++;
	+ le->mbuf_cnt--;
	+ if (tp && locked) {
	+ tcp_lro_log(tp, lc, le, m, 5, tcp_data_len,
	+ ntohl(th->th_seq),
	+ th->th_ack,
	+ th->th_win);
	+ }
	+ m_freem(m);
	+ continue;
	+ }
	+ le->append_cnt++;
	+ le->mbuf_appended++;
	+ le->p_len += tcp_data_len;
	+ /*
	+ * Adjust the mbuf so that m_data points to the first byte of
	+ * the ULP payload. Adjust the mbuf to avoid complications and
	+ * append new segment to existing mbuf chain.
	+ */
	+ m_adj(m, m->m_pkthdr.len - tcp_data_len);
	+ if (tp && locked) {
	+ tcp_lro_log(tp, lc, le, m, 6, tcp_data_len,
	+ ntohl(th->th_seq),
	+ th->th_ack,
	+ th->th_win);
	+ }
	+ m_demote_pkthdr(m);
	+ le->m_tail->m_next = m;
	+ le->m_tail = m_last(m);
	+ }
	+}
	+
	+static void
	+tcp_queue_pkts(struct tcpcb tp, struct lro_entry le)
	+{
	+ if (tp->t_in_pkt == NULL) {
	+ /* Nothing yet there */
	+ tp->t_in_pkt = le->m_head;
	+ tp->t_tail_pkt = le->m_last_mbuf;
	+ } else {
	+ /* Already some there */
	+ tp->t_tail_pkt->m_nextpkt = le->m_head;
	+ tp->t_tail_pkt = le->m_last_mbuf;
	+ }
	+ le->m_head = NULL;
	+ le->m_last_mbuf = NULL;
	+}
	+
	+void
	+tcp_lro_flush(struct lro_ctrl lc, struct lro_entry le)
	+{
	+ struct tcpcb *tp = NULL;
	+ int locked = 0;
	+#ifdef TCPHPTS
	+ struct inpcb *inp = NULL;
	+ int need_wakeup = 0, can_queue = 0;
	+ struct epoch_tracker et;
	+
	+ /* Now lets lookup the inp first */
	+ CURVNET_SET(lc->ifp->if_vnet);
	+ if (tcplro_stacks_wanting_mbufq == 0)
	+ goto skip_lookup;
	+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
	+ switch (le->eh_type) {
	+#ifdef INET6
	+ case ETHERTYPE_IPV6:
	+ inp = in6_pcblookup(&V_tcbinfo, &le->source_ip6,
	+ le->source_port, &le->dest_ip6,le->dest_port,
	+ INPLOOKUP_WLOCKPCB,
	+ lc->ifp);
	+ break;
	+#endif
	+#ifdef INET
	+ case ETHERTYPE_IP:
	+ inp = in_pcblookup(&V_tcbinfo, le->le_ip4->ip_src,
	+ le->source_port, le->le_ip4->ip_dst, le->dest_port,
	+ INPLOOKUP_WLOCKPCB,
	+ lc->ifp);
	+ break;
	+#endif
	+ }
	+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
	+ if (inp && ((inp->inp_flags & (INP_DROPPED\|INP_TIMEWAIT)) \|\|
	+ (inp->inp_flags2 & INP_FREED))) {
	+ /* We don't want this guy */
	+ INP_WUNLOCK(inp);
	+ inp = NULL;
	+ }
	+ if (inp && (inp->inp_flags2 & INP_SUPPORTS_MBUFQ)) {
	+ /* The transport supports mbuf queuing */
	+ can_queue = 1;
	+ if (le->need_wakeup \|\|
	+ ((inp->inp_in_input == 0) &&
	+ ((inp->inp_flags2 & INP_MBUF_QUEUE_READY) == 0))) {
	+ /*
	+ * Either the transport is off on a keep-alive
	+ * (it has the queue_ready flag clear and its
	+ * not already been woken) or the entry has
	+ * some urgent thing (FIN or possibly SACK blocks).
	+ * This means we need to wake the transport up by
	+ * putting it on the input pacer.
	+ */
	+ need_wakeup = 1;
	+ if ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) &&
	+ (le->need_wakeup != 1)) {
	+ /*
	+ * Prohibited from a sack wakeup.
	+ */
	+ need_wakeup = 0;
	+ }
	+ }
	+ /* Do we need to be awoken due to lots of data or acks? */
	+ if ((le->tcp_tot_p_len >= lc->lro_length_lim) \|\|
	+ (le->mbuf_cnt >= lc->lro_ackcnt_lim))
	+ need_wakeup = 1;
	+ }
	+ if (inp) {
	+ tp = intotcpcb(inp);
	+ locked = 1;
	+ } else
	+ tp = NULL;
	+ if (can_queue) {
	+ counter_u64_add(tcp_inp_lro_direct_queue, 1);
	+ tcp_lro_log(tp, lc, le, NULL, 22, need_wakeup,
	+ inp->inp_flags2, inp->inp_in_input, le->need_wakeup);
	+ tcp_queue_pkts(tp, le);
	+ if (need_wakeup) {
	+ /*
	+ * We must get the guy to wakeup via
	+ * hpts.
	+ */
	+ counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
	+ if (le->need_wakeup)
	+ counter_u64_add(tcp_inp_lro_sack_wake, 1);
	+ tcp_queue_to_input(inp);
	+ }
	+ }
	+ if (inp && (hold_lock_over_compress == 0)) {
	+ /* Unlock it */
	+ locked = 0;
	+ tp = NULL;
	+ counter_u64_add(tcp_inp_lro_locks_taken, 1);
	+ INP_WUNLOCK(inp);
	+ }
	+ if (can_queue == 0) {
	+skip_lookup:
	+#endif
	+ /* Old fashioned lro method */
	+ if (le->m_head != le->m_last_mbuf) {
	+ counter_u64_add(tcp_inp_lro_compressed, 1);
	+ tcp_lro_condense(tp, lc, le, locked);
	+ } else
	+ counter_u64_add(tcp_inp_lro_single_push, 1);
	+ tcp_flush_out_le(tp, lc, le, locked);
	+#ifdef TCPHPTS
	+ }
	+ if (inp && locked) {
	+ counter_u64_add(tcp_inp_lro_locks_taken, 1);
	+ INP_WUNLOCK(inp);
	+ }
	+ CURVNET_RESTORE();
	+#endif
	lc->lro_flushed++;
	bzero(le, sizeof(*le));
	LIST_INSERT_HEAD(&lc->lro_free, le, next);
	@@ -537,67 +1116,14 @@
	lc->lro_mbuf_count = 0;
	}

	-#ifdef INET6
	-static int
	-tcp_lro_rx_ipv6(struct lro_ctrl lc, struct mbuf m, struct ip6_hdr *ip6,
	- struct tcphdr **th)
	+static void
	+lro_set_mtime(struct timeval tv, struct timespec ts)
	{
	-
	- /* XXX-BZ we should check the flow-label. */
	-
	- /* XXX-BZ We do not yet support ext. hdrs. */
	- if (ip6->ip6_nxt != IPPROTO_TCP)
	- return (TCP_LRO_NOT_SUPPORTED);
	-
	- /* Find the TCP header. */
	- th = (struct tcphdr )(ip6 + 1);
	-
	- return (0);
	+ tv->tv_sec = ts->tv_sec;
	+ tv->tv_usec = ts->tv_nsec / 1000;
	}
	-#endif

	-#ifdef INET
	static int
	-tcp_lro_rx_ipv4(struct lro_ctrl lc, struct mbuf m, struct ip *ip4,
	- struct tcphdr **th)
	-{
	- int csum_flags;
	- uint16_t csum;
	-
	- if (ip4->ip_p != IPPROTO_TCP)
	- return (TCP_LRO_NOT_SUPPORTED);
	-
	- /* Ensure there are no options. */
	- if ((ip4->ip_hl << 2) != sizeof (*ip4))
	- return (TCP_LRO_CANNOT);
	-
	- /* .. and the packet is not fragmented. */
	- if (ip4->ip_off & htons(IP_MF\|IP_OFFMASK))
	- return (TCP_LRO_CANNOT);
	-
	- /* Legacy IP has a header checksum that needs to be correct. */
	- csum_flags = m->m_pkthdr.csum_flags;
	- if (csum_flags & CSUM_IP_CHECKED) {
	- if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) {
	- lc->lro_bad_csum++;
	- return (TCP_LRO_CANNOT);
	- }
	- } else {
	- csum = in_cksum_hdr(ip4);
	- if (__predict_false((csum) != 0)) {
	- lc->lro_bad_csum++;
	- return (TCP_LRO_CANNOT);
	- }
	- }
	-
	- /* Find the TCP header (we assured there are no IP options). */
	- th = (struct tcphdr )(ip4 + 1);
	-
	- return (0);
	-}
	-#endif
	-
	-static int
	tcp_lro_rx2(struct lro_ctrl lc, struct mbuf m, uint32_t csum, int use_hash)
	{
	struct lro_entry *le;
	@@ -613,12 +1139,17 @@
	uint32_t *ts_ptr;
	tcp_seq seq;
	int error, ip_len, l;
	- uint16_t eh_type, tcp_data_len;
	+ uint16_t eh_type, tcp_data_len, need_flush;
	struct lro_head *bucket;
	- int force_flush = 0;
	+ struct timespec arrv;

	/* We expect a contiguous header [eh, ip, tcp]. */
	-
	+ if ((m->m_flags & (M_TSTMP_LRO\|M_TSTMP)) == 0) {
	+ /* If no hardware or arrival stamp on the packet add arrival */
	+ nanouptime(&arrv);
	+ m->m_pkthdr.rcv_tstmp = (arrv.tv_sec * 1000000000) + arrv.tv_nsec;
	+ m->m_flags \|= M_TSTMP_LRO;
	+ }
	eh = mtod(m, struct ether_header *);
	eh_type = ntohs(eh->ether_type);
	switch (eh_type) {
	@@ -677,49 +1208,35 @@

	m_adj(m, -l);
	}
	-
	/*
	* Check TCP header constraints.
	*/
	- /* Ensure no bits set besides ACK or PSH. */
	- if ((th->th_flags & ~(TH_ACK \| TH_PUSH)) != 0) {
	- if (th->th_flags & TH_SYN)
	- return (TCP_LRO_CANNOT);
	- /*
	- * Make sure that previously seen segements/ACKs are delivered
	- * before this segement, e.g. FIN.
	- */
	- force_flush = 1;
	- }
	-
	- /* XXX-BZ We lose a ACK\|PUSH flag concatenating multiple segments. */
	- /* XXX-BZ Ideally we'd flush on PUSH? */
	-
	- /*
	- * Check for timestamps.
	- * Since the only option we handle are timestamps, we only have to
	- * handle the simple case of aligned timestamps.
	- */
	+ if (th->th_flags & TH_SYN)
	+ return (TCP_LRO_CANNOT);
	+ if ((th->th_flags & ~(TH_ACK \| TH_PUSH)) != 0)
	+ need_flush = 1;
	+ else
	+ need_flush = 0;
	l = (th->th_off << 2);
	+ ts_ptr = (uint32_t *)(th + 1);
	tcp_data_len -= l;
	l -= sizeof(*th);
	- ts_ptr = (uint32_t *)(th + 1);
	if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) \|\|
	- (*ts_ptr != ntohl(TCPOPT_NOP<<24\|TCPOPT_NOP<<16\|
	- TCPOPT_TIMESTAMP<<8\|TCPOLEN_TIMESTAMP)))) {
	- /*
	- * Make sure that previously seen segements/ACKs are delivered
	- * before this segement.
	+ (*ts_ptr != ntohl(TCPOPT_NOP<<24\|TCPOPT_NOP<<16\|
	+ TCPOPT_TIMESTAMP<<8\|TCPOLEN_TIMESTAMP)))) {
	+ /*
	+ * We have an option besides Timestamps, maybe
	+ * it is a sack (most likely) which means we
	+ * will probably need to wake up a sleeper (if
	+ * the guy does queueing).
	*/
	- force_flush = 1;
	+ need_flush = 2;
	}

	/* If the driver did not pass in the checksum, set it now. */
	if (csum == 0x0000)
	csum = th->th_sum;
	-
	seq = ntohl(th->th_seq);
	-
	if (!use_hash) {
	bucket = &lc->lro_hash[0];
	} else if (M_HASHTYPE_ISHASH(m)) {
	@@ -736,13 +1253,13 @@
	#ifdef INET6
	case ETHERTYPE_IPV6:
	hash = ip6->ip6_src.s6_addr32[0] +
	- ip6->ip6_dst.s6_addr32[0];
	+ ip6->ip6_dst.s6_addr32[0];
	hash += ip6->ip6_src.s6_addr32[1] +
	- ip6->ip6_dst.s6_addr32[1];
	+ ip6->ip6_dst.s6_addr32[1];
	hash += ip6->ip6_src.s6_addr32[2] +
	- ip6->ip6_dst.s6_addr32[2];
	+ ip6->ip6_dst.s6_addr32[2];
	hash += ip6->ip6_src.s6_addr32[3] +
	- ip6->ip6_dst.s6_addr32[3];
	+ ip6->ip6_dst.s6_addr32[3];
	break;
	#endif
	default:
	@@ -764,9 +1281,9 @@
	#ifdef INET6
	case ETHERTYPE_IPV6:
	if (bcmp(&le->source_ip6, &ip6->ip6_src,
	- sizeof(struct in6_addr)) != 0 \|\|
	+ sizeof(struct in6_addr)) != 0 \|\|
	bcmp(&le->dest_ip6, &ip6->ip6_dst,
	- sizeof(struct in6_addr)) != 0)
	+ sizeof(struct in6_addr)) != 0)
	continue;
	break;
	#endif
	@@ -778,108 +1295,34 @@
	break;
	#endif
	}
	-
	- if (force_flush) {
	- /* Timestamps mismatch; this is a FIN, etc */
	- tcp_lro_active_remove(le);
	- tcp_lro_flush(lc, le);
	- return (TCP_LRO_CANNOT);
	- }
	-
	- /* Flush now if appending will result in overflow. */
	- if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
	- tcp_lro_active_remove(le);
	- tcp_lro_flush(lc, le);
	- break;
	- }
	-
	- /* Try to append the new segment. */
	- if (__predict_false(seq != le->next_seq \|\|
	- (tcp_data_len == 0 &&
	- le->ack_seq == th->th_ack &&
	- le->window == th->th_win))) {
	- /* Out of order packet or duplicate ACK. */
	- tcp_lro_active_remove(le);
	- tcp_lro_flush(lc, le);
	- return (TCP_LRO_CANNOT);
	- }
	-
	- if (l != 0) {
	- uint32_t tsval = ntohl(*(ts_ptr + 1));
	- /* Make sure timestamp values are increasing. */
	- /* XXX-BZ flip and use TSTMP_GEQ macro for this? */
	- if (__predict_false(le->tsval > tsval \|\|
	- *(ts_ptr + 2) == 0))
	- return (TCP_LRO_CANNOT);
	- le->tsval = tsval;
	- le->tsecr = *(ts_ptr + 2);
	- }
	- if (tcp_data_len \|\| SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
	- le->next_seq += tcp_data_len;
	- le->ack_seq = th->th_ack;
	- le->window = th->th_win;
	- le->append_cnt++;
	- } else if (th->th_ack == le->ack_seq) {
	- le->window = WIN_MAX(le->window, th->th_win);
	- le->append_cnt++;
	+ if (tcp_data_len \|\| SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq)) \|\|
	+ (th->th_ack == le->ack_seq)) {
	+ m->m_pkthdr.lro_len = tcp_data_len;
	} else {
	/* no data and old ack */
	- le->append_cnt++;
	m_freem(m);
	return (0);
	}
	-#ifdef TCP_LRO_UPDATE_CSUM
	- le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
	- tcp_data_len, ~csum);
	-#endif
	-
	- if (tcp_data_len == 0) {
	- m_freem(m);
	- /*
	- * Flush this LRO entry, if this ACK should not
	- * be further delayed.
	- */
	- if (le->append_cnt >= lc->lro_ackcnt_lim) {
	- tcp_lro_active_remove(le);
	- tcp_lro_flush(lc, le);
	- }
	- return (0);
	- }
	-
	- le->p_len += tcp_data_len;
	-
	- /*
	- * Adjust the mbuf so that m_data points to the first byte of
	- * the ULP payload. Adjust the mbuf to avoid complications and
	- * append new segment to existing mbuf chain.
	- */
	- m_adj(m, m->m_pkthdr.len - tcp_data_len);
	- m_demote_pkthdr(m);
	-
	- le->m_tail->m_next = m;
	- le->m_tail = m_last(m);
	-
	- /*
	- * If a possible next full length packet would cause an
	- * overflow, pro-actively flush now.
	- */
	- if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
	- tcp_lro_active_remove(le);
	- tcp_lro_flush(lc, le);
	- } else
	- getmicrotime(&le->mtime);
	-
	+ if (need_flush)
	+ le->need_wakeup = need_flush;
	+ /* Save of the data only csum */
	+ m->m_pkthdr.rcvif = lc->ifp;
	+ m->m_pkthdr.lro_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th,
	+ tcp_data_len, ~csum);
	+ th->th_sum = csum; /* Restore checksum */
	+ /* Save off the tail I am appending too (prev) */
	+ le->m_prev_last = le->m_last_mbuf;
	+ /* Mark me in the last spot */
	+ le->m_last_mbuf->m_nextpkt = m;
	+ /* Now set the tail to me */
	+ le->m_last_mbuf = m;
	+ le->mbuf_cnt++;
	+ m->m_nextpkt = NULL;
	+ /* Add to the total size of data */
	+ le->tcp_tot_p_len += tcp_data_len;
	+ lro_set_mtime(&le->mtime, &arrv);
	return (0);
	}
	-
	- if (force_flush) {
	- /*
	- * Nothing to flush, but this segment can not be further
	- * aggregated/delayed.
	- */
	- return (TCP_LRO_CANNOT);
	- }
	-
	/* Try to find an empty slot. */
	if (LIST_EMPTY(&lc->lro_free))
	return (TCP_LRO_NO_ENTRIES);
	@@ -888,7 +1331,7 @@
	le = LIST_FIRST(&lc->lro_free);
	LIST_REMOVE(le, next);
	tcp_lro_active_insert(lc, bucket, le);
	- getmicrotime(&le->mtime);
	+ lro_set_mtime(&le->mtime, &arrv);

	/* Start filling in details. */
	switch (eh_type) {
	@@ -910,10 +1353,9 @@
	le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
	break;
	#endif
	- }
	+ }
	le->source_port = th->th_sport;
	le->dest_port = th->th_dport;
	-
	le->next_seq = seq + tcp_data_len;
	le->ack_seq = th->th_ack;
	le->window = th->th_win;
	@@ -922,26 +1364,31 @@
	le->tsval = ntohl(*(ts_ptr + 1));
	le->tsecr = *(ts_ptr + 2);
	}
	-
	-#ifdef TCP_LRO_UPDATE_CSUM
	- /*
	- * Do not touch the csum of the first packet. However save the
	- * "adjusted" checksum of just the source and destination addresses,
	- * the next header and the TCP payload. The length and TCP header
	- * parts may change, so we remove those from the saved checksum and
	- * re-add with final values on tcp_lro_flush() if needed.
	- */
	KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
	- __func__, le, le->ulp_csum));
	+ __func__, le, le->ulp_csum));

	+ le->append_cnt = 0;
	le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
	- ~csum);
	- th->th_sum = csum; /* Restore checksum on first packet. */
	-#endif
	-
	+ ~csum);
	+ le->append_cnt++;
	+ th->th_sum = csum; /* Restore checksum */
	le->m_head = m;
	+ m->m_pkthdr.rcvif = lc->ifp;
	+ le->mbuf_cnt = 1;
	+ if (need_flush)
	+ le->need_wakeup = need_flush;
	+ else
	+ le->need_wakeup = 0;
	le->m_tail = m_last(m);
	-
	+ le->m_last_mbuf = m;
	+ m->m_nextpkt = NULL;
	+ le->m_prev_last = NULL;
	+ /*
	+ * We keep the total size here for cross checking when we may need
	+ * to flush/wakeup in the MBUF_QUEUE case.
	+ */
	+ le->tcp_tot_p_len = tcp_data_len;
	+ m->m_pkthdr.lro_len = tcp_data_len;
	return (0);
	}

	@@ -955,6 +1402,8 @@
	void
	tcp_lro_queue_mbuf(struct lro_ctrl lc, struct mbuf mb)
	{
	+ struct timespec arrv;
	+
	/* sanity checks */
	if (__predict_false(lc->ifp == NULL \|\| lc->lro_mbuf_data == NULL \|\|
	lc->lro_mbuf_max == 0)) {
	@@ -971,7 +1420,15 @@
	(*lc->ifp->if_input) (lc->ifp, mb);
	return;
	}
	+ /* Arrival Stamp the packet */

	+ if ((mb->m_flags & M_TSTMP) == 0) {
	+ /* If no hardware or arrival stamp on the packet add arrival */
	+ nanouptime(&arrv);
	+ mb->m_pkthdr.rcv_tstmp = ((arrv.tv_sec * 1000000000) +
	+ arrv.tv_nsec);
	+ mb->m_flags \|= M_TSTMP_LRO;
	+ }
	/* create sequence number */
	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
	(((uint64_t)M_HASHTYPE_GET(mb)) << 56) \|
	Index: netinet/tcp_stacks/rack_bbr_common.c
	===================================================================
	--- netinet/tcp_stacks/rack_bbr_common.c
	+++ netinet/tcp_stacks/rack_bbr_common.c
	@@ -159,6 +159,65 @@
	}
	#endif

	+
	+/*
	+ * The function ctf_process_inbound_raw() is used by
	+ * transport developers to do the steps needed to
	+ * support MBUF Queuing i.e. the flags in
	+ * inp->inp_flags2:
	+ *
	+ * - INP_SUPPORTS_MBUFQ
	+ * - INP_MBUF_QUEUE_READY
	+ * - INP_DONT_SACK_QUEUE
	+ *
	+ * These flags help control how LRO will deliver
	+ * packets to the transport. You first set in inp_flags2
	+ * the INP_SUPPORTS_MBUFQ to tell the LRO code that you
	+ * will gladly take a queue of packets instead of a compressed
	+ * single packet. You also set in your t_fb pointer the
	+ * tfb_do_queued_segments to point to ctf_process_inbound_raw.
	+ *
	+ * This then gets you lists of inbound ACK's/Data instead
	+ * of a condensed compressed ACK/DATA packet. Why would you
	+ * want that? This will get you access to all the arrival
	+ * times of at least LRO and possibly at the Hardware (if
	+ * the interface card supports that) of the actual ACK/DATA.
	+ * In some transport designs this is important since knowing
	+ * the actual time we got the packet is useful information.
	+ *
	+ * Now there are some interesting Caveats that the transport
	+ * designer needs to take into account when using this feature.
	+ *
	+ * 1) It is used with HPTS and pacing, when the pacing timer
	+ * for output calls it will first call the input.
	+ * 2) When you set INP_MBUF_QUEUE_READY this tells LRO
	+ * queue normal packets, I am busy pacing out data and
	+ * will process the queued packets before my tfb_tcp_output
	+ * call from pacing. If a non-normal packet arrives, (e.g. sack)
	+ * you will be awoken immediately.
	+ * 3) Finally you can add the INP_DONT_SACK_QUEUE to not even
	+ * be awoken if a SACK has arrived. You would do this when
	+ * you were not only running a pacing for output timer
	+ * but a Rack timer as well i.e. you know you are in recovery
	+ * and are in the process (via the timers) of dealing with
	+ * the loss.
	+ *
	+ * Now a critical thing you must be aware of here is that the
	+ * use of the flags has a far greater scope then just your
	+ * typical LRO. Why? Well thats because in the normal compressed
	+ * LRO case at the end of a driver interupt all packets are going
	+ * to get presented to the transport no matter if there is one
	+ * or 100. With the MBUF_QUEUE model, this is not true. You will
	+ * only be awoken to process the queue of packets when:
	+ * a) The flags discussed above allow it.
	+ * <or>
	+ * b) You exceed a ack or data limit (by default the
	+ * ack limit is infinity (64k acks) and the data
	+ * limit is 64k of new TCP data)
	+ * <or>
	+ * c) The push bit has been set by the peer
	+ */
	+
	int
	ctf_process_inbound_raw(struct tcpcb tp, struct socket so, struct mbuf *m, int has_pkt)
	{
	@@ -355,13 +414,7 @@
	* have been called (if we can).
	*/
	m->m_pkthdr.lro_nsegs = 1;
	- if (m->m_flags & M_TSTMP_LRO) {
	- tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
	- tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
	- } else {
	- /* Should not be should we kassert instead? */
	- tcp_get_usecs(&tv);
	- }
	+ tcp_get_usecs(&tv);
	/* Now what about next packet? */
	if (m_save \|\| has_pkt)
	nxt_pkt = 1;
	Index: sys/mbuf.h
	===================================================================
	--- sys/mbuf.h
	+++ sys/mbuf.h
	@@ -199,6 +199,8 @@
	#define lro_nsegs tso_segsz
	#define csum_phsum PH_per.sixteen[2]
	#define csum_data PH_per.thirtytwo[1]
	+#define lro_len PH_per.sixteen[0] /* inbound during LRO */
	+#define lro_csum PH_per.sixteen[1] /* inbound during LRO */
	#define pace_thoff PH_loc.sixteen[0]
	#define pace_tlen PH_loc.sixteen[1]
	#define pace_drphdrlen PH_loc.sixteen[2]
	@@ -392,7 +394,7 @@
	/*
	* mbuf flags of global significance and layer crossing.
	* Those of only protocol/layer specific significance are to be mapped
	- * to M_PROTO[1-12] and cleared at layer handoff boundaries.
	+ * to M_PROTO[1-11] and cleared at layer handoff boundaries.
	* NB: Limited to the lower 24 bits.
	*/
	#define M_EXT 0x00000001 /* has associated external storage */
	@@ -411,18 +413,17 @@
	and 802.1AS) */
	#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */

	-#define M_PROTO1 0x00001000 /* protocol-specific */
	-#define M_PROTO2 0x00002000 /* protocol-specific */
	-#define M_PROTO3 0x00004000 /* protocol-specific */
	-#define M_PROTO4 0x00008000 /* protocol-specific */
	-#define M_PROTO5 0x00010000 /* protocol-specific */
	-#define M_PROTO6 0x00020000 /* protocol-specific */
	-#define M_PROTO7 0x00040000 /* protocol-specific */
	-#define M_PROTO8 0x00080000 /* protocol-specific */
	-#define M_PROTO9 0x00100000 /* protocol-specific */
	-#define M_PROTO10 0x00200000 /* protocol-specific */
	-#define M_PROTO11 0x00400000 /* protocol-specific */
	-#define M_PROTO12 0x00800000 /* protocol-specific */
	+#define M_PROTO1 0x00002000 /* protocol-specific */
	+#define M_PROTO2 0x00004000 /* protocol-specific */
	+#define M_PROTO3 0x00008000 /* protocol-specific */
	+#define M_PROTO4 0x00010000 /* protocol-specific */
	+#define M_PROTO5 0x00020000 /* protocol-specific */
	+#define M_PROTO6 0x00040000 /* protocol-specific */
	+#define M_PROTO7 0x00080000 /* protocol-specific */
	+#define M_PROTO8 0x00100000 /* protocol-specific */
	+#define M_PROTO9 0x00200000 /* protocol-specific */
	+#define M_PROTO10 0x00400000 /* protocol-specific */
	+#define M_PROTO11 0x00800000 /* protocol-specific */

	#define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */

	@@ -431,7 +432,7 @@
	*/
	#define M_PROTOFLAGS \
	(M_PROTO1\|M_PROTO2\|M_PROTO3\|M_PROTO4\|M_PROTO5\|M_PROTO6\|M_PROTO7\|M_PROTO8\|\
	- M_PROTO9\|M_PROTO10\|M_PROTO11\|M_PROTO12)
	+ M_PROTO9\|M_PROTO10\|M_PROTO11)

	/*
	* Flags preserved when copying m_pkthdr.
	@@ -449,7 +450,7 @@
	#define M_FLAG_PROTOBITS \
	"\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
	"\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
	- "\27M_PROTO11\30M_PROTO12"
	+ "\27M_PROTO11"
	#define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS)

	/*

File Metadata

Mime Type: text/plain
Expires: Sat, Mar 14, 6:27 AM (1 h, 50 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 29657105
Default Alt Text: D21127.id61578.diff (38 KB)

D21127.id61578.diffNo OneTemporaryActions

D21127.id61578.diffView Options

File Metadata

Event Timeline

D21127.id61578.diff
No OneTemporary
Actions

D21127.id61578.diff
View Options