Index: head/sys/netinet/tcp_lro.c =================================================================== --- head/sys/netinet/tcp_lro.c (revision 351933) +++ head/sys/netinet/tcp_lro.c (revision 351934) @@ -1,989 +1,1446 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. * * Portions of this software were developed by Bjoern Zeeb * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include #include #include #include #include +#include +#include #include #include #include #include - +#include +#include #include #include static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); #define TCP_LRO_UPDATE_CSUM 1 #ifndef TCP_LRO_UPDATE_CSUM #define TCP_LRO_INVALID_CSUM 0x0000 #endif static void tcp_lro_rx_done(struct lro_ctrl *lc); static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash); SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP LRO"); +static long tcplro_stacks_wanting_mbufq = 0; +counter_u64_t tcp_inp_lro_direct_queue; +counter_u64_t tcp_inp_lro_wokeup_queue; +counter_u64_t tcp_inp_lro_compressed; +counter_u64_t tcp_inp_lro_single_push; +counter_u64_t tcp_inp_lro_locks_taken; +counter_u64_t tcp_inp_lro_sack_wake; + static unsigned tcp_lro_entries = TCP_LRO_ENTRIES; +static int32_t hold_lock_over_compress = 0; +SYSCTL_INT(_net_inet_tcp_lro, OID_AUTO, hold_lock, CTLFLAG_RW, + &hold_lock_over_compress, 0, + "Do we hold the lock over the compress of mbufs?"); SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, "default number of LRO entries"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD, + &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD, + &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD, + &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, single, CTLFLAG_RD, + &tcp_inp_lro_single_push, "Number of lro's sent with single segment"); 
+SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD, + &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken"); +SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, sackwakeups, CTLFLAG_RD, + &tcp_inp_lro_sack_wake, "Number of wakeups caused by sack/fin"); +void +tcp_lro_reg_mbufq(void) +{ + atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1); +} + +void +tcp_lro_dereg_mbufq(void) +{ + atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1); +} + static __inline void tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket, struct lro_entry *le) { LIST_INSERT_HEAD(&lc->lro_active, le, next); LIST_INSERT_HEAD(bucket, le, hash_next); } static __inline void tcp_lro_active_remove(struct lro_entry *le) { LIST_REMOVE(le, next); /* active list */ LIST_REMOVE(le, hash_next); /* hash bucket */ } int tcp_lro_init(struct lro_ctrl *lc) { return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0)); } int tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, unsigned lro_entries, unsigned lro_mbufs) { struct lro_entry *le; size_t size; unsigned i, elements; lc->lro_bad_csum = 0; lc->lro_queued = 0; lc->lro_flushed = 0; lc->lro_mbuf_count = 0; lc->lro_mbuf_max = lro_mbufs; lc->lro_cnt = lro_entries; lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; lc->lro_length_lim = TCP_LRO_LENGTH_MAX; lc->ifp = ifp; LIST_INIT(&lc->lro_free); LIST_INIT(&lc->lro_active); /* create hash table to accelerate entry lookup */ if (lro_entries > lro_mbufs) elements = lro_entries; else elements = lro_mbufs; lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz, HASH_NOWAIT); if (lc->lro_hash == NULL) { memset(lc, 0, sizeof(*lc)); return (ENOMEM); } /* compute size to allocate */ size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) + (lro_entries * sizeof(*le)); lc->lro_mbuf_data = (struct lro_mbuf_sort *) malloc(size, M_LRO, M_NOWAIT | M_ZERO); /* check for out of memory */ if (lc->lro_mbuf_data == NULL) { free(lc->lro_hash, M_LRO); memset(lc, 0, sizeof(*lc)); return 
(ENOMEM); } /* compute offset for LRO entries */ le = (struct lro_entry *) (lc->lro_mbuf_data + lro_mbufs); /* setup linked list */ for (i = 0; i != lro_entries; i++) LIST_INSERT_HEAD(&lc->lro_free, le + i, next); return (0); } +static struct tcphdr * +tcp_lro_get_th(struct lro_entry *le, struct mbuf *m) +{ + struct ether_header *eh; + struct tcphdr *th = NULL; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ +#endif +#ifdef INET + struct ip *ip4 = NULL; /* Keep compiler happy. */ +#endif + + eh = mtod(m, struct ether_header *); + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + ip6 = (struct ip6_hdr *)(eh + 1); + th = (struct tcphdr *)(ip6 + 1); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + ip4 = (struct ip *)(eh + 1); + th = (struct tcphdr *)(ip4 + 1); + break; +#endif + } + return (th); +} + void tcp_lro_free(struct lro_ctrl *lc) { struct lro_entry *le; unsigned x; /* reset LRO free list */ LIST_INIT(&lc->lro_free); /* free active mbufs, if any */ while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); m_freem(le->m_head); } /* free hash table */ free(lc->lro_hash, M_LRO); lc->lro_hash = NULL; lc->lro_hashsz = 0; /* free mbuf array, if any */ for (x = 0; x != lc->lro_mbuf_count; x++) m_freem(lc->lro_mbuf_data[x].mb); lc->lro_mbuf_count = 0; /* free allocated memory, if any */ free(lc->lro_mbuf_data, M_LRO); lc->lro_mbuf_data = NULL; } -#ifdef TCP_LRO_UPDATE_CSUM static uint16_t tcp_lro_csum_th(struct tcphdr *th) { uint32_t ch; uint16_t *p, l; ch = th->th_sum = 0x0000; l = th->th_off; p = (uint16_t *)th; while (l > 0) { ch += *p; p++; ch += *p; p++; l--; } while (ch > 0xffff) ch = (ch >> 16) + (ch & 0xffff); return (ch & 0xffff); } static uint16_t tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, uint16_t tcp_data_len, uint16_t csum) { uint32_t c; uint16_t cs; c = csum; /* Remove length from checksum. 
*/ switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)l3hdr; if (le->append_cnt == 0) cs = ip6->ip6_plen; else { uint32_t cx; cx = ntohs(ip6->ip6_plen); cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); } break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip4; ip4 = (struct ip *)l3hdr; if (le->append_cnt == 0) cs = ip4->ip_len; else { cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), IPPROTO_TCP); cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, htons(cs)); } break; } #endif default: cs = 0; /* Keep compiler happy. */ } cs = ~cs; c += cs; /* Remove TCP header csum. */ cs = ~tcp_lro_csum_th(th); c += cs; while (c > 0xffff) c = (c >> 16) + (c & 0xffff); return (c & 0xffff); } -#endif static void tcp_lro_rx_done(struct lro_ctrl *lc) { struct lro_entry *le; while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } void tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) { struct lro_entry *le, *le_tmp; struct timeval tv; if (LIST_EMPTY(&lc->lro_active)) return; - getmicrotime(&tv); + getmicrouptime(&tv); timevalsub(&tv, timeout); LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { if (timevalcmp(&tv, &le->mtime, >=)) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } } -void -tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) +#ifdef INET6 +static int +tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, + struct tcphdr **th) { - if (le->append_cnt > 0) { + /* XXX-BZ we should check the flow-label. */ + + /* XXX-BZ We do not yet support ext. hdrs. */ + if (ip6->ip6_nxt != IPPROTO_TCP) + return (TCP_LRO_NOT_SUPPORTED); + + /* Find the TCP header. 
*/ + *th = (struct tcphdr *)(ip6 + 1); + + return (0); +} +#endif + +#ifdef INET +static int +tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, + struct tcphdr **th) +{ + int csum_flags; + uint16_t csum; + + if (ip4->ip_p != IPPROTO_TCP) + return (TCP_LRO_NOT_SUPPORTED); + + /* Ensure there are no options. */ + if ((ip4->ip_hl << 2) != sizeof (*ip4)) + return (TCP_LRO_CANNOT); + + /* .. and the packet is not fragmented. */ + if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) + return (TCP_LRO_CANNOT); + + /* Legacy IP has a header checksum that needs to be correct. */ + csum_flags = m->m_pkthdr.csum_flags; + if (csum_flags & CSUM_IP_CHECKED) { + if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { + lc->lro_bad_csum++; + return (TCP_LRO_CANNOT); + } + } else { + csum = in_cksum_hdr(ip4); + if (__predict_false((csum) != 0)) { + lc->lro_bad_csum++; + return (TCP_LRO_CANNOT); + } + } + /* Find the TCP header (we assured there are no IP options). */ + *th = (struct tcphdr *)(ip4 + 1); + return (0); +} +#endif + +static void +tcp_lro_log(struct tcpcb *tp, struct lro_ctrl *lc, + struct lro_entry *le, struct mbuf *m, int frm, int32_t tcp_data_len, + uint32_t th_seq , uint32_t th_ack, uint16_t th_win) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + uint32_t cts; + + cts = tcp_get_usecs(&tv); + memset(&log, 0, sizeof(union tcp_log_stackspecific)); + log.u_bbr.flex8 = frm; + log.u_bbr.flex1 = tcp_data_len; + if (m) + log.u_bbr.flex2 = m->m_pkthdr.len; + else + log.u_bbr.flex2 = 0; + log.u_bbr.flex3 = le->append_cnt; + log.u_bbr.flex4 = le->p_len; + log.u_bbr.flex5 = le->m_head->m_pkthdr.len; + log.u_bbr.delRate = le->m_head->m_flags; + log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp; + log.u_bbr.flex6 = lc->lro_length_lim; + log.u_bbr.flex7 = lc->lro_ackcnt_lim; + log.u_bbr.inflight = th_seq; + log.u_bbr.timeStamp = cts; + log.u_bbr.epoch = le->next_seq; + log.u_bbr.delivered = th_ack; + 
log.u_bbr.lt_epoch = le->ack_seq; + log.u_bbr.pacing_gain = th_win; + log.u_bbr.cwnd_gain = le->window; + log.u_bbr.cur_del_rate = (uint64_t)m; + log.u_bbr.bw_inuse = (uint64_t)le->m_head; + log.u_bbr.pkts_out = le->mbuf_cnt; /* Total mbufs added */ + log.u_bbr.applimited = le->ulp_csum; + log.u_bbr.lost = le->mbuf_appended; + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + TCP_LOG_LRO, 0, + 0, &log, false, &tv); + } +} + +static void +tcp_flush_out_le(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, int locked) +{ + if (le->append_cnt > 1) { struct tcphdr *th; uint16_t p_len; p_len = htons(le->p_len); switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6; ip6 = le->le_ip6; ip6->ip6_plen = p_len; th = (struct tcphdr *)(ip6 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; le->p_len += ETHER_HDR_LEN + sizeof(*ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip4; -#ifdef TCP_LRO_UPDATE_CSUM uint32_t cl; uint16_t c; -#endif ip4 = le->le_ip4; -#ifdef TCP_LRO_UPDATE_CSUM /* Fix IP header checksum for new length. */ c = ~ip4->ip_sum; cl = c; c = ~ip4->ip_len; cl += c + p_len; while (cl > 0xffff) cl = (cl >> 16) + (cl & 0xffff); c = cl; ip4->ip_sum = ~c; -#else - ip4->ip_sum = TCP_LRO_INVALID_CSUM; -#endif ip4->ip_len = p_len; th = (struct tcphdr *)(ip4 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; le->p_len += ETHER_HDR_LEN; break; } #endif default: th = NULL; /* Keep compiler happy. */ } le->m_head->m_pkthdr.csum_data = 0xffff; le->m_head->m_pkthdr.len = le->p_len; /* Incorporate the latest ACK into the TCP header. */ th->th_ack = le->ack_seq; th->th_win = le->window; /* Incorporate latest timestamp into the TCP header. 
*/ if (le->timestamp != 0) { uint32_t *ts_ptr; ts_ptr = (uint32_t *)(th + 1); ts_ptr[1] = htonl(le->tsval); ts_ptr[2] = le->tsecr; } -#ifdef TCP_LRO_UPDATE_CSUM /* Update the TCP header checksum. */ le->ulp_csum += p_len; le->ulp_csum += tcp_lro_csum_th(th); while (le->ulp_csum > 0xffff) le->ulp_csum = (le->ulp_csum >> 16) + (le->ulp_csum & 0xffff); th->th_sum = (le->ulp_csum & 0xffff); th->th_sum = ~th->th_sum; -#else - th->th_sum = TCP_LRO_INVALID_CSUM; + if (tp && locked) { + tcp_lro_log(tp, lc, le, NULL, 7, 0, 0, 0, 0); + } + } + /* + * Break any chain, this is not set to NULL on the singleton + * case m_nextpkt points to m_head. Other case set them + * m_nextpkt to NULL in push_and_replace. + */ + le->m_head->m_nextpkt = NULL; + le->m_head->m_pkthdr.lro_nsegs = le->append_cnt; + if (tp && locked) { + tcp_lro_log(tp, lc, le, le->m_head, 8, 0, 0, 0, 0); + } + (*lc->ifp->if_input)(lc->ifp, le->m_head); + lc->lro_queued += le->append_cnt; +} + +static void +tcp_set_le_to_m(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m) +{ + struct ether_header *eh; + void *l3hdr = NULL; /* Keep compiler happy. */ + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif +#ifdef INET + struct ip *ip4 = NULL; /* Keep compiler happy. */ +#endif + uint32_t *ts_ptr; + int error, l, ts_failed = 0; + uint16_t tcp_data_len; + uint16_t csum; + + error = -1; + eh = mtod(m, struct ether_header *); + /* + * We must reset the other pointers since the mbuf + * we were pointing too is about to go away. 
+ */ + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); + error = tcp_lro_rx_ipv6(lc, m, ip6, &th); + le->le_ip6 = ip6; + le->source_ip6 = ip6->ip6_src; + le->dest_ip6 = ip6->ip6_dst; + le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + l3hdr = ip4 = (struct ip *)(eh + 1); + error = tcp_lro_rx_ipv4(lc, m, ip4, &th); + le->le_ip4 = ip4; + le->source_ip4 = ip4->ip_src.s_addr; + le->dest_ip4 = ip4->ip_dst.s_addr; + le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; + break; +#endif } + KASSERT(error == 0, ("%s: le=%p tcp_lro_rx_xxx failed\n", + __func__, le)); + ts_ptr = (uint32_t *)(th + 1); + l = (th->th_off << 2); + l -= sizeof(*th); + if (l != 0 && + (__predict_false(l != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* We have failed to find a timestamp some other option? */ + ts_failed = 1; + } + if ((l != 0) && (ts_failed == 0)) { + le->timestamp = 1; + le->tsval = ntohl(*(ts_ptr + 1)); + le->tsecr = *(ts_ptr + 2); + } else + le->timestamp = 0; + le->source_port = th->th_sport; + le->dest_port = th->th_dport; + /* Pull out the csum */ + tcp_data_len = m->m_pkthdr.lro_len; + le->next_seq = ntohl(th->th_seq) + tcp_data_len; + le->ack_seq = th->th_ack; + le->window = th->th_win; + csum = th->th_sum; + /* Setup the data pointers */ + le->m_head = m; + le->m_tail = m_last(m); + le->append_cnt = 0; + le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, + ~csum); + le->append_cnt++; + th->th_sum = csum; /* Restore checksum on first packet. 
*/ +} - le->m_head->m_pkthdr.lro_nsegs = le->append_cnt + 1; - (*lc->ifp->if_input)(lc->ifp, le->m_head); - lc->lro_queued += le->append_cnt + 1; +static void +tcp_push_and_replace(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m, int locked) +{ + /* + * Push up the stack the current le and replace + * it with m. + */ + struct mbuf *msave; + + /* Grab off the next and save it */ + msave = le->m_head->m_nextpkt; + le->m_head->m_nextpkt = NULL; + /* Now push out the old le entry */ + tcp_flush_out_le(tp, lc, le, locked); + /* + * Now to replace the data properly in the le + * we have to reset the tcp header and + * other fields. + */ + tcp_set_le_to_m(lc, le, m); + /* Restore the next list */ + m->m_nextpkt = msave; +} + +static void +tcp_lro_condense(struct tcpcb *tp, struct lro_ctrl *lc, struct lro_entry *le, int locked) +{ + /* + * Walk through the mbuf chain we + * have on tap and compress/condense + * as required. + */ + uint32_t *ts_ptr; + struct mbuf *m; + struct tcphdr *th; + uint16_t tcp_data_len, csum_upd; + int l; + + /* + * First we must check the lead (m_head) + * we must make sure that it is *not* + * something that should be sent up + * right away (sack etc). + */ +again: + + m = le->m_head->m_nextpkt; + if (m == NULL) { + /* Just the one left */ + return; + } + th = tcp_lro_get_th(le, le->m_head); + KASSERT(th != NULL, + ("le:%p m:%p th comes back NULL?", le, le->m_head)); + l = (th->th_off << 2); + l -= sizeof(*th); + ts_ptr = (uint32_t *)(th + 1); + if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* + * Its not the timestamp. We can't + * use this guy as the head. 
+ */ + le->m_head->m_nextpkt = m->m_nextpkt; + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { + /* + * Make sure that previously seen segements/ACKs are delivered + * before this segment, e.g. FIN. + */ + le->m_head->m_nextpkt = m->m_nextpkt; + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + while((m = le->m_head->m_nextpkt) != NULL) { + /* + * condense m into le, first + * pull m out of the list. + */ + le->m_head->m_nextpkt = m->m_nextpkt; + m->m_nextpkt = NULL; + /* Setup my data */ + tcp_data_len = m->m_pkthdr.lro_len; + th = tcp_lro_get_th(le, m); + KASSERT(th != NULL, + ("le:%p m:%p th comes back NULL?", le, m)); + ts_ptr = (uint32_t *)(th + 1); + l = (th->th_off << 2); + l -= sizeof(*th); + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 1, 0, 0, 0, 0); + } + if (le->append_cnt >= lc->lro_ackcnt_lim) { + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 2, 0, 0, 0, 0); + } + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { + /* Flush now if appending will result in overflow. */ + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 3, tcp_data_len, 0, 0, 0); + } + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* + * Maybe a sack in the new one? We need to + * start all over after flushing the + * current le. We will go up to the beginning + * and flush it (calling the replace again possibly + * or just returning). + */ + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if (l != 0) { + uint32_t tsval = ntohl(*(ts_ptr + 1)); + /* Make sure timestamp values are increasing. 
*/ + if (TSTMP_GT(le->tsval, tsval)) { + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + le->tsval = tsval; + le->tsecr = *(ts_ptr + 2); + } + /* Try to append the new segment. */ + if (__predict_false(ntohl(th->th_seq) != le->next_seq || + (tcp_data_len == 0 && + le->ack_seq == th->th_ack && + le->window == th->th_win))) { + /* Out of order packet or duplicate ACK. */ + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 4, tcp_data_len, + ntohl(th->th_seq), + th->th_ack, + th->th_win); + } + tcp_push_and_replace(tp, lc, le, m, locked); + goto again; + } + if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) { + le->next_seq += tcp_data_len; + le->ack_seq = th->th_ack; + le->window = th->th_win; + } else if (th->th_ack == le->ack_seq) { + le->window = WIN_MAX(le->window, th->th_win); + } + csum_upd = m->m_pkthdr.lro_csum; + le->ulp_csum += csum_upd; + if (tcp_data_len == 0) { + le->append_cnt++; + le->mbuf_cnt--; + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 5, tcp_data_len, + ntohl(th->th_seq), + th->th_ack, + th->th_win); + } + m_freem(m); + continue; + } + le->append_cnt++; + le->mbuf_appended++; + le->p_len += tcp_data_len; + /* + * Adjust the mbuf so that m_data points to the first byte of + * the ULP payload. Adjust the mbuf to avoid complications and + * append new segment to existing mbuf chain. 
+ */ + m_adj(m, m->m_pkthdr.len - tcp_data_len); + if (tp && locked) { + tcp_lro_log(tp, lc, le, m, 6, tcp_data_len, + ntohl(th->th_seq), + th->th_ack, + th->th_win); + } + m_demote_pkthdr(m); + le->m_tail->m_next = m; + le->m_tail = m_last(m); + } +} + +static void +tcp_queue_pkts(struct tcpcb *tp, struct lro_entry *le) +{ + if (tp->t_in_pkt == NULL) { + /* Nothing yet there */ + tp->t_in_pkt = le->m_head; + tp->t_tail_pkt = le->m_last_mbuf; + } else { + /* Already some there */ + tp->t_tail_pkt->m_nextpkt = le->m_head; + tp->t_tail_pkt = le->m_last_mbuf; + } + le->m_head = NULL; + le->m_last_mbuf = NULL; +} + +void +tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) +{ + struct tcpcb *tp = NULL; + int locked = 0; +#ifdef TCPHPTS + struct inpcb *inp = NULL; + int need_wakeup = 0, can_queue = 0; + struct epoch_tracker et; + + /* Now lets lookup the inp first */ + CURVNET_SET(lc->ifp->if_vnet); + if (tcplro_stacks_wanting_mbufq == 0) + goto skip_lookup; + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + switch (le->eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + inp = in6_pcblookup(&V_tcbinfo, &le->source_ip6, + le->source_port, &le->dest_ip6,le->dest_port, + INPLOOKUP_WLOCKPCB, + lc->ifp); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + inp = in_pcblookup(&V_tcbinfo, le->le_ip4->ip_src, + le->source_port, le->le_ip4->ip_dst, le->dest_port, + INPLOOKUP_WLOCKPCB, + lc->ifp); + break; +#endif + } + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || + (inp->inp_flags2 & INP_FREED))) { + /* We don't want this guy */ + INP_WUNLOCK(inp); + inp = NULL; + } + if (inp && (inp->inp_flags2 & INP_SUPPORTS_MBUFQ)) { + /* The transport supports mbuf queuing */ + can_queue = 1; + if (le->need_wakeup || + ((inp->inp_in_input == 0) && + ((inp->inp_flags2 & INP_MBUF_QUEUE_READY) == 0))) { + /* + * Either the transport is off on a keep-alive + * (it has the queue_ready flag clear and its + * not already been woken) or the entry has + * 
some urgent thing (FIN or possibly SACK blocks). + * This means we need to wake the transport up by + * putting it on the input pacer. + */ + need_wakeup = 1; + if ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) && + (le->need_wakeup != 1)) { + /* + * Prohibited from a sack wakeup. + */ + need_wakeup = 0; + } + } + /* Do we need to be awoken due to lots of data or acks? */ + if ((le->tcp_tot_p_len >= lc->lro_length_lim) || + (le->mbuf_cnt >= lc->lro_ackcnt_lim)) + need_wakeup = 1; + } + if (inp) { + tp = intotcpcb(inp); + locked = 1; + } else + tp = NULL; + if (can_queue) { + counter_u64_add(tcp_inp_lro_direct_queue, 1); + tcp_lro_log(tp, lc, le, NULL, 22, need_wakeup, + inp->inp_flags2, inp->inp_in_input, le->need_wakeup); + tcp_queue_pkts(tp, le); + if (need_wakeup) { + /* + * We must get the guy to wakeup via + * hpts. + */ + counter_u64_add(tcp_inp_lro_wokeup_queue, 1); + if (le->need_wakeup) + counter_u64_add(tcp_inp_lro_sack_wake, 1); + tcp_queue_to_input(inp); + } + } + if (inp && (hold_lock_over_compress == 0)) { + /* Unlock it */ + locked = 0; + tp = NULL; + counter_u64_add(tcp_inp_lro_locks_taken, 1); + INP_WUNLOCK(inp); + } + if (can_queue == 0) { +skip_lookup: +#endif + /* Old fashioned lro method */ + if (le->m_head != le->m_last_mbuf) { + counter_u64_add(tcp_inp_lro_compressed, 1); + tcp_lro_condense(tp, lc, le, locked); + } else + counter_u64_add(tcp_inp_lro_single_push, 1); + tcp_flush_out_le(tp, lc, le, locked); +#ifdef TCPHPTS + } + if (inp && locked) { + counter_u64_add(tcp_inp_lro_locks_taken, 1); + INP_WUNLOCK(inp); + } + CURVNET_RESTORE(); +#endif lc->lro_flushed++; bzero(le, sizeof(*le)); LIST_INSERT_HEAD(&lc->lro_free, le, next); } #ifdef HAVE_INLINE_FLSLL #define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1)) #else static inline uint64_t tcp_lro_msb_64(uint64_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); return (x & ~(x >> 1)); } #endif /* * The tcp_lro_sort() routine is comparable to qsort(), 
except it has * a worst case complexity limit of O(MIN(N,64)*N), where N is the * number of elements to sort and 64 is the number of sequence bits * available. The algorithm is bit-slicing the 64-bit sequence number, * sorting one bit at a time from the most significant bit until the * least significant one, skipping the constant bits. This is * typically called a radix sort. */ static void tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size) { struct lro_mbuf_sort temp; uint64_t ones; uint64_t zeros; uint32_t x; uint32_t y; repeat: /* for small arrays insertion sort is faster */ if (size <= 12) { for (x = 1; x < size; x++) { temp = parray[x]; for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) parray[y] = parray[y - 1]; parray[y] = temp; } return; } /* compute sequence bits which are constant */ ones = 0; zeros = 0; for (x = 0; x != size; x++) { ones |= parray[x].seq; zeros |= ~parray[x].seq; } /* compute bits which are not constant into "ones" */ ones &= zeros; if (ones == 0) return; /* pick the most significant bit which is not constant */ ones = tcp_lro_msb_64(ones); /* * Move entries having cleared sequence bits to the beginning * of the array: */ for (x = y = 0; y != size; y++) { /* skip set bits */ if (parray[y].seq & ones) continue; /* swap entries */ temp = parray[x]; parray[x] = parray[y]; parray[y] = temp; x++; } KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); /* sort zeros */ tcp_lro_sort(parray, x); /* sort ones */ parray += x; size -= x; goto repeat; } void tcp_lro_flush_all(struct lro_ctrl *lc) { uint64_t seq; uint64_t nseq; unsigned x; /* check if no mbufs to flush */ if (lc->lro_mbuf_count == 0) goto done; /* sort all mbufs according to stream */ tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); /* input data into LRO engine, stream by stream */ seq = 0; for (x = 0; x != lc->lro_mbuf_count; x++) { struct mbuf *mb; /* get mbuf */ mb = lc->lro_mbuf_data[x].mb; /* get sequence number, masking away the packet index */ nseq = 
lc->lro_mbuf_data[x].seq & (-1ULL << 24); /* check for new stream */ if (seq != nseq) { seq = nseq; /* flush active streams */ tcp_lro_rx_done(lc); } /* add packet to LRO engine */ if (tcp_lro_rx2(lc, mb, 0, 0) != 0) { /* input packet to network layer */ (*lc->ifp->if_input)(lc->ifp, mb); lc->lro_queued++; lc->lro_flushed++; } } done: /* flush active streams */ tcp_lro_rx_done(lc); lc->lro_mbuf_count = 0; } -#ifdef INET6 -static int -tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, - struct tcphdr **th) +static void +lro_set_mtime(struct timeval *tv, struct timespec *ts) { - - /* XXX-BZ we should check the flow-label. */ - - /* XXX-BZ We do not yet support ext. hdrs. */ - if (ip6->ip6_nxt != IPPROTO_TCP) - return (TCP_LRO_NOT_SUPPORTED); - - /* Find the TCP header. */ - *th = (struct tcphdr *)(ip6 + 1); - - return (0); + tv->tv_sec = ts->tv_sec; + tv->tv_usec = ts->tv_nsec / 1000; } -#endif -#ifdef INET static int -tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, - struct tcphdr **th) -{ - int csum_flags; - uint16_t csum; - - if (ip4->ip_p != IPPROTO_TCP) - return (TCP_LRO_NOT_SUPPORTED); - - /* Ensure there are no options. */ - if ((ip4->ip_hl << 2) != sizeof (*ip4)) - return (TCP_LRO_CANNOT); - - /* .. and the packet is not fragmented. */ - if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) - return (TCP_LRO_CANNOT); - - /* Legacy IP has a header checksum that needs to be correct. */ - csum_flags = m->m_pkthdr.csum_flags; - if (csum_flags & CSUM_IP_CHECKED) { - if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { - lc->lro_bad_csum++; - return (TCP_LRO_CANNOT); - } - } else { - csum = in_cksum_hdr(ip4); - if (__predict_false((csum) != 0)) { - lc->lro_bad_csum++; - return (TCP_LRO_CANNOT); - } - } - - /* Find the TCP header (we assured there are no IP options). 
*/ - *th = (struct tcphdr *)(ip4 + 1); - - return (0); -} -#endif - -static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) { struct lro_entry *le; struct ether_header *eh; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip4 = NULL; /* Keep compiler happy. */ #endif struct tcphdr *th; void *l3hdr = NULL; /* Keep compiler happy. */ uint32_t *ts_ptr; tcp_seq seq; int error, ip_len, l; - uint16_t eh_type, tcp_data_len; + uint16_t eh_type, tcp_data_len, need_flush; struct lro_head *bucket; - int force_flush = 0; + struct timespec arrv; /* We expect a contiguous header [eh, ip, tcp]. */ - + if ((m->m_flags & (M_TSTMP_LRO|M_TSTMP)) == 0) { + /* If no hardware or arrival stamp on the packet add arrival */ + nanouptime(&arrv); + m->m_pkthdr.rcv_tstmp = (arrv.tv_sec * 1000000000) + arrv.tv_nsec; + m->m_flags |= M_TSTMP_LRO; + } eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { CURVNET_SET(lc->ifp->if_vnet); if (V_ip6_forwarding != 0) { /* XXX-BZ stats but changing lro_ctrl is a problem. */ CURVNET_RESTORE(); return (TCP_LRO_CANNOT); } CURVNET_RESTORE(); l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); error = tcp_lro_rx_ipv6(lc, m, ip6, &th); if (error != 0) return (error); tcp_data_len = ntohs(ip6->ip6_plen); ip_len = sizeof(*ip6) + tcp_data_len; break; } #endif #ifdef INET case ETHERTYPE_IP: { CURVNET_SET(lc->ifp->if_vnet); if (V_ipforwarding != 0) { /* XXX-BZ stats but changing lro_ctrl is a problem. */ CURVNET_RESTORE(); return (TCP_LRO_CANNOT); } CURVNET_RESTORE(); l3hdr = ip4 = (struct ip *)(eh + 1); error = tcp_lro_rx_ipv4(lc, m, ip4, &th); if (error != 0) return (error); ip_len = ntohs(ip4->ip_len); tcp_data_len = ip_len - sizeof(*ip4); break; } #endif /* XXX-BZ what happens in case of VLAN(s)? 
*/ default: return (TCP_LRO_NOT_SUPPORTED); } /* * If the frame is padded beyond the end of the IP packet, then we must * trim the extra bytes off. */ l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); if (l != 0) { if (l < 0) /* Truncated packet. */ return (TCP_LRO_CANNOT); m_adj(m, -l); } - /* * Check TCP header constraints. */ - /* Ensure no bits set besides ACK or PSH. */ - if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { - if (th->th_flags & TH_SYN) - return (TCP_LRO_CANNOT); - /* - * Make sure that previously seen segements/ACKs are delivered - * before this segement, e.g. FIN. - */ - force_flush = 1; - } - - /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */ - /* XXX-BZ Ideally we'd flush on PUSH? */ - - /* - * Check for timestamps. - * Since the only option we handle are timestamps, we only have to - * handle the simple case of aligned timestamps. - */ + if (th->th_flags & TH_SYN) + return (TCP_LRO_CANNOT); + if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) + need_flush = 1; + else + need_flush = 0; l = (th->th_off << 2); + ts_ptr = (uint32_t *)(th + 1); tcp_data_len -= l; l -= sizeof(*th); - ts_ptr = (uint32_t *)(th + 1); if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || - (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| - TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { - /* - * Make sure that previously seen segements/ACKs are delivered - * before this segement. + (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| + TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) { + /* + * We have an option besides Timestamps, maybe + * it is a sack (most likely) which means we + * will probably need to wake up a sleeper (if + * the guy does queueing). */ - force_flush = 1; + need_flush = 2; } /* If the driver did not pass in the checksum, set it now. 
*/ if (csum == 0x0000) csum = th->th_sum; - seq = ntohl(th->th_seq); - if (!use_hash) { bucket = &lc->lro_hash[0]; } else if (M_HASHTYPE_ISHASH(m)) { bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz]; } else { uint32_t hash; switch (eh_type) { #ifdef INET case ETHERTYPE_IP: hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: hash = ip6->ip6_src.s6_addr32[0] + - ip6->ip6_dst.s6_addr32[0]; + ip6->ip6_dst.s6_addr32[0]; hash += ip6->ip6_src.s6_addr32[1] + - ip6->ip6_dst.s6_addr32[1]; + ip6->ip6_dst.s6_addr32[1]; hash += ip6->ip6_src.s6_addr32[2] + - ip6->ip6_dst.s6_addr32[2]; + ip6->ip6_dst.s6_addr32[2]; hash += ip6->ip6_src.s6_addr32[3] + - ip6->ip6_dst.s6_addr32[3]; + ip6->ip6_dst.s6_addr32[3]; break; #endif default: hash = 0; break; } hash += th->th_sport + th->th_dport; bucket = &lc->lro_hash[hash % lc->lro_hashsz]; } /* Try to find a matching previous segment. */ LIST_FOREACH(le, bucket, hash_next) { if (le->eh_type != eh_type) continue; if (le->source_port != th->th_sport || le->dest_port != th->th_dport) continue; switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: if (bcmp(&le->source_ip6, &ip6->ip6_src, - sizeof(struct in6_addr)) != 0 || + sizeof(struct in6_addr)) != 0 || bcmp(&le->dest_ip6, &ip6->ip6_dst, - sizeof(struct in6_addr)) != 0) + sizeof(struct in6_addr)) != 0) continue; break; #endif #ifdef INET case ETHERTYPE_IP: if (le->source_ip4 != ip4->ip_src.s_addr || le->dest_ip4 != ip4->ip_dst.s_addr) continue; break; #endif } - - if (force_flush) { - /* Timestamps mismatch; this is a FIN, etc */ - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - return (TCP_LRO_CANNOT); - } - - /* Flush now if appending will result in overflow. */ - if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - break; - } - - /* Try to append the new segment. 
*/ - if (__predict_false(seq != le->next_seq || - (tcp_data_len == 0 && - le->ack_seq == th->th_ack && - le->window == th->th_win))) { - /* Out of order packet or duplicate ACK. */ - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - return (TCP_LRO_CANNOT); - } - - if (l != 0) { - uint32_t tsval = ntohl(*(ts_ptr + 1)); - /* Make sure timestamp values are increasing. */ - /* XXX-BZ flip and use TSTMP_GEQ macro for this? */ - if (__predict_false(le->tsval > tsval || - *(ts_ptr + 2) == 0)) - return (TCP_LRO_CANNOT); - le->tsval = tsval; - le->tsecr = *(ts_ptr + 2); - } - if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) { - le->next_seq += tcp_data_len; - le->ack_seq = th->th_ack; - le->window = th->th_win; - le->append_cnt++; - } else if (th->th_ack == le->ack_seq) { - le->window = WIN_MAX(le->window, th->th_win); - le->append_cnt++; + if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq)) || + (th->th_ack == le->ack_seq)) { + m->m_pkthdr.lro_len = tcp_data_len; } else { /* no data and old ack */ - le->append_cnt++; m_freem(m); return (0); } -#ifdef TCP_LRO_UPDATE_CSUM - le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th, - tcp_data_len, ~csum); -#endif - - if (tcp_data_len == 0) { - m_freem(m); - /* - * Flush this LRO entry, if this ACK should not - * be further delayed. - */ - if (le->append_cnt >= lc->lro_ackcnt_lim) { - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - } - return (0); - } - - le->p_len += tcp_data_len; - - /* - * Adjust the mbuf so that m_data points to the first byte of - * the ULP payload. Adjust the mbuf to avoid complications and - * append new segment to existing mbuf chain. - */ - m_adj(m, m->m_pkthdr.len - tcp_data_len); - m_demote_pkthdr(m); - - le->m_tail->m_next = m; - le->m_tail = m_last(m); - - /* - * If a possible next full length packet would cause an - * overflow, pro-actively flush now. 
- */ - if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) { - tcp_lro_active_remove(le); - tcp_lro_flush(lc, le); - } else - getmicrotime(&le->mtime); - + if (need_flush) + le->need_wakeup = need_flush; + /* Save of the data only csum */ + m->m_pkthdr.rcvif = lc->ifp; + m->m_pkthdr.lro_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, + tcp_data_len, ~csum); + th->th_sum = csum; /* Restore checksum */ + /* Save off the tail I am appending too (prev) */ + le->m_prev_last = le->m_last_mbuf; + /* Mark me in the last spot */ + le->m_last_mbuf->m_nextpkt = m; + /* Now set the tail to me */ + le->m_last_mbuf = m; + le->mbuf_cnt++; + m->m_nextpkt = NULL; + /* Add to the total size of data */ + le->tcp_tot_p_len += tcp_data_len; + lro_set_mtime(&le->mtime, &arrv); return (0); } - - if (force_flush) { - /* - * Nothing to flush, but this segment can not be further - * aggregated/delayed. - */ - return (TCP_LRO_CANNOT); - } - /* Try to find an empty slot. */ if (LIST_EMPTY(&lc->lro_free)) return (TCP_LRO_NO_ENTRIES); /* Start a new segment chain. */ le = LIST_FIRST(&lc->lro_free); LIST_REMOVE(le, next); tcp_lro_active_insert(lc, bucket, le); - getmicrotime(&le->mtime); + lro_set_mtime(&le->mtime, &arrv); /* Start filling in details. 
*/ switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: le->le_ip6 = ip6; le->source_ip6 = ip6->ip6_src; le->dest_ip6 = ip6->ip6_dst; le->eh_type = eh_type; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); break; #endif #ifdef INET case ETHERTYPE_IP: le->le_ip4 = ip4; le->source_ip4 = ip4->ip_src.s_addr; le->dest_ip4 = ip4->ip_dst.s_addr; le->eh_type = eh_type; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; break; #endif - } + } le->source_port = th->th_sport; le->dest_port = th->th_dport; - le->next_seq = seq + tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; if (l != 0) { le->timestamp = 1; le->tsval = ntohl(*(ts_ptr + 1)); le->tsecr = *(ts_ptr + 2); } - -#ifdef TCP_LRO_UPDATE_CSUM - /* - * Do not touch the csum of the first packet. However save the - * "adjusted" checksum of just the source and destination addresses, - * the next header and the TCP payload. The length and TCP header - * parts may change, so we remove those from the saved checksum and - * re-add with final values on tcp_lro_flush() if needed. - */ KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", - __func__, le, le->ulp_csum)); + __func__, le, le->ulp_csum)); + le->append_cnt = 0; le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, - ~csum); - th->th_sum = csum; /* Restore checksum on first packet. */ -#endif - + ~csum); + le->append_cnt++; + th->th_sum = csum; /* Restore checksum */ le->m_head = m; + m->m_pkthdr.rcvif = lc->ifp; + le->mbuf_cnt = 1; + if (need_flush) + le->need_wakeup = need_flush; + else + le->need_wakeup = 0; le->m_tail = m_last(m); - + le->m_last_mbuf = m; + m->m_nextpkt = NULL; + le->m_prev_last = NULL; + /* + * We keep the total size here for cross checking when we may need + * to flush/wakeup in the MBUF_QUEUE case. 
+ */ + le->tcp_tot_p_len = tcp_data_len; + m->m_pkthdr.lro_len = tcp_data_len; return (0); } int tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) { return tcp_lro_rx2(lc, m, csum, 1); } void tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) { + struct timespec arrv; + /* sanity checks */ if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || lc->lro_mbuf_max == 0)) { /* packet drop */ m_freem(mb); return; } /* check if packet is not LRO capable */ if (__predict_false(mb->m_pkthdr.csum_flags == 0 || (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { /* input packet to network layer */ (*lc->ifp->if_input) (lc->ifp, mb); return; } + /* Arrival Stamp the packet */ + if ((mb->m_flags & M_TSTMP) == 0) { + /* If no hardware or arrival stamp on the packet add arrival */ + nanouptime(&arrv); + mb->m_pkthdr.rcv_tstmp = ((arrv.tv_sec * 1000000000) + + arrv.tv_nsec); + mb->m_flags |= M_TSTMP_LRO; + } /* create sequence number */ lc->lro_mbuf_data[lc->lro_mbuf_count].seq = (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | (((uint64_t)mb->m_pkthdr.flowid) << 24) | ((uint64_t)lc->lro_mbuf_count); /* enter mbuf */ lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; /* flush if array is full */ if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max)) tcp_lro_flush_all(lc); } /* end */ Index: head/sys/netinet/tcp_lro.h =================================================================== --- head/sys/netinet/tcp_lro.h (revision 351933) +++ head/sys/netinet/tcp_lro.h (revision 351934) @@ -1,123 +1,139 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _TCP_LRO_H_ #define _TCP_LRO_H_ #include #ifndef TCP_LRO_ENTRIES /* Define default number of LRO entries per RX queue */ #define TCP_LRO_ENTRIES 8 #endif struct lro_entry { LIST_ENTRY(lro_entry) next; LIST_ENTRY(lro_entry) hash_next; struct mbuf *m_head; struct mbuf *m_tail; + struct mbuf *m_last_mbuf; + struct mbuf *m_prev_last; union { struct ip *ip4; struct ip6_hdr *ip6; } leip; union { in_addr_t s_ip4; struct in6_addr s_ip6; } lesource; union { in_addr_t d_ip4; struct in6_addr d_ip6; } ledest; uint16_t source_port; uint16_t dest_port; uint16_t eh_type; /* EthernetHeader type. */ uint16_t append_cnt; uint32_t p_len; /* IP header payload length. */ uint32_t ulp_csum; /* TCP, etc. checksum. 
*/ uint32_t next_seq; /* tcp_seq */ uint32_t ack_seq; /* tcp_seq */ uint32_t tsval; uint32_t tsecr; + uint32_t tcp_tot_p_len; /* TCP payload length of chain */ uint16_t window; uint16_t timestamp; /* flag, not a TCP hdr field. */ + uint16_t need_wakeup; + uint16_t mbuf_cnt; /* Count of mbufs collected see note */ + uint16_t mbuf_appended; struct timeval mtime; }; +/* + * Note: The mbuf_cnt field tracks our number of mbufs added to the m_nextpkt + * list. Each mbuf counted can have data and of course it will + * have an ack as well (by definition any inbound tcp segment will + * have an ack value). We use this count to tell us how many ACK's + * are present for our ack-count threshold. If we exceed that or + * the data threshold we will wake up the endpoint. + */ LIST_HEAD(lro_head, lro_entry); #define le_ip4 leip.ip4 #define le_ip6 leip.ip6 #define source_ip4 lesource.s_ip4 #define dest_ip4 ledest.d_ip4 #define source_ip6 lesource.s_ip6 #define dest_ip6 ledest.d_ip6 struct lro_mbuf_sort { uint64_t seq; struct mbuf *mb; }; /* NB: This is part of driver structs. 
*/ struct lro_ctrl { struct ifnet *ifp; struct lro_mbuf_sort *lro_mbuf_data; uint64_t lro_queued; uint64_t lro_flushed; uint64_t lro_bad_csum; unsigned lro_cnt; unsigned lro_mbuf_count; unsigned lro_mbuf_max; unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */ unsigned lro_length_lim; /* max len of aggregated data */ u_long lro_hashsz; struct lro_head *lro_hash; struct lro_head lro_active; struct lro_head lro_free; }; #define TCP_LRO_LENGTH_MAX 65535 #define TCP_LRO_ACKCNT_MAX 65535 /* unlimited */ int tcp_lro_init(struct lro_ctrl *); int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned); void tcp_lro_free(struct lro_ctrl *); void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *); void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); void tcp_lro_flush_all(struct lro_ctrl *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *); +void tcp_lro_reg_mbufq(void); +void tcp_lro_dereg_mbufq(void); #define TCP_LRO_NO_ENTRIES -2 #define TCP_LRO_CANNOT -1 #define TCP_LRO_NOT_SUPPORTED 1 #endif /* _TCP_LRO_H_ */ Index: head/sys/netinet/tcp_stacks/rack_bbr_common.c =================================================================== --- head/sys/netinet/tcp_stacks/rack_bbr_common.c (revision 351933) +++ head/sys/netinet/tcp_stacks/rack_bbr_common.c (revision 351934) @@ -1,859 +1,912 @@ /*- * Copyright (c) 2016-2018 * Netflix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Author: Randall Stewart * This work is based on the ACM Queue paper * BBR - Congestion Based Congestion Control * and also numerous discussions with Neal, Yuchung and Van. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" /*#include "opt_kern_tls.h"*/ #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef INET6 #include #endif #include #include #include #include #if defined(IPSEC) || defined(IPSEC_SUPPORT) #include #include #endif /* IPSEC */ #include #include #include #ifdef MAC #include #endif #include "rack_bbr_common.h" /* * Common TCP 
Functions - These are shared by both * rack and BBR. */ #ifdef KERN_TLS uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) { struct sbtls_info *tls; uint32_t len; again: tls = so->so_snd.sb_tls_info; len = tls->sb_params.sb_maxlen; /* max tls payload */ len += tls->sb_params.sb_tls_hlen; /* tls header len */ len += tls->sb_params.sb_tls_tlen; /* tls trailer len */ if ((len * 4) > rwnd) { /* * Stroke this will suck counter and what * else should we do Drew? From the * TCP perspective I am not sure * what should be done... */ if (tls->sb_params.sb_maxlen > 4096) { tls->sb_params.sb_maxlen -= 4096; if (tls->sb_params.sb_maxlen < 4096) tls->sb_params.sb_maxlen = 4096; goto again; } } return (len); } #endif + +/* + * The function ctf_process_inbound_raw() is used by + * transport developers to do the steps needed to + * support MBUF Queuing i.e. the flags in + * inp->inp_flags2: + * + * - INP_SUPPORTS_MBUFQ + * - INP_MBUF_QUEUE_READY + * - INP_DONT_SACK_QUEUE + * + * These flags help control how LRO will deliver + * packets to the transport. You first set in inp_flags2 + * the INP_SUPPORTS_MBUFQ to tell the LRO code that you + * will gladly take a queue of packets instead of a compressed + * single packet. You also set in your t_fb pointer the + * tfb_do_queued_segments to point to ctf_process_inbound_raw. + * + * This then gets you lists of inbound ACK's/Data instead + * of a condensed compressed ACK/DATA packet. Why would you + * want that? This will get you access to all the arrival + * times of at least LRO and possibly at the Hardware (if + * the interface card supports that) of the actual ACK/DATA. + * In some transport designs this is important since knowing + * the actual time we got the packet is useful information. + * + * Now there are some interesting Caveats that the transport + * designer needs to take into account when using this feature. 
+ * + * 1) It is used with HPTS and pacing, when the pacing timer + * for output calls it will first call the input. + * 2) When you set INP_MBUF_QUEUE_READY this tells LRO to + * queue normal packets, I am busy pacing out data and + * will process the queued packets before my tfb_tcp_output + * call from pacing. If a non-normal packet arrives, (e.g. sack) + * you will be awoken immediately. + * 3) Finally you can add the INP_DONT_SACK_QUEUE to not even + * be awoken if a SACK has arrived. You would do this when + * you were not only running a pacing for output timer + * but a Rack timer as well i.e. you know you are in recovery + * and are in the process (via the timers) of dealing with + * the loss. + * + * Now a critical thing you must be aware of here is that the + * use of the flags has a far greater scope than just your + * typical LRO. Why? Well, that's because in the normal compressed + * LRO case at the end of a driver interrupt all packets are going + * to get presented to the transport no matter if there is one + * or 100. With the MBUF_QUEUE model, this is not true. You will + * only be awoken to process the queue of packets when: + * a) The flags discussed above allow it. + * + * b) You exceed an ack or data limit (by default the + * ack limit is infinity (64k acks) and the data + * limit is 64k of new TCP data) + * + * c) The push bit has been set by the peer + */ + int ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt) { /* * We are passed a raw chain of mbuf packets * that arrived in LRO. They are linked via * the m_nextpkt link in the pkt-headers. * * We process each one by: * a) saving off the next * b) stripping off the ether-header * c) formulating the arguments for * the tfb_tcp_hpts_do_segment * d) calling each mbuf to tfb_tcp_hpts_do_segment * after adjusting the time to match the arrival time. * Note that the LRO code assures no IP options are present. 
* * The semantics for calling tfb_tcp_hpts_do_segment are the * following: * 1) It returns 0 if all went well and you (the caller) need * to release the lock. * 2) If nxt_pkt is set, then the function will suppress calls * to tfb_tcp_output() since you are promising to call again * with another packet. * 3) If it returns 1, then you must free all the packets being * shipped in, the tcb has been destroyed (or about to be destroyed). */ struct mbuf *m_save; struct ether_header *eh; struct epoch_tracker et; struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip = NULL; /* Keep compiler happy. */ #endif struct ifnet *ifp; struct timeval tv; int32_t retval, nxt_pkt, tlen, off; uint16_t etype; uint16_t drop_hdrlen; uint8_t iptos, no_vn=0, bpf_req=0; /* * This is a bit deceptive, we get the * "info epoch" which is really the network * epoch. This covers us on both any INP * type change but also if the ifp goes * away it covers us as well. */ INP_INFO_RLOCK_ET(&V_tcbinfo, et); if (m && m->m_pkthdr.rcvif) ifp = m->m_pkthdr.rcvif; else ifp = NULL; if (ifp) { bpf_req = bpf_peers_present(ifp->if_bpf); } else { /* * We probably should not work around * but kassert, since lro always sets rcvif. 
*/ no_vn = 1; goto skip_vnet; } CURVNET_SET(ifp->if_vnet); skip_vnet: while (m) { m_save = m->m_nextpkt; m->m_nextpkt = NULL; /* Now lets get the ether header */ eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Let the BPF see the packet */ if (bpf_req && ifp) ETHER_BPF_MTAP(ifp, m); m_adj(m, sizeof(*eh)); /* Trim off the ethernet header */ switch (etype) { #ifdef INET6 case ETHERTYPE_IPV6: { if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); if (m == NULL) { TCPSTAT_INC(tcps_rcvshort); m_freem(m); goto skipped_pkt; } } ip6 = (struct ip6_hdr *)(eh + 1); th = (struct tcphdr *)(ip6 + 1); tlen = ntohs(ip6->ip6_plen); drop_hdrlen = sizeof(*ip6); if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); m_freem(m); goto skipped_pkt; } /* * Be proactive about unspecified IPv6 address in source. * As we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. 
*/ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { /* XXX stat */ m_freem(m); goto skipped_pkt; } iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; break; } #endif #ifdef INET case ETHERTYPE_IP: { if (m->m_len < sizeof (struct tcpiphdr)) { if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == NULL) { TCPSTAT_INC(tcps_rcvshort); m_freem(m); goto skipped_pkt; } } ip = (struct ip *)(eh + 1); th = (struct tcphdr *)(ip + 1); drop_hdrlen = sizeof(*ip); iptos = ip->ip_tos; tlen = ntohs(ip->ip_len) - sizeof(struct ip); if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) th->th_sum = m->m_pkthdr.csum_data; else th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; } else { int len; struct ipovly *ipov = (struct ipovly *)ip; /* * Checksum extended TCP header and data. */ len = drop_hdrlen + tlen; bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); ipov->ih_len = htons(tlen); th->th_sum = in_cksum(m, len); /* Reset length for SDT probes. */ ip->ip_len = htons(len); /* Reset TOS bits */ ip->ip_tos = iptos; /* Re-initialization for later version check */ ip->ip_v = IPVERSION; ip->ip_hl = sizeof(*ip) >> 2; } if (th->th_sum) { TCPSTAT_INC(tcps_rcvbadsum); m_freem(m); goto skipped_pkt; } break; } #endif } /* * Convert TCP protocol specific fields to host format. */ tcp_fields_to_host(th); off = th->th_off << 2; if (off < sizeof (struct tcphdr) || off > tlen) { TCPSTAT_INC(tcps_rcvbadoff); m_freem(m); goto skipped_pkt; } tlen -= off; drop_hdrlen += off; /* * Now lets setup the timeval to be when we should * have been called (if we can). */ m->m_pkthdr.lro_nsegs = 1; - if (m->m_flags & M_TSTMP_LRO) { - tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; - tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000; - } else { - /* Should not be should we kassert instead? */ - tcp_get_usecs(&tv); - } + tcp_get_usecs(&tv); /* Now what about next packet? 
*/ if (m_save || has_pkt) nxt_pkt = 1; else nxt_pkt = 0; retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen, iptos, nxt_pkt, &tv); if (retval) { /* We lost the lock and tcb probably */ m = m_save; while (m) { m_save = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = m_save; } if (no_vn == 0) CURVNET_RESTORE(); INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (retval); } skipped_pkt: m = m_save; } if (no_vn == 0) CURVNET_RESTORE(); INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (retval); } int ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt) { struct mbuf *m; /* First lets see if we have old packets */ if (tp->t_in_pkt) { m = tp->t_in_pkt; tp->t_in_pkt = NULL; tp->t_tail_pkt = NULL; if (ctf_process_inbound_raw(tp, so, m, have_pkt)) { /* We lost the tcpcb (maybe a RST came in)? */ return (1); } } return (0); } uint32_t ctf_outstanding(struct tcpcb *tp) { return (tp->snd_max - tp->snd_una); } uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked) { if (rc_sacked <= ctf_outstanding(tp)) return (ctf_outstanding(tp) - rc_sacked); else { /* TSNH */ #ifdef INVARIANTS panic("tp:%p rc_sacked:%d > out:%d", tp, rc_sacked, ctf_outstanding(tp)); #endif return (0); } } void ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) { if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); } /* * ctf_drop_checks returns 1 for you should not proceed. It places * in ret_val what should be returned 1/0 by the caller. The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. 
*/ int ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) { int32_t todrop; int32_t thflags; int32_t tlen; thflags = *thf; tlen = *tlenp; todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } /* * DSACK - add SACK block for dropped range */ if (tp->t_flags & TF_SACK_PERMIT) { tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ tp->t_flags |= TF_ACKNOW; } *drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If segment ends after window, drop trailing data (and PUSH and * FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment and * ack. 
*/ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); return (1); } } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH | TH_FIN); } *thf = thflags; *tlenp = tlen; return (0); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ void ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) { /* * Generate an ACK dropping incoming segment if it occupies sequence * space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all paths to this * code happen after packets containing RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the segment * we received passes the SYN-RECEIVED ACK test. If it fails send a * RST. This breaks the loop in the "LAND" DoS attack, and also * prevents an ACK storm between two listening ports that have been * sent forged SYN segments, each with the source address of the * other. */ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max))) { *ret_val = 1; ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return; } else *ret_val = 0; tp->t_flags |= TF_ACKNOW; if (m) m_freem(m); } void ctf_do_drop(struct mbuf *m, struct tcpcb *tp) { /* * Drop space held by incoming segment and return. */ if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); if (m) m_freem(m); } int ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in * window, we send challenge ACK. 
* * Note: to take into account delayed ACKs, we should test against * last_ack_sent instead of rcv_nxt. Note 2: we handle special case * of closed window, not covered by the RFC. */ int dropped = 0; if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || (tp->last_ack_sent == th->th_seq) || (tp->rcv_nxt == th->th_seq) || ((tp->last_ack_sent - 1) == th->th_seq)) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: so->so_error = ECONNRESET; close: tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: tp = tcp_close(tp); } dropped = 1; ctf_do_drop(m, tp); } else { TCPSTAT_INC(tcps_badrst); /* Send challenge ACK. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; } } else { m_freem(m); } return (dropped); } /* * The value in ret_val informs the caller * if we dropped the tcb (and lock) or not. * 1 = we dropped it, 0 = the TCB is still locked * and valid. */ void ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); *ret_val = 1; ctf_do_drop(m, tp); } else { /* Send challenge ACK. 
*/ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; *ret_val = 0; ctf_do_drop(m, NULL); } } /* * bbr_ts_check returns 1 for you should not proceed, the state * machine should return. It places in ret_val what should * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the * TCB is still valid and locked. */ int ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) { if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates ts_recent, * the age will be reset later and ts_recent will get a * valid value. If it does not, setting ts_recent to zero * will at least satisfy the requirement that zero be placed * in the timestamp echo reply when ts_recent isn't valid. * The age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be dropped * when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); *ret_val = 0; if (tlen) { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); } else { ctf_do_drop(m, NULL); } return (1); } return (0); } void ctf_calc_rwin(struct socket *so, struct tcpcb *tp) { int32_t win; /* * Calculate amount of space in receive window, and then do TCP * input processing. Receive window is amount of space in rcv queue, * but not less than advertised window. 
*/ win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); } void ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) { if (tp->t_inpcb) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); } tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } uint32_t ctf_fixed_maxseg(struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We only consider fixed options that we would send every * time I.e. SACK is not considered. * */ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } void ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex8 = num_sack_blks; if (num_sack_blks > 0) { log.u_bbr.flex1 = sack_blocks[0].start; log.u_bbr.flex2 = sack_blocks[0].end; } if (num_sack_blks > 1) { log.u_bbr.flex3 = sack_blocks[1].start; log.u_bbr.flex4 = sack_blocks[1].end; } if (num_sack_blks > 2) { log.u_bbr.flex5 
= sack_blocks[2].start; log.u_bbr.flex6 = sack_blocks[2].end; } if (num_sack_blks > 3) { log.u_bbr.applimited = sack_blocks[3].start; log.u_bbr.pkts_out = sack_blocks[3].end; } TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, TCP_SACK_FILTER_RES, 0, 0, &log, false, &tv); } } uint32_t ctf_decay_count(uint32_t count, uint32_t decay) { /* * Given a count, decay it by a set percentage. The * percentage is in thousands i.e. 100% = 1000, * 19.3% = 193. */ uint64_t perc_count, decay_per; uint32_t decayed_count; if (decay > 1000) { /* We don't raise it */ return (count); } perc_count = count; decay_per = decay; perc_count *= decay_per; perc_count /= 1000; /* * So now perc_count holds the * count decay value. */ decayed_count = count - (uint32_t)perc_count; return (decayed_count); } Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 351933) +++ head/sys/sys/mbuf.h (revision 351934) @@ -1,1525 +1,1526 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! */ #include #ifdef _KERNEL #include #include #include #ifdef WITNESS #include #endif #endif #ifdef _KERNEL #include #define MBUF_PROBE1(probe, arg0) \ SDT_PROBE1(sdt, , , probe, arg0) #define MBUF_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(sdt, , , probe, arg0, arg1) #define MBUF_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2) #define MBUF_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3) #define MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4) SDT_PROBE_DECLARE(sdt, , , m__init); SDT_PROBE_DECLARE(sdt, , , m__gethdr); SDT_PROBE_DECLARE(sdt, , , m__get); SDT_PROBE_DECLARE(sdt, , , m__getcl); SDT_PROBE_DECLARE(sdt, , , m__clget); SDT_PROBE_DECLARE(sdt, , , m__cljget); SDT_PROBE_DECLARE(sdt, , , m__cljset); SDT_PROBE_DECLARE(sdt, , , m__free); SDT_PROBE_DECLARE(sdt, , , m__freem); #endif /* _KERNEL */ /* * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead. 
* An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in * sys/param.h), which has no additional overhead and is used instead of the * internal data area; this is done when at least MINCLSIZE of data must be * stored. Additionally, it is possible to allocate a separate buffer * externally and attach it to the mbuf in a way similar to that of mbuf * clusters. * * NB: These calculation do not take actual compiler-induced alignment and * padding inside the complete struct mbuf into account. Appropriate * attention is required when changing members of struct mbuf. * * MLEN is data length in a normal mbuf. * MHLEN is data length in an mbuf with pktheader. * MINCLSIZE is a smallest amount of data that should be put into cluster. * * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are sensible. */ struct mbuf; #define MHSIZE offsetof(struct mbuf, m_dat) #define MPKTHSIZE offsetof(struct mbuf, m_pktdat) #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) #define M_NODOM 255 #ifdef _KERNEL /*- * Macro for type conversion: convert mbuf pointer to data pointer of correct * type: * * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * mtodo(m, o) -- Same as above but with offset 'o' into data. */ #define mtod(m, t) ((t)((m)->m_data)) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. */ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Static network interface owned tag. 
* Allocated through ifp->if_snd_tag_alloc(). */ struct m_snd_tag { struct ifnet *ifp; /* network interface tag belongs to */ volatile u_int refcount; }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. * Size ILP32: 48 * LP64: 56 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct pkthdr { union { struct m_snd_tag *snd_tag; /* send tag, if any */ struct ifnet *rcvif; /* rcv interface */ }; SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ /* Layer crossing persistent information. */ uint32_t flowid; /* packet's 4-tuple system */ uint32_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ uint8_t numa_domain; /* NUMA domain of recvd pkt */ uint8_t rsstype; /* hash type */ union { uint64_t rcv_tstmp; /* timestamp in ns */ struct { uint8_t l2hlen; /* layer 2 hdr len */ uint8_t l3hlen; /* layer 3 hdr len */ uint8_t l4hlen; /* layer 4 hdr len */ uint8_t l5hlen; /* layer 5 hdr len */ uint32_t spare; }; }; union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_per; /* Layer specific non-persistent local storage for reassembly, etc. 
*/ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_loc; }; #define ether_vtag PH_per.sixteen[0] #define PH_vt PH_per #define vt_nrecs sixteen[0] #define tso_segsz PH_per.sixteen[1] #define lro_nsegs tso_segsz #define csum_phsum PH_per.sixteen[2] #define csum_data PH_per.thirtytwo[1] +#define lro_len PH_per.sixteen[0] /* inbound during LRO */ +#define lro_csum PH_per.sixteen[1] /* inbound during LRO */ #define pace_thoff PH_loc.sixteen[0] #define pace_tlen PH_loc.sixteen[1] #define pace_drphdrlen PH_loc.sixteen[2] #define pace_tos PH_loc.eight[6] #define pace_lock PH_loc.eight[7] /* * Description of external storage mapped into mbuf; valid only if M_EXT is * set. * Size ILP32: 28 * LP64: 48 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ typedef void m_ext_free_t(struct mbuf *); struct m_ext { union { /* * If EXT_FLAG_EMBREF is set, then we use refcount in the * mbuf, the 'ext_count' member. Otherwise, we have a * shadow copy and we use pointer 'ext_cnt'. The original * mbuf is responsible to carry the pointer to free routine * and its arguments. They aren't copied into shadows in * mb_dupcl() to avoid dereferencing next cachelines. */ volatile u_int ext_count; volatile u_int *ext_cnt; }; union { /* * If ext_type == EXT_PGS, 'ext_pgs' points to a * structure describing the buffer. Otherwise, * 'ext_buf' points to the start of the buffer. */ struct mbuf_ext_pgs *ext_pgs; char *ext_buf; }; uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ /* * Fields below store the free context for the external storage. * They are valid only in the refcount carrying mbuf, the one with * EXT_FLAG_EMBREF flag, with exclusion for EXT_EXTREF type, where * the free context is copied into all mbufs that use same external * storage. 
*/ #define m_ext_copylen offsetof(struct m_ext, ext_free) m_ext_free_t *ext_free; /* free routine if not the usual */ void *ext_arg1; /* optional argument pointer */ void *ext_arg2; /* optional argument pointer */ }; /* * The core of the mbuf object along with some shortcut defines for practical * purposes. */ struct mbuf { /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 * Compile-time assertions in uipc_mbuf.c test these values to ensure * that they are correct. */ union { /* next buffer in chain */ struct mbuf *m_next; SLIST_ENTRY(mbuf) m_slist; STAILQ_ENTRY(mbuf) m_stailq; }; union { /* next chain in queue/record */ struct mbuf *m_nextpkt; SLIST_ENTRY(mbuf) m_slistpkt; STAILQ_ENTRY(mbuf) m_stailqpkt; }; caddr_t m_data; /* location of data */ int32_t m_len; /* amount of data in this mbuf */ uint32_t m_type:8, /* type of data in this mbuf */ m_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t m_pad; /* pad for 64bit alignment */ #endif /* * A set of optional headers (packet header, external storage header) * and internal data storage. Historically, these arrays were sized * to MHLEN (space left after a packet header) and MLEN (space left * after only a regular mbuf header); they are now variable size in * order to support future work on variable-size mbufs. 
*/ union { struct { struct pkthdr m_pkthdr; /* M_PKTHDR set */ union { struct m_ext m_ext; /* M_EXT set */ char m_pktdat[0]; }; }; char m_dat[0]; /* !M_PKTHDR, !M_EXT */ }; }; struct ktls_session; struct socket; /* * TLS records for TLS 1.0-1.2 can have the following header lengths: * - 5 (AES-CBC with implicit IV) * - 21 (AES-CBC with explicit IV) * - 13 (AES-GCM with 8 byte explicit IV) */ #define MBUF_PEXT_HDR_LEN 24 /* * TLS records for TLS 1.0-1.2 can have the following maximum trailer * lengths: * - 16 (AES-GCM) * - 36 (AES-CBC with SHA1 and up to 16 bytes of padding) * - 48 (AES-CBC with SHA2-256 and up to 16 bytes of padding) * - 64 (AES-CBC with SHA2-384 and up to 16 bytes of padding) */ #define MBUF_PEXT_TRAIL_LEN 64 #ifdef __LP64__ #define MBUF_PEXT_MAX_PGS (152 / sizeof(vm_paddr_t)) #else #define MBUF_PEXT_MAX_PGS (156 / sizeof(vm_paddr_t)) #endif #define MBUF_PEXT_MAX_BYTES \ (MBUF_PEXT_MAX_PGS * PAGE_SIZE + MBUF_PEXT_HDR_LEN + MBUF_PEXT_TRAIL_LEN) /* * This struct is 256 bytes in size and is arranged so that the most * common case (accessing the first 4 pages of a 16KB TLS record) will * fit in a single 64 byte cacheline. */ struct mbuf_ext_pgs { uint8_t npgs; /* Number of attached pages */ uint8_t nrdy; /* Pages with I/O pending */ uint8_t hdr_len; /* TLS header length */ uint8_t trail_len; /* TLS trailer length */ uint16_t first_pg_off; /* Offset into 1st page */ uint16_t last_pg_len; /* Length of last page */ vm_paddr_t pa[MBUF_PEXT_MAX_PGS]; /* phys addrs of pages */ char hdr[MBUF_PEXT_HDR_LEN]; /* TLS header */ struct ktls_session *tls; /* TLS session */ #if defined(__i386__) || \ (defined(__powerpc__) && !defined(__powerpc64__) && defined(BOOKE)) /* * i386 and Book-E PowerPC have 64-bit vm_paddr_t, so there is * a 4 byte remainder from the space allocated for pa[]. 
*/ uint32_t pad; #endif union { char trail[MBUF_PEXT_TRAIL_LEN]; /* TLS trailer */ struct { struct socket *so; struct mbuf *mbuf; uint64_t seqno; STAILQ_ENTRY(mbuf_ext_pgs) stailq; int enc_cnt; }; }; }; #ifdef _KERNEL static inline int mbuf_ext_pg_len(struct mbuf_ext_pgs *ext_pgs, int pidx, int pgoff) { KASSERT(pgoff == 0 || pidx == 0, ("page %d with non-zero offset %d in %p", pidx, pgoff, ext_pgs)); if (pidx == ext_pgs->npgs - 1) { return (ext_pgs->last_pg_len); } else { return (PAGE_SIZE - pgoff); } } #ifdef INVARIANT_SUPPORT void mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs); #endif #ifdef INVARIANTS #define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) mb_ext_pgs_check((ext_pgs)) #else #define MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs) #endif #endif /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped - * to M_PROTO[1-12] and cleared at layer handoff boundaries. + * to M_PROTO[1-11] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. 
*/ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_RDONLY 0x00000008 /* associated data is marked read-only */ #define M_BCAST 0x00000010 /* send/received as link-level broadcast */ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ #define M_NOMAP 0x00000100 /* mbuf data is unmapped */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ #define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */ -#define M_PROTO1 0x00001000 /* protocol-specific */ -#define M_PROTO2 0x00002000 /* protocol-specific */ -#define M_PROTO3 0x00004000 /* protocol-specific */ -#define M_PROTO4 0x00008000 /* protocol-specific */ -#define M_PROTO5 0x00010000 /* protocol-specific */ -#define M_PROTO6 0x00020000 /* protocol-specific */ -#define M_PROTO7 0x00040000 /* protocol-specific */ -#define M_PROTO8 0x00080000 /* protocol-specific */ -#define M_PROTO9 0x00100000 /* protocol-specific */ -#define M_PROTO10 0x00200000 /* protocol-specific */ -#define M_PROTO11 0x00400000 /* protocol-specific */ -#define M_PROTO12 0x00800000 /* protocol-specific */ +#define M_PROTO1 0x00002000 /* protocol-specific */ +#define M_PROTO2 0x00004000 /* protocol-specific */ +#define M_PROTO3 0x00008000 /* protocol-specific */ +#define M_PROTO4 0x00010000 /* protocol-specific */ +#define M_PROTO5 0x00020000 /* protocol-specific */ +#define M_PROTO6 0x00040000 /* protocol-specific */ +#define M_PROTO7 0x00080000 /* protocol-specific */ +#define M_PROTO8 0x00100000 /* protocol-specific */ +#define M_PROTO9 0x00200000 /* protocol-specific */ 
+#define M_PROTO10 0x00400000 /* protocol-specific */ +#define M_PROTO11 0x00800000 /* protocol-specific */ #define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */ /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ - M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12) + M_PROTO9|M_PROTO10|M_PROTO11) /* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \ M_TSTMP_HPREC|M_PROTOFLAGS) /* * Mbuf flag description for use with printf(9) %b identifier. */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ "\7M_PROMISC\10M_VLANTAG\11M_NOMAP\12M_NOFREE\13M_TSTMP\14M_TSTMP_HPREC" #define M_FLAG_PROTOBITS \ "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ - "\27M_PROTO11\30M_PROTO12" + "\27M_PROTO11" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* * Network interface cards are able to hash protocol fields (such as IPv4 * addresses and TCP port numbers) classify packets into flows. These flows * can then be used to maintain ordering while delivering packets to the OS * via parallel input queues, as well as to provide a stateless affinity * model. NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set * m_flag fields to indicate how the hash should be interpreted by the * network stack. * * Most NICs support RSS, which provides ordering and explicit affinity, and * use the hash m_flag bits to indicate what header fields were covered by * the hash. M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non- * RSS cards or configurations that provide an opaque flow identifier, allowing * for ordering and distribution without explicit affinity. Additionally, * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash * properties. 
* * The meaning of the IPV6_EX suffix: * "o Home address from the home address option in the IPv6 destination * options header. If the extension header is not present, use the Source * IPv6 Address. * o IPv6 address that is contained in the Routing-Header-Type-2 from the * associated extension header. If the extension header is not present, * use the Destination IPv6 Address." * Quoted from: * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex */ #define M_HASHTYPE_HASHPROP 0x80 /* has hash properties */ #define M_HASHTYPE_HASH(t) (M_HASHTYPE_HASHPROP | (t)) /* Microsoft RSS standard hash types */ #define M_HASHTYPE_NONE 0 #define M_HASHTYPE_RSS_IPV4 M_HASHTYPE_HASH(1) /* IPv4 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV4 M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */ #define M_HASHTYPE_RSS_IPV6 M_HASHTYPE_HASH(3) /* IPv6 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV6 M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */ #define M_HASHTYPE_RSS_IPV6_EX M_HASHTYPE_HASH(5) /* IPv6 2-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_TCP_IPV6_EX M_HASHTYPE_HASH(6) /* TCPv6 4-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_UDP_IPV4 M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6 M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6_EX M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple + * ext hdrs */ #define M_HASHTYPE_OPAQUE 63 /* ordering, not affinity */ #define M_HASHTYPE_OPAQUE_HASH M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE) /* ordering+hash, not affinity*/ #define M_HASHTYPE_CLEAR(m) ((m)->m_pkthdr.rsstype = 0) #define M_HASHTYPE_GET(m) ((m)->m_pkthdr.rsstype) #define M_HASHTYPE_SET(m, v) ((m)->m_pkthdr.rsstype = (v)) #define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) #define M_HASHTYPE_ISHASH(m) (M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP) /* * External mbuf storage buffer types. 
*/ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_buf */ #define EXT_JUMBOP 3 /* jumbo cluster page sized */ #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */ #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference */ #define EXT_RXRING 8 /* data in NIC receive ring */ #define EXT_PGS 9 /* array of unmapped pages */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ #define EXT_VENDOR3 226 /* for vendor-internal use */ #define EXT_VENDOR4 227 /* for vendor-internal use */ #define EXT_EXP1 244 /* for experimental use */ #define EXT_EXP2 245 /* for experimental use */ #define EXT_EXP3 246 /* for experimental use */ #define EXT_EXP4 247 /* for experimental use */ #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */ #define EXT_EXTREF 255 /* has externally maintained ext_cnt ptr */ /* * Flags for external mbuf buffer types. * NB: limited to the lower 24 bits. */ #define EXT_FLAG_EMBREF 0x000001 /* embedded ext_count */ #define EXT_FLAG_EXTREF 0x000002 /* external ext_cnt, notyet */ #define EXT_FLAG_NOFREE 0x000010 /* don't free mbuf to pool, notyet */ #define EXT_FLAG_VENDOR1 0x010000 /* These flags are vendor */ #define EXT_FLAG_VENDOR2 0x020000 /* or submodule specific, */ #define EXT_FLAG_VENDOR3 0x040000 /* not used by mbuf code. */ #define EXT_FLAG_VENDOR4 0x080000 /* Set/read by submodule. */ #define EXT_FLAG_EXP1 0x100000 /* for experimental use */ #define EXT_FLAG_EXP2 0x200000 /* for experimental use */ #define EXT_FLAG_EXP3 0x400000 /* for experimental use */ #define EXT_FLAG_EXP4 0x800000 /* for experimental use */ /* * EXT flag description for use with printf(9) %b identifier. 
*/ #define EXT_FLAG_BITS \ "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \ "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" #define MBUF_EXT_PGS_ASSERT(m) \ KASSERT((((m)->m_flags & M_EXT) != 0) && \ ((m)->m_ext.ext_type == EXT_PGS), \ ("%s: m %p !M_EXT or !EXT_PGS", __func__, m)) /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into * separate inbound and outbound flags. * * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested * against ifnet if_hwassist. */ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ #define CSUM_IP_TCP 0x00000004 /* TCP checksum offload */ #define CSUM_IP_SCTP 0x00000008 /* SCTP checksum offload */ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ /* Inbound checksum support where the checksum was verified by hardware. 
*/ #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ #define CSUM_L4_VALID 0x08000000 /* checksum is correct */ #define CSUM_L5_CALC 0x10000000 /* calculated layer 5 csum */ #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESCED 0x40000000 /* contains merged segments */ #define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */ /* * CSUM flag description for use with printf(9) %b identifier. */ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ "\6CSUM_IP_ISCSI" \ "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ "\16CSUM_IP6_ISCSI" \ "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG" /* CSUM flags compatibility mappings. */ #define CSUM_IP_CHECKED CSUM_L3_CALC #define CSUM_IP_VALID CSUM_L3_VALID #define CSUM_DATA_VALID CSUM_L4_VALID #define CSUM_PSEUDO_HDR CSUM_L4_CALC #define CSUM_SCTP_VALID CSUM_L4_VALID #define CSUM_DELAY_DATA (CSUM_TCP|CSUM_UDP) #define CSUM_DELAY_IP CSUM_IP /* Only v4, no v6 IP hdr csum */ #define CSUM_DELAY_DATA_IPV6 (CSUM_TCP_IPV6|CSUM_UDP_IPV6) #define CSUM_DATA_VALID_IPV6 CSUM_DATA_VALID #define CSUM_TCP CSUM_IP_TCP #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP /* * mbuf types describing the content of the mbuf (including external storage). */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! 
Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_VENDOR1 4 /* for vendor-internal use */ #define MT_VENDOR2 5 /* for vendor-internal use */ #define MT_VENDOR3 6 /* for vendor-internal use */ #define MT_VENDOR4 7 /* for vendor-internal use */ #define MT_SONAME 8 /* socket name */ #define MT_EXP1 9 /* for experimental use */ #define MT_EXP2 10 /* for experimental use */ #define MT_EXP3 11 /* for experimental use */ #define MT_EXP4 12 /* for experimental use */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_EXTCONTROL 15 /* control message with externalized contents */ #define MT_OOBDATA 16 /* expedited data */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). */ #define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_page" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #define MBUF_EXTPGS_MEM_NAME "mbuf_extpgs" #ifdef _KERNEL #ifdef WITNESS #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) #else #define MBUF_CHECKSLEEP(how) #endif /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; extern uma_zone_t zone_extpgs; void mb_dupcl(struct mbuf *, struct mbuf *); void mb_free_ext(struct mbuf *); void 
mb_free_mext_pgs(struct mbuf *); struct mbuf *mb_alloc_ext_pgs(int, bool, m_ext_free_t); int mb_unmapped_compress(struct mbuf *m); struct mbuf *mb_unmapped_to_ext(struct mbuf *m); void mb_free_notready(struct mbuf *m, int count); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_catpkt(struct mbuf *, struct mbuf *); int m_clget(struct mbuf *m, int how); void *m_cljget(struct mbuf *m, int how, int size); struct mbuf *m_collapse(struct mbuf *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *, int, int); struct mbuf *m_defrag(struct mbuf *, int); void m_demote_pkthdr(struct mbuf *); void m_demote(struct mbuf *, int, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); void m_dispose_extcontrolm(struct mbuf *m); struct mbuf *m_dup(const struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, const struct mbuf *, int); void m_extadd(struct mbuf *, char *, u_int, m_ext_free_t, void *, void *, int, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); struct mbuf *m_get2(int, int, short, int); struct mbuf *m_getjcl(int, short, int, int); struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, const struct mbuf *, int); int m_unmappedtouio(const struct mbuf *, int, struct uio *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); int m_pkthdr_init(struct mbuf *, int); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const 
struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); struct mbuf *m_unshare(struct mbuf *, int); void m_snd_tag_init(struct m_snd_tag *, struct ifnet *); void m_snd_tag_destroy(struct m_snd_tag *); static __inline int m_gettype(int size) { int type; switch (size) { case MSIZE: type = EXT_MBUF; break; case MCLBYTES: type = EXT_CLUSTER; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; break; case MJUM16BYTES: type = EXT_JUMBO16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (type); } /* * Associated an external reference counted buffer with an mbuf. */ static __inline void m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt, m_ext_free_t freef, void *arg1, void *arg2) { KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__)); atomic_add_int(ref_cnt, 1); m->m_flags |= M_EXT; m->m_ext.ext_buf = buf; m->m_ext.ext_cnt = ref_cnt; m->m_data = m->m_ext.ext_buf; m->m_ext.ext_size = size; m->m_ext.ext_free = freef; m->m_ext.ext_arg1 = arg1; m->m_ext.ext_arg2 = arg2; m->m_ext.ext_type = EXT_EXTREF; m->m_ext.ext_flags = 0; } static __inline uma_zone_t m_getzone(int size) { uma_zone_t zone; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (zone); } /* * Initialize an mbuf with linear storage. * * Inline because the consumer text overhead will be roughly the same to * initialize or call a function with this many parameters and M_PKTHDR * should go away with constant propagation for !MGETHDR. 
*/ static __inline int m_init(struct mbuf *m, int how, short type, int flags) { int error; m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_dat; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) error = m_pkthdr_init(m, how); else error = 0; MBUF_PROBE5(m__init, m, how, type, flags, error); return (error); } static __inline struct mbuf * m_get(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__get, how, type, m); return (m); } static __inline struct mbuf * m_gethdr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = M_PKTHDR; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__gethdr, how, type, m); return (m); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *m; struct mb_args args; args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_pack, &args, how); MBUF_PROBE4(m__getcl, how, type, flags, m); return (m); } /* * XXX: m_cljset() is a dangerous API. One must attach only a new, * unreferenced cluster to an mbuf(9). It is not possible to assert * that, so care can be taken only by users of the API. 
*/ static __inline void m_cljset(struct mbuf *m, void *cl, int type) { int size; switch (type) { case EXT_CLUSTER: size = MCLBYTES; break; #if MJUMPAGESIZE != MCLBYTES case EXT_JUMBOP: size = MJUMPAGESIZE; break; #endif case EXT_JUMBO9: size = MJUM9BYTES; break; case EXT_JUMBO16: size = MJUM16BYTES; break; default: panic("%s: unknown cluster type %d", __func__, type); break; } m->m_data = m->m_ext.ext_buf = cl; m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; m->m_flags |= M_EXT; MBUF_PROBE3(m__cljset, m, cl, type); } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } static __inline void m_clrprotoflags(struct mbuf *m) { while (m) { m->m_flags &= ~M_PROTOFLAGS; m = m->m_next; } } static __inline struct mbuf * m_last(struct mbuf *m) { while (m->m_next) m = m->m_next; return (m); } static inline u_int m_extrefcnt(struct mbuf *m) { KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__)); return ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) ? m->m_ext.ext_count : *m->m_ext.ext_cnt); } /* * mbuf, cluster, and external object allocation macros (for compatibility * purposes). */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, arg1, arg2, flags, type) \ m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2), \ (flags), (type)) #define m_getm(m, len, how, type) \ m_getm2((m), (len), (how), (type), M_PKTHDR) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). 
*/ #define M_WRITABLE(m) (((m)->m_flags & (M_RDONLY | M_NOMAP)) == 0 && \ (!(((m)->m_flags & M_EXT)) || \ (m_extrefcnt(m) == 1))) /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR, \ ("%s: no mbuf packet header!", __func__)) /* * Ensure that the supplied mbuf is a valid, non-free mbuf. * * XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf!", __func__)) /* * Return the address of the start of the buffer associated with an mbuf, * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ (((m)->m_flags & M_NOMAP) ? NULL : \ ((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) /* * Return the size of the buffer associated with an mbuf, handling external * storage, packet-header mbufs, and regular data mbufs. */ #define M_SIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? MHLEN : \ MLEN) /* * Set the m_data pointer of a newly allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. * * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as * separate macros, each asserting that it was called at the proper moment. * This required callers to themselves test the storage type and call the * right one. Rather than require callers to be aware of those layout * decisions, we centralize here. 
*/
static __inline void
m_align(struct mbuf *m, int len)
{
#ifdef INVARIANTS
	/* Only referenced by the KASSERT below. */
	const char *msg = "%s: not a virgin mbuf";
#endif
	int adjust;

	/* m_data must still point at the very start of the buffer. */
	KASSERT(m->m_data == M_START(m), (msg, __func__));

	/*
	 * Space left over once len bytes are reserved at the end; the mask
	 * clears the low bits so the advance is a multiple of sizeof(long).
	 * NOTE(review): there is no check that len <= M_SIZE(m) -- confirm
	 * that callers guarantee it.
	 */
	adjust = M_SIZE(m) - len;
	m->m_data += adjust &~ (sizeof(long)-1);
}

/* The three historical names all map onto m_align(). */
#define	M_ALIGN(m, len)		m_align(m, len)
#define	MH_ALIGN(m, len)	m_align(m, len)
#define	MEXT_ALIGN(m, len)	m_align(m, len)

/*
 * Compute the amount of space available before the current start of data in
 * an mbuf.
 *
 * The M_WRITABLE() is a temporary, conservative safety measure: the burden
 * of checking writability of the mbuf data area rests solely with the caller.
 *
 * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE()
 * for mbufs with external storage.  We now allow mbuf-embedded data to be
 * read-only as well.
 */
#define	M_LEADINGSPACE(m)						\
	(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)

/*
 * Compute the amount of space available after the end of data in an mbuf.
 *
 * The M_WRITABLE() is a temporary, conservative safety measure: the burden
 * of checking writability of the mbuf data area rests solely with the caller.
 *
 * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE()
 * for mbufs with external storage.  We now allow mbuf-embedded data to be
 * read-only as well.
 */
#define	M_TRAILINGSPACE(m)						\
	(M_WRITABLE(m) ?						\
	    ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)

/*
 * Arrange to prepend space of size plen to mbuf m.  If a new mbuf must be
 * allocated, how specifies whether to wait.  If the allocation fails, the
 * original mbuf chain is freed and m is set to NULL.
*/ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. This is a relatively expensive operation and * should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) /* Return the rcvif of a packet header. */ static __inline struct ifnet * m_rcvif(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) return (NULL); return (m->m_pkthdr.rcvif); } /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ extern bool mb_use_ext_pgs; /* Use ext_pgs for sendfile */ /*- * Network packets may have annotations attached by affixing a list of * "packet tags" to the pkthdr structure. Packet tags are dynamically * allocated semi-opaque data structures that have a fixed header * (struct m_tag) that specifies the size of the memory block and a * pair that identifies it. The cookie is a 32-bit unique * unsigned value used to identify a module or ABI. By convention this value * is chosen as the date+time that the module is created, expressed as the * number of seconds since the epoch (e.g., using date -u +'%s'). The type * value is an ABI/module-specific value that identifies a particular * annotation and is private to the module. 
For compatibility with systems
 * like OpenBSD that define packet tags w/o an ABI/module cookie, the value
 * MTAG_ABI_COMPAT is used to implement m_tag_get and m_tag_find
 * compatibility shim functions and several tag types are defined below.
 * Users that do not require compatibility should use a private cookie value
 * so that packet tag-related definitions can be maintained privately.
 *
 * Note that the packet tag returned by m_tag_alloc has the default memory
 * alignment implemented by malloc.  To reference private data one can use a
 * construct like:
 *
 *	struct m_tag *mtag = m_tag_alloc(...);
 *	struct foo *p = (struct foo *)(mtag+1);
 *
 * if the alignment of struct m_tag is sufficient for referencing members of
 * struct foo.  Otherwise it is necessary to embed struct m_tag within the
 * private data structure to ensure proper alignment; e.g.,
 *
 *	struct foo {
 *		struct m_tag	tag;
 *		...
 *	};
 *	struct foo *p = (struct foo *) m_tag_alloc(...);
 *	struct m_tag *mtag = &p->tag;
 */

/*
 * Persistent tags stay with an mbuf until the mbuf is reclaimed.  Otherwise
 * tags are expected to ``vanish'' when they pass through a network
 * interface.  For most interfaces this happens normally as the tags are
 * reclaimed when the mbuf is free'd.  However in some special cases
 * reclaiming must be done manually.  An example is packets that pass through
 * the loopback interface.  Also, one must be careful to do this when
 * ``turning around'' packets (e.g., icmp_reflect).
 *
 * To mark a tag persistent bit-or this flag in when defining the tag id.
 * The tag will then be treated as described above.
 */
#define	MTAG_PERSISTENT				0x800

#define	PACKET_TAG_NONE				0  /* No tag */

/* Packet tags for use with MTAG_ABI_COMPAT.
*/
/*
 * Tag ids.  The numbering is not contiguous: 16, 20, 22-24 and 26 are not
 * assigned in this list.  Ids or'ed with MTAG_PERSISTENT carry that flag
 * as part of their value.
 */
#define	PACKET_TAG_IPSEC_IN_DONE		1  /* IPsec applied, in */
#define	PACKET_TAG_IPSEC_OUT_DONE		2  /* IPsec applied, out */
#define	PACKET_TAG_IPSEC_IN_CRYPTO_DONE		3  /* NIC IPsec crypto done */
#define	PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED	4  /* NIC IPsec crypto req'ed */
#define	PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO	5  /* NIC notifies IPsec */
#define	PACKET_TAG_IPSEC_PENDING_TDB		6  /* Reminder to do IPsec */
#define	PACKET_TAG_BRIDGE			7  /* Bridge processing done */
#define	PACKET_TAG_GIF				8  /* GIF processing done */
#define	PACKET_TAG_GRE				9  /* GRE processing done */
#define	PACKET_TAG_IN_PACKET_CHECKSUM		10 /* NIC checksumming done */
#define	PACKET_TAG_ENCAP			11 /* Encap. processing */
#define	PACKET_TAG_IPSEC_SOCKET			12 /* IPSEC socket ref */
#define	PACKET_TAG_IPSEC_HISTORY		13 /* IPSEC history */
#define	PACKET_TAG_IPV6_INPUT			14 /* IPV6 input processing */
#define	PACKET_TAG_DUMMYNET			15 /* dummynet info */
#define	PACKET_TAG_DIVERT			17 /* divert info */
#define	PACKET_TAG_IPFORWARD			18 /* ipforward info */
#define	PACKET_TAG_MACLABEL	(19 | MTAG_PERSISTENT) /* MAC label */
#define	PACKET_TAG_PF		(21 | MTAG_PERSISTENT) /* PF/ALTQ information */
#define	PACKET_TAG_RTSOCKFAM			25 /* rtsock sa family */
#define	PACKET_TAG_IPOPTIONS			27 /* Saved IP options */
#define	PACKET_TAG_CARP				28 /* CARP info */
#define	PACKET_TAG_IPSEC_NAT_T_PORTS		29 /* two uint16_t */
#define	PACKET_TAG_ND_OUTGOING			30 /* ND outgoing */

/* Specific cookies and tags. */

/* Packet tag routines. */
struct m_tag	*m_tag_alloc(u_int32_t, int, int, int);
void		 m_tag_delete(struct mbuf *, struct m_tag *);
void		 m_tag_delete_chain(struct mbuf *, struct m_tag *);
void		 m_tag_free_default(struct m_tag *);
struct m_tag	*m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *);
struct m_tag	*m_tag_copy(struct m_tag *, int);
int		 m_tag_copy_chain(struct mbuf *, const struct mbuf *, int);
void		 m_tag_delete_nonpersistent(struct mbuf *);

/*
 * Initialize the list of tags associated with an mbuf.
*/ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not fill in the free * method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was already taken. */ static __inline void m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m __unused, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. */ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? 
(struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start));
}

/* Take a reference on a send tag and hand the same pointer back. */
static inline struct m_snd_tag *
m_snd_tag_ref(struct m_snd_tag *mst)
{

	refcount_acquire(&mst->refcount);
	return (mst);
}

/* Drop a reference; destroy the tag when refcount_release() returns true. */
static inline void
m_snd_tag_rele(struct m_snd_tag *mst)
{

	if (refcount_release(&mst->refcount))
		m_snd_tag_destroy(mst);
}

/*
 * Free a single mbuf and return its successor in the chain (m->m_next,
 * read before anything is released).
 */
static __inline struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	MBUF_PROBE1(m__free, m);
	/*
	 * Tags are deleted here only for packet-header mbufs marked
	 * M_NOFREE.  NOTE(review): presumably other mbufs have their tags
	 * released on the zone free path -- confirm.
	 */
	if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE))
		m_tag_delete_chain(m, NULL);
	/* Release the send tag reference carried in the packet header. */
	if (m->m_flags & M_PKTHDR && m->m_pkthdr.csum_flags & CSUM_SND_TAG)
		m_snd_tag_rele(m->m_pkthdr.snd_tag);
	/* M_EXT mbufs go through mb_free_ext(); M_NOFREE ones are not freed. */
	if (m->m_flags & M_EXT)
		mb_free_ext(m);
	else if ((m->m_flags & M_NOFREE) == 0)
		uma_zfree(zone_mbuf, m);
	return (n);
}

/* Read the FIB number stored in a packet-header mbuf. */
static __inline int
rt_m_getfib(struct mbuf *m)
{
	KASSERT(m->m_flags & M_PKTHDR , ("Attempt to get FIB from non header mbuf."));
	return (m->m_pkthdr.fibnum);
}

#define M_GETFIB(_m)   rt_m_getfib(_m)

/* Store a FIB number in a packet-header mbuf. */
#define M_SETFIB(_m, _fib) do {						\
        KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf."));	\
	((_m)->m_pkthdr.fibnum) = (_fib);				\
} while (0)

/* flags passed as first argument for "m_ether_tcpip_hash()" */
#define	MBUF_HASHFLAG_L2	(1 << 2)
#define	MBUF_HASHFLAG_L3	(1 << 3)
#define	MBUF_HASHFLAG_L4	(1 << 4)

/* mbuf hashing helper routines */
uint32_t	m_ether_tcpip_hash_init(void);
uint32_t	m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t);

/* M_PROFILE() expands to nothing unless MBUF_PROFILING is defined. */
#ifdef MBUF_PROFILING
 void m_profile(struct mbuf *m);
 #define M_PROFILE(m) m_profile(m)
#else
 #define M_PROFILE(m)
#endif

/*
 * A packet queue: a tail queue of mbufs plus a current and a maximum
 * length.
 */
struct mbufq {
	STAILQ_HEAD(, mbuf)	mq_head;	/* queued packets */
	int			mq_len;		/* packets currently queued */
	int			mq_maxlen;	/* limit checked by mbufq_full() */
};

/* Set up an empty queue with the given limit. */
static inline void
mbufq_init(struct mbufq *mq, int maxlen)
{

	STAILQ_INIT(&mq->mq_head);
	mq->mq_maxlen = maxlen;
	mq->mq_len = 0;
}

/*
 * Empty the queue and return what was its first packet; nothing is freed
 * here, so the caller receives the whole former list.
 */
static inline struct mbuf *
mbufq_flush(struct mbufq *mq)
{
	struct mbuf *m;

	m = STAILQ_FIRST(&mq->mq_head);
	STAILQ_INIT(&mq->mq_head);
	mq->mq_len = 0;
	return (m);
}

static inline void
mbufq_drain(struct mbufq *mq)
{
	struct mbuf *m, *n;

	n =
mbufq_flush(mq); while ((m = n) != NULL) { n = STAILQ_NEXT(m, m_stailqpkt); m_freem(m); } } static inline struct mbuf * mbufq_first(const struct mbufq *mq) { return (STAILQ_FIRST(&mq->mq_head)); } static inline struct mbuf * mbufq_last(const struct mbufq *mq) { return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt)); } static inline int mbufq_full(const struct mbufq *mq) { return (mq->mq_maxlen > 0 && mq->mq_len >= mq->mq_maxlen); } static inline int mbufq_len(const struct mbufq *mq) { return (mq->mq_len); } static inline int mbufq_enqueue(struct mbufq *mq, struct mbuf *m) { if (mbufq_full(mq)) return (ENOBUFS); STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; return (0); } static inline struct mbuf * mbufq_dequeue(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); if (m) { STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt); m->m_nextpkt = NULL; mq->mq_len--; } return (m); } static inline void mbufq_prepend(struct mbufq *mq, struct mbuf *m) { STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; } /* * Note: this doesn't enforce the maximum list size for dst. */ static inline void mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src) { mq_dst->mq_len += mq_src->mq_len; STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head); mq_src->mq_len = 0; } #ifdef _SYS_TIMESPEC_H_ static inline void mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts) { KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m)); KASSERT((m->m_flags & M_TSTMP) != 0, ("mbuf %p no M_TSTMP", m)); ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000; } #endif #ifdef NETDUMP /* Invoked from the netdump client code. 
*/ void netdump_mbuf_drain(void); void netdump_mbuf_dump(void); void netdump_mbuf_reinit(int nmbuf, int nclust, int clsize); #endif static inline bool mbuf_has_tls_session(struct mbuf *m) { if (m->m_flags & M_NOMAP) { MBUF_EXT_PGS_ASSERT(m); if (m->m_ext.ext_pgs->tls != NULL) { return (true); } } return (false); } #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */