diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c index 09fc024c3d73..ea84ca191eca 100644 --- a/sys/netinet/tcp_lro.c +++ b/sys/netinet/tcp_lro.c @@ -1,1879 +1,1882 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2016-2021 Mellanox Technologies. * All rights reserved. * * Portions of this software were developed by Bjoern Zeeb * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); #define TCP_LRO_TS_OPTION \ ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP) static void tcp_lro_rx_done(struct lro_ctrl *lc); static int tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash); #ifdef TCPHPTS static bool do_bpf_strip_and_compress(struct inpcb *, struct lro_ctrl *, struct lro_entry *, struct mbuf **, struct mbuf **, struct mbuf **, bool *, bool); #endif SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP LRO"); static long tcplro_stacks_wanting_mbufq; counter_u64_t tcp_inp_lro_direct_queue; counter_u64_t tcp_inp_lro_wokeup_queue; counter_u64_t tcp_inp_lro_compressed; counter_u64_t tcp_inp_lro_locks_taken; counter_u64_t tcp_extra_mbuf; counter_u64_t tcp_would_have_but; counter_u64_t tcp_comp_total; counter_u64_t tcp_uncomp_total; static unsigned tcp_lro_entries = TCP_LRO_ENTRIES; SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries, CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0, "default number of LRO entries"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD, &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD, &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD, &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD, &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD, &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD, &tcp_would_have_but, "Number of times we would have had an extra compressed, but mget failed"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD, &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set"); SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD, &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP"); void tcp_lro_reg_mbufq(void) { atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1); } void tcp_lro_dereg_mbufq(void) { atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1); } static __inline void tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket, struct lro_entry *le) { LIST_INSERT_HEAD(&lc->lro_active, le, next); LIST_INSERT_HEAD(bucket, le, hash_next); } static __inline void tcp_lro_active_remove(struct lro_entry *le) { LIST_REMOVE(le, next); /* active list */ LIST_REMOVE(le, hash_next); /* hash bucket */ } int tcp_lro_init(struct lro_ctrl *lc) { return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0)); } int tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, unsigned lro_entries, unsigned lro_mbufs) { struct lro_entry *le; size_t size; unsigned i, elements; lc->lro_bad_csum = 0; lc->lro_queued = 0; lc->lro_flushed = 0; lc->lro_mbuf_count = 0; 
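
tcp_lro_init_args() is the entry point a NIC driver uses to attach an LRO context to a receive queue. What follows is a minimal sketch, not part of this patch, of the driver glue around it; struct myq, the myq_* names and the queue sizes are hypothetical, and the include list is approximate.

#include <sys/param.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/tcp_lro.h>

struct myq {				/* hypothetical per-queue driver state */
	struct ifnet	*ifp;
	struct lro_ctrl	 lro;
};

static int
myq_lro_setup(struct myq *q)
{
	/* 64 LRO entries and a 128-slot mbuf sort array (sizes illustrative). */
	return (tcp_lro_init_args(&q->lro, q->ifp, 64, 128));
}

static void
myq_rx_pkt(struct myq *q, struct mbuf *m)
{
	/*
	 * Queue the packet for sorted, batched LRO.  Packets that cannot
	 * be aggregated (no checksum flags, IFCAP_LRO disabled) are passed
	 * straight to if_input() by tcp_lro_queue_mbuf() itself.
	 */
	tcp_lro_queue_mbuf(&q->lro, m);
}

static void
myq_rx_done(struct myq *q)
{
	/* End of the receive burst: flush whatever is still aggregated. */
	tcp_lro_flush_all(&q->lro);
}

static void
myq_lro_teardown(struct myq *q)
{
	tcp_lro_free(&q->lro);
}

With lro_mbufs set to zero the sort array is not allocated and tcp_lro_queue_mbuf() would drop packets, so a driver taking that path calls tcp_lro_rx() per packet and flushes explicitly instead.
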
lc->lro_mbuf_max = lro_mbufs; lc->lro_cnt = lro_entries; lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; lc->lro_length_lim = TCP_LRO_LENGTH_MAX; lc->ifp = ifp; LIST_INIT(&lc->lro_free); LIST_INIT(&lc->lro_active); /* create hash table to accelerate entry lookup */ if (lro_entries > lro_mbufs) elements = lro_entries; else elements = lro_mbufs; lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz, HASH_NOWAIT); if (lc->lro_hash == NULL) { memset(lc, 0, sizeof(*lc)); return (ENOMEM); } /* compute size to allocate */ size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) + (lro_entries * sizeof(*le)); lc->lro_mbuf_data = (struct lro_mbuf_sort *) malloc(size, M_LRO, M_NOWAIT | M_ZERO); /* check for out of memory */ if (lc->lro_mbuf_data == NULL) { free(lc->lro_hash, M_LRO); memset(lc, 0, sizeof(*lc)); return (ENOMEM); } /* compute offset for LRO entries */ le = (struct lro_entry *) (lc->lro_mbuf_data + lro_mbufs); /* setup linked list */ for (i = 0; i != lro_entries; i++) LIST_INSERT_HEAD(&lc->lro_free, le + i, next); return (0); } struct vxlan_header { uint32_t vxlh_flags; uint32_t vxlh_vni; }; static inline void * tcp_lro_low_level_parser(void *ptr, struct lro_parser *parser, bool update_data, bool is_vxlan) { const struct ether_vlan_header *eh; void *old; uint16_t eth_type; if (update_data) memset(parser, 0, sizeof(*parser)); old = ptr; if (is_vxlan) { const struct vxlan_header *vxh; vxh = ptr; ptr = (uint8_t *)ptr + sizeof(*vxh); if (update_data) { parser->data.vxlan_vni = vxh->vxlh_vni & htonl(0xffffff00); } } eh = ptr; if (__predict_false(eh->evl_encap_proto == htons(ETHERTYPE_VLAN))) { eth_type = eh->evl_proto; if (update_data) { /* strip priority and keep VLAN ID only */ parser->data.vlan_id = eh->evl_tag & htons(EVL_VLID_MASK); } /* advance to next header */ ptr = (uint8_t *)ptr + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = eh->evl_encap_proto; /* advance to next header */ ptr = (uint8_t *)ptr + ETHER_HDR_LEN; } switch (eth_type) { #ifdef INET case htons(ETHERTYPE_IP): parser->ip4 = ptr; /* Ensure there are no IPv4 options. */ if ((parser->ip4->ip_hl << 2) != sizeof (*parser->ip4)) break; /* .. and the packet is not fragmented. 
*/ if (parser->ip4->ip_off & htons(IP_MF|IP_OFFMASK)) break; ptr = (uint8_t *)ptr + (parser->ip4->ip_hl << 2); if (update_data) { parser->data.s_addr.v4 = parser->ip4->ip_src; parser->data.d_addr.v4 = parser->ip4->ip_dst; } switch (parser->ip4->ip_p) { case IPPROTO_UDP: parser->udp = ptr; if (update_data) { parser->data.lro_type = LRO_TYPE_IPV4_UDP; parser->data.s_port = parser->udp->uh_sport; parser->data.d_port = parser->udp->uh_dport; } else { MPASS(parser->data.lro_type == LRO_TYPE_IPV4_UDP); } ptr = ((uint8_t *)ptr + sizeof(*parser->udp)); parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old; return (ptr); case IPPROTO_TCP: parser->tcp = ptr; if (update_data) { parser->data.lro_type = LRO_TYPE_IPV4_TCP; parser->data.s_port = parser->tcp->th_sport; parser->data.d_port = parser->tcp->th_dport; } else { MPASS(parser->data.lro_type == LRO_TYPE_IPV4_TCP); } ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2); parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old; return (ptr); default: break; } break; #endif #ifdef INET6 case htons(ETHERTYPE_IPV6): parser->ip6 = ptr; ptr = (uint8_t *)ptr + sizeof(*parser->ip6); if (update_data) { parser->data.s_addr.v6 = parser->ip6->ip6_src; parser->data.d_addr.v6 = parser->ip6->ip6_dst; } switch (parser->ip6->ip6_nxt) { case IPPROTO_UDP: parser->udp = ptr; if (update_data) { parser->data.lro_type = LRO_TYPE_IPV6_UDP; parser->data.s_port = parser->udp->uh_sport; parser->data.d_port = parser->udp->uh_dport; } else { MPASS(parser->data.lro_type == LRO_TYPE_IPV6_UDP); } ptr = (uint8_t *)ptr + sizeof(*parser->udp); parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old; return (ptr); case IPPROTO_TCP: parser->tcp = ptr; if (update_data) { parser->data.lro_type = LRO_TYPE_IPV6_TCP; parser->data.s_port = parser->tcp->th_sport; parser->data.d_port = parser->tcp->th_dport; } else { MPASS(parser->data.lro_type == LRO_TYPE_IPV6_TCP); } ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2); parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old; return (ptr); default: break; } break; #endif default: break; } /* Invalid packet - cannot parse */ return (NULL); } static const int vxlan_csum = CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID; static inline struct lro_parser * tcp_lro_parser(struct mbuf *m, struct lro_parser *po, struct lro_parser *pi, bool update_data) { void *data_ptr; /* Try to parse outer headers first. */ data_ptr = tcp_lro_low_level_parser(m->m_data, po, update_data, false); if (data_ptr == NULL || po->total_hdr_len > m->m_len) return (NULL); if (update_data) { /* Store VLAN ID, if any. */ if (__predict_false(m->m_flags & M_VLANTAG)) { po->data.vlan_id = htons(m->m_pkthdr.ether_vtag) & htons(EVL_VLID_MASK); } } switch (po->data.lro_type) { case LRO_TYPE_IPV4_UDP: case LRO_TYPE_IPV6_UDP: /* Check for VXLAN headers. */ if ((m->m_pkthdr.csum_flags & vxlan_csum) != vxlan_csum) break; /* Try to parse inner headers. */ data_ptr = tcp_lro_low_level_parser(data_ptr, pi, update_data, true); if (data_ptr == NULL || pi->total_hdr_len > m->m_len) break; /* Verify supported header types. 
*/ switch (pi->data.lro_type) { case LRO_TYPE_IPV4_TCP: case LRO_TYPE_IPV6_TCP: return (pi); default: break; } break; case LRO_TYPE_IPV4_TCP: case LRO_TYPE_IPV6_TCP: if (update_data) memset(pi, 0, sizeof(*pi)); return (po); default: break; } return (NULL); } static inline int tcp_lro_trim_mbuf_chain(struct mbuf *m, const struct lro_parser *po) { int len; switch (po->data.lro_type) { #ifdef INET case LRO_TYPE_IPV4_TCP: len = ((uint8_t *)po->ip4 - (uint8_t *)m->m_data) + ntohs(po->ip4->ip_len); break; #endif #ifdef INET6 case LRO_TYPE_IPV6_TCP: len = ((uint8_t *)po->ip6 - (uint8_t *)m->m_data) + ntohs(po->ip6->ip6_plen) + sizeof(*po->ip6); break; #endif default: return (TCP_LRO_CANNOT); } /* * If the frame is padded beyond the end of the IP packet, * then trim the extra bytes off: */ if (__predict_true(m->m_pkthdr.len == len)) { return (0); } else if (m->m_pkthdr.len > len) { m_adj(m, len - m->m_pkthdr.len); return (0); } return (TCP_LRO_CANNOT); } static struct tcphdr * tcp_lro_get_th(struct mbuf *m) { return ((struct tcphdr *)((uint8_t *)m->m_data + m->m_pkthdr.lro_tcp_h_off)); } static void lro_free_mbuf_chain(struct mbuf *m) { struct mbuf *save; while (m) { save = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = save; } } void tcp_lro_free(struct lro_ctrl *lc) { struct lro_entry *le; unsigned x; /* reset LRO free list */ LIST_INIT(&lc->lro_free); /* free active mbufs, if any */ while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); lro_free_mbuf_chain(le->m_head); } /* free hash table */ free(lc->lro_hash, M_LRO); lc->lro_hash = NULL; lc->lro_hashsz = 0; /* free mbuf array, if any */ for (x = 0; x != lc->lro_mbuf_count; x++) m_freem(lc->lro_mbuf_data[x].mb); lc->lro_mbuf_count = 0; /* free allocated memory, if any */ free(lc->lro_mbuf_data, M_LRO); lc->lro_mbuf_data = NULL; } static uint16_t tcp_lro_rx_csum_tcphdr(const struct tcphdr *th) { const uint16_t *ptr; uint32_t csum; uint16_t len; csum = -th->th_sum; /* exclude checksum field */ len = th->th_off; ptr = (const uint16_t *)th; while (len--) { csum += *ptr; ptr++; csum += *ptr; ptr++; } while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); return (csum); } static uint16_t tcp_lro_rx_csum_data(const struct lro_parser *pa, uint16_t tcp_csum) { uint32_t c; uint16_t cs; c = tcp_csum; switch (pa->data.lro_type) { #ifdef INET6 case LRO_TYPE_IPV6_TCP: /* Compute full pseudo IPv6 header checksum. */ cs = in6_cksum_pseudo(pa->ip6, ntohs(pa->ip6->ip6_plen), pa->ip6->ip6_nxt, 0); break; #endif #ifdef INET case LRO_TYPE_IPV4_TCP: /* Compute full pseudo IPv4 header checsum. */ cs = in_addword(ntohs(pa->ip4->ip_len) - sizeof(*pa->ip4), IPPROTO_TCP); cs = in_pseudo(pa->ip4->ip_src.s_addr, pa->ip4->ip_dst.s_addr, htons(cs)); break; #endif default: cs = 0; /* Keep compiler happy. */ break; } /* Complement checksum. */ cs = ~cs; c += cs; /* Remove TCP header checksum. */ cs = ~tcp_lro_rx_csum_tcphdr(pa->tcp); c += cs; /* Compute checksum remainder. 
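
Both tcp_lro_rx_csum_tcphdr() and tcp_lro_rx_csum_data() accumulate 16-bit words into a 32-bit value and then fold the carries back in with the while (c > 0xffff) loop. A tiny standalone sketch of that end-around-carry fold with concrete numbers; csum_fold() is a name invented for the example, not a routine in this file.

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit ones'-complement accumulator back into 16 bits,
 * using the same loop the LRO checksum helpers use. */
static uint16_t
csum_fold(uint32_t csum)
{
	while (csum > 0xffff)
		csum = (csum >> 16) + (csum & 0xffff);
	return (csum);
}

int
main(void)
{
	/* 0xffff + 0x0003 = 0x10002; the carry out of bit 15 wraps
	 * around and is re-added, giving 0x0003. */
	printf("0x%04x\n", csum_fold(0xffffu + 0x0003u));
	return (0);
}
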
*/ while (c > 0xffff) c = (c >> 16) + (c & 0xffff); return (c); } static void tcp_lro_rx_done(struct lro_ctrl *lc) { struct lro_entry *le; while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } void tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) { struct lro_entry *le, *le_tmp; - sbintime_t sbt; + uint64_t now, tov; + struct bintime bt; if (LIST_EMPTY(&lc->lro_active)) return; - /* get timeout time */ - sbt = getsbinuptime() - tvtosbt(*timeout); - + /* get timeout time and current time in ns */ + binuptime(&bt); + now = bintime2ns(&bt); + tov = ((timeout->tv_sec * 1000000000) + (timeout->tv_usec * 1000)); LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { - if (sbt >= le->alloc_time) { + if (now >= (bintime2ns(&le->alloc_time) + tov)) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } } #ifdef INET static int tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4) { uint16_t csum; /* Legacy IP has a header checksum that needs to be correct. */ if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { if (__predict_false((m->m_pkthdr.csum_flags & CSUM_IP_VALID) == 0)) { lc->lro_bad_csum++; return (TCP_LRO_CANNOT); } } else { csum = in_cksum_hdr(ip4); if (__predict_false(csum != 0)) { lc->lro_bad_csum++; return (TCP_LRO_CANNOT); } } return (0); } #endif #ifdef TCPHPTS static void tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc, const struct lro_entry *le, const struct mbuf *m, int frm, int32_t tcp_data_len, uint32_t th_seq, uint32_t th_ack, uint16_t th_win) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; - struct timeval tv; + struct timeval tv, btv; uint32_t cts; cts = tcp_get_usecs(&tv); memset(&log, 0, sizeof(union tcp_log_stackspecific)); log.u_bbr.flex8 = frm; log.u_bbr.flex1 = tcp_data_len; if (m) log.u_bbr.flex2 = m->m_pkthdr.len; else log.u_bbr.flex2 = 0; log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs; log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len; if (le->m_head) { log.u_bbr.flex5 = le->m_head->m_pkthdr.len; log.u_bbr.delRate = le->m_head->m_flags; log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp; } log.u_bbr.inflight = th_seq; log.u_bbr.timeStamp = cts; log.u_bbr.epoch = le->next_seq; log.u_bbr.delivered = th_ack; log.u_bbr.lt_epoch = le->ack_seq; log.u_bbr.pacing_gain = th_win; log.u_bbr.cwnd_gain = le->window; log.u_bbr.cur_del_rate = (uintptr_t)m; log.u_bbr.bw_inuse = (uintptr_t)le->m_head; - log.u_bbr.flex6 = sbttous(lc->lro_last_queue_time); + bintime2timeval(&lc->lro_last_queue_time, &btv); + log.u_bbr.flex6 = tcp_tv_to_usectick(&btv); log.u_bbr.flex7 = le->compressed; log.u_bbr.pacing_gain = le->uncompressed; if (in_epoch(net_epoch_preempt)) log.u_bbr.inhpts = 1; else log.u_bbr.inhpts = 0; TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_LRO, 0, 0, &log, false, &tv); } } #endif static inline void tcp_lro_assign_and_checksum_16(uint16_t *ptr, uint16_t value, uint16_t *psum) { uint32_t csum; csum = 0xffff - *ptr + value; while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); *ptr = value; *psum = csum; } static uint16_t tcp_lro_update_checksum(const struct lro_parser *pa, const struct lro_entry *le, uint16_t payload_len, uint16_t delta_sum) { uint32_t csum; uint16_t tlen; uint16_t temp[5] = {}; switch (pa->data.lro_type) { case LRO_TYPE_IPV4_TCP: /* Compute new IPv4 length. 
*/ tlen = (pa->ip4->ip_hl << 2) + (pa->tcp->th_off << 2) + payload_len; tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]); /* Subtract delta from current IPv4 checksum. */ csum = pa->ip4->ip_sum + 0xffff - temp[0]; while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]); goto update_tcp_header; case LRO_TYPE_IPV6_TCP: /* Compute new IPv6 length. */ tlen = (pa->tcp->th_off << 2) + payload_len; tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]); goto update_tcp_header; case LRO_TYPE_IPV4_UDP: /* Compute new IPv4 length. */ tlen = (pa->ip4->ip_hl << 2) + sizeof(*pa->udp) + payload_len; tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]); /* Subtract delta from current IPv4 checksum. */ csum = pa->ip4->ip_sum + 0xffff - temp[0]; while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]); goto update_udp_header; case LRO_TYPE_IPV6_UDP: /* Compute new IPv6 length. */ tlen = sizeof(*pa->udp) + payload_len; tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]); goto update_udp_header; default: return (0); } update_tcp_header: /* Compute current TCP header checksum. */ temp[2] = tcp_lro_rx_csum_tcphdr(pa->tcp); /* Incorporate the latest ACK into the TCP header. */ pa->tcp->th_ack = le->ack_seq; pa->tcp->th_win = le->window; /* Incorporate latest timestamp into the TCP header. */ if (le->timestamp != 0) { uint32_t *ts_ptr; ts_ptr = (uint32_t *)(pa->tcp + 1); ts_ptr[1] = htonl(le->tsval); ts_ptr[2] = le->tsecr; } /* Compute new TCP header checksum. */ temp[3] = tcp_lro_rx_csum_tcphdr(pa->tcp); /* Compute new TCP checksum. */ csum = pa->tcp->th_sum + 0xffff - delta_sum + 0xffff - temp[0] + 0xffff - temp[3] + temp[2]; while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); /* Assign new TCP checksum. */ tcp_lro_assign_and_checksum_16(&pa->tcp->th_sum, csum, &temp[4]); /* Compute all modififications affecting next checksum. */ csum = temp[0] + temp[1] + 0xffff - temp[2] + temp[3] + temp[4] + delta_sum; while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); /* Return delta checksum to next stage, if any. */ return (csum); update_udp_header: tlen = sizeof(*pa->udp) + payload_len; /* Assign new UDP length and compute checksum delta. */ tcp_lro_assign_and_checksum_16(&pa->udp->uh_ulen, htons(tlen), &temp[2]); /* Check if there is a UDP checksum. */ if (__predict_false(pa->udp->uh_sum != 0)) { /* Compute new UDP checksum. */ csum = pa->udp->uh_sum + 0xffff - delta_sum + 0xffff - temp[0] + 0xffff - temp[2]; while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); /* Assign new UDP checksum. */ tcp_lro_assign_and_checksum_16(&pa->udp->uh_sum, csum, &temp[3]); } /* Compute all modififications affecting next checksum. */ csum = temp[0] + temp[1] + temp[2] + temp[3] + delta_sum; while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); /* Return delta checksum to next stage, if any. */ return (csum); } static void tcp_flush_out_entry(struct lro_ctrl *lc, struct lro_entry *le) { /* Check if we need to recompute any checksums. 
*/ if (le->m_head->m_pkthdr.lro_nsegs > 1) { uint16_t csum; switch (le->inner.data.lro_type) { case LRO_TYPE_IPV4_TCP: csum = tcp_lro_update_checksum(&le->inner, le, le->m_head->m_pkthdr.lro_tcp_d_len, le->m_head->m_pkthdr.lro_tcp_d_csum); csum = tcp_lro_update_checksum(&le->outer, NULL, le->m_head->m_pkthdr.lro_tcp_d_len + le->inner.total_hdr_len, csum); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; le->m_head->m_pkthdr.csum_data = 0xffff; break; case LRO_TYPE_IPV6_TCP: csum = tcp_lro_update_checksum(&le->inner, le, le->m_head->m_pkthdr.lro_tcp_d_len, le->m_head->m_pkthdr.lro_tcp_d_csum); csum = tcp_lro_update_checksum(&le->outer, NULL, le->m_head->m_pkthdr.lro_tcp_d_len + le->inner.total_hdr_len, csum); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; le->m_head->m_pkthdr.csum_data = 0xffff; break; case LRO_TYPE_NONE: switch (le->outer.data.lro_type) { case LRO_TYPE_IPV4_TCP: csum = tcp_lro_update_checksum(&le->outer, le, le->m_head->m_pkthdr.lro_tcp_d_len, le->m_head->m_pkthdr.lro_tcp_d_csum); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; le->m_head->m_pkthdr.csum_data = 0xffff; break; case LRO_TYPE_IPV6_TCP: csum = tcp_lro_update_checksum(&le->outer, le, le->m_head->m_pkthdr.lro_tcp_d_len, le->m_head->m_pkthdr.lro_tcp_d_csum); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; le->m_head->m_pkthdr.csum_data = 0xffff; break; default: break; } break; default: break; } } /* * Break any chain, this is not set to NULL on the singleton * case m_nextpkt points to m_head. Other case set them * m_nextpkt to NULL in push_and_replace. */ le->m_head->m_nextpkt = NULL; lc->lro_queued += le->m_head->m_pkthdr.lro_nsegs; (*lc->ifp->if_input)(lc->ifp, le->m_head); } static void tcp_set_entry_to_mbuf(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m, struct tcphdr *th) { uint32_t *ts_ptr; uint16_t tcp_data_len; uint16_t tcp_opt_len; ts_ptr = (uint32_t *)(th + 1); tcp_opt_len = (th->th_off << 2); tcp_opt_len -= sizeof(*th); /* Check if there is a timestamp option. */ if (tcp_opt_len == 0 || __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA || *ts_ptr != TCP_LRO_TS_OPTION)) { /* We failed to find the timestamp option. */ le->timestamp = 0; } else { le->timestamp = 1; le->tsval = ntohl(*(ts_ptr + 1)); le->tsecr = *(ts_ptr + 2); } tcp_data_len = m->m_pkthdr.lro_tcp_d_len; /* Pull out TCP sequence numbers and window size. */ le->next_seq = ntohl(th->th_seq) + tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; /* Setup new data pointers. */ le->m_head = m; le->m_tail = m_last(m); } static void tcp_push_and_replace(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m) { struct lro_parser *pa; /* * Push up the stack of the current entry * and replace it with "m". */ struct mbuf *msave; /* Grab off the next and save it */ msave = le->m_head->m_nextpkt; le->m_head->m_nextpkt = NULL; /* Now push out the old entry */ tcp_flush_out_entry(lc, le); /* Re-parse new header, should not fail. */ pa = tcp_lro_parser(m, &le->outer, &le->inner, false); KASSERT(pa != NULL, ("tcp_push_and_replace: LRO parser failed on m=%p\n", m)); /* * Now to replace the data properly in the entry * we have to reset the TCP header and * other fields. 
*/ tcp_set_entry_to_mbuf(lc, le, m, pa->tcp); /* Restore the next list */ m->m_nextpkt = msave; } static void tcp_lro_mbuf_append_pkthdr(struct mbuf *m, const struct mbuf *p) { uint32_t csum; if (m->m_pkthdr.lro_nsegs == 1) { /* Compute relative checksum. */ csum = p->m_pkthdr.lro_tcp_d_csum; } else { /* Merge TCP data checksums. */ csum = (uint32_t)m->m_pkthdr.lro_tcp_d_csum + (uint32_t)p->m_pkthdr.lro_tcp_d_csum; while (csum > 0xffff) csum = (csum >> 16) + (csum & 0xffff); } /* Update various counters. */ m->m_pkthdr.len += p->m_pkthdr.lro_tcp_d_len; m->m_pkthdr.lro_tcp_d_csum = csum; m->m_pkthdr.lro_tcp_d_len += p->m_pkthdr.lro_tcp_d_len; m->m_pkthdr.lro_nsegs += p->m_pkthdr.lro_nsegs; } static void tcp_lro_condense(struct lro_ctrl *lc, struct lro_entry *le) { /* * Walk through the mbuf chain we * have on tap and compress/condense * as required. */ uint32_t *ts_ptr; struct mbuf *m; struct tcphdr *th; uint32_t tcp_data_len_total; uint32_t tcp_data_seg_total; uint16_t tcp_data_len; uint16_t tcp_opt_len; /* * First we must check the lead (m_head) * we must make sure that it is *not* * something that should be sent up * right away (sack etc). */ again: m = le->m_head->m_nextpkt; if (m == NULL) { /* Just one left. */ return; } th = tcp_lro_get_th(m); tcp_opt_len = (th->th_off << 2); tcp_opt_len -= sizeof(*th); ts_ptr = (uint32_t *)(th + 1); if (tcp_opt_len != 0 && __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA || *ts_ptr != TCP_LRO_TS_OPTION)) { /* * Its not the timestamp. We can't * use this guy as the head. */ le->m_head->m_nextpkt = m->m_nextpkt; tcp_push_and_replace(lc, le, m); goto again; } if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { /* * Make sure that previously seen segements/ACKs are delivered * before this segment, e.g. FIN. */ le->m_head->m_nextpkt = m->m_nextpkt; tcp_push_and_replace(lc, le, m); goto again; } while((m = le->m_head->m_nextpkt) != NULL) { /* * condense m into le, first * pull m out of the list. */ le->m_head->m_nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; /* Setup my data */ tcp_data_len = m->m_pkthdr.lro_tcp_d_len; th = tcp_lro_get_th(m); ts_ptr = (uint32_t *)(th + 1); tcp_opt_len = (th->th_off << 2); tcp_opt_len -= sizeof(*th); tcp_data_len_total = le->m_head->m_pkthdr.lro_tcp_d_len + tcp_data_len; tcp_data_seg_total = le->m_head->m_pkthdr.lro_nsegs + m->m_pkthdr.lro_nsegs; if (tcp_data_seg_total >= lc->lro_ackcnt_lim || tcp_data_len_total >= lc->lro_length_lim) { /* Flush now if appending will result in overflow. */ tcp_push_and_replace(lc, le, m); goto again; } if (tcp_opt_len != 0 && __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA || *ts_ptr != TCP_LRO_TS_OPTION)) { /* * Maybe a sack in the new one? We need to * start all over after flushing the * current le. We will go up to the beginning * and flush it (calling the replace again possibly * or just returning). */ tcp_push_and_replace(lc, le, m); goto again; } if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) { tcp_push_and_replace(lc, le, m); goto again; } if (tcp_opt_len != 0) { uint32_t tsval = ntohl(*(ts_ptr + 1)); /* Make sure timestamp values are increasing. */ if (TSTMP_GT(le->tsval, tsval)) { tcp_push_and_replace(lc, le, m); goto again; } le->tsval = tsval; le->tsecr = *(ts_ptr + 2); } /* Try to append the new segment. */ if (__predict_false(ntohl(th->th_seq) != le->next_seq || (tcp_data_len == 0 && le->ack_seq == th->th_ack && le->window == th->th_win))) { /* Out of order packet or duplicate ACK. 
*/ tcp_push_and_replace(lc, le, m); goto again; } if (tcp_data_len != 0 || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) { le->next_seq += tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; } else if (th->th_ack == le->ack_seq) { le->window = WIN_MAX(le->window, th->th_win); } if (tcp_data_len == 0) { m_freem(m); continue; } /* Merge TCP data checksum and length to head mbuf. */ tcp_lro_mbuf_append_pkthdr(le->m_head, m); /* * Adjust the mbuf so that m_data points to the first byte of * the ULP payload. Adjust the mbuf to avoid complications and * append new segment to existing mbuf chain. */ m_adj(m, m->m_pkthdr.len - tcp_data_len); m_demote_pkthdr(m); le->m_tail->m_next = m; le->m_tail = m_last(m); } } #ifdef TCPHPTS static void tcp_queue_pkts(struct inpcb *inp, struct tcpcb *tp, struct lro_entry *le) { INP_WLOCK_ASSERT(inp); if (tp->t_in_pkt == NULL) { /* Nothing yet there */ tp->t_in_pkt = le->m_head; tp->t_tail_pkt = le->m_last_mbuf; } else { /* Already some there */ tp->t_tail_pkt->m_nextpkt = le->m_head; tp->t_tail_pkt = le->m_last_mbuf; } le->m_head = NULL; le->m_last_mbuf = NULL; } static struct mbuf * tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le, struct inpcb *inp, int32_t *new_m) { struct tcpcb *tp; struct mbuf *m; tp = intotcpcb(inp); if (__predict_false(tp == NULL)) return (NULL); /* Look at the last mbuf if any in queue */ m = tp->t_tail_pkt; if (m != NULL && (m->m_flags & M_ACKCMP) != 0) { if (M_TRAILINGSPACE(m) >= sizeof(struct tcp_ackent)) { tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0); *new_m = 0; counter_u64_add(tcp_extra_mbuf, 1); return (m); } else { /* Mark we ran out of space */ inp->inp_flags2 |= INP_MBUF_L_ACKS; } } /* Decide mbuf size. */ if (inp->inp_flags2 & INP_MBUF_L_ACKS) m = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR); else m = m_gethdr(M_NOWAIT, MT_DATA); if (__predict_false(m == NULL)) { counter_u64_add(tcp_would_have_but, 1); return (NULL); } counter_u64_add(tcp_comp_total, 1); m->m_flags |= M_ACKCMP; *new_m = 1; return (m); } static struct inpcb * tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa) { struct inpcb *inp; NET_EPOCH_ASSERT(); switch (pa->data.lro_type) { #ifdef INET6 case LRO_TYPE_IPV6_TCP: inp = in6_pcblookup(&V_tcbinfo, &pa->data.s_addr.v6, pa->data.s_port, &pa->data.d_addr.v6, pa->data.d_port, INPLOOKUP_WLOCKPCB, ifp); break; #endif #ifdef INET case LRO_TYPE_IPV4_TCP: inp = in_pcblookup(&V_tcbinfo, pa->data.s_addr.v4, pa->data.s_port, pa->data.d_addr.v4, pa->data.d_port, INPLOOKUP_WLOCKPCB, ifp); break; #endif default: inp = NULL; break; } return (inp); } static inline bool tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts) { /* * This function returns two bits of valuable information. * a) Is what is present capable of being ack-compressed, * we can ack-compress if there is no options or just * a timestamp option, and of course the th_flags must * be correct as well. * b) Our other options present such as SACK. This is * used to determine if we want to wakeup or not. */ bool ret = true; switch (th->th_off << 2) { case (sizeof(*th) + TCPOLEN_TSTAMP_APPA): *ppts = (uint32_t *)(th + 1); /* Check if we have only one timestamp option. */ if (**ppts == TCP_LRO_TS_OPTION) *other_opts = false; else { *other_opts = true; ret = false; } break; case (sizeof(*th)): /* No options. */ *ppts = NULL; *other_opts = false; break; default: *ppts = NULL; *other_opts = true; ret = false; break; } /* For ACKCMP we only accept ACK, PUSH, ECE and CWR. 
*/ if ((th->th_flags & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0) ret = false; /* If it has data on it we cannot compress it */ if (m->m_pkthdr.lro_tcp_d_len) ret = false; /* ACK flag must be set. */ if (!(th->th_flags & TH_ACK)) ret = false; return (ret); } static int tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le) { struct inpcb *inp; struct tcpcb *tp; struct mbuf **pp, *cmp, *mv_to; bool bpf_req, should_wake; /* Check if packet doesn't belongs to our network interface. */ if ((tcplro_stacks_wanting_mbufq == 0) || (le->outer.data.vlan_id != 0) || (le->inner.data.lro_type != LRO_TYPE_NONE)) return (TCP_LRO_CANNOT); #ifdef INET6 /* * Be proactive about unspecified IPv6 address in source. As * we use all-zero to indicate unbounded/unconnected pcb, * unspecified IPv6 address can be used to confuse us. * * Note that packets with unspecified IPv6 destination is * already dropped in ip6_input. */ if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP && IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6))) return (TCP_LRO_CANNOT); if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP && IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6))) return (TCP_LRO_CANNOT); #endif /* Lookup inp, if any. */ inp = tcp_lro_lookup(lc->ifp, (le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner); if (inp == NULL) return (TCP_LRO_CANNOT); counter_u64_add(tcp_inp_lro_locks_taken, 1); /* Get TCP control structure. */ tp = intotcpcb(inp); /* Check if the inp is dead, Jim. */ if (tp == NULL || (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || (inp->inp_flags2 & INP_FREED)) { INP_WUNLOCK(inp); return (TCP_LRO_CANNOT); } /* Check if the transport doesn't support the needed optimizations. */ if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) { INP_WUNLOCK(inp); return (TCP_LRO_CANNOT); } if (inp->inp_flags2 & INP_MBUF_QUEUE_READY) should_wake = false; else should_wake = true; /* Check if packets should be tapped to BPF. */ bpf_req = bpf_peers_present(lc->ifp->if_bpf); /* Strip and compress all the incoming packets. */ cmp = NULL; for (pp = &le->m_head; *pp != NULL; ) { mv_to = NULL; if (do_bpf_strip_and_compress(inp, lc, le, pp, &cmp, &mv_to, &should_wake, bpf_req ) == false) { /* Advance to next mbuf. */ pp = &(*pp)->m_nextpkt; } else if (mv_to != NULL) { /* We are asked to move pp up */ pp = &mv_to->m_nextpkt; } } /* Update "m_last_mbuf", if any. */ if (pp == &le->m_head) le->m_last_mbuf = *pp; else le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt); /* Check if any data mbufs left. */ if (le->m_head != NULL) { counter_u64_add(tcp_inp_lro_direct_queue, 1); tcp_lro_log(tp, lc, le, NULL, 22, 1, inp->inp_flags2, inp->inp_in_input, 1); tcp_queue_pkts(inp, tp, le); } if (should_wake) { /* Wakeup */ counter_u64_add(tcp_inp_lro_wokeup_queue, 1); if ((*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0)) inp = NULL; } if (inp != NULL) INP_WUNLOCK(inp); return (0); /* Success. */ } #endif void tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) { /* Only optimise if there are multiple packets waiting. 
*/ #ifdef TCPHPTS int error; CURVNET_SET(lc->ifp->if_vnet); error = tcp_lro_flush_tcphpts(lc, le); CURVNET_RESTORE(); if (error != 0) { #endif tcp_lro_condense(lc, le); tcp_flush_out_entry(lc, le); #ifdef TCPHPTS } #endif lc->lro_flushed++; bzero(le, sizeof(*le)); LIST_INSERT_HEAD(&lc->lro_free, le, next); } #ifdef HAVE_INLINE_FLSLL #define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1)) #else static inline uint64_t tcp_lro_msb_64(uint64_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); return (x & ~(x >> 1)); } #endif /* * The tcp_lro_sort() routine is comparable to qsort(), except it has * a worst case complexity limit of O(MIN(N,64)*N), where N is the * number of elements to sort and 64 is the number of sequence bits * available. The algorithm is bit-slicing the 64-bit sequence number, * sorting one bit at a time from the most significant bit until the * least significant one, skipping the constant bits. This is * typically called a radix sort. */ static void tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size) { struct lro_mbuf_sort temp; uint64_t ones; uint64_t zeros; uint32_t x; uint32_t y; repeat: /* for small arrays insertion sort is faster */ if (size <= 12) { for (x = 1; x < size; x++) { temp = parray[x]; for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) parray[y] = parray[y - 1]; parray[y] = temp; } return; } /* compute sequence bits which are constant */ ones = 0; zeros = 0; for (x = 0; x != size; x++) { ones |= parray[x].seq; zeros |= ~parray[x].seq; } /* compute bits which are not constant into "ones" */ ones &= zeros; if (ones == 0) return; /* pick the most significant bit which is not constant */ ones = tcp_lro_msb_64(ones); /* * Move entries having cleared sequence bits to the beginning * of the array: */ for (x = y = 0; y != size; y++) { /* skip set bits */ if (parray[y].seq & ones) continue; /* swap entries */ temp = parray[x]; parray[x] = parray[y]; parray[y] = temp; x++; } KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); /* sort zeros */ tcp_lro_sort(parray, x); /* sort ones */ parray += x; size -= x; goto repeat; } void tcp_lro_flush_all(struct lro_ctrl *lc) { uint64_t seq; uint64_t nseq; unsigned x; /* check if no mbufs to flush */ if (lc->lro_mbuf_count == 0) goto done; CURVNET_SET(lc->ifp->if_vnet); /* get current time */ - lc->lro_last_queue_time = getsbinuptime(); + binuptime(&lc->lro_last_queue_time); /* sort all mbufs according to stream */ tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); /* input data into LRO engine, stream by stream */ seq = 0; for (x = 0; x != lc->lro_mbuf_count; x++) { struct mbuf *mb; /* get mbuf */ mb = lc->lro_mbuf_data[x].mb; /* get sequence number, masking away the packet index */ nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); /* check for new stream */ if (seq != nseq) { seq = nseq; /* flush active streams */ tcp_lro_rx_done(lc); } /* add packet to LRO engine */ if (tcp_lro_rx_common(lc, mb, 0, false) != 0) { /* input packet to network layer */ (*lc->ifp->if_input)(lc->ifp, mb); lc->lro_queued++; lc->lro_flushed++; } } CURVNET_RESTORE(); done: /* flush active streams */ tcp_lro_rx_done(lc); lc->lro_mbuf_count = 0; } #ifdef TCPHPTS static void build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m, uint32_t *ts_ptr, uint16_t iptos) { /* * Given a TCP ACK, summarize it down into the small TCP ACK * entry. 
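
tcp_lro_flush_all() above detects stream boundaries by masking off the low 24 bits of each sort key; the keys themselves are built later, in tcp_lro_queue_mbuf(), from the hash type (top 8 bits), the flowid (next 32 bits) and the per-batch arrival index (low 24 bits). A small userland-style sketch with made-up values, showing why sorting on this key groups packets of the same flow while keeping their arrival order; lro_sort_key() is an illustrative name, not a kernel function.

#include <stdint.h>
#include <stdio.h>

/* Build the same 64-bit sort key layout tcp_lro_queue_mbuf() uses:
 * bits 63-56 hash type, bits 55-24 flowid, bits 23-0 arrival index. */
static uint64_t
lro_sort_key(uint8_t hash_type, uint32_t flowid, uint32_t index)
{
	return (((uint64_t)hash_type << 56) |
	    ((uint64_t)flowid << 24) | (uint64_t)(index & 0xffffff));
}

int
main(void)
{
	uint64_t a = lro_sort_key(2, 0x1234, 0);	/* flow A, arrived 1st */
	uint64_t b = lro_sort_key(2, 0x1234, 5);	/* flow A, arrived later */
	uint64_t c = lro_sort_key(2, 0x9999, 1);	/* flow B */

	/* Masking off the low 24 bits (the arrival index) is how
	 * tcp_lro_flush_all() recognizes that a new stream starts. */
	printf("same flow: %d\n", (a & (-1ULL << 24)) == (b & (-1ULL << 24)));
	printf("new flow:  %d\n", (a & (-1ULL << 24)) == (c & (-1ULL << 24)));
	return (0);
}
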
*/ ae->timestamp = m->m_pkthdr.rcv_tstmp; if (m->m_flags & M_TSTMP_LRO) ae->flags = TSTMP_LRO; else if (m->m_flags & M_TSTMP) ae->flags = TSTMP_HDWR; ae->seq = ntohl(th->th_seq); ae->ack = ntohl(th->th_ack); ae->flags |= th->th_flags; if (ts_ptr != NULL) { ae->ts_value = ntohl(ts_ptr[1]); ae->ts_echo = ntohl(ts_ptr[2]); ae->flags |= HAS_TSTMP; } ae->win = ntohs(th->th_win); ae->codepoint = iptos; } /* * Do BPF tap for either ACK_CMP packets or MBUF QUEUE type packets * and strip all, but the IPv4/IPv6 header. */ static bool do_bpf_strip_and_compress(struct inpcb *inp, struct lro_ctrl *lc, struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp, struct mbuf **mv_to, bool *should_wake, bool bpf_req) { union { void *ptr; struct ip *ip4; struct ip6_hdr *ip6; } l3; struct mbuf *m; struct mbuf *nm; struct tcphdr *th; struct tcp_ackent *ack_ent; uint32_t *ts_ptr; int32_t n_mbuf; bool other_opts, can_compress; uint16_t lro_type; uint16_t iptos; int tcp_hdr_offset; int idx; /* Get current mbuf. */ m = *pp; /* Let the BPF see the packet */ if (__predict_false(bpf_req)) ETHER_BPF_MTAP(lc->ifp, m); tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off; lro_type = le->inner.data.lro_type; switch (lro_type) { case LRO_TYPE_NONE: lro_type = le->outer.data.lro_type; switch (lro_type) { case LRO_TYPE_IPV4_TCP: tcp_hdr_offset -= sizeof(*le->outer.ip4); m->m_pkthdr.lro_etype = ETHERTYPE_IP; break; case LRO_TYPE_IPV6_TCP: tcp_hdr_offset -= sizeof(*le->outer.ip6); m->m_pkthdr.lro_etype = ETHERTYPE_IPV6; break; default: goto compressed; } break; case LRO_TYPE_IPV4_TCP: tcp_hdr_offset -= sizeof(*le->outer.ip4); m->m_pkthdr.lro_etype = ETHERTYPE_IP; break; case LRO_TYPE_IPV6_TCP: tcp_hdr_offset -= sizeof(*le->outer.ip6); m->m_pkthdr.lro_etype = ETHERTYPE_IPV6; break; default: goto compressed; } MPASS(tcp_hdr_offset >= 0); m_adj(m, tcp_hdr_offset); m->m_flags |= M_LRO_EHDRSTRP; m->m_flags &= ~M_ACKCMP; m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset; th = tcp_lro_get_th(m); th->th_sum = 0; /* TCP checksum is valid. */ /* Check if ACK can be compressed */ can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts); /* Now lets look at the should wake states */ if ((other_opts == true) && ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) == 0)) { /* * If there are other options (SACK?) and the * tcp endpoint has not expressly told us it does * not care about SACKS, then we should wake up. */ *should_wake = true; } /* Is the ack compressable? */ if (can_compress == false) goto done; /* Does the TCP endpoint support ACK compression? */ if ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0) goto done; /* Lets get the TOS/traffic class field */ l3.ptr = mtod(m, void *); switch (lro_type) { case LRO_TYPE_IPV4_TCP: iptos = l3.ip4->ip_tos; break; case LRO_TYPE_IPV6_TCP: iptos = IPV6_TRAFFIC_CLASS(l3.ip6); break; default: iptos = 0; /* Keep compiler happy. */ break; } /* Now lets get space if we don't have some already */ if (*cmp == NULL) { new_one: nm = tcp_lro_get_last_if_ackcmp(lc, le, inp, &n_mbuf); if (__predict_false(nm == NULL)) goto done; *cmp = nm; if (n_mbuf) { /* * Link in the new cmp ack to our in-order place, * first set our cmp ack's next to where we are. */ nm->m_nextpkt = m; (*pp) = nm; /* * Set it up so mv_to is advanced to our * compressed ack. This way the caller can * advance pp to the right place. */ *mv_to = nm; /* * Advance it here locally as well. 
*/ pp = &nm->m_nextpkt; } } else { /* We have one already we are working on */ nm = *cmp; if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) { /* We ran out of space */ inp->inp_flags2 |= INP_MBUF_L_ACKS; goto new_one; } } MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent)); counter_u64_add(tcp_inp_lro_compressed, 1); le->compressed++; /* We can add in to the one on the tail */ ack_ent = mtod(nm, struct tcp_ackent *); idx = (nm->m_len / sizeof(struct tcp_ackent)); build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos); /* Bump the size of both pkt-hdr and len */ nm->m_len += sizeof(struct tcp_ackent); nm->m_pkthdr.len += sizeof(struct tcp_ackent); compressed: /* Advance to next mbuf before freeing. */ *pp = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); return (true); done: counter_u64_add(tcp_uncomp_total, 1); le->uncompressed++; return (false); } #endif static struct lro_head * tcp_lro_rx_get_bucket(struct lro_ctrl *lc, struct mbuf *m, struct lro_parser *parser) { u_long hash; if (M_HASHTYPE_ISHASH(m)) { hash = m->m_pkthdr.flowid; } else { for (unsigned i = hash = 0; i != LRO_RAW_ADDRESS_MAX; i++) hash += parser->data.raw[i]; } return (&lc->lro_hash[hash % lc->lro_hashsz]); } static int tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash) { struct lro_parser pi; /* inner address data */ struct lro_parser po; /* outer address data */ struct lro_parser *pa; /* current parser for TCP stream */ struct lro_entry *le; struct lro_head *bucket; struct tcphdr *th; int tcp_data_len; int tcp_opt_len; int error; uint16_t tcp_data_sum; #ifdef INET /* Quickly decide if packet cannot be LRO'ed */ if (__predict_false(V_ipforwarding != 0)) return (TCP_LRO_CANNOT); #endif #ifdef INET6 /* Quickly decide if packet cannot be LRO'ed */ if (__predict_false(V_ip6_forwarding != 0)) return (TCP_LRO_CANNOT); #endif /* We expect a contiguous header [eh, ip, tcp]. */ pa = tcp_lro_parser(m, &po, &pi, true); if (__predict_false(pa == NULL)) return (TCP_LRO_NOT_SUPPORTED); /* We don't expect any padding. */ error = tcp_lro_trim_mbuf_chain(m, pa); if (__predict_false(error != 0)) return (error); #ifdef INET switch (pa->data.lro_type) { case LRO_TYPE_IPV4_TCP: error = tcp_lro_rx_ipv4(lc, m, pa->ip4); if (__predict_false(error != 0)) return (error); break; default: break; } #endif /* If no hardware or arrival stamp on the packet add timestamp */ if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) { - m->m_pkthdr.rcv_tstmp = sbttons(lc->lro_last_queue_time); + m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); m->m_flags |= M_TSTMP_LRO; } /* Get pointer to TCP header. */ th = pa->tcp; /* Don't process SYN packets. */ if (__predict_false(th->th_flags & TH_SYN)) return (TCP_LRO_CANNOT); /* Get total TCP header length and compute payload length. */ tcp_opt_len = (th->th_off << 2); tcp_data_len = m->m_pkthdr.len - ((uint8_t *)th - (uint8_t *)m->m_data) - tcp_opt_len; tcp_opt_len -= sizeof(*th); /* Don't process invalid TCP headers. */ if (__predict_false(tcp_opt_len < 0 || tcp_data_len < 0)) return (TCP_LRO_CANNOT); /* Compute TCP data only checksum. */ if (tcp_data_len == 0) tcp_data_sum = 0; /* no data, no checksum */ else if (__predict_false(csum != 0)) tcp_data_sum = tcp_lro_rx_csum_data(pa, ~csum); else tcp_data_sum = tcp_lro_rx_csum_data(pa, ~th->th_sum); /* Save TCP info in mbuf. 
*/ m->m_nextpkt = NULL; m->m_pkthdr.rcvif = lc->ifp; m->m_pkthdr.lro_tcp_d_csum = tcp_data_sum; m->m_pkthdr.lro_tcp_d_len = tcp_data_len; m->m_pkthdr.lro_tcp_h_off = ((uint8_t *)th - (uint8_t *)m->m_data); m->m_pkthdr.lro_nsegs = 1; /* Get hash bucket. */ if (!use_hash) { bucket = &lc->lro_hash[0]; } else { bucket = tcp_lro_rx_get_bucket(lc, m, pa); } /* Try to find a matching previous segment. */ LIST_FOREACH(le, bucket, hash_next) { /* Compare addresses and ports. */ if (lro_address_compare(&po.data, &le->outer.data) == false || lro_address_compare(&pi.data, &le->inner.data) == false) continue; /* Check if no data and old ACK. */ if (tcp_data_len == 0 && SEQ_LT(ntohl(th->th_ack), ntohl(le->ack_seq))) { m_freem(m); return (0); } /* Mark "m" in the last spot. */ le->m_last_mbuf->m_nextpkt = m; /* Now set the tail to "m". */ le->m_last_mbuf = m; return (0); } /* Try to find an empty slot. */ if (LIST_EMPTY(&lc->lro_free)) return (TCP_LRO_NO_ENTRIES); /* Start a new segment chain. */ le = LIST_FIRST(&lc->lro_free); LIST_REMOVE(le, next); tcp_lro_active_insert(lc, bucket, le); /* Make sure the headers are set. */ le->inner = pi; le->outer = po; /* Store time this entry was allocated. */ le->alloc_time = lc->lro_last_queue_time; tcp_set_entry_to_mbuf(lc, le, m, th); /* Now set the tail to "m". */ le->m_last_mbuf = m; return (0); } int tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) { int error; /* get current time */ - lc->lro_last_queue_time = getsbinuptime(); + binuptime(&lc->lro_last_queue_time); CURVNET_SET(lc->ifp->if_vnet); error = tcp_lro_rx_common(lc, m, csum, true); CURVNET_RESTORE(); return (error); } void tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) { /* sanity checks */ if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || lc->lro_mbuf_max == 0)) { /* packet drop */ m_freem(mb); return; } /* check if packet is not LRO capable */ if (__predict_false(mb->m_pkthdr.csum_flags == 0 || (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { /* input packet to network layer */ (*lc->ifp->if_input) (lc->ifp, mb); return; } /* create sequence number */ lc->lro_mbuf_data[lc->lro_mbuf_count].seq = (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | (((uint64_t)mb->m_pkthdr.flowid) << 24) | ((uint64_t)lc->lro_mbuf_count); /* enter mbuf */ lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb; /* flush if array is full */ if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max)) tcp_lro_flush_all(lc); } /* end */ diff --git a/sys/netinet/tcp_lro.h b/sys/netinet/tcp_lro.h index d2220a626b81..5ff15e2dc97e 100644 --- a/sys/netinet/tcp_lro.h +++ b/sys/netinet/tcp_lro.h @@ -1,208 +1,208 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2016-2021 Mellanox Technologies. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _TCP_LRO_H_ #define _TCP_LRO_H_ #include #ifndef TCP_LRO_ENTRIES /* Define default number of LRO entries per RX queue */ #define TCP_LRO_ENTRIES 8 #endif /* * Flags for ACK entry for compression * the bottom 8 bits has the th_flags. * LRO itself adds only the TSTMP flags * to indicate if either of the types * of timestamps are filled and the * HAS_TSTMP option to indicate if the * TCP timestamp option is valid. * * The other 5 flag bits are for processing * by a stack. * */ #define TSTMP_LRO 0x0100 #define TSTMP_HDWR 0x0200 #define HAS_TSTMP 0x0400 struct inpcb; union lro_address { u_long raw[1]; struct { uint16_t lro_type; /* internal */ #define LRO_TYPE_NONE 0 #define LRO_TYPE_IPV4_TCP 1 #define LRO_TYPE_IPV6_TCP 2 #define LRO_TYPE_IPV4_UDP 3 #define LRO_TYPE_IPV6_UDP 4 uint16_t vlan_id; /* VLAN identifier */ uint16_t s_port; /* source TCP/UDP port */ uint16_t d_port; /* destination TCP/UDP port */ uint32_t vxlan_vni; /* VXLAN virtual network identifier */ union { #ifdef INET struct in_addr v4; #endif #ifdef INET6 struct in6_addr v6; #endif } s_addr; /* source IPv4/IPv6 address */ union { #ifdef INET struct in_addr v4; #endif #ifdef INET6 struct in6_addr v6; #endif } d_addr; /* destination IPv4/IPv6 address */ }; } __aligned(sizeof(u_long)); #define LRO_RAW_ADDRESS_MAX \ (sizeof(union lro_address) / sizeof(u_long)) /* Optimize address comparison by comparing one unsigned long at a time: */ static inline bool lro_address_compare(const union lro_address *pa, const union lro_address *pb) { if (pa->lro_type == LRO_TYPE_NONE && pb->lro_type == LRO_TYPE_NONE) { return (true); } else for (unsigned i = 0; i < LRO_RAW_ADDRESS_MAX; i++) { if (pa->raw[i] != pb->raw[i]) return (false); } return (true); } struct lro_parser { union lro_address data; union { uint8_t *l3; struct ip *ip4; struct ip6_hdr *ip6; }; union { uint8_t *l4; struct tcphdr *tcp; struct udphdr *udp; }; uint16_t total_hdr_len; }; /* This structure is zeroed frequently, try to keep it small. */ struct lro_entry { LIST_ENTRY(lro_entry) next; LIST_ENTRY(lro_entry) hash_next; struct mbuf *m_head; struct mbuf *m_tail; struct mbuf *m_last_mbuf; struct lro_parser outer; struct lro_parser inner; uint32_t next_seq; /* tcp_seq */ uint32_t ack_seq; /* tcp_seq */ uint32_t tsval; uint32_t tsecr; uint16_t compressed; uint16_t uncompressed; uint16_t window; uint16_t timestamp; /* flag, not a TCP hdr field. */ - sbintime_t alloc_time; /* time when entry was allocated */ + struct bintime alloc_time; /* time when entry was allocated */ }; LIST_HEAD(lro_head, lro_entry); struct lro_mbuf_sort { uint64_t seq; struct mbuf *mb; }; /* NB: This is part of driver structs. 
*/ struct lro_ctrl { struct ifnet *ifp; struct lro_mbuf_sort *lro_mbuf_data; - sbintime_t lro_last_queue_time; /* last time data was queued */ + struct bintime lro_last_queue_time; /* last time data was queued */ uint64_t lro_queued; uint64_t lro_flushed; uint64_t lro_bad_csum; unsigned lro_cnt; unsigned lro_mbuf_count; unsigned lro_mbuf_max; unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */ unsigned lro_length_lim; /* max len of aggregated data */ u_long lro_hashsz; struct lro_head *lro_hash; struct lro_head lro_active; struct lro_head lro_free; }; struct tcp_ackent { uint64_t timestamp; /* hardware or sofware timestamp, valid if TSTMP_LRO or TSTMP_HDRW set */ uint32_t seq; /* th_seq value */ uint32_t ack; /* th_ack value */ uint32_t ts_value; /* If ts option value, valid if HAS_TSTMP is set */ uint32_t ts_echo; /* If ts option echo, valid if HAS_TSTMP is set */ uint16_t win; /* TCP window */ uint16_t flags; /* Flags to say if TS is present and type of timestamp and th_flags */ uint8_t codepoint; /* IP level codepoint including ECN bits */ uint8_t ack_val_set; /* Classification of ack used by the stack */ uint8_t pad[2]; /* To 32 byte boundary */ }; /* We use two M_PROTO on the mbuf */ #define M_ACKCMP M_PROTO4 /* Indicates LRO is sending in a Ack-compression mbuf */ #define M_LRO_EHDRSTRP M_PROTO6 /* Indicates that LRO has stripped the etherenet header */ #define TCP_LRO_LENGTH_MAX (65535 - 255) /* safe value with room for outer headers */ #define TCP_LRO_ACKCNT_MAX 65535 /* unlimited */ int tcp_lro_init(struct lro_ctrl *); int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned); void tcp_lro_free(struct lro_ctrl *); void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *); void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); void tcp_lro_flush_all(struct lro_ctrl *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *); void tcp_lro_reg_mbufq(void); void tcp_lro_dereg_mbufq(void); #define TCP_LRO_NO_ENTRIES -2 #define TCP_LRO_CANNOT -1 #define TCP_LRO_NOT_SUPPORTED 1 #endif /* _TCP_LRO_H_ */ diff --git a/sys/sys/time.h b/sys/sys/time.h index 9bffca204d56..a48aa3fe5548 100644 --- a/sys/sys/time.h +++ b/sys/sys/time.h @@ -1,631 +1,642 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)time.h 8.5 (Berkeley) 5/4/95 * $FreeBSD$ */ #ifndef _SYS_TIME_H_ #define _SYS_TIME_H_ #include #include #include struct timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ }; #define DST_NONE 0 /* not on dst */ #define DST_USA 1 /* USA style dst */ #define DST_AUST 2 /* Australian style dst */ #define DST_WET 3 /* Western European dst */ #define DST_MET 4 /* Middle European dst */ #define DST_EET 5 /* Eastern European dst */ #define DST_CAN 6 /* Canada */ #if __BSD_VISIBLE struct bintime { time_t sec; uint64_t frac; }; static __inline void bintime_addx(struct bintime *_bt, uint64_t _x) { uint64_t _u; _u = _bt->frac; _bt->frac += _x; if (_u > _bt->frac) _bt->sec++; } static __inline void bintime_add(struct bintime *_bt, const struct bintime *_bt2) { uint64_t _u; _u = _bt->frac; _bt->frac += _bt2->frac; if (_u > _bt->frac) _bt->sec++; _bt->sec += _bt2->sec; } static __inline void bintime_sub(struct bintime *_bt, const struct bintime *_bt2) { uint64_t _u; _u = _bt->frac; _bt->frac -= _bt2->frac; if (_u < _bt->frac) _bt->sec--; _bt->sec -= _bt2->sec; } static __inline void bintime_mul(struct bintime *_bt, u_int _x) { uint64_t _p1, _p2; _p1 = (_bt->frac & 0xffffffffull) * _x; _p2 = (_bt->frac >> 32) * _x + (_p1 >> 32); _bt->sec *= _x; _bt->sec += (_p2 >> 32); _bt->frac = (_p2 << 32) | (_p1 & 0xffffffffull); } static __inline void bintime_shift(struct bintime *_bt, int _exp) { if (_exp > 0) { _bt->sec <<= _exp; _bt->sec |= _bt->frac >> (64 - _exp); _bt->frac <<= _exp; } else if (_exp < 0) { _bt->frac >>= -_exp; _bt->frac |= (uint64_t)_bt->sec << (64 + _exp); _bt->sec >>= -_exp; } } #define bintime_clear(a) ((a)->sec = (a)->frac = 0) #define bintime_isset(a) ((a)->sec || (a)->frac) #define bintime_cmp(a, b, cmp) \ (((a)->sec == (b)->sec) ? \ ((a)->frac cmp (b)->frac) : \ ((a)->sec cmp (b)->sec)) #define SBT_1S ((sbintime_t)1 << 32) #define SBT_1M (SBT_1S * 60) #define SBT_1MS (SBT_1S / 1000) #define SBT_1US (SBT_1S / 1000000) #define SBT_1NS (SBT_1S / 1000000000) /* beware rounding, see nstosbt() */ #define SBT_MAX 0x7fffffffffffffffLL static __inline int sbintime_getsec(sbintime_t _sbt) { return (_sbt >> 32); } static __inline sbintime_t bttosbt(const struct bintime _bt) { return (((sbintime_t)_bt.sec << 32) + (_bt.frac >> 32)); } static __inline struct bintime sbttobt(sbintime_t _sbt) { struct bintime _bt; _bt.sec = _sbt >> 32; _bt.frac = _sbt << 32; return (_bt); } /* * Decimal<->sbt conversions. Multiplying or dividing by SBT_1NS results in * large roundoff errors which sbttons() and nstosbt() avoid. Millisecond and * microsecond functions are also provided for completeness. * * These functions return the smallest sbt larger or equal to the * number of seconds requested so that sbttoX(Xtosbt(y)) == y. Unlike * top of second computations below, which require that we tick at the * top of second, these need to be rounded up so we do whatever for at * least as long as requested. 
* * The naive computation we'd do is this * ((unit * 2^64 / SIFACTOR) + 2^32-1) >> 32 * However, that overflows. Instead, we compute * ((unit * 2^63 / SIFACTOR) + 2^31-1) >> 32 * and use pre-computed constants that are the ceil of the 2^63 / SIFACTOR * term to ensure we are using exactly the right constant. We use the lesser * evil of ull rather than a uint64_t cast to ensure we have well defined * right shift semantics. With these changes, we get all the ns, us and ms * conversions back and forth right. * Note: This file is used for both kernel and userland includes, so we can't * rely on KASSERT being defined, nor can we pollute the namespace by including * assert.h. */ static __inline int64_t sbttons(sbintime_t _sbt) { uint64_t ns; #ifdef KASSERT KASSERT(_sbt >= 0, ("Negative values illegal for sbttons: %jx", _sbt)); #endif ns = _sbt; if (ns >= SBT_1S) ns = (ns >> 32) * 1000000000; else ns = 0; return (ns + (1000000000 * (_sbt & 0xffffffffu) >> 32)); } static __inline sbintime_t nstosbt(int64_t _ns) { sbintime_t sb = 0; #ifdef KASSERT KASSERT(_ns >= 0, ("Negative values illegal for nstosbt: %jd", _ns)); #endif if (_ns >= SBT_1S) { sb = (_ns / 1000000000) * SBT_1S; _ns = _ns % 1000000000; } /* 9223372037 = ceil(2^63 / 1000000000) */ sb += ((_ns * 9223372037ull) + 0x7fffffff) >> 31; return (sb); } static __inline int64_t sbttous(sbintime_t _sbt) { return ((1000000 * _sbt) >> 32); } static __inline sbintime_t ustosbt(int64_t _us) { sbintime_t sb = 0; #ifdef KASSERT KASSERT(_us >= 0, ("Negative values illegal for ustosbt: %jd", _us)); #endif if (_us >= SBT_1S) { sb = (_us / 1000000) * SBT_1S; _us = _us % 1000000; } /* 9223372036855 = ceil(2^63 / 1000000) */ sb += ((_us * 9223372036855ull) + 0x7fffffff) >> 31; return (sb); } static __inline int64_t sbttoms(sbintime_t _sbt) { return ((1000 * _sbt) >> 32); } static __inline sbintime_t mstosbt(int64_t _ms) { sbintime_t sb = 0; #ifdef KASSERT KASSERT(_ms >= 0, ("Negative values illegal for mstosbt: %jd", _ms)); #endif if (_ms >= SBT_1S) { sb = (_ms / 1000) * SBT_1S; _ms = _ms % 1000; } /* 9223372036854776 = ceil(2^63 / 1000) */ sb += ((_ms * 9223372036854776ull) + 0x7fffffff) >> 31; return (sb); } /*- * Background information: * * When converting between timestamps on parallel timescales of differing * resolutions it is historical and scientific practice to round down rather * than doing 4/5 rounding. * * The date changes at midnight, not at noon. * * Even at 15:59:59.999999999 it's not four'o'clock. 
* * time_second ticks after N.999999999 not after N.4999999999 */ static __inline void bintime2timespec(const struct bintime *_bt, struct timespec *_ts) { _ts->tv_sec = _bt->sec; _ts->tv_nsec = ((uint64_t)1000000000 * (uint32_t)(_bt->frac >> 32)) >> 32; } +static __inline uint64_t +bintime2ns(const struct bintime *_bt) +{ + uint64_t ret; + + ret = (uint64_t)(_bt->sec) * (uint64_t)1000000000; + ret += (((uint64_t)1000000000 * + (uint32_t)(_bt->frac >> 32)) >> 32); + return (ret); +} + static __inline void timespec2bintime(const struct timespec *_ts, struct bintime *_bt) { _bt->sec = _ts->tv_sec; /* 18446744073 = int(2^64 / 1000000000) */ _bt->frac = _ts->tv_nsec * (uint64_t)18446744073LL; } static __inline void bintime2timeval(const struct bintime *_bt, struct timeval *_tv) { _tv->tv_sec = _bt->sec; _tv->tv_usec = ((uint64_t)1000000 * (uint32_t)(_bt->frac >> 32)) >> 32; } static __inline void timeval2bintime(const struct timeval *_tv, struct bintime *_bt) { _bt->sec = _tv->tv_sec; /* 18446744073709 = int(2^64 / 1000000) */ _bt->frac = _tv->tv_usec * (uint64_t)18446744073709LL; } static __inline struct timespec sbttots(sbintime_t _sbt) { struct timespec _ts; _ts.tv_sec = _sbt >> 32; _ts.tv_nsec = sbttons((uint32_t)_sbt); return (_ts); } static __inline sbintime_t tstosbt(struct timespec _ts) { return (((sbintime_t)_ts.tv_sec << 32) + nstosbt(_ts.tv_nsec)); } static __inline struct timeval sbttotv(sbintime_t _sbt) { struct timeval _tv; _tv.tv_sec = _sbt >> 32; _tv.tv_usec = sbttous((uint32_t)_sbt); return (_tv); } static __inline sbintime_t tvtosbt(struct timeval _tv) { return (((sbintime_t)_tv.tv_sec << 32) + ustosbt(_tv.tv_usec)); } #endif /* __BSD_VISIBLE */ #ifdef _KERNEL /* * Simple macros to convert ticks to milliseconds * or microseconds and vice-versa. The answer * will always be at least 1. Note the return * value is a uint32_t however we step up the * operations to 64 bit to avoid any overflow/underflow * problems. */ #define TICKS_2_MSEC(t) max(1, (uint32_t)(hz == 1000) ? \ (t) : (((uint64_t)(t) * (uint64_t)1000)/(uint64_t)hz)) #define TICKS_2_USEC(t) max(1, (uint32_t)(hz == 1000) ? \ ((t) * 1000) : (((uint64_t)(t) * (uint64_t)1000000)/(uint64_t)hz)) #define MSEC_2_TICKS(m) max(1, (uint32_t)((hz == 1000) ? \ (m) : ((uint64_t)(m) * (uint64_t)hz)/(uint64_t)1000)) #define USEC_2_TICKS(u) max(1, (uint32_t)((hz == 1000) ? \ ((u) / 1000) : ((uint64_t)(u) * (uint64_t)hz)/(uint64_t)1000000)) #endif /* Operations on timespecs */ #define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0) #define timespecisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec) #define timespeccmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? \ ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) #define timespecadd(tsp, usp, vsp) \ do { \ (vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec; \ (vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec; \ if ((vsp)->tv_nsec >= 1000000000L) { \ (vsp)->tv_sec++; \ (vsp)->tv_nsec -= 1000000000L; \ } \ } while (0) #define timespecsub(tsp, usp, vsp) \ do { \ (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \ (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \ if ((vsp)->tv_nsec < 0) { \ (vsp)->tv_sec--; \ (vsp)->tv_nsec += 1000000000L; \ } \ } while (0) #ifdef _KERNEL /* Operations on timevals. */ #define timevalclear(tvp) ((tvp)->tv_sec = (tvp)->tv_usec = 0) #define timevalisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) #define timevalcmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? 
\ ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) /* timevaladd and timevalsub are not inlined */ #endif /* _KERNEL */ #ifndef _KERNEL /* NetBSD/OpenBSD compatible interfaces */ #define timerclear(tvp) ((tvp)->tv_sec = (tvp)->tv_usec = 0) #define timerisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) #define timercmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? \ ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) #define timeradd(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec + (uvp)->tv_usec; \ if ((vvp)->tv_usec >= 1000000) { \ (vvp)->tv_sec++; \ (vvp)->tv_usec -= 1000000; \ } \ } while (0) #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) #endif /* * Names of the interval timers, and structure * defining a timer setting. */ #define ITIMER_REAL 0 #define ITIMER_VIRTUAL 1 #define ITIMER_PROF 2 struct itimerval { struct timeval it_interval; /* timer interval */ struct timeval it_value; /* current value */ }; /* * Getkerninfo clock information structure */ struct clockinfo { int hz; /* clock frequency */ int tick; /* micro-seconds per hz tick */ int spare; int stathz; /* statistics clock frequency */ int profhz; /* profiling clock frequency */ }; /* These macros are also in time.h. */ #ifndef CLOCK_REALTIME #define CLOCK_REALTIME 0 #endif #ifndef CLOCK_VIRTUAL #define CLOCK_VIRTUAL 1 #define CLOCK_PROF 2 #endif #ifndef CLOCK_MONOTONIC #define CLOCK_MONOTONIC 4 #define CLOCK_UPTIME 5 /* FreeBSD-specific. */ #define CLOCK_UPTIME_PRECISE 7 /* FreeBSD-specific. */ #define CLOCK_UPTIME_FAST 8 /* FreeBSD-specific. */ #define CLOCK_REALTIME_PRECISE 9 /* FreeBSD-specific. */ #define CLOCK_REALTIME_FAST 10 /* FreeBSD-specific. */ #define CLOCK_MONOTONIC_PRECISE 11 /* FreeBSD-specific. */ #define CLOCK_MONOTONIC_FAST 12 /* FreeBSD-specific. */ #define CLOCK_SECOND 13 /* FreeBSD-specific. */ #define CLOCK_THREAD_CPUTIME_ID 14 #define CLOCK_PROCESS_CPUTIME_ID 15 #endif #ifndef TIMER_ABSTIME #define TIMER_RELTIME 0x0 /* relative timer */ #define TIMER_ABSTIME 0x1 /* absolute timer */ #endif #if __BSD_VISIBLE #define CPUCLOCK_WHICH_PID 0 #define CPUCLOCK_WHICH_TID 1 #endif #if defined(_KERNEL) || defined(_STANDALONE) /* * Kernel to clock driver interface. */ void inittodr(time_t base); void resettodr(void); extern volatile time_t time_second; extern volatile time_t time_uptime; extern struct bintime tc_tick_bt; extern sbintime_t tc_tick_sbt; extern struct bintime tick_bt; extern sbintime_t tick_sbt; extern int tc_precexp; extern int tc_timepercentage; extern struct bintime bt_timethreshold; extern struct bintime bt_tickthreshold; extern sbintime_t sbt_timethreshold; extern sbintime_t sbt_tickthreshold; extern volatile int rtc_generation; /* * Functions for looking at our clock: [get]{bin,nano,micro}[up]time() * * Functions without the "get" prefix returns the best timestamp * we can produce in the given format. * * "bin" == struct bintime == seconds + 64 bit fraction of seconds. * "nano" == struct timespec == seconds + nanoseconds. * "micro" == struct timeval == seconds + microseconds. * * Functions containing "up" returns time relative to boot and * should be used for calculating time intervals. * * Functions without "up" returns UTC time. 
* * Functions with the "get" prefix returns a less precise result * much faster than the functions without "get" prefix and should * be used where a precision of 1/hz seconds is acceptable or where * performance is priority. (NB: "precision", _not_ "resolution" !) */ void binuptime(struct bintime *bt); void nanouptime(struct timespec *tsp); void microuptime(struct timeval *tvp); static __inline sbintime_t sbinuptime(void) { struct bintime _bt; binuptime(&_bt); return (bttosbt(_bt)); } void bintime(struct bintime *bt); void nanotime(struct timespec *tsp); void microtime(struct timeval *tvp); void getbinuptime(struct bintime *bt); void getnanouptime(struct timespec *tsp); void getmicrouptime(struct timeval *tvp); static __inline sbintime_t getsbinuptime(void) { struct bintime _bt; getbinuptime(&_bt); return (bttosbt(_bt)); } void getbintime(struct bintime *bt); void getnanotime(struct timespec *tsp); void getmicrotime(struct timeval *tvp); void getboottime(struct timeval *boottime); void getboottimebin(struct bintime *boottimebin); /* Other functions */ int itimerdecr(struct itimerval *itp, int usec); int itimerfix(struct timeval *tv); int ppsratecheck(struct timeval *, int *, int); int ratecheck(struct timeval *, const struct timeval *); void timevaladd(struct timeval *t1, const struct timeval *t2); void timevalsub(struct timeval *t1, const struct timeval *t2); int tvtohz(struct timeval *tv); #define TC_DEFAULTPERC 5 #define BT2FREQ(bt) \ (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ ((bt)->frac >> 1)) #define SBT2FREQ(sbt) ((SBT_1S + ((sbt) >> 1)) / (sbt)) #define FREQ2BT(freq, bt) \ { \ (bt)->sec = 0; \ (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ } #define TIMESEL(sbt, sbt2) \ (((sbt2) >= sbt_timethreshold) ? \ ((*(sbt) = getsbinuptime()), 1) : ((*(sbt) = sbinuptime()), 0)) #else /* !_KERNEL && !_STANDALONE */ #include #include #include __BEGIN_DECLS int setitimer(int, const struct itimerval *, struct itimerval *); int utimes(const char *, const struct timeval *); #if __BSD_VISIBLE int adjtime(const struct timeval *, struct timeval *); int clock_getcpuclockid2(id_t, int, clockid_t *); int futimes(int, const struct timeval *); int futimesat(int, const char *, const struct timeval [2]); int lutimes(const char *, const struct timeval *); int settimeofday(const struct timeval *, const struct timezone *); #endif #if __XSI_VISIBLE int getitimer(int, struct itimerval *); int gettimeofday(struct timeval *, struct timezone *); #endif __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_TIME_H_ */
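The tcp_lro.h hunk above declares the driver-facing LRO API (tcp_lro_init(), tcp_lro_rx(), tcp_lro_flush_all(), tcp_lro_free()). A minimal sketch of the usual NIC-driver pattern follows; the mydrv_* names and softc layout are hypothetical and only illustrate how those declarations fit together.

/* Kernel driver context; mydrv_* names are illustrative only. */
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/tcp_lro.h>

struct mydrv_softc {
	struct ifnet *ifp;
	struct lro_ctrl lro;
};

/* Attach time: set up the LRO control block and bind it to the ifnet. */
static int
mydrv_lro_attach(struct mydrv_softc *sc)
{
	int error;

	error = tcp_lro_init(&sc->lro);	/* tcp_lro_init_args() allows sizing */
	if (error != 0)
		return (error);
	sc->lro.ifp = sc->ifp;
	return (0);
}

/* Per received frame: try LRO first, otherwise use the normal input path. */
static void
mydrv_rx_frame(struct mydrv_softc *sc, struct mbuf *m)
{
	if (tcp_lro_rx(&sc->lro, m, 0) != 0)
		(*sc->ifp->if_input)(sc->ifp, m);
}

/* End of an RX batch: push any aggregated segments up the stack. */
static void
mydrv_rx_done(struct mydrv_softc *sc)
{
	tcp_lro_flush_all(&sc->lro);
}

/* Detach time: release the LRO entries. */
static void
mydrv_lro_detach(struct mydrv_softc *sc)
{
	tcp_lro_free(&sc->lro);
}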
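The struct bintime helpers in the sys/time.h hunk treat frac as a 64-bit binary fraction of a second, so 0.5 s is 1ULL << 63 and bintime_add() carries a frac overflow into sec. A small userland demonstration (FreeBSD exposes these inlines under __BSD_VISIBLE, which is the default):

#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct bintime a = { .sec = 0, .frac = 3ULL << 62 };	/* 0.75 s */
	struct bintime b = { .sec = 0, .frac = 1ULL << 63 };	/* 0.50 s */
	sbintime_t sbt;

	bintime_add(&a, &b);	/* frac wraps around, carry goes into sec */
	printf("sec=%jd frac=%#jx\n", (intmax_t)a.sec, (uintmax_t)a.frac);
	/* Prints: sec=1 frac=0x4000000000000000, i.e. 1.25 s. */

	sbt = bttosbt(a);	/* 32.32 fixed point: 0x140000000 */
	printf("sbt=%#jx\n", (uintmax_t)sbt);
	return (0);
}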
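The decimal-to-sbintime_t converters document the guarantee sbttoX(Xtosbt(y)) == y; the code achieves it by multiplying with ceil(2^63 / SIFACTOR) (9223372037 for nanoseconds), adding 2^31 - 1 and shifting right by 31. A quick userland spot-check of that round trip for a few representative values:

#include <sys/time.h>
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const int64_t samples[] = { 0, 1, 999, 1000000, 999999999,
	    1000000000, 1234567891234LL };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		sbintime_t sbt = nstosbt(samples[i]);

		/* Rounded up on the way in, truncated on the way out. */
		assert(sbttons(sbt) == samples[i]);
	}
	printf("ns <-> sbt round trips OK (SBT_1S = %jd)\n", (intmax_t)SBT_1S);
	return (0);
}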
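The newly added bintime2ns() (the only functional change in the sys/time.h hunk, alongside the lro_ctrl field that moves from sbintime_t to struct bintime) converts a bintime to nanoseconds using only the upper 32 bits of frac, mirroring bintime2timespec(), so the result truncates toward zero. Once the updated header is installed, a userland check looks like:

#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct bintime bt = { .sec = 1, .frac = 1ULL << 63 };	/* 1.5 s */

	/* 1 * 10^9 + ((10^9 * 0x80000000) >> 32) = 1500000000 */
	printf("%ju ns\n", (uintmax_t)bintime2ns(&bt));
	return (0);
}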
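The timeradd()/timersub()/timercmp() macros in the non-kernel section are the classic BSD userland helpers for struct timeval arithmetic; a minimal elapsed-time measurement using the gettimeofday() prototype declared at the end of the header:

#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct timeval start, end, elapsed;
	struct timeval limit = { .tv_sec = 2, .tv_usec = 0 };

	gettimeofday(&start, NULL);
	sleep(1);				/* the work being timed */
	gettimeofday(&end, NULL);

	timersub(&end, &start, &elapsed);	/* elapsed = end - start */
	if (timercmp(&elapsed, &limit, <))
		printf("done in %jd.%06ld s\n", (intmax_t)elapsed.tv_sec,
		    (long)elapsed.tv_usec);
	return (0);
}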
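Finally, a sketch of the kernel-side pattern that binuptime(), bintime_sub() and bintime2ns() enable together, which is presumably what the tcp_lro.c change uses its struct bintime lro_last_queue_time for. This is illustrative only and not copied from tcp_lro.c.

#include <sys/param.h>
#include <sys/time.h>

/*
 * Kernel context.  Returns the number of nanoseconds elapsed since
 * *then, where *then was previously filled in by binuptime().
 */
static uint64_t
elapsed_ns(const struct bintime *then)
{
	struct bintime now;

	binuptime(&now);		/* monotonic, uptime based */
	bintime_sub(&now, then);	/* now -= *then */
	return (bintime2ns(&now));
}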