diff --git a/lib/libipsec/pfkey.c b/lib/libipsec/pfkey.c index d1aae7321c2e..08098775910f 100644 --- a/lib/libipsec/pfkey.c +++ b/lib/libipsec/pfkey.c @@ -1,2199 +1,2199 @@ /* $KAME: pfkey.c,v 1.46 2003/08/26 03:37:06 itojun Exp $ */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, 1998, and 1999 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "ipsec_strerror.h" #include "libpfkey.h" #define CALLOC(size, cast) (cast)calloc(1, (size)) static int findsupportedmap(int); static int setsupportedmap(struct sadb_supported *); static struct sadb_alg *findsupportedalg(u_int, u_int); static int pfkey_send_x1(int, u_int, u_int, u_int, struct sockaddr *, struct sockaddr *, u_int32_t, u_int32_t, u_int, caddr_t, u_int, u_int, u_int, u_int, u_int, u_int32_t, u_int32_t, u_int32_t, u_int32_t, u_int32_t); static int pfkey_send_x2(int, u_int, u_int, u_int, struct sockaddr *, struct sockaddr *, u_int32_t); static int pfkey_send_x3(int, u_int, u_int); static int pfkey_send_x4(int, u_int, struct sockaddr *, u_int, struct sockaddr *, u_int, u_int, u_int64_t, u_int64_t, char *, int, u_int32_t); static int pfkey_send_x5(int, u_int, u_int32_t); static caddr_t pfkey_setsadbmsg(caddr_t, caddr_t, u_int, u_int, u_int, u_int32_t, pid_t); static caddr_t pfkey_setsadbsa(caddr_t, caddr_t, u_int32_t, u_int, u_int, u_int, u_int32_t); static caddr_t pfkey_setsadbxreplay(caddr_t, caddr_t, uint32_t); static caddr_t pfkey_setsadbaddr(caddr_t, caddr_t, u_int, struct sockaddr *, u_int, u_int); static caddr_t pfkey_setsadbkey(caddr_t, caddr_t, u_int, caddr_t, u_int); static caddr_t pfkey_setsadblifetime(caddr_t, caddr_t, u_int, u_int32_t, u_int32_t, u_int32_t, u_int32_t); static caddr_t pfkey_setsadbxsa2(caddr_t, caddr_t, u_int32_t, u_int32_t); /* * make and search supported algorithm structure. 
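 * The ipsec_supported[] array caches one sadb_supported record per SA type
 * listed in supported_map[] (AH, ESP, IPCOMP and TCP-MD5).  Entries are
 * filled in from SADB_REGISTER replies by pfkey_set_supported() and are
 * consulted by findsupportedalg() when key lengths are validated.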
*/ static struct sadb_supported *ipsec_supported[] = { NULL, NULL, NULL, NULL }; static int supported_map[] = { SADB_SATYPE_AH, SADB_SATYPE_ESP, SADB_X_SATYPE_IPCOMP, SADB_X_SATYPE_TCPSIGNATURE }; static int findsupportedmap(satype) int satype; { int i; for (i = 0; i < sizeof(supported_map)/sizeof(supported_map[0]); i++) if (supported_map[i] == satype) return i; return -1; } static struct sadb_alg * findsupportedalg(satype, alg_id) u_int satype, alg_id; { int algno; int tlen; caddr_t p; /* validity check */ algno = findsupportedmap(satype); if (algno == -1) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return NULL; } if (ipsec_supported[algno] == NULL) { __ipsec_errcode = EIPSEC_DO_GET_SUPP_LIST; return NULL; } tlen = ipsec_supported[algno]->sadb_supported_len - sizeof(struct sadb_supported); p = (caddr_t)(ipsec_supported[algno] + 1); while (tlen > 0) { if (tlen < sizeof(struct sadb_alg)) { /* invalid format */ break; } if (((struct sadb_alg *)p)->sadb_alg_id == alg_id) return (struct sadb_alg *)p; tlen -= sizeof(struct sadb_alg); p += sizeof(struct sadb_alg); } __ipsec_errcode = EIPSEC_NOT_SUPPORTED; return NULL; } static int setsupportedmap(sup) struct sadb_supported *sup; { struct sadb_supported **ipsup; switch (sup->sadb_supported_exttype) { case SADB_EXT_SUPPORTED_AUTH: ipsup = &ipsec_supported[findsupportedmap(SADB_SATYPE_AH)]; break; case SADB_EXT_SUPPORTED_ENCRYPT: ipsup = &ipsec_supported[findsupportedmap(SADB_SATYPE_ESP)]; break; default: __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } if (*ipsup) free(*ipsup); *ipsup = malloc(sup->sadb_supported_len); if (!*ipsup) { __ipsec_set_strerror(strerror(errno)); return -1; } memcpy(*ipsup, sup, sup->sadb_supported_len); return 0; } /* * check key length against algorithm specified. * This function is called with SADB_EXT_SUPPORTED_{AUTH,ENCRYPT} as the * augument, and only calls to ipsec_check_keylen2(); * keylen is the unit of bit. * OUT: * -1: invalid. * 0: valid. */ int ipsec_check_keylen(supported, alg_id, keylen) u_int supported; u_int alg_id; u_int keylen; { int satype; /* validity check */ switch (supported) { case SADB_EXT_SUPPORTED_AUTH: satype = SADB_SATYPE_AH; break; case SADB_EXT_SUPPORTED_ENCRYPT: satype = SADB_SATYPE_ESP; break; default: __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } return ipsec_check_keylen2(satype, alg_id, keylen); } /* * check key length against algorithm specified. * satype is one of satype defined at pfkeyv2.h. * keylen is the unit of bit. * OUT: * -1: invalid. * 0: valid. */ int ipsec_check_keylen2(satype, alg_id, keylen) u_int satype; u_int alg_id; u_int keylen; { struct sadb_alg *alg; alg = findsupportedalg(satype, alg_id); if (!alg) return -1; if (keylen < alg->sadb_alg_minbits || keylen > alg->sadb_alg_maxbits) { __ipsec_errcode = EIPSEC_INVAL_KEYLEN; return -1; } __ipsec_errcode = EIPSEC_NO_ERROR; return 0; } /* * get max/min key length against algorithm specified. * satype is one of satype defined at pfkeyv2.h. * keylen is the unit of bit. * OUT: * -1: invalid. * 0: valid. 
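 * Illustrative use (editor's sketch, not part of this file): after a
 * register exchange, query the supported key sizes for 3DES-CBC:
 *
 *	struct sadb_alg alg;
 *
 *	if (ipsec_get_keylen(SADB_EXT_SUPPORTED_ENCRYPT,
 *	    SADB_EALG_3DESCBC, &alg) == 0)
 *		printf("%d-%d bits\n", alg.sadb_alg_minbits,
 *		    alg.sadb_alg_maxbits);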
*/ int ipsec_get_keylen(supported, alg_id, alg0) u_int supported, alg_id; struct sadb_alg *alg0; { struct sadb_alg *alg; u_int satype; /* validity check */ if (!alg0) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } switch (supported) { case SADB_EXT_SUPPORTED_AUTH: satype = SADB_SATYPE_AH; break; case SADB_EXT_SUPPORTED_ENCRYPT: satype = SADB_SATYPE_ESP; break; default: __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } alg = findsupportedalg(satype, alg_id); if (!alg) return -1; memcpy(alg0, alg, sizeof(*alg0)); __ipsec_errcode = EIPSEC_NO_ERROR; return 0; } /* * set the rate for SOFT lifetime against HARD one. * If rate is more than 100 or equal to zero, then set to 100. */ static u_int soft_lifetime_allocations_rate = PFKEY_SOFT_LIFETIME_RATE; static u_int soft_lifetime_bytes_rate = PFKEY_SOFT_LIFETIME_RATE; static u_int soft_lifetime_addtime_rate = PFKEY_SOFT_LIFETIME_RATE; static u_int soft_lifetime_usetime_rate = PFKEY_SOFT_LIFETIME_RATE; u_int pfkey_set_softrate(type, rate) u_int type, rate; { __ipsec_errcode = EIPSEC_NO_ERROR; if (rate > 100 || rate == 0) rate = 100; switch (type) { case SADB_X_LIFETIME_ALLOCATIONS: soft_lifetime_allocations_rate = rate; return 0; case SADB_X_LIFETIME_BYTES: soft_lifetime_bytes_rate = rate; return 0; case SADB_X_LIFETIME_ADDTIME: soft_lifetime_addtime_rate = rate; return 0; case SADB_X_LIFETIME_USETIME: soft_lifetime_usetime_rate = rate; return 0; } __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return 1; } /* * get current rate for SOFT lifetime against HARD one. * ATTENTION: ~0 is returned if invalid type was passed. */ u_int pfkey_get_softrate(type) u_int type; { switch (type) { case SADB_X_LIFETIME_ALLOCATIONS: return soft_lifetime_allocations_rate; case SADB_X_LIFETIME_BYTES: return soft_lifetime_bytes_rate; case SADB_X_LIFETIME_ADDTIME: return soft_lifetime_addtime_rate; case SADB_X_LIFETIME_USETIME: return soft_lifetime_usetime_rate; } return ~0; } /* * sending SADB_GETSPI message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_getspi(so, satype, mode, src, dst, min, max, reqid, seq) int so; u_int satype, mode; struct sockaddr *src, *dst; u_int32_t min, max, reqid, seq; { struct sadb_msg *newmsg; caddr_t ep; int len; int need_spirange = 0; caddr_t p; int plen; /* validity check */ if (src == NULL || dst == NULL) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } if (src->sa_family != dst->sa_family) { __ipsec_errcode = EIPSEC_FAMILY_MISMATCH; return -1; } if (min > max || (min > 0 && min <= 255)) { __ipsec_errcode = EIPSEC_INVAL_SPI; return -1; } switch (src->sa_family) { case AF_INET: plen = sizeof(struct in_addr) << 3; break; case AF_INET6: plen = sizeof(struct in6_addr) << 3; break; default: __ipsec_errcode = EIPSEC_INVAL_FAMILY; return -1; } /* create new sadb_msg to send. 
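 * The GETSPI request consists of the base header, an sadb_x_sa2 carrying
 * mode and reqid, the source and destination addresses, and an optional
 * sadb_spirange when the caller restricted the SPI range.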
*/ len = sizeof(struct sadb_msg) + sizeof(struct sadb_x_sa2) + sizeof(struct sadb_address) + PFKEY_ALIGN8(src->sa_len) + sizeof(struct sadb_address) + PFKEY_ALIGN8(dst->sa_len); if (min > 255 && max < ~0) { need_spirange++; len += sizeof(struct sadb_spirange); } if ((newmsg = CALLOC(len, struct sadb_msg *)) == NULL) { __ipsec_set_strerror(strerror(errno)); return -1; } ep = ((caddr_t)newmsg) + len; p = pfkey_setsadbmsg((caddr_t)newmsg, ep, SADB_GETSPI, len, satype, seq, getpid()); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbxsa2(p, ep, mode, reqid); if (!p) { free(newmsg); return -1; } /* set sadb_address for source */ p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_SRC, src, plen, IPSEC_ULPROTO_ANY); if (!p) { free(newmsg); return -1; } /* set sadb_address for destination */ p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_DST, dst, plen, IPSEC_ULPROTO_ANY); if (!p) { free(newmsg); return -1; } /* proccessing spi range */ if (need_spirange) { struct sadb_spirange spirange; if (p + sizeof(spirange) > ep) { free(newmsg); return -1; } memset(&spirange, 0, sizeof(spirange)); spirange.sadb_spirange_len = PFKEY_UNIT64(sizeof(spirange)); spirange.sadb_spirange_exttype = SADB_EXT_SPIRANGE; spirange.sadb_spirange_min = min; spirange.sadb_spirange_max = max; memcpy(p, &spirange, sizeof(spirange)); p += sizeof(spirange); } if (p != ep) { free(newmsg); return -1; } /* send message */ len = pfkey_send(so, newmsg, len); free(newmsg); if (len < 0) return -1; __ipsec_errcode = EIPSEC_NO_ERROR; return len; } /* * sending SADB_UPDATE message to the kernel. * The length of key material is a_keylen + e_keylen. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_update(so, satype, mode, src, dst, spi, reqid, wsize, keymat, e_type, e_keylen, a_type, a_keylen, flags, l_alloc, l_bytes, l_addtime, l_usetime, seq) int so; u_int satype, mode, wsize; struct sockaddr *src, *dst; u_int32_t spi, reqid; caddr_t keymat; u_int e_type, e_keylen, a_type, a_keylen, flags; u_int32_t l_alloc; u_int64_t l_bytes, l_addtime, l_usetime; u_int32_t seq; { int len; if ((len = pfkey_send_x1(so, SADB_UPDATE, satype, mode, src, dst, spi, reqid, wsize, keymat, e_type, e_keylen, a_type, a_keylen, flags, l_alloc, l_bytes, l_addtime, l_usetime, seq)) < 0) return -1; return len; } /* * sending SADB_ADD message to the kernel. * The length of key material is a_keylen + e_keylen. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_add(so, satype, mode, src, dst, spi, reqid, wsize, keymat, e_type, e_keylen, a_type, a_keylen, flags, l_alloc, l_bytes, l_addtime, l_usetime, seq) int so; u_int satype, mode, wsize; struct sockaddr *src, *dst; u_int32_t spi, reqid; caddr_t keymat; u_int e_type, e_keylen, a_type, a_keylen, flags; u_int32_t l_alloc; u_int64_t l_bytes, l_addtime, l_usetime; u_int32_t seq; { int len; if ((len = pfkey_send_x1(so, SADB_ADD, satype, mode, src, dst, spi, reqid, wsize, keymat, e_type, e_keylen, a_type, a_keylen, flags, l_alloc, l_bytes, l_addtime, l_usetime, seq)) < 0) return -1; return len; } /* * sending SADB_DELETE message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. 
*/ int pfkey_send_delete(so, satype, mode, src, dst, spi) int so; u_int satype, mode; struct sockaddr *src, *dst; u_int32_t spi; { int len; if ((len = pfkey_send_x2(so, SADB_DELETE, satype, mode, src, dst, spi)) < 0) return -1; return len; } /* * sending SADB_DELETE without spi to the kernel. This is * the "delete all" request (an extension also present in * Solaris). * * OUT: * positive: success and return length sent - * -1 : error occured, and set errno + * -1 : error occurred, and set errno */ int pfkey_send_delete_all(so, satype, mode, src, dst) int so; u_int satype, mode; struct sockaddr *src, *dst; { struct sadb_msg *newmsg; int len; caddr_t p; int plen; caddr_t ep; /* validity check */ if (src == NULL || dst == NULL) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } if (src->sa_family != dst->sa_family) { __ipsec_errcode = EIPSEC_FAMILY_MISMATCH; return -1; } switch (src->sa_family) { case AF_INET: plen = sizeof(struct in_addr) << 3; break; case AF_INET6: plen = sizeof(struct in6_addr) << 3; break; default: __ipsec_errcode = EIPSEC_INVAL_FAMILY; return -1; } /* create new sadb_msg to reply. */ len = sizeof(struct sadb_msg) + sizeof(struct sadb_address) + PFKEY_ALIGN8(src->sa_len) + sizeof(struct sadb_address) + PFKEY_ALIGN8(dst->sa_len); if ((newmsg = CALLOC(len, struct sadb_msg *)) == NULL) { __ipsec_set_strerror(strerror(errno)); return -1; } ep = ((caddr_t)newmsg) + len; p = pfkey_setsadbmsg((caddr_t)newmsg, ep, SADB_DELETE, len, satype, 0, getpid()); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_SRC, src, plen, IPSEC_ULPROTO_ANY); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_DST, dst, plen, IPSEC_ULPROTO_ANY); if (!p || p != ep) { free(newmsg); return -1; } /* send message */ len = pfkey_send(so, newmsg, len); free(newmsg); if (len < 0) return -1; __ipsec_errcode = EIPSEC_NO_ERROR; return len; } /* * sending SADB_GET message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_get(so, satype, mode, src, dst, spi) int so; u_int satype, mode; struct sockaddr *src, *dst; u_int32_t spi; { int len; if ((len = pfkey_send_x2(so, SADB_GET, satype, mode, src, dst, spi)) < 0) return -1; return len; } /* * sending SADB_REGISTER message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_register(so, satype) int so; u_int satype; { int len, algno; if (satype == SADB_SATYPE_UNSPEC) { for (algno = 0; algno < sizeof(supported_map)/sizeof(supported_map[0]); algno++) { if (ipsec_supported[algno]) { free(ipsec_supported[algno]); ipsec_supported[algno] = NULL; } } } else { algno = findsupportedmap(satype); if (algno == -1) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } if (ipsec_supported[algno]) { free(ipsec_supported[algno]); ipsec_supported[algno] = NULL; } } if ((len = pfkey_send_x3(so, SADB_REGISTER, satype)) < 0) return -1; return len; } /* * receiving SADB_REGISTER message from the kernel, and copy buffer for * sadb_supported returned into ipsec_supported. * OUT: * 0: success and return length sent. - * -1: error occured, and set errno. + * -1: error occurred, and set errno. 
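 * The usual sequence is pfkey_send_register() followed by this routine;
 * the reply fills ipsec_supported[] so that ipsec_check_keylen() and
 * ipsec_get_keylen() can validate key lengths without further kernel
 * traffic.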
*/ int pfkey_recv_register(so) int so; { pid_t pid = getpid(); struct sadb_msg *newmsg; int error = -1; /* receive message */ for (;;) { if ((newmsg = pfkey_recv(so)) == NULL) return -1; if (newmsg->sadb_msg_type == SADB_REGISTER && newmsg->sadb_msg_pid == pid) break; free(newmsg); } /* check and fix */ newmsg->sadb_msg_len = PFKEY_UNUNIT64(newmsg->sadb_msg_len); error = pfkey_set_supported(newmsg, newmsg->sadb_msg_len); free(newmsg); if (error == 0) __ipsec_errcode = EIPSEC_NO_ERROR; return error; } /* * receiving SADB_REGISTER message from the kernel, and copy buffer for * sadb_supported returned into ipsec_supported. * NOTE: sadb_msg_len must be host order. * IN: * tlen: msg length, it's to makeing sure. * OUT: * 0: success and return length sent. - * -1: error occured, and set errno. + * -1: error occurred, and set errno. */ int pfkey_set_supported(msg, tlen) struct sadb_msg *msg; int tlen; { struct sadb_supported *sup; caddr_t p; caddr_t ep; /* validity */ if (msg->sadb_msg_len != tlen) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } p = (caddr_t)msg; ep = p + tlen; p += sizeof(struct sadb_msg); while (p < ep) { sup = (struct sadb_supported *)p; if (ep < p + sizeof(*sup) || PFKEY_EXTLEN(sup) < sizeof(*sup) || ep < p + sup->sadb_supported_len) { /* invalid format */ break; } switch (sup->sadb_supported_exttype) { case SADB_EXT_SUPPORTED_AUTH: case SADB_EXT_SUPPORTED_ENCRYPT: break; default: __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } /* fixed length */ sup->sadb_supported_len = PFKEY_EXTLEN(sup); /* set supported map */ if (setsupportedmap(sup) != 0) return -1; p += sup->sadb_supported_len; } if (p != ep) { __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } __ipsec_errcode = EIPSEC_NO_ERROR; return 0; } /* * sending SADB_FLUSH message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_flush(so, satype) int so; u_int satype; { int len; if ((len = pfkey_send_x3(so, SADB_FLUSH, satype)) < 0) return -1; return len; } /* * sending SADB_DUMP message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_dump(so, satype) int so; u_int satype; { int len; if ((len = pfkey_send_x3(so, SADB_DUMP, satype)) < 0) return -1; return len; } /* * sending SADB_X_PROMISC message to the kernel. * NOTE that this function handles promisc mode toggle only. * IN: * flag: set promisc off if zero, set promisc on if non-zero. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. - * 0 : error occured, and set errno. + * -1 : error occurred, and set errno. + * 0 : error occurred, and set errno. * others: a pointer to new allocated buffer in which supported * algorithms is. */ int pfkey_send_promisc_toggle(so, flag) int so; int flag; { int len; if ((len = pfkey_send_x3(so, SADB_X_PROMISC, (flag ? 1 : 0))) < 0) return -1; return len; } /* * sending SADB_X_SPDADD message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. 
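 * The policy argument is an sadb_x_policy blob, typically produced by
 * ipsec_set_policy(3); policylen is its length in bytes.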
*/ int pfkey_send_spdadd(so, src, prefs, dst, prefd, proto, policy, policylen, seq) int so; struct sockaddr *src, *dst; u_int prefs, prefd, proto; caddr_t policy; int policylen; u_int32_t seq; { int len; if ((len = pfkey_send_x4(so, SADB_X_SPDADD, src, prefs, dst, prefd, proto, 0, 0, policy, policylen, seq)) < 0) return -1; return len; } /* * sending SADB_X_SPDADD message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_spdadd2(so, src, prefs, dst, prefd, proto, ltime, vtime, policy, policylen, seq) int so; struct sockaddr *src, *dst; u_int prefs, prefd, proto; u_int64_t ltime, vtime; caddr_t policy; int policylen; u_int32_t seq; { int len; if ((len = pfkey_send_x4(so, SADB_X_SPDADD, src, prefs, dst, prefd, proto, ltime, vtime, policy, policylen, seq)) < 0) return -1; return len; } /* * sending SADB_X_SPDUPDATE message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_spdupdate(so, src, prefs, dst, prefd, proto, policy, policylen, seq) int so; struct sockaddr *src, *dst; u_int prefs, prefd, proto; caddr_t policy; int policylen; u_int32_t seq; { int len; if ((len = pfkey_send_x4(so, SADB_X_SPDUPDATE, src, prefs, dst, prefd, proto, 0, 0, policy, policylen, seq)) < 0) return -1; return len; } /* * sending SADB_X_SPDUPDATE message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_spdupdate2(so, src, prefs, dst, prefd, proto, ltime, vtime, policy, policylen, seq) int so; struct sockaddr *src, *dst; u_int prefs, prefd, proto; u_int64_t ltime, vtime; caddr_t policy; int policylen; u_int32_t seq; { int len; if ((len = pfkey_send_x4(so, SADB_X_SPDUPDATE, src, prefs, dst, prefd, proto, ltime, vtime, policy, policylen, seq)) < 0) return -1; return len; } /* * sending SADB_X_SPDDELETE message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_spddelete(so, src, prefs, dst, prefd, proto, policy, policylen, seq) int so; struct sockaddr *src, *dst; u_int prefs, prefd, proto; caddr_t policy; int policylen; u_int32_t seq; { int len; if (policylen != sizeof(struct sadb_x_policy)) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } if ((len = pfkey_send_x4(so, SADB_X_SPDDELETE, src, prefs, dst, prefd, proto, 0, 0, policy, policylen, seq)) < 0) return -1; return len; } /* * sending SADB_X_SPDDELETE message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_spddelete2(so, spid) int so; u_int32_t spid; { int len; if ((len = pfkey_send_x5(so, SADB_X_SPDDELETE2, spid)) < 0) return -1; return len; } /* * sending SADB_X_SPDGET message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_spdget(so, spid) int so; u_int32_t spid; { int len; if ((len = pfkey_send_x5(so, SADB_X_SPDGET, spid)) < 0) return -1; return len; } /* * sending SADB_X_SPDSETIDX message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. 
*/ int pfkey_send_spdsetidx(so, src, prefs, dst, prefd, proto, policy, policylen, seq) int so; struct sockaddr *src, *dst; u_int prefs, prefd, proto; caddr_t policy; int policylen; u_int32_t seq; { int len; if (policylen != sizeof(struct sadb_x_policy)) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } if ((len = pfkey_send_x4(so, SADB_X_SPDSETIDX, src, prefs, dst, prefd, proto, 0, 0, policy, policylen, seq)) < 0) return -1; return len; } /* * sending SADB_SPDFLUSH message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_spdflush(so) int so; { int len; if ((len = pfkey_send_x3(so, SADB_X_SPDFLUSH, SADB_SATYPE_UNSPEC)) < 0) return -1; return len; } /* * sending SADB_SPDDUMP message to the kernel. * OUT: * positive: success and return length sent. - * -1 : error occured, and set errno. + * -1 : error occurred, and set errno. */ int pfkey_send_spddump(so) int so; { int len; if ((len = pfkey_send_x3(so, SADB_X_SPDDUMP, SADB_SATYPE_UNSPEC)) < 0) return -1; return len; } /* sending SADB_ADD or SADB_UPDATE message to the kernel */ static int pfkey_send_x1(so, type, satype, mode, src, dst, spi, reqid, wsize, keymat, e_type, e_keylen, a_type, a_keylen, flags, l_alloc, l_bytes, l_addtime, l_usetime, seq) int so; u_int type, satype, mode; struct sockaddr *src, *dst; u_int32_t spi, reqid; u_int wsize; caddr_t keymat; u_int e_type, e_keylen, a_type, a_keylen, flags; u_int32_t l_alloc, l_bytes, l_addtime, l_usetime, seq; { struct sadb_msg *newmsg; int len; caddr_t p; int plen; caddr_t ep; /* validity check */ if (src == NULL || dst == NULL) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } if (src->sa_family != dst->sa_family) { __ipsec_errcode = EIPSEC_FAMILY_MISMATCH; return -1; } switch (src->sa_family) { case AF_INET: plen = sizeof(struct in_addr) << 3; break; case AF_INET6: plen = sizeof(struct in6_addr) << 3; break; default: __ipsec_errcode = EIPSEC_INVAL_FAMILY; return -1; } switch (satype) { case SADB_SATYPE_ESP: if (e_type == SADB_EALG_NONE) { __ipsec_errcode = EIPSEC_NO_ALGS; return -1; } break; case SADB_SATYPE_AH: if (e_type != SADB_EALG_NONE) { __ipsec_errcode = EIPSEC_INVAL_ALGS; return -1; } if (a_type == SADB_AALG_NONE) { __ipsec_errcode = EIPSEC_NO_ALGS; return -1; } break; case SADB_X_SATYPE_IPCOMP: if (e_type == SADB_X_CALG_NONE) { __ipsec_errcode = EIPSEC_INVAL_ALGS; return -1; } if (a_type != SADB_AALG_NONE) { __ipsec_errcode = EIPSEC_NO_ALGS; return -1; } break; case SADB_X_SATYPE_TCPSIGNATURE: if (e_type != SADB_EALG_NONE) { __ipsec_errcode = EIPSEC_INVAL_ALGS; return -1; } if (a_type != SADB_X_AALG_TCP_MD5) { __ipsec_errcode = EIPSEC_INVAL_ALGS; return -1; } break; default: __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } /* create new sadb_msg to reply. 
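 * An SADB_ADD/SADB_UPDATE request carries the base header, an sadb_sa,
 * an sadb_x_sa2, an sadb_x_sa_replay extension when the replay window
 * does not fit the 8-bit sadb_sa_replay field, the two addresses, the
 * encryption and/or authentication keys, and a hard and a soft lifetime.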
*/ len = sizeof(struct sadb_msg) + sizeof(struct sadb_sa) + sizeof(struct sadb_x_sa2) + sizeof(struct sadb_address) + PFKEY_ALIGN8(src->sa_len) + sizeof(struct sadb_address) + PFKEY_ALIGN8(dst->sa_len) + sizeof(struct sadb_lifetime) + sizeof(struct sadb_lifetime); if (wsize > UINT8_MAX) { if (wsize > (UINT32_MAX - 32) >> 3) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return (-1); } len += sizeof(struct sadb_x_sa_replay); } if (e_type != SADB_EALG_NONE) len += (sizeof(struct sadb_key) + PFKEY_ALIGN8(e_keylen)); if (a_type != SADB_AALG_NONE) len += (sizeof(struct sadb_key) + PFKEY_ALIGN8(a_keylen)); if ((newmsg = CALLOC(len, struct sadb_msg *)) == NULL) { __ipsec_set_strerror(strerror(errno)); return -1; } ep = ((caddr_t)newmsg) + len; p = pfkey_setsadbmsg((caddr_t)newmsg, ep, type, len, satype, seq, getpid()); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbsa(p, ep, spi, wsize, a_type, e_type, flags); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbxsa2(p, ep, mode, reqid); if (!p) { free(newmsg); return -1; } if (wsize > UINT8_MAX) { p = pfkey_setsadbxreplay(p, ep, wsize); if (!p) { free(newmsg); return (-1); } } p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_SRC, src, plen, IPSEC_ULPROTO_ANY); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_DST, dst, plen, IPSEC_ULPROTO_ANY); if (!p) { free(newmsg); return -1; } if (e_type != SADB_EALG_NONE) { p = pfkey_setsadbkey(p, ep, SADB_EXT_KEY_ENCRYPT, keymat, e_keylen); if (!p) { free(newmsg); return -1; } } if (a_type != SADB_AALG_NONE) { p = pfkey_setsadbkey(p, ep, SADB_EXT_KEY_AUTH, keymat + e_keylen, a_keylen); if (!p) { free(newmsg); return -1; } } /* set sadb_lifetime for destination */ p = pfkey_setsadblifetime(p, ep, SADB_EXT_LIFETIME_HARD, l_alloc, l_bytes, l_addtime, l_usetime); if (!p) { free(newmsg); return -1; } p = pfkey_setsadblifetime(p, ep, SADB_EXT_LIFETIME_SOFT, l_alloc, l_bytes, l_addtime, l_usetime); if (!p || p != ep) { free(newmsg); return -1; } /* send message */ len = pfkey_send(so, newmsg, len); free(newmsg); if (len < 0) return -1; __ipsec_errcode = EIPSEC_NO_ERROR; return len; } /* sending SADB_DELETE or SADB_GET message to the kernel */ static int pfkey_send_x2(so, type, satype, mode, src, dst, spi) int so; u_int type, satype, mode; struct sockaddr *src, *dst; u_int32_t spi; { struct sadb_msg *newmsg; int len; caddr_t p; int plen; caddr_t ep; /* validity check */ if (src == NULL || dst == NULL) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } if (src->sa_family != dst->sa_family) { __ipsec_errcode = EIPSEC_FAMILY_MISMATCH; return -1; } switch (src->sa_family) { case AF_INET: plen = sizeof(struct in_addr) << 3; break; case AF_INET6: plen = sizeof(struct in6_addr) << 3; break; default: __ipsec_errcode = EIPSEC_INVAL_FAMILY; return -1; } /* create new sadb_msg to reply. 
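 * A DELETE/GET request only needs the base header, an sadb_sa holding
 * the SPI, and the source and destination addresses.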
*/ len = sizeof(struct sadb_msg) + sizeof(struct sadb_sa) + sizeof(struct sadb_address) + PFKEY_ALIGN8(src->sa_len) + sizeof(struct sadb_address) + PFKEY_ALIGN8(dst->sa_len); if ((newmsg = CALLOC(len, struct sadb_msg *)) == NULL) { __ipsec_set_strerror(strerror(errno)); return -1; } ep = ((caddr_t)newmsg) + len; p = pfkey_setsadbmsg((caddr_t)newmsg, ep, type, len, satype, 0, getpid()); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbsa(p, ep, spi, 0, 0, 0, 0); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_SRC, src, plen, IPSEC_ULPROTO_ANY); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_DST, dst, plen, IPSEC_ULPROTO_ANY); if (!p || p != ep) { free(newmsg); return -1; } /* send message */ len = pfkey_send(so, newmsg, len); free(newmsg); if (len < 0) return -1; __ipsec_errcode = EIPSEC_NO_ERROR; return len; } /* * sending SADB_REGISTER, SADB_FLUSH, SADB_DUMP or SADB_X_PROMISC message * to the kernel */ static int pfkey_send_x3(so, type, satype) int so; u_int type, satype; { struct sadb_msg *newmsg; int len; caddr_t p; caddr_t ep; /* validity check */ switch (type) { case SADB_X_PROMISC: if (satype != 0 && satype != 1) { __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } break; default: switch (satype) { case SADB_SATYPE_UNSPEC: case SADB_SATYPE_AH: case SADB_SATYPE_ESP: case SADB_X_SATYPE_IPCOMP: case SADB_X_SATYPE_TCPSIGNATURE: break; default: __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } } /* create new sadb_msg to send. */ len = sizeof(struct sadb_msg); if ((newmsg = CALLOC(len, struct sadb_msg *)) == NULL) { __ipsec_set_strerror(strerror(errno)); return -1; } ep = ((caddr_t)newmsg) + len; p = pfkey_setsadbmsg((caddr_t)newmsg, ep, type, len, satype, 0, getpid()); if (!p || p != ep) { free(newmsg); return -1; } /* send message */ len = pfkey_send(so, newmsg, len); free(newmsg); if (len < 0) return -1; __ipsec_errcode = EIPSEC_NO_ERROR; return len; } /* sending SADB_X_SPDADD message to the kernel */ static int pfkey_send_x4(so, type, src, prefs, dst, prefd, proto, ltime, vtime, policy, policylen, seq) int so; struct sockaddr *src, *dst; u_int type, prefs, prefd, proto; u_int64_t ltime, vtime; char *policy; int policylen; u_int32_t seq; { struct sadb_msg *newmsg; int len; caddr_t p; int plen; caddr_t ep; /* validity check */ if (src == NULL || dst == NULL) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } if (src->sa_family != dst->sa_family) { __ipsec_errcode = EIPSEC_FAMILY_MISMATCH; return -1; } switch (src->sa_family) { case AF_INET: plen = sizeof(struct in_addr) << 3; break; case AF_INET6: plen = sizeof(struct in6_addr) << 3; break; default: __ipsec_errcode = EIPSEC_INVAL_FAMILY; return -1; } if (prefs > plen || prefd > plen) { __ipsec_errcode = EIPSEC_INVAL_PREFIXLEN; return -1; } /* create new sadb_msg to reply. 
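 * An SPD request is the base header, the two addresses with their prefix
 * lengths and upper-layer protocol, a hard lifetime, and finally the
 * caller-supplied policy blob copied in verbatim.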
*/ len = sizeof(struct sadb_msg) + sizeof(struct sadb_address) + PFKEY_ALIGN8(src->sa_len) + sizeof(struct sadb_address) + PFKEY_ALIGN8(src->sa_len) + sizeof(struct sadb_lifetime) + policylen; if ((newmsg = CALLOC(len, struct sadb_msg *)) == NULL) { __ipsec_set_strerror(strerror(errno)); return -1; } ep = ((caddr_t)newmsg) + len; p = pfkey_setsadbmsg((caddr_t)newmsg, ep, type, len, SADB_SATYPE_UNSPEC, seq, getpid()); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_SRC, src, prefs, proto); if (!p) { free(newmsg); return -1; } p = pfkey_setsadbaddr(p, ep, SADB_EXT_ADDRESS_DST, dst, prefd, proto); if (!p) { free(newmsg); return -1; } p = pfkey_setsadblifetime(p, ep, SADB_EXT_LIFETIME_HARD, 0, 0, ltime, vtime); if (!p || p + policylen != ep) { free(newmsg); return -1; } memcpy(p, policy, policylen); /* send message */ len = pfkey_send(so, newmsg, len); free(newmsg); if (len < 0) return -1; __ipsec_errcode = EIPSEC_NO_ERROR; return len; } /* sending SADB_X_SPDGET or SADB_X_SPDDELETE message to the kernel */ static int pfkey_send_x5(so, type, spid) int so; u_int type; u_int32_t spid; { struct sadb_msg *newmsg; struct sadb_x_policy xpl; int len; caddr_t p; caddr_t ep; /* create new sadb_msg to reply. */ len = sizeof(struct sadb_msg) + sizeof(xpl); if ((newmsg = CALLOC(len, struct sadb_msg *)) == NULL) { __ipsec_set_strerror(strerror(errno)); return -1; } ep = ((caddr_t)newmsg) + len; p = pfkey_setsadbmsg((caddr_t)newmsg, ep, type, len, SADB_SATYPE_UNSPEC, 0, getpid()); if (!p) { free(newmsg); return -1; } if (p + sizeof(xpl) != ep) { free(newmsg); return -1; } memset(&xpl, 0, sizeof(xpl)); xpl.sadb_x_policy_len = PFKEY_UNIT64(sizeof(xpl)); xpl.sadb_x_policy_exttype = SADB_X_EXT_POLICY; xpl.sadb_x_policy_id = spid; memcpy(p, &xpl, sizeof(xpl)); /* send message */ len = pfkey_send(so, newmsg, len); free(newmsg); if (len < 0) return -1; __ipsec_errcode = EIPSEC_NO_ERROR; return len; } /* * open a socket. * OUT: * -1: fail. * others : success and return value of socket. */ int pfkey_open(void) { int so; int bufsiz_current, bufsiz_wanted; int ret; socklen_t len; if ((so = socket(PF_KEY, SOCK_RAW, PF_KEY_V2)) < 0) { __ipsec_set_strerror(strerror(errno)); return -1; } /* * This is a temporary workaround for KAME PR 154. * Don't really care even if it fails. */ /* Try to have 128k. If we have more, do not lower it. */ bufsiz_wanted = 128 * 1024; len = sizeof(bufsiz_current); ret = getsockopt(so, SOL_SOCKET, SO_SNDBUF, &bufsiz_current, &len); if ((ret < 0) || (bufsiz_current < bufsiz_wanted)) (void)setsockopt(so, SOL_SOCKET, SO_SNDBUF, &bufsiz_wanted, sizeof(bufsiz_wanted)); /* Try to have have at least 2MB. If we have more, do not lower it. */ bufsiz_wanted = 2 * 1024 * 1024; len = sizeof(bufsiz_current); ret = getsockopt(so, SOL_SOCKET, SO_RCVBUF, &bufsiz_current, &len); if (ret < 0) bufsiz_current = 128 * 1024; for (; bufsiz_wanted > bufsiz_current; bufsiz_wanted /= 2) { if (setsockopt(so, SOL_SOCKET, SO_RCVBUF, &bufsiz_wanted, sizeof(bufsiz_wanted)) == 0) break; } __ipsec_errcode = EIPSEC_NO_ERROR; return so; } /* * close a socket. * OUT: * 0: success. * -1: fail. */ void pfkey_close(so) int so; { (void)close(so); __ipsec_errcode = EIPSEC_NO_ERROR; return; } /* * receive sadb_msg data, and return pointer to new buffer allocated. * Must free this buffer later. * OUT: - * NULL : error occured. + * NULL : error occurred. * others : a pointer to sadb_msg structure. 
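 * Illustrative receive loop (editor's sketch; m, pid and seq are
 * placeholders): wait for the reply matching a request issued by this
 * process:
 *
 *	while ((m = pfkey_recv(so)) != NULL) {
 *		if (m->sadb_msg_pid == pid && m->sadb_msg_seq == seq)
 *			break;
 *		free(m);
 *	}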
* * XXX should be rewritten to pass length explicitly */ struct sadb_msg * pfkey_recv(so) int so; { struct sadb_msg buf, *newmsg; int len, reallen; while ((len = recv(so, (caddr_t)&buf, sizeof(buf), MSG_PEEK)) < 0) { if (errno == EINTR) continue; __ipsec_set_strerror(strerror(errno)); return NULL; } if (len < sizeof(buf)) { recv(so, (caddr_t)&buf, sizeof(buf), 0); __ipsec_errcode = EIPSEC_MAX; return NULL; } /* read real message */ reallen = PFKEY_UNUNIT64(buf.sadb_msg_len); if ((newmsg = CALLOC(reallen, struct sadb_msg *)) == NULL) { __ipsec_set_strerror(strerror(errno)); return NULL; } while ((len = recv(so, (caddr_t)newmsg, reallen, 0)) < 0) { if (errno == EINTR) continue; __ipsec_set_strerror(strerror(errno)); free(newmsg); return NULL; } if (len != reallen) { __ipsec_errcode = EIPSEC_SYSTEM_ERROR; free(newmsg); return NULL; } /* don't trust what the kernel says, validate! */ if (PFKEY_UNUNIT64(newmsg->sadb_msg_len) != len) { __ipsec_errcode = EIPSEC_SYSTEM_ERROR; free(newmsg); return NULL; } __ipsec_errcode = EIPSEC_NO_ERROR; return newmsg; } /* * send message to a socket. * OUT: * others: success and return length sent. * -1 : fail. */ int pfkey_send(so, msg, len) int so; struct sadb_msg *msg; int len; { if ((len = send(so, (caddr_t)msg, len, 0)) < 0) { __ipsec_set_strerror(strerror(errno)); return -1; } __ipsec_errcode = EIPSEC_NO_ERROR; return len; } /* * %%% Utilities * NOTE: These functions are derived from netkey/key.c in KAME. */ /* * set the pointer to each header in this message buffer. * IN: msg: pointer to message buffer. * mhp: pointer to the buffer initialized like below: * caddr_t mhp[SADB_EXT_MAX + 1]; * OUT: -1: invalid. * 0: valid. * * XXX should be rewritten to obtain length explicitly */ int pfkey_align(msg, mhp) struct sadb_msg *msg; caddr_t *mhp; { struct sadb_ext *ext; int i; caddr_t p; caddr_t ep; /* XXX should be passed from upper layer */ /* validity check */ if (msg == NULL || mhp == NULL) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } /* initialize */ for (i = 0; i < SADB_EXT_MAX + 1; i++) mhp[i] = NULL; mhp[0] = (caddr_t)msg; /* initialize */ p = (caddr_t) msg; ep = p + PFKEY_UNUNIT64(msg->sadb_msg_len); /* skip base header */ p += sizeof(struct sadb_msg); while (p < ep) { ext = (struct sadb_ext *)p; if (ep < p + sizeof(*ext) || PFKEY_EXTLEN(ext) < sizeof(*ext) || ep < p + PFKEY_EXTLEN(ext)) { /* invalid format */ break; } /* duplicate check */ /* XXX Are there duplication either KEY_AUTH or KEY_ENCRYPT ?*/ if (mhp[ext->sadb_ext_type] != NULL) { __ipsec_errcode = EIPSEC_INVAL_EXTTYPE; return -1; } /* set pointer */ switch (ext->sadb_ext_type) { case SADB_EXT_SA: case SADB_EXT_LIFETIME_CURRENT: case SADB_EXT_LIFETIME_HARD: case SADB_EXT_LIFETIME_SOFT: case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_EXT_KEY_AUTH: /* XXX should to be check weak keys. */ case SADB_EXT_KEY_ENCRYPT: /* XXX should to be check weak keys. 
*/ case SADB_EXT_IDENTITY_SRC: case SADB_EXT_IDENTITY_DST: case SADB_EXT_SENSITIVITY: case SADB_EXT_PROPOSAL: case SADB_EXT_SUPPORTED_AUTH: case SADB_EXT_SUPPORTED_ENCRYPT: case SADB_EXT_SPIRANGE: case SADB_X_EXT_POLICY: case SADB_X_EXT_SA2: case SADB_X_EXT_NAT_T_TYPE: case SADB_X_EXT_NAT_T_SPORT: case SADB_X_EXT_NAT_T_DPORT: case SADB_X_EXT_NAT_T_OAI: case SADB_X_EXT_NAT_T_OAR: case SADB_X_EXT_NAT_T_FRAG: case SADB_X_EXT_SA_REPLAY: case SADB_X_EXT_NEW_ADDRESS_SRC: case SADB_X_EXT_NEW_ADDRESS_DST: mhp[ext->sadb_ext_type] = (caddr_t)ext; break; default: __ipsec_errcode = EIPSEC_INVAL_EXTTYPE; return -1; } p += PFKEY_EXTLEN(ext); } if (p != ep) { __ipsec_errcode = EIPSEC_INVAL_SADBMSG; return -1; } __ipsec_errcode = EIPSEC_NO_ERROR; return 0; } /* * check basic usage for sadb_msg, * NOTE: This routine is derived from netkey/key.c in KAME. * IN: msg: pointer to message buffer. * mhp: pointer to the buffer initialized like below: * * caddr_t mhp[SADB_EXT_MAX + 1]; * * OUT: -1: invalid. * 0: valid. */ int pfkey_check(mhp) caddr_t *mhp; { struct sadb_msg *msg; /* validity check */ if (mhp == NULL || mhp[0] == NULL) { __ipsec_errcode = EIPSEC_INVAL_ARGUMENT; return -1; } msg = (struct sadb_msg *)mhp[0]; /* check version */ if (msg->sadb_msg_version != PF_KEY_V2) { __ipsec_errcode = EIPSEC_INVAL_VERSION; return -1; } /* check type */ if (msg->sadb_msg_type > SADB_MAX) { __ipsec_errcode = EIPSEC_INVAL_MSGTYPE; return -1; } /* check SA type */ switch (msg->sadb_msg_satype) { case SADB_SATYPE_UNSPEC: switch (msg->sadb_msg_type) { case SADB_GETSPI: case SADB_UPDATE: case SADB_ADD: case SADB_DELETE: case SADB_GET: case SADB_ACQUIRE: case SADB_EXPIRE: __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } break; case SADB_SATYPE_ESP: case SADB_SATYPE_AH: case SADB_X_SATYPE_IPCOMP: case SADB_X_SATYPE_TCPSIGNATURE: switch (msg->sadb_msg_type) { case SADB_X_SPDADD: case SADB_X_SPDDELETE: case SADB_X_SPDGET: case SADB_X_SPDDUMP: case SADB_X_SPDFLUSH: __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } break; case SADB_SATYPE_RSVP: case SADB_SATYPE_OSPFV2: case SADB_SATYPE_RIPV2: case SADB_SATYPE_MIP: __ipsec_errcode = EIPSEC_NOT_SUPPORTED; return -1; case 1: /* XXX: What does it do ? */ if (msg->sadb_msg_type == SADB_X_PROMISC) break; /*FALLTHROUGH*/ default: __ipsec_errcode = EIPSEC_INVAL_SATYPE; return -1; } /* check field of upper layer protocol and address family */ if (mhp[SADB_EXT_ADDRESS_SRC] != NULL && mhp[SADB_EXT_ADDRESS_DST] != NULL) { struct sadb_address *src0, *dst0; src0 = (struct sadb_address *)(mhp[SADB_EXT_ADDRESS_SRC]); dst0 = (struct sadb_address *)(mhp[SADB_EXT_ADDRESS_DST]); if (src0->sadb_address_proto != dst0->sadb_address_proto) { __ipsec_errcode = EIPSEC_PROTO_MISMATCH; return -1; } if (PFKEY_ADDR_SADDR(src0)->sa_family != PFKEY_ADDR_SADDR(dst0)->sa_family) { __ipsec_errcode = EIPSEC_FAMILY_MISMATCH; return -1; } switch (PFKEY_ADDR_SADDR(src0)->sa_family) { case AF_INET: case AF_INET6: break; default: __ipsec_errcode = EIPSEC_INVAL_FAMILY; return -1; } /* * prefixlen == 0 is valid because there must be the case * all addresses are matched. */ } __ipsec_errcode = EIPSEC_NO_ERROR; return 0; } /* * set data into sadb_msg. * `buf' must has been allocated sufficiently. 
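 * Like the other pfkey_setsadb*() helpers, this returns the next write
 * position on success and NULL when the extension would overflow `lim',
 * so callers can chain the calls and only check for NULL.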
*/ static caddr_t pfkey_setsadbmsg(buf, lim, type, tlen, satype, seq, pid) caddr_t buf; caddr_t lim; u_int type, satype; u_int tlen; u_int32_t seq; pid_t pid; { struct sadb_msg *p; u_int len; p = (struct sadb_msg *)buf; len = sizeof(struct sadb_msg); if (buf + len > lim) return NULL; memset(p, 0, len); p->sadb_msg_version = PF_KEY_V2; p->sadb_msg_type = type; p->sadb_msg_errno = 0; p->sadb_msg_satype = satype; p->sadb_msg_len = PFKEY_UNIT64(tlen); p->sadb_msg_reserved = 0; p->sadb_msg_seq = seq; p->sadb_msg_pid = (u_int32_t)pid; return(buf + len); } /* * copy secasvar data into sadb_address. * `buf' must has been allocated sufficiently. */ static caddr_t pfkey_setsadbsa(buf, lim, spi, wsize, auth, enc, flags) caddr_t buf; caddr_t lim; u_int32_t spi, flags; u_int wsize, auth, enc; { struct sadb_sa *p; u_int len; p = (struct sadb_sa *)buf; len = sizeof(struct sadb_sa); if (buf + len > lim) return NULL; memset(p, 0, len); p->sadb_sa_len = PFKEY_UNIT64(len); p->sadb_sa_exttype = SADB_EXT_SA; p->sadb_sa_spi = spi; p->sadb_sa_replay = wsize > UINT8_MAX ? UINT8_MAX: wsize; p->sadb_sa_state = SADB_SASTATE_LARVAL; p->sadb_sa_auth = auth; p->sadb_sa_encrypt = enc; p->sadb_sa_flags = flags; return(buf + len); } /* * Set data into sadb_x_sa_replay. * `buf' must has been allocated sufficiently. */ static caddr_t pfkey_setsadbxreplay(caddr_t buf, caddr_t lim, uint32_t wsize) { struct sadb_x_sa_replay *p; u_int len; p = (struct sadb_x_sa_replay *)buf; len = sizeof(struct sadb_x_sa_replay); if (buf + len > lim) return (NULL); memset(p, 0, len); p->sadb_x_sa_replay_len = PFKEY_UNIT64(len); p->sadb_x_sa_replay_exttype = SADB_X_EXT_SA_REPLAY; /* Convert wsize from bytes to number of packets. */ p->sadb_x_sa_replay_replay = wsize << 3; return (buf + len); } /* * set data into sadb_address. * `buf' must has been allocated sufficiently. * prefixlen is in bits. */ static caddr_t pfkey_setsadbaddr(buf, lim, exttype, saddr, prefixlen, ul_proto) caddr_t buf; caddr_t lim; u_int exttype; struct sockaddr *saddr; u_int prefixlen; u_int ul_proto; { struct sadb_address *p; u_int len; p = (struct sadb_address *)buf; len = sizeof(struct sadb_address) + PFKEY_ALIGN8(saddr->sa_len); if (buf + len > lim) return NULL; memset(p, 0, len); p->sadb_address_len = PFKEY_UNIT64(len); p->sadb_address_exttype = exttype & 0xffff; p->sadb_address_proto = ul_proto & 0xff; p->sadb_address_prefixlen = prefixlen; p->sadb_address_reserved = 0; memcpy(p + 1, saddr, saddr->sa_len); return(buf + len); } /* * set sadb_key structure after clearing buffer with zero. * OUT: the pointer of buf + len. */ static caddr_t pfkey_setsadbkey(buf, lim, type, key, keylen) caddr_t buf; caddr_t lim; caddr_t key; u_int type, keylen; { struct sadb_key *p; u_int len; p = (struct sadb_key *)buf; len = sizeof(struct sadb_key) + PFKEY_ALIGN8(keylen); if (buf + len > lim) return NULL; memset(p, 0, len); p->sadb_key_len = PFKEY_UNIT64(len); p->sadb_key_exttype = type; p->sadb_key_bits = keylen << 3; p->sadb_key_reserved = 0; memcpy(p + 1, key, keylen); return buf + len; } /* * set sadb_lifetime structure after clearing buffer with zero. * OUT: the pointer of buf + len. 
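 * For SADB_EXT_LIFETIME_SOFT each value is scaled by the matching
 * soft_lifetime_*_rate percentage set via pfkey_set_softrate(); e.g. a
 * rate of 80 turns a hard addtime of 3600 seconds into a soft addtime of
 * 2880 seconds.  SADB_EXT_LIFETIME_HARD values are copied unchanged.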
*/ static caddr_t pfkey_setsadblifetime(buf, lim, type, l_alloc, l_bytes, l_addtime, l_usetime) caddr_t buf; caddr_t lim; u_int type; u_int32_t l_alloc, l_bytes, l_addtime, l_usetime; { struct sadb_lifetime *p; u_int len; p = (struct sadb_lifetime *)buf; len = sizeof(struct sadb_lifetime); if (buf + len > lim) return NULL; memset(p, 0, len); p->sadb_lifetime_len = PFKEY_UNIT64(len); p->sadb_lifetime_exttype = type; switch (type) { case SADB_EXT_LIFETIME_SOFT: p->sadb_lifetime_allocations = (l_alloc * soft_lifetime_allocations_rate) /100; p->sadb_lifetime_bytes = (l_bytes * soft_lifetime_bytes_rate) /100; p->sadb_lifetime_addtime = (l_addtime * soft_lifetime_addtime_rate) /100; p->sadb_lifetime_usetime = (l_usetime * soft_lifetime_usetime_rate) /100; break; case SADB_EXT_LIFETIME_HARD: p->sadb_lifetime_allocations = l_alloc; p->sadb_lifetime_bytes = l_bytes; p->sadb_lifetime_addtime = l_addtime; p->sadb_lifetime_usetime = l_usetime; break; } return buf + len; } /* * copy secasvar data into sadb_address. * `buf' must has been allocated sufficiently. */ static caddr_t pfkey_setsadbxsa2(buf, lim, mode0, reqid) caddr_t buf; caddr_t lim; u_int32_t mode0; u_int32_t reqid; { struct sadb_x_sa2 *p; u_int8_t mode = mode0 & 0xff; u_int len; p = (struct sadb_x_sa2 *)buf; len = sizeof(struct sadb_x_sa2); if (buf + len > lim) return NULL; memset(p, 0, len); p->sadb_x_sa2_len = PFKEY_UNIT64(len); p->sadb_x_sa2_exttype = SADB_X_EXT_SA2; p->sadb_x_sa2_mode = mode; p->sadb_x_sa2_reqid = reqid; return(buf + len); } diff --git a/stand/efi/include/efifs.h b/stand/efi/include/efifs.h index 58febb66eb75..76aa93d6d07f 100644 --- a/stand/efi/include/efifs.h +++ b/stand/efi/include/efifs.h @@ -1,123 +1,123 @@ /* $FreeBSD$ */ #ifndef _EFI_FS_H #define _EFI_FS_H /*++ Copyright (c) 1999 - 2002 Intel Corporation. All rights reserved This software and associated documentation (if any) is furnished under a license and may only be used or copied in accordance with the terms of the license. Except as permitted by such license, no part of this software or documentation may be reproduced, stored in a retrieval system, or transmitted in any form or by any means without the express written consent of Intel Corporation. 
Module Name: efifs.h Abstract: EFI File System structures Revision History --*/ // -// EFI Partition header (normaly starts in LBA 1) +// EFI Partition header (normally starts in LBA 1) // #define EFI_PARTITION_SIGNATURE 0x5053595320494249 #define EFI_PARTITION_REVISION 0x00010001 #define MIN_EFI_PARTITION_BLOCK_SIZE 512 #define EFI_PARTITION_LBA 1 typedef struct _EFI_PARTITION_HEADER { EFI_TABLE_HEADER Hdr; UINT32 DirectoryAllocationNumber; UINT32 BlockSize; EFI_LBA FirstUsableLba; EFI_LBA LastUsableLba; EFI_LBA UnusableSpace; EFI_LBA FreeSpace; EFI_LBA RootFile; EFI_LBA SecutiryFile; } EFI_PARTITION_HEADER; // // File header // #define EFI_FILE_HEADER_SIGNATURE 0x454c494620494249 #define EFI_FILE_HEADER_REVISION 0x00010000 #define EFI_FILE_STRING_SIZE 260 typedef struct _EFI_FILE_HEADER { EFI_TABLE_HEADER Hdr; UINT32 Class; UINT32 LBALOffset; EFI_LBA Parent; UINT64 FileSize; UINT64 FileAttributes; EFI_TIME FileCreateTime; EFI_TIME FileModificationTime; EFI_GUID VendorGuid; CHAR16 FileString[EFI_FILE_STRING_SIZE]; } EFI_FILE_HEADER; // // Return the file's first LBAL which is in the same // logical block as the file header // #define EFI_FILE_LBAL(a) ((EFI_LBAL *) (((CHAR8 *) (a)) + (a)->LBALOffset)) #define EFI_FILE_CLASS_FREE_SPACE 1 #define EFI_FILE_CLASS_EMPTY 2 #define EFI_FILE_CLASS_NORMAL 3 // // Logical Block Address List - the fundemental block // description structure // #define EFI_LBAL_SIGNATURE 0x4c41424c20494249 #define EFI_LBAL_REVISION 0x00010000 typedef struct _EFI_LBAL { EFI_TABLE_HEADER Hdr; UINT32 Class; EFI_LBA Parent; EFI_LBA Next; UINT32 ArraySize; UINT32 ArrayCount; } EFI_LBAL; // Array size #define EFI_LBAL_ARRAY_SIZE(lbal,offs,blks) \ (((blks) - (offs) - (lbal)->Hdr.HeaderSize) / sizeof(EFI_RL)) // // Logical Block run-length // typedef struct { EFI_LBA Start; UINT64 Length; } EFI_RL; // // Return the run-length structure from an LBAL header // #define EFI_LBAL_RL(a) ((EFI_RL*) (((CHAR8 *) (a)) + (a)->Hdr.HeaderSize)) #endif diff --git a/sys/dev/ixl/ixl_pf_i2c.c b/sys/dev/ixl/ixl_pf_i2c.c index 9aea32bbe5ce..c69e2f8ac836 100644 --- a/sys/dev/ixl/ixl_pf_i2c.c +++ b/sys/dev/ixl/ixl_pf_i2c.c @@ -1,743 +1,743 @@ /****************************************************************************** Copyright (c) 2013-2018, Intel Corporation All rights reserved. - - Redistribution and use in source and binary forms, with or without + + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, + + 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - - 3. Neither the name of the Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from + + 3. Neither the name of the Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
- + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ /*$FreeBSD$*/ #include "ixl_pf.h" #define IXL_I2C_T_RISE 1 #define IXL_I2C_T_FALL 1 #define IXL_I2C_T_SU_DATA 1 #define IXL_I2C_T_SU_STA 5 #define IXL_I2C_T_SU_STO 4 #define IXL_I2C_T_HD_STA 4 #define IXL_I2C_T_LOW 5 #define IXL_I2C_T_HIGH 4 #define IXL_I2C_T_BUF 5 #define IXL_I2C_CLOCK_STRETCHING_TIMEOUT 500 #define IXL_I2C_REG(_hw) \ I40E_GLGEN_I2CPARAMS(_hw->func_caps.mdio_port_num) /* I2C bit-banging functions */ static s32 ixl_set_i2c_data(struct ixl_pf *pf, u32 *i2cctl, bool data); static bool ixl_get_i2c_data(struct ixl_pf *pf, u32 *i2cctl); static void ixl_raise_i2c_clk(struct ixl_pf *pf, u32 *i2cctl); static void ixl_lower_i2c_clk(struct ixl_pf *pf, u32 *i2cctl); static s32 ixl_clock_out_i2c_bit(struct ixl_pf *pf, bool data); static s32 ixl_get_i2c_ack(struct ixl_pf *pf); static s32 ixl_clock_out_i2c_byte(struct ixl_pf *pf, u8 data); static s32 ixl_clock_in_i2c_bit(struct ixl_pf *pf, bool *data); static s32 ixl_clock_in_i2c_byte(struct ixl_pf *pf, u8 *data); static void ixl_i2c_bus_clear(struct ixl_pf *pf); static void ixl_i2c_start(struct ixl_pf *pf); static void ixl_i2c_stop(struct ixl_pf *pf); static s32 ixl_wait_for_i2c_completion(struct i40e_hw *hw, u8 portnum); /** * ixl_i2c_bus_clear - Clears the I2C bus * @hw: pointer to hardware structure * * Clears the I2C bus by sending nine clock pulses. * Used when data line is stuck low. 
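 * Nine pulses are enough for a device that is holding SDA low to finish
 * shifting out the byte it was driving plus an ACK slot, after which the
 * final start/stop pair returns the bus to its idle state.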
**/ static void ixl_i2c_bus_clear(struct ixl_pf *pf) { struct i40e_hw *hw = &pf->hw; u32 i2cctl = rd32(hw, IXL_I2C_REG(hw)); u32 i; DEBUGFUNC("ixl_i2c_bus_clear"); ixl_i2c_start(pf); ixl_set_i2c_data(pf, &i2cctl, 1); for (i = 0; i < 9; i++) { ixl_raise_i2c_clk(pf, &i2cctl); /* Min high period of clock is 4us */ i40e_usec_delay(IXL_I2C_T_HIGH); ixl_lower_i2c_clk(pf, &i2cctl); /* Min low period of clock is 4.7us*/ i40e_usec_delay(IXL_I2C_T_LOW); } ixl_i2c_start(pf); /* Put the i2c bus back to default state */ ixl_i2c_stop(pf); } /** * ixl_i2c_stop - Sets I2C stop condition * @hw: pointer to hardware structure * * Sets I2C stop condition (Low -> High on SDA while SCL is High) **/ static void ixl_i2c_stop(struct ixl_pf *pf) { struct i40e_hw *hw = &pf->hw; u32 i2cctl = rd32(hw, IXL_I2C_REG(hw)); DEBUGFUNC("ixl_i2c_stop"); /* Stop condition must begin with data low and clock high */ ixl_set_i2c_data(pf, &i2cctl, 0); ixl_raise_i2c_clk(pf, &i2cctl); /* Setup time for stop condition (4us) */ i40e_usec_delay(IXL_I2C_T_SU_STO); ixl_set_i2c_data(pf, &i2cctl, 1); /* bus free time between stop and start (4.7us)*/ i40e_usec_delay(IXL_I2C_T_BUF); } /** * ixl_clock_in_i2c_byte - Clocks in one byte via I2C * @hw: pointer to hardware structure * @data: data byte to clock in * * Clocks in one byte data via I2C data/clock **/ static s32 ixl_clock_in_i2c_byte(struct ixl_pf *pf, u8 *data) { s32 i; bool bit = 0; DEBUGFUNC("ixl_clock_in_i2c_byte"); for (i = 7; i >= 0; i--) { ixl_clock_in_i2c_bit(pf, &bit); *data |= bit << i; } return I40E_SUCCESS; } /** * ixl_clock_in_i2c_bit - Clocks in one bit via I2C data/clock * @hw: pointer to hardware structure * @data: read data value * * Clocks in one bit via I2C data/clock **/ static s32 ixl_clock_in_i2c_bit(struct ixl_pf *pf, bool *data) { struct i40e_hw *hw = &pf->hw; u32 i2cctl = rd32(hw, IXL_I2C_REG(hw)); DEBUGFUNC("ixl_clock_in_i2c_bit"); ixl_raise_i2c_clk(pf, &i2cctl); /* Minimum high period of clock is 4us */ i40e_usec_delay(IXL_I2C_T_HIGH); i2cctl = rd32(hw, IXL_I2C_REG(hw)); i2cctl |= I40E_GLGEN_I2CPARAMS_DATA_OE_N_MASK; wr32(hw, IXL_I2C_REG(hw), i2cctl); ixl_flush(hw); i2cctl = rd32(hw, IXL_I2C_REG(hw)); *data = ixl_get_i2c_data(pf, &i2cctl); ixl_lower_i2c_clk(pf, &i2cctl); /* Minimum low period of clock is 4.7 us */ i40e_usec_delay(IXL_I2C_T_LOW); return I40E_SUCCESS; } /** * ixl_get_i2c_ack - Polls for I2C ACK * @hw: pointer to hardware structure * * Clocks in/out one bit via I2C data/clock **/ static s32 ixl_get_i2c_ack(struct ixl_pf *pf) { struct i40e_hw *hw = &pf->hw; s32 status = I40E_SUCCESS; u32 i = 0; u32 i2cctl = rd32(hw, IXL_I2C_REG(hw)); u32 timeout = 10; bool ack = 1; ixl_raise_i2c_clk(pf, &i2cctl); /* Minimum high period of clock is 4us */ i40e_usec_delay(IXL_I2C_T_HIGH); i2cctl = rd32(hw, IXL_I2C_REG(hw)); i2cctl |= I40E_GLGEN_I2CPARAMS_DATA_OE_N_MASK; wr32(hw, IXL_I2C_REG(hw), i2cctl); ixl_flush(hw); /* Poll for ACK. 
Note that ACK in I2C spec is * transition from 1 to 0 */ for (i = 0; i < timeout; i++) { i2cctl = rd32(hw, IXL_I2C_REG(hw)); ack = ixl_get_i2c_data(pf, &i2cctl); i40e_usec_delay(1); if (!ack) break; } if (ack) { ixl_dbg(pf, IXL_DBG_I2C, "I2C ack was not received.\n"); status = I40E_ERR_PHY; } ixl_lower_i2c_clk(pf, &i2cctl); /* Minimum low period of clock is 4.7 us */ i40e_usec_delay(IXL_I2C_T_LOW); return status; } /** * ixl_clock_out_i2c_bit - Clocks in/out one bit via I2C data/clock * @hw: pointer to hardware structure * @data: data value to write * * Clocks out one bit via I2C data/clock **/ static s32 ixl_clock_out_i2c_bit(struct ixl_pf *pf, bool data) { struct i40e_hw *hw = &pf->hw; s32 status; u32 i2cctl = rd32(hw, IXL_I2C_REG(hw)); status = ixl_set_i2c_data(pf, &i2cctl, data); if (status == I40E_SUCCESS) { ixl_raise_i2c_clk(pf, &i2cctl); /* Minimum high period of clock is 4us */ i40e_usec_delay(IXL_I2C_T_HIGH); ixl_lower_i2c_clk(pf, &i2cctl); /* Minimum low period of clock is 4.7 us. * This also takes care of the data hold time. */ i40e_usec_delay(IXL_I2C_T_LOW); } else { status = I40E_ERR_PHY; ixl_dbg(pf, IXL_DBG_I2C, "I2C data was not set to %#x\n", data); } return status; } /** * ixl_clock_out_i2c_byte - Clocks out one byte via I2C * @hw: pointer to hardware structure * @data: data byte clocked out * * Clocks out one byte data via I2C data/clock **/ static s32 ixl_clock_out_i2c_byte(struct ixl_pf *pf, u8 data) { struct i40e_hw *hw = &pf->hw; s32 status = I40E_SUCCESS; s32 i; u32 i2cctl; bool bit; DEBUGFUNC("ixl_clock_out_i2c_byte"); for (i = 7; i >= 0; i--) { bit = (data >> i) & 0x1; status = ixl_clock_out_i2c_bit(pf, bit); if (status != I40E_SUCCESS) break; } /* Release SDA line (set high) */ i2cctl = rd32(hw, IXL_I2C_REG(hw)); i2cctl |= I40E_GLGEN_I2CPARAMS_DATA_OUT_MASK; i2cctl &= ~(I40E_GLGEN_I2CPARAMS_DATA_OE_N_MASK); wr32(hw, IXL_I2C_REG(hw), i2cctl); ixl_flush(hw); return status; } /** * ixl_lower_i2c_clk - Lowers the I2C SCL clock * @hw: pointer to hardware structure * @i2cctl: Current value of I2CCTL register * * Lowers the I2C clock line '1'->'0' **/ static void ixl_lower_i2c_clk(struct ixl_pf *pf, u32 *i2cctl) { struct i40e_hw *hw = &pf->hw; *i2cctl &= ~(I40E_GLGEN_I2CPARAMS_CLK_MASK); *i2cctl &= ~(I40E_GLGEN_I2CPARAMS_CLK_OE_N_MASK); wr32(hw, IXL_I2C_REG(hw), *i2cctl); ixl_flush(hw); /* SCL fall time (300ns) */ i40e_usec_delay(IXL_I2C_T_FALL); } /** * ixl_raise_i2c_clk - Raises the I2C SCL clock * @hw: pointer to hardware structure * @i2cctl: Current value of I2CCTL register * * Raises the I2C clock line '0'->'1' **/ static void ixl_raise_i2c_clk(struct ixl_pf *pf, u32 *i2cctl) { struct i40e_hw *hw = &pf->hw; u32 i = 0; u32 timeout = IXL_I2C_CLOCK_STRETCHING_TIMEOUT; u32 i2cctl_r = 0; for (i = 0; i < timeout; i++) { *i2cctl |= I40E_GLGEN_I2CPARAMS_CLK_MASK; *i2cctl &= ~(I40E_GLGEN_I2CPARAMS_CLK_OE_N_MASK); wr32(hw, IXL_I2C_REG(hw), *i2cctl); ixl_flush(hw); /* SCL rise time (1000ns) */ i40e_usec_delay(IXL_I2C_T_RISE); i2cctl_r = rd32(hw, IXL_I2C_REG(hw)); if (i2cctl_r & I40E_GLGEN_I2CPARAMS_CLK_IN_MASK) break; } } /** * ixl_get_i2c_data - Reads the I2C SDA data bit * @hw: pointer to hardware structure * @i2cctl: Current value of I2CCTL register * * Returns the I2C data bit value **/ static bool ixl_get_i2c_data(struct ixl_pf *pf, u32 *i2cctl) { bool data; if (*i2cctl & I40E_GLGEN_I2CPARAMS_DATA_IN_MASK) data = 1; else data = 0; return data; } /** * ixl_set_i2c_data - Sets the I2C data bit * @hw: pointer to hardware structure * @i2cctl: Current value of I2CCTL register 
* @data: I2C data value (0 or 1) to set * * Sets the I2C data bit **/ static s32 ixl_set_i2c_data(struct ixl_pf *pf, u32 *i2cctl, bool data) { struct i40e_hw *hw = &pf->hw; s32 status = I40E_SUCCESS; DEBUGFUNC("ixl_set_i2c_data"); if (data) *i2cctl |= I40E_GLGEN_I2CPARAMS_DATA_OUT_MASK; else *i2cctl &= ~(I40E_GLGEN_I2CPARAMS_DATA_OUT_MASK); *i2cctl &= ~(I40E_GLGEN_I2CPARAMS_DATA_OE_N_MASK); wr32(hw, IXL_I2C_REG(hw), *i2cctl); ixl_flush(hw); /* Data rise/fall (1000ns/300ns) and set-up time (250ns) */ i40e_usec_delay(IXL_I2C_T_RISE + IXL_I2C_T_FALL + IXL_I2C_T_SU_DATA); /* Verify data was set correctly */ *i2cctl = rd32(hw, IXL_I2C_REG(hw)); if (data != ixl_get_i2c_data(pf, i2cctl)) { status = I40E_ERR_PHY; ixl_dbg(pf, IXL_DBG_I2C, "Error - I2C data was not set to %X.\n", data); } return status; } /** * ixl_i2c_start - Sets I2C start condition * Sets I2C start condition (High -> Low on SDA while SCL is High) **/ static void ixl_i2c_start(struct ixl_pf *pf) { struct i40e_hw *hw = &pf->hw; u32 i2cctl = rd32(hw, IXL_I2C_REG(hw)); DEBUGFUNC("ixl_i2c_start"); /* Start condition must begin with data and clock high */ ixl_set_i2c_data(pf, &i2cctl, 1); ixl_raise_i2c_clk(pf, &i2cctl); /* Setup time for start condition (4.7us) */ i40e_usec_delay(IXL_I2C_T_SU_STA); ixl_set_i2c_data(pf, &i2cctl, 0); /* Hold time for start condition (4us) */ i40e_usec_delay(IXL_I2C_T_HD_STA); ixl_lower_i2c_clk(pf, &i2cctl); /* Minimum low period of clock is 4.7 us */ i40e_usec_delay(IXL_I2C_T_LOW); } /** * ixl_read_i2c_byte_bb - Reads 8 bit word over I2C **/ s32 ixl_read_i2c_byte_bb(struct ixl_pf *pf, u8 byte_offset, u8 dev_addr, u8 *data) { struct i40e_hw *hw = &pf->hw; u32 max_retry = 10; u32 retry = 0; bool nack = 1; s32 status; *data = 0; u32 i2cctl = rd32(hw, IXL_I2C_REG(hw)); i2cctl |= I40E_GLGEN_I2CPARAMS_I2CBB_EN_MASK; wr32(hw, IXL_I2C_REG(hw), i2cctl); ixl_flush(hw); do { ixl_i2c_start(pf); /* Device Address and write indication */ status = ixl_clock_out_i2c_byte(pf, dev_addr); if (status != I40E_SUCCESS) { ixl_dbg(pf, IXL_DBG_I2C, "dev_addr clock out error\n"); goto fail; } status = ixl_get_i2c_ack(pf); if (status != I40E_SUCCESS) { ixl_dbg(pf, IXL_DBG_I2C, "dev_addr i2c ack error\n"); goto fail; } status = ixl_clock_out_i2c_byte(pf, byte_offset); if (status != I40E_SUCCESS) { ixl_dbg(pf, IXL_DBG_I2C, "byte_offset clock out error\n"); goto fail; } status = ixl_get_i2c_ack(pf); if (status != I40E_SUCCESS) { ixl_dbg(pf, IXL_DBG_I2C, "byte_offset i2c ack error\n"); goto fail; } ixl_i2c_start(pf); /* Device Address and read indication */ status = ixl_clock_out_i2c_byte(pf, (dev_addr | 0x1)); if (status != I40E_SUCCESS) goto fail; status = ixl_get_i2c_ack(pf); if (status != I40E_SUCCESS) goto fail; status = ixl_clock_in_i2c_byte(pf, data); if (status != I40E_SUCCESS) goto fail; status = ixl_clock_out_i2c_bit(pf, nack); if (status != I40E_SUCCESS) goto fail; ixl_i2c_stop(pf); status = I40E_SUCCESS; goto done; fail: ixl_i2c_bus_clear(pf); i40e_msec_delay(100); retry++; if (retry < max_retry) ixl_dbg(pf, IXL_DBG_I2C, "I2C byte read error - Retrying\n"); else ixl_dbg(pf, IXL_DBG_I2C, "I2C byte read error\n"); } while (retry < max_retry); done: i2cctl = rd32(hw, IXL_I2C_REG(hw)); i2cctl &= ~I40E_GLGEN_I2CPARAMS_I2CBB_EN_MASK; wr32(hw, IXL_I2C_REG(hw), i2cctl); ixl_flush(hw); return status; } /** * ixl_write_i2c_byte_bb - Writes 8 bit word over I2C **/ s32 ixl_write_i2c_byte_bb(struct ixl_pf *pf, u8 byte_offset, u8 dev_addr, u8 data) { struct i40e_hw *hw = &pf->hw; s32 status = I40E_SUCCESS; u32 max_retry = 1; u32 retry = 
0; u32 i2cctl = rd32(hw, IXL_I2C_REG(hw)); i2cctl |= I40E_GLGEN_I2CPARAMS_I2CBB_EN_MASK; wr32(hw, IXL_I2C_REG(hw), i2cctl); ixl_flush(hw); do { ixl_i2c_start(pf); status = ixl_clock_out_i2c_byte(pf, dev_addr); if (status != I40E_SUCCESS) goto fail; status = ixl_get_i2c_ack(pf); if (status != I40E_SUCCESS) goto fail; status = ixl_clock_out_i2c_byte(pf, byte_offset); if (status != I40E_SUCCESS) goto fail; status = ixl_get_i2c_ack(pf); if (status != I40E_SUCCESS) goto fail; status = ixl_clock_out_i2c_byte(pf, data); if (status != I40E_SUCCESS) goto fail; status = ixl_get_i2c_ack(pf); if (status != I40E_SUCCESS) goto fail; ixl_i2c_stop(pf); goto write_byte_out; fail: ixl_i2c_bus_clear(pf); i40e_msec_delay(100); retry++; if (retry < max_retry) ixl_dbg(pf, IXL_DBG_I2C, "I2C byte write error - Retrying\n"); else ixl_dbg(pf, IXL_DBG_I2C, "I2C byte write error\n"); } while (retry < max_retry); write_byte_out: i2cctl = rd32(hw, IXL_I2C_REG(hw)); i2cctl &= ~I40E_GLGEN_I2CPARAMS_I2CBB_EN_MASK; wr32(hw, IXL_I2C_REG(hw), i2cctl); ixl_flush(hw); return status; } /** * ixl_read_i2c_byte_reg - Reads 8 bit word over I2C using a hardware register **/ s32 ixl_read_i2c_byte_reg(struct ixl_pf *pf, u8 byte_offset, u8 dev_addr, u8 *data) { struct i40e_hw *hw = &pf->hw; u32 reg = 0; s32 status; *data = 0; reg |= (byte_offset << I40E_GLGEN_I2CCMD_REGADD_SHIFT); reg |= (((dev_addr >> 1) & 0x7) << I40E_GLGEN_I2CCMD_PHYADD_SHIFT); reg |= I40E_GLGEN_I2CCMD_OP_MASK; wr32(hw, I40E_GLGEN_I2CCMD(hw->func_caps.mdio_port_num), reg); status = ixl_wait_for_i2c_completion(hw, hw->func_caps.mdio_port_num); /* Get data from I2C register */ reg = rd32(hw, I40E_GLGEN_I2CCMD(hw->func_caps.mdio_port_num)); /* Retrieve data read from EEPROM */ *data = (u8)(reg & 0xff); if (status) ixl_dbg(pf, IXL_DBG_I2C, "I2C byte read error\n"); return status; } /** * ixl_write_i2c_byte_reg - Writes 8 bit word over I2C using a hardware register **/ s32 ixl_write_i2c_byte_reg(struct ixl_pf *pf, u8 byte_offset, u8 dev_addr, u8 data) { struct i40e_hw *hw = &pf->hw; s32 status = I40E_SUCCESS; u32 reg = 0; u8 upperbyte = 0; u16 datai2c = 0; status = ixl_read_i2c_byte_reg(pf, byte_offset + 1, dev_addr, &upperbyte); datai2c = ((u16)upperbyte << 8) | (u16)data; reg = rd32(hw, I40E_GLGEN_I2CCMD(hw->func_caps.mdio_port_num)); /* Form write command */ reg &= ~I40E_GLGEN_I2CCMD_PHYADD_MASK; reg |= (((dev_addr >> 1) & 0x7) << I40E_GLGEN_I2CCMD_PHYADD_SHIFT); reg &= ~I40E_GLGEN_I2CCMD_REGADD_MASK; reg |= (byte_offset << I40E_GLGEN_I2CCMD_REGADD_SHIFT); reg &= ~I40E_GLGEN_I2CCMD_DATA_MASK; reg |= (datai2c << I40E_GLGEN_I2CCMD_DATA_SHIFT); reg &= ~I40E_GLGEN_I2CCMD_OP_MASK; - /* Write command to registers controling I2C - data and address. */ + /* Write command to registers controlling I2C - data and address. 
*/ wr32(hw, I40E_GLGEN_I2CCMD(hw->func_caps.mdio_port_num), reg); status = ixl_wait_for_i2c_completion(hw, hw->func_caps.mdio_port_num); if (status) ixl_dbg(pf, IXL_DBG_I2C, "I2C byte write error\n"); return status; } /** * ixl_wait_for_i2c_completion **/ static s32 ixl_wait_for_i2c_completion(struct i40e_hw *hw, u8 portnum) { s32 status = 0; u32 timeout = 100; u32 reg; do { reg = rd32(hw, I40E_GLGEN_I2CCMD(portnum)); if ((reg & I40E_GLGEN_I2CCMD_R_MASK) != 0) break; i40e_usec_delay(10); } while (timeout-- > 0); if (timeout == 0) return I40E_ERR_TIMEOUT; else return status; } /** * ixl_read_i2c_byte_aq - Reads 8 bit word over I2C using an AQ command **/ s32 ixl_read_i2c_byte_aq(struct ixl_pf *pf, u8 byte_offset, u8 dev_addr, u8 *data) { struct i40e_hw *hw = &pf->hw; s32 status = I40E_SUCCESS; u32 reg; status = i40e_aq_get_phy_register(hw, I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, dev_addr, false, byte_offset, ®, NULL); if (status) ixl_dbg(pf, IXL_DBG_I2C, "I2C byte read status %s, error %s\n", i40e_stat_str(hw, status), i40e_aq_str(hw, hw->aq.asq_last_status)); else *data = (u8)reg; return status; } /** * ixl_write_i2c_byte_aq - Writes 8 bit word over I2C using an AQ command **/ s32 ixl_write_i2c_byte_aq(struct ixl_pf *pf, u8 byte_offset, u8 dev_addr, u8 data) { struct i40e_hw *hw = &pf->hw; s32 status = I40E_SUCCESS; status = i40e_aq_set_phy_register(hw, I40E_AQ_PHY_REG_ACCESS_EXTERNAL_MODULE, dev_addr, false, byte_offset, data, NULL); if (status) ixl_dbg(pf, IXL_DBG_I2C, "I2C byte write status %s, error %s\n", i40e_stat_str(hw, status), i40e_aq_str(hw, hw->aq.asq_last_status)); return status; } diff --git a/sys/fs/nfsclient/nfs_clrpcops.c b/sys/fs/nfsclient/nfs_clrpcops.c index 527a47338b3f..7cd793502476 100644 --- a/sys/fs/nfsclient/nfs_clrpcops.c +++ b/sys/fs/nfsclient/nfs_clrpcops.c @@ -1,8711 +1,8711 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); /* * Rpc op calls, generally called from the vnode op calls or through the * buffer cache, for NFS v2, 3 and 4. * These do not normally make any changes to vnode arguments or use * structures that might change between the VFS variants. The returned * arguments are all at the end, after the NFSPROC_T *p one. */ #include "opt_inet6.h" #include #include #include #include #include SYSCTL_DECL(_vfs_nfs); static int nfsignore_eexist = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, ignore_eexist, CTLFLAG_RW, &nfsignore_eexist, 0, "NFS ignore EEXIST replies for mkdir/symlink"); static int nfscl_dssameconn = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, dssameconn, CTLFLAG_RW, &nfscl_dssameconn, 0, "Use same TCP connection to multiple DSs"); /* * Global variables */ extern struct nfsstatsv1 nfsstatsv1; extern int nfs_numnfscbd; extern struct timeval nfsboottime; extern u_int32_t newnfs_false, newnfs_true; extern nfstype nfsv34_type[9]; extern int nfsrv_useacl; extern char nfsv4_callbackaddr[INET6_ADDRSTRLEN]; extern int nfscl_debuglevel; extern int nfs_pnfsiothreads; extern u_long sb_max_adj; extern int nfs_maxcopyrange; NFSCLSTATEMUTEX; int nfstest_outofseq = 0; int nfscl_assumeposixlocks = 1; int nfscl_enablecallb = 0; short nfsv4_cbport = NFSV4_CBPORT; int nfstest_openallsetattr = 0; #define DIRHDSIZ offsetof(struct dirent, d_name) /* * nfscl_getsameserver() can return one of three values: * NFSDSP_USETHISSESSION - Use this session for the DS. * NFSDSP_SEQTHISSESSION - Use the nfsclds_sequence field of this dsp for new * session. * NFSDSP_NOTFOUND - No matching server was found. */ enum nfsclds_state { NFSDSP_USETHISSESSION = 0, NFSDSP_SEQTHISSESSION = 1, NFSDSP_NOTFOUND = 2, }; /* * Do a write RPC on a DS data file, using this structure for the arguments, * so that this function can be executed by a separate kernel process. 
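The structure declared just below packages all of the arguments of a data-server write together with a struct task, so the RPC can be handed to a helper thread and writes to several mirrored DSs can run in parallel. A minimal sketch of that idiom, assuming the generic taskqueue(9) interface rather than the client's own nfs_pnfsio()/nfsiod machinery, and a drpc the caller has already filled in (ds_write_task_sketch is a hypothetical handler):

    static void
    ds_write_task_sketch(void *arg, int pending __unused)
    {
            struct nfsclwritedsdorpc *drpc = arg;

            /* ... the actual write RPC would be issued here ... */
            drpc->err = 0;
            drpc->done = 1;
            wakeup(drpc);
    }

    /* caller side */
    TASK_INIT(&drpc->tsk, 0, ds_write_task_sketch, drpc);
    (void)taskqueue_enqueue(taskqueue_thread, &drpc->tsk);
    /* ... later: wait until drpc->done is set, then examine drpc->err ... */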
*/ struct nfsclwritedsdorpc { int done; int inprog; struct task tsk; struct vnode *vp; int iomode; int must_commit; nfsv4stateid_t *stateidp; struct nfsclds *dsp; uint64_t off; int len; #ifdef notyet int advise; #endif struct nfsfh *fhp; struct mbuf *m; int vers; int minorvers; struct ucred *cred; NFSPROC_T *p; int err; }; static int nfsrpc_setattrrpc(vnode_t , struct vattr *, nfsv4stateid_t *, struct ucred *, NFSPROC_T *, struct nfsvattr *, int *, void *); static int nfsrpc_readrpc(vnode_t , struct uio *, struct ucred *, nfsv4stateid_t *, NFSPROC_T *, struct nfsvattr *, int *, void *); static int nfsrpc_writerpc(vnode_t , struct uio *, int *, int *, struct ucred *, nfsv4stateid_t *, NFSPROC_T *, struct nfsvattr *, int *, void *); static int nfsrpc_createv23(vnode_t , char *, int, struct vattr *, nfsquad_t, int, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, void *); static int nfsrpc_createv4(vnode_t , char *, int, struct vattr *, nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, void *, int *); static int nfsrpc_locku(struct nfsrv_descript *, struct nfsmount *, struct nfscllockowner *, u_int64_t, u_int64_t, u_int32_t, struct ucred *, NFSPROC_T *, int); static int nfsrpc_setaclrpc(vnode_t, struct ucred *, NFSPROC_T *, struct acl *, nfsv4stateid_t *, void *); static int nfsrpc_getlayout(struct nfsmount *, vnode_t, struct nfsfh *, int, uint32_t *, nfsv4stateid_t *, uint64_t, struct nfscllayout **, struct ucred *, NFSPROC_T *); static int nfsrpc_fillsa(struct nfsmount *, struct sockaddr_in *, struct sockaddr_in6 *, sa_family_t, int, int, struct nfsclds **, NFSPROC_T *); static void nfscl_initsessionslots(struct nfsclsession *); static int nfscl_doflayoutio(vnode_t, struct uio *, int *, int *, int *, nfsv4stateid_t *, int, struct nfscldevinfo *, struct nfscllayout *, struct nfsclflayout *, uint64_t, uint64_t, int, struct ucred *, NFSPROC_T *); static int nfscl_dofflayoutio(vnode_t, struct uio *, int *, int *, int *, nfsv4stateid_t *, int, struct nfscldevinfo *, struct nfscllayout *, struct nfsclflayout *, uint64_t, uint64_t, int, int, struct mbuf *, struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *); static int nfsrpc_readds(vnode_t, struct uio *, nfsv4stateid_t *, int *, struct nfsclds *, uint64_t, int, struct nfsfh *, int, int, int, struct ucred *, NFSPROC_T *); static int nfsrpc_writeds(vnode_t, struct uio *, int *, int *, nfsv4stateid_t *, struct nfsclds *, uint64_t, int, struct nfsfh *, int, int, int, int, struct ucred *, NFSPROC_T *); static int nfsio_writedsmir(vnode_t, int *, int *, nfsv4stateid_t *, struct nfsclds *, uint64_t, int, struct nfsfh *, struct mbuf *, int, int, struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *); static int nfsrpc_writedsmir(vnode_t, int *, int *, nfsv4stateid_t *, struct nfsclds *, uint64_t, int, struct nfsfh *, struct mbuf *, int, int, struct ucred *, NFSPROC_T *); static enum nfsclds_state nfscl_getsameserver(struct nfsmount *, struct nfsclds *, struct nfsclds **, uint32_t *); static int nfsio_commitds(vnode_t, uint64_t, int, struct nfsclds *, struct nfsfh *, int, int, struct nfsclwritedsdorpc *, struct ucred *, NFSPROC_T *); static int nfsrpc_commitds(vnode_t, uint64_t, int, struct nfsclds *, struct nfsfh *, int, int, struct ucred *, NFSPROC_T *); #ifdef notyet static int nfsio_adviseds(vnode_t, uint64_t, int, int, struct nfsclds *, struct nfsfh *, int, int, struct nfsclwritedsdorpc *, 
struct ucred *, NFSPROC_T *); static int nfsrpc_adviseds(vnode_t, uint64_t, int, int, struct nfsclds *, struct nfsfh *, int, int, struct ucred *, NFSPROC_T *); #endif static int nfsrpc_allocaterpc(vnode_t, off_t, off_t, nfsv4stateid_t *, struct nfsvattr *, int *, struct ucred *, NFSPROC_T *, void *); static void nfsrv_setuplayoutget(struct nfsrv_descript *, int, uint64_t, uint64_t, uint64_t, nfsv4stateid_t *, int, int, int); static int nfsrv_parseug(struct nfsrv_descript *, int, uid_t *, gid_t *, NFSPROC_T *); static int nfsrv_parselayoutget(struct nfsmount *, struct nfsrv_descript *, nfsv4stateid_t *, int *, struct nfsclflayouthead *); static int nfsrpc_getopenlayout(struct nfsmount *, vnode_t, u_int8_t *, int, uint8_t *, int, uint32_t, struct nfsclopen *, uint8_t *, int, struct nfscldeleg **, struct ucred *, NFSPROC_T *); static int nfsrpc_getcreatelayout(vnode_t, char *, int, struct vattr *, nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, void *, int *); static int nfsrpc_openlayoutrpc(struct nfsmount *, vnode_t, u_int8_t *, int, uint8_t *, int, uint32_t, struct nfsclopen *, uint8_t *, int, struct nfscldeleg **, nfsv4stateid_t *, int, int, int, int *, struct nfsclflayouthead *, int *, struct ucred *, NFSPROC_T *); static int nfsrpc_createlayout(vnode_t, char *, int, struct vattr *, nfsquad_t, int, struct nfsclowner *, struct nfscldeleg **, struct ucred *, NFSPROC_T *, struct nfsvattr *, struct nfsvattr *, struct nfsfh **, int *, int *, void *, int *, nfsv4stateid_t *, int, int, int, int *, struct nfsclflayouthead *, int *); static int nfsrpc_layoutget(struct nfsmount *, uint8_t *, int, int, uint64_t, uint64_t, uint64_t, int, int, nfsv4stateid_t *, int *, struct nfsclflayouthead *, struct ucred *, NFSPROC_T *, void *); static int nfsrpc_layoutgetres(struct nfsmount *, vnode_t, uint8_t *, int, nfsv4stateid_t *, int, uint32_t *, struct nfscllayout **, struct nfsclflayouthead *, int, int, int *, struct ucred *, NFSPROC_T *); static int nfsrpc_copyrpc(vnode_t, off_t, vnode_t, off_t, size_t *, nfsv4stateid_t *, nfsv4stateid_t *, struct nfsvattr *, int *, struct nfsvattr *, int *, bool, int *, struct ucred *, NFSPROC_T *); static int nfsrpc_seekrpc(vnode_t, off_t *, nfsv4stateid_t *, bool *, int, struct nfsvattr *, int *, struct ucred *); static struct mbuf *nfsm_split(struct mbuf *, uint64_t); int nfs_pnfsio(task_fn_t *, void *); /* * nfs null call from vfs. */ int nfsrpc_null(vnode_t vp, struct ucred *cred, NFSPROC_T *p) { int error; struct nfsrv_descript nfsd, *nd = &nfsd; NFSCL_REQSTART(nd, NFSPROC_NULL, vp); error = nfscl_request(nd, vp, p, cred, NULL); if (nd->nd_repstat && !error) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * nfs access rpc op. * For nfs version 3 and 4, use the access rpc to check accessibility. If file * modes are changed on the server, accesses might still fail later. */ int nfsrpc_access(vnode_t vp, int acmode, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp) { int error; u_int32_t mode, rmode; if (acmode & VREAD) mode = NFSACCESS_READ; else mode = 0; if (vnode_vtype(vp) == VDIR) { if (acmode & VWRITE) mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND | NFSACCESS_DELETE); if (acmode & VEXEC) mode |= NFSACCESS_LOOKUP; } else { if (acmode & VWRITE) mode |= (NFSACCESS_MODIFY | NFSACCESS_EXTEND); if (acmode & VEXEC) mode |= NFSACCESS_EXECUTE; } /* * Now, just call nfsrpc_accessrpc() to do the actual RPC. 
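To make the mapping above concrete, the same acmode of VWRITE|VEXEC expands to different NFSACCESS bits for a directory and for a regular file:

    /* acmode = VWRITE | VEXEC on a directory (VDIR): */
    mode = NFSACCESS_MODIFY | NFSACCESS_EXTEND | NFSACCESS_DELETE |
        NFSACCESS_LOOKUP;
    /* the same acmode on a regular file: */
    mode = NFSACCESS_MODIFY | NFSACCESS_EXTEND | NFSACCESS_EXECUTE;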
*/ error = nfsrpc_accessrpc(vp, mode, cred, p, nap, attrflagp, &rmode, NULL); /* * The NFS V3 spec does not clarify whether or not * the returned access bits can be a superset of * the ones requested, so... */ if (!error && (rmode & mode) != mode) error = EACCES; return (error); } /* * The actual rpc, separated out for Darwin. */ int nfsrpc_accessrpc(vnode_t vp, u_int32_t mode, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, u_int32_t *rmodep, void *stuff) { u_int32_t *tl; u_int32_t supported, rmode; int error; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; supported = mode; NFSCL_REQSTART(nd, NFSPROC_ACCESS, vp); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(mode); if (nd->nd_flag & ND_NFSV4) { /* * And do a Getattr op. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error) goto nfsmout; } if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); supported = fxdr_unsigned(u_int32_t, *tl++); } else { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); } rmode = fxdr_unsigned(u_int32_t, *tl); if (nd->nd_flag & ND_NFSV4) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); /* * It's not obvious what should be done about * unsupported access modes. For now, be paranoid * and clear the unsupported ones. */ rmode &= supported; *rmodep = rmode; } else error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs open rpc */ int nfsrpc_open(vnode_t vp, int amode, struct ucred *cred, NFSPROC_T *p) { struct nfsclopen *op; struct nfscldeleg *dp; struct nfsfh *nfhp; struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); u_int32_t mode, clidrev; int ret, newone, error, expireret = 0, retrycnt; /* * For NFSv4, Open Ops are only done on Regular Files. */ if (vnode_vtype(vp) != VREG) return (0); mode = 0; if (amode & FREAD) mode |= NFSV4OPEN_ACCESSREAD; if (amode & FWRITE) mode |= NFSV4OPEN_ACCESSWRITE; nfhp = np->n_fhp; retrycnt = 0; #ifdef notdef { char name[100]; int namel; namel = (np->n_v4->n4_namelen < 100) ? np->n_v4->n4_namelen : 99; bcopy(NFS4NODENAME(np->n_v4), name, namel); name[namel] = '\0'; printf("rpcopen p=0x%x name=%s",p->p_pid,name); if (nfhp->nfh_len > 0) printf(" fh=0x%x\n",nfhp->nfh_fh[12]); else printf(" fhl=0\n"); } #endif do { dp = NULL; error = nfscl_open(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 1, cred, p, NULL, &op, &newone, &ret, 1); if (error) { return (error); } if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; else clidrev = 0; if (ret == NFSCLOPEN_DOOPEN) { if (np->n_v4 != NULL) { /* * For the first attempt, try and get a layout, if * pNFS is enabled for the mount. 
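The multi-clause test that follows is easier to read inverted: the combined Open+LayoutGet path is taken only when every pNFS precondition holds and this is the first attempt; anything else falls back to a plain Open. Spelled out as a single predicate, purely for reference (use_openlayout is just a local name for this write-up):

    bool use_openlayout;

    use_openlayout = NFSHASPNFS(nmp) && nfscl_enablecallb != 0 &&
        nfs_numnfscbd > 0 && (np->n_flag & NNOLAYOUT) == 0 &&
        retrycnt == 0;
    /* use_openlayout  -> nfsrpc_getopenlayout()  (Open + LayoutGet) */
    /* !use_openlayout -> nfsrpc_openrpc()        (plain Open)       */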
*/ if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 || nfs_numnfscbd == 0 || (np->n_flag & NNOLAYOUT) != 0 || retrycnt > 0) error = nfsrpc_openrpc(nmp, vp, np->n_v4->n4_data, np->n_v4->n4_fhlen, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, mode, op, NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, &dp, 0, 0x0, cred, p, 0, 0); else error = nfsrpc_getopenlayout(nmp, vp, np->n_v4->n4_data, np->n_v4->n4_fhlen, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, mode, op, NFS4NODENAME(np->n_v4), np->n_v4->n4_namelen, &dp, cred, p); if (dp != NULL) { #ifdef APPLE OSBitAndAtomic((int32_t)~NDELEGMOD, (UInt32 *)&np->n_flag); #else NFSLOCKNODE(np); np->n_flag &= ~NDELEGMOD; /* * Invalidate the attribute cache, so that * attributes that pre-date the issue of a * delegation are not cached, since the * cached attributes will remain valid while * the delegation is held. */ NFSINVALATTRCACHE(np); NFSUNLOCKNODE(np); #endif (void) nfscl_deleg(nmp->nm_mountp, op->nfso_own->nfsow_clp, nfhp->nfh_fh, nfhp->nfh_len, cred, p, &dp); } } else { error = EIO; } newnfs_copyincred(cred, &op->nfso_cred); } else if (ret == NFSCLOPEN_SETCRED) /* * This is a new local open on a delegation. It needs * to have credentials so that an open can be done * against the server during recovery. */ newnfs_copyincred(cred, &op->nfso_cred); /* * nfso_opencnt is the count of how many VOP_OPEN()s have * been done on this Open successfully and a VOP_CLOSE() * is expected for each of these. * If error is non-zero, don't increment it, since the Open * hasn't succeeded yet. */ if (!error) op->nfso_opencnt++; nfscl_openrelease(nmp, op, error, newone); if (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_open"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); retrycnt++; } } while (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } /* * the actual open rpc */ int nfsrpc_openrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, u_int8_t *newfhp, int newfhlen, u_int32_t mode, struct nfsclopen *op, u_int8_t *name, int namelen, struct nfscldeleg **dpp, int reclaim, u_int32_t delegtype, struct ucred *cred, NFSPROC_T *p, int syscred, int recursed) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfscldeleg *dp, *ndp = NULL; struct nfsvattr nfsva; u_int32_t rflags, deleg; nfsattrbit_t attrbits; int error, ret, acesize, limitby; struct nfsclsession *tsep; dp = *dpp; *dpp = NULL; nfscl_reqstart(nd, NFSPROC_OPEN, nmp, nfhp, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); *tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; (void) nfsm_strtom(nd, op->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE); if (reclaim) { *tl = txdr_unsigned(NFSV4OPEN_CLAIMPREVIOUS); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(delegtype); 
} else { if (dp != NULL) { *tl = txdr_unsigned(NFSV4OPEN_CLAIMDELEGATECUR); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = dp->nfsdl_stateid.seqid; *tl++ = dp->nfsdl_stateid.other[0]; *tl++ = dp->nfsdl_stateid.other[1]; *tl = dp->nfsdl_stateid.other[2]; } else { *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); } (void) nfsm_strtom(nd, name, namelen); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY); (void) nfsrv_putattrbit(nd, &attrbits); if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (!nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); op->nfso_stateid.seqid = *tl++; op->nfso_stateid.other[0] = *tl++; op->nfso_stateid.other[1] = *tl++; op->nfso_stateid.other[2] = *tl; rflags = fxdr_unsigned(u_int32_t, *(tl + 6)); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(u_int32_t, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { if (!(op->nfso_own->nfsow_clp->nfsc_flags & NFSCLFLAGS_FIRSTDELEG)) op->nfso_own->nfsow_clp->nfsc_flags |= (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG); ndp = malloc( sizeof (struct nfscldeleg) + newfhlen, M_NFSCLDELEG, M_WAITOK); LIST_INIT(&ndp->nfsdl_owner); LIST_INIT(&ndp->nfsdl_lock); ndp->nfsdl_clp = op->nfso_own->nfsow_clp; ndp->nfsdl_fhlen = newfhlen; NFSBCOPY(newfhp, ndp->nfsdl_fh, newfhlen); newnfs_copyincred(cred, &ndp->nfsdl_cred); nfscl_lockinit(&ndp->nfsdl_rwlock); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); ndp->nfsdl_stateid.seqid = *tl++; ndp->nfsdl_stateid.other[0] = *tl++; ndp->nfsdl_stateid.other[1] = *tl++; ndp->nfsdl_stateid.other[2] = *tl++; ret = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEWRITE) { ndp->nfsdl_flags = NFSCLDL_WRITE; /* * Indicates how much the file can grow. 
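The space-limit decode that follows handles two encodings: NFSV4OPEN_LIMITSIZE carries a byte count directly, while NFSV4OPEN_LIMITBLOCKS carries a block count and a block size that are multiplied together. With invented numbers, just to show the arithmetic:

    /* NFSV4OPEN_LIMITBLOCKS, illustrative values only: */
    uint64_t nblocks = 1024, bytesperblock = 8192;
    uint64_t sizelimit = nblocks * bytesperblock;   /* 8388608 bytes, 8 MiB */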
*/ NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); limitby = fxdr_unsigned(int, *tl++); switch (limitby) { case NFSV4OPEN_LIMITSIZE: ndp->nfsdl_sizelimit = fxdr_hyper(tl); break; case NFSV4OPEN_LIMITBLOCKS: ndp->nfsdl_sizelimit = fxdr_unsigned(u_int64_t, *tl++); ndp->nfsdl_sizelimit *= fxdr_unsigned(u_int64_t, *tl); break; default: error = NFSERR_BADXDR; goto nfsmout; } } else { ndp->nfsdl_flags = NFSCLDL_READ; } if (ret) ndp->nfsdl_flags |= NFSCLDL_RECALL; error = nfsrv_dissectace(nd, &ndp->nfsdl_ace, &ret, &acesize, p); if (error) goto nfsmout; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (error) goto nfsmout; if (ndp != NULL) { ndp->nfsdl_change = nfsva.na_filerev; ndp->nfsdl_modtime = nfsva.na_mtime; ndp->nfsdl_flags |= NFSCLDL_MODTIMESET; } if (!reclaim && (rflags & NFSV4OPEN_RESULTCONFIRM)) { do { ret = nfsrpc_openconfirm(vp, newfhp, newfhlen, op, cred, p); if (ret == NFSERR_DELAY) (void) nfs_catnap(PZERO, ret, "nfs_open"); } while (ret == NFSERR_DELAY); error = ret; } if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) || nfscl_assumeposixlocks) op->nfso_posixlock = 1; else op->nfso_posixlock = 0; /* * If the server is handing out delegations, but we didn't * get one because an OpenConfirm was required, try the * Open again, to get a delegation. This is a harmless no-op, * from a server's point of view. */ if (!reclaim && (rflags & NFSV4OPEN_RESULTCONFIRM) && (op->nfso_own->nfsow_clp->nfsc_flags & NFSCLFLAGS_GOTDELEG) && !error && dp == NULL && ndp == NULL && !recursed) { do { ret = nfsrpc_openrpc(nmp, vp, nfhp, fhlen, newfhp, newfhlen, mode, op, name, namelen, &ndp, 0, 0x0, cred, p, syscred, 1); if (ret == NFSERR_DELAY) (void) nfs_catnap(PZERO, ret, "nfs_open2"); } while (ret == NFSERR_DELAY); if (ret) { if (ndp != NULL) { free(ndp, M_NFSCLDELEG); ndp = NULL; } if (ret == NFSERR_STALECLIENTID || ret == NFSERR_STALEDONTRECOVER || ret == NFSERR_BADSESSION) error = ret; } } } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; if (error == NFSERR_STALECLIENTID) nfscl_initiate_recovery(op->nfso_own->nfsow_clp); nfsmout: if (!error) *dpp = ndp; else if (ndp != NULL) free(ndp, M_NFSCLDELEG); m_freem(nd->nd_mrep); return (error); } /* * open downgrade rpc */ int nfsrpc_opendowngrade(vnode_t vp, u_int32_t mode, struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; int error; NFSCL_REQSTART(nd, NFSPROC_OPENDOWNGRADE, vp); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 3 * NFSX_UNSIGNED); if (NFSHASNFSV4N(VFSTONFS(vp->v_mount))) *tl++ = 0; else *tl++ = op->nfso_stateid.seqid; *tl++ = op->nfso_stateid.other[0]; *tl++ = op->nfso_stateid.other[1]; *tl++ = op->nfso_stateid.other[2]; *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); *tl = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); error = nfscl_request(nd, vp, p, cred, NULL); if (error) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (!nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); op->nfso_stateid.seqid = *tl++; op->nfso_stateid.other[0] = *tl++; op->nfso_stateid.other[1] = *tl++; op->nfso_stateid.other[2] = *tl; } if (nd->nd_repstat && error == 0) error = nd->nd_repstat; if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(op->nfso_own->nfsow_clp); nfsmout: 
m_freem(nd->nd_mrep); return (error); } /* * V4 Close operation. */ int nfsrpc_close(vnode_t vp, int doclose, NFSPROC_T *p) { struct nfsclclient *clp; int error; if (vnode_vtype(vp) != VREG) return (0); if (doclose) error = nfscl_doclose(vp, &clp, p); else error = nfscl_getclose(vp, &clp); if (error) return (error); nfscl_clientrelease(clp); return (0); } /* * Close the open. */ void nfsrpc_doclose(struct nfsmount *nmp, struct nfsclopen *op, NFSPROC_T *p) { struct nfsrv_descript nfsd, *nd = &nfsd; struct nfscllockowner *lp, *nlp; struct nfscllock *lop, *nlop; struct ucred *tcred; u_int64_t off = 0, len = 0; u_int32_t type = NFSV4LOCKT_READ; int error, do_unlock, trycnt; tcred = newnfs_getcred(); newnfs_copycred(&op->nfso_cred, tcred); /* * (Theoretically this could be done in the same * compound as the close, but having multiple * sequenced Ops in the same compound might be * too scary for some servers.) */ if (op->nfso_posixlock) { off = 0; len = NFS64BITSSET; type = NFSV4LOCKT_READ; } /* * Since this function is only called from VOP_INACTIVE(), no * other thread will be manipulating this Open. As such, the * lock lists are not being changed by other threads, so it should * be safe to do this without locking. */ LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) { do_unlock = 1; LIST_FOREACH_SAFE(lop, &lp->nfsl_lock, nfslo_list, nlop) { if (op->nfso_posixlock == 0) { off = lop->nfslo_first; len = lop->nfslo_end - lop->nfslo_first; if (lop->nfslo_type == F_WRLCK) type = NFSV4LOCKT_WRITE; else type = NFSV4LOCKT_READ; } if (do_unlock) { trycnt = 0; do { error = nfsrpc_locku(nd, nmp, lp, off, len, type, tcred, p, 0); if ((nd->nd_repstat == NFSERR_GRACE || nd->nd_repstat == NFSERR_DELAY) && error == 0) (void) nfs_catnap(PZERO, (int)nd->nd_repstat, "nfs_close"); } while ((nd->nd_repstat == NFSERR_GRACE || nd->nd_repstat == NFSERR_DELAY) && error == 0 && trycnt++ < 5); if (op->nfso_posixlock) do_unlock = 0; } nfscl_freelock(lop, 0); } /* * Do a ReleaseLockOwner. * The lock owner name nfsl_owner may be used by other opens for * other files but the lock_owner4 name that nfsrpc_rellockown() * puts on the wire has the file handle for this file appended * to it, so it can be done now. */ (void)nfsrpc_rellockown(nmp, lp, lp->nfsl_open->nfso_fh, lp->nfsl_open->nfso_fhlen, tcred, p); } /* * There could be other Opens for different files on the same * OpenOwner, so locking is required. */ NFSLOCKCLSTATE(); nfscl_lockexcl(&op->nfso_own->nfsow_rwlock, NFSCLSTATEMUTEXPTR); NFSUNLOCKCLSTATE(); do { error = nfscl_tryclose(op, tcred, nmp, p); if (error == NFSERR_GRACE) (void) nfs_catnap(PZERO, error, "nfs_close"); } while (error == NFSERR_GRACE); NFSLOCKCLSTATE(); nfscl_lockunlock(&op->nfso_own->nfsow_rwlock); LIST_FOREACH_SAFE(lp, &op->nfso_lock, nfsl_list, nlp) nfscl_freelockowner(lp, 0); nfscl_freeopen(op, 0); NFSUNLOCKCLSTATE(); NFSFREECRED(tcred); } /* * The actual Close RPC. 
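For reference, nfsrpc_doclose() above retires state in a fixed order before this Close RPC is sent: byte-range unlocks first (a single whole-file unlock when POSIX lock semantics are assumed), then a ReleaseLockOwner per lock-owner, then the Close itself, retried while the server is in its grace period. An outline of the POSIX-lock case only, with error handling omitted:

    LIST_FOREACH(lp, &op->nfso_lock, nfsl_list) {
            nfsrpc_locku(nd, nmp, lp, 0, NFS64BITSSET, NFSV4LOCKT_READ,
                tcred, p, 0);                          /* unlock the whole file */
            nfsrpc_rellockown(nmp, lp, lp->nfsl_open->nfso_fh,
                lp->nfsl_open->nfso_fhlen, tcred, p);  /* free the lock-owner   */
    }
    do {
            error = nfscl_tryclose(op, tcred, nmp, p); /* the Close             */
    } while (error == NFSERR_GRACE);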
*/ int nfsrpc_closerpc(struct nfsrv_descript *nd, struct nfsmount *nmp, struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p, int syscred) { u_int32_t *tl; int error; nfscl_reqstart(nd, NFSPROC_CLOSE, nmp, op->nfso_fh, op->nfso_fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = op->nfso_stateid.seqid; *tl++ = op->nfso_stateid.other[0]; *tl++ = op->nfso_stateid.other[1]; *tl = op->nfso_stateid.other[2]; if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (nd->nd_repstat == 0) NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); error = nd->nd_repstat; if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(op->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * V4 Open Confirm RPC. */ int nfsrpc_openconfirm(vnode_t vp, u_int8_t *nfhp, int fhlen, struct nfsclopen *op, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; int error; nmp = VFSTONFS(vp->v_mount); if (NFSHASNFSV4N(nmp)) return (0); /* No confirmation for NFSv4.1. */ nfscl_reqstart(nd, NFSPROC_OPENCONFIRM, nmp, nfhp, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID); *tl++ = op->nfso_stateid.seqid; *tl++ = op->nfso_stateid.other[0]; *tl++ = op->nfso_stateid.other[1]; *tl++ = op->nfso_stateid.other[2]; *tl = txdr_unsigned(op->nfso_own->nfsow_seqid); error = nfscl_request(nd, vp, p, cred, NULL); if (error) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (!nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); op->nfso_stateid.seqid = *tl++; op->nfso_stateid.other[0] = *tl++; op->nfso_stateid.other[1] = *tl++; op->nfso_stateid.other[2] = *tl; } error = nd->nd_repstat; if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(op->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Do the setclientid and setclientid confirm RPCs. Called from nfs_statfs() * when a mount has just occurred and when the server replies NFSERR_EXPIRED. */ int nfsrpc_setclient(struct nfsmount *nmp, struct nfsclclient *clp, int reclaim, bool *retokp, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; u_int8_t *cp = NULL, *cp2, addr[INET6_ADDRSTRLEN + 9]; u_short port; int error, isinet6 = 0, callblen; nfsquad_t confirm; u_int32_t lease; static u_int32_t rev = 0; struct nfsclds *dsp, *odsp; struct in6_addr a6; struct nfsclsession *tsep; if (nfsboottime.tv_sec == 0) NFSSETBOOTTIME(nfsboottime); if (NFSHASNFSV4N(nmp)) { error = NFSERR_BADSESSION; odsp = dsp = NULL; if (retokp != NULL) { NFSLOCKMNT(nmp); odsp = TAILQ_FIRST(&nmp->nm_sess); NFSUNLOCKMNT(nmp); } if (odsp != NULL) { /* * When a session already exists, first try a * CreateSession with the extant ClientID. 
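The NFSv4.1 branch that follows can (re)establish a session in two ways: clone the ClientID out of an existing session and go straight to CreateSession, or run ExchangeID first and then CreateSession. A minimal sketch of that decision, where clone_clientid(), do_createsession() and do_exchangeid() are hypothetical stand-ins for the real code below:

    static int
    nfsv41_setup_session_sketch(struct nfsmount *nmp, struct nfsclds **dspp)
    {
            struct nfsclds *odsp, *dsp = NULL;
            int error = NFSERR_BADSESSION;

            odsp = TAILQ_FIRST(&nmp->nm_sess);
            if (odsp != NULL) {
                    dsp = clone_clientid(odsp);        /* reuse old ClientID */
                    error = do_createsession(nmp, dsp);
            }
            if (error != 0) {
                    error = do_exchangeid(nmp, &dsp);  /* obtain a ClientID  */
                    if (error == 0)
                            error = do_createsession(nmp, dsp);
            }
            if (error == 0)
                    *dspp = dsp;
            return (error);
    }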
*/ dsp = malloc(sizeof(struct nfsclds) + odsp->nfsclds_servownlen + 1, M_NFSCLDS, M_WAITOK | M_ZERO); dsp->nfsclds_expire = NFSD_MONOSEC + clp->nfsc_renew; dsp->nfsclds_servownlen = odsp->nfsclds_servownlen; dsp->nfsclds_sess.nfsess_clientid = odsp->nfsclds_sess.nfsess_clientid; dsp->nfsclds_sess.nfsess_sequenceid = odsp->nfsclds_sess.nfsess_sequenceid; dsp->nfsclds_flags = odsp->nfsclds_flags; if (dsp->nfsclds_servownlen > 0) memcpy(dsp->nfsclds_serverown, odsp->nfsclds_serverown, dsp->nfsclds_servownlen + 1); mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF); mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF); nfscl_initsessionslots(&dsp->nfsclds_sess); error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess, &nmp->nm_sockreq, NULL, dsp->nfsclds_sess.nfsess_sequenceid, 1, cred, p); NFSCL_DEBUG(1, "create session for extant " "ClientID=%d\n", error); if (error != 0) { nfscl_freenfsclds(dsp); dsp = NULL; /* * If *retokp is true, return any error other * than NFSERR_STALECLIENTID, * NFSERR_BADSESSION or NFSERR_STALEDONTRECOVER * so that nfscl_recover() will not loop. */ if (*retokp) return (NFSERR_IO); } else *retokp = true; } else if (retokp != NULL && *retokp) return (NFSERR_IO); if (error != 0) { /* * Either there was no previous session or the * CreateSession attempt failed, so... * do an ExchangeID followed by the CreateSession. */ clp->nfsc_rev = rev++; error = nfsrpc_exchangeid(nmp, clp, &nmp->nm_sockreq, 0, NFSV4EXCH_USEPNFSMDS | NFSV4EXCH_USENONPNFS, &dsp, cred, p); NFSCL_DEBUG(1, "aft exch=%d\n", error); if (error == 0) error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess, &nmp->nm_sockreq, NULL, dsp->nfsclds_sess.nfsess_sequenceid, 1, cred, p); NFSCL_DEBUG(1, "aft createsess=%d\n", error); } if (error == 0) { NFSLOCKMNT(nmp); /* * The old sessions cannot be safely free'd * here, since they may still be used by * in-progress RPCs. */ tsep = NULL; if (TAILQ_FIRST(&nmp->nm_sess) != NULL) tsep = NFSMNT_MDSSESSION(nmp); TAILQ_INSERT_HEAD(&nmp->nm_sess, dsp, nfsclds_list); /* * Wake up RPCs waiting for a slot on the * old session. These will then fail with * NFSERR_BADSESSION and be retried with the * new session by nfsv4_setsequence(). * Also wakeup() processes waiting for the * new session. */ if (tsep != NULL) wakeup(&tsep->nfsess_slots); wakeup(&nmp->nm_sess); NFSUNLOCKMNT(nmp); } else if (dsp != NULL) nfscl_freenfsclds(dsp); if (error == 0 && reclaim == 0) { error = nfsrpc_reclaimcomplete(nmp, cred, p); NFSCL_DEBUG(1, "aft reclaimcomp=%d\n", error); if (error == NFSERR_COMPLETEALREADY || error == NFSERR_NOTSUPP) /* Ignore this error. */ error = 0; } return (error); } else if (retokp != NULL && *retokp) return (NFSERR_IO); clp->nfsc_rev = rev++; /* * Allocate a single session structure for NFSv4.0, because some of * the fields are used by NFSv4.0 although it doesn't do a session. 
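The callback address assembled just below follows the ONC RPC universal-address convention: a netid of "tcp" or "tcp6", and the IP address with the port appended as two dot-separated decimal octets, high byte first. For instance, a callback listener on 192.0.2.10 port 7745 would be advertised as "192.0.2.10.30.65" (illustrative values only; uaddr is a local for this example):

    char uaddr[INET6_ADDRSTRLEN + 9];

    /* 7745 == 0x1e41, so the port contributes octets 30 and 65 */
    snprintf(uaddr, sizeof(uaddr), "%u.%u.%u.%u.%u.%u",
        192, 0, 2, 10, 7745 >> 8, 7745 & 0xff);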
*/ dsp = malloc(sizeof(struct nfsclds), M_NFSCLDS, M_WAITOK | M_ZERO); mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF); mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF); NFSLOCKMNT(nmp); TAILQ_INSERT_HEAD(&nmp->nm_sess, dsp, nfsclds_list); tsep = NFSMNT_MDSSESSION(nmp); NFSUNLOCKMNT(nmp); nfscl_reqstart(nd, NFSPROC_SETCLIENTID, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(nfsboottime.tv_sec); *tl = txdr_unsigned(clp->nfsc_rev); (void) nfsm_strtom(nd, clp->nfsc_id, clp->nfsc_idlen); /* * set up the callback address */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFS_CALLBCKPROG); callblen = strlen(nfsv4_callbackaddr); if (callblen == 0) cp = nfscl_getmyip(nmp, &a6, &isinet6); if (nfscl_enablecallb && nfs_numnfscbd > 0 && (callblen > 0 || cp != NULL)) { port = htons(nfsv4_cbport); cp2 = (u_int8_t *)&port; #ifdef INET6 if ((callblen > 0 && strchr(nfsv4_callbackaddr, ':')) || isinet6) { char ip6buf[INET6_ADDRSTRLEN], *ip6add; (void) nfsm_strtom(nd, "tcp6", 4); if (callblen == 0) { ip6_sprintf(ip6buf, (struct in6_addr *)cp); ip6add = ip6buf; } else { ip6add = nfsv4_callbackaddr; } snprintf(addr, INET6_ADDRSTRLEN + 9, "%s.%d.%d", ip6add, cp2[0], cp2[1]); } else #endif { (void) nfsm_strtom(nd, "tcp", 3); if (callblen == 0) snprintf(addr, INET6_ADDRSTRLEN + 9, "%d.%d.%d.%d.%d.%d", cp[0], cp[1], cp[2], cp[3], cp2[0], cp2[1]); else snprintf(addr, INET6_ADDRSTRLEN + 9, "%s.%d.%d", nfsv4_callbackaddr, cp2[0], cp2[1]); } (void) nfsm_strtom(nd, addr, strlen(addr)); } else { (void) nfsm_strtom(nd, "tcp", 3); (void) nfsm_strtom(nd, "0.0.0.0.0.0", 11); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(clp->nfsc_cbident); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); tsep->nfsess_clientid.lval[0] = *tl++; tsep->nfsess_clientid.lval[1] = *tl++; confirm.lval[0] = *tl++; confirm.lval[1] = *tl; m_freem(nd->nd_mrep); nd->nd_mrep = NULL; /* * and confirm it. */ nfscl_reqstart(nd, NFSPROC_SETCLIENTIDCFRM, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); *tl++ = tsep->nfsess_clientid.lval[0]; *tl++ = tsep->nfsess_clientid.lval[1]; *tl++ = confirm.lval[0]; *tl = confirm.lval[1]; nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); m_freem(nd->nd_mrep); nd->nd_mrep = NULL; if (nd->nd_repstat == 0) { nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, nmp->nm_fh, nmp->nm_fhsize, NULL, NULL, 0, 0); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_LEASETIME); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, &lease, NULL, p, cred); if (error) goto nfsmout; clp->nfsc_renew = NFSCL_RENEW(lease); clp->nfsc_expire = NFSD_MONOSEC + clp->nfsc_renew; clp->nfsc_clientidrev++; if (clp->nfsc_clientidrev == 0) clp->nfsc_clientidrev++; } } } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs getattr call. 
*/ int nfsrpc_getattr(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfsattrbit_t attrbits; NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp); if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (!nd->nd_repstat) error = nfsm_loadattr(nd, nap); else error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * nfs getattr call with non-vnode arguemnts. */ int nfsrpc_getattrnovp(struct nfsmount *nmp, u_int8_t *fhp, int fhlen, int syscred, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, u_int64_t *xidp, uint32_t *leasep) { struct nfsrv_descript nfsd, *nd = &nfsd; int error, vers = NFS_VER2; nfsattrbit_t attrbits; nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, fhp, fhlen, NULL, NULL, 0, 0); if (nd->nd_flag & ND_NFSV4) { vers = NFS_VER4; NFSGETATTR_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_LEASETIME); (void) nfsrv_putattrbit(nd, &attrbits); } else if (nd->nd_flag & ND_NFSV3) { vers = NFS_VER3; } if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, vers, NULL, 1, xidp, NULL); if (error) return (error); if (nd->nd_repstat == 0) { if ((nd->nd_flag & ND_NFSV4) != 0) error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, leasep, NULL, NULL, NULL); else error = nfsm_loadattr(nd, nap); } else error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do an nfs setattr operation. */ int nfsrpc_setattr(vnode_t vp, struct vattr *vap, NFSACL_T *aclp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *rnap, int *attrflagp, void *stuff) { int error, expireret = 0, openerr, retrycnt; u_int32_t clidrev = 0, mode; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsfh *nfhp; nfsv4stateid_t stateid; void *lckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; if (vap != NULL && NFSATTRISSET(u_quad_t, vap, va_size)) mode = NFSV4OPEN_ACCESSWRITE; else mode = NFSV4OPEN_ACCESSREAD; retrycnt = 0; do { lckp = NULL; openerr = 1; if (NFSHASNFSV4(nmp)) { nfhp = VTONFS(vp)->n_fhp; error = nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 0, cred, p, &stateid, &lckp); if (error && vnode_vtype(vp) == VREG && (mode == NFSV4OPEN_ACCESSWRITE || nfstest_openallsetattr)) { /* * No Open stateid, so try and open the file * now. 
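The do/while loop at the bottom of nfsrpc_setattr(), like its siblings in the read and write paths further down, follows one retry pattern: transient server states (grace, delay, bad session, stale or old stateids) sleep in nfs_catnap() and retry, lease-expiry errors are re-checked through nfscl_hasexpired(), and once the retry budget is exhausted the error collapses to EIO. Stripped to a skeleton, with a hypothetical do_the_rpc() and a simplified termination condition (the real one carries more cases):

    retrycnt = 0;
    do {
            error = do_the_rpc();                   /* hypothetical stand-in */
            if (error == NFSERR_GRACE || error == NFSERR_DELAY ||
                error == NFSERR_BADSESSION)
                    (void) nfs_catnap(PZERO, error, "nfsretry");
            retrycnt++;
    } while ((error == NFSERR_GRACE || error == NFSERR_DELAY ||
        error == NFSERR_BADSESSION) && retrycnt < 4);
    if (error != 0 && retrycnt >= 4)
            error = EIO;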
*/ if (mode == NFSV4OPEN_ACCESSWRITE) openerr = nfsrpc_open(vp, FWRITE, cred, p); else openerr = nfsrpc_open(vp, FREAD, cred, p); if (!openerr) (void) nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, mode, 0, cred, p, &stateid, &lckp); } } if (vap != NULL) error = nfsrpc_setattrrpc(vp, vap, &stateid, cred, p, rnap, attrflagp, stuff); else error = nfsrpc_setaclrpc(vp, cred, p, aclp, &stateid, stuff); if (error == NFSERR_OPENMODE && mode == NFSV4OPEN_ACCESSREAD) { NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_OPENMODE; NFSUNLOCKMNT(nmp); } if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (!openerr) (void) nfsrpc_close(vp, 0, p); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_setattr"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4) || (error == NFSERR_OPENMODE && mode == NFSV4OPEN_ACCESSREAD && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } static int nfsrpc_setattrrpc(vnode_t vp, struct vattr *vap, nfsv4stateid_t *stateidp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *rnap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_SETATTR, vp); if (nd->nd_flag & ND_NFSV4) nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); vap->va_type = vnode_vtype(vp); nfscl_fillsattr(nd, vap, vp, NFSSATTR_FULL, 0); if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = newnfs_false; } else if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) error = nfscl_wcc_data(nd, vp, rnap, attrflagp, NULL, stuff); if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4 && !error) error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (!(nd->nd_flag & ND_NFSV3) && !nd->nd_repstat && !error) error = nfscl_postop_attr(nd, rnap, attrflagp, stuff); m_freem(nd->nd_mrep); if (nd->nd_repstat && !error) error = nd->nd_repstat; return (error); } /* * nfs lookup rpc */ int nfsrpc_lookup(vnode_t dvp, char *name, int len, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; struct nfsnode *np; struct nfsfh *nfhp; nfsattrbit_t attrbits; int error = 0, lookupp = 0; *attrflagp = 0; *dattrflagp = 0; if (vnode_vtype(dvp) != VDIR) return (ENOTDIR); nmp = VFSTONFS(dvp->v_mount); if (len > NFS_MAXNAMLEN) return (ENAMETOOLONG); if (NFSHASNFSV4(nmp) && len == 1 && name[0] == '.') { /* * Just return the current dir's fh. 
*/ np = VTONFS(dvp); nfhp = malloc(sizeof (struct nfsfh) + np->n_fhp->nfh_len, M_NFSFH, M_WAITOK); nfhp->nfh_len = np->n_fhp->nfh_len; NFSBCOPY(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len); *nfhpp = nfhp; return (0); } if (NFSHASNFSV4(nmp) && len == 2 && name[0] == '.' && name[1] == '.') { lookupp = 1; NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, dvp); } else { NFSCL_REQSTART(nd, NFSPROC_LOOKUP, dvp); (void) nfsm_strtom(nd, name, len); } if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, dvp, p, cred, stuff); if (error) return (error); if (nd->nd_repstat) { /* * When an NFSv4 Lookupp returns ENOENT, it means that * the lookup is at the root of an fs, so return this dir. */ if (nd->nd_repstat == NFSERR_NOENT && lookupp) { np = VTONFS(dvp); nfhp = malloc(sizeof (struct nfsfh) + np->n_fhp->nfh_len, M_NFSFH, M_WAITOK); nfhp->nfh_len = np->n_fhp->nfh_len; NFSBCOPY(np->n_fhp->nfh_fh, nfhp->nfh_fh, nfhp->nfh_len); *nfhpp = nfhp; m_freem(nd->nd_mrep); return (0); } if (nd->nd_flag & ND_NFSV3) error = nfscl_postop_attr(nd, dnap, dattrflagp, stuff); else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); if (error == 0) *dattrflagp = 1; } goto nfsmout; } if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); if (error != 0) goto nfsmout; *dattrflagp = 1; /* Skip over the Lookup and GetFH operation status values. */ NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); } error = nfsm_getfh(nd, nfhpp); if (error) goto nfsmout; error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if ((nd->nd_flag & ND_NFSV3) && !error) error = nfscl_postop_attr(nd, dnap, dattrflagp, stuff); nfsmout: m_freem(nd->nd_mrep); if (!error && nd->nd_repstat) error = nd->nd_repstat; return (error); } /* * Do a readlink rpc. */ int nfsrpc_readlink(vnode_t vp, struct uio *uiop, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsnode *np = VTONFS(vp); nfsattrbit_t attrbits; int error, len, cangetattr = 1; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_READLINK, vp); if (nd->nd_flag & ND_NFSV4) { /* * And do a Getattr op. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (!nd->nd_repstat && !error) { NFSM_STRSIZ(len, NFS_MAXPATHLEN); /* * This seems weird to me, but must have been added to * FreeBSD for some reason. The only thing I can think of * is that there was/is some server that replies with * more link data than it should? */ if (len == NFS_MAXPATHLEN) { NFSLOCKNODE(np); if (np->n_size > 0 && np->n_size < NFS_MAXPATHLEN) { len = np->n_size; cangetattr = 0; } NFSUNLOCKNODE(np); } error = nfsm_mbufuio(nd, uiop, len); if ((nd->nd_flag & ND_NFSV4) && !error && cangetattr) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Read operation. 
*/ int nfsrpc_read(vnode_t vp, struct uio *uiop, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { int error, expireret = 0, retrycnt; u_int32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np = VTONFS(vp); struct ucred *newcred; struct nfsfh *nfhp = NULL; nfsv4stateid_t stateid; void *lckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; newcred = cred; if (NFSHASNFSV4(nmp)) { nfhp = np->n_fhp; newcred = NFSNEWCRED(cred); } retrycnt = 0; do { lckp = NULL; if (NFSHASNFSV4(nmp)) (void)nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSREAD, 0, newcred, p, &stateid, &lckp); error = nfsrpc_readrpc(vp, uiop, newcred, &stateid, p, nap, attrflagp, stuff); if (error == NFSERR_OPENMODE) { NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_OPENMODE; NFSUNLOCKMNT(nmp); } if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_read"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4) || (error == NFSERR_OPENMODE && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; if (NFSHASNFSV4(nmp)) NFSFREECRED(newcred); return (error); } /* * The actual read RPC. */ static int nfsrpc_readrpc(vnode_t vp, struct uio *uiop, struct ucred *cred, nfsv4stateid_t *stateidp, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; int error = 0, len, retlen, tsiz, eof = 0; struct nfsrv_descript nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsrv_descript *nd = &nfsd; int rsize; off_t tmp_off; *attrflagp = 0; tsiz = uiop->uio_resid; tmp_off = uiop->uio_offset + tsiz; NFSLOCKMNT(nmp); if (tmp_off > nmp->nm_maxfilesize || tmp_off < uiop->uio_offset) { NFSUNLOCKMNT(nmp); return (EFBIG); } rsize = nmp->nm_rsize; NFSUNLOCKMNT(nmp); nd->nd_mrep = NULL; while (tsiz > 0) { *attrflagp = 0; len = (tsiz > rsize) ? rsize : tsiz; NFSCL_REQSTART(nd, NFSPROC_READ, vp); if (nd->nd_flag & ND_NFSV4) nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED * 3); if (nd->nd_flag & ND_NFSV2) { *tl++ = txdr_unsigned(uiop->uio_offset); *tl++ = txdr_unsigned(len); *tl = 0; } else { txdr_hyper(uiop->uio_offset, tl); *(tl + 2) = txdr_unsigned(len); } /* * Since I can't do a Getattr for NFSv4 for Write, there * doesn't seem any point in doing one here, either. * (See the comment in nfsrpc_writerpc() for more info.) 
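The surrounding nfsrpc_readrpc() loop splits the transfer into READ operations of at most nm_rsize bytes and stops early when the server signals EOF or returns a short read. Reduced to its essentials, with a hypothetical read_chunk() standing in for the build/send/parse steps above and below:

    tsiz = uiop->uio_resid;
    while (tsiz > 0) {
            len = (tsiz > rsize) ? rsize : tsiz;    /* one rsize-sized chunk */
            retlen = read_chunk(uiop, len, &eof);   /* hypothetical          */
            tsiz -= retlen;
            if (eof || retlen == 0)                 /* end of file reached   */
                    tsiz = 0;
    }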
*/ error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); } else if (!nd->nd_repstat && (nd->nd_flag & ND_NFSV2)) { error = nfsm_loadattr(nd, nap); if (!error) *attrflagp = 1; } if (nd->nd_repstat || error) { if (!error) error = nd->nd_repstat; goto nfsmout; } if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); eof = fxdr_unsigned(int, *(tl + 1)); } else if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); eof = fxdr_unsigned(int, *tl); } NFSM_STRSIZ(retlen, len); error = nfsm_mbufuio(nd, uiop, retlen); if (error) goto nfsmout; m_freem(nd->nd_mrep); nd->nd_mrep = NULL; tsiz -= retlen; if (!(nd->nd_flag & ND_NFSV2)) { if (eof || retlen == 0) tsiz = 0; } else if (retlen < len) tsiz = 0; } return (0); nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); return (error); } /* * nfs write operation * When called_from_strategy != 0, it should return EIO for an error that * indicates recovery is in progress, so that the buffer will be left * dirty and be written back to the server later. If it loops around, * the recovery thread could get stuck waiting for the buffer and recovery * will then deadlock. */ int nfsrpc_write(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff, int called_from_strategy) { int error, expireret = 0, retrycnt, nostateid; u_int32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np = VTONFS(vp); struct ucred *newcred; struct nfsfh *nfhp = NULL; nfsv4stateid_t stateid; void *lckp; *must_commit = 0; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; newcred = cred; if (NFSHASNFSV4(nmp)) { newcred = NFSNEWCRED(cred); nfhp = np->n_fhp; } retrycnt = 0; do { lckp = NULL; nostateid = 0; if (NFSHASNFSV4(nmp)) { (void)nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0, newcred, p, &stateid, &lckp); if (stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { nostateid = 1; NFSCL_DEBUG(1, "stateid0 in write\n"); } } /* * If there is no stateid for NFSv4, it means this is an * extraneous write after close. Basically a poorly * implemented buffer cache. Just don't do the write. 
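The "no stateid" case just described is recognized a few lines earlier by an all-zero "other" field in the stateid; written as a stand-alone predicate (a hypothetical helper, shown only for clarity) it would be:

    static bool
    nfsv4_stateid_is_zero(const nfsv4stateid_t *sp)
    {

            return (sp->other[0] == 0 && sp->other[1] == 0 &&
                sp->other[2] == 0);
    }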
*/ if (nostateid) error = 0; else error = nfsrpc_writerpc(vp, uiop, iomode, must_commit, newcred, &stateid, p, nap, attrflagp, stuff); if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_write"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_DELAY || ((error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER) && called_from_strategy == 0) || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error != 0 && (retrycnt >= 4 || ((error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER) && called_from_strategy != 0))) error = EIO; if (NFSHASNFSV4(nmp)) NFSFREECRED(newcred); return (error); } /* * The actual write RPC. */ static int nfsrpc_writerpc(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, struct ucred *cred, nfsv4stateid_t *stateidp, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np = VTONFS(vp); int error = 0, len, tsiz, rlen, commit, committed = NFSWRITE_FILESYNC; int wccflag = 0, wsize; int32_t backup; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; off_t tmp_off; KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1")); *attrflagp = 0; tsiz = uiop->uio_resid; tmp_off = uiop->uio_offset + tsiz; NFSLOCKMNT(nmp); if (tmp_off > nmp->nm_maxfilesize || tmp_off < uiop->uio_offset) { NFSUNLOCKMNT(nmp); return (EFBIG); } wsize = nmp->nm_wsize; NFSUNLOCKMNT(nmp); nd->nd_mrep = NULL; /* NFSv2 sometimes does a write with */ nd->nd_repstat = 0; /* uio_resid == 0, so the while is not done */ while (tsiz > 0) { *attrflagp = 0; len = (tsiz > wsize) ? wsize : tsiz; NFSCL_REQSTART(nd, NFSPROC_WRITE, vp); if (nd->nd_flag & ND_NFSV4) { nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER+2*NFSX_UNSIGNED); txdr_hyper(uiop->uio_offset, tl); tl += 2; *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); } else if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER+3*NFSX_UNSIGNED); txdr_hyper(uiop->uio_offset, tl); tl += 2; *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); } else { u_int32_t x; NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED); /* * Not sure why someone changed this, since the * RFC clearly states that "beginoffset" and * "totalcount" are ignored, but it wouldn't * surprise me if there's a busted server out there. */ /* Set both "begin" and "current" to non-garbage. */ x = txdr_unsigned((u_int32_t)uiop->uio_offset); *tl++ = x; /* "begin offset" */ *tl++ = x; /* "current offset" */ x = txdr_unsigned(len); *tl++ = x; /* total to this offset */ *tl = x; /* size of this write */ } nfsm_uiombuf(nd, uiop, len); /* * Although it is tempting to do a normal Getattr Op in the * NFSv4 compound, the result can be a nearly hung client * system if the Getattr asks for Owner and/or OwnerGroup. 
* It occurs when the client can't map either the Owner or * Owner_group name in the Getattr reply to a uid/gid. When * there is a cache miss, the kernel does an upcall to the * nfsuserd. Then, it can try and read the local /etc/passwd * or /etc/group file. It can then block in getnewbuf(), * waiting for dirty writes to be pushed to the NFS server. * The only reason this doesn't result in a complete * deadlock, is that the upcall times out and allows * the write to complete. However, progress is so slow * that it might just as well be deadlocked. * As such, we get the rest of the attributes, but not * Owner or Owner_group. * nb: nfscl_loadattrcache() needs to be told that these * partial attributes from a write rpc are being * passed in, via a argument flag. */ if (nd->nd_flag & ND_NFSV4) { NFSWRITEGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_repstat) { /* * In case the rpc gets retried, roll - * the uio fileds changed by nfsm_uiombuf() + * the uio fields changed by nfsm_uiombuf() * back. */ uiop->uio_offset -= len; uiop->uio_resid += len; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - len; uiop->uio_iov->iov_len += len; } if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { error = nfscl_wcc_data(nd, vp, nap, attrflagp, &wccflag, stuff); if (error) goto nfsmout; } if (!nd->nd_repstat) { if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); if (rlen == 0) { error = NFSERR_IO; goto nfsmout; } else if (rlen < len) { backup = len - rlen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup; uiop->uio_iov->iov_len += backup; uiop->uio_offset -= backup; uiop->uio_resid += backup; len = rlen; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest commitment level * obtained by any of the RPCs. */ if (committed == NFSWRITE_FILESYNC) committed = commit; else if (committed == NFSWRITE_DATASYNC && commit == NFSWRITE_UNSTABLE) committed = commit; NFSLOCKMNT(nmp); if (!NFSHASWRITEVERF(nmp)) { NFSBCOPY((caddr_t)tl, (caddr_t)&nmp->nm_verf[0], NFSX_VERF); NFSSETWRITEVERF(nmp); } else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF)) { *must_commit = 1; NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); } NFSUNLOCKMNT(nmp); } if (nd->nd_flag & ND_NFSV4) NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (nd->nd_flag & (ND_NFSV2 | ND_NFSV4)) { error = nfsm_loadattr(nd, nap); if (!error) *attrflagp = NFS_LATTR_NOSHRINK; } } else { error = nd->nd_repstat; } if (error) goto nfsmout; NFSWRITERPC_SETTIME(wccflag, np, nap, (nd->nd_flag & ND_NFSV4)); m_freem(nd->nd_mrep); nd->nd_mrep = NULL; tsiz -= len; } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); *iomode = committed; if (nd->nd_repstat && !error) error = nd->nd_repstat; return (error); } /* * nfs mknod rpc * For NFS v2 this is a kludge. Use a create rpc but with the IFMT bits of the * mode set to specify the file type and the size field for rdev. 
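 *
 * As a sketch of the V2 encoding: the mknod of a character or block
 * device goes out as a CREATE whose sattr carries the file type in the
 * IFMT bits of the mode and the packed device number in the size field;
 * the code below arranges this by handing NFSSATTR_SIZERDEV and rdev to
 * nfscl_fillsattr().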
*/ int nfsrpc_mknod(vnode_t dvp, char *name, int namelen, struct vattr *vap, u_int32_t rdev, enum vtype vtyp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; int error = 0; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; *nfhpp = NULL; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_MKNOD, dvp); if (nd->nd_flag & ND_NFSV4) { if (vtyp == VBLK || vtyp == VCHR) { NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = vtonfsv34_type(vtyp); *tl++ = txdr_unsigned(NFSMAJOR(rdev)); *tl = txdr_unsigned(NFSMINOR(rdev)); } else { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = vtonfsv34_type(vtyp); } } (void) nfsm_strtom(nd, name, namelen); if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = vtonfsv34_type(vtyp); } if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) nfscl_fillsattr(nd, vap, dvp, 0, 0); if ((nd->nd_flag & ND_NFSV3) && (vtyp == VCHR || vtyp == VBLK)) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSMAJOR(rdev)); *tl = txdr_unsigned(NFSMINOR(rdev)); } if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); } if (nd->nd_flag & ND_NFSV2) nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZERDEV, rdev); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & ND_NFSV4) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (!nd->nd_repstat) { if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error) goto nfsmout; } error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (error) goto nfsmout; } if (nd->nd_flag & ND_NFSV3) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (!error && nd->nd_repstat) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs file create call * Mostly just call the approriate routine. (I separated out v4, so that * error recovery wouldn't be as difficult.) 
*/ int nfsrpc_create(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { int error = 0, newone, expireret = 0, retrycnt, unlocked; struct nfsclowner *owp; struct nfscldeleg *dp; struct nfsmount *nmp = VFSTONFS(dvp->v_mount); u_int32_t clidrev; if (NFSHASNFSV4(nmp)) { retrycnt = 0; do { dp = NULL; error = nfscl_open(dvp, NULL, 0, (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0, cred, p, &owp, NULL, &newone, NULL, 1); if (error) return (error); if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; else clidrev = 0; if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 || nfs_numnfscbd == 0 || retrycnt > 0) error = nfsrpc_createv4(dvp, name, namelen, vap, cverf, fmode, owp, &dp, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp, dstuff, &unlocked); else error = nfsrpc_getcreatelayout(dvp, name, namelen, vap, cverf, fmode, owp, &dp, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp, dstuff, &unlocked); /* * There is no need to invalidate cached attributes here, * since new post-delegation issue attributes are always * returned by nfsrpc_createv4() and these will update the * attribute cache. */ if (dp != NULL) (void) nfscl_deleg(nmp->nm_mountp, owp->nfsow_clp, (*nfhpp)->nfh_fh, (*nfhpp)->nfh_len, cred, p, &dp); nfscl_ownerrelease(nmp, owp, error, newone, unlocked); if (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_open"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); retrycnt++; } } while (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; } else { error = nfsrpc_createv23(dvp, name, namelen, vap, cverf, fmode, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp, dstuff); } return (error); } /* * The create rpc for v2 and 3. 
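 *
 * For a V3 exclusive create (O_EXCL below), the 8 byte create verifier is
 * sent in place of the attributes:
 *	how  = NFSCREATE_EXCLUSIVE
 *	verf = { cverf.lval[0], cverf.lval[1] }
 * so a retried CREATE carrying the same verifier is recognized by the
 * server as a repeat of the original request rather than a collision.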
*/ static int nfsrpc_createv23(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; int error = 0; struct nfsrv_descript nfsd, *nd = &nfsd; *nfhpp = NULL; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_CREATE, dvp); (void) nfsm_strtom(nd, name, namelen); if (nd->nd_flag & ND_NFSV3) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); if (fmode & O_EXCL) { *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } } else { nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZE0, 0); } error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_repstat == 0) { error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (error) goto nfsmout; } if (nd->nd_flag & ND_NFSV3) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } static int nfsrpc_createv4(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff, int *unlockedp) { u_int32_t *tl; int error = 0, deleg, newone, ret, acesize, limitby; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsclopen *op; struct nfscldeleg *dp = NULL; struct nfsnode *np; struct nfsfh *nfhp; nfsattrbit_t attrbits; nfsv4stateid_t stateid; u_int32_t rflags; struct nfsmount *nmp; struct nfsclsession *tsep; nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); *unlockedp = 0; *nfhpp = NULL; *dpp = NULL; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_CREATE, dvp); /* * For V4, this is actually an Open op. */ NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(owp->nfsow_seqid); *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; (void) nfsm_strtom(nd, owp->nfsow_owner, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_CREATE); if (fmode & O_EXCL) { if (NFSHASNFSV4N(nmp)) { if (NFSHASSESSPERSIST(nmp)) { /* Use GUARDED for persistent sessions. */ *tl = txdr_unsigned(NFSCREATE_GUARDED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } else { /* Otherwise, use EXCLUSIVE4_1. */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; nfscl_fillsattr(nd, vap, dvp, 0, 0); } } else { /* NFSv4.0 */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; } } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); (void) nfsm_strtom(nd, name, namelen); /* Get the new file's handle and attributes. 
*/ NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); /* Get the directory's post-op attributes. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); (void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); NFSCL_INCRSEQID(owp->nfsow_seqid, nd); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); stateid.seqid = *tl++; stateid.other[0] = *tl++; stateid.other[1] = *tl++; stateid.other[2] = *tl; rflags = fxdr_unsigned(u_int32_t, *(tl + 6)); (void) nfsrv_getattrbits(nd, &attrbits, NULL, NULL); NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { if (!(owp->nfsow_clp->nfsc_flags & NFSCLFLAGS_FIRSTDELEG)) owp->nfsow_clp->nfsc_flags |= (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG); dp = malloc( sizeof (struct nfscldeleg) + NFSX_V4FHMAX, M_NFSCLDELEG, M_WAITOK); LIST_INIT(&dp->nfsdl_owner); LIST_INIT(&dp->nfsdl_lock); dp->nfsdl_clp = owp->nfsow_clp; newnfs_copyincred(cred, &dp->nfsdl_cred); nfscl_lockinit(&dp->nfsdl_rwlock); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); dp->nfsdl_stateid.seqid = *tl++; dp->nfsdl_stateid.other[0] = *tl++; dp->nfsdl_stateid.other[1] = *tl++; dp->nfsdl_stateid.other[2] = *tl++; ret = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEWRITE) { dp->nfsdl_flags = NFSCLDL_WRITE; /* * Indicates how much the file can grow. */ NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); limitby = fxdr_unsigned(int, *tl++); switch (limitby) { case NFSV4OPEN_LIMITSIZE: dp->nfsdl_sizelimit = fxdr_hyper(tl); break; case NFSV4OPEN_LIMITBLOCKS: dp->nfsdl_sizelimit = fxdr_unsigned(u_int64_t, *tl++); dp->nfsdl_sizelimit *= fxdr_unsigned(u_int64_t, *tl); break; default: error = NFSERR_BADXDR; goto nfsmout; } } else { dp->nfsdl_flags = NFSCLDL_READ; } if (ret) dp->nfsdl_flags |= NFSCLDL_RECALL; error = nfsrv_dissectace(nd, &dp->nfsdl_ace, &ret, &acesize, p); if (error) goto nfsmout; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (error) goto nfsmout; /* Get rid of the PutFH and Getattr status values. */ NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); if (error) goto nfsmout; *dattrflagp = 1; if (dp != NULL && *attrflagp) { dp->nfsdl_change = nnap->na_filerev; dp->nfsdl_modtime = nnap->na_mtime; dp->nfsdl_flags |= NFSCLDL_MODTIMESET; } /* * We can now complete the Open state. */ nfhp = *nfhpp; if (dp != NULL) { dp->nfsdl_fhlen = nfhp->nfh_len; NFSBCOPY(nfhp->nfh_fh, dp->nfsdl_fh, nfhp->nfh_len); } /* * Get an Open structure that will be * attached to the OpenOwner, acquired already. 
*/ error = nfscl_open(dvp, nfhp->nfh_fh, nfhp->nfh_len, (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0, cred, p, NULL, &op, &newone, NULL, 0); if (error) goto nfsmout; op->nfso_stateid = stateid; newnfs_copyincred(cred, &op->nfso_cred); if ((rflags & NFSV4OPEN_RESULTCONFIRM)) { do { ret = nfsrpc_openconfirm(dvp, nfhp->nfh_fh, nfhp->nfh_len, op, cred, p); if (ret == NFSERR_DELAY) (void) nfs_catnap(PZERO, ret, "nfs_create"); } while (ret == NFSERR_DELAY); error = ret; } /* * If the server is handing out delegations, but we didn't * get one because an OpenConfirm was required, try the * Open again, to get a delegation. This is a harmless no-op, * from a server's point of view. */ if ((rflags & NFSV4OPEN_RESULTCONFIRM) && (owp->nfsow_clp->nfsc_flags & NFSCLFLAGS_GOTDELEG) && !error && dp == NULL) { do { ret = nfsrpc_openrpc(VFSTONFS(dvp->v_mount), dvp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, nfhp->nfh_fh, nfhp->nfh_len, (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), op, name, namelen, &dp, 0, 0x0, cred, p, 0, 1); if (ret == NFSERR_DELAY) (void) nfs_catnap(PZERO, ret, "nfs_crt2"); } while (ret == NFSERR_DELAY); if (ret) { if (dp != NULL) { free(dp, M_NFSCLDELEG); dp = NULL; } if (ret == NFSERR_STALECLIENTID || ret == NFSERR_STALEDONTRECOVER || ret == NFSERR_BADSESSION) error = ret; } } nfscl_openrelease(nmp, op, error, newone); *unlockedp = 1; } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; if (error == NFSERR_STALECLIENTID) nfscl_initiate_recovery(owp->nfsow_clp); nfsmout: if (!error) *dpp = dp; else if (dp != NULL) free(dp, M_NFSCLDELEG); m_freem(nd->nd_mrep); return (error); } /* * Nfs remove rpc */ int nfsrpc_remove(vnode_t dvp, char *name, int namelen, vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp, void *dstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsnode *np; struct nfsmount *nmp; nfsv4stateid_t dstateid; int error, ret = 0, i; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); nmp = VFSTONFS(dvp->v_mount); tryagain: if (NFSHASNFSV4(nmp) && ret == 0) { ret = nfscl_removedeleg(vp, p, &dstateid); if (ret == 1) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGREMOVE, vp); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = dstateid.seqid; *tl++ = dstateid.other[0]; *tl++ = dstateid.other[1]; *tl++ = dstateid.other[2]; *tl = txdr_unsigned(NFSV4OP_PUTFH); np = VTONFS(dvp); (void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_REMOVE); } } else { ret = 0; } if (ret == 0) NFSCL_REQSTART(nd, NFSPROC_REMOVE, dvp); (void) nfsm_strtom(nd, name, namelen); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { /* For NFSv4, parse out any Delereturn replies. */ if (ret > 0 && nd->nd_repstat != 0 && (nd->nd_flag & ND_NOMOREDATA)) { /* * If the Delegreturn failed, try again without * it. The server will Recall, as required. */ m_freem(nd->nd_mrep); goto tryagain; } for (i = 0; i < (ret * 2); i++) { if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; } } error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Do an nfs rename rpc. 
*/ int nfsrpc_rename(vnode_t fdvp, vnode_t fvp, char *fnameptr, int fnamelen, vnode_t tdvp, vnode_t tvp, char *tnameptr, int tnamelen, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *fnap, struct nfsvattr *tnap, int *fattrflagp, int *tattrflagp, void *fstuff, void *tstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; struct nfsnode *np; nfsattrbit_t attrbits; nfsv4stateid_t fdstateid, tdstateid; int error = 0, ret = 0, gottd = 0, gotfd = 0, i; *fattrflagp = 0; *tattrflagp = 0; nmp = VFSTONFS(fdvp->v_mount); if (fnamelen > NFS_MAXNAMLEN || tnamelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); tryagain: if (NFSHASNFSV4(nmp) && ret == 0) { ret = nfscl_renamedeleg(fvp, &fdstateid, &gotfd, tvp, &tdstateid, &gottd, p); if (gotfd && gottd) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME2, fvp); } else if (gotfd) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME1, fvp); } else if (gottd) { NFSCL_REQSTART(nd, NFSPROC_RETDELEGRENAME1, tvp); } if (gotfd) { NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = fdstateid.seqid; *tl++ = fdstateid.other[0]; *tl++ = fdstateid.other[1]; *tl = fdstateid.other[2]; if (gottd) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); np = VTONFS(tvp); (void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_DELEGRETURN); } } if (gottd) { NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = tdstateid.seqid; *tl++ = tdstateid.other[0]; *tl++ = tdstateid.other[1]; *tl = tdstateid.other[2]; } if (ret > 0) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); np = VTONFS(fdvp); (void) nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_SAVEFH); } } else { ret = 0; } if (ret == 0) NFSCL_REQSTART(nd, NFSPROC_RENAME, fdvp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSWCCATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); (void) nfsm_fhtom(nd, VTONFS(tdvp)->n_fhp->nfh_fh, VTONFS(tdvp)->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_V4WCCATTR; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_RENAME); } (void) nfsm_strtom(nd, fnameptr, fnamelen); if (!(nd->nd_flag & ND_NFSV4)) (void) nfsm_fhtom(nd, VTONFS(tdvp)->n_fhp->nfh_fh, VTONFS(tdvp)->n_fhp->nfh_len, 0); (void) nfsm_strtom(nd, tnameptr, tnamelen); error = nfscl_request(nd, fdvp, p, cred, fstuff); if (error) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) { /* For NFSv4, parse out any Delereturn replies. */ if (ret > 0 && nd->nd_repstat != 0 && (nd->nd_flag & ND_NOMOREDATA)) { /* * If the Delegreturn failed, try again without * it. The server will Recall, as required. */ m_freem(nd->nd_mrep); goto tryagain; } for (i = 0; i < (ret * 2); i++) { if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) { if (i == 0 && ret > 1) { /* * If the Delegreturn failed, try again * without it. The server will Recall, as * required. * If ret > 1, the first iteration of this * loop is the second DelegReturn result. 
*/ m_freem(nd->nd_mrep); goto tryagain; } else { nd->nd_flag |= ND_NOMOREDATA; } } } } /* Now, the first wcc attribute reply. */ if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; } error = nfscl_wcc_data(nd, fdvp, fnap, fattrflagp, NULL, fstuff); /* and the second wcc attribute reply. */ if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4 && !error) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; } if (!error) error = nfscl_wcc_data(nd, tdvp, tnap, tattrflagp, NULL, tstuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs hard link create rpc */ int nfsrpc_link(vnode_t dvp, vnode_t vp, char *name, int namelen, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nap, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; int error = 0; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_LINK, vp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); } (void) nfsm_fhtom(nd, VTONFS(dvp)->n_fhp->nfh_fh, VTONFS(dvp)->n_fhp->nfh_len, 0); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSWCCATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_V4WCCATTR; NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_LINK); } (void) nfsm_strtom(nd, name, namelen); error = nfscl_request(nd, vp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) { error = nfscl_postop_attr(nd, nap, attrflagp, dstuff); if (!error) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); } else if ((nd->nd_flag & (ND_NFSV4 | ND_NOMOREDATA)) == ND_NFSV4) { /* * First, parse out the PutFH and Getattr result. */ NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (!(*(tl + 1))) NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1)) nd->nd_flag |= ND_NOMOREDATA; /* * Get the pre-op attributes. 
*/ error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs symbolic link create rpc */ int nfsrpc_symlink(vnode_t dvp, char *name, int namelen, const char *target, struct vattr *vap, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; int slen, error = 0; *nfhpp = NULL; *attrflagp = 0; *dattrflagp = 0; nmp = VFSTONFS(dvp->v_mount); slen = strlen(target); if (slen > NFS_MAXPATHLEN || namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_SYMLINK, dvp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFLNK); (void) nfsm_strtom(nd, target, slen); } (void) nfsm_strtom(nd, name, namelen); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) nfscl_fillsattr(nd, vap, dvp, 0, 0); if (!(nd->nd_flag & ND_NFSV4)) (void) nfsm_strtom(nd, target, slen); if (nd->nd_flag & ND_NFSV2) nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & ND_NFSV4) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if ((nd->nd_flag & ND_NFSV3) && !error) { if (!nd->nd_repstat) error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (!error) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); } if (nd->nd_repstat && !error) error = nd->nd_repstat; m_freem(nd->nd_mrep); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. * Only do this if vfs.nfs.ignore_eexist is set. * Never do this for NFSv4.1 or later minor versions, since sessions * should guarantee "exactly once" RPC semantics. 
*/ if (error == EEXIST && nfsignore_eexist != 0 && (!NFSHASNFSV4(nmp) || nmp->nm_minorvers == 0)) error = 0; return (error); } /* * nfs make dir rpc */ int nfsrpc_mkdir(vnode_t dvp, char *name, int namelen, struct vattr *vap, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; int error = 0; struct nfsfh *fhp; struct nfsmount *nmp; *nfhpp = NULL; *attrflagp = 0; *dattrflagp = 0; nmp = VFSTONFS(dvp->v_mount); fhp = VTONFS(dvp)->n_fhp; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_MKDIR, dvp); if (nd->nd_flag & ND_NFSV4) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFDIR); } (void) nfsm_strtom(nd, name, namelen); nfscl_fillsattr(nd, vap, dvp, NFSSATTR_SIZENEG1, 0); if (nd->nd_flag & ND_NFSV4) { NFSGETATTR_ATTRBIT(&attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); (void) nfsm_fhtom(nd, fhp->nfh_fh, fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & ND_NFSV4) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (!nd->nd_repstat && !error) { if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); } if (!error) error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); if (error == 0 && (nd->nd_flag & ND_NFSV4) != 0) { /* Get rid of the PutFH and Getattr status values. */ NFSM_DISSECT(tl, u_int32_t *, 4 * NFSX_UNSIGNED); /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); if (error == 0) *dattrflagp = 1; } } if ((nd->nd_flag & ND_NFSV3) && !error) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (nd->nd_repstat && !error) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); /* * Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. * Only do this if vfs.nfs.ignore_eexist is set. * Never do this for NFSv4.1 or later minor versions, since sessions * should guarantee "exactly once" RPC semantics. */ if (error == EEXIST && nfsignore_eexist != 0 && (!NFSHASNFSV4(nmp) || nmp->nm_minorvers == 0)) error = 0; return (error); } /* * nfs remove directory call */ int nfsrpc_rmdir(vnode_t dvp, char *name, int namelen, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, int *dattrflagp, void *dstuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_RMDIR, dvp); (void) nfsm_strtom(nd, name, namelen); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error) return (error); if (nd->nd_flag & (ND_NFSV3 | ND_NFSV4)) error = nfscl_wcc_data(nd, dvp, dnap, dattrflagp, NULL, dstuff); if (nd->nd_repstat && !error) error = nd->nd_repstat; m_freem(nd->nd_mrep); /* * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. */ if (error == ENOENT) error = 0; return (error); } /* * Readdir rpc. * Always returns with either uio_resid unchanged, if you are at the * end of the directory, or uio_resid == 0, with all DIRBLKSIZ chunks * filled in. 
* I felt this would allow caching of directory blocks more easily * than returning a pertially filled block. * Directory offset cookies: * Oh my, what to do with them... * I can think of three ways to deal with them: * 1 - have the layer above these RPCs maintain a map between logical * directory byte offsets and the NFS directory offset cookies * 2 - pass the opaque directory offset cookies up into userland * and let the libc functions deal with them, via the system call * 3 - return them to userland in the "struct dirent", so future versions * of libc can use them and do whatever is necessary to make things work * above these rpc calls, in the meantime * For now, I do #3 by "hiding" the directory offset cookies after the * d_name field in struct dirent. This is space inside d_reclen that * will be ignored by anything that doesn't know about them. * The directory offset cookies are filled in as the last 8 bytes of * each directory entry, after d_name. Someday, the userland libc * functions may be able to use these. In the meantime, it satisfies * OpenBSD's requirements for cookies being returned. * If expects the directory offset cookie for the read to be in uio_offset * and returns the one for the next entry after this directory block in * there, as well. */ int nfsrpc_readdir(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, int *eofp, void *stuff) { int len, left; struct dirent *dp = NULL; u_int32_t *tl; nfsquad_t cookie, ncookie; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp); struct nfsvattr nfsva; struct nfsrv_descript nfsd, *nd = &nfsd; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int reqsize, tryformoredirs = 1, readsize, eof = 0, gotmnton = 0; u_int64_t dotfileid, dotdotfileid = 0, fakefileno = UINT64_MAX; char *cp; nfsattrbit_t attrbits, dattrbits; u_int32_t rderr, *tl2 = NULL; size_t tresid; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirrpc bad uio")); ncookie.lval[0] = ncookie.lval[1] = 0; /* * There is no point in reading a lot more than uio_resid, however * adding one additional DIRBLKSIZ makes sense. Since uio_resid * and nm_readdirsize are both exact multiples of DIRBLKSIZ, this * will never make readsize > nm_readdirsize. */ readsize = nmp->nm_readdirsize; if (readsize > uiop->uio_resid) readsize = uiop->uio_resid + DIRBLKSIZ; *attrflagp = 0; if (eofp) *eofp = 0; tresid = uiop->uio_resid; cookie.lval[0] = cookiep->nfsuquad[0]; cookie.lval[1] = cookiep->nfsuquad[1]; nd->nd_mrep = NULL; /* * For NFSv4, first create the "." and ".." entries. */ if (NFSHASNFSV4(nmp)) { reqsize = 6 * NFSX_UNSIGNED; NFSGETATTR_ATTRBIT(&dattrbits); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FILEID); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TYPE); if (NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, NFSATTRBIT_MOUNTEDONFILEID)) { NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MOUNTEDONFILEID); gotmnton = 1; } else { /* * Must fake it. Use the fileno, except when the * fsid is != to that of the directory. For that * case, generate a fake fileno that is not the same. */ NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FSID); gotmnton = 0; } /* * Joy, oh joy. For V4 we get to hand craft '.' and '..'. 
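 *
 * The two entries built below use the same record layout as the entries
 * decoded from the server reply further down: a dirent header, the name
 * NUL padded out to an 8 byte boundary and then the 8 byte directory
 * offset cookie in the last NFSX_HYPER bytes of d_reclen (0 for "." and
 * ".."). A minimal sketch of how a hypothetical consumer could read the
 * cookie back out of such a record, treating it as opaque bytes:
 *
 *	uint64_t cookie;
 *
 *	memcpy(&cookie, (char *)dp + dp->d_reclen - NFSX_HYPER,
 *	    sizeof(cookie));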
*/ if (uiop->uio_offset == 0) { NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, vp); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); dotfileid = 0; /* Fake out the compiler. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { error = nfsm_loadattr(nd, &nfsva); if (error != 0) goto nfsmout; dotfileid = nfsva.na_fileid; } if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); len = fxdr_unsigned(int, *(tl + 4)); if (len > 0 && len <= NFSX_V4FHMAX) error = nfsm_advance(nd, NFSM_RNDUP(len), -1); else error = EPERM; if (!error) { NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED); nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (error) { dotdotfileid = dotfileid; } else if (gotmnton) { if (nfsva.na_mntonfileno != UINT64_MAX) dotdotfileid = nfsva.na_mntonfileno; else dotdotfileid = nfsva.na_fileid; } else if (nfsva.na_filesid[0] == dnp->n_vattr.na_filesid[0] && nfsva.na_filesid[1] == dnp->n_vattr.na_filesid[1]) { dotdotfileid = nfsva.na_fileid; } else { do { fakefileno--; } while (fakefileno == nfsva.na_fileid); dotdotfileid = fakefileno; } } } else if (nd->nd_repstat == NFSERR_NOENT) { /* * Lookupp returns NFSERR_NOENT when we are * at the root, so just use the current dir. */ nd->nd_repstat = 0; dotdotfileid = dotfileid; } else { error = nd->nd_repstat; } m_freem(nd->nd_mrep); if (error) return (error); nd->nd_mrep = NULL; dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_type = DT_DIR; dp->d_fileno = dotfileid; dp->d_namlen = 1; *((uint64_t *)dp->d_name) = 0; /* Zero pad it. */ dp->d_name[0] = '.'; dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER; /* * Just make these offset cookie 0. */ tl = (u_int32_t *)&dp->d_name[8]; *tl++ = 0; *tl = 0; blksiz += dp->d_reclen; uiop->uio_resid -= dp->d_reclen; uiop->uio_offset += dp->d_reclen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + dp->d_reclen; uiop->uio_iov->iov_len -= dp->d_reclen; dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_type = DT_DIR; dp->d_fileno = dotdotfileid; dp->d_namlen = 2; *((uint64_t *)dp->d_name) = 0; dp->d_name[0] = '.'; dp->d_name[1] = '.'; dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER; /* * Just make these offset cookie 0. */ tl = (u_int32_t *)&dp->d_name[8]; *tl++ = 0; *tl = 0; blksiz += dp->d_reclen; uiop->uio_resid -= dp->d_reclen; uiop->uio_offset += dp->d_reclen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + dp->d_reclen; uiop->uio_iov->iov_len -= dp->d_reclen; } NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_RDATTRERROR); } else { reqsize = 5 * NFSX_UNSIGNED; } /* * Loop around doing readdir rpc's of size readsize. * The stopping criteria is EOF or buffer full. 
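 *
 * readsize was clamped above to at most uio_resid + DIRBLKSIZ: when
 * nm_readdirsize > uio_resid and both are exact multiples of DIRBLKSIZ,
 * then nm_readdirsize >= uio_resid + DIRBLKSIZ, so the clamped value can
 * never exceed nm_readdirsize.  For example, with uio_resid == 2 *
 * DIRBLKSIZ and a larger nm_readdirsize, each RPC asks for 3 * DIRBLKSIZ
 * bytes.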
*/ while (more_dirs && bigenough) { *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_READDIR, vp); if (nd->nd_flag & ND_NFSV2) { NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = cookie.lval[1]; *tl = txdr_unsigned(readsize); } else { NFSM_BUILD(tl, u_int32_t *, reqsize); *tl++ = cookie.lval[0]; *tl++ = cookie.lval[1]; if (cookie.qval == 0) { *tl++ = 0; *tl++ = 0; } else { NFSLOCKNODE(dnp); *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; NFSUNLOCKNODE(dnp); } if (nd->nd_flag & ND_NFSV4) { *tl++ = txdr_unsigned(readsize); *tl = txdr_unsigned(readsize); (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &dattrbits); } else { *tl = txdr_unsigned(readsize); } } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (!(nd->nd_flag & ND_NFSV2)) { if (nd->nd_flag & ND_NFSV3) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (!nd->nd_repstat && !error) { NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER); NFSLOCKNODE(dnp); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl; NFSUNLOCKNODE(dnp); } } if (nd->nd_repstat || error) { if (!error) error = nd->nd_repstat; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); if (!more_dirs) tryformoredirs = 0; /* loop through the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { if (nd->nd_flag & ND_NFSV4) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); ncookie.lval[0] = *tl++; ncookie.lval[1] = *tl++; len = fxdr_unsigned(int, *tl); } else if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); nfsva.na_fileid = fxdr_hyper(tl); tl += 2; len = fxdr_unsigned(int, *tl); } else { NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED); nfsva.na_fileid = fxdr_unsigned(uint64_t, *tl++); len = fxdr_unsigned(int, *tl); } if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; goto nfsmout; } tlen = roundup2(len, 8); if (tlen == len) tlen += 8; /* To ensure null termination. 
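 * For example, a 5 character name gets tlen 8 (three NUL pad bytes) while
 * an 8 character name gets tlen 16, so there is always at least one NUL
 * between the end of d_name and the 8 byte cookie stored after it.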
*/ left = DIRBLKSIZ - blksiz; if (_GENERIC_DIRLEN(len) + NFSX_HYPER > left) { NFSBZERO(uiop->uio_iov->iov_base, left); dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_resid -= left; uiop->uio_offset += left; blksiz = 0; } if (_GENERIC_DIRLEN(len) + NFSX_HYPER > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_namlen = len; dp->d_reclen = _GENERIC_DIRLEN(len) + NFSX_HYPER; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_resid -= DIRHDSIZ; uiop->uio_offset += DIRHDSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; error = nfsm_mbufuio(nd, uiop, len); if (error) goto nfsmout; cp = uiop->uio_iov->iov_base; tlen -= len; NFSBZERO(cp, tlen); cp += tlen; /* points to cookie storage */ tl2 = (u_int32_t *)cp; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + tlen + NFSX_HYPER; uiop->uio_iov->iov_len -= tlen + NFSX_HYPER; uiop->uio_resid -= tlen + NFSX_HYPER; uiop->uio_offset += (tlen + NFSX_HYPER); } else { error = nfsm_advance(nd, NFSM_RNDUP(len), -1); if (error) goto nfsmout; } if (nd->nd_flag & ND_NFSV4) { rderr = 0; nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, &rderr, p, cred); if (error) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); } else if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); ncookie.lval[0] = *tl++; ncookie.lval[1] = *tl++; } else { NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED); ncookie.lval[0] = 0; ncookie.lval[1] = *tl++; } if (bigenough) { if (nd->nd_flag & ND_NFSV4) { if (rderr) { dp->d_fileno = 0; } else { if (gotmnton) { if (nfsva.na_mntonfileno != UINT64_MAX) dp->d_fileno = nfsva.na_mntonfileno; else dp->d_fileno = nfsva.na_fileid; } else if (nfsva.na_filesid[0] == dnp->n_vattr.na_filesid[0] && nfsva.na_filesid[1] == dnp->n_vattr.na_filesid[1]) { dp->d_fileno = nfsva.na_fileid; } else { do { fakefileno--; } while (fakefileno == nfsva.na_fileid); dp->d_fileno = fakefileno; } dp->d_type = vtonfs_dtype(nfsva.na_type); } } else { dp->d_fileno = nfsva.na_fileid; } *tl2++ = cookiep->nfsuquad[0] = cookie.lval[0] = ncookie.lval[0]; *tl2 = cookiep->nfsuquad[1] = cookie.lval[1] = ncookie.lval[1]; } more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); eof = fxdr_unsigned(int, *tl); if (tryformoredirs) more_dirs = !eof; if (nd->nd_flag & ND_NFSV4) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error) goto nfsmout; } } m_freem(nd->nd_mrep); nd->nd_mrep = NULL; } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; NFSBZERO(uiop->uio_iov->iov_base, left); dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_resid -= left; uiop->uio_offset += left; } /* * If returning no data, assume end of file. * If not bigenough, return not end of file, since you aren't * returning all the data * Otherwise, return the eof flag from the server. 
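 * i.e. the checks below reduce to:
 *	tresid == uio_resid (nothing was copied out)	-> *eofp = 1
 *	!bigenough (this buffer filled up first)	-> *eofp = 0
 *	otherwise					-> *eofp = eof from the reply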
*/ if (eofp) { if (tresid == ((size_t)(uiop->uio_resid))) *eofp = 1; else if (!bigenough) *eofp = 0; else *eofp = eof; } /* * Add extra empty records to any remaining DIRBLKSIZ chunks. */ while (uiop->uio_resid > 0 && uiop->uio_resid != tresid) { dp = (struct dirent *)uiop->uio_iov->iov_base; NFSBZERO(dp, DIRBLKSIZ); dp->d_type = DT_UNKNOWN; tl = (u_int32_t *)&dp->d_name[4]; *tl++ = cookie.lval[0]; *tl = cookie.lval[1]; dp->d_reclen = DIRBLKSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRBLKSIZ; uiop->uio_iov->iov_len -= DIRBLKSIZ; uiop->uio_resid -= DIRBLKSIZ; uiop->uio_offset += DIRBLKSIZ; } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); return (error); } #ifndef APPLE /* * NFS V3 readdir plus RPC. Used in place of nfsrpc_readdir(). * (Also used for NFS V4 when mount flag set.) * (ditto above w.r.t. multiple of DIRBLKSIZ, etc.) */ int nfsrpc_readdirplus(vnode_t vp, struct uio *uiop, nfsuint64 *cookiep, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, int *eofp, void *stuff) { int len, left; struct dirent *dp = NULL; u_int32_t *tl; vnode_t newvp = NULLVP; struct nfsrv_descript nfsd, *nd = &nfsd; struct nameidata nami, *ndp = &nami; struct componentname *cnp = &ndp->ni_cnd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *dnp = VTONFS(vp), *np; struct nfsvattr nfsva; struct nfsfh *nfhp; nfsquad_t cookie, ncookie; int error = 0, tlen, more_dirs = 1, blksiz = 0, bigenough = 1; int attrflag, tryformoredirs = 1, eof = 0, gotmnton = 0; int isdotdot = 0, unlocknewvp = 0; u_int64_t dotfileid, dotdotfileid = 0, fakefileno = UINT64_MAX; u_int64_t fileno = 0; char *cp; nfsattrbit_t attrbits, dattrbits; size_t tresid; u_int32_t *tl2 = NULL, rderr; struct timespec dctime; KASSERT(uiop->uio_iovcnt == 1 && (uiop->uio_resid & (DIRBLKSIZ - 1)) == 0, ("nfs readdirplusrpc bad uio")); ncookie.lval[0] = ncookie.lval[1] = 0; timespecclear(&dctime); *attrflagp = 0; if (eofp != NULL) *eofp = 0; ndp->ni_dvp = vp; nd->nd_mrep = NULL; cookie.lval[0] = cookiep->nfsuquad[0]; cookie.lval[1] = cookiep->nfsuquad[1]; tresid = uiop->uio_resid; /* * For NFSv4, first create the "." and ".." entries. */ if (NFSHASNFSV4(nmp)) { NFSGETATTR_ATTRBIT(&dattrbits); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FILEID); if (NFSISSET_ATTRBIT(&dnp->n_vattr.na_suppattr, NFSATTRBIT_MOUNTEDONFILEID)) { NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MOUNTEDONFILEID); gotmnton = 1; } else { /* * Must fake it. Use the fileno, except when the * fsid is != to that of the directory. For that * case, generate a fake fileno that is not the same. */ NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_FSID); gotmnton = 0; } /* * Joy, oh joy. For V4 we get to hand craft '.' and '..'. */ if (uiop->uio_offset == 0) { NFSCL_REQSTART(nd, NFSPROC_LOOKUPP, vp); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); dotfileid = 0; /* Fake out the compiler. 
*/ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { error = nfsm_loadattr(nd, &nfsva); if (error != 0) goto nfsmout; dctime = nfsva.na_ctime; dotfileid = nfsva.na_fileid; } if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED); len = fxdr_unsigned(int, *(tl + 4)); if (len > 0 && len <= NFSX_V4FHMAX) error = nfsm_advance(nd, NFSM_RNDUP(len), -1); else error = EPERM; if (!error) { NFSM_DISSECT(tl, u_int32_t *, 2*NFSX_UNSIGNED); nfsva.na_mntonfileno = UINT64_MAX; error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (error) { dotdotfileid = dotfileid; } else if (gotmnton) { if (nfsva.na_mntonfileno != UINT64_MAX) dotdotfileid = nfsva.na_mntonfileno; else dotdotfileid = nfsva.na_fileid; } else if (nfsva.na_filesid[0] == dnp->n_vattr.na_filesid[0] && nfsva.na_filesid[1] == dnp->n_vattr.na_filesid[1]) { dotdotfileid = nfsva.na_fileid; } else { do { fakefileno--; } while (fakefileno == nfsva.na_fileid); dotdotfileid = fakefileno; } } } else if (nd->nd_repstat == NFSERR_NOENT) { /* * Lookupp returns NFSERR_NOENT when we are * at the root, so just use the current dir. */ nd->nd_repstat = 0; dotdotfileid = dotfileid; } else { error = nd->nd_repstat; } m_freem(nd->nd_mrep); if (error) return (error); nd->nd_mrep = NULL; dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_type = DT_DIR; dp->d_fileno = dotfileid; dp->d_namlen = 1; *((uint64_t *)dp->d_name) = 0; /* Zero pad it. */ dp->d_name[0] = '.'; dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER; /* * Just make these offset cookie 0. */ tl = (u_int32_t *)&dp->d_name[8]; *tl++ = 0; *tl = 0; blksiz += dp->d_reclen; uiop->uio_resid -= dp->d_reclen; uiop->uio_offset += dp->d_reclen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + dp->d_reclen; uiop->uio_iov->iov_len -= dp->d_reclen; dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_type = DT_DIR; dp->d_fileno = dotdotfileid; dp->d_namlen = 2; *((uint64_t *)dp->d_name) = 0; dp->d_name[0] = '.'; dp->d_name[1] = '.'; dp->d_reclen = _GENERIC_DIRSIZ(dp) + NFSX_HYPER; /* * Just make these offset cookie 0. */ tl = (u_int32_t *)&dp->d_name[8]; *tl++ = 0; *tl = 0; blksiz += dp->d_reclen; uiop->uio_resid -= dp->d_reclen; uiop->uio_offset += dp->d_reclen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + dp->d_reclen; uiop->uio_iov->iov_len -= dp->d_reclen; } NFSREADDIRPLUS_ATTRBIT(&attrbits); if (gotmnton) NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_MOUNTEDONFILEID); } /* * Loop around doing readdir rpc's of size nm_readdirsize. * The stopping criteria is EOF or buffer full. 
*/ while (more_dirs && bigenough) { *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_READDIRPLUS, vp); NFSM_BUILD(tl, u_int32_t *, 6 * NFSX_UNSIGNED); *tl++ = cookie.lval[0]; *tl++ = cookie.lval[1]; if (cookie.qval == 0) { *tl++ = 0; *tl++ = 0; } else { NFSLOCKNODE(dnp); *tl++ = dnp->n_cookieverf.nfsuquad[0]; *tl++ = dnp->n_cookieverf.nfsuquad[1]; NFSUNLOCKNODE(dnp); } *tl++ = txdr_unsigned(nmp->nm_readdirsize); *tl = txdr_unsigned(nmp->nm_readdirsize); if (nd->nd_flag & ND_NFSV4) { (void) nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); (void) nfsrv_putattrbit(nd, &dattrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (nd->nd_repstat || error) { if (!error) error = nd->nd_repstat; goto nfsmout; } if ((nd->nd_flag & ND_NFSV3) != 0 && *attrflagp != 0) dctime = nap->na_ctime; NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); NFSLOCKNODE(dnp); dnp->n_cookieverf.nfsuquad[0] = *tl++; dnp->n_cookieverf.nfsuquad[1] = *tl++; NFSUNLOCKNODE(dnp); more_dirs = fxdr_unsigned(int, *tl); if (!more_dirs) tryformoredirs = 0; /* loop through the dir entries, doctoring them to 4bsd form */ while (more_dirs && bigenough) { NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); if (nd->nd_flag & ND_NFSV4) { ncookie.lval[0] = *tl++; ncookie.lval[1] = *tl++; } else { fileno = fxdr_hyper(tl); tl += 2; } len = fxdr_unsigned(int, *tl); if (len <= 0 || len > NFS_MAXNAMLEN) { error = EBADRPC; goto nfsmout; } tlen = roundup2(len, 8); if (tlen == len) tlen += 8; /* To ensure null termination. */ left = DIRBLKSIZ - blksiz; if (_GENERIC_DIRLEN(len) + NFSX_HYPER > left) { NFSBZERO(uiop->uio_iov->iov_base, left); dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_resid -= left; uiop->uio_offset += left; blksiz = 0; } if (_GENERIC_DIRLEN(len) + NFSX_HYPER > uiop->uio_resid) bigenough = 0; if (bigenough) { dp = (struct dirent *)uiop->uio_iov->iov_base; dp->d_pad0 = dp->d_pad1 = 0; dp->d_off = 0; dp->d_namlen = len; dp->d_reclen = _GENERIC_DIRLEN(len) + NFSX_HYPER; dp->d_type = DT_UNKNOWN; blksiz += dp->d_reclen; if (blksiz == DIRBLKSIZ) blksiz = 0; uiop->uio_resid -= DIRHDSIZ; uiop->uio_offset += DIRHDSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRHDSIZ; uiop->uio_iov->iov_len -= DIRHDSIZ; cnp->cn_nameptr = uiop->uio_iov->iov_base; cnp->cn_namelen = len; NFSCNHASHZERO(cnp); error = nfsm_mbufuio(nd, uiop, len); if (error) goto nfsmout; cp = uiop->uio_iov->iov_base; tlen -= len; NFSBZERO(cp, tlen); cp += tlen; /* points to cookie storage */ tl2 = (u_int32_t *)cp; if (len == 2 && cnp->cn_nameptr[0] == '.' 
&& cnp->cn_nameptr[1] == '.') isdotdot = 1; else isdotdot = 0; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + tlen + NFSX_HYPER; uiop->uio_iov->iov_len -= tlen + NFSX_HYPER; uiop->uio_resid -= tlen + NFSX_HYPER; uiop->uio_offset += (tlen + NFSX_HYPER); } else { error = nfsm_advance(nd, NFSM_RNDUP(len), -1); if (error) goto nfsmout; } nfhp = NULL; if (nd->nd_flag & ND_NFSV3) { NFSM_DISSECT(tl, u_int32_t *, 3*NFSX_UNSIGNED); ncookie.lval[0] = *tl++; ncookie.lval[1] = *tl++; attrflag = fxdr_unsigned(int, *tl); if (attrflag) { error = nfsm_loadattr(nd, &nfsva); if (error) goto nfsmout; } NFSM_DISSECT(tl,u_int32_t *,NFSX_UNSIGNED); if (*tl) { error = nfsm_getfh(nd, &nfhp); if (error) goto nfsmout; } if (!attrflag && nfhp != NULL) { free(nfhp, M_NFSFH); nfhp = NULL; } } else { rderr = 0; nfsva.na_mntonfileno = 0xffffffff; error = nfsv4_loadattr(nd, NULL, &nfsva, &nfhp, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, &rderr, p, cred); if (error) goto nfsmout; } if (bigenough) { if (nd->nd_flag & ND_NFSV4) { if (rderr) { dp->d_fileno = 0; } else if (gotmnton) { if (nfsva.na_mntonfileno != 0xffffffff) dp->d_fileno = nfsva.na_mntonfileno; else dp->d_fileno = nfsva.na_fileid; } else if (nfsva.na_filesid[0] == dnp->n_vattr.na_filesid[0] && nfsva.na_filesid[1] == dnp->n_vattr.na_filesid[1]) { dp->d_fileno = nfsva.na_fileid; } else { do { fakefileno--; } while (fakefileno == nfsva.na_fileid); dp->d_fileno = fakefileno; } } else { dp->d_fileno = fileno; } *tl2++ = cookiep->nfsuquad[0] = cookie.lval[0] = ncookie.lval[0]; *tl2 = cookiep->nfsuquad[1] = cookie.lval[1] = ncookie.lval[1]; if (nfhp != NULL) { if (NFSRV_CMPFH(nfhp->nfh_fh, nfhp->nfh_len, dnp->n_fhp->nfh_fh, dnp->n_fhp->nfh_len)) { VREF(vp); newvp = vp; unlocknewvp = 0; free(nfhp, M_NFSFH); np = dnp; } else if (isdotdot != 0) { /* * Skip doing a nfscl_nget() call for "..". * There's a race between acquiring the nfs * node here and lookups that look for the * directory being read (in the parent). * It would try to get a lock on ".." here, * owning the lock on the directory being * read. Lookup will hold the lock on ".." * and try to acquire the lock on the * directory being read. * If the directory is unlocked/relocked, * then there is a LOR with the buflock * vp is relocked. */ free(nfhp, M_NFSFH); } else { error = nfscl_nget(vp->v_mount, vp, nfhp, cnp, p, &np, NULL, LK_EXCLUSIVE); if (!error) { newvp = NFSTOV(np); unlocknewvp = 1; } } nfhp = NULL; if (newvp != NULLVP) { error = nfscl_loadattrcache(&newvp, &nfsva, NULL, NULL, 0, 0); if (error) { if (unlocknewvp) vput(newvp); else vrele(newvp); goto nfsmout; } dp->d_type = vtonfs_dtype(np->n_vattr.na_type); ndp->ni_vp = newvp; NFSCNHASH(cnp, HASHINIT); if (cnp->cn_namelen <= NCHNAMLEN && ndp->ni_dvp != ndp->ni_vp && (newvp->v_type != VDIR || dctime.tv_sec != 0)) { cache_enter_time(ndp->ni_dvp, ndp->ni_vp, cnp, &nfsva.na_ctime, newvp->v_type != VDIR ? 
NULL : &dctime); } if (unlocknewvp) vput(newvp); else vrele(newvp); newvp = NULLVP; } } } else if (nfhp != NULL) { free(nfhp, M_NFSFH); } NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); more_dirs = fxdr_unsigned(int, *tl); } /* * If at end of rpc data, get the eof boolean */ if (!more_dirs) { NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); eof = fxdr_unsigned(int, *tl); if (tryformoredirs) more_dirs = !eof; if (nd->nd_flag & ND_NFSV4) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error) goto nfsmout; } } m_freem(nd->nd_mrep); nd->nd_mrep = NULL; } /* * Fill last record, iff any, out to a multiple of DIRBLKSIZ * by increasing d_reclen for the last record. */ if (blksiz > 0) { left = DIRBLKSIZ - blksiz; NFSBZERO(uiop->uio_iov->iov_base, left); dp->d_reclen += left; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + left; uiop->uio_iov->iov_len -= left; uiop->uio_resid -= left; uiop->uio_offset += left; } /* * If returning no data, assume end of file. * If not bigenough, return not end of file, since you aren't * returning all the data * Otherwise, return the eof flag from the server. */ if (eofp != NULL) { if (tresid == uiop->uio_resid) *eofp = 1; else if (!bigenough) *eofp = 0; else *eofp = eof; } /* * Add extra empty records to any remaining DIRBLKSIZ chunks. */ while (uiop->uio_resid > 0 && uiop->uio_resid != tresid) { dp = (struct dirent *)uiop->uio_iov->iov_base; NFSBZERO(dp, DIRBLKSIZ); dp->d_type = DT_UNKNOWN; tl = (u_int32_t *)&dp->d_name[4]; *tl++ = cookie.lval[0]; *tl = cookie.lval[1]; dp->d_reclen = DIRBLKSIZ; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + DIRBLKSIZ; uiop->uio_iov->iov_len -= DIRBLKSIZ; uiop->uio_resid -= DIRBLKSIZ; uiop->uio_offset += DIRBLKSIZ; } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); return (error); } #endif /* !APPLE */ /* * Nfs commit rpc */ int nfsrpc_commit(vnode_t vp, u_quad_t offset, int cnt, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t attrbits; int error; struct nfsmount *nmp = VFSTONFS(vp->v_mount); *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_COMMIT, vp); NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); if (nd->nd_flag & ND_NFSV4) { /* * And do a Getattr op. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); } error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); error = nfscl_wcc_data(nd, vp, nap, attrflagp, NULL, stuff); if (!error && !nd->nd_repstat) { NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); NFSLOCKMNT(nmp); if (NFSBCMP(nmp->nm_verf, tl, NFSX_VERF)) { NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); nd->nd_repstat = NFSERR_STALEWRITEVERF; } NFSUNLOCKMNT(nmp); if (nd->nd_flag & ND_NFSV4) error = nfscl_postop_attr(nd, nap, attrflagp, stuff); } nfsmout: if (!error && nd->nd_repstat) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * NFS byte range lock rpc. * (Mostly just calls one of the three lower level RPC routines.) 
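 *
 * The struct flock is first converted to the 64 bit (off, len) form used
 * on the wire.  As a worked example (values purely illustrative): SEEK_END
 * with l_start == -10 and l_len == 0 against a 100 byte file becomes
 * off == 90 and len == NFS64BITSSET, i.e. a lock that runs to end of file.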
*/ int nfsrpc_advlock(vnode_t vp, off_t size, int op, struct flock *fl, int reclaim, struct ucred *cred, NFSPROC_T *p, void *id, int flags) { struct nfscllockowner *lp; struct nfsclclient *clp; struct nfsfh *nfhp; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); u_int64_t off, len; off_t start, end; u_int32_t clidrev = 0; int error = 0, newone = 0, expireret = 0, retrycnt, donelocally; int callcnt, dorpc; /* * Convert the flock structure into a start and end and do POSIX * bounds checking. */ switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: /* * Caller is responsible for adding any necessary offset * when SEEK_CUR is used. */ start = fl->l_start; off = fl->l_start; break; case SEEK_END: start = size + fl->l_start; off = size + fl->l_start; break; default: return (EINVAL); } if (start < 0) return (EINVAL); if (fl->l_len != 0) { end = start + fl->l_len - 1; if (end < start) return (EINVAL); } len = fl->l_len; if (len == 0) len = NFS64BITSSET; retrycnt = 0; do { nd->nd_repstat = 0; if (op == F_GETLK) { error = nfscl_getcl(vp->v_mount, cred, p, 1, &clp); if (error) return (error); error = nfscl_lockt(vp, clp, off, len, fl, p, id, flags); if (!error) { clidrev = clp->nfsc_clientidrev; error = nfsrpc_lockt(nd, vp, clp, off, len, fl, cred, p, id, flags); } else if (error == -1) { error = 0; } nfscl_clientrelease(clp); } else if (op == F_UNLCK && fl->l_type == F_UNLCK) { /* * We must loop around for all lockowner cases. */ callcnt = 0; error = nfscl_getcl(vp->v_mount, cred, p, 1, &clp); if (error) return (error); do { error = nfscl_relbytelock(vp, off, len, cred, p, callcnt, clp, id, flags, &lp, &dorpc); /* * If it returns a NULL lp, we're done. */ if (lp == NULL) { if (callcnt == 0) nfscl_clientrelease(clp); else nfscl_releasealllocks(clp, vp, p, id, flags); return (error); } if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; else clidrev = 0; /* * If the server doesn't support Posix lock semantics, * only allow locks on the entire file, since it won't * handle overlapping byte ranges. * There might still be a problem when a lock * upgrade/downgrade (read<->write) occurs, since the * server "might" expect an unlock first? */ if (dorpc && (lp->nfsl_open->nfso_posixlock || (off == 0 && len == NFS64BITSSET))) { /* * Since the lock records will go away, we must * wait for grace and delay here. 
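 * The loop below retries the LockU, napping via nfs_catnap(), for as
 * long as the server replies NFSERR_GRACE or NFSERR_DELAY.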
*/ do { error = nfsrpc_locku(nd, nmp, lp, off, len, NFSV4LOCKT_READ, cred, p, 0); if ((nd->nd_repstat == NFSERR_GRACE || nd->nd_repstat == NFSERR_DELAY) && error == 0) (void) nfs_catnap(PZERO, (int)nd->nd_repstat, "nfs_advlock"); } while ((nd->nd_repstat == NFSERR_GRACE || nd->nd_repstat == NFSERR_DELAY) && error == 0); } callcnt++; } while (error == 0 && nd->nd_repstat == 0); nfscl_releasealllocks(clp, vp, p, id, flags); } else if (op == F_SETLK) { error = nfscl_getbytelock(vp, off, len, fl->l_type, cred, p, NULL, 0, id, flags, NULL, NULL, &lp, &newone, &donelocally); if (error || donelocally) { return (error); } if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; else clidrev = 0; nfhp = VTONFS(vp)->n_fhp; if (!lp->nfsl_open->nfso_posixlock && (off != 0 || len != NFS64BITSSET)) { error = EINVAL; } else { error = nfsrpc_lock(nd, nmp, vp, nfhp->nfh_fh, nfhp->nfh_len, lp, newone, reclaim, off, len, fl->l_type, cred, p, 0); } if (!error) error = nd->nd_repstat; nfscl_lockrelease(lp, error, newone); } else { error = EINVAL; } if (!error) error = nd->nd_repstat; if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_STALECLIENTID || error == NFSERR_DELAY || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_advlock"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); retrycnt++; } } while (error == NFSERR_GRACE || error == NFSERR_STALECLIENTID || error == NFSERR_DELAY || error == NFSERR_STALEDONTRECOVER || error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } /* * The lower level routine for the LockT case. */ int nfsrpc_lockt(struct nfsrv_descript *nd, vnode_t vp, struct nfsclclient *clp, u_int64_t off, u_int64_t len, struct flock *fl, struct ucred *cred, NFSPROC_T *p, void *id, int flags) { u_int32_t *tl; int error, type, size; uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX]; struct nfsnode *np; struct nfsmount *nmp; struct nfsclsession *tsep; nmp = VFSTONFS(vp->v_mount); NFSCL_REQSTART(nd, NFSPROC_LOCKT, vp); NFSM_BUILD(tl, u_int32_t *, 7 * NFSX_UNSIGNED); if (fl->l_type == F_RDLCK) *tl++ = txdr_unsigned(NFSV4LOCKT_READ); else *tl++ = txdr_unsigned(NFSV4LOCKT_WRITE); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; nfscl_filllockowner(id, own, flags); np = VTONFS(vp); NFSBCOPY(np->n_fhp->nfh_fh, &own[NFSV4CL_LOCKNAMELEN], np->n_fhp->nfh_len); (void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + np->n_fhp->nfh_len); error = nfscl_request(nd, vp, p, cred, NULL); if (error) return (error); if (nd->nd_repstat == 0) { fl->l_type = F_UNLCK; } else if (nd->nd_repstat == NFSERR_DENIED) { nd->nd_repstat = 0; fl->l_whence = SEEK_SET; NFSM_DISSECT(tl, u_int32_t *, 8 * NFSX_UNSIGNED); fl->l_start = fxdr_hyper(tl); tl += 2; len = fxdr_hyper(tl); tl += 2; if (len == NFS64BITSSET) fl->l_len = 0; else fl->l_len = len; type = fxdr_unsigned(int, *tl++); if (type == NFSV4LOCKT_WRITE) fl->l_type = F_WRLCK; else fl->l_type = F_RDLCK; /* * XXX For now, I have no idea what to do with the * conflicting lock_owner, so I'll just set the pid == 0 * and skip over the lock_owner. 
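 * The owner string length is still bounds checked against
 * NFSV4_OPAQUELIMIT before being skipped over with nfsm_advance().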
*/ fl->l_pid = (pid_t)0; tl += 2; size = fxdr_unsigned(int, *tl); if (size < 0 || size > NFSV4_OPAQUELIMIT) error = EBADRPC; if (!error) error = nfsm_advance(nd, NFSM_RNDUP(size), -1); } else if (nd->nd_repstat == NFSERR_STALECLIENTID) nfscl_initiate_recovery(clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Lower level function that performs the LockU RPC. */ static int nfsrpc_locku(struct nfsrv_descript *nd, struct nfsmount *nmp, struct nfscllockowner *lp, u_int64_t off, u_int64_t len, u_int32_t type, struct ucred *cred, NFSPROC_T *p, int syscred) { u_int32_t *tl; int error; nfscl_reqstart(nd, NFSPROC_LOCKU, nmp, lp->nfsl_open->nfso_fh, lp->nfsl_open->nfso_fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(type); *tl = txdr_unsigned(lp->nfsl_seqid); if (nfstest_outofseq && (arc4random() % nfstest_outofseq) == 0) *tl = txdr_unsigned(lp->nfsl_seqid + 1); tl++; if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = lp->nfsl_stateid.seqid; *tl++ = lp->nfsl_stateid.other[0]; *tl++ = lp->nfsl_stateid.other[1]; *tl++ = lp->nfsl_stateid.other[2]; txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); NFSCL_INCRSEQID(lp->nfsl_seqid, nd); if (error) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); lp->nfsl_stateid.seqid = *tl++; lp->nfsl_stateid.other[0] = *tl++; lp->nfsl_stateid.other[1] = *tl++; lp->nfsl_stateid.other[2] = *tl; } else if (nd->nd_repstat == NFSERR_STALESTATEID) nfscl_initiate_recovery(lp->nfsl_open->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The actual Lock RPC. 
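 * For a new lock_owner (newone != 0) this builds the open_to_lock_owner
 * form using the open stateid and open_owner seqid; otherwise it uses
 * the existing lock stateid and lock seqid.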
*/ int nfsrpc_lock(struct nfsrv_descript *nd, struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, struct nfscllockowner *lp, int newone, int reclaim, u_int64_t off, u_int64_t len, short type, struct ucred *cred, NFSPROC_T *p, int syscred) { u_int32_t *tl; int error, size; uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX]; struct nfsclsession *tsep; nfscl_reqstart(nd, NFSPROC_LOCK, nmp, nfhp, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 7 * NFSX_UNSIGNED); if (type == F_RDLCK) *tl++ = txdr_unsigned(NFSV4LOCKT_READ); else *tl++ = txdr_unsigned(NFSV4LOCKT_WRITE); *tl++ = txdr_unsigned(reclaim); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; if (newone) { *tl = newnfs_true; NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + 2 * NFSX_UNSIGNED + NFSX_HYPER); *tl++ = txdr_unsigned(lp->nfsl_open->nfso_own->nfsow_seqid); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = lp->nfsl_open->nfso_stateid.seqid; *tl++ = lp->nfsl_open->nfso_stateid.other[0]; *tl++ = lp->nfsl_open->nfso_stateid.other[1]; *tl++ = lp->nfsl_open->nfso_stateid.other[2]; *tl++ = txdr_unsigned(lp->nfsl_seqid); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; NFSBCOPY(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN); NFSBCOPY(nfhp, &own[NFSV4CL_LOCKNAMELEN], fhlen); (void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + fhlen); } else { *tl = newnfs_false; NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = lp->nfsl_stateid.seqid; *tl++ = lp->nfsl_stateid.other[0]; *tl++ = lp->nfsl_stateid.other[1]; *tl++ = lp->nfsl_stateid.other[2]; *tl = txdr_unsigned(lp->nfsl_seqid); if (nfstest_outofseq && (arc4random() % nfstest_outofseq) == 0) *tl = txdr_unsigned(lp->nfsl_seqid + 1); } if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); if (newone) NFSCL_INCRSEQID(lp->nfsl_open->nfso_own->nfsow_seqid, nd); NFSCL_INCRSEQID(lp->nfsl_seqid, nd); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID); lp->nfsl_stateid.seqid = *tl++; lp->nfsl_stateid.other[0] = *tl++; lp->nfsl_stateid.other[1] = *tl++; lp->nfsl_stateid.other[2] = *tl; } else if (nd->nd_repstat == NFSERR_DENIED) { NFSM_DISSECT(tl, u_int32_t *, 8 * NFSX_UNSIGNED); size = fxdr_unsigned(int, *(tl + 7)); if (size < 0 || size > NFSV4_OPAQUELIMIT) error = EBADRPC; if (!error) error = nfsm_advance(nd, NFSM_RNDUP(size), -1); } else if (nd->nd_repstat == NFSERR_STALESTATEID) nfscl_initiate_recovery(lp->nfsl_open->nfso_own->nfsow_clp); nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs statfs rpc * (always called with the vp for the mount point) */ int nfsrpc_statfs(vnode_t vp, struct nfsstatfs *sbp, struct nfsfsinfo *fsp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl = NULL; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; nfsattrbit_t attrbits; int error; *attrflagp = 0; nmp = VFSTONFS(vp->v_mount); if (NFSHASNFSV4(nmp)) { /* * For V4, you actually do a getattr. 
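	 * requesting the NFSSTATFS_GETATTRBIT attributes and letting
	 * nfsv4_loadattr() fill in the statfs and fsinfo structures.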
*/ NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp); NFSSTATFS_GETATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_USEGSSNAME; error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, sbp, fsp, NULL, 0, NULL, NULL, NULL, p, cred); if (!error) { nmp->nm_fsid[0] = nap->na_filesid[0]; nmp->nm_fsid[1] = nap->na_filesid[1]; NFSSETHASSETFSID(nmp); *attrflagp = 1; } } else { error = nd->nd_repstat; } if (error) goto nfsmout; } else { NFSCL_REQSTART(nd, NFSPROC_FSSTAT, vp); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_flag & ND_NFSV3) { error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (error) goto nfsmout; } if (nd->nd_repstat) { error = nd->nd_repstat; goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, NFSX_STATFS(nd->nd_flag & ND_NFSV3)); } if (NFSHASNFSV3(nmp)) { sbp->sf_tbytes = fxdr_hyper(tl); tl += 2; sbp->sf_fbytes = fxdr_hyper(tl); tl += 2; sbp->sf_abytes = fxdr_hyper(tl); tl += 2; sbp->sf_tfiles = fxdr_hyper(tl); tl += 2; sbp->sf_ffiles = fxdr_hyper(tl); tl += 2; sbp->sf_afiles = fxdr_hyper(tl); tl += 2; sbp->sf_invarsec = fxdr_unsigned(u_int32_t, *tl); } else if (NFSHASNFSV4(nmp) == 0) { sbp->sf_tsize = fxdr_unsigned(u_int32_t, *tl++); sbp->sf_bsize = fxdr_unsigned(u_int32_t, *tl++); sbp->sf_blocks = fxdr_unsigned(u_int32_t, *tl++); sbp->sf_bfree = fxdr_unsigned(u_int32_t, *tl++); sbp->sf_bavail = fxdr_unsigned(u_int32_t, *tl); } nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs pathconf rpc */ int nfsrpc_pathconf(vnode_t vp, struct nfsv3_pathconf *pc, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp; u_int32_t *tl; nfsattrbit_t attrbits; int error; *attrflagp = 0; nmp = VFSTONFS(vp->v_mount); if (NFSHASNFSV4(nmp)) { /* * For V4, you actually do a getattr. 
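	 * requesting the NFSPATHCONF_GETATTRBIT attributes so that
	 * nfsv4_loadattr() can fill in the nfsv3_pathconf structure.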
*/ NFSCL_REQSTART(nd, NFSPROC_GETATTR, vp); NFSPATHCONF_GETATTRBIT(&attrbits); (void) nfsrv_putattrbit(nd, &attrbits); nd->nd_flag |= ND_USEGSSNAME; error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (nd->nd_repstat == 0) { error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, pc, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (!error) *attrflagp = 1; } else { error = nd->nd_repstat; } } else { NFSCL_REQSTART(nd, NFSPROC_PATHCONF, vp); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (nd->nd_repstat && !error) error = nd->nd_repstat; if (!error) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V3PATHCONF); pc->pc_linkmax = fxdr_unsigned(u_int32_t, *tl++); pc->pc_namemax = fxdr_unsigned(u_int32_t, *tl++); pc->pc_notrunc = fxdr_unsigned(u_int32_t, *tl++); pc->pc_chownrestricted = fxdr_unsigned(u_int32_t, *tl++); pc->pc_caseinsensitive = fxdr_unsigned(u_int32_t, *tl++); pc->pc_casepreserving = fxdr_unsigned(u_int32_t, *tl); } } nfsmout: m_freem(nd->nd_mrep); return (error); } /* * nfs version 3 fsinfo rpc call */ int nfsrpc_fsinfo(vnode_t vp, struct nfsfsinfo *fsp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *nap, int *attrflagp, void *stuff) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; int error; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_FSINFO, vp); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); error = nfscl_postop_attr(nd, nap, attrflagp, stuff); if (nd->nd_repstat && !error) error = nd->nd_repstat; if (!error) { NFSM_DISSECT(tl, u_int32_t *, NFSX_V3FSINFO); fsp->fs_rtmax = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_rtpref = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_rtmult = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_wtmax = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_wtpref = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_wtmult = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_dtpref = fxdr_unsigned(u_int32_t, *tl++); fsp->fs_maxfilesize = fxdr_hyper(tl); tl += 2; fxdr_nfsv3time(tl, &fsp->fs_timedelta); tl += 2; fsp->fs_properties = fxdr_unsigned(u_int32_t, *tl); } nfsmout: m_freem(nd->nd_mrep); return (error); } /* * This function performs the Renew RPC. */ int nfsrpc_renew(struct nfsclclient *clp, struct nfsclds *dsp, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfsmount *nmp; int error; struct nfssockreq *nrp; struct nfsclsession *tsep; nmp = clp->nfsc_nmp; if (nmp == NULL) return (0); if (dsp == NULL) nfscl_reqstart(nd, NFSPROC_RENEW, nmp, NULL, 0, NULL, NULL, 0, 0); else nfscl_reqstart(nd, NFSPROC_RENEW, nmp, NULL, 0, NULL, &dsp->nfsclds_sess, 0, 0); if (!NFSHASNFSV4N(nmp)) { /* NFSv4.1 just uses a Sequence Op and not a Renew. */ NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; } nrp = NULL; if (dsp != NULL) nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; nd->nd_flag |= ND_USEGSSNAME; if (dsp == NULL) error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); else { error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, &dsp->nfsclds_sess); if (error == ENXIO) nfscl_cancelreqs(dsp); } if (error) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * This function performs the Releaselockowner RPC. 
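 * For NFSv4.1 and later there is no Releaselockowner operation, so a
 * FreeStateID is done for the lock stateid instead.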
*/ int nfsrpc_rellockown(struct nfsmount *nmp, struct nfscllockowner *lp, uint8_t *fh, int fhlen, struct ucred *cred, NFSPROC_T *p) { struct nfsrv_descript nfsd, *nd = &nfsd; u_int32_t *tl; int error; uint8_t own[NFSV4CL_LOCKNAMELEN + NFSX_V4FHMAX]; struct nfsclsession *tsep; if (NFSHASNFSV4N(nmp)) { /* For NFSv4.1, do a FreeStateID. */ nfscl_reqstart(nd, NFSPROC_FREESTATEID, nmp, NULL, 0, NULL, NULL, 0, 0); nfsm_stateidtom(nd, &lp->nfsl_stateid, NFSSTATEID_PUTSTATEID); } else { nfscl_reqstart(nd, NFSPROC_RELEASELCKOWN, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; NFSBCOPY(lp->nfsl_owner, own, NFSV4CL_LOCKNAMELEN); NFSBCOPY(fh, &own[NFSV4CL_LOCKNAMELEN], fhlen); (void)nfsm_strtom(nd, own, NFSV4CL_LOCKNAMELEN + fhlen); } nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * This function performs the Compound to get the mount pt FH. */ int nfsrpc_getdirpath(struct nfsmount *nmp, u_char *dirpath, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; u_char *cp, *cp2; int error, cnt, len, setnil; u_int32_t *opcntp; nfscl_reqstart(nd, NFSPROC_PUTROOTFH, nmp, NULL, 0, &opcntp, NULL, 0, 0); cp = dirpath; cnt = 0; do { setnil = 0; while (*cp == '/') cp++; cp2 = cp; while (*cp2 != '\0' && *cp2 != '/') cp2++; if (*cp2 == '/') { setnil = 1; *cp2 = '\0'; } if (cp2 != cp) { NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_LOOKUP); nfsm_strtom(nd, cp, strlen(cp)); cnt++; } if (setnil) *cp2++ = '/'; cp = cp2; } while (*cp != '\0'); if (NFSHASNFSV4N(nmp)) /* Has a Sequence Op done by nfscl_reqstart(). */ *opcntp = txdr_unsigned(3 + cnt); else *opcntp = txdr_unsigned(2 + cnt); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETFH); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, (3 + 2 * cnt) * NFSX_UNSIGNED); tl += (2 + 2 * cnt); if ((len = fxdr_unsigned(int, *tl)) <= 0 || len > NFSX_FHMAX) { nd->nd_repstat = NFSERR_BADXDR; } else { nd->nd_repstat = nfsrv_mtostr(nd, nmp->nm_fh, len); if (nd->nd_repstat == 0) nmp->nm_fhsize = len; } } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * This function performs the Delegreturn RPC. */ int nfsrpc_delegreturn(struct nfscldeleg *dp, struct ucred *cred, struct nfsmount *nmp, NFSPROC_T *p, int syscred) { u_int32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error; nfscl_reqstart(nd, NFSPROC_DELEGRETURN, nmp, dp->nfsdl_fh, dp->nfsdl_fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_STATEID); if (NFSHASNFSV4N(nmp)) *tl++ = 0; else *tl++ = dp->nfsdl_stateid.seqid; *tl++ = dp->nfsdl_stateid.other[0]; *tl++ = dp->nfsdl_stateid.other[1]; *tl = dp->nfsdl_stateid.other[2]; if (syscred) nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * nfs getacl call. 
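 * (Implemented as a Getattr of the NFSv4 ACL attribute; returns
 * EOPNOTSUPP unless ACLs are enabled and the mount is NFSv4.)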
*/ int nfsrpc_getacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfsattrbit_t attrbits; struct nfsmount *nmp = VFSTONFS(vp->v_mount); if (nfsrv_useacl == 0 || !NFSHASNFSV4(nmp)) return (EOPNOTSUPP); NFSCL_REQSTART(nd, NFSPROC_GETACL, vp); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL); (void) nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); if (!nd->nd_repstat) error = nfsv4_loadattr(nd, vp, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, aclp, 0, NULL, NULL, NULL, p, cred); else error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * nfs setacl call. */ int nfsrpc_setacl(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp, void *stuff) { int error; struct nfsmount *nmp = VFSTONFS(vp->v_mount); if (nfsrv_useacl == 0 || !NFSHASNFSV4(nmp)) return (EOPNOTSUPP); error = nfsrpc_setattr(vp, NULL, aclp, cred, p, NULL, NULL, stuff); return (error); } /* * nfs setacl call. */ static int nfsrpc_setaclrpc(vnode_t vp, struct ucred *cred, NFSPROC_T *p, struct acl *aclp, nfsv4stateid_t *stateidp, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfsattrbit_t attrbits; struct nfsmount *nmp = VFSTONFS(vp->v_mount); if (!NFSHASNFSV4(nmp)) return (EOPNOTSUPP); NFSCL_REQSTART(nd, NFSPROC_SETACL, vp); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL); (void) nfsv4_fillattr(nd, vp->v_mount, vp, aclp, NULL, NULL, 0, &attrbits, NULL, NULL, 0, 0, 0, 0, (uint64_t)0, NULL); error = nfscl_request(nd, vp, p, cred, stuff); if (error) return (error); /* Don't care about the pre/postop attributes */ m_freem(nd->nd_mrep); return (nd->nd_repstat); } /* * Do the NFSv4.1 Exchange ID. 
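 * This sends the client owner and exchange flags with SP4_NONE state
 * protection, then parses the clientid, sequenceid and exchange flags
 * from the reply into a newly allocated "struct nfsclds".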
*/ int nfsrpc_exchangeid(struct nfsmount *nmp, struct nfsclclient *clp, struct nfssockreq *nrp, int minorvers, uint32_t exchflags, struct nfsclds **dspp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl, v41flags; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfsclds *dsp; struct timespec verstime; int error, len; *dspp = NULL; if (minorvers == 0) minorvers = nmp->nm_minorvers; nfscl_reqstart(nd, NFSPROC_EXCHANGEID, nmp, NULL, 0, NULL, NULL, NFS_VER4, minorvers); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(nfsboottime.tv_sec); /* Client owner */ *tl = txdr_unsigned(clp->nfsc_rev); (void) nfsm_strtom(nd, clp->nfsc_id, clp->nfsc_idlen); NFSM_BUILD(tl, uint32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(exchflags); *tl++ = txdr_unsigned(NFSV4EXCH_SP4NONE); /* Set the implementation id4 */ *tl = txdr_unsigned(1); (void) nfsm_strtom(nd, "freebsd.org", strlen("freebsd.org")); (void) nfsm_strtom(nd, version, strlen(version)); NFSM_BUILD(tl, uint32_t *, NFSX_V4TIME); verstime.tv_sec = 1293840000; /* Jan 1, 2011 */ verstime.tv_nsec = 0; txdr_nfsv4time(&verstime, tl); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); NFSCL_DEBUG(1, "exchangeid err=%d reps=%d\n", error, (int)nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 6 * NFSX_UNSIGNED + NFSX_HYPER); len = fxdr_unsigned(int, *(tl + 7)); if (len < 0 || len > NFSV4_OPAQUELIMIT) { error = NFSERR_BADXDR; goto nfsmout; } dsp = malloc(sizeof(struct nfsclds) + len + 1, M_NFSCLDS, M_WAITOK | M_ZERO); dsp->nfsclds_expire = NFSD_MONOSEC + clp->nfsc_renew; dsp->nfsclds_servownlen = len; dsp->nfsclds_sess.nfsess_clientid.lval[0] = *tl++; dsp->nfsclds_sess.nfsess_clientid.lval[1] = *tl++; dsp->nfsclds_sess.nfsess_sequenceid = fxdr_unsigned(uint32_t, *tl++); v41flags = fxdr_unsigned(uint32_t, *tl); if ((v41flags & NFSV4EXCH_USEPNFSMDS) != 0 && NFSHASPNFSOPT(nmp)) { NFSCL_DEBUG(1, "set PNFS\n"); NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_PNFS; NFSUNLOCKMNT(nmp); dsp->nfsclds_flags |= NFSCLDS_MDS; } if ((v41flags & NFSV4EXCH_USEPNFSDS) != 0) dsp->nfsclds_flags |= NFSCLDS_DS; if (minorvers == NFSV42_MINORVERSION) dsp->nfsclds_flags |= NFSCLDS_MINORV2; if (len > 0) nd->nd_repstat = nfsrv_mtostr(nd, dsp->nfsclds_serverown, len); if (nd->nd_repstat == 0) { mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF); mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF); nfscl_initsessionslots(&dsp->nfsclds_sess); *dspp = dsp; } else free(dsp, M_NFSCLDS); } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 Create Session. */ int nfsrpc_createsession(struct nfsmount *nmp, struct nfsclsession *sep, struct nfssockreq *nrp, struct nfsclds *dsp, uint32_t sequenceid, int mds, struct ucred *cred, NFSPROC_T *p) { uint32_t crflags, maxval, *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error, irdcnt, minorvers; /* Make sure nm_rsize, nm_wsize is set. 
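	 * (clamped to NFS_MAXBSIZE), since they are used below to size the
	 * fore channel's maximum request and response values.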
*/ if (nmp->nm_rsize > NFS_MAXBSIZE || nmp->nm_rsize == 0) nmp->nm_rsize = NFS_MAXBSIZE; if (nmp->nm_wsize > NFS_MAXBSIZE || nmp->nm_wsize == 0) nmp->nm_wsize = NFS_MAXBSIZE; if (dsp == NULL) minorvers = nmp->nm_minorvers; else if ((dsp->nfsclds_flags & NFSCLDS_MINORV2) != 0) minorvers = NFSV42_MINORVERSION; else minorvers = NFSV41_MINORVERSION; nfscl_reqstart(nd, NFSPROC_CREATESESSION, nmp, NULL, 0, NULL, NULL, NFS_VER4, minorvers); NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED); *tl++ = sep->nfsess_clientid.lval[0]; *tl++ = sep->nfsess_clientid.lval[1]; *tl++ = txdr_unsigned(sequenceid); crflags = (NFSMNT_RDONLY(nmp->nm_mountp) ? 0 : NFSV4CRSESS_PERSIST); if (nfscl_enablecallb != 0 && nfs_numnfscbd > 0 && mds != 0) crflags |= NFSV4CRSESS_CONNBACKCHAN; *tl = txdr_unsigned(crflags); /* Fill in fore channel attributes. */ NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED); *tl++ = 0; /* Header pad size */ if ((nd->nd_flag & ND_NFSV42) != 0 && mds != 0 && sb_max_adj >= nmp->nm_wsize && sb_max_adj >= nmp->nm_rsize) { /* * NFSv4.2 Extended Attribute operations may want to do * requests/replies that are larger than nm_rsize/nm_wsize. */ *tl++ = txdr_unsigned(sb_max_adj - NFS_MAXXDR); *tl++ = txdr_unsigned(sb_max_adj - NFS_MAXXDR); } else { *tl++ = txdr_unsigned(nmp->nm_wsize + NFS_MAXXDR); *tl++ = txdr_unsigned(nmp->nm_rsize + NFS_MAXXDR); } *tl++ = txdr_unsigned(4096); /* Max response size cached */ *tl++ = txdr_unsigned(20); /* Max operations */ *tl++ = txdr_unsigned(64); /* Max slots */ *tl = 0; /* No rdma ird */ /* Fill in back channel attributes. */ NFSM_BUILD(tl, uint32_t *, 7 * NFSX_UNSIGNED); *tl++ = 0; /* Header pad size */ *tl++ = txdr_unsigned(10000); /* Max request size */ *tl++ = txdr_unsigned(10000); /* Max response size */ *tl++ = txdr_unsigned(4096); /* Max response size cached */ *tl++ = txdr_unsigned(4); /* Max operations */ *tl++ = txdr_unsigned(NFSV4_CBSLOTS); /* Max slots */ *tl = 0; /* No rdma ird */ NFSM_BUILD(tl, uint32_t *, 8 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFS_CALLBCKPROG); /* Call back prog # */ /* Allow AUTH_SYS callbacks as uid, gid == 0. */ *tl++ = txdr_unsigned(1); /* Auth_sys only */ *tl++ = txdr_unsigned(AUTH_SYS); /* AUTH_SYS type */ *tl++ = txdr_unsigned(nfsboottime.tv_sec); /* time stamp */ *tl++ = 0; /* Null machine name */ *tl++ = 0; /* Uid == 0 */ *tl++ = 0; /* Gid == 0 */ *tl = 0; /* No additional gids */ nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, nrp, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_V4SESSIONID + 2 * NFSX_UNSIGNED); bcopy(tl, sep->nfsess_sessionid, NFSX_V4SESSIONID); tl += NFSX_V4SESSIONID / NFSX_UNSIGNED; sep->nfsess_sequenceid = fxdr_unsigned(uint32_t, *tl++); crflags = fxdr_unsigned(uint32_t, *tl); if ((crflags & NFSV4CRSESS_PERSIST) != 0 && mds != 0) { NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_SESSPERSIST; NFSUNLOCKMNT(nmp); } /* Get the fore channel slot count. */ NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); tl++; /* Skip the header pad size. */ /* Make sure nm_wsize is small enough. */ maxval = fxdr_unsigned(uint32_t, *tl++); while (maxval < nmp->nm_wsize + NFS_MAXXDR) { if (nmp->nm_wsize > 8096) nmp->nm_wsize /= 2; else break; } sep->nfsess_maxreq = maxval; /* Make sure nm_rsize is small enough. 
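		 * by halving it until nm_rsize + NFS_MAXXDR fits within the
		 * server's advertised maximum response size.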
*/ maxval = fxdr_unsigned(uint32_t, *tl++); while (maxval < nmp->nm_rsize + NFS_MAXXDR) { if (nmp->nm_rsize > 8096) nmp->nm_rsize /= 2; else break; } sep->nfsess_maxresp = maxval; sep->nfsess_maxcache = fxdr_unsigned(int, *tl++); tl++; sep->nfsess_foreslots = fxdr_unsigned(uint16_t, *tl++); NFSCL_DEBUG(4, "fore slots=%d\n", (int)sep->nfsess_foreslots); irdcnt = fxdr_unsigned(int, *tl); if (irdcnt > 0) NFSM_DISSECT(tl, uint32_t *, irdcnt * NFSX_UNSIGNED); /* and the back channel slot count. */ NFSM_DISSECT(tl, uint32_t *, 7 * NFSX_UNSIGNED); tl += 5; sep->nfsess_backslots = fxdr_unsigned(uint16_t, *tl); NFSCL_DEBUG(4, "back slots=%d\n", (int)sep->nfsess_backslots); } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 Destroy Session. */ int nfsrpc_destroysession(struct nfsmount *nmp, struct nfsclclient *clp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error; struct nfsclsession *tsep; nfscl_reqstart(nd, NFSPROC_DESTROYSESSION, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID); tsep = nfsmnt_mdssession(nmp); bcopy(tsep->nfsess_sessionid, tl, NFSX_V4SESSIONID); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 Destroy Client. */ int nfsrpc_destroyclient(struct nfsmount *nmp, struct nfsclclient *clp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error; struct nfsclsession *tsep; nfscl_reqstart(nd, NFSPROC_DESTROYCLIENT, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 LayoutGet. */ static int nfsrpc_layoutget(struct nfsmount *nmp, uint8_t *fhp, int fhlen, int iomode, uint64_t offset, uint64_t len, uint64_t minlen, int layouttype, int layoutlen, nfsv4stateid_t *stateidp, int *retonclosep, struct nfsclflayouthead *flhp, struct ucred *cred, NFSPROC_T *p, void *stuff) { struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfscl_reqstart(nd, NFSPROC_LAYOUTGET, nmp, fhp, fhlen, NULL, NULL, 0, 0); nfsrv_setuplayoutget(nd, iomode, offset, len, minlen, stateidp, layouttype, layoutlen, 0); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); NFSCL_DEBUG(4, "layget err=%d st=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat == 0) error = nfsrv_parselayoutget(nmp, nd, stateidp, retonclosep, flhp); if (error == 0 && nd->nd_repstat != 0) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 Get Device Info. 
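 * For the File layout this parses the stripe indices and multipath
 * address lists; for the Flex File layout there is a single address
 * list plus per-version rsize/wsize information.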
*/ int nfsrpc_getdeviceinfo(struct nfsmount *nmp, uint8_t *deviceid, int layouttype, uint32_t *notifybitsp, struct nfscldevinfo **ndip, struct ucred *cred, NFSPROC_T *p) { uint32_t cnt, *tl, vers, minorvers; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct sockaddr_in sin, ssin; struct sockaddr_in6 sin6, ssin6; struct nfsclds *dsp = NULL, **dspp, **gotdspp; struct nfscldevinfo *ndi; int addrcnt = 0, bitcnt, error, gotminor, gotvers, i, isudp, j; int stripecnt; uint8_t stripeindex; sa_family_t af, safilled; ssin.sin_port = 0; /* To shut up compiler. */ ssin.sin_addr.s_addr = 0; /* ditto */ *ndip = NULL; ndi = NULL; gotdspp = NULL; nfscl_reqstart(nd, NFSPROC_GETDEVICEINFO, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, NFSX_V4DEVICEID + 3 * NFSX_UNSIGNED); NFSBCOPY(deviceid, tl, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); *tl++ = txdr_unsigned(layouttype); *tl++ = txdr_unsigned(100000); if (notifybitsp != NULL && *notifybitsp != 0) { *tl = txdr_unsigned(1); /* One word of bits. */ NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(*notifybitsp); } else *tl = txdr_unsigned(0); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (layouttype != fxdr_unsigned(int, *tl)) printf("EEK! devinfo layout type not same!\n"); if (layouttype == NFSLAYOUT_NFSV4_1_FILES) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); stripecnt = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "stripecnt=%d\n", stripecnt); if (stripecnt < 1 || stripecnt > 4096) { printf("pNFS File layout devinfo stripecnt %d:" " out of range\n", stripecnt); error = NFSERR_BADXDR; goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, (stripecnt + 1) * NFSX_UNSIGNED); addrcnt = fxdr_unsigned(int, *(tl + stripecnt)); NFSCL_DEBUG(4, "addrcnt=%d\n", addrcnt); if (addrcnt < 1 || addrcnt > 128) { printf("NFS devinfo addrcnt %d: out of range\n", addrcnt); error = NFSERR_BADXDR; goto nfsmout; } /* * Now we know how many stripe indices and addresses, so * we can allocate the structure the correct size. */ i = (stripecnt * sizeof(uint8_t)) / sizeof(struct nfsclds *) + 1; NFSCL_DEBUG(4, "stripeindices=%d\n", i); ndi = malloc(sizeof(*ndi) + (addrcnt + i) * sizeof(struct nfsclds *), M_NFSDEVINFO, M_WAITOK | M_ZERO); NFSBCOPY(deviceid, ndi->nfsdi_deviceid, NFSX_V4DEVICEID); ndi->nfsdi_refcnt = 0; ndi->nfsdi_flags = NFSDI_FILELAYOUT; ndi->nfsdi_stripecnt = stripecnt; ndi->nfsdi_addrcnt = addrcnt; /* Fill in the stripe indices. */ for (i = 0; i < stripecnt; i++) { stripeindex = fxdr_unsigned(uint8_t, *tl++); NFSCL_DEBUG(4, "stripeind=%d\n", stripeindex); if (stripeindex >= addrcnt) { printf("pNFS File Layout devinfo" " stripeindex %d: too big\n", (int)stripeindex); error = NFSERR_BADXDR; goto nfsmout; } nfsfldi_setstripeindex(ndi, i, stripeindex); } } else if (layouttype == NFSLAYOUT_FLEXFILE) { /* For Flex File, we only get one address list. */ ndi = malloc(sizeof(*ndi) + sizeof(struct nfsclds *), M_NFSDEVINFO, M_WAITOK | M_ZERO); NFSBCOPY(deviceid, ndi->nfsdi_deviceid, NFSX_V4DEVICEID); ndi->nfsdi_refcnt = 0; ndi->nfsdi_flags = NFSDI_FLEXFILE; addrcnt = ndi->nfsdi_addrcnt = 1; } /* Now, dissect the server address(es). 
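		 * A non-UDP address is chosen, preferring the same address
		 * family as the mount point, and saved so the DS connection
		 * can be established once parsing of the reply is complete.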
*/ safilled = AF_UNSPEC; for (i = 0; i < addrcnt; i++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); cnt = fxdr_unsigned(uint32_t, *tl); if (cnt == 0) { printf("NFS devinfo 0 len addrlist\n"); error = NFSERR_BADXDR; goto nfsmout; } dspp = nfsfldi_addr(ndi, i); safilled = AF_UNSPEC; for (j = 0; j < cnt; j++) { error = nfsv4_getipaddr(nd, &sin, &sin6, &af, &isudp); if (error != 0 && error != EPERM) { error = NFSERR_BADXDR; goto nfsmout; } if (error == 0 && isudp == 0) { /* * The priority is: * - Same address family. * Save the address and dspp, so that * the connection can be done after * parsing is complete. */ if (safilled == AF_UNSPEC || (af == nmp->nm_nam->sa_family && safilled != nmp->nm_nam->sa_family) ) { if (af == AF_INET) ssin = sin; else ssin6 = sin6; safilled = af; gotdspp = dspp; } } } } gotvers = NFS_VER4; /* Default NFSv4.1 for File Layout. */ gotminor = NFSV41_MINORVERSION; /* For Flex File, we will take one of the versions to use. */ if (layouttype == NFSLAYOUT_FLEXFILE) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); j = fxdr_unsigned(int, *tl); if (j < 1 || j > NFSDEV_MAXVERS) { printf("pNFS: too many versions\n"); error = NFSERR_BADXDR; goto nfsmout; } gotvers = 0; gotminor = 0; for (i = 0; i < j; i++) { NFSM_DISSECT(tl, uint32_t *, 5 * NFSX_UNSIGNED); vers = fxdr_unsigned(uint32_t, *tl++); minorvers = fxdr_unsigned(uint32_t, *tl++); if (vers == NFS_VER3) minorvers = 0; if ((vers == NFS_VER4 && ((minorvers == NFSV41_MINORVERSION && gotminor == 0) || minorvers == NFSV42_MINORVERSION)) || (vers == NFS_VER3 && gotvers == 0)) { gotvers = vers; gotminor = minorvers; /* We'll take this one. */ ndi->nfsdi_versindex = i; ndi->nfsdi_vers = vers; ndi->nfsdi_minorvers = minorvers; ndi->nfsdi_rsize = fxdr_unsigned( uint32_t, *tl++); ndi->nfsdi_wsize = fxdr_unsigned( uint32_t, *tl++); if (*tl == newnfs_true) ndi->nfsdi_flags |= NFSDI_TIGHTCOUPLED; else ndi->nfsdi_flags &= ~NFSDI_TIGHTCOUPLED; } } if (gotvers == 0) { printf("pNFS: no NFSv3, NFSv4.1 or NFSv4.2\n"); error = NFSERR_BADXDR; goto nfsmout; } } /* And the notify bits. */ NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); bitcnt = fxdr_unsigned(int, *tl); if (bitcnt > 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); if (notifybitsp != NULL) *notifybitsp = fxdr_unsigned(uint32_t, *tl); } if (safilled != AF_UNSPEC) { KASSERT(ndi != NULL, ("ndi is NULL")); *ndip = ndi; } else error = EPERM; if (error == 0) { /* * Now we can do a TCP connection for the correct * NFS version and IP address. */ error = nfsrpc_fillsa(nmp, &ssin, &ssin6, safilled, gotvers, gotminor, &dsp, p); } if (error == 0) { KASSERT(gotdspp != NULL, ("gotdspp is NULL")); *gotdspp = dsp; } } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; nfsmout: if (error != 0 && ndi != NULL) nfscl_freedevinfo(ndi); m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 LayoutCommit. 
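 * The last write offset is clamped into the [off, off + len - 1] range
 * and no modification time is sent; the layoutupdate body is empty for
 * all supported layout types.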
*/ int nfsrpc_layoutcommit(struct nfsmount *nmp, uint8_t *fh, int fhlen, int reclaim, uint64_t off, uint64_t len, uint64_t lastbyte, nfsv4stateid_t *stateidp, int layouttype, struct ucred *cred, NFSPROC_T *p, void *stuff) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; int error; nfscl_reqstart(nd, NFSPROC_LAYOUTCOMMIT, nmp, fh, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED + 3 * NFSX_HYPER + NFSX_STATEID); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; if (reclaim != 0) *tl++ = newnfs_true; else *tl++ = newnfs_false; *tl++ = txdr_unsigned(stateidp->seqid); *tl++ = stateidp->other[0]; *tl++ = stateidp->other[1]; *tl++ = stateidp->other[2]; *tl++ = newnfs_true; if (lastbyte < off) lastbyte = off; else if (lastbyte >= (off + len)) lastbyte = off + len - 1; txdr_hyper(lastbyte, tl); tl += 2; *tl++ = newnfs_false; *tl++ = txdr_unsigned(layouttype); /* All supported layouts are 0 length. */ *tl = txdr_unsigned(0); nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Do the NFSv4.1 LayoutReturn. */ int nfsrpc_layoutreturn(struct nfsmount *nmp, uint8_t *fh, int fhlen, int reclaim, int layouttype, uint32_t iomode, int layoutreturn, uint64_t offset, uint64_t len, nfsv4stateid_t *stateidp, struct ucred *cred, NFSPROC_T *p, uint32_t stat, uint32_t op, char *devid) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; uint64_t tu64; int error; nfscl_reqstart(nd, NFSPROC_LAYOUTRETURN, nmp, fh, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED); if (reclaim != 0) *tl++ = newnfs_true; else *tl++ = newnfs_false; *tl++ = txdr_unsigned(layouttype); *tl++ = txdr_unsigned(iomode); *tl = txdr_unsigned(layoutreturn); if (layoutreturn == NFSLAYOUTRETURN_FILE) { NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID + NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; txdr_hyper(len, tl); tl += 2; NFSCL_DEBUG(4, "layoutret stseq=%d\n", (int)stateidp->seqid); *tl++ = txdr_unsigned(stateidp->seqid); *tl++ = stateidp->other[0]; *tl++ = stateidp->other[1]; *tl++ = stateidp->other[2]; if (layouttype == NFSLAYOUT_NFSV4_1_FILES) *tl = txdr_unsigned(0); else if (layouttype == NFSLAYOUT_FLEXFILE) { if (stat != 0) { *tl = txdr_unsigned(2 * NFSX_HYPER + NFSX_STATEID + NFSX_V4DEVICEID + 5 * NFSX_UNSIGNED); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_STATEID + NFSX_V4DEVICEID + 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(1); /* One error. */ tu64 = 0; /* Offset. */ txdr_hyper(tu64, tl); tl += 2; tu64 = UINT64_MAX; /* Length. */ txdr_hyper(tu64, tl); tl += 2; NFSBCOPY(stateidp, tl, NFSX_STATEID); tl += (NFSX_STATEID / NFSX_UNSIGNED); *tl++ = txdr_unsigned(1); /* One error. */ NFSBCOPY(devid, tl, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); *tl++ = txdr_unsigned(stat); *tl++ = txdr_unsigned(op); } else { *tl = txdr_unsigned(2 * NFSX_UNSIGNED); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); /* No ioerrs. */ *tl++ = 0; } *tl = 0; /* No stats yet. 
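			 * (Only the ioerr entry built above, if any, is
			 * reported; the Flex File I/O statistics array is
			 * left empty.)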
*/ } } nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); if (*tl != 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_STATEID); stateidp->seqid = fxdr_unsigned(uint32_t, *tl++); stateidp->other[0] = *tl++; stateidp->other[1] = *tl++; stateidp->other[2] = *tl; } } else error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Acquire a layout and devinfo, if possible. The caller must have acquired * a reference count on the nfsclclient structure before calling this. * Return the layout in lypp with a reference count on it, if successful. */ static int nfsrpc_getlayout(struct nfsmount *nmp, vnode_t vp, struct nfsfh *nfhp, int iomode, uint32_t *notifybitsp, nfsv4stateid_t *stateidp, uint64_t off, struct nfscllayout **lypp, struct ucred *cred, NFSPROC_T *p) { struct nfscllayout *lyp; struct nfsclflayout *flp; struct nfsclflayouthead flh; int error = 0, islocked, layoutlen, layouttype, recalled, retonclose; nfsv4stateid_t stateid; struct nfsclsession *tsep; *lypp = NULL; if (NFSHASFLEXFILE(nmp)) layouttype = NFSLAYOUT_FLEXFILE; else layouttype = NFSLAYOUT_NFSV4_1_FILES; /* * If lyp is returned non-NULL, there will be a refcnt (shared lock) * on it, iff flp != NULL or a lock (exclusive lock) on it iff * flp == NULL. */ lyp = nfscl_getlayout(nmp->nm_clp, nfhp->nfh_fh, nfhp->nfh_len, off, &flp, &recalled); islocked = 0; if (lyp == NULL || flp == NULL) { if (recalled != 0) return (EIO); LIST_INIT(&flh); tsep = nfsmnt_mdssession(nmp); layoutlen = tsep->nfsess_maxcache - (NFSX_STATEID + 3 * NFSX_UNSIGNED); if (lyp == NULL) { stateid.seqid = 0; stateid.other[0] = stateidp->other[0]; stateid.other[1] = stateidp->other[1]; stateid.other[2] = stateidp->other[2]; error = nfsrpc_layoutget(nmp, nfhp->nfh_fh, nfhp->nfh_len, iomode, (uint64_t)0, UINT64_MAX, (uint64_t)0, layouttype, layoutlen, &stateid, &retonclose, &flh, cred, p, NULL); } else { islocked = 1; stateid.seqid = lyp->nfsly_stateid.seqid; stateid.other[0] = lyp->nfsly_stateid.other[0]; stateid.other[1] = lyp->nfsly_stateid.other[1]; stateid.other[2] = lyp->nfsly_stateid.other[2]; error = nfsrpc_layoutget(nmp, nfhp->nfh_fh, nfhp->nfh_len, iomode, off, UINT64_MAX, (uint64_t)0, layouttype, layoutlen, &stateid, &retonclose, &flh, cred, p, NULL); } error = nfsrpc_layoutgetres(nmp, vp, nfhp->nfh_fh, nfhp->nfh_len, &stateid, retonclose, notifybitsp, &lyp, &flh, layouttype, error, NULL, cred, p); if (error == 0) *lypp = lyp; else if (islocked != 0) nfscl_rellayout(lyp, 1); } else *lypp = lyp; return (error); } /* * Do a TCP connection plus exchange id and create session. * If successful, a "struct nfsclds" is linked into the list for the * mount point and a pointer to it is returned. 
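 * The list of existing sessions is checked first, so that a DS address
 * that is already connected simply reuses that session instead of
 * creating a new one.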
*/ static int nfsrpc_fillsa(struct nfsmount *nmp, struct sockaddr_in *sin, struct sockaddr_in6 *sin6, sa_family_t af, int vers, int minorvers, struct nfsclds **dspp, NFSPROC_T *p) { struct sockaddr_in *msad, *sad; struct sockaddr_in6 *msad6, *sad6; struct nfsclclient *clp; struct nfssockreq *nrp; struct nfsclds *dsp, *tdsp; int error, firsttry; enum nfsclds_state retv; uint32_t sequenceid = 0; KASSERT(nmp->nm_sockreq.nr_cred != NULL, ("nfsrpc_fillsa: NULL nr_cred")); NFSLOCKCLSTATE(); clp = nmp->nm_clp; NFSUNLOCKCLSTATE(); if (clp == NULL) return (EPERM); if (af == AF_INET) { NFSLOCKMNT(nmp); /* * Check to see if we already have a session for this * address that is usable for a DS. * Note that the MDS's address is in a different place * than the sessions already acquired for DS's. */ msad = (struct sockaddr_in *)nmp->nm_sockreq.nr_nam; tdsp = TAILQ_FIRST(&nmp->nm_sess); while (tdsp != NULL) { if (msad != NULL && msad->sin_family == AF_INET && sin->sin_addr.s_addr == msad->sin_addr.s_addr && sin->sin_port == msad->sin_port && (tdsp->nfsclds_flags & NFSCLDS_DS) != 0 && tdsp->nfsclds_sess.nfsess_defunct == 0) { *dspp = tdsp; NFSUNLOCKMNT(nmp); NFSCL_DEBUG(4, "fnd same addr\n"); return (0); } tdsp = TAILQ_NEXT(tdsp, nfsclds_list); if (tdsp != NULL && tdsp->nfsclds_sockp != NULL) msad = (struct sockaddr_in *) tdsp->nfsclds_sockp->nr_nam; else msad = NULL; } NFSUNLOCKMNT(nmp); /* No IP address match, so look for new/trunked one. */ sad = malloc(sizeof(*sad), M_SONAME, M_WAITOK | M_ZERO); sad->sin_len = sizeof(*sad); sad->sin_family = AF_INET; sad->sin_port = sin->sin_port; sad->sin_addr.s_addr = sin->sin_addr.s_addr; nrp = malloc(sizeof(*nrp), M_NFSSOCKREQ, M_WAITOK | M_ZERO); nrp->nr_nam = (struct sockaddr *)sad; } else if (af == AF_INET6) { NFSLOCKMNT(nmp); /* * Check to see if we already have a session for this * address that is usable for a DS. * Note that the MDS's address is in a different place * than the sessions already acquired for DS's. */ msad6 = (struct sockaddr_in6 *)nmp->nm_sockreq.nr_nam; tdsp = TAILQ_FIRST(&nmp->nm_sess); while (tdsp != NULL) { if (msad6 != NULL && msad6->sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &msad6->sin6_addr) && sin6->sin6_port == msad6->sin6_port && (tdsp->nfsclds_flags & NFSCLDS_DS) != 0 && tdsp->nfsclds_sess.nfsess_defunct == 0) { *dspp = tdsp; NFSUNLOCKMNT(nmp); return (0); } tdsp = TAILQ_NEXT(tdsp, nfsclds_list); if (tdsp != NULL && tdsp->nfsclds_sockp != NULL) msad6 = (struct sockaddr_in6 *) tdsp->nfsclds_sockp->nr_nam; else msad6 = NULL; } NFSUNLOCKMNT(nmp); /* No IP address match, so look for new/trunked one. */ sad6 = malloc(sizeof(*sad6), M_SONAME, M_WAITOK | M_ZERO); sad6->sin6_len = sizeof(*sad6); sad6->sin6_family = AF_INET6; sad6->sin6_port = sin6->sin6_port; NFSBCOPY(&sin6->sin6_addr, &sad6->sin6_addr, sizeof(struct in6_addr)); nrp = malloc(sizeof(*nrp), M_NFSSOCKREQ, M_WAITOK | M_ZERO); nrp->nr_nam = (struct sockaddr *)sad6; } else return (EPERM); nrp->nr_sotype = SOCK_STREAM; mtx_init(&nrp->nr_mtx, "nfssock", NULL, MTX_DEF); nrp->nr_prog = NFS_PROG; nrp->nr_vers = vers; /* * Use the credentials that were used for the mount, which are * in nmp->nm_sockreq.nr_cred for newnfs_connect() etc. * Ref. counting the credentials with crhold() is probably not * necessary, since nm_sockreq.nr_cred won't be crfree()'d until * unmount, but I did it anyhow. 
*/ nrp->nr_cred = crhold(nmp->nm_sockreq.nr_cred); error = newnfs_connect(nmp, nrp, NULL, p, 0, false); NFSCL_DEBUG(3, "DS connect=%d\n", error); dsp = NULL; /* Now, do the exchangeid and create session. */ if (error == 0) { if (vers == NFS_VER4) { firsttry = 0; do { error = nfsrpc_exchangeid(nmp, clp, nrp, minorvers, NFSV4EXCH_USEPNFSDS, &dsp, nrp->nr_cred, p); NFSCL_DEBUG(3, "DS exchangeid=%d\n", error); if (error == NFSERR_MINORVERMISMATCH) minorvers = NFSV42_MINORVERSION; } while (error == NFSERR_MINORVERMISMATCH && firsttry++ == 0); if (error != 0) newnfs_disconnect(nrp); } else { dsp = malloc(sizeof(struct nfsclds), M_NFSCLDS, M_WAITOK | M_ZERO); dsp->nfsclds_flags |= NFSCLDS_DS; dsp->nfsclds_expire = INT32_MAX; /* No renews needed. */ mtx_init(&dsp->nfsclds_mtx, "nfsds", NULL, MTX_DEF); mtx_init(&dsp->nfsclds_sess.nfsess_mtx, "nfssession", NULL, MTX_DEF); } } if (error == 0) { dsp->nfsclds_sockp = nrp; if (vers == NFS_VER4) { NFSLOCKMNT(nmp); retv = nfscl_getsameserver(nmp, dsp, &tdsp, &sequenceid); NFSCL_DEBUG(3, "getsame ret=%d\n", retv); if (retv == NFSDSP_USETHISSESSION && nfscl_dssameconn != 0) { NFSLOCKDS(tdsp); tdsp->nfsclds_flags |= NFSCLDS_SAMECONN; NFSUNLOCKDS(tdsp); NFSUNLOCKMNT(nmp); /* * If there is already a session for this * server, use it. */ (void)newnfs_disconnect(nrp); nfscl_freenfsclds(dsp); *dspp = tdsp; return (0); } if (retv == NFSDSP_NOTFOUND) sequenceid = dsp->nfsclds_sess.nfsess_sequenceid; NFSUNLOCKMNT(nmp); error = nfsrpc_createsession(nmp, &dsp->nfsclds_sess, nrp, dsp, sequenceid, 0, nrp->nr_cred, p); NFSCL_DEBUG(3, "DS createsess=%d\n", error); } } else { NFSFREECRED(nrp->nr_cred); NFSFREEMUTEX(&nrp->nr_mtx); free(nrp->nr_nam, M_SONAME); free(nrp, M_NFSSOCKREQ); } if (error == 0) { NFSCL_DEBUG(3, "add DS session\n"); /* * Put it at the end of the list. That way the list * is ordered by when the entry was added. This matters * since the one done first is the one that should be * used for sequencid'ing any subsequent create sessions. */ NFSLOCKMNT(nmp); TAILQ_INSERT_TAIL(&nmp->nm_sess, dsp, nfsclds_list); NFSUNLOCKMNT(nmp); *dspp = dsp; } else if (dsp != NULL) { newnfs_disconnect(nrp); nfscl_freenfsclds(dsp); } return (error); } /* * Do the NFSv4.1 Reclaim Complete. */ int nfsrpc_reclaimcomplete(struct nfsmount *nmp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; int error; nfscl_reqstart(nd, NFSPROC_RECLAIMCOMPL, nmp, NULL, 0, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = newnfs_false; nd->nd_flag |= ND_USEGSSNAME; error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Initialize the slot tables for a session. */ static void nfscl_initsessionslots(struct nfsclsession *sep) { int i; for (i = 0; i < NFSV4_CBSLOTS; i++) { if (sep->nfsess_cbslots[i].nfssl_reply != NULL) m_freem(sep->nfsess_cbslots[i].nfssl_reply); NFSBZERO(&sep->nfsess_cbslots[i], sizeof(struct nfsslot)); } for (i = 0; i < 64; i++) sep->nfsess_slotseq[i] = 0; sep->nfsess_slots = 0; } /* * Called to try and do an I/O operation via an NFSv4.1 Data Server (DS). 
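 * Returns EIO when pNFS is not enabled for the mount or no usable layout
 * and device info can be obtained, so the caller can fall back to doing
 * the I/O through the MDS.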
*/ int nfscl_doiods(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, uint32_t rwaccess, int docommit, struct ucred *cred, NFSPROC_T *p) { struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfscllayout *layp; struct nfscldevinfo *dip; struct nfsclflayout *rflp; struct mbuf *m, *m2; struct nfsclwritedsdorpc *drpc, *tdrpc; nfsv4stateid_t stateid; struct ucred *newcred; uint64_t lastbyte, len, off, oresid, xfer; int eof, error, firstmirror, i, iolaymode, mirrorcnt, recalled, timo; void *lckp; uint8_t *dev; void *iovbase = NULL; size_t iovlen = 0; off_t offs = 0; ssize_t resid = 0; if (!NFSHASPNFS(nmp) || nfscl_enablecallb == 0 || nfs_numnfscbd == 0 || (np->n_flag & NNOLAYOUT) != 0) return (EIO); /* Now, get a reference cnt on the clientid for this mount. */ if (nfscl_getref(nmp) == 0) return (EIO); /* Find an appropriate stateid. */ newcred = NFSNEWCRED(cred); error = nfscl_getstateid(vp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, rwaccess, 1, newcred, p, &stateid, &lckp); if (error != 0) { NFSFREECRED(newcred); nfscl_relref(nmp); return (error); } /* Search for a layout for this file. */ off = uiop->uio_offset; layp = nfscl_getlayout(nmp->nm_clp, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, off, &rflp, &recalled); if (layp == NULL || rflp == NULL) { if (recalled != 0) { NFSFREECRED(newcred); nfscl_relref(nmp); return (EIO); } if (layp != NULL) { nfscl_rellayout(layp, (rflp == NULL) ? 1 : 0); layp = NULL; } /* Try and get a Layout, if it is supported. */ if (rwaccess == NFSV4OPEN_ACCESSWRITE || (np->n_flag & NWRITEOPENED) != 0) iolaymode = NFSLAYOUTIOMODE_RW; else iolaymode = NFSLAYOUTIOMODE_READ; error = nfsrpc_getlayout(nmp, vp, np->n_fhp, iolaymode, NULL, &stateid, off, &layp, newcred, p); if (error != 0) { NFSLOCKNODE(np); np->n_flag |= NNOLAYOUT; NFSUNLOCKNODE(np); if (lckp != NULL) nfscl_lockderef(lckp); NFSFREECRED(newcred); if (layp != NULL) nfscl_rellayout(layp, 0); nfscl_relref(nmp); return (error); } } /* * Loop around finding a layout that works for the first part of * this I/O operation, and then call the function that actually * does the RPC. */ eof = 0; len = (uint64_t)uiop->uio_resid; while (len > 0 && error == 0 && eof == 0) { off = uiop->uio_offset; error = nfscl_findlayoutforio(layp, off, rwaccess, &rflp); if (error == 0) { oresid = xfer = (uint64_t)uiop->uio_resid; if (xfer > (rflp->nfsfl_end - rflp->nfsfl_off)) xfer = rflp->nfsfl_end - rflp->nfsfl_off; /* * For Flex File layout with mirrored DSs, select one * of them at random for reads. For writes and commits, * do all mirrors. */ m = NULL; tdrpc = drpc = NULL; firstmirror = 0; mirrorcnt = 1; if ((layp->nfsly_flags & NFSLY_FLEXFILE) != 0 && (mirrorcnt = rflp->nfsfl_mirrorcnt) > 1) { if (rwaccess == NFSV4OPEN_ACCESSREAD) { firstmirror = arc4random() % mirrorcnt; mirrorcnt = firstmirror + 1; } else { if (docommit == 0) { /* * Save values, so uiop can be * rolled back upon a write * error. 
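					 * (nfsm_uiombuflist() below consumes
					 * the uio as it copies the data into
					 * an mbuf list.)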
*/ offs = uiop->uio_offset; resid = uiop->uio_resid; iovbase = uiop->uio_iov->iov_base; iovlen = uiop->uio_iov->iov_len; m = nfsm_uiombuflist(uiop, len, 0); } tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, M_WAITOK | M_ZERO); } } for (i = firstmirror; i < mirrorcnt && error == 0; i++){ m2 = NULL; if (m != NULL && i < mirrorcnt - 1) m2 = m_copym(m, 0, M_COPYALL, M_WAITOK); else { m2 = m; m = NULL; } if ((layp->nfsly_flags & NFSLY_FLEXFILE) != 0) { dev = rflp->nfsfl_ffm[i].dev; dip = nfscl_getdevinfo(nmp->nm_clp, dev, rflp->nfsfl_ffm[i].devp); } else { dev = rflp->nfsfl_dev; dip = nfscl_getdevinfo(nmp->nm_clp, dev, rflp->nfsfl_devp); } if (dip != NULL) { if ((rflp->nfsfl_flags & NFSFL_FLEXFILE) != 0) error = nfscl_dofflayoutio(vp, uiop, iomode, must_commit, &eof, &stateid, rwaccess, dip, layp, rflp, off, xfer, i, docommit, m2, tdrpc, newcred, p); else error = nfscl_doflayoutio(vp, uiop, iomode, must_commit, &eof, &stateid, rwaccess, dip, layp, rflp, off, xfer, docommit, newcred, p); nfscl_reldevinfo(dip); } else { if (m2 != NULL) m_freem(m2); error = EIO; } tdrpc++; } if (m != NULL) m_freem(m); tdrpc = drpc; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; for (i = firstmirror; i < mirrorcnt - 1 && tdrpc != NULL; i++, tdrpc++) { /* * For the unused drpc entries, both inprog and * err == 0, so this loop won't break. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) tsleep(&tdrpc->tsk, PVFS, "clrpcio", timo); if (error == 0 && tdrpc->err != 0) error = tdrpc->err; } free(drpc, M_TEMP); if (error == 0) { if (mirrorcnt > 1 && rwaccess == NFSV4OPEN_ACCESSWRITE && docommit == 0) { NFSLOCKCLSTATE(); layp->nfsly_flags |= NFSLY_WRITTEN; NFSUNLOCKCLSTATE(); } lastbyte = off + xfer - 1; NFSLOCKCLSTATE(); if (lastbyte > layp->nfsly_lastbyte) layp->nfsly_lastbyte = lastbyte; NFSUNLOCKCLSTATE(); } else if (error == NFSERR_OPENMODE && rwaccess == NFSV4OPEN_ACCESSREAD) { NFSLOCKMNT(nmp); nmp->nm_state |= NFSSTA_OPENMODE; NFSUNLOCKMNT(nmp); } else error = EIO; if (error == 0) len -= (oresid - (uint64_t)uiop->uio_resid); else if (mirrorcnt > 1 && rwaccess == NFSV4OPEN_ACCESSWRITE && docommit == 0) { /* * In case the rpc gets retried, roll the * uio fields changed by nfsm_uiombuflist() * back. */ uiop->uio_offset = offs; uiop->uio_resid = resid; uiop->uio_iov->iov_base = iovbase; uiop->uio_iov->iov_len = iovlen; } } } if (lckp != NULL) nfscl_lockderef(lckp); NFSFREECRED(newcred); nfscl_rellayout(layp, 0); nfscl_relref(nmp); return (error); } /* * Find a file layout that will handle the first bytes of the requested * range and return the information from it needed to the I/O operation. */ int nfscl_findlayoutforio(struct nfscllayout *lyp, uint64_t off, uint32_t rwaccess, struct nfsclflayout **retflpp) { struct nfsclflayout *flp, *nflp, *rflp; uint32_t rw; rflp = NULL; rw = rwaccess; /* For reading, do the Read list first and then the Write list. */ do { if (rw == NFSV4OPEN_ACCESSREAD) flp = LIST_FIRST(&lyp->nfsly_flayread); else flp = LIST_FIRST(&lyp->nfsly_flayrw); while (flp != NULL) { nflp = LIST_NEXT(flp, nfsfl_list); if (flp->nfsfl_off > off) break; if (flp->nfsfl_end > off && (rflp == NULL || rflp->nfsfl_end < flp->nfsfl_end)) rflp = flp; flp = nflp; } if (rw == NFSV4OPEN_ACCESSREAD) rw = NFSV4OPEN_ACCESSWRITE; else rw = 0; } while (rw != 0); if (rflp != NULL) { /* This one covers the most bytes starting at off. */ *retflpp = rflp; return (0); } return (EIO); } /* * Do I/O using an NFSv4.1 or NFSv4.2 file layout. 
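 * The stripe position is computed from the offset relative to the
 * layout's pattern offset and the stripe unit size; dense layouts index
 * the file handle list by stripe position, sparse layouts by stripe
 * index.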
*/ static int nfscl_doflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, int *eofp, nfsv4stateid_t *stateidp, int rwflag, struct nfscldevinfo *dp, struct nfscllayout *lyp, struct nfsclflayout *flp, uint64_t off, uint64_t len, int docommit, struct ucred *cred, NFSPROC_T *p) { uint64_t io_off, rel_off, stripe_unit_size, transfer, xfer; int commit_thru_mds, error, stripe_index, stripe_pos, minorvers; struct nfsnode *np; struct nfsfh *fhp; struct nfsclds **dspp; np = VTONFS(vp); rel_off = off - flp->nfsfl_patoff; stripe_unit_size = flp->nfsfl_util & NFSFLAYUTIL_STRIPE_MASK; stripe_pos = (rel_off / stripe_unit_size + flp->nfsfl_stripe1) % dp->nfsdi_stripecnt; transfer = stripe_unit_size - (rel_off % stripe_unit_size); error = 0; /* Loop around, doing I/O for each stripe unit. */ while (len > 0 && error == 0) { stripe_index = nfsfldi_stripeindex(dp, stripe_pos); dspp = nfsfldi_addr(dp, stripe_index); if (((*dspp)->nfsclds_flags & NFSCLDS_MINORV2) != 0) minorvers = NFSV42_MINORVERSION; else minorvers = NFSV41_MINORVERSION; if (len > transfer && docommit == 0) xfer = transfer; else xfer = len; if ((flp->nfsfl_util & NFSFLAYUTIL_DENSE) != 0) { /* Dense layout. */ if (stripe_pos >= flp->nfsfl_fhcnt) return (EIO); fhp = flp->nfsfl_fh[stripe_pos]; io_off = (rel_off / (stripe_unit_size * dp->nfsdi_stripecnt)) * stripe_unit_size + rel_off % stripe_unit_size; } else { /* Sparse layout. */ if (flp->nfsfl_fhcnt > 1) { if (stripe_index >= flp->nfsfl_fhcnt) return (EIO); fhp = flp->nfsfl_fh[stripe_index]; } else if (flp->nfsfl_fhcnt == 1) fhp = flp->nfsfl_fh[0]; else fhp = np->n_fhp; io_off = off; } if ((flp->nfsfl_util & NFSFLAYUTIL_COMMIT_THRU_MDS) != 0) { commit_thru_mds = 1; if (docommit != 0) error = EIO; } else { commit_thru_mds = 0; NFSLOCKNODE(np); np->n_flag |= NDSCOMMIT; NFSUNLOCKNODE(np); } if (docommit != 0) { if (error == 0) error = nfsrpc_commitds(vp, io_off, xfer, *dspp, fhp, NFS_VER4, minorvers, cred, p); if (error == 0) { /* * Set both eof and uio_resid = 0 to end any * loops. */ *eofp = 1; uiop->uio_resid = 0; } else { NFSLOCKNODE(np); np->n_flag &= ~NDSCOMMIT; NFSUNLOCKNODE(np); } } else if (rwflag == NFSV4OPEN_ACCESSREAD) error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp, io_off, xfer, fhp, 0, NFS_VER4, minorvers, cred, p); else { error = nfsrpc_writeds(vp, uiop, iomode, must_commit, stateidp, *dspp, io_off, xfer, fhp, commit_thru_mds, 0, NFS_VER4, minorvers, cred, p); if (error == 0) { NFSLOCKCLSTATE(); lyp->nfsly_flags |= NFSLY_WRITTEN; NFSUNLOCKCLSTATE(); } } if (error == 0) { transfer = stripe_unit_size; stripe_pos = (stripe_pos + 1) % dp->nfsdi_stripecnt; len -= xfer; off += xfer; } } return (error); } /* * Do I/O using an NFSv4.1 flex file layout. */ static int nfscl_dofflayoutio(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, int *eofp, nfsv4stateid_t *stateidp, int rwflag, struct nfscldevinfo *dp, struct nfscllayout *lyp, struct nfsclflayout *flp, uint64_t off, uint64_t len, int mirror, int docommit, struct mbuf *mp, struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p) { uint64_t xfer; int error; struct nfsnode *np; struct nfsfh *fhp; struct nfsclds **dspp; struct ucred *tcred; struct mbuf *m, *m2; uint32_t copylen; np = VTONFS(vp); error = 0; NFSCL_DEBUG(4, "nfscl_dofflayoutio: off=%ju len=%ju\n", (uintmax_t)off, (uintmax_t)len); /* Loop around, doing I/O for each stripe unit. 
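	 * For the Flex File layout there is no striping, so each pass
	 * transfers at most the DS's rsize (reads) or wsize (writes) worth
	 * of data for the given mirror.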
*/ while (len > 0 && error == 0) { dspp = nfsfldi_addr(dp, 0); fhp = flp->nfsfl_ffm[mirror].fh[dp->nfsdi_versindex]; stateidp = &flp->nfsfl_ffm[mirror].st; NFSCL_DEBUG(4, "mirror=%d vind=%d fhlen=%d st.seqid=0x%x\n", mirror, dp->nfsdi_versindex, fhp->nfh_len, stateidp->seqid); if ((dp->nfsdi_flags & NFSDI_TIGHTCOUPLED) == 0) { tcred = NFSNEWCRED(cred); tcred->cr_uid = flp->nfsfl_ffm[mirror].user; tcred->cr_groups[0] = flp->nfsfl_ffm[mirror].group; tcred->cr_ngroups = 1; } else tcred = cred; if (rwflag == NFSV4OPEN_ACCESSREAD) copylen = dp->nfsdi_rsize; else { copylen = dp->nfsdi_wsize; if (len > copylen && mp != NULL) { /* * When a mirrored configuration needs to do * multiple writes to each mirror, all writes * except the last one must be a multiple of * 4 bytes. This is required so that the XDR * does not need padding. * If possible, clip the size to an exact * multiple of the mbuf length, so that the * split will be on an mbuf boundary. */ copylen &= 0xfffffffc; if (copylen > mp->m_len) copylen = copylen / mp->m_len * mp->m_len; } } NFSLOCKNODE(np); np->n_flag |= NDSCOMMIT; NFSUNLOCKNODE(np); if (len > copylen && docommit == 0) xfer = copylen; else xfer = len; if (docommit != 0) { if (error == 0) { /* * Do last mirrored DS commit with this thread. */ if (mirror < flp->nfsfl_mirrorcnt - 1) error = nfsio_commitds(vp, off, xfer, *dspp, fhp, dp->nfsdi_vers, dp->nfsdi_minorvers, drpc, tcred, p); else error = nfsrpc_commitds(vp, off, xfer, *dspp, fhp, dp->nfsdi_vers, dp->nfsdi_minorvers, tcred, p); NFSCL_DEBUG(4, "commitds=%d\n", error); if (error != 0 && error != EACCES && error != ESTALE) { NFSCL_DEBUG(4, "DS layreterr for commit\n"); nfscl_dserr(NFSV4OP_COMMIT, error, dp, lyp, *dspp); } } NFSCL_DEBUG(4, "aft nfsio_commitds=%d\n", error); if (error == 0) { /* * Set both eof and uio_resid = 0 to end any * loops. */ *eofp = 1; uiop->uio_resid = 0; } else { NFSLOCKNODE(np); np->n_flag &= ~NDSCOMMIT; NFSUNLOCKNODE(np); } } else if (rwflag == NFSV4OPEN_ACCESSREAD) { error = nfsrpc_readds(vp, uiop, stateidp, eofp, *dspp, off, xfer, fhp, 1, dp->nfsdi_vers, dp->nfsdi_minorvers, tcred, p); NFSCL_DEBUG(4, "readds=%d\n", error); if (error != 0 && error != EACCES && error != ESTALE) { NFSCL_DEBUG(4, "DS layreterr for read\n"); nfscl_dserr(NFSV4OP_READ, error, dp, lyp, *dspp); } } else { if (flp->nfsfl_mirrorcnt == 1) { error = nfsrpc_writeds(vp, uiop, iomode, must_commit, stateidp, *dspp, off, xfer, fhp, 0, 1, dp->nfsdi_vers, dp->nfsdi_minorvers, tcred, p); if (error == 0) { NFSLOCKCLSTATE(); lyp->nfsly_flags |= NFSLY_WRITTEN; NFSUNLOCKCLSTATE(); } } else { m = mp; if (xfer < len) { /* The mbuf list must be split. */ m2 = nfsm_split(mp, xfer); if (m2 != NULL) mp = m2; else { m_freem(mp); error = EIO; } } NFSCL_DEBUG(4, "mcopy len=%jd xfer=%jd\n", (uintmax_t)len, (uintmax_t)xfer); /* * Do last write to a mirrored DS with this * thread. 
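The write path above clips copylen so that every mirrored write except the last is a multiple of 4 bytes (so the XDR needs no padding) and, where possible, a multiple of the leading mbuf's length so the split falls on an mbuf boundary. The same clipping in isolation, with plain integers standing in for the mbuf chain and a hypothetical clip_copylen() name:

#include <stdint.h>
#include <stdio.h>

static uint32_t
clip_copylen(uint32_t copylen, uint32_t first_mbuf_len)
{
	copylen &= 0xfffffffc;			/* multiple of 4 for XDR */
	if (copylen > first_mbuf_len)
		/* Align the split to a whole number of leading mbufs. */
		copylen = copylen / first_mbuf_len * first_mbuf_len;
	return (copylen);
}

int
main(void)
{
	printf("%u\n", clip_copylen(65538, 2048));	/* -> 65536 */
	printf("%u\n", clip_copylen(1000, 2048));	/* -> 1000 */
	return (0);
}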
*/ if (error == 0) { if (mirror < flp->nfsfl_mirrorcnt - 1) error = nfsio_writedsmir(vp, iomode, must_commit, stateidp, *dspp, off, xfer, fhp, m, dp->nfsdi_vers, dp->nfsdi_minorvers, drpc, tcred, p); else error = nfsrpc_writedsmir(vp, iomode, must_commit, stateidp, *dspp, off, xfer, fhp, m, dp->nfsdi_vers, dp->nfsdi_minorvers, tcred, p); } NFSCL_DEBUG(4, "nfsio_writedsmir=%d\n", error); if (error != 0 && error != EACCES && error != ESTALE) { NFSCL_DEBUG(4, "DS layreterr for write\n"); nfscl_dserr(NFSV4OP_WRITE, error, dp, lyp, *dspp); } } } NFSCL_DEBUG(4, "aft read/writeds=%d\n", error); if (error == 0) { len -= xfer; off += xfer; } if ((dp->nfsdi_flags & NFSDI_TIGHTCOUPLED) == 0) NFSFREECRED(tcred); } NFSCL_DEBUG(4, "eo nfscl_dofflayoutio=%d\n", error); return (error); } /* * The actual read RPC done to a DS. */ static int nfsrpc_readds(vnode_t vp, struct uio *uiop, nfsv4stateid_t *stateidp, int *eofp, struct nfsclds *dsp, uint64_t io_off, int len, struct nfsfh *fhp, int flex, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int attrflag, error, retlen; struct nfsrv_descript nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsrv_descript *nd = &nfsd; struct nfssockreq *nrp; struct nfsvattr na; nd->nd_mrep = NULL; if (vers == 0 || vers == NFS_VER4) { nfscl_reqstart(nd, NFSPROC_READDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); vers = NFS_VER4; NFSCL_DEBUG(4, "nfsrpc_readds: vers4 minvers=%d\n", minorvers); if (flex != 0) nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); else nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSEQIDZERO); } else { nfscl_reqstart(nd, NFSPROC_READ, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_READ]); NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_READDS]); NFSCL_DEBUG(4, "nfsrpc_readds: vers3\n"); } NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED * 3); txdr_hyper(io_off, tl); *(tl + 2) = txdr_unsigned(len); nrp = dsp->nfsclds_sockp; NFSCL_DEBUG(4, "nfsrpc_readds: nrp=%p\n", nrp); if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_readds: stat=%d err=%d\n", nd->nd_repstat, error); if (error != 0) return (error); if (vers == NFS_VER3) { error = nfscl_postop_attr(nd, &na, &attrflag, NULL); NFSCL_DEBUG(4, "nfsrpc_readds: postop=%d\n", error); if (error != 0) goto nfsmout; } if (nd->nd_repstat != 0) { error = nd->nd_repstat; goto nfsmout; } if (vers == NFS_VER3) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); *eofp = fxdr_unsigned(int, *(tl + 1)); } else { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); *eofp = fxdr_unsigned(int, *tl); } NFSM_STRSIZ(retlen, len); NFSCL_DEBUG(4, "nfsrpc_readds: retlen=%d eof=%d\n", retlen, *eofp); error = nfsm_mbufuio(nd, uiop, retlen); nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); return (error); } /* * The actual write RPC done to a DS. 
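nfsrpc_readds() above marshals the READ arguments with txdr_hyper() and txdr_unsigned(), which put the 64-bit offset on the wire as two big-endian 32-bit words (high word first) followed by the 32-bit count. A userland sketch of that encoding, with htonl() standing in for the kernel byte-swap macros:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Offset goes out as two big-endian words, high word first, then the count. */
static void
xdr_read_args(uint32_t *tl, uint64_t io_off, uint32_t len)
{
	tl[0] = htonl((uint32_t)(io_off >> 32));
	tl[1] = htonl((uint32_t)io_off);
	tl[2] = htonl(len);
}

int
main(void)
{
	uint32_t buf[3];

	xdr_read_args(buf, 0x100000000ULL + 4096, 8192);
	printf("%08x %08x %08x\n", (unsigned)ntohl(buf[0]),
	    (unsigned)ntohl(buf[1]), (unsigned)ntohl(buf[2]));
	return (0);
}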
*/ static int nfsrpc_writeds(vnode_t vp, struct uio *uiop, int *iomode, int *must_commit, nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t io_off, int len, struct nfsfh *fhp, int commit_thru_mds, int flex, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int attrflag, error, rlen, commit, committed = NFSWRITE_FILESYNC; int32_t backup; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfssockreq *nrp; struct nfsvattr na; KASSERT(uiop->uio_iovcnt == 1, ("nfs: writerpc iovcnt > 1")); nd->nd_mrep = NULL; if (vers == 0 || vers == NFS_VER4) { nfscl_reqstart(nd, NFSPROC_WRITEDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSCL_DEBUG(4, "nfsrpc_writeds: vers4 minvers=%d\n", minorvers); vers = NFS_VER4; if (flex != 0) nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); else nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSEQIDZERO); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); } else { nfscl_reqstart(nd, NFSPROC_WRITE, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITE]); NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITEDS]); NFSCL_DEBUG(4, "nfsrpc_writeds: vers3\n"); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED); } txdr_hyper(io_off, tl); tl += 2; if (vers == NFS_VER3) *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); nfsm_uiombuf(nd, uiop, len); nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_writeds: err=%d stat=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat != 0) { /* * In case the rpc gets retried, roll - * the uio fileds changed by nfsm_uiombuf() + * the uio fields changed by nfsm_uiombuf() * back. */ uiop->uio_offset -= len; uiop->uio_resid += len; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - len; uiop->uio_iov->iov_len += len; error = nd->nd_repstat; } else { if (vers == NFS_VER3) { error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL, NULL); NFSCL_DEBUG(4, "nfsrpc_writeds: wcc_data=%d\n", error); if (error != 0) goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); NFSCL_DEBUG(4, "nfsrpc_writeds: len=%d rlen=%d\n", len, rlen); if (rlen == 0) { error = NFSERR_IO; goto nfsmout; } else if (rlen < len) { backup = len - rlen; uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup; uiop->uio_iov->iov_len += backup; uiop->uio_offset -= backup; uiop->uio_resid += backup; len = rlen; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest commitment level * obtained by any of the RPCs. 
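nfsrpc_writeds() above rolls the uio back by the full length when the reply status is an error, and backs out only the unwritten tail when the server reports a short write. A simplified model of that adjustment, with a small struct in place of the kernel uio/iovec pair and a hypothetical rollback() helper:

#include <stdint.h>
#include <stdio.h>

struct io_cursor {
	uint64_t offset;	/* file offset of the next byte to send */
	uint64_t resid;		/* bytes still to be written */
	char *base;		/* current position in the data buffer */
	uint64_t iov_len;	/* bytes left in the buffer */
};

/* Undo the advance made while marshalling "len" bytes of write data. */
static void
rollback(struct io_cursor *c, uint64_t len)
{
	c->offset -= len;
	c->resid += len;
	c->base -= len;
	c->iov_len += len;
}

int
main(void)
{
	char data[8192];
	/* Cursor as it looks after marshalling a 4096-byte write. */
	struct io_cursor c = { 4096, 4096, data + 4096, 4096 };

	/* Server wrote only 1024 bytes: back out the unwritten 3072. */
	rollback(&c, 4096 - 1024);
	printf("offset=%ju resid=%ju left=%ju\n", (uintmax_t)c.offset,
	    (uintmax_t)c.resid, (uintmax_t)c.iov_len);
	return (0);
}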
*/ if (committed == NFSWRITE_FILESYNC) committed = commit; else if (committed == NFSWRITE_DATASYNC && commit == NFSWRITE_UNSTABLE) committed = commit; if (commit_thru_mds != 0) { NFSLOCKMNT(nmp); if (!NFSHASWRITEVERF(nmp)) { NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); NFSSETWRITEVERF(nmp); } else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF)) { *must_commit = 1; NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); } NFSUNLOCKMNT(nmp); } else { NFSLOCKDS(dsp); if ((dsp->nfsclds_flags & NFSCLDS_HASWRITEVERF) == 0) { NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); dsp->nfsclds_flags |= NFSCLDS_HASWRITEVERF; } else if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) { *must_commit = 1; NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); } NFSUNLOCKDS(dsp); } } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); *iomode = committed; if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; return (error); } /* * The actual write RPC done to a DS. * This variant is called from a separate kernel process for mirrors. * Any short write is considered an IO error. */ static int nfsrpc_writedsmir(vnode_t vp, int *iomode, int *must_commit, nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t io_off, int len, struct nfsfh *fhp, struct mbuf *m, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsmount *nmp = VFSTONFS(vp->v_mount); int attrflag, error, commit, committed = NFSWRITE_FILESYNC, rlen; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfssockreq *nrp; struct nfsvattr na; nd->nd_mrep = NULL; if (vers == 0 || vers == NFS_VER4) { nfscl_reqstart(nd, NFSPROC_WRITEDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); vers = NFS_VER4; NFSCL_DEBUG(4, "nfsrpc_writedsmir: vers4 minvers=%d\n", minorvers); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); } else { nfscl_reqstart(nd, NFSPROC_WRITE, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITE]); NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_WRITEDS]); NFSCL_DEBUG(4, "nfsrpc_writedsmir: vers3\n"); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED); } txdr_hyper(io_off, tl); tl += 2; if (vers == NFS_VER3) *tl++ = txdr_unsigned(len); *tl++ = txdr_unsigned(*iomode); *tl = txdr_unsigned(len); if (len > 0) { /* Put data in mbuf chain. */ nd->nd_mb->m_next = m; } nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_writedsmir: err=%d stat=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat != 0) error = nd->nd_repstat; else { if (vers == NFS_VER3) { error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL, NULL); NFSCL_DEBUG(4, "nfsrpc_writedsmir: wcc_data=%d\n", error); if (error != 0) goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF); rlen = fxdr_unsigned(int, *tl++); NFSCL_DEBUG(4, "nfsrpc_writedsmir: len=%d rlen=%d\n", len, rlen); if (rlen != len) { error = NFSERR_IO; NFSCL_DEBUG(4, "nfsrpc_writedsmir: len=%d rlen=%d\n", len, rlen); goto nfsmout; } commit = fxdr_unsigned(int, *tl++); /* * Return the lowest commitment level * obtained by any of the RPCs. 
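Two pieces of post-write bookkeeping appear above: the caller keeps the weakest commitment level returned by any of the write RPCs, and a changed write verifier forces a commit and rewrite of unstable data. A compact sketch of both, with stand-in WR_* constants mirroring the NFSWRITE_* ordering (FILESYNC strongest, UNSTABLE weakest) and an opaque 8-byte verifier:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define WR_UNSTABLE	0
#define WR_DATASYNC	1
#define WR_FILESYNC	2

static int
merge_commit(int committed, int commit)
{
	if (committed == WR_FILESYNC)
		committed = commit;
	else if (committed == WR_DATASYNC && commit == WR_UNSTABLE)
		committed = commit;
	return (committed);
}

/* Returns 1 if the caller must commit and resend unstable writes. */
static int
check_verf(uint8_t stored[8], int *has_verf, const uint8_t reply[8])
{
	if (!*has_verf) {
		memcpy(stored, reply, 8);
		*has_verf = 1;
		return (0);
	}
	if (memcmp(stored, reply, 8) != 0) {
		memcpy(stored, reply, 8);
		return (1);	/* verifier changed: server lost unstable data */
	}
	return (0);
}

int
main(void)
{
	uint8_t verf[8];
	int has_verf = 0;
	const uint8_t v1[8] = { 1 }, v2[8] = { 2 };

	printf("level=%d\n", merge_commit(WR_FILESYNC, WR_UNSTABLE));
	printf("must_commit=%d\n", check_verf(verf, &has_verf, v1));
	printf("must_commit=%d\n", check_verf(verf, &has_verf, v2));
	return (0);
}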
*/ if (committed == NFSWRITE_FILESYNC) committed = commit; else if (committed == NFSWRITE_DATASYNC && commit == NFSWRITE_UNSTABLE) committed = commit; NFSLOCKDS(dsp); if ((dsp->nfsclds_flags & NFSCLDS_HASWRITEVERF) == 0) { NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); dsp->nfsclds_flags |= NFSCLDS_HASWRITEVERF; } else if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) { *must_commit = 1; NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); } NFSUNLOCKDS(dsp); } nfsmout: if (nd->nd_mrep != NULL) m_freem(nd->nd_mrep); *iomode = committed; if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; return (error); } /* * Start up the thread that will execute nfsrpc_writedsmir(). */ static void start_writedsmir(void *arg, int pending) { struct nfsclwritedsdorpc *drpc; drpc = (struct nfsclwritedsdorpc *)arg; drpc->err = nfsrpc_writedsmir(drpc->vp, &drpc->iomode, &drpc->must_commit, drpc->stateidp, drpc->dsp, drpc->off, drpc->len, drpc->fhp, drpc->m, drpc->vers, drpc->minorvers, drpc->cred, drpc->p); drpc->done = 1; NFSCL_DEBUG(4, "start_writedsmir: err=%d\n", drpc->err); } /* * Set up the write DS mirror call for the pNFS I/O thread. */ static int nfsio_writedsmir(vnode_t vp, int *iomode, int *must_commit, nfsv4stateid_t *stateidp, struct nfsclds *dsp, uint64_t off, int len, struct nfsfh *fhp, struct mbuf *m, int vers, int minorvers, struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p) { int error, ret; error = 0; drpc->done = 0; drpc->vp = vp; drpc->iomode = *iomode; drpc->must_commit = *must_commit; drpc->stateidp = stateidp; drpc->dsp = dsp; drpc->off = off; drpc->len = len; drpc->fhp = fhp; drpc->m = m; drpc->vers = vers; drpc->minorvers = minorvers; drpc->cred = cred; drpc->p = p; drpc->inprog = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_writedsmir, drpc); NFSCL_DEBUG(4, "nfsio_writedsmir: nfs_pnfsio=%d\n", ret); } if (ret != 0) error = nfsrpc_writedsmir(vp, iomode, must_commit, stateidp, dsp, off, len, fhp, m, vers, minorvers, cred, p); NFSCL_DEBUG(4, "nfsio_writedsmir: error=%d\n", error); return (error); } /* * Free up the nfsclds structure. */ void nfscl_freenfsclds(struct nfsclds *dsp) { int i; if (dsp == NULL) return; if (dsp->nfsclds_sockp != NULL) { NFSFREECRED(dsp->nfsclds_sockp->nr_cred); NFSFREEMUTEX(&dsp->nfsclds_sockp->nr_mtx); free(dsp->nfsclds_sockp->nr_nam, M_SONAME); free(dsp->nfsclds_sockp, M_NFSSOCKREQ); } NFSFREEMUTEX(&dsp->nfsclds_mtx); NFSFREEMUTEX(&dsp->nfsclds_sess.nfsess_mtx); for (i = 0; i < NFSV4_CBSLOTS; i++) { if (dsp->nfsclds_sess.nfsess_cbslots[i].nfssl_reply != NULL) m_freem( dsp->nfsclds_sess.nfsess_cbslots[i].nfssl_reply); } free(dsp, M_NFSCLDS); } static enum nfsclds_state nfscl_getsameserver(struct nfsmount *nmp, struct nfsclds *newdsp, struct nfsclds **retdspp, uint32_t *sequencep) { struct nfsclds *dsp; int fndseq; /* * Search the list of nfsclds structures for one with the same * server. */ fndseq = 0; TAILQ_FOREACH(dsp, &nmp->nm_sess, nfsclds_list) { if (dsp->nfsclds_servownlen == newdsp->nfsclds_servownlen && dsp->nfsclds_servownlen != 0 && !NFSBCMP(dsp->nfsclds_serverown, newdsp->nfsclds_serverown, dsp->nfsclds_servownlen) && dsp->nfsclds_sess.nfsess_defunct == 0) { NFSCL_DEBUG(4, "fnd same fdsp=%p dsp=%p flg=0x%x\n", TAILQ_FIRST(&nmp->nm_sess), dsp, dsp->nfsclds_flags); if (fndseq == 0) { /* Get sequenceid# from first entry. */ *sequencep = dsp->nfsclds_sess.nfsess_sequenceid; fndseq = 1; } /* Server major id matches. 
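nfsio_writedsmir() above hands the mirrored write to a pNFS I/O task thread when one is configured and falls back to calling nfsrpc_writedsmir() in-line when the hand-off fails. A loose pthread-based analogy of that dispatch-or-do-it-inline pattern; the kernel uses its task threads and tsleep(), not pthread_create() and pthread_join():

#include <pthread.h>
#include <stdio.h>

struct dorpc {
	int arg;	/* stands in for the marshalled RPC arguments */
	int err;	/* result filled in by whoever runs the RPC */
	int done;	/* set once the RPC has completed */
};

static int
run_rpc(int arg)
{
	return (arg < 0 ? 5 : 0);	/* 5 stands in for EIO */
}

static void *
worker(void *v)
{
	struct dorpc *drpc = v;

	drpc->err = run_rpc(drpc->arg);
	drpc->done = 1;
	return (NULL);
}

/* Hand the RPC to a helper thread if possible, else run it in-line. */
static int
dispatch_rpc(struct dorpc *drpc, pthread_t *tp, int have_threads, int *asyncp)
{
	*asyncp = 0;
	drpc->done = 0;
	if (have_threads && pthread_create(tp, NULL, worker, drpc) == 0) {
		*asyncp = 1;
		return (0);		/* result arrives later in drpc->err */
	}
	return (run_rpc(drpc->arg));	/* synchronous fallback */
}

int
main(void)
{
	struct dorpc drpc = { 1, 0, 0 };
	pthread_t t;
	int async, error;

	error = dispatch_rpc(&drpc, &t, 1, &async);
	if (async) {
		/* The kernel waits with tsleep() on the task instead. */
		pthread_join(t, NULL);
		error = drpc.err;
	}
	printf("error=%d\n", error);
	return (0);
}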
*/ if ((dsp->nfsclds_flags & NFSCLDS_DS) != 0) { *retdspp = dsp; return (NFSDSP_USETHISSESSION); } } } if (fndseq != 0) return (NFSDSP_SEQTHISSESSION); return (NFSDSP_NOTFOUND); } /* * NFS commit rpc to a NFSv4.1 DS. */ static int nfsrpc_commitds(vnode_t vp, uint64_t offset, int cnt, struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfssockreq *nrp; struct nfsvattr na; int attrflag, error; nd->nd_mrep = NULL; if (vers == 0 || vers == NFS_VER4) { nfscl_reqstart(nd, NFSPROC_COMMITDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); vers = NFS_VER4; } else { nfscl_reqstart(nd, NFSPROC_COMMIT, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); NFSDECRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_COMMIT]); NFSINCRGLOBAL(nfsstatsv1.rpccnt[NFSPROC_COMMITDS]); } NFSCL_DEBUG(4, "nfsrpc_commitds: vers=%d minvers=%d\n", vers, minorvers); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_commitds: err=%d stat=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat == 0) { if (vers == NFS_VER3) { error = nfscl_wcc_data(nd, vp, &na, &attrflag, NULL, NULL); NFSCL_DEBUG(4, "nfsrpc_commitds: wccdata=%d\n", error); if (error != 0) goto nfsmout; } NFSM_DISSECT(tl, u_int32_t *, NFSX_VERF); NFSLOCKDS(dsp); if (NFSBCMP(tl, dsp->nfsclds_verf, NFSX_VERF)) { NFSBCOPY(tl, dsp->nfsclds_verf, NFSX_VERF); error = NFSERR_STALEWRITEVERF; } NFSUNLOCKDS(dsp); } nfsmout: if (error == 0 && nd->nd_repstat != 0) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Start up the thread that will execute nfsrpc_commitds(). */ static void start_commitds(void *arg, int pending) { struct nfsclwritedsdorpc *drpc; drpc = (struct nfsclwritedsdorpc *)arg; drpc->err = nfsrpc_commitds(drpc->vp, drpc->off, drpc->len, drpc->dsp, drpc->fhp, drpc->vers, drpc->minorvers, drpc->cred, drpc->p); drpc->done = 1; NFSCL_DEBUG(4, "start_commitds: err=%d\n", drpc->err); } /* * Set up the commit DS mirror call for the pNFS I/O thread. 
*/ static int nfsio_commitds(vnode_t vp, uint64_t offset, int cnt, struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers, struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p) { int error, ret; error = 0; drpc->done = 0; drpc->vp = vp; drpc->off = offset; drpc->len = cnt; drpc->dsp = dsp; drpc->fhp = fhp; drpc->vers = vers; drpc->minorvers = minorvers; drpc->cred = cred; drpc->p = p; drpc->inprog = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_commitds, drpc); NFSCL_DEBUG(4, "nfsio_commitds: nfs_pnfsio=%d\n", ret); } if (ret != 0) error = nfsrpc_commitds(vp, offset, cnt, dsp, fhp, vers, minorvers, cred, p); NFSCL_DEBUG(4, "nfsio_commitds: error=%d\n", error); return (error); } /* * NFS Advise rpc */ int nfsrpc_advise(vnode_t vp, off_t offset, uint64_t cnt, int advise, struct ucred *cred, NFSPROC_T *p) { u_int32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; nfsattrbit_t hints; int error; NFSZERO_ATTRBIT(&hints); if (advise == POSIX_FADV_WILLNEED) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED); else if (advise == POSIX_FADV_DONTNEED) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED); else return (0); NFSCL_REQSTART(nd, NFSPROC_IOADVISE, vp); nfsm_stateidtom(nd, NULL, NFSSTATEID_PUTALLZERO); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER); txdr_hyper(offset, tl); tl += 2; txdr_hyper(cnt, tl); nfsrv_putattrbit(nd, &hints); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat != 0) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } #ifdef notyet /* * NFS advise rpc to a NFSv4.2 DS. */ static int nfsrpc_adviseds(vnode_t vp, uint64_t offset, int cnt, int advise, struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfssockreq *nrp; nfsattrbit_t hints; int error; /* For NFS DSs prior to NFSv4.2, just return OK. */ if (vers == NFS_VER3 || minorversion < NFSV42_MINORVERSION) return (0); NFSZERO_ATTRBIT(&hints); if (advise == POSIX_FADV_WILLNEED) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_WILLNEED); else if (advise == POSIX_FADV_DONTNEED) NFSSETBIT_ATTRBIT(&hints, NFSV4IOHINT_DONTNEED); else return (0); nd->nd_mrep = NULL; nfscl_reqstart(nd, NFSPROC_IOADVISEDS, nmp, fhp->nfh_fh, fhp->nfh_len, NULL, &dsp->nfsclds_sess, vers, minorvers); vers = NFS_VER4; NFSCL_DEBUG(4, "nfsrpc_adviseds: vers=%d minvers=%d\n", vers, minorvers); nfsm_stateidtom(nd, NULL, NFSSTATEID_PUTALLZERO); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); nfsrv_putattrbit(nd, &hints); nrp = dsp->nfsclds_sockp; if (nrp == NULL) /* If NULL, use the MDS socket. */ nrp = &nmp->nm_sockreq; error = newnfs_request(nd, nmp, NULL, nrp, vp, p, cred, NFS_PROG, vers, NULL, 1, NULL, &dsp->nfsclds_sess); NFSCL_DEBUG(4, "nfsrpc_adviseds: err=%d stat=%d\n", error, nd->nd_repstat); if (error != 0) return (error); if (nd->nd_repstat != 0) error = nd->nd_repstat; m_freem(nd->nd_mrep); return (error); } /* * Start up the thread that will execute nfsrpc_commitds(). 
*/ static void start_adviseds(void *arg, int pending) { struct nfsclwritedsdorpc *drpc; drpc = (struct nfsclwritedsdorpc *)arg; drpc->err = nfsrpc_adviseds(drpc->vp, drpc->off, drpc->len, drpc->advise, drpc->dsp, drpc->fhp, drpc->vers, drpc->minorvers, drpc->cred, drpc->p); drpc->done = 1; NFSCL_DEBUG(4, "start_adviseds: err=%d\n", drpc->err); } /* * Set up the commit DS mirror call for the pNFS I/O thread. */ static int nfsio_adviseds(vnode_t vp, uint64_t offset, int cnt, int advise, struct nfsclds *dsp, struct nfsfh *fhp, int vers, int minorvers, struct nfsclwritedsdorpc *drpc, struct ucred *cred, NFSPROC_T *p) { int error, ret; error = 0; drpc->done = 0; drpc->vp = vp; drpc->off = offset; drpc->len = cnt; drpc->advise = advise; drpc->dsp = dsp; drpc->fhp = fhp; drpc->vers = vers; drpc->minorvers = minorvers; drpc->cred = cred; drpc->p = p; drpc->inprog = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_adviseds, drpc); NFSCL_DEBUG(4, "nfsio_adviseds: nfs_pnfsio=%d\n", ret); } if (ret != 0) error = nfsrpc_adviseds(vp, offset, cnt, advise, dsp, fhp, vers, minorvers, cred, p); NFSCL_DEBUG(4, "nfsio_adviseds: error=%d\n", error); return (error); } #endif /* notyet */ /* * Do the Allocate operation, retrying for recovery. */ int nfsrpc_allocate(vnode_t vp, off_t off, off_t len, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p, void *stuff) { int error, expireret = 0, retrycnt, nostateid; uint32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsfh *nfhp = NULL; nfsv4stateid_t stateid; off_t tmp_off; void *lckp; if (len < 0) return (EINVAL); if (len == 0) return (0); tmp_off = off + len; NFSLOCKMNT(nmp); if (tmp_off > nmp->nm_maxfilesize || tmp_off < off) { NFSUNLOCKMNT(nmp); return (EFBIG); } if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; NFSUNLOCKMNT(nmp); nfhp = VTONFS(vp)->n_fhp; retrycnt = 0; do { lckp = NULL; nostateid = 0; nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0, cred, p, &stateid, &lckp); if (stateid.other[0] == 0 && stateid.other[1] == 0 && stateid.other[2] == 0) { nostateid = 1; NFSCL_DEBUG(1, "stateid0 in allocate\n"); } /* * Not finding a stateid should probably never happen, * but just return an error for this case. */ if (nostateid != 0) error = EIO; else error = nfsrpc_allocaterpc(vp, off, len, &stateid, nap, attrflagp, cred, p, stuff); if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_allocate"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, p); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_DELAY || error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error != 0 && retrycnt >= 4) error = EIO; return (error); } /* * The allocate RPC. 
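nfsrpc_allocate() above wraps the RPC in a retry loop: recoverable errors such as NFSERR_GRACE and NFSERR_DELAY are retried after a nap, NFSERR_OLDSTATEID for up to 20 attempts, and expired or bad stateids for up to 4. A condensed model of that classification, with stand-in error constants and a single flag in place of the clientid-revision and expiry checks:

#include <stdio.h>

enum nfs_err {
	NFS_OK = 0, ERR_GRACE, ERR_DELAY, ERR_STALESTATEID, ERR_BADSESSION,
	ERR_STALEDONTRECOVER, ERR_OLDSTATEID, ERR_EXPIRED, ERR_BADSTATEID
};

static int
should_retry(enum nfs_err error, int retrycnt, int expired)
{
	switch (error) {
	case ERR_GRACE:
	case ERR_DELAY:
	case ERR_STALESTATEID:
	case ERR_BADSESSION:
	case ERR_STALEDONTRECOVER:
		return (1);		/* always worth another try */
	case ERR_OLDSTATEID:
		return (retrycnt < 20);
	case ERR_EXPIRED:
	case ERR_BADSTATEID:
		return (!expired && retrycnt < 4);
	default:
		return (0);
	}
}

int
main(void)
{
	printf("%d %d %d\n", should_retry(ERR_DELAY, 100, 0),
	    should_retry(ERR_OLDSTATEID, 25, 0),
	    should_retry(ERR_BADSTATEID, 2, 0));
	return (0);
}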
*/ static int nfsrpc_allocaterpc(vnode_t vp, off_t off, off_t len, nfsv4stateid_t *stateidp, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p, void *stuff) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_ALLOCATE, vp); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED); txdr_hyper(off, tl); tl += 2; txdr_hyper(len, tl); tl += 2; *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, stuff); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = NFS_LATTR_NOSHRINK; } else error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Set up the XDR arguments for the LayoutGet operation. */ static void nfsrv_setuplayoutget(struct nfsrv_descript *nd, int iomode, uint64_t offset, uint64_t len, uint64_t minlen, nfsv4stateid_t *stateidp, int layouttype, int layoutlen, int usecurstateid) { uint32_t *tl; NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED + 3 * NFSX_HYPER + NFSX_STATEID); *tl++ = newnfs_false; /* Don't signal availability. */ *tl++ = txdr_unsigned(layouttype); *tl++ = txdr_unsigned(iomode); txdr_hyper(offset, tl); tl += 2; txdr_hyper(len, tl); tl += 2; txdr_hyper(minlen, tl); tl += 2; if (usecurstateid != 0) { /* Special stateid for Current stateid. */ *tl++ = txdr_unsigned(1); *tl++ = 0; *tl++ = 0; *tl++ = 0; } else { *tl++ = txdr_unsigned(stateidp->seqid); NFSCL_DEBUG(4, "layget seq=%d\n", (int)stateidp->seqid); *tl++ = stateidp->other[0]; *tl++ = stateidp->other[1]; *tl++ = stateidp->other[2]; } *tl = txdr_unsigned(layoutlen); } /* * Parse the reply for a successful LayoutGet operation. */ static int nfsrv_parselayoutget(struct nfsmount *nmp, struct nfsrv_descript *nd, nfsv4stateid_t *stateidp, int *retonclosep, struct nfsclflayouthead *flhp) { uint32_t *tl; struct nfsclflayout *flp, *prevflp, *tflp; int cnt, error, fhcnt, gotiomode, i, iomode, j, k, l, laytype, nfhlen; int m, mirrorcnt; uint64_t retlen, off; struct nfsfh *nfhp; uint8_t *cp; uid_t user; gid_t grp; NFSCL_DEBUG(4, "in nfsrv_parselayoutget\n"); error = 0; flp = NULL; gotiomode = -1; NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_STATEID); if (*tl++ != 0) *retonclosep = 1; else *retonclosep = 0; stateidp->seqid = fxdr_unsigned(uint32_t, *tl++); NFSCL_DEBUG(4, "retoncls=%d stseq=%d\n", *retonclosep, (int)stateidp->seqid); stateidp->other[0] = *tl++; stateidp->other[1] = *tl++; stateidp->other[2] = *tl++; cnt = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "layg cnt=%d\n", cnt); if (cnt <= 0 || cnt > 10000) { /* Don't accept more than 10000 layouts in reply. */ error = NFSERR_BADXDR; goto nfsmout; } for (i = 0; i < cnt; i++) { /* Dissect to the layout type. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + 3 * NFSX_UNSIGNED); off = fxdr_hyper(tl); tl += 2; retlen = fxdr_hyper(tl); tl += 2; iomode = fxdr_unsigned(int, *tl++); laytype = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "layt=%d off=%ju len=%ju iom=%d\n", laytype, (uintmax_t)off, (uintmax_t)retlen, iomode); /* Ignore length of layout body for now. */ if (laytype == NFSLAYOUT_NFSV4_1_FILES) { /* Parse the File layout up to fhcnt. 
*/ NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED + NFSX_HYPER + NFSX_V4DEVICEID); fhcnt = fxdr_unsigned(int, *(tl + 4 + NFSX_V4DEVICEID / NFSX_UNSIGNED)); NFSCL_DEBUG(4, "fhcnt=%d\n", fhcnt); if (fhcnt < 0 || fhcnt > 100) { /* Don't accept more than 100 file handles. */ error = NFSERR_BADXDR; goto nfsmout; } if (fhcnt > 0) flp = malloc(sizeof(*flp) + fhcnt * sizeof(struct nfsfh *), M_NFSFLAYOUT, M_WAITOK); else flp = malloc(sizeof(*flp), M_NFSFLAYOUT, M_WAITOK); flp->nfsfl_flags = NFSFL_FILE; flp->nfsfl_fhcnt = 0; flp->nfsfl_devp = NULL; flp->nfsfl_off = off; if (flp->nfsfl_off + retlen < flp->nfsfl_off) flp->nfsfl_end = UINT64_MAX - flp->nfsfl_off; else flp->nfsfl_end = flp->nfsfl_off + retlen; flp->nfsfl_iomode = iomode; if (gotiomode == -1) gotiomode = flp->nfsfl_iomode; /* Ignore layout body length for now. */ NFSBCOPY(tl, flp->nfsfl_dev, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); flp->nfsfl_util = fxdr_unsigned(uint32_t, *tl++); NFSCL_DEBUG(4, "flutil=0x%x\n", flp->nfsfl_util); mtx_lock(&nmp->nm_mtx); if (nmp->nm_minorvers > 1 && (flp->nfsfl_util & NFSFLAYUTIL_IOADVISE_THRU_MDS) != 0) nmp->nm_privflag |= NFSMNTP_IOADVISETHRUMDS; mtx_unlock(&nmp->nm_mtx); flp->nfsfl_stripe1 = fxdr_unsigned(uint32_t, *tl++); flp->nfsfl_patoff = fxdr_hyper(tl); tl += 2; NFSCL_DEBUG(4, "stripe1=%u poff=%ju\n", flp->nfsfl_stripe1, (uintmax_t)flp->nfsfl_patoff); for (j = 0; j < fhcnt; j++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); nfhlen = fxdr_unsigned(int, *tl); if (nfhlen <= 0 || nfhlen > NFSX_V4FHMAX) { error = NFSERR_BADXDR; goto nfsmout; } nfhp = malloc(sizeof(*nfhp) + nfhlen - 1, M_NFSFH, M_WAITOK); flp->nfsfl_fh[j] = nfhp; flp->nfsfl_fhcnt++; nfhp->nfh_len = nfhlen; NFSM_DISSECT(cp, uint8_t *, NFSM_RNDUP(nfhlen)); NFSBCOPY(cp, nfhp->nfh_fh, nfhlen); } } else if (laytype == NFSLAYOUT_FLEXFILE) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED + NFSX_HYPER); mirrorcnt = fxdr_unsigned(int, *(tl + 2)); NFSCL_DEBUG(4, "mirrorcnt=%d\n", mirrorcnt); if (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS) { error = NFSERR_BADXDR; goto nfsmout; } flp = malloc(sizeof(*flp) + mirrorcnt * sizeof(struct nfsffm), M_NFSFLAYOUT, M_WAITOK); flp->nfsfl_flags = NFSFL_FLEXFILE; flp->nfsfl_mirrorcnt = mirrorcnt; for (j = 0; j < mirrorcnt; j++) flp->nfsfl_ffm[j].devp = NULL; flp->nfsfl_off = off; if (flp->nfsfl_off + retlen < flp->nfsfl_off) flp->nfsfl_end = UINT64_MAX - flp->nfsfl_off; else flp->nfsfl_end = flp->nfsfl_off + retlen; flp->nfsfl_iomode = iomode; if (gotiomode == -1) gotiomode = flp->nfsfl_iomode; flp->nfsfl_stripeunit = fxdr_hyper(tl); NFSCL_DEBUG(4, "stripeunit=%ju\n", (uintmax_t)flp->nfsfl_stripeunit); for (j = 0; j < mirrorcnt; j++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); k = fxdr_unsigned(int, *tl); if (k < 1 || k > 128) { error = NFSERR_BADXDR; goto nfsmout; } NFSCL_DEBUG(4, "servercnt=%d\n", k); for (l = 0; l < k; l++) { NFSM_DISSECT(tl, uint32_t *, NFSX_V4DEVICEID + NFSX_STATEID + 2 * NFSX_UNSIGNED); if (l == 0) { /* Just use the first server. 
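The layout parsing above clamps the end of a byte range when offset plus length would wrap a 64-bit counter. The same check in isolation, printing a wrapping and a non-wrapping case; range_end() is a hypothetical name:

#include <stdint.h>
#include <stdio.h>

static uint64_t
range_end(uint64_t off, uint64_t len)
{
	if (off + len < off)			/* unsigned wrap-around */
		return (UINT64_MAX - off);	/* clamp as the parser does */
	return (off + len);
}

int
main(void)
{
	printf("%ju\n", (uintmax_t)range_end(4096, 8192));
	printf("%ju\n", (uintmax_t)range_end(100, UINT64_MAX));
	return (0);
}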
*/ NFSBCOPY(tl, flp->nfsfl_ffm[j].dev, NFSX_V4DEVICEID); tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); tl++; flp->nfsfl_ffm[j].st.seqid = *tl++; flp->nfsfl_ffm[j].st.other[0] = *tl++; flp->nfsfl_ffm[j].st.other[1] = *tl++; flp->nfsfl_ffm[j].st.other[2] = *tl++; NFSCL_DEBUG(4, "st.seqid=%u " "st.o0=0x%x st.o1=0x%x " "st.o2=0x%x\n", flp->nfsfl_ffm[j].st.seqid, flp->nfsfl_ffm[j].st.other[0], flp->nfsfl_ffm[j].st.other[1], flp->nfsfl_ffm[j].st.other[2]); } else tl += ((NFSX_V4DEVICEID + NFSX_STATEID + NFSX_UNSIGNED) / NFSX_UNSIGNED); fhcnt = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "fhcnt=%d\n", fhcnt); if (fhcnt < 1 || fhcnt > NFSDEV_MAXVERS) { error = NFSERR_BADXDR; goto nfsmout; } for (m = 0; m < fhcnt; m++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); nfhlen = fxdr_unsigned(int, *tl); NFSCL_DEBUG(4, "nfhlen=%d\n", nfhlen); if (nfhlen <= 0 || nfhlen > NFSX_V4FHMAX) { error = NFSERR_BADXDR; goto nfsmout; } NFSM_DISSECT(cp, uint8_t *, NFSM_RNDUP(nfhlen)); if (l == 0) { flp->nfsfl_ffm[j].fhcnt = fhcnt; nfhp = malloc( sizeof(*nfhp) + nfhlen - 1, M_NFSFH, M_WAITOK); flp->nfsfl_ffm[j].fh[m] = nfhp; nfhp->nfh_len = nfhlen; NFSBCOPY(cp, nfhp->nfh_fh, nfhlen); NFSCL_DEBUG(4, "got fh\n"); } } /* Now, get the ffsd_user/ffds_group. */ error = nfsrv_parseug(nd, 0, &user, &grp, curthread); NFSCL_DEBUG(4, "after parseu=%d\n", error); if (error == 0) error = nfsrv_parseug(nd, 1, &user, &grp, curthread); NFSCL_DEBUG(4, "aft parseg=%d\n", grp); if (error != 0) goto nfsmout; NFSCL_DEBUG(4, "user=%d group=%d\n", user, grp); if (l == 0) { flp->nfsfl_ffm[j].user = user; flp->nfsfl_ffm[j].group = grp; NFSCL_DEBUG(4, "usr=%d grp=%d\n", user, grp); } } } NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); flp->nfsfl_fflags = fxdr_unsigned(uint32_t, *tl++); #ifdef notnow /* * At this time, there is no flag. * NFSFLEXFLAG_IOADVISE_THRU_MDS might need to be * added, or it may never exist? */ mtx_lock(&nmp->nm_mtx); if (nmp->nm_minorvers > 1 && (flp->nfsfl_fflags & NFSFLEXFLAG_IOADVISE_THRU_MDS) != 0) nmp->nm_privflag |= NFSMNTP_IOADVISETHRUMDS; mtx_unlock(&nmp->nm_mtx); #endif flp->nfsfl_statshint = fxdr_unsigned(uint32_t, *tl); NFSCL_DEBUG(4, "fflags=0x%x statshint=%d\n", flp->nfsfl_fflags, flp->nfsfl_statshint); } else { error = NFSERR_BADXDR; goto nfsmout; } if (flp->nfsfl_iomode == gotiomode) { /* Keep the list in increasing offset order. */ tflp = LIST_FIRST(flhp); prevflp = NULL; while (tflp != NULL && tflp->nfsfl_off < flp->nfsfl_off) { prevflp = tflp; tflp = LIST_NEXT(tflp, nfsfl_list); } if (prevflp == NULL) LIST_INSERT_HEAD(flhp, flp, nfsfl_list); else LIST_INSERT_AFTER(prevflp, flp, nfsfl_list); NFSCL_DEBUG(4, "flp inserted\n"); } else { printf("nfscl_layoutget(): got wrong iomode\n"); nfscl_freeflayout(flp); } flp = NULL; } nfsmout: NFSCL_DEBUG(4, "eo nfsrv_parselayoutget=%d\n", error); if (error != 0 && flp != NULL) nfscl_freeflayout(flp); return (error); } /* * Parse a user/group digit string. 
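Parsed layouts with the wanted iomode are linked into the layout head in increasing offset order, as the insertion walk above shows. The equivalent walk on a plain singly linked list, independent of the queue(3) macros used in the kernel:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct ent {
	uint64_t off;
	struct ent *next;
};

static void
insert_sorted(struct ent **headp, struct ent *np)
{
	struct ent *cur = *headp, *prev = NULL;

	while (cur != NULL && cur->off < np->off) {
		prev = cur;
		cur = cur->next;
	}
	if (prev == NULL) {		/* new smallest offset: new head */
		np->next = *headp;
		*headp = np;
	} else {			/* splice in after "prev" */
		np->next = prev->next;
		prev->next = np;
	}
}

int
main(void)
{
	struct ent a = { 0, NULL }, b = { 8192, NULL }, c = { 4096, NULL };
	struct ent *head = NULL, *p;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	insert_sorted(&head, &c);
	for (p = head; p != NULL; p = p->next)
		printf("%ju ", (uintmax_t)p->off);
	printf("\n");
	return (0);
}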
*/ static int nfsrv_parseug(struct nfsrv_descript *nd, int dogrp, uid_t *uidp, gid_t *gidp, NFSPROC_T *p) { uint32_t *tl; char *cp, *str, str0[NFSV4_SMALLSTR + 1]; uint32_t len = 0; int error = 0; NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(uint32_t, *tl); str = NULL; if (len > NFSV4_OPAQUELIMIT) { error = NFSERR_BADXDR; goto nfsmout; } NFSCL_DEBUG(4, "nfsrv_parseug: len=%d\n", len); if (len == 0) { if (dogrp != 0) *gidp = GID_NOGROUP; else *uidp = UID_NOBODY; return (0); } if (len > NFSV4_SMALLSTR) str = malloc(len + 1, M_TEMP, M_WAITOK); else str = str0; NFSM_DISSECT(cp, char *, NFSM_RNDUP(len)); NFSBCOPY(cp, str, len); str[len] = '\0'; NFSCL_DEBUG(4, "nfsrv_parseug: str=%s\n", str); if (dogrp != 0) error = nfsv4_strtogid(nd, str, len, gidp); else error = nfsv4_strtouid(nd, str, len, uidp); nfsmout: if (len > NFSV4_SMALLSTR) free(str, M_TEMP); NFSCL_DEBUG(4, "eo nfsrv_parseug=%d\n", error); return (error); } /* * Similar to nfsrpc_getlayout(), except that it uses nfsrpc_openlayget(), * so that it does both an Open and a Layoutget. */ static int nfsrpc_getopenlayout(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, uint8_t *newfhp, int newfhlen, uint32_t mode, struct nfsclopen *op, uint8_t *name, int namelen, struct nfscldeleg **dpp, struct ucred *cred, NFSPROC_T *p) { struct nfscllayout *lyp; struct nfsclflayout *flp; struct nfsclflayouthead flh; int error, islocked, layoutlen, recalled, retonclose, usecurstateid; int layouttype, laystat; nfsv4stateid_t stateid; struct nfsclsession *tsep; error = 0; if (NFSHASFLEXFILE(nmp)) layouttype = NFSLAYOUT_FLEXFILE; else layouttype = NFSLAYOUT_NFSV4_1_FILES; /* * If lyp is returned non-NULL, there will be a refcnt (shared lock) * on it, iff flp != NULL or a lock (exclusive lock) on it iff * flp == NULL. */ lyp = nfscl_getlayout(nmp->nm_clp, newfhp, newfhlen, 0, &flp, &recalled); NFSCL_DEBUG(4, "nfsrpc_getopenlayout nfscl_getlayout lyp=%p\n", lyp); if (lyp == NULL) islocked = 0; else if (flp != NULL) islocked = 1; else islocked = 2; if ((lyp == NULL || flp == NULL) && recalled == 0) { LIST_INIT(&flh); tsep = nfsmnt_mdssession(nmp); layoutlen = tsep->nfsess_maxcache - (NFSX_STATEID + 3 * NFSX_UNSIGNED); if (lyp == NULL) usecurstateid = 1; else { usecurstateid = 0; stateid.seqid = lyp->nfsly_stateid.seqid; stateid.other[0] = lyp->nfsly_stateid.other[0]; stateid.other[1] = lyp->nfsly_stateid.other[1]; stateid.other[2] = lyp->nfsly_stateid.other[2]; } error = nfsrpc_openlayoutrpc(nmp, vp, nfhp, fhlen, newfhp, newfhlen, mode, op, name, namelen, dpp, &stateid, usecurstateid, layouttype, layoutlen, &retonclose, &flh, &laystat, cred, p); NFSCL_DEBUG(4, "aft nfsrpc_openlayoutrpc laystat=%d err=%d\n", laystat, error); laystat = nfsrpc_layoutgetres(nmp, vp, newfhp, newfhlen, &stateid, retonclose, NULL, &lyp, &flh, layouttype, laystat, &islocked, cred, p); } else error = nfsrpc_openrpc(nmp, vp, nfhp, fhlen, newfhp, newfhlen, mode, op, name, namelen, dpp, 0, 0, cred, p, 0, 0); if (islocked == 2) nfscl_rellayout(lyp, 1); else if (islocked == 1) nfscl_rellayout(lyp, 0); return (error); } /* * This function does an Open+LayoutGet for an NFSv4.1 mount with pNFS * enabled, only for the CLAIM_NULL case. All other NFSv4 Opens are * handled by nfsrpc_openrpc(). * For the case where op == NULL, dvp is the directory. When op != NULL, it * can be NULL. 
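nfsrv_parseug() above copies short owner strings into a stack buffer and longer ones into a temporary allocation, NUL-terminates the result, and rejects anything over the opaque limit. A userland version of that buffer handling, with SMALLSTR and OPAQUELIMIT as stand-in limits and the uid/gid mapping replaced by a printf():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SMALLSTR	64
#define OPAQUELIMIT	1024

static int
copy_owner_string(const char *wire, size_t len)
{
	char buf[SMALLSTR + 1];
	char *str = buf;

	if (len > OPAQUELIMIT)
		return (-1);			/* reject oversized strings */
	if (len > SMALLSTR) {
		str = malloc(len + 1);		/* rare long-string case */
		if (str == NULL)
			return (-1);
	}
	memcpy(str, wire, len);
	str[len] = '\0';
	printf("owner \"%s\"\n", str);		/* kernel maps this to an id */
	if (str != buf)
		free(str);
	return (0);
}

int
main(void)
{
	return (copy_owner_string("1001", 4));
}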
*/ static int nfsrpc_openlayoutrpc(struct nfsmount *nmp, vnode_t vp, u_int8_t *nfhp, int fhlen, uint8_t *newfhp, int newfhlen, uint32_t mode, struct nfsclopen *op, uint8_t *name, int namelen, struct nfscldeleg **dpp, nfsv4stateid_t *stateidp, int usecurstateid, int layouttype, int layoutlen, int *retonclosep, struct nfsclflayouthead *flhp, int *laystatp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfscldeleg *ndp = NULL; struct nfsvattr nfsva; struct nfsclsession *tsep; uint32_t rflags, deleg; nfsattrbit_t attrbits; int error, ret, acesize, limitby, iomode; *dpp = NULL; *laystatp = ENXIO; nfscl_reqstart(nd, NFSPROC_OPENLAYGET, nmp, nfhp, fhlen, NULL, NULL, 0, 0); NFSM_BUILD(tl, uint32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(op->nfso_own->nfsow_seqid); *tl++ = txdr_unsigned(mode & NFSV4OPEN_ACCESSBOTH); *tl++ = txdr_unsigned((mode >> NFSLCK_SHIFT) & NFSV4OPEN_DENYBOTH); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; nfsm_strtom(nd, op->nfso_own->nfsow_owner, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, uint32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_NOCREATE); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); nfsm_strtom(nd, name, namelen); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSZERO_ATTRBIT(&attrbits); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE); NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY); nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_LAYOUTGET); if ((mode & NFSV4OPEN_ACCESSWRITE) != 0) iomode = NFSLAYOUTIOMODE_RW; else iomode = NFSLAYOUTIOMODE_READ; nfsrv_setuplayoutget(nd, iomode, 0, UINT64_MAX, 0, stateidp, layouttype, layoutlen, usecurstateid); error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, vp, p, cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL); if (error != 0) return (error); NFSCL_INCRSEQID(op->nfso_own->nfsow_seqid, nd); if (nd->nd_repstat != 0) *laystatp = nd->nd_repstat; if ((nd->nd_flag & ND_NOMOREDATA) == 0) { /* ND_NOMOREDATA will be set if the Open operation failed. */ NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); op->nfso_stateid.seqid = *tl++; op->nfso_stateid.other[0] = *tl++; op->nfso_stateid.other[1] = *tl++; op->nfso_stateid.other[2] = *tl; rflags = fxdr_unsigned(u_int32_t, *(tl + 6)); error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL); if (error != 0) goto nfsmout; NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(u_int32_t, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { if (!(op->nfso_own->nfsow_clp->nfsc_flags & NFSCLFLAGS_FIRSTDELEG)) op->nfso_own->nfsow_clp->nfsc_flags |= (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG); ndp = malloc(sizeof(struct nfscldeleg) + newfhlen, M_NFSCLDELEG, M_WAITOK); LIST_INIT(&ndp->nfsdl_owner); LIST_INIT(&ndp->nfsdl_lock); ndp->nfsdl_clp = op->nfso_own->nfsow_clp; ndp->nfsdl_fhlen = newfhlen; NFSBCOPY(newfhp, ndp->nfsdl_fh, newfhlen); newnfs_copyincred(cred, &ndp->nfsdl_cred); nfscl_lockinit(&ndp->nfsdl_rwlock); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); ndp->nfsdl_stateid.seqid = *tl++; ndp->nfsdl_stateid.other[0] = *tl++; ndp->nfsdl_stateid.other[1] = *tl++; ndp->nfsdl_stateid.other[2] = *tl++; ret = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEWRITE) { ndp->nfsdl_flags = NFSCLDL_WRITE; /* * Indicates how much the file can grow. 
*/ NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); limitby = fxdr_unsigned(int, *tl++); switch (limitby) { case NFSV4OPEN_LIMITSIZE: ndp->nfsdl_sizelimit = fxdr_hyper(tl); break; case NFSV4OPEN_LIMITBLOCKS: ndp->nfsdl_sizelimit = fxdr_unsigned(u_int64_t, *tl++); ndp->nfsdl_sizelimit *= fxdr_unsigned(u_int64_t, *tl); break; default: error = NFSERR_BADXDR; goto nfsmout; }; } else ndp->nfsdl_flags = NFSCLDL_READ; if (ret != 0) ndp->nfsdl_flags |= NFSCLDL_RECALL; error = nfsrv_dissectace(nd, &ndp->nfsdl_ace, &ret, &acesize, p); if (error != 0) goto nfsmout; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } if ((rflags & NFSV4OPEN_LOCKTYPEPOSIX) != 0 || nfscl_assumeposixlocks) op->nfso_posixlock = 1; else op->nfso_posixlock = 0; NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED); /* If the 2nd element == NFS_OK, the Getattr succeeded. */ if (*++tl == 0) { error = nfsv4_loadattr(nd, NULL, &nfsva, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, p, cred); if (error != 0) goto nfsmout; if (ndp != NULL) { ndp->nfsdl_change = nfsva.na_filerev; ndp->nfsdl_modtime = nfsva.na_mtime; ndp->nfsdl_flags |= NFSCLDL_MODTIMESET; *dpp = ndp; ndp = NULL; } /* * At this point, the Open has succeeded, so set * nd_repstat = NFS_OK. If the Layoutget failed, * this function just won't return a layout. */ if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); *laystatp = fxdr_unsigned(int, *++tl); if (*laystatp == 0) { error = nfsrv_parselayoutget(nmp, nd, stateidp, retonclosep, flhp); if (error != 0) *laystatp = error; } } else nd->nd_repstat = 0; /* Return 0 for Open. */ } } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; nfsmout: free(ndp, M_NFSCLDELEG); m_freem(nd->nd_mrep); return (error); } /* * Similar nfsrpc_createv4(), but also does the LayoutGet operation. * Used only for mounts with pNFS enabled. */ static int nfsrpc_createlayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff, int *unlockedp, nfsv4stateid_t *stateidp, int usecurstateid, int layouttype, int layoutlen, int *retonclosep, struct nfsclflayouthead *flhp, int *laystatp) { uint32_t *tl; int error = 0, deleg, newone, ret, acesize, limitby; struct nfsrv_descript nfsd, *nd = &nfsd; struct nfsclopen *op; struct nfscldeleg *dp = NULL; struct nfsnode *np; struct nfsfh *nfhp; struct nfsclsession *tsep; nfsattrbit_t attrbits; nfsv4stateid_t stateid; struct nfsmount *nmp; nmp = VFSTONFS(dvp->v_mount); np = VTONFS(dvp); *laystatp = ENXIO; *unlockedp = 0; *nfhpp = NULL; *dpp = NULL; *attrflagp = 0; *dattrflagp = 0; if (namelen > NFS_MAXNAMLEN) return (ENAMETOOLONG); NFSCL_REQSTART(nd, NFSPROC_CREATELAYGET, dvp); /* * For V4, this is actually an Open op. */ NFSM_BUILD(tl, u_int32_t *, 5 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(owp->nfsow_seqid); *tl++ = txdr_unsigned(NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD); *tl++ = txdr_unsigned(NFSV4OPEN_DENYNONE); tsep = nfsmnt_mdssession(nmp); *tl++ = tsep->nfsess_clientid.lval[0]; *tl = tsep->nfsess_clientid.lval[1]; nfsm_strtom(nd, owp->nfsow_owner, NFSV4CL_LOCKNAMELEN); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OPEN_CREATE); if ((fmode & O_EXCL) != 0) { if (NFSHASSESSPERSIST(nmp)) { /* Use GUARDED for persistent sessions. 
*/ *tl = txdr_unsigned(NFSCREATE_GUARDED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } else { /* Otherwise, use EXCLUSIVE4_1. */ *tl = txdr_unsigned(NFSCREATE_EXCLUSIVE41); NFSM_BUILD(tl, u_int32_t *, NFSX_VERF); *tl++ = cverf.lval[0]; *tl = cverf.lval[1]; nfscl_fillsattr(nd, vap, dvp, 0, 0); } } else { *tl = txdr_unsigned(NFSCREATE_UNCHECKED); nfscl_fillsattr(nd, vap, dvp, 0, 0); } NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OPEN_CLAIMNULL); nfsm_strtom(nd, name, namelen); /* Get the new file's handle and attributes, plus save the FH. */ NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_SAVEFH); *tl++ = txdr_unsigned(NFSV4OP_GETFH); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); /* Get the directory's post-op attributes. */ NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); nfsm_fhtom(nd, np->n_fhp->nfh_fh, np->n_fhp->nfh_len, 0); NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED); *tl++ = txdr_unsigned(NFSV4OP_RESTOREFH); *tl = txdr_unsigned(NFSV4OP_LAYOUTGET); nfsrv_setuplayoutget(nd, NFSLAYOUTIOMODE_RW, 0, UINT64_MAX, 0, stateidp, layouttype, layoutlen, usecurstateid); error = nfscl_request(nd, dvp, p, cred, dstuff); if (error != 0) return (error); NFSCL_DEBUG(4, "nfsrpc_createlayout stat=%d err=%d\n", nd->nd_repstat, error); if (nd->nd_repstat != 0) *laystatp = nd->nd_repstat; NFSCL_INCRSEQID(owp->nfsow_seqid, nd); if ((nd->nd_flag & ND_NOMOREDATA) == 0) { NFSCL_DEBUG(4, "nfsrpc_createlayout open succeeded\n"); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + 6 * NFSX_UNSIGNED); stateid.seqid = *tl++; stateid.other[0] = *tl++; stateid.other[1] = *tl++; stateid.other[2] = *tl; nfsrv_getattrbits(nd, &attrbits, NULL, NULL); NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED); deleg = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEREAD || deleg == NFSV4OPEN_DELEGATEWRITE) { if (!(owp->nfsow_clp->nfsc_flags & NFSCLFLAGS_FIRSTDELEG)) owp->nfsow_clp->nfsc_flags |= (NFSCLFLAGS_FIRSTDELEG | NFSCLFLAGS_GOTDELEG); dp = malloc(sizeof(struct nfscldeleg) + NFSX_V4FHMAX, M_NFSCLDELEG, M_WAITOK); LIST_INIT(&dp->nfsdl_owner); LIST_INIT(&dp->nfsdl_lock); dp->nfsdl_clp = owp->nfsow_clp; newnfs_copyincred(cred, &dp->nfsdl_cred); nfscl_lockinit(&dp->nfsdl_rwlock); NFSM_DISSECT(tl, u_int32_t *, NFSX_STATEID + NFSX_UNSIGNED); dp->nfsdl_stateid.seqid = *tl++; dp->nfsdl_stateid.other[0] = *tl++; dp->nfsdl_stateid.other[1] = *tl++; dp->nfsdl_stateid.other[2] = *tl++; ret = fxdr_unsigned(int, *tl); if (deleg == NFSV4OPEN_DELEGATEWRITE) { dp->nfsdl_flags = NFSCLDL_WRITE; /* * Indicates how much the file can grow. */ NFSM_DISSECT(tl, u_int32_t *, 3 * NFSX_UNSIGNED); limitby = fxdr_unsigned(int, *tl++); switch (limitby) { case NFSV4OPEN_LIMITSIZE: dp->nfsdl_sizelimit = fxdr_hyper(tl); break; case NFSV4OPEN_LIMITBLOCKS: dp->nfsdl_sizelimit = fxdr_unsigned(u_int64_t, *tl++); dp->nfsdl_sizelimit *= fxdr_unsigned(u_int64_t, *tl); break; default: error = NFSERR_BADXDR; goto nfsmout; }; } else { dp->nfsdl_flags = NFSCLDL_READ; } if (ret != 0) dp->nfsdl_flags |= NFSCLDL_RECALL; error = nfsrv_dissectace(nd, &dp->nfsdl_ace, &ret, &acesize, p); if (error != 0) goto nfsmout; } else if (deleg != NFSV4OPEN_DELEGATENONE) { error = NFSERR_BADXDR; goto nfsmout; } /* Now, we should have the status for the SaveFH. 
*/ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl == 0) { NFSCL_DEBUG(4, "nfsrpc_createlayout SaveFH ok\n"); /* * Now, process the GetFH and Getattr for the newly * created file. nfscl_mtofh() will set * ND_NOMOREDATA if these weren't successful. */ error = nfscl_mtofh(nd, nfhpp, nnap, attrflagp); NFSCL_DEBUG(4, "aft nfscl_mtofh err=%d\n", error); if (error != 0) goto nfsmout; } else nd->nd_flag |= ND_NOMOREDATA; /* Now we have the PutFH and Getattr for the directory. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; else { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; } } if ((nd->nd_flag & ND_NOMOREDATA) == 0) { /* Load the directory attributes. */ error = nfsm_loadattr(nd, dnap); NFSCL_DEBUG(4, "aft nfsm_loadattr err=%d\n", error); if (error != 0) goto nfsmout; *dattrflagp = 1; if (dp != NULL && *attrflagp != 0) { dp->nfsdl_change = nnap->na_filerev; dp->nfsdl_modtime = nnap->na_mtime; dp->nfsdl_flags |= NFSCLDL_MODTIMESET; } /* * We can now complete the Open state. */ nfhp = *nfhpp; if (dp != NULL) { dp->nfsdl_fhlen = nfhp->nfh_len; NFSBCOPY(nfhp->nfh_fh, dp->nfsdl_fh, nfhp->nfh_len); } /* * Get an Open structure that will be * attached to the OpenOwner, acquired already. */ error = nfscl_open(dvp, nfhp->nfh_fh, nfhp->nfh_len, (NFSV4OPEN_ACCESSWRITE | NFSV4OPEN_ACCESSREAD), 0, cred, p, NULL, &op, &newone, NULL, 0); if (error != 0) goto nfsmout; op->nfso_stateid = stateid; newnfs_copyincred(cred, &op->nfso_cred); nfscl_openrelease(nmp, op, error, newone); *unlockedp = 1; /* Now, handle the RestoreFH and LayoutGet. */ if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 4 * NFSX_UNSIGNED); *laystatp = fxdr_unsigned(int, *(tl + 3)); if (*laystatp == 0) { error = nfsrv_parselayoutget(nmp, nd, stateidp, retonclosep, flhp); if (error != 0) *laystatp = error; } NFSCL_DEBUG(4, "aft nfsrv_parselayout err=%d\n", error); } else nd->nd_repstat = 0; } } if (nd->nd_repstat != 0 && error == 0) error = nd->nd_repstat; if (error == NFSERR_STALECLIENTID || error == NFSERR_BADSESSION) nfscl_initiate_recovery(owp->nfsow_clp); nfsmout: NFSCL_DEBUG(4, "eo nfsrpc_createlayout err=%d\n", error); if (error == 0) *dpp = dp; else free(dp, M_NFSCLDELEG); m_freem(nd->nd_mrep); return (error); } /* * Similar to nfsrpc_getopenlayout(), except that it used for the Create case. 
*/ static int nfsrpc_getcreatelayout(vnode_t dvp, char *name, int namelen, struct vattr *vap, nfsquad_t cverf, int fmode, struct nfsclowner *owp, struct nfscldeleg **dpp, struct ucred *cred, NFSPROC_T *p, struct nfsvattr *dnap, struct nfsvattr *nnap, struct nfsfh **nfhpp, int *attrflagp, int *dattrflagp, void *dstuff, int *unlockedp) { struct nfscllayout *lyp; struct nfsclflayouthead flh; struct nfsfh *nfhp; struct nfsclsession *tsep; struct nfsmount *nmp; nfsv4stateid_t stateid; int error, layoutlen, layouttype, retonclose, laystat; error = 0; nmp = VFSTONFS(dvp->v_mount); if (NFSHASFLEXFILE(nmp)) layouttype = NFSLAYOUT_FLEXFILE; else layouttype = NFSLAYOUT_NFSV4_1_FILES; LIST_INIT(&flh); tsep = nfsmnt_mdssession(nmp); layoutlen = tsep->nfsess_maxcache - (NFSX_STATEID + 3 * NFSX_UNSIGNED); error = nfsrpc_createlayout(dvp, name, namelen, vap, cverf, fmode, owp, dpp, cred, p, dnap, nnap, nfhpp, attrflagp, dattrflagp, dstuff, unlockedp, &stateid, 1, layouttype, layoutlen, &retonclose, &flh, &laystat); NFSCL_DEBUG(4, "aft nfsrpc_createlayoutrpc laystat=%d err=%d\n", laystat, error); lyp = NULL; if (laystat == 0) { nfhp = *nfhpp; laystat = nfsrpc_layoutgetres(nmp, dvp, nfhp->nfh_fh, nfhp->nfh_len, &stateid, retonclose, NULL, &lyp, &flh, layouttype, laystat, NULL, cred, p); } else laystat = nfsrpc_layoutgetres(nmp, dvp, NULL, 0, &stateid, retonclose, NULL, &lyp, &flh, layouttype, laystat, NULL, cred, p); if (laystat == 0) nfscl_rellayout(lyp, 0); return (error); } /* * Process the results of a layoutget() operation. */ static int nfsrpc_layoutgetres(struct nfsmount *nmp, vnode_t vp, uint8_t *newfhp, int newfhlen, nfsv4stateid_t *stateidp, int retonclose, uint32_t *notifybit, struct nfscllayout **lypp, struct nfsclflayouthead *flhp, int layouttype, int laystat, int *islockedp, struct ucred *cred, NFSPROC_T *p) { struct nfsclflayout *tflp; struct nfscldevinfo *dip; uint8_t *dev; int i, mirrorcnt; if (laystat == NFSERR_UNKNLAYOUTTYPE) { NFSLOCKMNT(nmp); if (!NFSHASFLEXFILE(nmp)) { /* Switch to using Flex File Layout. */ nmp->nm_state |= NFSSTA_FLEXFILE; } else if (layouttype == NFSLAYOUT_FLEXFILE) { /* Disable pNFS. */ NFSCL_DEBUG(1, "disable PNFS\n"); nmp->nm_state &= ~(NFSSTA_PNFS | NFSSTA_FLEXFILE); } NFSUNLOCKMNT(nmp); } if (laystat == 0) { NFSCL_DEBUG(4, "nfsrpc_layoutgetres at FOREACH\n"); LIST_FOREACH(tflp, flhp, nfsfl_list) { if (layouttype == NFSLAYOUT_FLEXFILE) mirrorcnt = tflp->nfsfl_mirrorcnt; else mirrorcnt = 1; for (i = 0; i < mirrorcnt; i++) { laystat = nfscl_adddevinfo(nmp, NULL, i, tflp); NFSCL_DEBUG(4, "aft adddev=%d\n", laystat); if (laystat != 0) { if (layouttype == NFSLAYOUT_FLEXFILE) dev = tflp->nfsfl_ffm[i].dev; else dev = tflp->nfsfl_dev; laystat = nfsrpc_getdeviceinfo(nmp, dev, layouttype, notifybit, &dip, cred, p); NFSCL_DEBUG(4, "aft nfsrpc_gdi=%d\n", laystat); if (laystat != 0) goto out; laystat = nfscl_adddevinfo(nmp, dip, i, tflp); if (laystat != 0) printf("nfsrpc_layoutgetresout" ": cannot add\n"); } } } } out: if (laystat == 0) { /* * nfscl_layout() always returns with the nfsly_lock * set to a refcnt (shared lock). * Passing in dvp is sufficient, since it is only used to * get the fsid for the file system. */ laystat = nfscl_layout(nmp, vp, newfhp, newfhlen, stateidp, layouttype, retonclose, flhp, lypp, cred, p); NFSCL_DEBUG(4, "nfsrpc_layoutgetres: aft nfscl_layout=%d\n", laystat); if (laystat == 0 && islockedp != NULL) *islockedp = 1; } return (laystat); } /* * nfs copy_file_range operation. 
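nfsrpc_layoutgetres() above steps a mount down when the server returns NFSERR_UNKNLAYOUTTYPE: first from File layouts to Flex File layouts, and if Flex File is also rejected, pNFS is disabled entirely. A small model of that downgrade with stand-in mount flags and layout-type constants:

#include <stdio.h>

#define MNT_PNFS	0x1	/* pNFS enabled on the mount */
#define MNT_FLEXFILE	0x2	/* use Flex File rather than File layouts */

#define LAYOUT_FILES	1
#define LAYOUT_FLEXFILE	4

static void
handle_unknown_layouttype(unsigned *mntflags, int layouttype)
{
	if ((*mntflags & MNT_FLEXFILE) == 0)
		*mntflags |= MNT_FLEXFILE;	/* try Flex File next */
	else if (layouttype == LAYOUT_FLEXFILE)
		*mntflags &= ~(MNT_PNFS | MNT_FLEXFILE); /* give up on pNFS */
}

int
main(void)
{
	unsigned flags = MNT_PNFS;

	handle_unknown_layouttype(&flags, LAYOUT_FILES);
	printf("flags=0x%x\n", flags);		/* now trying Flex File */
	handle_unknown_layouttype(&flags, LAYOUT_FLEXFILE);
	printf("flags=0x%x\n", flags);		/* pNFS disabled */
	return (0);
}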
*/ int nfsrpc_copy_file_range(vnode_t invp, off_t *inoffp, vnode_t outvp, off_t *outoffp, size_t *lenp, unsigned int flags, int *inattrflagp, struct nfsvattr *innap, int *outattrflagp, struct nfsvattr *outnap, struct ucred *cred, bool consecutive, bool *must_commitp) { int commit, error, expireret = 0, retrycnt; u_int32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(invp->v_mount); struct nfsfh *innfhp = NULL, *outnfhp = NULL; nfsv4stateid_t instateid, outstateid; void *inlckp, *outlckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; innfhp = VTONFS(invp)->n_fhp; outnfhp = VTONFS(outvp)->n_fhp; retrycnt = 0; do { /* Get both stateids. */ inlckp = NULL; nfscl_getstateid(invp, innfhp->nfh_fh, innfhp->nfh_len, NFSV4OPEN_ACCESSREAD, 0, NULL, curthread, &instateid, &inlckp); outlckp = NULL; nfscl_getstateid(outvp, outnfhp->nfh_fh, outnfhp->nfh_len, NFSV4OPEN_ACCESSWRITE, 0, NULL, curthread, &outstateid, &outlckp); error = nfsrpc_copyrpc(invp, *inoffp, outvp, *outoffp, lenp, &instateid, &outstateid, innap, inattrflagp, outnap, outattrflagp, consecutive, &commit, cred, curthread); if (error == 0) { if (commit != NFSWRITE_FILESYNC) *must_commitp = true; *inoffp += *lenp; *outoffp += *lenp; } else if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (inlckp != NULL) nfscl_lockderef(inlckp); if (outlckp != NULL) nfscl_lockderef(outlckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_cfr"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, curthread); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_DELAY || error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4)); if (error != 0 && (retrycnt >= 4 || error == NFSERR_STALESTATEID || error == NFSERR_BADSESSION || error == NFSERR_STALEDONTRECOVER)) error = EIO; return (error); } /* * The copy RPC. 
*/ static int nfsrpc_copyrpc(vnode_t invp, off_t inoff, vnode_t outvp, off_t outoff, size_t *lenp, nfsv4stateid_t *instateidp, nfsv4stateid_t *outstateidp, struct nfsvattr *innap, int *inattrflagp, struct nfsvattr *outnap, int *outattrflagp, bool consecutive, int *commitp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; struct nfsmount *nmp; nfsattrbit_t attrbits; uint64_t len; nmp = VFSTONFS(outvp->v_mount); *inattrflagp = *outattrflagp = 0; *commitp = NFSWRITE_UNSTABLE; len = *lenp; *lenp = 0; if (len > nfs_maxcopyrange) len = nfs_maxcopyrange; NFSCL_REQSTART(nd, NFSPROC_COPY, invp); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_PUTFH); nfsm_fhtom(nd, VTONFS(outvp)->n_fhp->nfh_fh, VTONFS(outvp)->n_fhp->nfh_len, 0); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_COPY); nfsm_stateidtom(nd, instateidp, NFSSTATEID_PUTSTATEID); nfsm_stateidtom(nd, outstateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, 3 * NFSX_HYPER + 4 * NFSX_UNSIGNED); txdr_hyper(inoff, tl); tl += 2; txdr_hyper(outoff, tl); tl += 2; txdr_hyper(len, tl); tl += 2; if (consecutive) *tl++ = newnfs_true; else *tl++ = newnfs_false; *tl++ = newnfs_true; *tl++ = 0; *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSWRITEGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, invp, p, cred, NULL); if (error != 0) return (error); if ((nd->nd_flag & ND_NOMOREDATA) == 0) { /* Get the input file's attributes. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*(tl + 1) == 0) { error = nfsm_loadattr(nd, innap); if (error != 0) goto nfsmout; *inattrflagp = 1; } else nd->nd_flag |= ND_NOMOREDATA; } /* Skip over return stat for PutFH. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (*++tl != 0) nd->nd_flag |= ND_NOMOREDATA; } /* Skip over return stat for Copy. */ if ((nd->nd_flag & ND_NOMOREDATA) == 0) NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); if (*tl != 0) { /* There should be no callback ids. */ error = NFSERR_BADXDR; goto nfsmout; } NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + 3 * NFSX_UNSIGNED + NFSX_VERF); len = fxdr_hyper(tl); tl += 2; *commitp = fxdr_unsigned(int, *tl++); NFSLOCKMNT(nmp); if (!NFSHASWRITEVERF(nmp)) { NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); NFSSETWRITEVERF(nmp); } else if (NFSBCMP(tl, nmp->nm_verf, NFSX_VERF)) { NFSBCOPY(tl, nmp->nm_verf, NFSX_VERF); nd->nd_repstat = NFSERR_STALEWRITEVERF; } NFSUNLOCKMNT(nmp); tl += (NFSX_VERF / NFSX_UNSIGNED); if (nd->nd_repstat == 0 && *++tl != newnfs_true) /* Must be a synchronous copy. */ nd->nd_repstat = NFSERR_NOTSUPP; NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, outnap); if (error == 0) *outattrflagp = NFS_LATTR_NOSHRINK; if (nd->nd_repstat == 0) *lenp = len; } else if (nd->nd_repstat == NFSERR_OFFLOADNOREQS) { /* * For the case where consecutive is not supported, but * synchronous is supported, we can try consecutive == false * by returning this error. Otherwise, return NFSERR_NOTSUPP, * since Copy cannot be done. 
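The Copy RPC above clamps each request to a per-RPC maximum, the server may copy even less, and the caller in nfsrpc_copy_file_range() advances both offsets by the amount actually copied before looping. A sketch of that length handling, with maxcopyrange standing in for the nfs_maxcopyrange limit:

#include <stdint.h>
#include <stdio.h>

static uint64_t
copy_chunk(uint64_t *inoff, uint64_t *outoff, uint64_t len,
    uint64_t maxcopyrange, uint64_t server_copied)
{
	if (len > maxcopyrange)
		len = maxcopyrange;		/* one RPC worth of data */
	if (server_copied < len)
		len = server_copied;		/* server may copy less */
	*inoff += len;				/* advance for the next pass */
	*outoff += len;
	return (len);
}

int
main(void)
{
	uint64_t inoff = 0, outoff = 0;
	uint64_t done = copy_chunk(&inoff, &outoff, 10 * 1024 * 1024,
	    2 * 1024 * 1024, 2 * 1024 * 1024);

	printf("copied=%ju inoff=%ju outoff=%ju\n", (uintmax_t)done,
	    (uintmax_t)inoff, (uintmax_t)outoff);
	return (0);
}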
*/ if ((nd->nd_flag & ND_NOMOREDATA) == 0) { NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); if (!consecutive || *++tl == newnfs_false) nd->nd_repstat = NFSERR_NOTSUPP; } else nd->nd_repstat = NFSERR_BADXDR; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Seek operation. */ int nfsrpc_seek(vnode_t vp, off_t *offp, bool *eofp, int content, struct ucred *cred, struct nfsvattr *nap, int *attrflagp) { int error, expireret = 0, retrycnt; u_int32_t clidrev = 0; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsnode *np = VTONFS(vp); struct nfsfh *nfhp = NULL; nfsv4stateid_t stateid; void *lckp; if (nmp->nm_clp != NULL) clidrev = nmp->nm_clp->nfsc_clientidrev; nfhp = np->n_fhp; retrycnt = 0; do { lckp = NULL; nfscl_getstateid(vp, nfhp->nfh_fh, nfhp->nfh_len, NFSV4OPEN_ACCESSREAD, 0, cred, curthread, &stateid, &lckp); error = nfsrpc_seekrpc(vp, offp, &stateid, eofp, content, nap, attrflagp, cred); if (error == NFSERR_STALESTATEID) nfscl_initiate_recovery(nmp->nm_clp); if (lckp != NULL) nfscl_lockderef(lckp); if (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_OLDSTATEID || error == NFSERR_BADSESSION) { (void) nfs_catnap(PZERO, error, "nfs_seek"); } else if ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && clidrev != 0) { expireret = nfscl_hasexpired(nmp->nm_clp, clidrev, curthread); } retrycnt++; } while (error == NFSERR_GRACE || error == NFSERR_STALESTATEID || error == NFSERR_STALEDONTRECOVER || error == NFSERR_DELAY || error == NFSERR_BADSESSION || (error == NFSERR_OLDSTATEID && retrycnt < 20) || ((error == NFSERR_EXPIRED || error == NFSERR_BADSTATEID) && expireret == 0 && clidrev != 0 && retrycnt < 4) || (error == NFSERR_OPENMODE && retrycnt < 4)); if (error && retrycnt >= 4) error = EIO; return (error); } /* * The seek RPC. */ static int nfsrpc_seekrpc(vnode_t vp, off_t *offp, nfsv4stateid_t *stateidp, bool *eofp, int content, struct nfsvattr *nap, int *attrflagp, struct ucred *cred) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_SEEK, vp); nfsm_stateidtom(nd, stateidp, NFSSTATEID_PUTSTATEID); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); txdr_hyper(*offp, tl); tl += 2; *tl++ = txdr_unsigned(content); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, curthread, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED + NFSX_HYPER); if (*tl++ == newnfs_true) *eofp = true; else *eofp = false; *offp = fxdr_hyper(tl); /* Just skip over Getattr op status. */ error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The getextattr RPC. 
*/ int nfsrpc_getextattr(vnode_t vp, const char *name, struct uio *uiop, ssize_t *lenp, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; uint32_t len, len2; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_GETEXTATTR, vp); nfsm_strtom(nd, name, strlen(name)); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(uint32_t, *tl); /* Sanity check lengths. */ if (uiop != NULL && len > 0 && len <= IOSIZE_MAX && uiop->uio_resid <= UINT32_MAX) { len2 = uiop->uio_resid; if (len2 >= len) error = nfsm_mbufuio(nd, uiop, len); else { error = nfsm_mbufuio(nd, uiop, len2); if (error == 0) { /* * nfsm_mbufuio() advances to a multiple * of 4, so round up len2 as well. Then * we need to advance over the rest of * the data, rounding up the remaining * length. */ len2 = NFSM_RNDUP(len2); len2 = NFSM_RNDUP(len - len2); if (len2 > 0) error = nfsm_advance(nd, len2, -1); } } } else if (uiop == NULL && len > 0) { /* Just wants the length and not the data. */ error = nfsm_advance(nd, NFSM_RNDUP(len), -1); } else if (len > 0) error = ENOATTR; if (error != 0) goto nfsmout; *lenp = len; /* Just skip over Getattr op status. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The setextattr RPC. */ int nfsrpc_setextattr(vnode_t vp, const char *name, struct uio *uiop, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_SETEXTATTR, vp); if (uiop->uio_resid > nd->nd_maxreq) { /* nd_maxreq is set by NFSCL_REQSTART(). */ m_freem(nd->nd_mreq); return (EINVAL); } NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4SXATTR_EITHER); nfsm_strtom(nd, name, strlen(name)); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(uiop->uio_resid); nfsm_uiombuf(nd, uiop, uiop->uio_resid); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { /* Just skip over the reply and Getattr op status. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + 3 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The removeextattr RPC. 
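 *
 * (A note on the XDR padding handled in nfsrpc_getextattr() above: the
 * opaque attribute value is padded on the wire to a multiple of 4
 * bytes, and nfsm_mbufuio() always leaves the reply position on such a
 * boundary.  As a worked example, with len = 10 and a caller buffer of
 * len2 = 6, nfsm_mbufuio() consumes NFSM_RNDUP(6) = 8 bytes and the
 * remaining NFSM_RNDUP(10 - 8) = 4 bytes are skipped with
 * nfsm_advance(), so exactly NFSM_RNDUP(10) = 12 reply bytes are
 * consumed.  The numbers are only an illustration.)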
*/ int nfsrpc_rmextattr(vnode_t vp, const char *name, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int error; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_RMEXTATTR, vp); nfsm_strtom(nd, name, strlen(name)); NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); if (nd->nd_repstat == 0) { /* Just skip over the reply and Getattr op status. */ NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_HYPER + 3 * NFSX_UNSIGNED); error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * The listextattr RPC. */ int nfsrpc_listextattr(vnode_t vp, uint64_t *cookiep, struct uio *uiop, size_t *lenp, bool *eofp, struct nfsvattr *nap, int *attrflagp, struct ucred *cred, NFSPROC_T *p) { uint32_t *tl; int cnt, error, i, len; struct nfsrv_descript nfsd; struct nfsrv_descript *nd = &nfsd; nfsattrbit_t attrbits; u_char c; *attrflagp = 0; NFSCL_REQSTART(nd, NFSPROC_LISTEXTATTR, vp); NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED); txdr_hyper(*cookiep, tl); tl += 2; *tl++ = txdr_unsigned(*lenp); *tl = txdr_unsigned(NFSV4OP_GETATTR); NFSGETATTR_ATTRBIT(&attrbits); nfsrv_putattrbit(nd, &attrbits); error = nfscl_request(nd, vp, p, cred, NULL); if (error != 0) return (error); *eofp = true; *lenp = 0; if (nd->nd_repstat == 0) { NFSM_DISSECT(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED); *cookiep = fxdr_hyper(tl); tl += 2; cnt = fxdr_unsigned(int, *tl); if (cnt < 0) { error = EBADRPC; goto nfsmout; } for (i = 0; i < cnt; i++) { NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED); len = fxdr_unsigned(int, *tl); if (len <= 0 || len > EXTATTR_MAXNAMELEN) { error = EBADRPC; goto nfsmout; } if (uiop == NULL) error = nfsm_advance(nd, NFSM_RNDUP(len), -1); else if (uiop->uio_resid >= len + 1) { c = len; error = uiomove(&c, sizeof(c), uiop); if (error == 0) error = nfsm_mbufuio(nd, uiop, len); } else { error = nfsm_advance(nd, NFSM_RNDUP(len), -1); *eofp = false; } if (error != 0) goto nfsmout; *lenp += (len + 1); } /* Get the eof and skip over the Getattr op status. */ NFSM_DISSECT(tl, uint32_t *, 3 * NFSX_UNSIGNED); /* * *eofp is set false above, because it wasn't able to copy * all of the reply. */ if (*eofp && *tl == 0) *eofp = false; error = nfsm_loadattr(nd, nap); if (error == 0) *attrflagp = 1; } if (error == 0) error = nd->nd_repstat; nfsmout: m_freem(nd->nd_mrep); return (error); } /* * Split an mbuf list. For non-M_EXTPG mbufs, just use m_split(). */ static struct mbuf * nfsm_split(struct mbuf *mp, uint64_t xfer) { struct mbuf *m, *m2; vm_page_t pg; int i, j, left, pgno, plen, trim; char *cp, *cp2; if ((mp->m_flags & M_EXTPG) == 0) { m = m_split(mp, xfer, M_WAITOK); return (m); } /* Find the correct mbuf to split at. */ for (m = mp; m != NULL && xfer > m->m_len; m = m->m_next) xfer -= m->m_len; if (m == NULL) return (NULL); /* If xfer == m->m_len, we can just split the mbuf list. */ if (xfer == m->m_len) { m2 = m->m_next; m->m_next = NULL; return (m2); } /* Find the page to split at. 
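 * The loop below sizes each page the same way m_epg_pagelen() does:
 * page 0 contributes PAGE_SIZE - m_epg_1st_off bytes, interior pages
 * contribute PAGE_SIZE, and the final page contributes m_epg_last_len.
 * As a hypothetical example with a 4096-byte PAGE_SIZE,
 * m_epg_1st_off = 512, xfer = 5000 at this point, and at least three
 * pages in the mbuf: page 0 holds 3584 bytes, so the loop stops at
 * pgno = 1 with left = 1416, i.e. the split lands 1416 bytes into the
 * second page.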
*/ pgno = 0; left = xfer; do { if (pgno == 0) plen = m_epg_pagelen(m, 0, m->m_epg_1st_off); else plen = m_epg_pagelen(m, pgno, 0); if (left <= plen) break; left -= plen; pgno++; } while (pgno < m->m_epg_npgs); if (pgno == m->m_epg_npgs) panic("nfsm_split: erroneous ext_pgs mbuf"); m2 = mb_alloc_ext_pgs(M_WAITOK, mb_free_mext_pgs); m2->m_epg_flags |= EPG_FLAG_ANON; /* * If left < plen, allocate a new page for the new mbuf * and copy the data after left in the page to this new * page. */ if (left < plen) { do { pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_NODUMP | VM_ALLOC_WIRED); if (pg == NULL) vm_wait(NULL); } while (pg == NULL); m2->m_epg_pa[0] = VM_PAGE_TO_PHYS(pg); m2->m_epg_npgs = 1; /* Copy the data after left to the new page. */ trim = plen - left; cp = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[pgno]); if (pgno == 0) cp += m->m_epg_1st_off; cp += left; cp2 = (char *)(void *)PHYS_TO_DMAP(m2->m_epg_pa[0]); if (pgno == m->m_epg_npgs - 1) m2->m_epg_last_len = trim; else { cp2 += PAGE_SIZE - trim; m2->m_epg_1st_off = PAGE_SIZE - trim; m2->m_epg_last_len = m->m_epg_last_len; } memcpy(cp2, cp, trim); m2->m_len = trim; } else { m2->m_len = 0; m2->m_epg_last_len = m->m_epg_last_len; } /* Move the pages beyond pgno to the new mbuf. */ for (i = pgno + 1, j = m2->m_epg_npgs; i < m->m_epg_npgs; i++, j++) { m2->m_epg_pa[j] = m->m_epg_pa[i]; /* Never moves page 0. */ m2->m_len += m_epg_pagelen(m, i, 0); } m2->m_epg_npgs = j; m->m_epg_npgs = pgno + 1; m->m_epg_last_len = left; m->m_len = xfer; m2->m_next = m->m_next; m->m_next = NULL; return (m2); } diff --git a/sys/mips/ingenic/jz4780_regs.h b/sys/mips/ingenic/jz4780_regs.h index 221c21784925..c46045ae0e18 100644 --- a/sys/mips/ingenic/jz4780_regs.h +++ b/sys/mips/ingenic/jz4780_regs.h @@ -1,787 +1,787 @@ /* $NetBSD: ingenic_regs.h,v 1.22 2015/10/08 17:54:30 macallan Exp $ */ /*- * Copyright (c) 2014 Michael Lorenz * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE.
* * $FreeBSD$ */ #ifndef JZ4780_REGS_H #define JZ4780_REGS_H /* for mips_wbflush() */ #include /* UARTs, mostly 16550 compatible with 32bit spaced registers */ #define JZ_UART0 0x10030000 #define JZ_UART1 0x10031000 #define JZ_UART2 0x10032000 #define JZ_UART3 0x10033000 #define JZ_UART4 0x10034000 /* LCD controller base addresses, registers are in jzfb_regs.h */ #define JZ_LCDC0_BASE 0x13050000 #define JZ_LCDC1_BASE 0x130a0000 /* TCU unit base address */ #define JZ_TCU_BASE 0x10002000 /* Watchdog */ #define JZ_WDOG_TDR 0x00000000 /* compare */ #define JZ_WDOG_TCER 0x00000004 #define TCER_ENABLE 0x01 /* enable counter */ #define JZ_WDOG_TCNT 0x00000008 /* 16bit up count */ #define JZ_WDOG_TCSR 0x0000000c #define TCSR_PCK_EN 0x01 /* PCLK */ #define TCSR_RTC_EN 0x02 /* RTCCLK - 32.768kHz */ #define TCSR_EXT_EN 0x04 /* EXTCLK - 48MHz */ #define TCSR_PRESCALE_M 0x38 #define TCSR_DIV_1 0x00 #define TCSR_DIV_4 0x08 #define TCSR_DIV_16 0x10 #define TCSR_DIV_64 0x18 #define TCSR_DIV_256 0x20 #define TCSR_DIV_1024 0x28 /* timers and PWMs */ #define JZ_TC_TER 0x00000010 /* TC enable reg, ro */ #define JZ_TC_TESR 0x00000014 /* TC enable set reg. */ #define TESR_TCST0 0x0001 /* enable counter 0 */ #define TESR_TCST1 0x0002 /* enable counter 1 */ #define TESR_TCST2 0x0004 /* enable counter 2 */ #define TESR_TCST3 0x0008 /* enable counter 3 */ #define TESR_TCST4 0x0010 /* enable counter 4 */ #define TESR_TCST5 0x0020 /* enable counter 5 */ #define TESR_TCST6 0x0040 /* enable counter 6 */ #define TESR_TCST7 0x0080 /* enable counter 7 */ #define TESR_OST 0x8000 /* enable OST */ #define JZ_TC_TECR 0x00000018 /* TC enable clear reg. */ #define JZ_TC_TFR 0x00000020 #define TFR_FFLAG0 0x00000001 /* channel 0 */ #define TFR_FFLAG1 0x00000002 /* channel 1 */ #define TFR_FFLAG2 0x00000004 /* channel 2 */ #define TFR_FFLAG3 0x00000008 /* channel 3 */ #define TFR_FFLAG4 0x00000010 /* channel 4 */ #define TFR_FFLAG5 0x00000020 /* channel 5 */ #define TFR_FFLAG6 0x00000040 /* channel 6 */ #define TFR_FFLAG7 0x00000080 /* channel 7 */ #define TFR_OSTFLAG 0x00008000 /* OS timer */ #define JZ_TC_TFSR 0x00000024 /* timer flag set */ #define JZ_TC_TFCR 0x00000028 /* timer flag clear */ #define JZ_TC_TMR 0x00000030 /* timer flag mask */ #define TMR_FMASK(n) (1 << (n)) #define TMR_HMASK(n) (1 << ((n) + 16)) #define JZ_TC_TMSR 0x00000034 /* timer flag mask set */ #define JZ_TC_TMCR 0x00000038 /* timer flag mask clear*/ #define JZ_TC_TDFR(n) (0x00000040 + (n * 0x10)) /* FULL compare */ #define JZ_TC_TDHR(n) (0x00000044 + (n * 0x10)) /* HALF compare */ #define JZ_TC_TCNT(n) (0x00000048 + (n * 0x10)) /* count */ #define JZ_TC_TCSR(n) (0x0000004c + (n * 0x10)) /* same bits as in JZ_WDOG_TCSR */ /* operating system timer */ #define JZ_OST_DATA 0x000000e0 /* compare */ #define JZ_OST_CNT_LO 0x000000e4 #define JZ_OST_CNT_HI 0x000000e8 #define JZ_OST_CTRL 0x000000ec #define OSTC_PCK_EN 0x0001 /* use PCLK */ #define OSTC_RTC_EN 0x0002 /* use RTCCLK */ #define OSTC_EXT_EN 0x0004 /* use EXTCLK */ #define OSTC_PRESCALE_M 0x0038 #define OSTC_DIV_1 0x0000 #define OSTC_DIV_4 0x0008 #define OSTC_DIV_16 0x0010 #define OSTC_DIV_64 0x0018 #define OSTC_DIV_256 0x0020 #define OSTC_DIV_1024 0x0028 #define OSTC_SHUTDOWN 0x0200 #define OSTC_MODE 0x8000 /* 0 - reset to 0 when = OST_DATA */ #define JZ_OST_CNT_U32 0x000000fc /* copy of CNT_HI when reading CNT_LO */ static inline void writereg(uint32_t reg, uint32_t val) { *(volatile int32_t *)MIPS_PHYS_TO_KSEG1(reg) = val; mips_wbflush(); } static inline uint32_t readreg(uint32_t reg) { 
mips_wbflush(); return *(volatile int32_t *)MIPS_PHYS_TO_KSEG1(reg); } /* Clock management */ #define JZ_CGU_BASE 0x10000000 #define JZ_CPCCR 0x00000000 /* Clock Control Register */ #define JZ_PDIV_M 0x000f0000 /* PCLK divider mask */ #define JZ_PDIV_S 16 /* PCLK divider shift */ #define JZ_CDIV_M 0x0000000f /* CPU clock divider mask */ #define JZ_CDIV_S 0 /* CPU clock divider shift */ #define JZ_CPAPCR 0x00000010 /* APLL */ #define JZ_CPMPCR 0x00000014 /* MPLL */ #define JZ_CPEPCR 0x00000018 /* EPLL */ #define JZ_CPVPCR 0x0000001C /* VPLL */ #define JZ_PLLM_S 19 /* PLL multiplier shift */ #define JZ_PLLM_M 0xfff80000 /* PLL multiplier mask */ #define JZ_PLLN_S 13 /* PLL divider shift */ #define JZ_PLLN_M 0x0007e000 /* PLL divider mask */ #define JZ_PLLP_S 9 /* PLL postdivider shift */ #define JZ_PLLP_M 0x00001700 /* PLL postdivider mask */ #define JZ_PLLON 0x00000010 /* PLL is on and stable */ #define JZ_PLLBP 0x00000002 /* PLL bypass */ #define JZ_PLLEN 0x00000001 /* PLL enable */ #define JZ_CLKGR0 0x00000020 /* Clock Gating Registers */ #define CLK_NEMC (1 << 0) #define CLK_BCH (1 << 1) #define CLK_OTG0 (1 << 2) #define CLK_MSC0 (1 << 3) #define CLK_SSI0 (1 << 4) #define CLK_SMB0 (1 << 5) #define CLK_SMB1 (1 << 6) #define CLK_SCC (1 << 7) #define CLK_AIC (1 << 8) #define CLK_TSSI0 (1 << 9) #define CLK_OWI (1 << 10) #define CLK_MSC1 (1 << 11) #define CLK_MSC2 (1 << 12) #define CLK_KBC (1 << 13) #define CLK_SADC (1 << 14) #define CLK_UART0 (1 << 15) #define CLK_UART1 (1 << 16) #define CLK_UART2 (1 << 17) #define CLK_UART3 (1 << 18) #define CLK_SSI1 (1 << 19) #define CLK_SSI2 (1 << 20) #define CLK_PDMA (1 << 21) #define CLK_GPS (1 << 22) #define CLK_MAC (1 << 23) #define CLK_UHC (1 << 24) #define CLK_SMB2 (1 << 25) #define CLK_CIM (1 << 26) #define CLK_TVE (1 << 27) #define CLK_LCD (1 << 28) #define CLK_IPU (1 << 29) #define CLK_DDR0 (1 << 30) #define CLK_DDR1 (1 << 31) #define JZ_CLKGR1 0x00000028 /* Clock Gating Registers */ #define CLK_SMB3 (1 << 0) #define CLK_TSSI1 (1 << 1) #define CLK_VPU (1 << 2) #define CLK_PCM (1 << 3) #define CLK_GPU (1 << 4) #define CLK_COMPRESS (1 << 5) #define CLK_AIC1 (1 << 6) #define CLK_GPVLC (1 << 7) #define CLK_OTG1 (1 << 8) #define CLK_HDMI (1 << 9) #define CLK_UART4 (1 << 10) #define CLK_AHB_MON (1 << 11) #define CLK_SMB4 (1 << 12) #define CLK_DES (1 << 13) #define CLK_X2D (1 << 14) #define CLK_P1 (1 << 15) #define JZ_DDCDR 0x0000002c /* DDR clock divider register */ #define JZ_VPUCDR 0x00000030 /* VPU clock divider register */ #define JZ_I2SCDR 0x00000060 /* I2S device clock divider register */ #define JZ_I2S1CDR 0x000000a0 /* I2S device clock divider register */ #define JZ_USBCDR 0x00000050 /* OTG PHY clock divider register */ #define JZ_LP0CDR 0x00000054 /* LCD0 pix clock divider register */ #define JZ_LP1CDR 0x00000064 /* LCD1 pix clock divider register */ #define JZ_MSC0CDR 0x00000068 /* MSC0 clock divider register */ #define JZ_MSC1CDR 0x000000a4 /* MSC1 clock divider register */ #define JZ_MSC2CDR 0x000000a8 /* MSC2 clock divider register */ #define MSCCDR_SCLK_A 0x40000000 #define MSCCDR_MPLL 0x80000000 #define MSCCDR_CE 0x20000000 #define MSCCDR_BUSY 0x10000000 #define MSCCDR_STOP 0x08000000 #define MSCCDR_PHASE 0x00008000 /* 0 - 90deg phase, 1 - 180 */ #define MSCCDR_DIV_M 0x000000ff /* src / ((div + 1) * 2) */ #define UHCCDR_DIV_M 0x000000ff #define JZ_UHCCDR 0x0000006c /* UHC 48M clock divider register */ #define UHCCDR_SCLK_A 0x00000000 #define UHCCDR_MPLL 0x40000000 #define UHCCDR_EPLL 0x80000000 #define UHCCDR_OTG_PHY 0xc0000000 #define 
UHCCDR_CLK_MASK 0xc0000000 #define UHCCDR_CE 0x20000000 #define UHCCDR_BUSY 0x10000000 #define UHCCDR_STOP 0x08000000 #define UHCCDR_DIV_M 0x000000ff #define UHCCDR_DIV(d) (d) #define JZ_SSICDR 0x00000074 /* SSI clock divider register */ #define JZ_CIMCDR 0x0000007c /* CIM MCLK clock divider register */ #define JZ_PCMCDR 0x00000084 /* PCM device clock divider register */ #define JZ_GPUCDR 0x00000088 /* GPU clock divider register */ #define JZ_HDMICDR 0x0000008c /* HDMI clock divider register */ #define JZ_BCHCDR 0x000000ac /* BCH clock divider register */ #define JZ_CPM_INTR 0x000000b0 /* CPM interrupt register */ #define JZ_CPM_INTRE 0x000000b4 /* CPM interrupt enable register */ #define JZ_CPSPR 0x00000034 /* CPM scratch register */ #define JZ_CPSRPR 0x00000038 /* CPM scratch protected register */ #define JZ_USBPCR 0x0000003c /* USB parameter control register */ #define PCR_USB_MODE 0x80000000 /* 1 - otg */ #define PCR_AVLD_REG 0x40000000 #define PCR_IDPULLUP_MASK 0x30000000 #define PCR_INCR_MASK 0x08000000 #define PCR_TCRISETUNE 0x04000000 #define PCR_COMMONONN 0x02000000 #define PCR_VBUSVLDEXT 0x01000000 #define PCR_VBUSVLDEXTSEL 0x00800000 #define PCR_POR 0x00400000 #define PCR_SIDDQ 0x00200000 #define PCR_OTG_DISABLE 0x00100000 #define PCR_COMPDISTN_M 0x000e0000 #define PCR_OTGTUNE 0x0001c000 #define PCR_SQRXTUNE 0x00003800 #define PCR_TXFSLSTUNE 0x00000780 #define PCR_TXPREEMPHTUNE 0x00000040 #define PCR_TXHSXVTUNE 0x00000030 #define PCR_TXVREFTUNE 0x0000000f #define JZ_USBRDT 0x00000040 /* Reset detect timer register */ #define USBRDT_USBRDT_SHIFT 0 #define USBRDT_USBRDT_WIDTH 23 #define USBRDT_VBFIL_LD_EN 0x01000000 #define JZ_USBVBFIL 0x00000044 /* USB jitter filter register */ #define USBVBFIL_IDDIGFIL_SHIFT 16 #define USBVBFIL_IDDIGFIL_WIDTH 16 #define USBVBFIL_USBVBFIL_SHIFT 0 #define USBVBFIL_USBVBFIL_WIDTH 16 #define JZ_USBPCR1 0x00000048 /* USB parameter control register 1 */ #define PCR_SYNOPSYS 0x10000000 /* Mentor mode otherwise */ #define PCR_REFCLK_CORE 0x08000000 #define PCR_REFCLK_XO25 0x04000000 #define PCR_REFCLK_CO 0x00000000 #define PCR_REFCLK_M 0x0c000000 #define PCR_CLK_M 0x03000000 /* clock */ #define PCR_CLK_192 0x03000000 /* 19.2MHz */ #define PCR_CLK_48 0x02000000 /* 48MHz */ #define PCR_CLK_24 0x01000000 /* 24MHz */ #define PCR_CLK_12 0x00000000 /* 12MHz */ #define PCR_DMPD1 0x00800000 /* pull down D- on port 1 */ #define PCR_DPPD1 0x00400000 /* pull down D+ on port 1 */ #define PCR_PORT0_RST 0x00200000 /* port 0 reset */ #define PCR_PORT1_RST 0x00100000 /* port 1 reset */ #define PCR_WORD_I_F0 0x00080000 /* 1: 16bit/30M, 8/60 otherw. */ #define PCR_WORD_I_F1 0x00040000 /* same for port 1 */ #define PCR_COMPDISTUNE 0x00038000 /* disconnect threshold */ #define PCR_SQRXTUNE1 0x00007000 /* squelch threshold */ #define PCR_TXFSLSTUNE1 0x00000f00 /* FS/LS impedance adj. */ #define PCR_TXPREEMPH 0x00000080 /* HS transm. pre-emphasis */ #define PCR_TXHSXVTUNE1 0x00000060 /* dp/dm voltage adj. */ #define PCR_TXVREFTUNE1 0x00000017 /* HS DC voltage adj. */ #define PCR_TXRISETUNE1 0x00000001 /* rise/fall wave adj. 
*/ /* power manager */ #define JZ_LPCR 0x00000004 #define LPCR_PD_SCPU (1u << 31) /* CPU1 power down */ #define LPCR_PD_VPU (1u << 30) /* VPU power down */ #define LPCR_PD_GPU (1u << 29) /* GPU power down */ #define LPCR_PD_GPS (1u << 28) /* GPS power down */ #define LPCR_SCPUS (1u << 27) /* CPU1 power down status */ #define LPCR_VPUS (1u << 26) /* VPU power down status */ #define LPCR_GPUS (1u << 25) /* GPU power down status */ #define LPCR_GPSS (1u << 24) /* GPS power down status */ #define LPCR_GPU_IDLE (1u << 20) /* GPU idle status */ #define LPCR_PST_SHIFT 8 /* Power stability time */ #define LPCR_PST_MASK (0xFFFu << 8) #define LPCR_DUTY_SHIFT 3 /* CPU clock duty */ #define LPCR_DUTY_MASK (0x1Fu << 3) #define LPCR_DOZE (1u << 2) /* Doze mode */ #define LPCR_LPM_SHIFT 0 /* Low power mode */ #define LPCR_LPM_MASK (0x03u << 0) #define JZ_OPCR 0x00000024 /* Oscillator Power Control Reg. */ #define OPCR_IDLE_DIS 0x80000000 /* don't stop CPU clk on idle */ #define OPCR_GPU_CLK_ST 0x40000000 /* stop GPU clock */ #define OPCR_L2CM_M 0x0c000000 #define OPCR_L2CM_ON 0x00000000 /* L2 stays on in sleep */ #define OPCR_L2CM_RET 0x04000000 /* L2 retention mode in sleep */ #define OPCR_L2CM_OFF 0x08000000 /* L2 powers down in sleep */ #define OPCR_SPENDN0 0x00000080 /* 0 - OTG port forced down */ #define OPCR_SPENDN1 0x00000040 /* 0 - UHC port forced down */ #define OPCR_BUS_MODE 0x00000020 /* 1 - bursts */ #define OPCR_O1SE 0x00000010 /* EXTCLK on in sleep */ #define OPCR_PD 0x00000008 /* P0 down in sleep */ #define OPCR_ERCS 0x00000004 /* 1 RTCCLK, 0 EXTCLK/512 */ #define OPCR_CPU_MODE 0x00000002 /* 1 access 'accelerated' */ #define OPCR_OSE 0x00000001 /* disable EXTCLK */ #define JZ_SPCR0 0x000000b8 /* SRAM Power Control Registers */ #define JZ_SPCR1 0x000000bc #define JZ_SRBC 0x000000c4 /* Soft Reset & Bus Control */ #define SRBC_UHC_SR 0x00004000 /* UHC soft reset*/ /* * random number generator * * Its function currently isn't documented by Ingenic. * However, testing suggests that it works as expected. 
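 * One plausible access sequence, using the readreg()/writereg()
 * helpers above and the JZ_ERNG and JZ_RNG offsets defined just below;
 * the assumption that bit 0 of JZ_ERNG gates the block is ours and is
 * not backed by any Ingenic document:
 *
 *	writereg(JZ_CGU_BASE + JZ_ERNG, 1);
 *	val = readreg(JZ_CGU_BASE + JZ_RNG);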
*/ #define JZ_ERNG 0x000000d8 #define JZ_RNG 0x000000dc /* Interrupt controller */ #define JZ_ICBASE 0x10001000 /* IC base address */ #define JZ_ICSR0 0x00000000 /* raw IRQ line status */ #define JZ_ICMR0 0x00000004 /* IRQ mask, 1 masks IRQ */ #define JZ_ICMSR0 0x00000008 /* sets bits in mask register */ #define JZ_ICMCR0 0x0000000c /* clears bits in mask register */ #define JZ_ICPR0 0x00000010 /* line status after masking */ #define JZ_ICSR1 0x00000020 /* raw IRQ line status */ #define JZ_ICMR1 0x00000024 /* IRQ mask, 1 masks IRQ */ #define JZ_ICMSR1 0x00000028 /* sets bits in mask register */ #define JZ_ICMCR1 0x0000002c /* clears bits in mask register */ #define JZ_ICPR1 0x00000030 /* line status after masking */ #define JZ_DSR0 0x00000034 /* source for PDMA */ #define JZ_DMR0 0x00000038 /* mask for PDMA */ #define JZ_DPR0 0x0000003c /* pending for PDMA */ #define JZ_DSR1 0x00000040 /* source for PDMA */ #define JZ_DMR1 0x00000044 /* mask for PDMA */ #define JZ_DPR1 0x00000048 /* pending for PDMA */ /* memory controller */ #define JZ_DMMAP0 0x13010024 #define JZ_DMMAP1 0x13010028 #define DMMAP_BASE 0x0000ff00 /* base PADDR of memory chunk */ #define DMMAP_MASK 0x000000ff /* mask which bits of PADDR are * constant */ /* USB controllers */ #define JZ_EHCI_BASE 0x13490000 #define JZ_EHCI_REG_UTMI_BUS 0x000000b0 #define UTMI_BUS_WIDTH 0x00000040 #define JZ_OHCI_BASE 0x134a0000 #define JZ_DWC2_BASE 0x13500000 #define JZ_DWC2_GUSBCFG 0 /* Ethernet */ #define JZ_DME_BASE 0x16000000 #define JZ_DME_IO 0 #define JZ_DME_DATA 2 /* GPIO */ #define JZ_GPIO_A_BASE 0x10010000 #define JZ_GPIO_B_BASE 0x10010100 #define JZ_GPIO_C_BASE 0x10010200 #define JZ_GPIO_D_BASE 0x10010300 #define JZ_GPIO_E_BASE 0x10010400 #define JZ_GPIO_F_BASE 0x10010500 /* GPIO registers per port */ #define JZ_GPIO_PIN 0x00000000 /* pin level register */ /* 0 - normal gpio, 1 - interrupt */ #define JZ_GPIO_INT 0x00000010 /* interrupt register */ #define JZ_GPIO_INTS 0x00000014 /* interrupt set register */ #define JZ_GPIO_INTC 0x00000018 /* interrupt clear register */ /* * INT == 1: 1 disables interrupt * INT == 0: device select, see below */ #define JZ_GPIO_MASK 0x00000020 /* port mask register */ #define JZ_GPIO_MASKS 0x00000024 /* port mask set register */ #define JZ_GPIO_MASKC 0x00000028 /* port mask clear register */ /* * INT == 1: 0 - level triggered, 1 - edge triggered * INT == 0: 0 - device select, see below */ #define JZ_GPIO_PAT1 0x00000030 /* pattern 1 register */ #define JZ_GPIO_PAT1S 0x00000034 /* pattern 1 set register */ #define JZ_GPIO_PAT1C 0x00000038 /* pattern 1 clear register */ /* * INT == 1: * PAT1 == 0: 0 - trigger on low, 1 - trigger on high * PAT0 == 1: 0 - trigger on falling edge, 1 - trigger on rising edge * INT == 0: * MASK == 0: * PAT1 == 0: 0 - device 0, 1 - device 1 * PAT0 == 1: 0 - device 2, 1 - device 3 * MASK == 1: * PAT1 == 0: set gpio output * PAT1 == 1: pin is input */ #define JZ_GPIO_PAT0 0x00000040 /* pattern 0 register */ #define JZ_GPIO_PAT0S 0x00000044 /* pattern 0 set register */ #define JZ_GPIO_PAT0C 0x00000048 /* pattern 0 clear register */ /* 1 - interrupt happened */ #define JZ_GPIO_FLAG 0x00000050 /* flag register */ #define JZ_GPIO_FLAGC 0x00000058 /* flag clear register */ /* 1 - disable pull up/down resistors */ #define JZ_GPIO_DPULL 0x00000070 /* pull disable register */ #define JZ_GPIO_DPULLS 0x00000074 /* pull disable set register */ #define JZ_GPIO_DPULLC 0x00000078 /* pull disable clear register */ /* the following are uncommented in the manual */ #define JZ_GPIO_DRVL
0x00000080 /* drive low register */ #define JZ_GPIO_DRVLS 0x00000084 /* drive low set register */ #define JZ_GPIO_DRVLC 0x00000088 /* drive low clear register */ #define JZ_GPIO_DIR 0x00000090 /* direction register */ #define JZ_GPIO_DIRS 0x00000094 /* direction register */ #define JZ_GPIO_DIRC 0x00000098 /* direction register */ #define JZ_GPIO_DRVH 0x000000a0 /* drive high register */ #define JZ_GPIO_DRVHS 0x000000a4 /* drive high set register */ #define JZ_GPIO_DRVHC 0x000000a8 /* drive high clear register */ /* I2C / SMBus */ #define JZ_SMB0_BASE 0x10050000 #define JZ_SMB1_BASE 0x10051000 #define JZ_SMB2_BASE 0x10052000 #define JZ_SMB3_BASE 0x10053000 #define JZ_SMB4_BASE 0x10054000 /* SMBus register offsets, per port */ #define JZ_SMBCON 0x00 /* SMB control */ #define JZ_STPHLD 0x80 /* Stop Hold Enable bit */ #define JZ_SLVDIS 0x40 /* 1 - slave disabled */ #define JZ_REST 0x20 /* 1 - allow RESTART */ #define JZ_MATP 0x10 /* 1 - enable 10bit addr. for master */ #define JZ_SATP 0x08 /* 1 - enable 10bit addr. for slave */ #define JZ_SPD_M 0x06 /* bus speed control */ #define JZ_SPD_100KB 0x02 /* 100kBit/s mode */ #define JZ_SPD_400KB 0x04 /* 400kBit/s mode */ #define JZ_MD 0x01 /* enable master */ #define JZ_SMBTAR 0x04 /* SMB target address */ #define JZ_SMATP 0x1000 /* enable 10bit master addr */ #define JZ_SPECIAL 0x0800 /* 1 - special command */ #define JZ_START 0x0400 /* 1 - send START */ #define JZ_SMBTAR_M 0x03ff /* target address */ #define JZ_SMBSAR 0x08 /* SMB slave address */ #define JZ_SMBDC 0x10 /* SMB data buffer and command */ #define JZ_CMD 0x100 /* 1 - read, 0 - write */ #define JZ_DATA 0x0ff #define JZ_SMBSHCNT 0x14 /* Standard speed SMB SCL high count */ #define JZ_SMBSLCNT 0x18 /* Standard speed SMB SCL low count */ #define JZ_SMBFHCNT 0x1C /* Fast speed SMB SCL high count */ #define JZ_SMBFLCNT 0x20 /* Fast speed SMB SCL low count */ #define JZ_SMBINTST 0x2C /* SMB Interrupt Status */ - #define JZ_ISTT 0x400 /* START or RESTART occured */ - #define JZ_ISTP 0x200 /* STOP occured */ - #define JZ_TXABT 0x40 /* ABORT occured */ + #define JZ_ISTT 0x400 /* START or RESTART occurred */ + #define JZ_ISTP 0x200 /* STOP occurred */ + #define JZ_TXABT 0x40 /* ABORT occurred */ #define JZ_TXEMP 0x10 /* TX FIFO is low */ #define JZ_TXOF 0x08 /* TX FIFO is high */ #define JZ_RXFL 0x04 /* RX FIFO is at JZ_SMBRXTL*/ #define JZ_RXOF 0x02 /* RX FIFO is high */ #define JZ_RXUF 0x01 /* RX FIFO underflow */ #define JZ_SMBINTM 0x30 /* SMB Interrupt Mask */ #define JZ_SMBRXTL 0x38 /* SMB RxFIFO Threshold */ #define JZ_SMBTXTL 0x3C /* SMB TxFIFO Threshold */ #define JZ_SMBCINT 0x40 /* Clear Interrupts */ #define JZ_CLEARALL 0x01 #define JZ_SMBCRXUF 0x44 /* Clear RXUF Interrupt */ #define JZ_SMBCRXOF 0x48 /* Clear RX_OVER Interrupt */ #define JZ_SMBCTXOF 0x4C /* Clear TX_OVER Interrupt */ #define JZ_SMBCRXREQ 0x50 /* Clear RDREQ Interrupt */ #define JZ_SMBCTXABT 0x54 /* Clear TX_ABRT Interrupt */ #define JZ_SMBCRXDN 0x58 /* Clear RX_DONE Interrupt */ #define JZ_SMBCACT 0x5c /* Clear ACTIVITY Interrupt */ #define JZ_SMBCSTP 0x60 /* Clear STOP Interrupt */ #define JZ_SMBCSTT 0x64 /* Clear START Interrupt */ #define JZ_SMBCGC 0x68 /* Clear GEN_CALL Interrupt */ #define JZ_SMBENB 0x6C /* SMB Enable */ #define JZ_ENABLE 0x01 #define JZ_SMBST 0x70 /* SMB Status register */ #define JZ_SLVACT 0x40 /* slave is active */ #define JZ_MSTACT 0x20 /* master is active */ #define JZ_RFF 0x10 /* RX FIFO is full */ #define JZ_RFNE 0x08 /* RX FIFO not empty */ #define JZ_TFE 0x04 /* TX FIFO is empty */ #define 
JZ_TFNF 0x02 /* TX FIFO is not full */ #define JZ_ACT 0x01 /* JZ_SLVACT | JZ_MSTACT */ #define JZ_SMBABTSRC 0x80 /* SMB Transmit Abort Status Register */ #define JZ_SMBDMACR 0x88 /* DMA Control Register */ #define JZ_SMBDMATDL 0x8c /* DMA Transmit Data Level */ #define JZ_SMBDMARDL 0x90 /* DMA Receive Data Level */ #define JZ_SMBSDASU 0x94 /* SMB SDA Setup Register */ #define JZ_SMBACKGC 0x98 /* SMB ACK General Call Register */ #define JZ_SMBENBST 0x9C /* SMB Enable Status Register */ #define JZ_SMBSDAHD 0xD0 /* SMB SDA HolD time Register */ #define JZ_HDENB 0x100 /* enable hold time */ /* SD/MMC hosts */ #define JZ_MSC0_BASE 0x13450000 #define JZ_MSC1_BASE 0x13460000 #define JZ_MSC2_BASE 0x13470000 #define JZ_MSC_CTRL 0x00 #define JZ_SEND_CCSD 0x8000 #define JZ_SEND_AS_CCSD 0x4000 #define JZ_EXIT_MULTIPLE 0x0080 #define JZ_EXIT_TRANSFER 0x0040 #define JZ_START_READWAIT 0x0020 #define JZ_STOP_READWAIT 0x0010 #define JZ_RESET 0x0008 #define JZ_START_OP 0x0004 #define JZ_CLOCK_CTRL_M 0x0003 #define JZ_CLOCK_START 0x0002 #define JZ_CLOCK_STOP 0x0001 #define JZ_MSC_STAT 0x04 #define JZ_AUTO_CMD12_DONE 0x80000000 #define JZ_AUTO_CMD23_DONE 0x40000000 #define JZ_SVS 0x20000000 #define JZ_PIN_LEVEL_M 0x1f000000 #define JZ_BCE 0x00100000 /* boot CRC error */ #define JZ_BDE 0x00080000 /* boot data end */ #define JZ_BAE 0x00040000 /* boot acknowledge error */ #define JZ_BAR 0x00020000 /* boot ack. received */ #define JZ_DMAEND 0x00010000 #define JZ_IS_RESETTING 0x00008000 #define JZ_SDIO_INT_ACTIVE 0x00004000 #define JZ_PRG_DONE 0x00002000 #define JZ_DATA_TRAN_DONE 0x00001000 #define JZ_END_CMD_RES 0x00000800 #define JZ_DATA_FIFO_AFULL 0x00000400 #define JZ_IS_READWAIT 0x00000200 #define JZ_CLK_EN 0x00000100 #define JZ_DATA_FIFO_FULL 0x00000080 #define JZ_DATA_FIFO_EMPTY 0x00000040 #define JZ_CRC_RES_ERR 0x00000020 #define JZ_CRC_READ_ERR 0x00000010 #define JZ_CRC_WRITE_ERR_M 0x0000000c #define JZ_CRC_WRITE_OK 0x00000000 #define JZ_CRC_CARD_ERR 0x00000004 #define JZ_CRC_NO_STATUS 0x00000008 #define JZ_TIME_OUT_RES 0x00000002 #define JZ_TIME_OUT_READ 0x00000001 #define JZ_MSC_CLKRT 0x08 #define JZ_DEV_CLK 0x0 #define JZ_DEV_CLK_2 0x1 /* DEV_CLK / 2 */ #define JZ_DEV_CLK_4 0x2 /* DEV_CLK / 4 */ #define JZ_DEV_CLK_8 0x3 /* DEV_CLK / 8 */ #define JZ_DEV_CLK_16 0x4 /* DEV_CLK / 16 */ #define JZ_DEV_CLK_32 0x5 /* DEV_CLK / 32 */ #define JZ_DEV_CLK_64 0x6 /* DEV_CLK / 64 */ #define JZ_DEV_CLK_128 0x7 /* DEV_CLK / 128 */ #define JZ_MSC_CMDAT 0x0c #define JZ_CCS_EXPECTED 0x80000000 #define JZ_READ_CEATA 0x40000000 #define JZ_DIS_BOOT 0x08000000 #define JZ_ENA_BOOT 0x04000000 #define JZ_EXP_BOOT_ACK 0x02000000 #define JZ_BOOT_MODE 0x01000000 #define JZ_AUTO_CMD23 0x00040000 #define JZ_SDIO_PRDT 0x00020000 #define JZ_AUTO_CMD12 0x00010000 #define JZ_RTRG_M 0x0000c000 /* receive FIFO trigger */ #define JZ_RTRG_16 0x00000000 /* >= 16 */ #define JZ_RTRG_32 0x00004000 /* >= 32 */ #define JZ_RTRG_64 0x00008000 /* >= 64 */ #define JZ_RTRG_96 0x0000c000 /* >= 96 */ #define JZ_TTRG_M 0x00003000 /* transmit FIFO trigger */ #define JZ_TTRG_16 0x00000000 /* >= 16 */ #define JZ_TTRG_32 0x00001000 /* >= 32 */ #define JZ_TTRG_64 0x00002000 /* >= 64 */ #define JZ_TTRG_96 0x00003000 /* >= 96 */ #define JZ_IO_ABORT 0x00000800 #define JZ_BUS_WIDTH_M 0x00000600 #define JZ_BUS_1BIT 0x00000000 #define JZ_BUS_4BIT 0x00000400 #define JZ_BUS_8BIT 0x00000600 #define JZ_INIT 0x00000080 /* send 80 clk init before cmd */ #define JZ_BUSY 0x00000040 #define JZ_STREAM 0x00000020 #define JZ_WRITE 0x00000010 /* read otherwise */ #define 
JZ_DATA_EN 0x00000008 #define JZ_RESPONSE_M 0x00000007 /* response format */ #define JZ_RES_NONE 0x00000000 #define JZ_RES_R1 0x00000001 /* R1 and R1b */ #define JZ_RES_R2 0x00000002 #define JZ_RES_R3 0x00000003 #define JZ_RES_R4 0x00000004 #define JZ_RES_R5 0x00000005 #define JZ_RES_R6 0x00000006 #define JZ_RES_R7 0x00000007 #define JZ_MSC_RESTO 0x10 /* 16bit response timeout in MSC_CLK */ #define JZ_MSC_RDTO 0x14 /* 32bit read timeout in MSC_CLK */ #define JZ_MSC_BLKLEN 0x18 /* 16bit block length */ #define JZ_MSC_NOB 0x1c /* 16bit block counter */ #define JZ_MSC_SNOB 0x20 /* 16bit successful block counter */ #define JZ_MSC_IMASK 0x24 /* interrupt mask */ #define JZ_INT_AUTO_CMD23_DONE 0x40000000 #define JZ_INT_SVS 0x20000000 #define JZ_INT_PIN_LEVEL_M 0x1f000000 #define JZ_INT_BCE 0x00100000 #define JZ_INT_BDE 0x00080000 #define JZ_INT_BAE 0x00040000 #define JZ_INT_BAR 0x00020000 #define JZ_INT_DMAEND 0x00010000 #define JZ_INT_AUTO_CMD12_DONE 0x00008000 #define JZ_INT_DATA_FIFO_FULL 0x00004000 #define JZ_INT_DATA_FIFO_EMPTY 0x00002000 #define JZ_INT_CRC_RES_ERR 0x00001000 #define JZ_INT_CRC_READ_ERR 0x00000800 #define JZ_INT_CRC_WRITE_ERR 0x00000400 #define JZ_INT_TIMEOUT_RES 0x00000200 #define JZ_INT_TIMEOUT_READ 0x00000100 #define JZ_INT_SDIO 0x00000080 #define JZ_INT_TXFIFO_WR_REQ 0x00000040 #define JZ_INT_RXFIFO_RD_REQ 0x00000020 #define JZ_INT_END_CMD_RES 0x00000004 #define JZ_INT_PRG_DONE 0x00000002 #define JZ_INT_DATA_TRAN_DONE 0x00000001 #define JZ_MSC_IFLG 0x28 /* interrupt flags */ #define JZ_MSC_CMD 0x2c /* 6bit CMD index */ #define JZ_MSC_ARG 0x30 /* 32bit argument */ #define JZ_MSC_RES 0x34 /* 8x16bit response data FIFO */ #define JZ_MSC_RXFIFO 0x38 #define JZ_MSC_TXFIFO 0x3c #define JZ_MSC_LPM 0x40 #define JZ_DRV_SEL_M 0xc0000000 #define JZ_FALLING_EDGE 0x00000000 #define JZ_RISING_1NS 0x40000000 /* 1ns delay */ #define JZ_RISING_4 0x80000000 /* 1/4 MSC_CLK delay */ #define JZ_SMP_SEL 0x20000000 /* 1 - rising edge */ #define JZ_LPM 0x00000001 /* low power mode */ #define JZ_MSC_DMAC 0x44 #define JZ_MODE_SEL 0x80 /* 1 - specify transfer length */ #define JZ_AOFST_M 0x60 /* address offset in bytes */ #define JZ_AOFST_S 6 /* address offset shift */ #define JZ_ALIGNEN 0x10 /* allow non-32bit-aligned transfers */ #define JZ_INCR_M 0x0c /* burst type */ #define JZ_INCR_16 0x00 #define JZ_INCR_32 0x04 #define JZ_INCR_64 0x08 #define JZ_DMASEL 0x02 /* 1 - SoC DMAC, 0 - MSC built-in */ #define JZ_DMAEN 0x01 /* enable DMA */ #define JZ_MSC_DMANDA 0x48 /* next descriptor paddr */ #define JZ_MSC_DMADA 0x4c /* current descriptor */ #define JZ_MSC_DMALEN 0x50 /* transfer length */ #define JZ_MSC_DMACMD 0x54 #define JZ_DMA_IDI_M 0xff000000 #define JZ_DMA_ID_M 0x00ff0000 #define JZ_DMA_AOFST_M 0x00000600 #define JZ_DMA_ALIGN 0x00000100 #define JZ_DMA_ENDI 0x00000002 #define JZ_DMA_LINK 0x00000001 #define JZ_MSC_CTRL2 0x58 #define JZ_PIP 0x1f000000 /* 1 - intr trigger on high */ #define JZ_RST_EN 0x00800000 #define JZ_STPRM 0x00000010 #define JZ_SVC 0x00000008 #define JZ_SMS_M 0x00000007 #define JZ_SMS_DEF 0x00000000 /* default speed */ #define JZ_SMS_HIGH 0x00000001 /* high speed */ #define JZ_SMS_SDR12 0x00000002 #define JZ_SMS_SDR25 0x00000003 #define JZ_SMS_SDR50 0x00000004 #define JZ_MSC_RTCNT 0x5c /* RT FIFO count */ /* EFUSE Slave Interface */ #define JZ_EFUSE 0x134100D0 #define JZ_EFUCTRL 0x00 #define JZ_EFUSE_BANK 0x40000000 /* select upper 4KBit */ #define JZ_EFUSE_ADDR_M 0x3fe00000 /* in bytes */ #define JZ_EFUSE_ADDR_SHIFT 21 #define JZ_EFUSE_SIZE_M 0x001f0000 /* in bytes */
#define JZ_EFUSE_SIZE_SHIFT 16 #define JZ_EFUSE_PROG 0x00008000 /* enable programming */ #define JZ_EFUSE_WRITE 0x00000002 /* write enable */ #define JZ_EFUSE_READ 0x00000001 /* read enable */ #define JZ_EFUCFG 0x04 #define JZ_EFUSE_INT_E 0x80000000 /* which IRQ? */ #define JZ_EFUSE_RD_ADJ_M 0x00f00000 #define JZ_EFUSE_RD_STROBE 0x000f0000 #define JZ_EFUSE_WR_ADJUST 0x0000f000 #define JZ_EFUSE_WR_STROBE 0x00000fff #define JZ_EFUSTATE 0x08 #define JZ_EFUSE_GLOBAL_P 0x00008000 /* wr protect bits */ #define JZ_EFUSE_CHIPID_P 0x00004000 #define JZ_EFUSE_CUSTID_P 0x00002000 #define JZ_EFUSE_SECWR_EN 0x00001000 #define JZ_EFUSE_PC_P 0x00000800 #define JZ_EFUSE_HDMIKEY_P 0x00000400 #define JZ_EFUSE_SECKEY_P 0x00000200 #define JZ_EFUSE_SECBOOT_EN 0x00000100 #define JZ_EFUSE_HDMI_BUSY 0x00000004 #define JZ_EFUSE_WR_DONE 0x00000002 #define JZ_EFUSE_RD_DONE 0x00000001 #define JZ_EFUDATA0 0x0C #define JZ_EFUDATA1 0x10 #define JZ_EFUDATA2 0x14 #define JZ_EFUDATA3 0x18 #define JZ_EFUDATA4 0x1C #define JZ_EFUDATA5 0x20 #define JZ_EFUDATA6 0x24 #define JZ_EFUDATA7 0x28 /* NEMC */ #define JZ_NEMC_BASE 0x13410000 #define JZ_NEMC_SMCR(n) (0x10 + (n) * 4) # define JZ_NEMC_SMCR_SMT_SHIFT 0 # define JZ_NEMC_SMCR_SMT_WIDTH 1 # define JZ_NEMC_SMCR_SMT_MASK (((1 << JZ_NEMC_SMCR_SMT_WIDTH) - 1) << JZ_NEMC_SMCR_SMT_SHIFT) # define JZ_NEMC_SMCR_SMT_NORMAL (0 << JZ_NEMC_SMCR_SMT_SHIFT) # define JZ_NEMC_SMCR_SMT_BROM (1 << JZ_NEMC_SMCR_SMT_SHIFT) # define JZ_NEMC_SMCR_BL_SHIFT 1 # define JZ_NEMC_SMCR_BL_WIDTH 2 # define JZ_NEMC_SMCR_BL_MASK (((1 << JZ_NEMC_SMCR_BL_WIDTH) - 1) << JZ_NEMC_SMCR_BL_SHIFT) # define JZ_NEMC_SMCR_BL(n) (((n) << JZ_NEMC_SMCR_BL_SHIFT) # define JZ_NEMC_SMCR_BW_SHIFT 6 # define JZ_NEMC_SMCR_BW_WIDTH 2 # define JZ_NEMC_SMCR_BW_MASK (((1 << JZ_NEMC_SMCR_BW_WIDTH) - 1) << JZ_NEMC_SMCR_BW_SHIFT) # define JZ_NEMC_SMCR_BW_8 (0 << JZ_NEMC_SMCR_BW_SHIFT) # define JZ_NEMC_SMCR_TAS_SHIFT 8 # define JZ_NEMC_SMCR_TAS_WIDTH 4 # define JZ_NEMC_SMCR_TAS_MASK (((1 << JZ_NEMC_SMCR_TAS_WIDTH) - 1) << JZ_NEMC_SMCR_TAS_SHIFT) # define JZ_NEMC_SMCR_TAH_SHIFT 12 # define JZ_NEMC_SMCR_TAH_WIDTH 4 # define JZ_NEMC_SMCR_TAH_MASK (((1 << JZ_NEMC_SMCR_TAH_WIDTH) - 1) << JZ_NEMC_SMCR_TAH_SHIFT) # define JZ_NEMC_SMCR_TBP_SHIFT 16 # define JZ_NEMC_SMCR_TBP_WIDTH 4 # define JZ_NEMC_SMCR_TBP_MASK (((1 << JZ_NEMC_SMCR_TBP_WIDTH) - 1) << JZ_NEMC_SMCR_TBP_SHIFT) # define JZ_NEMC_SMCR_TAW_SHIFT 20 # define JZ_NEMC_SMCR_TAW_WIDTH 4 # define JZ_NEMC_SMCR_TAW_MASK (((1 << JZ_NEMC_SMCR_TAW_WIDTH) - 1) << JZ_NEMC_SMCR_TAW_SHIFT) # define JZ_NEMC_SMCR_STRV_SHIFT 24 # define JZ_NEMC_SMCR_STRV_WIDTH 4 # define JZ_NEMC_SMCR_STRV_MASK (((1 << JZ_NEMC_SMCR_STRV_WIDTH) - 1) << JZ_NEMC_SMCR_STRV_SHIFT) #define JZ_NEMC_SACR(n) (0x30 + (n) * 4) # define JZ_NEMC_SACR_MASK_SHIFT 0 # define JZ_NEMC_SACR_MASK_WIDTH 8 # define JZ_NEMC_SACR_MASK_MASK (((1 << JZ_NEMC_SACR_MASK_WIDTH) - 1) << JZ_NEMC_SACR_MASK_SHIFT) # define JZ_NEMC_SACR_ADDR_SHIFT 0 # define JZ_NEMC_SACR_ADDR_WIDTH 8 # define JZ_NEMC_SACR_ADDR_MASK (((1 << JZ_NEMC_SACR_ADDR_WIDTH) - 1) << JZ_NEMC_SACR_ADDR_SHIFT) #define JC_NEMC_NFSCR 0x50 #endif /* JZ4780_REGS_H */ diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h index f26a2ae34eed..ccd26c9ac0de 100644 --- a/sys/net/if_pfsync.h +++ b/sys/net/if_pfsync.h @@ -1,266 +1,266 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001 Michael Shalayeff * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2008 David Gwynne * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ /* * $OpenBSD: if_pfsync.h,v 1.35 2008/06/29 08:42:15 mcbride Exp $ * $FreeBSD$ */ #ifndef _NET_IF_PFSYNC_H_ #define _NET_IF_PFSYNC_H_ #define PFSYNC_VERSION 5 #define PFSYNC_DFLTTL 255 #define PFSYNC_ACT_CLR 0 /* clear all states */ #define PFSYNC_ACT_INS 1 /* insert state */ -#define PFSYNC_ACT_INS_ACK 2 /* ack of insterted state */ +#define PFSYNC_ACT_INS_ACK 2 /* ack of inserted state */ #define PFSYNC_ACT_UPD 3 /* update state */ #define PFSYNC_ACT_UPD_C 4 /* "compressed" update state */ #define PFSYNC_ACT_UPD_REQ 5 /* request "uncompressed" state */ #define PFSYNC_ACT_DEL 6 /* delete state */ #define PFSYNC_ACT_DEL_C 7 /* "compressed" delete state */ #define PFSYNC_ACT_INS_F 8 /* insert fragment */ #define PFSYNC_ACT_DEL_F 9 /* delete fragments */ #define PFSYNC_ACT_BUS 10 /* bulk update status */ #define PFSYNC_ACT_TDB 11 /* TDB replay counter update */ #define PFSYNC_ACT_EOF 12 /* end of frame */ #define PFSYNC_ACT_MAX 13 /* * A pfsync frame is built from a header followed by several sections which * are all prefixed with their own subheaders. Frames must be terminated with * an EOF subheader. * * | ... | * | IP header | * +============================+ * | pfsync_header | * +----------------------------+ * | pfsync_subheader | * +----------------------------+ * | first action fields | * | ... | * +----------------------------+ * | pfsync_subheader | * +----------------------------+ * | second action fields | * | ... 
| * +----------------------------+ * | EOF pfsync_subheader | * +----------------------------+ * | HMAC | * +============================+ */ /* * Frame header */ struct pfsync_header { u_int8_t version; u_int8_t _pad; u_int16_t len; u_int8_t pfcksum[PF_MD5_DIGEST_LENGTH]; } __packed; /* * Frame region subheader */ struct pfsync_subheader { u_int8_t action; u_int8_t _pad; u_int16_t count; } __packed; /* * CLR */ struct pfsync_clr { char ifname[IFNAMSIZ]; u_int32_t creatorid; } __packed; /* * INS, UPD, DEL */ /* these use struct pfsync_state in pfvar.h */ /* * INS_ACK */ struct pfsync_ins_ack { u_int64_t id; u_int32_t creatorid; } __packed; /* * UPD_C */ struct pfsync_upd_c { u_int64_t id; struct pfsync_state_peer src; struct pfsync_state_peer dst; u_int32_t creatorid; u_int32_t expire; u_int8_t timeout; u_int8_t _pad[3]; } __packed; /* * UPD_REQ */ struct pfsync_upd_req { u_int64_t id; u_int32_t creatorid; } __packed; /* * DEL_C */ struct pfsync_del_c { u_int64_t id; u_int32_t creatorid; } __packed; /* * INS_F, DEL_F */ /* not implemented (yet) */ /* * BUS */ struct pfsync_bus { u_int32_t creatorid; u_int32_t endtime; u_int8_t status; #define PFSYNC_BUS_START 1 #define PFSYNC_BUS_END 2 u_int8_t _pad[3]; } __packed; /* * TDB */ struct pfsync_tdb { u_int32_t spi; union sockaddr_union dst; u_int32_t rpl; u_int64_t cur_bytes; u_int8_t sproto; u_int8_t updates; u_int8_t _pad[2]; } __packed; #define PFSYNC_HDRLEN sizeof(struct pfsync_header) struct pfsyncstats { u_int64_t pfsyncs_ipackets; /* total input packets, IPv4 */ u_int64_t pfsyncs_ipackets6; /* total input packets, IPv6 */ u_int64_t pfsyncs_badif; /* not the right interface */ u_int64_t pfsyncs_badttl; /* TTL is not PFSYNC_DFLTTL */ u_int64_t pfsyncs_hdrops; /* packets shorter than hdr */ u_int64_t pfsyncs_badver; /* bad (incl unsupp) version */ u_int64_t pfsyncs_badact; /* bad action */ u_int64_t pfsyncs_badlen; /* data length does not match */ u_int64_t pfsyncs_badauth; /* bad authentication */ u_int64_t pfsyncs_stale; /* stale state */ u_int64_t pfsyncs_badval; /* bad values */ u_int64_t pfsyncs_badstate; /* insert/lookup failed */ u_int64_t pfsyncs_opackets; /* total output packets, IPv4 */ u_int64_t pfsyncs_opackets6; /* total output packets, IPv6 */ u_int64_t pfsyncs_onomem; /* no memory for an mbuf */ u_int64_t pfsyncs_oerrors; /* ip output error */ u_int64_t pfsyncs_iacts[PFSYNC_ACT_MAX]; u_int64_t pfsyncs_oacts[PFSYNC_ACT_MAX]; }; /* * Configuration structure for SIOCSETPFSYNC SIOCGETPFSYNC */ struct pfsyncreq { char pfsyncr_syncdev[IFNAMSIZ]; struct in_addr pfsyncr_syncpeer; int pfsyncr_maxupdates; int pfsyncr_defer; }; #define SIOCSETPFSYNC _IOW('i', 247, struct ifreq) #define SIOCGETPFSYNC _IOWR('i', 248, struct ifreq) #ifdef _KERNEL /* * this shows where a pf state is with respect to the syncing. */ #define PFSYNC_S_INS 0x00 #define PFSYNC_S_IACK 0x01 #define PFSYNC_S_UPD 0x02 #define PFSYNC_S_UPD_C 0x03 #define PFSYNC_S_DEL 0x04 #define PFSYNC_S_COUNT 0x05 #define PFSYNC_S_DEFER 0xfe #define PFSYNC_S_NONE 0xff #define PFSYNC_SI_IOCTL 0x01 #define PFSYNC_SI_CKSUM 0x02 #define PFSYNC_SI_ACK 0x04 #endif /* _KERNEL */ #endif /* _NET_IF_PFSYNC_H_ */ diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h index 436383124dce..bdd56c94587e 100644 --- a/sys/netinet/tcp_log_buf.h +++ b/sys/netinet/tcp_log_buf.h @@ -1,386 +1,386 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016-2018 Netflix, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __tcp_log_buf_h__ #define __tcp_log_buf_h__ #define TCP_LOG_REASON_LEN 32 #define TCP_LOG_TAG_LEN 32 #define TCP_LOG_BUF_VER (9) /* * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires * 8-byte alignment to work properly on all platforms. Therefore, we will * enforce 8-byte alignment for all the structures that may appear by * themselves (instead of being embedded in another structure) in a data * stream. */ #define ALIGN_TCP_LOG __aligned(8) /* Information about the socketbuffer state. */ struct tcp_log_sockbuf { uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */ uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */ uint32_t tls_sb_spare; /* spare */ }; /* Optional, verbose information that may be appended to an event log. */ struct tcp_log_verbose { #define TCP_FUNC_LEN 32 char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */ char tlv_trace_func[TCP_FUNC_LEN]; /* Function that generated trace */ uint32_t tlv_trace_line; /* Line number that generated trace */ uint8_t _pad[4]; } ALIGN_TCP_LOG; /* Internal RACK state variables. */ struct tcp_log_rack { uint32_t tlr_rack_rtt; /* rc_rack_rtt */ uint8_t tlr_state; /* Internal RACK state */ uint8_t _pad[3]; /* Padding */ }; struct tcp_log_bbr { uint64_t cur_del_rate; uint64_t delRate; uint64_t rttProp; uint64_t bw_inuse; uint32_t inflight; uint32_t applimited; uint32_t delivered; uint32_t timeStamp; uint32_t epoch; uint32_t lt_epoch; uint32_t pkts_out; uint32_t flex1; uint32_t flex2; uint32_t flex3; uint32_t flex4; uint32_t flex5; uint32_t flex6; uint32_t lost; uint16_t pacing_gain; uint16_t cwnd_gain; uint16_t flex7; uint8_t bbr_state; uint8_t bbr_substate; uint8_t inhpts; uint8_t ininput; uint8_t use_lt_bw; uint8_t flex8; uint32_t pkt_epoch; }; /* Per-stack stack-specific info. 
*/ union tcp_log_stackspecific { struct tcp_log_rack u_rack; struct tcp_log_bbr u_bbr; }; struct tcp_log_buffer { /* Event basics */ struct timeval tlb_tv; /* Timestamp of trace */ uint32_t tlb_ticks; /* Timestamp of trace */ uint32_t tlb_sn; /* Serial number */ uint8_t tlb_stackid; /* Stack ID */ uint8_t tlb_eventid; /* Event ID */ uint16_t tlb_eventflags; /* Flags for the record */ #define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */ #define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */ #define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */ #define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */ #define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */ int tlb_errno; /* Event error (if any) */ /* Internal session state */ struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */ struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */ int tlb_state; /* TCPCB t_state */ uint32_t tlb_starttime; /* TCPCB t_starttime */ uint32_t tlb_iss; /* TCPCB iss */ uint32_t tlb_flags; /* TCPCB flags */ uint32_t tlb_snd_una; /* TCPCB snd_una */ uint32_t tlb_snd_max; /* TCPCB snd_max */ uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */ uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */ uint32_t tlb_snd_recover;/* TCPCB snd_recover */ uint32_t tlb_snd_wnd; /* TCPCB snd_wnd */ uint32_t tlb_snd_ssthresh; /* TCPCB snd_ssthresh */ uint32_t tlb_srtt; /* TCPCB t_srtt */ uint32_t tlb_rttvar; /* TCPCB t_rttvar */ uint32_t tlb_rcv_up; /* TCPCB rcv_up */ uint32_t tlb_rcv_adv; /* TCPCB rcv_adv */ uint32_t tlb_flags2; /* TCPCB t_flags2 */ uint32_t tlb_rcv_nxt; /* TCPCB rcv_nxt */ uint32_t tlb_rcv_wnd; /* TCPCB rcv_wnd */ uint32_t tlb_dupacks; /* TCPCB t_dupacks */ int tlb_segqlen; /* TCPCB segqlen */ int tlb_snd_numholes; /* TCPCB snd_numholes */ uint32_t tlb_flex1; /* Event specific information */ uint32_t tlb_flex2; /* Event specific information */ uint32_t tlb_fbyte_in; /* TCPCB first byte in time */ uint32_t tlb_fbyte_out; /* TCPCB first byte out time */ uint8_t tlb_snd_scale:4, /* TCPCB snd_scale */ tlb_rcv_scale:4; /* TCPCB rcv_scale */ uint8_t _pad[3]; /* Padding */ /* Per-stack info */ union tcp_log_stackspecific tlb_stackinfo; #define tlb_rack tlb_stackinfo.u_rack /* The packet */ uint32_t tlb_len; /* The packet's data length */ struct tcphdr tlb_th; /* The TCP header */ uint8_t tlb_opts[TCP_MAXOLEN]; /* The TCP options */ /* Verbose information (optional) */ struct tcp_log_verbose tlb_verbose[0]; } ALIGN_TCP_LOG; enum tcp_log_events { TCP_LOG_IN = 1, /* Incoming packet 1 */ TCP_LOG_OUT, /* Transmit (without other event) 2 */ TCP_LOG_RTO, /* Retransmit timeout 3 */ TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER, /* Detected reorder 7 */ TCP_LOG_HPTS, /* Hpts sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */ BBR_LOG_TIMERSTAR, /* Start a timer 13 */ BBR_LOG_TIMERCANC, /* Cancel a timer 14 */ BBR_LOG_ENTREC, /* Entered recovery 15 */ BBR_LOG_EXITREC, /* Exited recovery 16 */ BBR_LOG_CWND, /* Cwnd change 17 */ BBR_LOG_BWSAMP, /* LT B/W sample has been made 18 */ BBR_LOG_MSGSIZE, /* We received a EMSGSIZE error 19 */ BBR_LOG_BBRRTT, /* BBR RTT is updated 20 */ BBR_LOG_JUSTRET, /* We just returned out of output 21 */ - BBR_LOG_STATE, /* A BBR state change occured 22 */ - BBR_LOG_PKT_EPOCH, /* 
A BBR packet epoch occured 23 */ + BBR_LOG_STATE, /* A BBR state change occurred 22 */ + BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occurred 23 */ BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ TCP_LOG_FLOWEND, /* End of a flow 25 */ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ - BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ + BBR_LOG_TIME_EPOCH, /* A timed based Epoch occurred 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ TCP_HDWR_PACE_SIZE, /* TCP pacing size set (rl and rack uses this) 51 */ BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */ BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */ TCP_LOG_CONNEND, /* End of connection 54 */ TCP_LOG_LRO, /* LRO entry 55 */ TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */ TCP_SAD_DETECTION, /* Sack Attack Detection 57 */ TCP_TIMELY_WORK, /* Logs regarding Timely CC tweaks 58 */ TCP_LOG_USER_EVENT, /* User space event data 59 */ TCP_LOG_SENDFILE, /* sendfile() logging for TCP connections 60 */ TCP_LOG_HTTP_T, /* logging of http request tracking 61 */ TCP_LOG_END /* End (keep at end) 62 */ }; enum tcp_log_states { TCP_LOG_STATE_CLEAR = -1, /* Deactivate and clear tracing */ TCP_LOG_STATE_OFF = 0, /* Pause */ TCP_LOG_STATE_TAIL=1, /* Keep the trailing events */ TCP_LOG_STATE_HEAD=2, /* Keep the leading events */ TCP_LOG_STATE_HEAD_AUTO=3, /* Keep the leading events, and automatically dump them to the device */ TCP_LOG_STATE_CONTINUAL=4, /* Continually dump the data when full */ TCP_LOG_STATE_TAIL_AUTO=5, /* Keep the trailing events, and automatically dump them when the session ends */ }; /* Use this if we don't know whether the operation succeeded. */ #define ERRNO_UNK (-1) /* * If the user included dev/tcp_log/tcp_log_dev.h, then include our private * headers. Otherwise, there is no reason to pollute all the files with an * additional include. * * This structure is aligned to an 8-byte boundary to match the alignment * requirements of (struct tcp_log_buffer). 
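 *
 * One practical consequence of the alignment: sizeof() of any
 * ALIGN_TCP_LOG structure is itself a multiple of 8, so a consumer of
 * the binary stream can, for instance, step from record to record with
 *
 *	next = (struct tcp_log_common_header *)
 *	    ((char *)hdr + hdr->tlch_length);
 *
 * and stay aligned, as long as the emitted tlch_length values are also
 * multiples of 8 (a sketch of the intent, not a statement about the
 * device code).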
*/ #ifdef __tcp_log_dev_h__ struct tcp_log_header { struct tcp_log_common_header tlh_common; #define tlh_version tlh_common.tlch_version #define tlh_type tlh_common.tlch_type #define tlh_length tlh_common.tlch_length struct in_endpoints tlh_ie; struct timeval tlh_offset; /* Uptime -> UTC offset */ char tlh_id[TCP_LOG_ID_LEN]; char tlh_reason[TCP_LOG_REASON_LEN]; char tlh_tag[TCP_LOG_TAG_LEN]; uint8_t tlh_af; uint8_t _pad[7]; } ALIGN_TCP_LOG; #ifdef _KERNEL struct tcp_log_dev_log_queue { struct tcp_log_dev_queue tldl_common; char tldl_id[TCP_LOG_ID_LEN]; char tldl_reason[TCP_LOG_REASON_LEN]; char tldl_tag[TCP_LOG_TAG_LEN]; struct in_endpoints tldl_ie; struct tcp_log_stailq tldl_entries; int tldl_count; uint8_t tldl_af; }; #endif /* _KERNEL */ #endif /* __tcp_log_dev_h__ */ #ifdef _KERNEL #define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000 #define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000 /* * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always * tries to record verbose information. */ #define TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ tp->t_output_caller, __func__, __LINE__, tv);\ } while (0) /* * TCP_LOG_EVENT: This is a macro so we can capture function/line * information when needed. * * Prototype: * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, * struct sockbuf *txbuf, uint8_t eventid, int errornum, * union tcp_log_stackspecific *stackinfo) * * tp is mandatory and must be write locked. * th is optional; if present, it will appear in the record. * rxbuf and txbuf are optional; if present, they will appear in the record. * eventid is mandatory. * errornum is mandatory (it indicates the success or failure of the * operation associated with the event). * len indicates the length of the packet. If no packet, use 0. * stackinfo is optional; if present, it will appear in the record. 
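 *
 * A minimal usage sketch (illustrative; "so" and "tlen" are assumed
 * locals of the caller, not part of this header):
 *
 *	INP_WLOCK_ASSERT(tp->t_inpcb);
 *	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN,
 *	    0, tlen, NULL, 1);
 *
 * Here errornum is 0 (success), len is the segment length, stackinfo is
 * omitted and th_hostorder indicates the header has already been byte
 * swapped to host order.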
*/ #ifdef TCP_LOG_FORCEVERBOSE #define TCP_LOG_EVENT TCP_LOG_EVENT_VERBOSE #else #define TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \ do { \ if (tcp_log_verbose) \ TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, \ eventid, errornum, len, stackinfo, \ th_hostorder, NULL); \ else if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ NULL, NULL, 0, NULL); \ } while (0) #endif /* TCP_LOG_FORCEVERBOSE */ #define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ do { \ if (tp->t_logstate != TCP_LOG_STATE_OFF) \ tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ errornum, len, stackinfo, th_hostorder, \ NULL, NULL, 0, tv); \ } while (0) #ifdef TCP_BLACKBOX extern bool tcp_log_verbose; void tcp_log_drain(struct tcpcb *tp); int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force); void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason); struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv); size_t tcp_log_get_id(struct tcpcb *tp, char *buf); size_t tcp_log_get_tag(struct tcpcb *tp, char *buf); u_int tcp_log_get_id_cnt(struct tcpcb *tp); int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp); void tcp_log_init(void); int tcp_log_set_id(struct tcpcb *tp, char *id); int tcp_log_set_tag(struct tcpcb *tp, char *tag); int tcp_log_state_change(struct tcpcb *tp, int state); void tcp_log_tcpcbinit(struct tcpcb *tp); void tcp_log_tcpcbfini(struct tcpcb *tp); void tcp_log_flowend(struct tcpcb *tp); #else /* !TCP_BLACKBOX */ #define tcp_log_verbose (false) static inline struct tcp_log_buffer * tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv) { return (NULL); } #endif /* TCP_BLACKBOX */ #endif /* _KERNEL */ #endif /* __tcp_log_buf_h__ */ diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c index d57dd03ec4ec..f9535935a9db 100644 --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -1,14980 +1,14980 @@ /*- * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /** * Author: Randall Stewart * This work is based on the ACM Queue paper * BBR - Congestion Based Congestion Control * and also numerous discussions with Neal, Yuchung and Van. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #include #include #ifdef STATS #include #include #include /* Must come after qmath.h and tree.h */ #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef INET6 #include #endif #include #include #include #include #include #if defined(IPSEC) || defined(IPSEC_SUPPORT) #include #include #endif /* IPSEC */ #include #include #include #ifdef MAC #include #endif #include "sack_filter.h" #include "tcp_bbr.h" #include "rack_bbr_common.h" uma_zone_t bbr_zone; uma_zone_t bbr_pcb_zone; struct sysctl_ctx_list bbr_sysctl_ctx; struct sysctl_oid *bbr_sysctl_root; #define TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \ (tv) = (value); \ if ((u_long)(tv) < (u_long)(tvmin)) \ (tv) = (tvmin); \ if ((u_long)(tv) > (u_long)(tvmax)) \ (tv) = (tvmax); \ } while(0) /*#define BBR_INVARIANT 1*/ /* * initial window */ static uint32_t bbr_def_init_win = 10; static int32_t bbr_persist_min = 250000; /* 250ms */ static int32_t bbr_persist_max = 1000000; /* 1 Second */ static int32_t bbr_cwnd_may_shrink = 0; static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP; static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT; static int32_t bbr_hardware_pacing_limit = 8000; static int32_t bbr_quanta = 3; /* How much extra quanta do we get? */ static int32_t bbr_no_retran = 0; static int32_t bbr_error_base_paceout = 10000; /* usec to pace */ static int32_t bbr_max_net_error_cnt = 10; /* Should the following be dynamic too -- loss wise */ static int32_t bbr_rtt_gain_thresh = 0; /* Measurement controls */ static int32_t bbr_use_google_algo = 1; static int32_t bbr_ts_limiting = 1; static int32_t bbr_ts_can_raise = 0; static int32_t bbr_do_red = 600; static int32_t bbr_red_scale = 20000; static int32_t bbr_red_mul = 1; static int32_t bbr_red_div = 2; static int32_t bbr_red_growth_restrict = 1; static int32_t bbr_target_is_bbunit = 0; static int32_t bbr_drop_limit = 0; /* * How much gain do we need to see to * stay in startup? 
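 *
 * Worked reading of the defaults that follow (sketch, numbers only): with
 * bbr_start_exit = 25 the flow stays in startup only while the measured
 * bandwidth keeps growing by at least 25%, i.e. roughly
 *	new_bw >= old_bw * (100 + bbr_start_exit) / 100
 * and bbr_startup_loss_thresh = 2000 (20.00%) lets heavy per-packet-epoch
 * loss force the exit as well.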
*/ static int32_t bbr_marks_rxt_sack_passed = 0; static int32_t bbr_start_exit = 25; static int32_t bbr_low_start_exit = 25; /* When we are in reduced gain */ static int32_t bbr_startup_loss_thresh = 2000; /* 20.00% loss */ static int32_t bbr_hptsi_max_mul = 1; /* These two mul/div assure a min pacing */ static int32_t bbr_hptsi_max_div = 2; /* time, 0 means turned off. We need this * if we go back ever to where the pacer * has priority over timers. */ static int32_t bbr_policer_call_from_rack_to = 0; static int32_t bbr_policer_detection_enabled = 1; static int32_t bbr_min_measurements_req = 1; /* We need at least 2 * measurments before we are * "good" note that 2 == 1. * This is because we use a > * comparison. This means if * min_measure was 0, it takes * num-measures > min(0) and * you get 1 measurement and * you are good. Set to 1, you * have to have two * measurements (this is done * to prevent it from being ok * to have no measurements). */ static int32_t bbr_no_pacing_until = 4; static int32_t bbr_min_usec_delta = 20000; /* 20,000 usecs */ static int32_t bbr_min_peer_delta = 20; /* 20 units */ static int32_t bbr_delta_percent = 150; /* 15.0 % */ static int32_t bbr_target_cwnd_mult_limit = 8; /* * bbr_cwnd_min_val is the number of * segments we hold to in the RTT probe * state typically 4. */ static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS; static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS; static int32_t bbr_gain_to_target = 1; static int32_t bbr_gain_gets_extra_too = 1; /* * bbr_high_gain is the 2/ln(2) value we need * to double the sending rate in startup. This * is used for both cwnd and hptsi gain's. */ static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1; static int32_t bbr_use_lower_gain_in_startup = 1; /* thresholds for reduction on drain in sub-states/drain */ static int32_t bbr_drain_rtt = BBR_SRTT; static int32_t bbr_drain_floor = 88; static int32_t google_allow_early_out = 1; static int32_t google_consider_lost = 1; static int32_t bbr_drain_drop_mul = 4; static int32_t bbr_drain_drop_div = 5; static int32_t bbr_rand_ot = 50; static int32_t bbr_can_force_probertt = 0; static int32_t bbr_can_adjust_probertt = 1; static int32_t bbr_probertt_sets_rtt = 0; static int32_t bbr_can_use_ts_for_rtt = 1; static int32_t bbr_is_ratio = 0; static int32_t bbr_sub_drain_app_limit = 1; static int32_t bbr_prtt_slam_cwnd = 1; static int32_t bbr_sub_drain_slam_cwnd = 1; static int32_t bbr_slam_cwnd_in_main_drain = 1; static int32_t bbr_filter_len_sec = 6; /* How long does the rttProp filter * hold */ static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4); /* * bbr_drain_gain is the reverse of the high_gain * designed to drain back out the standing queue * that is formed in startup by causing a larger * hptsi gain and thus drainging the packets * in flight. */ static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885; static int32_t bbr_rttprobe_gain = 192; /* * The cwnd_gain is the default cwnd gain applied when * calculating a target cwnd. Note that the cwnd is * a secondary factor in the way BBR works (see the * paper and think about it, it will take some time). * Basically the hptsi_gain spreads the packets out * so you never get more than BDP to the peer even * if the cwnd is high. In our implemenation that * means in non-recovery/retransmission scenarios * cwnd will never be reached by the flight-size. 
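 *
 * Worked example with illustrative numbers: at an estimated bandwidth of
 * 12.5 Mbytes/sec and an rttProp of 40000 usec the BDP is
 *	12500000 * 0.040 = 500 kbytes
 * so the default bbr_cwnd_gain below (2 * BBR_UNIT) yields a target cwnd
 * of about 1 Mbyte, twice the BDP, while the pacing (hptsi) gain is what
 * actually limits how much reaches the peer per RTT.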
*/ static int32_t bbr_cwnd_gain = BBR_UNIT * 2; static int32_t bbr_tlp_type_to_use = BBR_SRTT; static int32_t bbr_delack_time = 100000; /* 100ms in useconds */ static int32_t bbr_sack_not_required = 0; /* set to one to allow non-sack to use bbr */ static int32_t bbr_initial_bw_bps = 62500; /* 500kbps in bytes ps */ static int32_t bbr_ignore_data_after_close = 1; static int16_t bbr_hptsi_gain[] = { (BBR_UNIT *5 / 4), (BBR_UNIT * 3 / 4), BBR_UNIT, BBR_UNIT, BBR_UNIT, BBR_UNIT, BBR_UNIT, BBR_UNIT }; int32_t bbr_use_rack_resend_cheat = 1; int32_t bbr_sends_full_iwnd = 1; #define BBR_HPTSI_GAIN_MAX 8 /* * The BBR module incorporates a number of * TCP ideas that have been put out into the IETF * over the last few years: * - Yuchung Cheng's RACK TCP (for which its named) that * will stop us using the number of dup acks and instead * use time as the gage of when we retransmit. * - Reorder Detection of RFC4737 and the Tail-Loss probe draft * of Dukkipati et.al. * - Van Jacobson's et.al BBR. * * RACK depends on SACK, so if an endpoint arrives that * cannot do SACK the state machine below will shuttle the * connection back to using the "default" TCP stack that is * in FreeBSD. * * To implement BBR and RACK the original TCP stack was first decomposed * into a functional state machine with individual states * for each of the possible TCP connection states. The do_segement * functions role in life is to mandate the connection supports SACK * initially and then assure that the RACK state matches the conenction * state before calling the states do_segment function. Data processing * of inbound segments also now happens in the hpts_do_segment in general * with only one exception. This is so we can keep the connection on * a single CPU. * * Each state is simplified due to the fact that the original do_segment * has been decomposed and we *know* what state we are in (no * switches on the state) and all tests for SACK are gone. This * greatly simplifies what each state does. * * TCP output is also over-written with a new version since it * must maintain the new rack scoreboard and has had hptsi * integrated as a requirment. Still todo is to eliminate the * use of the callout_() system and use the hpts for all * timers as well. 
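 *
 * A rough sketch of the per-state decomposition described above (the
 * handler names and argument list here are illustrative only):
 *
 *	switch (tp->t_state) {
 *	case TCPS_SYN_SENT:
 *		retval = bbr_do_syn_sent(m, th, so, tp, to, drop_hdrlen,
 *		    tlen, tiwin, thflags, nxt_pkt);
 *		break;
 *	case TCPS_ESTABLISHED:
 *		retval = bbr_do_established(m, th, so, tp, to, drop_hdrlen,
 *		    tlen, tiwin, thflags, nxt_pkt);
 *		break;
 *	default:
 *		retval = 1;
 *		break;
 *	}
 *
 * Every handler may assume SACK was negotiated, since non-SACK peers are
 * shuttled back to the default stack before this point.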
*/ static uint32_t bbr_rtt_probe_time = 200000; /* 200ms in micro seconds */ static uint32_t bbr_rtt_probe_cwndtarg = 4; /* How many mss's outstanding */ static const int32_t bbr_min_req_free = 2; /* The min we must have on the * free list */ static int32_t bbr_tlp_thresh = 1; static int32_t bbr_reorder_thresh = 2; static int32_t bbr_reorder_fade = 60000000; /* 0 - never fade, def * 60,000,000 - 60 seconds */ static int32_t bbr_pkt_delay = 1000; static int32_t bbr_min_to = 1000; /* Number of usec's minimum timeout */ static int32_t bbr_incr_timers = 1; static int32_t bbr_tlp_min = 10000; /* 10ms in usecs */ static int32_t bbr_delayed_ack_time = 200000; /* 200ms in usecs */ static int32_t bbr_exit_startup_at_loss = 1; /* * bbr_lt_bw_ratio is 1/8th * bbr_lt_bw_diff is < 4 Kbit/sec */ static uint64_t bbr_lt_bw_diff = 4000 / 8; /* In bytes per second */ static uint64_t bbr_lt_bw_ratio = 8; /* For 1/8th */ static uint32_t bbr_lt_bw_max_rtts = 48; /* How many rtt's do we use * the lt_bw for */ static uint32_t bbr_lt_intvl_min_rtts = 4; /* Min num of RTT's to measure * lt_bw */ static int32_t bbr_lt_intvl_fp = 0; /* False positive epoch diff */ static int32_t bbr_lt_loss_thresh = 196; /* Lost vs delivered % */ static int32_t bbr_lt_fd_thresh = 100; /* false detection % */ static int32_t bbr_verbose_logging = 0; /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up * being a total of 122.850 seconds before a * connection is killed. */ static int32_t bbr_rto_min_ms = 30; /* 30ms same as main freebsd */ static int32_t bbr_rto_max_sec = 4; /* 4 seconds */ /****************************************************/ /* DEFAULT TSO SIZING (cpu performance impacting) */ /****************************************************/ /* What amount is our formula using to get TSO size */ static int32_t bbr_hptsi_per_second = 1000; /* * For hptsi under bbr_cross_over connections what is delay * target 7ms (in usec) combined with a seg_max of 2 * gets us close to identical google behavior in * TSO size selection (possibly more 1MSS sends). */ static int32_t bbr_hptsi_segments_delay_tar = 7000; /* Does pacing delay include overhead's in its time calculations? */ static int32_t bbr_include_enet_oh = 0; static int32_t bbr_include_ip_oh = 1; static int32_t bbr_include_tcp_oh = 1; static int32_t bbr_google_discount = 10; /* Do we use (nf mode) pkt-epoch to drive us or rttProp? */ static int32_t bbr_state_is_pkt_epoch = 0; static int32_t bbr_state_drain_2_tar = 1; /* What is the max the 0 - bbr_cross_over MBPS TSO target * can reach using our delay target. Note that this * value becomes the floor for the cross over * algorithm. */ static int32_t bbr_hptsi_segments_max = 2; static int32_t bbr_hptsi_segments_floor = 1; static int32_t bbr_hptsi_utter_max = 0; /* What is the min the 0 - bbr_cross-over MBPS TSO target can be */ static int32_t bbr_hptsi_bytes_min = 1460; static int32_t bbr_all_get_min = 0; /* Cross over point from algo-a to algo-b */ static uint32_t bbr_cross_over = TWENTY_THREE_MBPS; /* Do we deal with our restart state? */ static int32_t bbr_uses_idle_restart = 0; static int32_t bbr_idle_restart_threshold = 100000; /* 100ms in useconds */ /* Do we allow hardware pacing? 
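 *
 * (A worked example of the software sizing above, before the hardware
 * knobs below; numbers are illustrative: below bbr_cross_over the TSO
 * burst is sized roughly as
 *	bytes = bytes_per_usec(bw) * bbr_hptsi_segments_delay_tar
 * so a 2 Mbit/s flow, about 0.25 bytes/usec, with the 7000 usec delay
 * target gets about 1750 bytes per burst before the segments_max,
 * segments_floor and bytes_min clamps are applied.)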
*/ static int32_t bbr_allow_hdwr_pacing = 0; static int32_t bbr_hdwr_pace_adjust = 2; /* multipler when we calc the tso size */ static int32_t bbr_hdwr_pace_floor = 1; static int32_t bbr_hdwr_pacing_delay_cnt = 10; /****************************************************/ static int32_t bbr_resends_use_tso = 0; static int32_t bbr_tlp_max_resend = 2; static int32_t bbr_sack_block_limit = 128; #define BBR_MAX_STAT 19 counter_u64_t bbr_state_time[BBR_MAX_STAT]; counter_u64_t bbr_state_lost[BBR_MAX_STAT]; counter_u64_t bbr_state_resend[BBR_MAX_STAT]; counter_u64_t bbr_stat_arry[BBR_STAT_SIZE]; counter_u64_t bbr_opts_arry[BBR_OPTS_SIZE]; counter_u64_t bbr_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t bbr_flows_whdwr_pacing; counter_u64_t bbr_flows_nohdwr_pacing; counter_u64_t bbr_nohdwr_pacing_enobuf; counter_u64_t bbr_hdwr_pacing_enobuf; static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr); /* * Static defintions we need for forward declarations. */ static uint32_t bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw); static uint32_t bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain); static void bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win); static void bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses); static void bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int line, int dolog); static uint32_t bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain); static void bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses); static uint32_t bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm); static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp); static uint32_t bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t srtt, uint32_t cts); static void bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line); static void bbr_set_state_target(struct tcp_bbr *bbr, int line); static void bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line); static void bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line); static void tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts); static void bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts); static void bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt, uint32_t line, uint8_t is_start, uint16_t set); static struct bbr_sendmap * bbr_find_lowest_rsm(struct tcp_bbr *bbr); static __inline uint32_t bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type); static void bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which); static void bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt, uint32_t thresh, uint32_t to); static void bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag); static void bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t sloton, uint32_t prev_delay); static void bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line); static void bbr_stop_all_timers(struct tcpcb *tp); static void bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts); static void bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts); static void bbr_timer_cancel(struct tcp_bbr 
*bbr, int32_t line, uint32_t cts); static void bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod); static inline uint8_t bbr_state_val(struct tcp_bbr *bbr) { return(bbr->rc_bbr_substate); } static inline uint32_t get_min_cwnd(struct tcp_bbr *bbr) { int mss; mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); if (bbr_get_rtt(bbr, BBR_RTT_PROP) < BBR_HIGH_SPEED) return (bbr_cwnd_min_val_hs * mss); else return (bbr_cwnd_min_val * mss); } static uint32_t bbr_get_persists_timer_val(struct tcpcb *tp, struct tcp_bbr *bbr) { uint64_t srtt, var; uint64_t ret_val; bbr->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; if (tp->t_srtt == 0) { srtt = (uint64_t)BBR_INITIAL_RTO; var = 0; } else { srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); var = ((uint64_t)TICKS_2_USEC(tp->t_rttvar) >> TCP_RTT_SHIFT); } TCPT_RANGESET_NOSLOP(ret_val, ((srtt + var) * tcp_backoff[tp->t_rxtshift]), bbr_persist_min, bbr_persist_max); return ((uint32_t)ret_val); } static uint32_t bbr_timer_start(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* * Start the FR timer, we do this based on getting the first one in * the rc_tmap. Note that if its NULL we must stop the timer. in all * events we need to stop the running timer (if its running) before * starting the new one. */ uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; int32_t idx; int32_t is_tlp_timer = 0; struct bbr_sendmap *rsm; if (bbr->rc_all_timers_stopped) { /* All timers have been stopped none are to run */ return (0); } if (bbr->rc_in_persist) { /* We can't start any timer in persists */ return (bbr_get_persists_timer_val(tp, bbr)); } rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if ((rsm == NULL) || ((tp->t_flags & TF_SACK_PERMIT) == 0) || (tp->t_state < TCPS_ESTABLISHED)) { /* Nothing on the send map */ activate_rxt: if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { uint64_t tov; time_since_sent = 0; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if (rsm) { idx = rsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time)) tstmp_touse = rsm->r_tim_lastsent[idx]; else tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time; if (TSTMP_GT(tstmp_touse, cts)) time_since_sent = cts - tstmp_touse; } bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; if (tp->t_srtt == 0) tov = BBR_INITIAL_RTO; else tov = ((uint64_t)(TICKS_2_USEC(tp->t_srtt) + ((uint64_t)TICKS_2_USEC(tp->t_rttvar) * (uint64_t)4)) >> TCP_RTT_SHIFT); if (tp->t_rxtshift) tov *= tcp_backoff[tp->t_rxtshift]; if (tov > time_since_sent) tov -= time_since_sent; else tov = bbr->r_ctl.rc_min_to; TCPT_RANGESET_NOSLOP(to, tov, (bbr->r_ctl.rc_min_rto_ms * MS_IN_USEC), (bbr->rc_max_rto_sec * USECS_IN_SECOND)); bbr_log_timer_var(bbr, 2, cts, 0, srtt, 0, to); return (to); } return (0); } if (rsm->r_flags & BBR_ACKED) { rsm = bbr_find_lowest_rsm(bbr); if (rsm == NULL) { /* No lowest? */ goto activate_rxt; } } /* Convert from ms to usecs */ if (rsm->r_flags & BBR_SACK_PASSED) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & BBR_HAS_FIN)) { /* * We don't start a bbr rack timer if all we have is * a FIN outstanding. 
*/ goto activate_rxt; } srtt = bbr_get_rtt(bbr, BBR_RTT_RACK); thresh = bbr_calc_thresh_rack(bbr, srtt, cts, rsm); idx = rsm->r_rtr_cnt - 1; exp = rsm->r_tim_lastsent[idx] + thresh; if (SEQ_GEQ(exp, cts)) { to = exp - cts; if (to < bbr->r_ctl.rc_min_to) { to = bbr->r_ctl.rc_min_to; } } else { to = bbr->r_ctl.rc_min_to; } } else { /* Ok we need to do a TLP not RACK */ if (bbr->rc_tlp_in_progress != 0) { /* * The previous send was a TLP. */ goto activate_rxt; } rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with. */ goto activate_rxt; } if (rsm->r_flags & BBR_HAS_FIN) { /* If its a FIN we don't do TLP */ rsm = NULL; goto activate_rxt; } time_since_sent = 0; idx = rsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], bbr->r_ctl.rc_tlp_rxt_last_time)) tstmp_touse = rsm->r_tim_lastsent[idx]; else tstmp_touse = bbr->r_ctl.rc_tlp_rxt_last_time; if (TSTMP_GT(tstmp_touse, cts)) time_since_sent = cts - tstmp_touse; is_tlp_timer = 1; srtt = bbr_get_rtt(bbr, bbr_tlp_type_to_use); thresh = bbr_calc_thresh_tlp(tp, bbr, rsm, srtt, cts); if (thresh > time_since_sent) to = thresh - time_since_sent; else to = bbr->r_ctl.rc_min_to; if (to > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { /* * If the TLP time works out to larger than the max * RTO lets not do TLP.. just RTO. */ goto activate_rxt; } if ((bbr->rc_tlp_rtx_out == 1) && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq)) { /* * Second retransmit of the same TLP * lets not. */ bbr->rc_tlp_rtx_out = 0; goto activate_rxt; } if (rsm->r_start != bbr->r_ctl.rc_last_tlp_seq) { /* * The tail is no longer the last one I did a probe * on */ bbr->r_ctl.rc_tlp_seg_send_cnt = 0; bbr->r_ctl.rc_last_tlp_seq = rsm->r_start; } } if (is_tlp_timer == 0) { BBR_STAT_INC(bbr_to_arm_rack); bbr->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; } else { bbr_log_timer_var(bbr, 1, cts, time_since_sent, srtt, thresh, to); if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) { /* * We have exceeded how many times we can retran the * current TLP timer, switch to the RTO timer. */ goto activate_rxt; } else { BBR_STAT_INC(bbr_to_arm_tlp); bbr->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; } } return (to); } static inline int32_t bbr_minseg(struct tcp_bbr *bbr) { return (bbr->r_ctl.rc_pace_min_segs - bbr->rc_last_options); } static void bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_t frm, int32_t slot, uint32_t tot_len) { struct inpcb *inp; struct hpts_diag diag; uint32_t delayed_ack = 0; uint32_t left = 0; uint32_t hpts_timeout; uint8_t stopped; int32_t delay_calc = 0; uint32_t prev_delay = 0; inp = tp->t_inpcb; if (inp->inp_in_hpts) { /* A previous call is already set up */ return; } if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { return; } stopped = bbr->rc_tmr_stopped; if (stopped && TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) { left = bbr->r_ctl.rc_timer_exp - cts; } bbr->r_ctl.rc_hpts_flags = 0; bbr->r_ctl.rc_timer_exp = 0; prev_delay = bbr->r_ctl.rc_last_delay_val; if (bbr->r_ctl.rc_last_delay_val && (slot == 0)) { /* * If a previous pacer delay was in place we * are not coming from the output side (where * we calculate a delay, more likely a timer). */ slot = bbr->r_ctl.rc_last_delay_val; if (TSTMP_GT(cts, bbr->rc_pacer_started)) { /* Compensate for time passed */ delay_calc = cts - bbr->rc_pacer_started; if (delay_calc <= slot) slot -= delay_calc; } } /* Do we have early to make up for by pushing out the pacing time? 
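	 *
	 * (Worked example of the compensation just above, illustrative
	 * numbers: if the saved pacing delay was 10000 usec and the pacer
	 * was started 3000 usec before "cts", the re-armed slot becomes
	 * 10000 - 3000 = 7000 usec, so the timer still fires at the
	 * originally intended time.)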
*/ if (bbr->r_agg_early_set) { bbr_log_pacing_delay_calc(bbr, 0, bbr->r_ctl.rc_agg_early, cts, slot, 0, bbr->r_agg_early_set, 2); slot += bbr->r_ctl.rc_agg_early; bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; } /* Are we running a total debt that needs to be compensated for? */ if (bbr->r_ctl.rc_hptsi_agg_delay) { if (slot > bbr->r_ctl.rc_hptsi_agg_delay) { /* We nuke the delay */ slot -= bbr->r_ctl.rc_hptsi_agg_delay; bbr->r_ctl.rc_hptsi_agg_delay = 0; } else { /* We nuke some of the delay, put in a minimal 100usecs */ bbr->r_ctl.rc_hptsi_agg_delay -= slot; bbr->r_ctl.rc_last_delay_val = slot = 100; } } bbr->r_ctl.rc_last_delay_val = slot; hpts_timeout = bbr_timer_start(tp, bbr, cts); if (tp->t_flags & TF_DELACK) { if (bbr->rc_in_persist == 0) { delayed_ack = bbr_delack_time; } else { /* * We are in persists and have * gotten a new data element. */ if (hpts_timeout > bbr_delack_time) { /* * Lets make the persists timer (which acks) * be the smaller of hpts_timeout and bbr_delack_time. */ hpts_timeout = bbr_delack_time; } } } if (delayed_ack && ((hpts_timeout == 0) || (delayed_ack < hpts_timeout))) { /* We need a Delayed ack timer */ bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; hpts_timeout = delayed_ack; } if (slot) { /* Mark that we have a pacing timer up */ BBR_STAT_INC(bbr_paced_segments); bbr->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; } /* * If no timers are going to run and we will fall off thfe hptsi * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && (slot == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* * Ok we have no timer (persists, rack, tlp, rxt or * del-ack), we don't have segments being paced. So * all that is left is the keepalive timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); } else { hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); } bbr->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; } } if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { /* * RACK, TLP, persists and RXT timers all are restartable * based on actions input .. i.e we received a packet (ack * or sack) and that changes things (rw, or snd_una etc). * Thus we can restart them with a new value. For * keep-alive, delayed_ack we keep track of what was left * and restart the timer with a smaller value. */ if (left < hpts_timeout) hpts_timeout = left; } if (bbr->r_ctl.rc_incr_tmrs && slot && (bbr->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * If configured to do so, and the timer is either * the TLP or RXT timer, we need to increase the timeout * by the pacing time. Consider the bottleneck at my * machine as an example, we are sending something * to start a TLP on. The last packet won't be emitted * fully until the pacing time (the bottleneck will hold * the data in place). Once the packet is emitted that * is when we want to start waiting for the TLP. This * is most evident with hardware pacing (where the nic * is holding the packet(s) before emitting). But it * can also show up in the network so we do it for all * cases. Technically we would take off one packet from * this extra delay but this is easier and being more * conservative is probably better. 
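	 *
	 * (Illustrative numbers: a computed TLP timeout of 30000 usec with
	 * a pacing slot of 20000 usec arms the timer for 50000 usec, so
	 * the probe cannot fire while the last paced packet is still being
	 * held at the bottleneck or in the NIC.)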
*/ hpts_timeout += slot; } if (hpts_timeout) { /* * Hack alert for now we can't time-out over 2147 seconds (a * bit more than 35min) */ if (hpts_timeout > 0x7ffffffe) hpts_timeout = 0x7ffffffe; bbr->r_ctl.rc_timer_exp = cts + hpts_timeout; } else bbr->r_ctl.rc_timer_exp = 0; if ((slot) && (bbr->rc_use_google || bbr->output_error_seen || (slot <= hpts_timeout)) ) { /* * Tell LRO that it can queue packets while * we pace. */ bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && (bbr->rc_cwnd_limited == 0)) { /* * If we are not cwnd limited and we * are running a rack timer we put on * the do not disturbe even for sack. */ inp->inp_flags2 |= INP_DONT_SACK_QUEUE; } else inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; bbr->rc_pacer_started = cts; (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), __LINE__, &diag); bbr->rc_timer_first = 0; bbr->bbr_timer_src = frm; bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1); bbr_log_hpts_diag(bbr, cts, &diag); } else if (hpts_timeout) { (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), __LINE__, &diag); /* * We add the flag here as well if the slot is set, * since hpts will call in to clear the queue first before * calling the output routine (which does our timers). * We don't want to set the flag if its just a timer * else the arrival of data might (that causes us * to send more) might get delayed. Imagine being * on a keep-alive timer and a request comes in for * more data. */ if (slot) bbr->rc_pacer_started = cts; if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && (bbr->rc_cwnd_limited == 0)) { /* * For a rack timer, don't wake us even * if a sack arrives as long as we are * not cwnd limited. */ bbr->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; inp->inp_flags2 |= INP_DONT_SACK_QUEUE; } else { /* All other timers wake us up */ bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; } bbr->bbr_timer_src = frm; bbr_log_to_start(bbr, cts, hpts_timeout, slot, 0); bbr_log_hpts_diag(bbr, cts, &diag); bbr->rc_timer_first = 1; } bbr->rc_tmr_stopped = 0; bbr_log_type_bbrsnd(bbr, tot_len, slot, delay_calc, cts, frm, prev_delay); } static void bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sockbuf *sb) { /* * We received an ack, and then did not call send or were bounced * out due to the hpts was running. Now a timer is up as well, is it * the right timer? */ struct inpcb *inp; struct bbr_sendmap *rsm; uint32_t hpts_timeout; int tmr_up; tmr_up = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (bbr->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) return; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && (tmr_up == PACE_TMR_RXT)) { /* Should be an RXT */ return; } inp = bbr->rc_inp; if (rsm == NULL) { /* Nothing outstanding? */ if (tp->t_flags & TF_DELACK) { if (tmr_up == PACE_TMR_DELACK) /* * We are supposed to have delayed ack up * and we do */ return; } else if (sbavail(&inp->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { /* * if we hit enobufs then we would expect the * possiblity of nothing outstanding and the RXT up * (and the hptsi timer). 
*/ return; } else if (((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && (tmr_up == PACE_TMR_KEEP) && (tp->snd_max == tp->snd_una)) { /* We should have keep alive up and we do */ return; } } if (rsm && (rsm->r_flags & BBR_SACK_PASSED)) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & BBR_HAS_FIN)) { /* needs to be a RXT */ if (tmr_up == PACE_TMR_RXT) return; else goto wrong_timer; } else if (tmr_up == PACE_TMR_RACK) return; else goto wrong_timer; } else if (rsm && (tmr_up == PACE_TMR_RACK)) { /* Rack timer has priority if we have data out */ return; } else if (SEQ_GT(tp->snd_max, tp->snd_una) && ((tmr_up == PACE_TMR_TLP) || (tmr_up == PACE_TMR_RXT))) { /* * Either a TLP or RXT is fine if no sack-passed is in place * and data is outstanding. */ return; } else if (tmr_up == PACE_TMR_DELACK) { /* * If the delayed ack was going to go off before the * rtx/tlp/rack timer were going to expire, then that would * be the timer in control. Note we don't check the time * here trusting the code is correct. */ return; } if (SEQ_GT(tp->snd_max, tp->snd_una) && ((tmr_up == PACE_TMR_RXT) || (tmr_up == PACE_TMR_TLP) || (tmr_up == PACE_TMR_RACK))) { /* * We have outstanding data and * we *do* have a RACK, TLP or RXT * timer running. We won't restart * anything here since thats probably ok we * will get called with some timer here shortly. */ return; } /* * Ok the timer originally started is not what we want now. We will * force the hpts to be stopped if any, and restart with the slot * set to what was in the saved slot. */ wrong_timer: if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) { if (inp->inp_in_hpts) tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); bbr_timer_cancel(bbr, __LINE__, cts); bbr_start_hpts_timer(bbr, tp, cts, 1, bbr->r_ctl.rc_last_delay_val, 0); } else { /* * Output is hptsi so we just need to switch the type of * timer. We don't bother with keep-alive, since when we * jump through the output, it will start the keep-alive if * nothing is sent. * * We only need a delayed-ack added and or the hpts_timeout. */ hpts_timeout = bbr_timer_start(tp, bbr, cts); if (tp->t_flags & TF_DELACK) { if (hpts_timeout == 0) { hpts_timeout = bbr_delack_time; bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; } else if (hpts_timeout > bbr_delack_time) { hpts_timeout = bbr_delack_time; bbr->r_ctl.rc_hpts_flags = PACE_TMR_DELACK; } } if (hpts_timeout) { if (hpts_timeout > 0x7ffffffe) hpts_timeout = 0x7ffffffe; bbr->r_ctl.rc_timer_exp = cts + hpts_timeout; } } } int32_t bbr_clear_lost = 0; /* * Considers the two time values now (cts) and earlier. * If cts is smaller than earlier, we could have * had a sequence wrap (our counter wraps every * 70 min or so) or it could be just clock skew * getting us two differnt time values. Clock skew * will show up within 10ms or so. So in such * a case (where cts is behind earlier time by * less than 10ms) we return 0. Otherwise we * return the true difference between them. */ static inline uint32_t bbr_calc_time(uint32_t cts, uint32_t earlier_time) { /* * Given two timestamps, the current time stamp cts, and some other * time-stamp taken in theory earlier return the difference. The * trick is here sometimes locking will get the other timestamp * after the cts. If this occurs we need to return 0. */ if (TSTMP_GEQ(cts, earlier_time)) return (cts - earlier_time); /* * cts is behind earlier_time if its less than 10ms consider it 0. * If its more than 10ms difference then we had a time wrap. 
Else * its just the normal locking foo. I wonder if we should not go to * 64bit TS and get rid of this issue. */ if (TSTMP_GEQ((cts + 10000), earlier_time)) return (0); /* * Ok the time must have wrapped. So we need to answer a large * amount of time, which the normal subtraction should do. */ return (cts - earlier_time); } static int sysctl_bbr_clear_lost(SYSCTL_HANDLER_ARGS) { uint32_t stat; int32_t error; error = SYSCTL_OUT(req, &bbr_clear_lost, sizeof(uint32_t)); if (error || req->newptr == NULL) return error; error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); if (error) return (error); if (stat == 1) { #ifdef BBR_INVARIANTS printf("Clearing BBR lost counters\n"); #endif COUNTER_ARRAY_ZERO(bbr_state_lost, BBR_MAX_STAT); COUNTER_ARRAY_ZERO(bbr_state_time, BBR_MAX_STAT); COUNTER_ARRAY_ZERO(bbr_state_resend, BBR_MAX_STAT); } else if (stat == 2) { #ifdef BBR_INVARIANTS printf("Clearing BBR option counters\n"); #endif COUNTER_ARRAY_ZERO(bbr_opts_arry, BBR_OPTS_SIZE); } else if (stat == 3) { #ifdef BBR_INVARIANTS printf("Clearing BBR stats counters\n"); #endif COUNTER_ARRAY_ZERO(bbr_stat_arry, BBR_STAT_SIZE); } else if (stat == 4) { #ifdef BBR_INVARIANTS printf("Clearing BBR out-size counters\n"); #endif COUNTER_ARRAY_ZERO(bbr_out_size, TCP_MSS_ACCT_SIZE); } bbr_clear_lost = 0; return (0); } static void bbr_init_sysctls(void) { struct sysctl_oid *bbr_probertt; struct sysctl_oid *bbr_hptsi; struct sysctl_oid *bbr_measure; struct sysctl_oid *bbr_cwnd; struct sysctl_oid *bbr_timeout; struct sysctl_oid *bbr_states; struct sysctl_oid *bbr_startup; struct sysctl_oid *bbr_policer; /* Probe rtt controls */ bbr_probertt = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "probertt", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "gain", CTLFLAG_RW, &bbr_rttprobe_gain, 192, "What is the filter gain drop in probe_rtt (0=disable)?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "cwnd", CTLFLAG_RW, &bbr_rtt_probe_cwndtarg, 4, "How many mss's are outstanding during probe-rtt"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "int", CTLFLAG_RW, &bbr_rtt_probe_limit, 4000000, "If RTT has not shrank in this many micro-seconds enter probe-rtt"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "mintime", CTLFLAG_RW, &bbr_rtt_probe_time, 200000, "How many microseconds in probe-rtt"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "filter_len_sec", CTLFLAG_RW, &bbr_filter_len_sec, 6, "How long in seconds does the rttProp filter run?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "drain_rtt", CTLFLAG_RW, &bbr_drain_rtt, BBR_SRTT, "What is the drain rtt to use in probeRTT (rtt_prop=0, rtt_rack=1, rtt_pkt=2, rtt_srtt=3?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "can_force", CTLFLAG_RW, &bbr_can_force_probertt, 0, "If we keep setting new low rtt's but delay going in probe-rtt can we force in??"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "enter_sets_force", CTLFLAG_RW, &bbr_probertt_sets_rtt, 0, "In NF mode, do we imitate google_mode and set the rttProp on entry to probe-rtt?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "can_adjust", CTLFLAG_RW, &bbr_can_adjust_probertt, 1, "Can we dynamically adjust the probe-rtt limits and times?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), 
OID_AUTO, "is_ratio", CTLFLAG_RW, &bbr_is_ratio, 0, "is the limit to filter a ratio?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "use_cwnd", CTLFLAG_RW, &bbr_prtt_slam_cwnd, 0, "Should we set/recover cwnd?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_probertt), OID_AUTO, "can_use_ts", CTLFLAG_RW, &bbr_can_use_ts_for_rtt, 1, "Can we use the ms timestamp if available for retransmistted rtt calculations?"); /* Pacing controls */ bbr_hptsi = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "pacing", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing", CTLFLAG_RW, &bbr_allow_hdwr_pacing, 1, "Do we allow hardware pacing?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing_limit", CTLFLAG_RW, &bbr_hardware_pacing_limit, 4000, "Do we have a limited number of connections for pacing chelsio (0=no limit)?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing_adj", CTLFLAG_RW, &bbr_hdwr_pace_adjust, 2, "Multiplier to calculated tso size?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing_floor", CTLFLAG_RW, &bbr_hdwr_pace_floor, 1, "Do we invoke the hardware pacing floor?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "hw_pacing_delay_cnt", CTLFLAG_RW, &bbr_hdwr_pacing_delay_cnt, 10, "How many packets must be sent after hdwr pacing is enabled"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "bw_cross", CTLFLAG_RW, &bbr_cross_over, 3000000, "What is the point where we cross over to linux like TSO size set"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "seg_deltarg", CTLFLAG_RW, &bbr_hptsi_segments_delay_tar, 7000, "What is the worse case delay target for hptsi < 48Mbp connections"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "enet_oh", CTLFLAG_RW, &bbr_include_enet_oh, 0, "Do we include the ethernet overhead in calculating pacing delay?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "ip_oh", CTLFLAG_RW, &bbr_include_ip_oh, 1, "Do we include the IP overhead in calculating pacing delay?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "tcp_oh", CTLFLAG_RW, &bbr_include_tcp_oh, 0, "Do we include the TCP overhead in calculating pacing delay?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "google_discount", CTLFLAG_RW, &bbr_google_discount, 10, "What is the default google discount percentage wise for pacing (11 = 1.1%%)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "all_get_min", CTLFLAG_RW, &bbr_all_get_min, 0, "If you are less than a MSS do you just get the min?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "tso_min", CTLFLAG_RW, &bbr_hptsi_bytes_min, 1460, "For 0 -> 24Mbps what is floor number of segments for TSO"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "seg_tso_max", CTLFLAG_RW, &bbr_hptsi_segments_max, 6, "For 0 -> 24Mbps what is top number of segments for TSO"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "seg_floor", CTLFLAG_RW, &bbr_hptsi_segments_floor, 1, "Minimum TSO size we will fall too in segments"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "utter_max", CTLFLAG_RW, &bbr_hptsi_utter_max, 0, "The absolute maximum that any pacing (outside of hardware) 
can be"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "seg_divisor", CTLFLAG_RW, &bbr_hptsi_per_second, 100, "What is the divisor in our hptsi TSO calculation 512Mbps < X > 24Mbps "); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "srtt_mul", CTLFLAG_RW, &bbr_hptsi_max_mul, 1, "The multiplier for pace len max"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_hptsi), OID_AUTO, "srtt_div", CTLFLAG_RW, &bbr_hptsi_max_div, 2, "The divisor for pace len max"); /* Measurement controls */ bbr_measure = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "measure", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Measurement controls"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "min_i_bw", CTLFLAG_RW, &bbr_initial_bw_bps, 62500, "Minimum initial b/w in bytes per second"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "no_sack_needed", CTLFLAG_RW, &bbr_sack_not_required, 0, "Do we allow bbr to run on connections not supporting SACK?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "use_google", CTLFLAG_RW, &bbr_use_google_algo, 0, "Use has close to google V1.0 has possible?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_limiting", CTLFLAG_RW, &bbr_ts_limiting, 1, "Do we attempt to use the peers timestamp to limit b/w caculations?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_can_raise", CTLFLAG_RW, &bbr_ts_can_raise, 0, "Can we raise the b/w via timestamp b/w calculation?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_delta", CTLFLAG_RW, &bbr_min_usec_delta, 20000, "How long in usec between ts of our sends in ts validation code?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_peer_delta", CTLFLAG_RW, &bbr_min_peer_delta, 20, "What min numerical value should be between the peer deltas?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "ts_delta_percent", CTLFLAG_RW, &bbr_delta_percent, 150, "What percentage (150 = 15.0) do we allow variance for?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "min_measure_good_bw", CTLFLAG_RW, &bbr_min_measurements_req, 1, "What is the minimum measurment count we need before we switch to our b/w estimate"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "min_measure_before_pace", CTLFLAG_RW, &bbr_no_pacing_until, 4, "How many pkt-epoch's (0 is off) do we need before pacing is on?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "quanta", CTLFLAG_RW, &bbr_quanta, 2, "Extra quanta to add when calculating the target (ID section 4.2.3.2)."); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_measure), OID_AUTO, "noretran", CTLFLAG_RW, &bbr_no_retran, 0, "Should google mode not use retransmission measurements for the b/w estimation?"); /* State controls */ bbr_states = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "states", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "State controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "idle_restart", CTLFLAG_RW, &bbr_uses_idle_restart, 0, "Do we use a new special idle_restart state to ramp back up quickly?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "idle_restart_threshold", CTLFLAG_RW, &bbr_idle_restart_threshold, 100000, "How long must we be idle before we restart??"); 
SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "use_pkt_epoch", CTLFLAG_RW, &bbr_state_is_pkt_epoch, 0, "Do we use a pkt-epoch for substate if 0 rttProp?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "startup_rtt_gain", CTLFLAG_RW, &bbr_rtt_gain_thresh, 0, "What increase in RTT triggers us to stop ignoring no-loss and possibly exit startup?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "drain_floor", CTLFLAG_RW, &bbr_drain_floor, 88, "What is the lowest we can drain (pg) too?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "drain_2_target", CTLFLAG_RW, &bbr_state_drain_2_tar, 1, "Do we drain to target in drain substate?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "gain_2_target", CTLFLAG_RW, &bbr_gain_to_target, 1, "Does probe bw gain to target??"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "gain_extra_time", CTLFLAG_RW, &bbr_gain_gets_extra_too, 1, "Does probe bw gain get the extra time too?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "ld_div", CTLFLAG_RW, &bbr_drain_drop_div, 5, "Long drain drop divider?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "ld_mul", CTLFLAG_RW, &bbr_drain_drop_mul, 4, "Long drain drop multiplier?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "rand_ot_disc", CTLFLAG_RW, &bbr_rand_ot, 50, "Random discount of the ot?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "dr_filter_life", CTLFLAG_RW, &bbr_num_pktepo_for_del_limit, BBR_NUM_RTTS_FOR_DEL_LIMIT, "How many packet-epochs does the b/w delivery rate last?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "subdrain_applimited", CTLFLAG_RW, &bbr_sub_drain_app_limit, 0, "Does our sub-state drain invoke app limited if its long?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "use_cwnd_subdrain", CTLFLAG_RW, &bbr_sub_drain_slam_cwnd, 0, "Should we set/recover cwnd for sub-state drain?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "use_cwnd_maindrain", CTLFLAG_RW, &bbr_slam_cwnd_in_main_drain, 0, "Should we set/recover cwnd for main-state drain?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "google_gets_earlyout", CTLFLAG_RW, &google_allow_early_out, 1, "Should we allow google probe-bw/drain to exit early at flight target?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_states), OID_AUTO, "google_exit_loss", CTLFLAG_RW, &google_consider_lost, 1, "Should we have losses exit gain of probebw in google mode??"); /* Startup controls */ bbr_startup = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "startup", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Startup controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "cheat_iwnd", CTLFLAG_RW, &bbr_sends_full_iwnd, 1, "Do we not pace but burst out initial windows has our TSO size?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "loss_threshold", CTLFLAG_RW, &bbr_startup_loss_thresh, 2000, "In startup what is the loss threshold in a pe that will exit us from startup?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "use_lowerpg", CTLFLAG_RW, &bbr_use_lower_gain_in_startup, 1, "Should we use a lower hptsi gain if we see loss in startup?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), 
OID_AUTO, "gain", CTLFLAG_RW, &bbr_start_exit, 25, "What gain percent do we need to see to stay in startup??"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "low_gain", CTLFLAG_RW, &bbr_low_start_exit, 15, "What gain percent do we need to see to stay in the lower gain startup??"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_startup), OID_AUTO, "loss_exit", CTLFLAG_RW, &bbr_exit_startup_at_loss, 1, "Should we exit startup at loss in an epoch if we are not gaining?"); /* CWND controls */ bbr_cwnd = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "cwnd", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Cwnd controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "tar_rtt", CTLFLAG_RW, &bbr_cwndtarget_rtt_touse, 0, "Target cwnd rtt measurment to use (0=rtt_prop, 1=rtt_rack, 2=pkt_rtt, 3=srtt)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "may_shrink", CTLFLAG_RW, &bbr_cwnd_may_shrink, 0, "Can the cwnd shrink if it would grow to more than the target?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "max_target_limit", CTLFLAG_RW, &bbr_target_cwnd_mult_limit, 8, "Do we limit the cwnd to some multiple of the cwnd target if cwnd can't shrink 0=no?"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "highspeed_min", CTLFLAG_RW, &bbr_cwnd_min_val_hs, BBR_HIGHSPEED_NUM_MSS, "What is the high-speed min cwnd (rttProp under 1ms)"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "lowspeed_min", CTLFLAG_RW, &bbr_cwnd_min_val, BBR_PROBERTT_NUM_MSS, "What is the min cwnd (rttProp > 1ms)"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "initwin", CTLFLAG_RW, &bbr_def_init_win, 10, "What is the BBR initial window, if 0 use tcp version"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "do_loss_red", CTLFLAG_RW, &bbr_do_red, 600, "Do we reduce the b/w at exit from recovery based on ratio of prop/srtt (800=80.0, 0=off)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "red_scale", CTLFLAG_RW, &bbr_red_scale, 20000, "What RTT do we scale with?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "red_growslow", CTLFLAG_RW, &bbr_red_growth_restrict, 1, "Do we restrict cwnd growth for whats in flight?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "red_div", CTLFLAG_RW, &bbr_red_div, 2, "If we reduce whats the divisor?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "red_mul", CTLFLAG_RW, &bbr_red_mul, 1, "If we reduce whats the mulitiplier?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "target_is_unit", CTLFLAG_RW, &bbr_target_is_bbunit, 0, "Is the state target the pacing_gain or BBR_UNIT?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_cwnd), OID_AUTO, "drop_limit", CTLFLAG_RW, &bbr_drop_limit, 0, "Number of segments limit for drop (0=use min_cwnd w/flight)?"); /* Timeout controls */ bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "timeout", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Time out controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "delack", CTLFLAG_RW, &bbr_delack_time, 100000, "BBR's delayed ack time"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "tlp_uses", CTLFLAG_RW, &bbr_tlp_type_to_use, 3, "RTT that TLP uses in its calculations, 0=rttProp, 1=Rack_rtt, 2=pkt_rtt and 3=srtt"); 
SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "persmin", CTLFLAG_RW, &bbr_persist_min, 250000, "What is the minimum time in microseconds between persists"); SYSCTL_ADD_U32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "persmax", CTLFLAG_RW, &bbr_persist_max, 1000000, "What is the largest delay in microseconds between persists"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "tlp_minto", CTLFLAG_RW, &bbr_tlp_min, 10000, "TLP Min timeout in usecs"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "tlp_dack_time", CTLFLAG_RW, &bbr_delayed_ack_time, 200000, "TLP delayed ack compensation value"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "minrto", CTLFLAG_RW, &bbr_rto_min_ms, 30, "Minimum RTO in ms"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "maxrto", CTLFLAG_RW, &bbr_rto_max_sec, 4, "Maxiumum RTO in seconds -- should be at least as large as min_rto"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "tlp_retry", CTLFLAG_RW, &bbr_tlp_max_resend, 2, "How many times does TLP retry a single segment or multiple with no ACK"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "minto", CTLFLAG_RW, &bbr_min_to, 1000, "Minimum rack timeout in useconds"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "pktdelay", CTLFLAG_RW, &bbr_pkt_delay, 1000, "Extra RACK time (in useconds) besides reordering thresh"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "incr_tmrs", CTLFLAG_RW, &bbr_incr_timers, 1, "Increase the RXT/TLP timer by the pacing time used?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_timeout), OID_AUTO, "rxtmark_sackpassed", CTLFLAG_RW, &bbr_marks_rxt_sack_passed, 0, "Mark sack passed on all those not ack'd when a RXT hits?"); /* Policer controls */ bbr_policer = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "policer", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Policer controls"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "detect_enable", CTLFLAG_RW, &bbr_policer_detection_enabled, 1, "Is policer detection enabled??"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "min_pes", CTLFLAG_RW, &bbr_lt_intvl_min_rtts, 4, "Minimum number of PE's?"); SYSCTL_ADD_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "bwdiff", CTLFLAG_RW, &bbr_lt_bw_diff, (4000/8), "Minimal bw diff?"); SYSCTL_ADD_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "bwratio", CTLFLAG_RW, &bbr_lt_bw_ratio, 8, "Minimal bw diff?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "from_rack_rxt", CTLFLAG_RW, &bbr_policer_call_from_rack_to, 0, "Do we call the policer detection code from a rack-timeout?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "false_postive", CTLFLAG_RW, &bbr_lt_intvl_fp, 0, "What packet epoch do we do false-postive detection at (0=no)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "loss_thresh", CTLFLAG_RW, &bbr_lt_loss_thresh, 196, "Loss threshold 196 = 19.6%?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_policer), OID_AUTO, "false_postive_thresh", CTLFLAG_RW, &bbr_lt_fd_thresh, 100, "What percentage is the false detection threshold (150=15.0)?"); /* All the rest */ SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "cheat_rxt", CTLFLAG_RW, 
&bbr_use_rack_resend_cheat, 0, "Do we burst 1ms between sends on retransmissions (like rack)?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "error_paceout", CTLFLAG_RW, &bbr_error_base_paceout, 10000, "When we hit an error what is the min to pace out in usec's?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "kill_paceout", CTLFLAG_RW, &bbr_max_net_error_cnt, 10, "When we hit this many errors in a row, kill the session?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "data_after_close", CTLFLAG_RW, &bbr_ignore_data_after_close, 1, "Do we hold off sending a RST until all pending data is ack'd"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "resend_use_tso", CTLFLAG_RW, &bbr_resends_use_tso, 0, "Can resends use TSO?"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "sblklimit", CTLFLAG_RW, &bbr_sack_block_limit, 128, "When do we start ignoring small sack blocks"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "bb_verbose", CTLFLAG_RW, &bbr_verbose_logging, 0, "Should BBR black box logging be verbose"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "reorder_thresh", CTLFLAG_RW, &bbr_reorder_thresh, 2, "What factor for rack will be added when seeing reordering (shift right)"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "reorder_fade", CTLFLAG_RW, &bbr_reorder_fade, 0, "Does reorder detection fade, if so how many ms (0 means never)"); SYSCTL_ADD_S32(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, &bbr_tlp_thresh, 1, "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); /* Stats and counters */ /* The pacing counters for hdwr/software can't be in the array */ bbr_nohdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK); bbr_hdwr_pacing_enobuf = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "enob_hdwr_pacing", CTLFLAG_RD, &bbr_hdwr_pacing_enobuf, "Total number of enobufs for hardware paced flows"); SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "enob_no_hdwr_pacing", CTLFLAG_RD, &bbr_nohdwr_pacing_enobuf, "Total number of enobufs for non-hardware paced flows"); bbr_flows_whdwr_pacing = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "hdwr_pacing", CTLFLAG_RD, &bbr_flows_whdwr_pacing, "Total number of hardware paced flows"); bbr_flows_nohdwr_pacing = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "software_pacing", CTLFLAG_RD, &bbr_flows_nohdwr_pacing, "Total number of software paced flows"); COUNTER_ARRAY_ALLOC(bbr_stat_arry, BBR_STAT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "stats", CTLFLAG_RD, bbr_stat_arry, BBR_STAT_SIZE, "BBR Stats"); COUNTER_ARRAY_ALLOC(bbr_opts_arry, BBR_OPTS_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "opts", CTLFLAG_RD, bbr_opts_arry, BBR_OPTS_SIZE, "BBR Option Stats"); COUNTER_ARRAY_ALLOC(bbr_state_lost, BBR_MAX_STAT, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "lost", CTLFLAG_RD, bbr_state_lost, BBR_MAX_STAT, "Stats of when losses occur"); 
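/*
 * Illustrative sketch (editorial, hypothetical names): the arrays
 * allocated above are plain counter_u64 slots indexed by BBR state
 * (BBR_MAX_STAT entries), so recording, say, lost bytes against the
 * current state is just a counter_u64_add() on the matching slot:
 */
#if 0	/* example only, never compiled */
static inline void
bbr_example_note_state_lost(counter_u64_t *state_lost, uint8_t bbr_state,
    uint64_t bytes)
{
	/* Caller is assumed to pass bbr_state < BBR_MAX_STAT. */
	counter_u64_add(state_lost[bbr_state], bytes);
}
#endif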
COUNTER_ARRAY_ALLOC(bbr_state_resend, BBR_MAX_STAT, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "stateresend", CTLFLAG_RD, bbr_state_resend, BBR_MAX_STAT, "Stats of what states resend"); COUNTER_ARRAY_ALLOC(bbr_state_time, BBR_MAX_STAT, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "statetime", CTLFLAG_RD, bbr_state_time, BBR_MAX_STAT, "Stats of time spent in the states"); COUNTER_ARRAY_ALLOC(bbr_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, bbr_out_size, TCP_MSS_ACCT_SIZE, "Size of output calls"); SYSCTL_ADD_PROC(&bbr_sysctl_ctx, SYSCTL_CHILDREN(bbr_sysctl_root), OID_AUTO, "clrlost", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters"); } static void bbr_counter_destroy(void) { COUNTER_ARRAY_FREE(bbr_stat_arry, BBR_STAT_SIZE); COUNTER_ARRAY_FREE(bbr_opts_arry, BBR_OPTS_SIZE); COUNTER_ARRAY_FREE(bbr_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT); COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT); COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT); counter_u64_free(bbr_nohdwr_pacing_enobuf); counter_u64_free(bbr_hdwr_pacing_enobuf); counter_u64_free(bbr_flows_whdwr_pacing); counter_u64_free(bbr_flows_nohdwr_pacing); } static __inline void bbr_fill_in_logging_data(struct tcp_bbr *bbr, struct tcp_log_bbr *l, uint32_t cts) { memset(l, 0, sizeof(union tcp_log_stackspecific)); l->cur_del_rate = bbr->r_ctl.rc_bbr_cur_del_rate; l->delRate = get_filter_value(&bbr->r_ctl.rc_delrate); l->rttProp = get_filter_value_small(&bbr->r_ctl.rc_rttprop); l->bw_inuse = bbr_get_bw(bbr); l->inflight = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); l->applimited = bbr->r_ctl.r_app_limited_until; l->delivered = bbr->r_ctl.rc_delivered; l->timeStamp = cts; l->lost = bbr->r_ctl.rc_lost; l->bbr_state = bbr->rc_bbr_state; l->bbr_substate = bbr_state_val(bbr); l->epoch = bbr->r_ctl.rc_rtt_epoch; l->lt_epoch = bbr->r_ctl.rc_lt_epoch; l->pacing_gain = bbr->r_ctl.rc_bbr_hptsi_gain; l->cwnd_gain = bbr->r_ctl.rc_bbr_cwnd_gain; l->inhpts = bbr->rc_inp->inp_in_hpts; l->ininput = bbr->rc_inp->inp_in_input; l->use_lt_bw = bbr->rc_lt_use_bw; l->pkts_out = bbr->r_ctl.rc_flight_at_input; l->pkt_epoch = bbr->r_ctl.rc_pkt_epoch; } static void bbr_log_type_bw_reduce(struct tcp_bbr *bbr, int reason) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = 0; log.u_bbr.flex2 = 0; log.u_bbr.flex5 = 0; log.u_bbr.flex3 = 0; log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_loss_rate; log.u_bbr.flex7 = reason; log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_enters_probertt; log.u_bbr.flex8 = 0; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BW_RED_EV, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_rwnd_collapse(struct tcp_bbr *bbr, int seq, int mode, uint32_t count) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = seq; log.u_bbr.flex2 = count; log.u_bbr.flex8 = mode; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_LOWGAIN, 0, 0, &log, false, &bbr->rc_tv); 
} } static void bbr_log_type_just_return(struct tcp_bbr *bbr, uint32_t cts, uint32_t tlen, uint8_t hpts_calling, uint8_t reason, uint32_t p_maxseg, int len) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = p_maxseg; log.u_bbr.flex2 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp; log.u_bbr.flex4 = reason; log.u_bbr.flex5 = bbr->rc_in_persist; log.u_bbr.flex6 = bbr->r_ctl.rc_last_delay_val; log.u_bbr.flex7 = p_maxseg; log.u_bbr.flex8 = bbr->rc_in_persist; log.u_bbr.pkts_out = 0; log.u_bbr.applimited = len; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, tlen, &log, false, &bbr->rc_tv); } } static void bbr_log_type_enter_rec(struct tcp_bbr *bbr, uint32_t seq) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = seq; log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent; log.u_bbr.flex3 = bbr->r_ctl.rc_recovery_start; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ENTREC, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = tso; log.u_bbr.flex2 = maxseg; log.u_bbr.flex3 = mtu; log.u_bbr.flex4 = csum_flags; TCP_LOG_EVENTP(tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_MSGSIZE, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_flowend(struct tcp_bbr *bbr) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct sockbuf *r, *s; struct timeval tv; if (bbr->rc_inp->inp_socket) { r = &bbr->rc_inp->inp_socket->so_rcv; s = &bbr->rc_inp->inp_socket->so_snd; } else { r = s = NULL; } bbr_fill_in_logging_data(bbr, &log.u_bbr, tcp_get_usecs(&tv)); TCP_LOG_EVENTP(bbr->rc_tp, NULL, r, s, TCP_LOG_FLOWEND, 0, 0, &log, false, &tv); } } static void bbr_log_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t lost, uint32_t del) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = lost; log.u_bbr.flex2 = del; log.u_bbr.flex3 = bbr->r_ctl.rc_bbr_lastbtlbw; log.u_bbr.flex4 = bbr->r_ctl.rc_pkt_epoch_rtt; log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch; log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; log.u_bbr.flex7 = line; log.u_bbr.flex8 = 0; log.u_bbr.inflight = bbr->r_ctl.r_measurement_count; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_PKT_EPOCH, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_time_epoch(struct tcp_bbr *bbr, uint32_t cts, uint32_t line, uint32_t epoch_time) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->r_ctl.rc_lost; log.u_bbr.flex2 = bbr->rc_inp->inp_socket->so_snd.sb_lowat; log.u_bbr.flex3 = bbr->rc_inp->inp_socket->so_snd.sb_hiwat; log.u_bbr.flex7 = line; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, 
&bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIME_EPOCH, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_set_of_state_target(struct tcp_bbr *bbr, uint32_t new_tar, int line, int meth) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex2 = new_tar; log.u_bbr.flex3 = line; log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs; log.u_bbr.flex5 = bbr_quanta; log.u_bbr.flex6 = bbr->r_ctl.rc_pace_min_segs; log.u_bbr.flex7 = bbr->rc_last_options; log.u_bbr.flex8 = meth; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_STATE_TARGET, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_statechange(struct tcp_bbr *bbr, uint32_t cts, int32_t line) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int; if (bbr_state_is_pkt_epoch) log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PKTRTT); else log.u_bbr.flex4 = bbr_get_rtt(bbr, BBR_RTT_PROP); log.u_bbr.flex5 = bbr->r_ctl.rc_bbr_last_startup_epoch; log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; log.u_bbr.flex7 = (bbr->r_ctl.rc_target_at_state/1000); log.u_bbr.lt_epoch = bbr->r_ctl.rc_level_state_extra; log.u_bbr.pkts_out = bbr->r_ctl.rc_target_at_state; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_STATE, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_rtt_shrinks(struct tcp_bbr *bbr, uint32_t cts, uint32_t applied, uint32_t rtt, uint32_t line, uint8_t reas, uint16_t cond) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; log.u_bbr.flex3 = bbr->r_ctl.last_in_probertt; log.u_bbr.flex4 = applied; log.u_bbr.flex5 = rtt; log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex7 = cond; log.u_bbr.flex8 = reas; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_RTT_SHRINKS, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_exit_rec(struct tcp_bbr *bbr) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = bbr->r_ctl.rc_recovery_start; log.u_bbr.flex2 = bbr->r_ctl.rc_cwnd_on_ent; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_EXITREC, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_cwndupd(struct tcp_bbr *bbr, uint32_t bytes_this_ack, uint32_t chg, uint32_t prev_acked, int32_t meth, uint32_t target, uint32_t th_ack, int32_t line) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = line; log.u_bbr.flex2 = prev_acked; log.u_bbr.flex3 = bytes_this_ack; log.u_bbr.flex4 = chg; log.u_bbr.flex5 = th_ack; log.u_bbr.flex6 = target; log.u_bbr.flex8 = meth; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_CWND, 0, 0, &log, 
false, &bbr->rc_tv); } } static void bbr_log_rtt_sample(struct tcp_bbr *bbr, uint32_t rtt, uint32_t tsin) { /* * Log the rtt sample we are applying to the srtt algorithm in * useconds. */ if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = rtt; log.u_bbr.flex2 = bbr->r_ctl.rc_bbr_state_time; log.u_bbr.flex3 = bbr->r_ctl.rc_ack_hdwr_delay; log.u_bbr.flex4 = bbr->rc_tp->ts_offset; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; log.u_bbr.pkts_out = tcp_tv_to_mssectick(&bbr->rc_tv); log.u_bbr.flex6 = tsin; log.u_bbr.flex7 = 0; log.u_bbr.flex8 = bbr->rc_ack_was_delayed; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, TCP_LOG_RTT, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_pesist(struct tcp_bbr *bbr, uint32_t cts, uint32_t time_in, int32_t line, uint8_t enter_exit) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = time_in; log.u_bbr.flex2 = line; log.u_bbr.flex8 = enter_exit; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_PERSIST, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_ack_clear(struct tcp_bbr *bbr, uint32_t cts) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->rc_tp->ts_recent_age; log.u_bbr.flex2 = bbr->r_ctl.rc_rtt_shrinks; log.u_bbr.flex3 = bbr->r_ctl.rc_probertt_int; log.u_bbr.flex4 = bbr->r_ctl.rc_went_idle_time; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ACKCLEAR, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_ack_event(struct tcp_bbr *bbr, struct tcphdr *th, struct tcpopt *to, uint32_t tlen, uint16_t nsegs, uint32_t cts, int32_t nxt_pkt, struct mbuf *m) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = nsegs; log.u_bbr.flex2 = bbr->r_ctl.rc_lost_bytes; if (m) { struct timespec ts; log.u_bbr.flex3 = m->m_flags; if (m->m_flags & M_TSTMP) { mbuf_tstmp2timespec(m, &ts); tv.tv_sec = ts.tv_sec; tv.tv_usec = ts.tv_nsec / 1000; log.u_bbr.lt_epoch = tcp_tv_to_usectick(&tv); } else { log.u_bbr.lt_epoch = 0; } if (m->m_flags & M_TSTMP_LRO) { tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000; log.u_bbr.flex5 = tcp_tv_to_usectick(&tv); } else { /* No arrival timestamp */ log.u_bbr.flex5 = 0; } log.u_bbr.pkts_out = tcp_get_usecs(&tv); } else { log.u_bbr.flex3 = 0; log.u_bbr.flex5 = 0; log.u_bbr.flex6 = 0; log.u_bbr.pkts_out = 0; } log.u_bbr.flex4 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex7 = bbr->r_wanted_output; log.u_bbr.flex8 = bbr->rc_in_persist; TCP_LOG_EVENTP(bbr->rc_tp, th, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, TCP_LOG_IN, 0, tlen, &log, true, &bbr->rc_tv); } } static void bbr_log_doseg_done(struct tcp_bbr *bbr, uint32_t cts, int32_t nxt_pkt, int32_t did_out) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; 
log.u_bbr.flex3 = bbr->r_ctl.rc_last_delay_val; log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = bbr->r_ctl.rc_timer_exp; log.u_bbr.flex6 = bbr->r_ctl.rc_lost_bytes; log.u_bbr.flex7 = bbr->r_wanted_output; log.u_bbr.flex8 = bbr->rc_in_persist; log.u_bbr.pkts_out = bbr->r_ctl.highest_hdwr_delay; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, 0, &log, true, &bbr->rc_tv); } } static void bbr_log_enobuf_jmp(struct tcp_bbr *bbr, uint32_t len, uint32_t cts, int32_t line, uint32_t o_len, uint32_t segcnt, uint32_t segsiz) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = o_len; log.u_bbr.flex3 = segcnt; log.u_bbr.flex4 = segsiz; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_ENOBUF_JMP, ENOBUFS, len, &log, true, &bbr->rc_tv); } } static void bbr_log_to_processing(struct tcp_bbr *bbr, uint32_t cts, int32_t ret, int32_t timers, uint8_t hpts_calling) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = timers; log.u_bbr.flex2 = ret; log.u_bbr.flex3 = bbr->r_ctl.rc_timer_exp; log.u_bbr.flex4 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; log.u_bbr.flex6 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex8 = hpts_calling; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_to_event(struct tcp_bbr *bbr, uint32_t cts, int32_t to_num) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; uint64_t ar; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->bbr_timer_src; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; ar = (uint64_t)(bbr->r_ctl.rc_resend); ar >>= 32; ar &= 0x00000000ffffffff; log.u_bbr.flex4 = (uint32_t)ar; ar = (uint64_t)bbr->r_ctl.rc_resend; ar &= 0x00000000ffffffff; log.u_bbr.flex5 = (uint32_t)ar; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.flex8 = to_num; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_RTO, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_startup_event(struct tcp_bbr *bbr, uint32_t cts, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint8_t reason) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = flex2; log.u_bbr.flex3 = flex3; log.u_bbr.flex4 = 0; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex6 = bbr->r_ctl.rc_lost_at_startup; log.u_bbr.flex8 = reason; log.u_bbr.cur_del_rate = bbr->r_ctl.rc_bbr_lastbtlbw; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_REDUCE, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_hpts_diag(struct tcp_bbr *bbr, uint32_t cts, struct hpts_diag *diag) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = diag->p_nxt_slot; log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; log.u_bbr.flex5 = diag->slot_remaining; 
log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; /* Hijack other fields as needed */ log.u_bbr.epoch = diag->have_slept; log.u_bbr.lt_epoch = diag->yet_to_sleep; log.u_bbr.pkts_out = diag->co_ret; log.u_bbr.applimited = diag->hpts_sleep_time; log.u_bbr.delivered = diag->p_prev_slot; log.u_bbr.inflight = diag->p_runningtick; log.u_bbr.bw_inuse = diag->wheel_tick; log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.delRate = diag->maxticks; log.u_bbr.cur_del_rate = diag->p_curtick; log.u_bbr.cur_del_rate <<= 32; log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_HPTSDIAG, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_timer_var(struct tcp_bbr *bbr, int mode, uint32_t cts, uint32_t time_since_sent, uint32_t srtt, uint32_t thresh, uint32_t to) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->rc_tp->t_rttvar; log.u_bbr.flex2 = time_since_sent; log.u_bbr.flex3 = srtt; log.u_bbr.flex4 = thresh; log.u_bbr.flex5 = to; log.u_bbr.flex6 = bbr->rc_tp->t_srtt; log.u_bbr.flex8 = mode; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERPREP, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_pacing_delay_calc(struct tcp_bbr *bbr, uint16_t gain, uint32_t len, uint32_t cts, uint32_t usecs, uint64_t bw, uint32_t override, int mod) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = usecs; log.u_bbr.flex2 = len; log.u_bbr.flex3 = (uint32_t)((bw >> 32) & 0x00000000ffffffff); log.u_bbr.flex4 = (uint32_t)(bw & 0x00000000ffffffff); if (override) log.u_bbr.flex5 = (1 << 2); else log.u_bbr.flex5 = 0; log.u_bbr.flex6 = override; log.u_bbr.flex7 = gain; log.u_bbr.flex8 = mod; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_HPTSI_CALC, 0, len, &log, false, &bbr->rc_tv); } } static void bbr_log_to_start(struct tcp_bbr *bbr, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->bbr_timer_src; log.u_bbr.flex2 = to; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; log.u_bbr.flex5 = bbr->rc_inp->inp_hptsslot; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.pkts_out = bbr->rc_inp->inp_flags2; log.u_bbr.flex8 = which; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERSTAR, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_thresh_choice(struct tcp_bbr *bbr, uint32_t cts, uint32_t thresh, uint32_t lro, uint32_t srtt, struct bbr_sendmap *rsm, uint8_t frm) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = thresh; log.u_bbr.flex2 = lro; log.u_bbr.flex3 = bbr->r_ctl.rc_reorder_ts; log.u_bbr.flex4 = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; log.u_bbr.flex5 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.flex6 = srtt; log.u_bbr.flex7 = bbr->r_ctl.rc_reorder_shift; log.u_bbr.flex8 = frm; TCP_LOG_EVENTP(bbr->rc_tp, NULL, 
&bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_THRESH_CALC, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_to_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts, uint8_t hpts_removed) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = bbr->bbr_timer_src; log.u_bbr.flex3 = bbr->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = bbr->rc_in_persist; log.u_bbr.flex5 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex6 = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); log.u_bbr.flex8 = hpts_removed; log.u_bbr.pkts_out = bbr->rc_pacer_started; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_tstmp_validation(struct tcp_bbr *bbr, uint64_t peer_delta, uint64_t delta) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = bbr->r_ctl.bbr_peer_tsratio; log.u_bbr.flex2 = (peer_delta >> 32); log.u_bbr.flex3 = (peer_delta & 0x00000000ffffffff); log.u_bbr.flex4 = (delta >> 32); log.u_bbr.flex5 = (delta & 0x00000000ffffffff); log.u_bbr.flex7 = bbr->rc_ts_clock_set; log.u_bbr.flex8 = bbr->rc_ts_cant_be_used; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_TSTMP_VAL, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_tsosize(struct tcp_bbr *bbr, uint32_t cts, uint32_t tsosz, uint32_t tls, uint32_t old_val, uint32_t maxseg, int hdwr) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = tsosz; log.u_bbr.flex2 = tls; log.u_bbr.flex3 = tcp_min_hptsi_time; log.u_bbr.flex4 = bbr->r_ctl.bbr_hptsi_bytes_min; log.u_bbr.flex5 = old_val; log.u_bbr.flex6 = maxseg; log.u_bbr.flex7 = bbr->rc_no_pacing; log.u_bbr.flex7 <<= 1; log.u_bbr.flex7 |= bbr->rc_past_init_win; if (hdwr) log.u_bbr.flex8 = 0x80 | bbr->rc_use_google; else log.u_bbr.flex8 = bbr->rc_use_google; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BBRTSO, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_rsmclear(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, uint32_t flags, uint32_t line) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = line; log.u_bbr.flex2 = rsm->r_start; log.u_bbr.flex3 = rsm->r_end; log.u_bbr.flex4 = rsm->r_delivered; log.u_bbr.flex5 = rsm->r_rtr_cnt; log.u_bbr.flex6 = rsm->r_dupack; log.u_bbr.flex7 = rsm->r_tim_lastsent[0]; log.u_bbr.flex8 = rsm->r_flags; /* Hijack the pkts_out fids */ log.u_bbr.applimited = flags; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_RSM_CLEARED, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_bbrupd(struct tcp_bbr *bbr, uint8_t flex8, uint32_t cts, uint32_t flex3, uint32_t flex2, uint32_t flex5, uint32_t flex6, uint32_t pkts_out, int flex7, uint32_t flex4, uint32_t flex1) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = flex2; log.u_bbr.flex3 = flex3; log.u_bbr.flex4 = flex4; 
log.u_bbr.flex5 = flex5; log.u_bbr.flex6 = flex6; log.u_bbr.flex7 = flex7; /* Hijack the pkts_out fids */ log.u_bbr.pkts_out = pkts_out; log.u_bbr.flex8 = flex8; if (bbr->rc_ack_was_delayed) log.u_bbr.epoch = bbr->r_ctl.rc_ack_hdwr_delay; else log.u_bbr.epoch = 0; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BBRUPD, 0, flex2, &log, false, &bbr->rc_tv); } } static void bbr_log_type_ltbw(struct tcp_bbr *bbr, uint32_t cts, int32_t reason, uint32_t newbw, uint32_t obw, uint32_t diff, uint32_t tim) { if (/*bbr_verbose_logging && */(bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = reason; log.u_bbr.flex2 = newbw; log.u_bbr.flex3 = obw; log.u_bbr.flex4 = diff; log.u_bbr.flex5 = bbr->r_ctl.rc_lt_lost; log.u_bbr.flex6 = bbr->r_ctl.rc_lt_del; log.u_bbr.flex7 = bbr->rc_lt_is_sampling; log.u_bbr.pkts_out = tim; log.u_bbr.bw_inuse = bbr->r_ctl.rc_lt_bw; if (bbr->rc_lt_use_bw == 0) log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch; else log.u_bbr.epoch = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BWSAMP, 0, 0, &log, false, &bbr->rc_tv); } } static inline void bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; log.u_bbr.flex3 = tp->t_maxunacktime; log.u_bbr.flex4 = tp->t_acktime; log.u_bbr.flex8 = event; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_type_log_hdwr_pacing(struct tcp_bbr *bbr, const struct ifnet *ifp, uint64_t rate, uint64_t hw_rate, int line, uint32_t cts, int error) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); log.u_bbr.bw_inuse = rate; log.u_bbr.flex5 = line; log.u_bbr.flex6 = error; log.u_bbr.flex8 = bbr->skip_gain; log.u_bbr.flex8 <<= 1; log.u_bbr.flex8 |= bbr->gain_is_limited; log.u_bbr.flex8 <<= 1; log.u_bbr.flex8 |= bbr->bbr_hdrw_pacing; log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_HDWR_PACE, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_type_bbrsnd(struct tcp_bbr *bbr, uint32_t len, uint32_t slot, uint32_t del_by, uint32_t cts, uint32_t line, uint32_t prev_delay) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = slot; log.u_bbr.flex2 = del_by; log.u_bbr.flex3 = prev_delay; log.u_bbr.flex4 = line; log.u_bbr.flex5 = bbr->r_ctl.rc_last_delay_val; log.u_bbr.flex6 = bbr->r_ctl.rc_hptsi_agg_delay; log.u_bbr.flex7 = (0x0000ffff & bbr->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = bbr->rc_in_persist; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, 
&bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, len, &log, false, &bbr->rc_tv); } } static void bbr_log_type_bbrrttprop(struct tcp_bbr *bbr, uint32_t t, uint32_t end, uint32_t tsconv, uint32_t cts, int32_t match, uint32_t seq, uint8_t flags) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->r_ctl.rc_delivered; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = bbr->r_ctl.rc_lowest_rtt; log.u_bbr.flex4 = end; log.u_bbr.flex5 = seq; log.u_bbr.flex6 = t; log.u_bbr.flex7 = match; log.u_bbr.flex8 = flags; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_BBRRTT, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_exit_gain(struct tcp_bbr *bbr, uint32_t cts, int32_t entry_method) { if (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); log.u_bbr.flex1 = bbr->r_ctl.rc_target_at_state; log.u_bbr.flex2 = (bbr->rc_tp->t_maxseg - bbr->rc_last_options); log.u_bbr.flex3 = bbr->r_ctl.gain_epoch; log.u_bbr.flex4 = bbr->r_ctl.rc_pace_max_segs; log.u_bbr.flex5 = bbr->r_ctl.rc_pace_min_segs; log.u_bbr.flex6 = bbr->r_ctl.rc_bbr_state_atflight; log.u_bbr.flex7 = 0; log.u_bbr.flex8 = entry_method; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_EXIT_GAIN, 0, 0, &log, false, &bbr->rc_tv); } } static void bbr_log_settings_change(struct tcp_bbr *bbr, int settings_desired) { if (bbr_verbose_logging && (bbr->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); /* R-HU */ log.u_bbr.flex1 = 0; log.u_bbr.flex2 = 0; log.u_bbr.flex3 = 0; log.u_bbr.flex4 = 0; log.u_bbr.flex7 = 0; log.u_bbr.flex8 = settings_desired; TCP_LOG_EVENTP(bbr->rc_tp, NULL, &bbr->rc_inp->inp_socket->so_rcv, &bbr->rc_inp->inp_socket->so_snd, BBR_LOG_SETTINGS_CHG, 0, 0, &log, false, &bbr->rc_tv); } } /* * Returns the bw from the our filter. */ static inline uint64_t bbr_get_full_bw(struct tcp_bbr *bbr) { uint64_t bw; bw = get_filter_value(&bbr->r_ctl.rc_delrate); return (bw); } static inline void bbr_set_pktepoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line) { uint64_t calclr; uint32_t lost, del; if (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_pktepoch) lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lost_at_pktepoch; else lost = 0; del = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_pkt_epoch_del; if (lost == 0) { calclr = 0; } else if (del) { calclr = lost; calclr *= (uint64_t)1000; calclr /= (uint64_t)del; } else { /* Nothing delivered? 
100.0% loss */ calclr = 1000; } bbr->r_ctl.rc_pkt_epoch_loss_rate = (uint32_t)calclr; if (IN_RECOVERY(bbr->rc_tp->t_flags)) bbr->r_ctl.recovery_lr += (uint32_t)calclr; bbr->r_ctl.rc_pkt_epoch++; if (bbr->rc_no_pacing && (bbr->r_ctl.rc_pkt_epoch >= bbr->no_pacing_until)) { bbr->rc_no_pacing = 0; tcp_bbr_tso_size_check(bbr, cts); } bbr->r_ctl.rc_pkt_epoch_rtt = bbr_calc_time(cts, bbr->r_ctl.rc_pkt_epoch_time); bbr->r_ctl.rc_pkt_epoch_time = cts; /* What was our loss rate */ bbr_log_pkt_epoch(bbr, cts, line, lost, del); bbr->r_ctl.rc_pkt_epoch_del = bbr->r_ctl.rc_delivered; bbr->r_ctl.rc_lost_at_pktepoch = bbr->r_ctl.rc_lost; } static inline void bbr_set_epoch(struct tcp_bbr *bbr, uint32_t cts, int32_t line) { uint32_t epoch_time; /* Tick the RTT clock */ bbr->r_ctl.rc_rtt_epoch++; epoch_time = cts - bbr->r_ctl.rc_rcv_epoch_start; bbr_log_time_epoch(bbr, cts, line, epoch_time); bbr->r_ctl.rc_rcv_epoch_start = cts; } static inline void bbr_isit_a_pkt_epoch(struct tcp_bbr *bbr, uint32_t cts, struct bbr_sendmap *rsm, int32_t line, int32_t cum_acked) { if (SEQ_GEQ(rsm->r_delivered, bbr->r_ctl.rc_pkt_epoch_del)) { bbr->rc_is_pkt_epoch_now = 1; } } /* * Returns the bw from either the b/w filter * or from the lt_bw (if the connection is being * policed). */ static inline uint64_t __bbr_get_bw(struct tcp_bbr *bbr) { uint64_t bw, min_bw; uint64_t rtt; int gm_measure_cnt = 1; /* * For startup we make, like google, a * minimum b/w. This is generated from the * IW and the rttProp. We do fall back to srtt * if for some reason (initial handshake) we don't * have a rttProp. We, in the worst case, fall back * to the configured min_bw (rc_initial_hptsi_bw). */ if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { /* Attempt first to use rttProp */ rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop); if (rtt && (rtt < 0xffffffff)) { measure: min_bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) * ((uint64_t)1000000); min_bw /= rtt; if (min_bw < bbr->r_ctl.rc_initial_hptsi_bw) { min_bw = bbr->r_ctl.rc_initial_hptsi_bw; } } else if (bbr->rc_tp->t_srtt != 0) { /* No rttProp, use srtt? */ rtt = bbr_get_rtt(bbr, BBR_SRTT); goto measure; } else { min_bw = bbr->r_ctl.rc_initial_hptsi_bw; } } else min_bw = 0; if ((bbr->rc_past_init_win == 0) && (bbr->r_ctl.rc_delivered > bbr_initial_cwnd(bbr, bbr->rc_tp))) bbr->rc_past_init_win = 1; if ((bbr->rc_use_google) && (bbr->r_ctl.r_measurement_count >= 1)) gm_measure_cnt = 0; if (gm_measure_cnt && ((bbr->r_ctl.r_measurement_count < bbr_min_measurements_req) || (bbr->rc_past_init_win == 0))) { /* For google we use our guess rate until we get 1 measurement */ use_initial_window: rtt = (uint64_t)get_filter_value_small(&bbr->r_ctl.rc_rttprop); if (rtt && (rtt < 0xffffffff)) { /* * We have an RTT measurment. Use that in * combination with our initial window to calculate * a b/w. 
*/ bw = (uint64_t)(bbr_initial_cwnd(bbr, bbr->rc_tp)) * ((uint64_t)1000000); bw /= rtt; if (bw < bbr->r_ctl.rc_initial_hptsi_bw) { bw = bbr->r_ctl.rc_initial_hptsi_bw; } } else { /* Drop back to the 40 and punt to a default */ bw = bbr->r_ctl.rc_initial_hptsi_bw; } if (bw < 1) /* Probably should panic */ bw = 1; if (bw > min_bw) return (bw); else return (min_bw); } if (bbr->rc_lt_use_bw) bw = bbr->r_ctl.rc_lt_bw; else if (bbr->r_recovery_bw && (bbr->rc_use_google == 0)) bw = bbr->r_ctl.red_bw; else bw = get_filter_value(&bbr->r_ctl.rc_delrate); if (bbr->rc_tp->t_peakrate_thr && (bbr->rc_use_google == 0)) { /* * Enforce user set rate limit, keep in mind that * t_peakrate_thr is in B/s already */ bw = uqmin((uint64_t)bbr->rc_tp->t_peakrate_thr, bw); } if (bw == 0) { /* We should not be at 0, go to the initial window then */ goto use_initial_window; } if (bw < 1) /* Probably should panic */ bw = 1; if (bw < min_bw) bw = min_bw; return (bw); } static inline uint64_t bbr_get_bw(struct tcp_bbr *bbr) { uint64_t bw; bw = __bbr_get_bw(bbr); return (bw); } static inline void bbr_reset_lt_bw_interval(struct tcp_bbr *bbr, uint32_t cts) { bbr->r_ctl.rc_lt_epoch = bbr->r_ctl.rc_pkt_epoch; bbr->r_ctl.rc_lt_time = bbr->r_ctl.rc_del_time; bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered; bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } static inline void bbr_reset_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts) { bbr->rc_lt_is_sampling = 0; bbr->rc_lt_use_bw = 0; bbr->r_ctl.rc_lt_bw = 0; bbr_reset_lt_bw_interval(bbr, cts); } static inline void bbr_lt_bw_samp_done(struct tcp_bbr *bbr, uint64_t bw, uint32_t cts, uint32_t timin) { uint64_t diff; /* Do we have a previous sample? */ if (bbr->r_ctl.rc_lt_bw) { /* Get the diff in bytes per second */ if (bbr->r_ctl.rc_lt_bw > bw) diff = bbr->r_ctl.rc_lt_bw - bw; else diff = bw - bbr->r_ctl.rc_lt_bw; if ((diff <= bbr_lt_bw_diff) || (diff <= (bbr->r_ctl.rc_lt_bw / bbr_lt_bw_ratio))) { /* Consider us policed */ uint32_t saved_bw; saved_bw = (uint32_t)bbr->r_ctl.rc_lt_bw; bbr->r_ctl.rc_lt_bw = (bw + bbr->r_ctl.rc_lt_bw) / 2; /* average of two */ bbr->rc_lt_use_bw = 1; bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; /* * Use pkt based epoch for measuring length of * policer up */ bbr->r_ctl.rc_lt_epoch_use = bbr->r_ctl.rc_pkt_epoch; /* * reason 4 is we need to start consider being * policed */ bbr_log_type_ltbw(bbr, cts, 4, (uint32_t)bw, saved_bw, (uint32_t)diff, timin); return; } } bbr->r_ctl.rc_lt_bw = bw; bbr_reset_lt_bw_interval(bbr, cts); bbr_log_type_ltbw(bbr, cts, 5, 0, (uint32_t)bw, 0, timin); } static void bbr_randomize_extra_state_time(struct tcp_bbr *bbr) { uint32_t ran, deduct; ran = arc4random_uniform(bbr_rand_ot); if (ran) { deduct = bbr->r_ctl.rc_level_state_extra / ran; bbr->r_ctl.rc_level_state_extra -= deduct; } } /* * Return randomly the starting state * to use in probebw. */ static uint8_t bbr_pick_probebw_substate(struct tcp_bbr *bbr, uint32_t cts) { uint32_t ran; uint8_t ret_val; /* Initialize the offset to 0 */ bbr->r_ctl.rc_exta_time_gd = 0; bbr->rc_hit_state_1 = 0; bbr->r_ctl.rc_level_state_extra = 0; ran = arc4random_uniform((BBR_SUBSTATE_COUNT-1)); /* * The math works funny here :) the return value is used to set the * substate and then the state change is called which increments by * one. So if we return 1 (DRAIN) we will increment to 2 (LEVEL1) when * we fully enter the state. Note that the (8 - 1 - ran) assures that * we return 1 - 7, so we dont return 0 and end up starting in * state 1 (DRAIN). 
*/ ret_val = BBR_SUBSTATE_COUNT - 1 - ran; /* Set an epoch */ if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) bbr_set_epoch(bbr, cts, __LINE__); bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; return (ret_val); } static void bbr_lt_bw_sampling(struct tcp_bbr *bbr, uint32_t cts, int32_t loss_detected) { uint32_t diff, d_time; uint64_t del_time, bw, lost, delivered; if (bbr->r_use_policer == 0) return; if (bbr->rc_lt_use_bw) { /* We are using lt bw do we stop yet? */ diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch_use; if (diff > bbr_lt_bw_max_rtts) { /* Reset it all */ reset_all: bbr_reset_lt_bw_sampling(bbr, cts); if (bbr->rc_filled_pipe) { bbr_set_epoch(bbr, cts, __LINE__); bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); bbr_substate_change(bbr, cts, __LINE__, 0); bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr_log_type_statechange(bbr, cts, __LINE__); } else { /* * This should not happen really * unless we remove the startup/drain * restrictions above. */ bbr->rc_bbr_state = BBR_STATE_STARTUP; bbr_set_epoch(bbr, cts, __LINE__); bbr->r_ctl.rc_bbr_state_time = cts; bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; bbr_set_state_target(bbr, __LINE__); bbr_log_type_statechange(bbr, cts, __LINE__); } /* reason 0 is to stop using lt-bw */ bbr_log_type_ltbw(bbr, cts, 0, 0, 0, 0, 0); return; } if (bbr_lt_intvl_fp == 0) { /* Not doing false-postive detection */ return; } /* False positive detection */ if (diff == bbr_lt_intvl_fp) { /* At bbr_lt_intvl_fp we record the lost */ bbr->r_ctl.rc_lt_del = bbr->r_ctl.rc_delivered; bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } else if (diff > (bbr_lt_intvl_min_rtts + bbr_lt_intvl_fp)) { /* Now is our loss rate still high? */ lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost; delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del; if ((delivered == 0) || (((lost * 1000)/delivered) < bbr_lt_fd_thresh)) { /* No still below our threshold */ bbr_log_type_ltbw(bbr, cts, 7, lost, delivered, 0, 0); } else { /* Yikes its still high, it must be a false positive */ bbr_log_type_ltbw(bbr, cts, 8, lost, delivered, 0, 0); goto reset_all; } } return; } /* * Wait for the first loss before sampling, to let the policer * exhaust its tokens and estimate the steady-state rate allowed by * the policer. Starting samples earlier includes bursts that * over-estimate the bw. */ if (bbr->rc_lt_is_sampling == 0) { /* reason 1 is to begin doing the sampling */ if (loss_detected == 0) return; bbr_reset_lt_bw_interval(bbr, cts); bbr->rc_lt_is_sampling = 1; bbr_log_type_ltbw(bbr, cts, 1, 0, 0, 0, 0); return; } /* Now how long were we delivering long term last> */ if (TSTMP_GEQ(bbr->r_ctl.rc_del_time, bbr->r_ctl.rc_lt_time)) d_time = bbr->r_ctl.rc_del_time - bbr->r_ctl.rc_lt_time; else d_time = 0; /* To avoid underestimates, reset sampling if we run out of data. */ if (bbr->r_ctl.r_app_limited_until) { /* Can not measure in app-limited state */ bbr_reset_lt_bw_sampling(bbr, cts); /* reason 2 is to reset sampling due to app limits */ bbr_log_type_ltbw(bbr, cts, 2, 0, 0, 0, d_time); return; } diff = bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_lt_epoch; if (diff < bbr_lt_intvl_min_rtts) { /* * need more samples (we don't * start on a round like linux so * we need 1 more). 
*/ /* 6 is not_enough time or no-loss */ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); return; } if (diff > (4 * bbr_lt_intvl_min_rtts)) { /* * For now if we wait too long, reset all sampling. We need * to do some research here, its possible that we should * base this on how much loss as occurred.. something like * if its under 10% (or some thresh) reset all otherwise * don't. Thats for phase II I guess. */ bbr_reset_lt_bw_sampling(bbr, cts); /* reason 3 is to reset sampling due too long of sampling */ bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time); return; } /* * End sampling interval when a packet is lost, so we estimate the * policer tokens were exhausted. Stopping the sampling before the * tokens are exhausted under-estimates the policed rate. */ if (loss_detected == 0) { /* 6 is not_enough time or no-loss */ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); return; } /* Calculate packets lost and delivered in sampling interval. */ lost = bbr->r_ctl.rc_lost - bbr->r_ctl.rc_lt_lost; delivered = bbr->r_ctl.rc_delivered - bbr->r_ctl.rc_lt_del; if ((delivered == 0) || (((lost * 1000)/delivered) < bbr_lt_loss_thresh)) { bbr_log_type_ltbw(bbr, cts, 6, lost, delivered, 0, d_time); return; } if (d_time < 1000) { /* Not enough time. wait */ /* 6 is not_enough time or no-loss */ bbr_log_type_ltbw(bbr, cts, 6, 0, 0, 0, d_time); return; } if (d_time >= (0xffffffff / USECS_IN_MSEC)) { /* Too long */ bbr_reset_lt_bw_sampling(bbr, cts); /* reason 3 is to reset sampling due too long of sampling */ bbr_log_type_ltbw(bbr, cts, 3, 0, 0, 0, d_time); return; } del_time = d_time; bw = delivered; bw *= (uint64_t)USECS_IN_SECOND; bw /= del_time; bbr_lt_bw_samp_done(bbr, bw, cts, d_time); } /* * Allocate a sendmap from our zone. */ static struct bbr_sendmap * bbr_alloc(struct tcp_bbr *bbr) { struct bbr_sendmap *rsm; BBR_STAT_INC(bbr_to_alloc); rsm = uma_zalloc(bbr_zone, (M_NOWAIT | M_ZERO)); if (rsm) { bbr->r_ctl.rc_num_maps_alloced++; return (rsm); } if (bbr->r_ctl.rc_free_cnt) { BBR_STAT_INC(bbr_to_alloc_emerg); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next); bbr->r_ctl.rc_free_cnt--; return (rsm); } BBR_STAT_INC(bbr_to_alloc_failed); return (NULL); } static struct bbr_sendmap * bbr_alloc_full_limit(struct tcp_bbr *bbr) { if ((V_tcp_map_entries_limit > 0) && (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { BBR_STAT_INC(bbr_alloc_limited); if (!bbr->alloc_limit_reported) { bbr->alloc_limit_reported = 1; BBR_STAT_INC(bbr_alloc_limited_conns); } return (NULL); } return (bbr_alloc(bbr)); } /* wrapper to allocate a sendmap entry, subject to a specific limit */ static struct bbr_sendmap * bbr_alloc_limit(struct tcp_bbr *bbr, uint8_t limit_type) { struct bbr_sendmap *rsm; if (limit_type) { /* currently there is only one limit type */ if (V_tcp_map_split_limit > 0 && bbr->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { BBR_STAT_INC(bbr_split_limited); if (!bbr->alloc_limit_reported) { bbr->alloc_limit_reported = 1; BBR_STAT_INC(bbr_alloc_limited_conns); } return (NULL); } } /* allocate and mark in the limit type, if set */ rsm = bbr_alloc(bbr); if (rsm != NULL && limit_type) { rsm->r_limit_type = limit_type; bbr->r_ctl.rc_num_split_allocs++; } return (rsm); } static void bbr_free(struct tcp_bbr *bbr, struct bbr_sendmap *rsm) { if (rsm->r_limit_type) { /* currently there is only one limit type */ bbr->r_ctl.rc_num_split_allocs--; } if (rsm->r_is_smallmap) bbr->r_ctl.rc_num_small_maps_alloced--; if (bbr->r_ctl.rc_tlp_send == rsm) bbr->r_ctl.rc_tlp_send = 
NULL; if (bbr->r_ctl.rc_resend == rsm) { bbr->r_ctl.rc_resend = NULL; } if (bbr->r_ctl.rc_next == rsm) bbr->r_ctl.rc_next = NULL; if (bbr->r_ctl.rc_sacklast == rsm) bbr->r_ctl.rc_sacklast = NULL; if (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) { memset(rsm, 0, sizeof(struct bbr_sendmap)); TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); rsm->r_limit_type = 0; bbr->r_ctl.rc_free_cnt++; return; } bbr->r_ctl.rc_num_maps_alloced--; uma_zfree(bbr_zone, rsm); } /* * Returns the BDP. */ static uint64_t bbr_get_bw_delay_prod(uint64_t rtt, uint64_t bw) { /* * Calculate the bytes in flight needed given the bw (in bytes per * second) and the specifyed rtt in useconds. We need to put out the - * returned value per RTT to match that rate. Gain will normaly + * returned value per RTT to match that rate. Gain will normally * raise it up from there. * * This should not overflow as long as the bandwidth is below 1 * TByte per second (bw < 10**12 = 2**40) and the rtt is smaller * than 1000 seconds (rtt < 10**3 * 10**6 = 10**9 = 2**30). */ uint64_t usec_per_sec; usec_per_sec = USECS_IN_SECOND; return ((rtt * bw) / usec_per_sec); } /* * Return the initial cwnd. */ static uint32_t bbr_initial_cwnd(struct tcp_bbr *bbr, struct tcpcb *tp) { uint32_t i_cwnd; if (bbr->rc_init_win) { i_cwnd = bbr->rc_init_win * tp->t_maxseg; } else if (V_tcp_initcwnd_segments) i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), max(2 * tp->t_maxseg, 14600)); else if (V_tcp_do_rfc3390) i_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380)); else { /* Per RFC5681 Section 3.1 */ if (tp->t_maxseg > 2190) i_cwnd = 2 * tp->t_maxseg; else if (tp->t_maxseg > 1095) i_cwnd = 3 * tp->t_maxseg; else i_cwnd = 4 * tp->t_maxseg; } return (i_cwnd); } /* * Given a specified gain, return the target * cwnd based on that gain. */ static uint32_t bbr_get_raw_target_cwnd(struct tcp_bbr *bbr, uint32_t gain, uint64_t bw) { uint64_t bdp, rtt; uint32_t cwnd; if ((get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) || (bbr_get_full_bw(bbr) == 0)) { /* No measurements yet */ return (bbr_initial_cwnd(bbr, bbr->rc_tp)); } /* * Get bytes per RTT needed (rttProp is normally in * bbr_cwndtarget_rtt_touse) */ rtt = bbr_get_rtt(bbr, bbr_cwndtarget_rtt_touse); /* Get the bdp from the two values */ bdp = bbr_get_bw_delay_prod(rtt, bw); /* Now apply the gain */ cwnd = (uint32_t)(((bdp * ((uint64_t)gain)) + (uint64_t)(BBR_UNIT - 1)) / ((uint64_t)BBR_UNIT)); return (cwnd); } static uint32_t bbr_get_target_cwnd(struct tcp_bbr *bbr, uint64_t bw, uint32_t gain) { uint32_t cwnd, mss; mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); /* Get the base cwnd with gain rounded to a mss */ cwnd = roundup(bbr_get_raw_target_cwnd(bbr, bw, gain), mss); /* * Add in N (2 default since we do not have a * fq layer to trap packets in) quanta's per the I-D * section 4.2.3.2 quanta adjust. */ cwnd += (bbr_quanta * bbr->r_ctl.rc_pace_max_segs); if (bbr->rc_use_google) { if((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && (bbr_state_val(bbr) == BBR_SUB_GAIN)) { /* * The linux implementation adds * an extra 2 x mss in gain cycle which * is documented no-where except in the code. * so we add more for Neal undocumented feature */ cwnd += 2 * mss; } if ((cwnd / mss) & 0x1) { /* Round up for odd num mss */ cwnd += mss; } } /* Are we below the min cwnd? 
*/ if (cwnd < get_min_cwnd(bbr)) return (get_min_cwnd(bbr)); return (cwnd); } static uint16_t bbr_gain_adjust(struct tcp_bbr *bbr, uint16_t gain) { if (gain < 1) gain = 1; return (gain); } static uint32_t bbr_get_header_oh(struct tcp_bbr *bbr) { int seg_oh; seg_oh = 0; if (bbr->r_ctl.rc_inc_tcp_oh) { /* Do we include TCP overhead? */ seg_oh = (bbr->rc_last_options + sizeof(struct tcphdr)); } if (bbr->r_ctl.rc_inc_ip_oh) { /* Do we include IP overhead? */ #ifdef INET6 if (bbr->r_is_v6) seg_oh += sizeof(struct ip6_hdr); else #endif #ifdef INET seg_oh += sizeof(struct ip); #endif } if (bbr->r_ctl.rc_inc_enet_oh) { /* Do we include the ethernet overhead? */ seg_oh += sizeof(struct ether_header); } return(seg_oh); } static uint32_t bbr_get_pacing_length(struct tcp_bbr *bbr, uint16_t gain, uint32_t useconds_time, uint64_t bw) { uint64_t divor, res, tim; if (useconds_time == 0) return (0); gain = bbr_gain_adjust(bbr, gain); divor = (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT; tim = useconds_time; res = (tim * bw * gain) / divor; if (res == 0) res = 1; return ((uint32_t)res); } /* * Given a gain and a length return the delay in useconds that * should be used to evenly space out packets * on the connection (based on the gain factor). */ static uint32_t bbr_get_pacing_delay(struct tcp_bbr *bbr, uint16_t gain, int32_t len, uint32_t cts, int nolog) { uint64_t bw, lentim, res; uint32_t usecs, srtt, over = 0; uint32_t seg_oh, num_segs, maxseg; if (len == 0) return (0); maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; num_segs = (len + maxseg - 1) / maxseg; if (bbr->rc_use_google == 0) { seg_oh = bbr_get_header_oh(bbr); len += (num_segs * seg_oh); } gain = bbr_gain_adjust(bbr, gain); bw = bbr_get_bw(bbr); if (bbr->rc_use_google) { uint64_t cbw; /* * Reduce the b/w by the google discount * factor 10 = 1%. */ cbw = bw * (uint64_t)(1000 - bbr->r_ctl.bbr_google_discount); cbw /= (uint64_t)1000; /* We don't apply a discount if it results in 0 */ if (cbw > 0) bw = cbw; } lentim = ((uint64_t)len * (uint64_t)USECS_IN_SECOND * (uint64_t)BBR_UNIT); res = lentim / ((uint64_t)gain * bw); if (res == 0) res = 1; usecs = (uint32_t)res; srtt = bbr_get_rtt(bbr, BBR_SRTT); if (bbr_hptsi_max_mul && bbr_hptsi_max_div && (bbr->rc_use_google == 0) && (usecs > ((srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div))) { /* * We cannot let the delay be more than 1/2 the srtt time. * Otherwise we cannot pace out or send properly. */ over = usecs = (srtt * bbr_hptsi_max_mul) / bbr_hptsi_max_div; BBR_STAT_INC(bbr_hpts_min_time); } if (!nolog) bbr_log_pacing_delay_calc(bbr, gain, len, cts, usecs, bw, over, 1); return (usecs); } static void bbr_ack_received(struct tcpcb *tp, struct tcp_bbr *bbr, struct tcphdr *th, uint32_t bytes_this_ack, uint32_t sack_changed, uint32_t prev_acked, int32_t line, uint32_t losses) { INP_WLOCK_ASSERT(tp->t_inpcb); uint64_t bw; uint32_t cwnd, target_cwnd, saved_bytes, maxseg; int32_t meth; #ifdef STATS if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(th->th_ack, tp->gput_ack)) { /* * Strech acks and compressed acks will cause this to * oscillate but we are doing it the same way as the main * stack so it will be compariable (though possibly not * ideal). 
*/ int32_t cgput; int64_t gput, time_stamp; gput = (int64_t) (th->th_ack - tp->gput_seq) * 8; time_stamp = max(1, ((bbr->r_ctl.rc_rcvtime - tp->gput_ts) / 1000)); cgput = gput / time_stamp; stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, cgput); if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = cgput; } #endif if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) { /* We don't change anything in probe-rtt */ return; } maxseg = tp->t_maxseg - bbr->rc_last_options; saved_bytes = bytes_this_ack; bytes_this_ack += sack_changed; if (bytes_this_ack > prev_acked) { bytes_this_ack -= prev_acked; /* * A byte ack'd gives us a full mss * to be like linux i.e. they count packets. */ if ((bytes_this_ack < maxseg) && bbr->rc_use_google) bytes_this_ack = maxseg; } else { /* Unlikely */ bytes_this_ack = 0; } cwnd = tp->snd_cwnd; bw = get_filter_value(&bbr->r_ctl.rc_delrate); if (bw) target_cwnd = bbr_get_target_cwnd(bbr, bw, (uint32_t)bbr->r_ctl.rc_bbr_cwnd_gain); else target_cwnd = bbr_initial_cwnd(bbr, bbr->rc_tp); if (IN_RECOVERY(tp->t_flags) && (bbr->bbr_prev_in_rec == 0)) { /* * We are entering recovery and * thus packet conservation. */ bbr->pkt_conservation = 1; bbr->r_ctl.rc_recovery_start = bbr->r_ctl.rc_rcvtime; cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + bytes_this_ack; } if (IN_RECOVERY(tp->t_flags)) { uint32_t flight; bbr->bbr_prev_in_rec = 1; if (cwnd > losses) { cwnd -= losses; if (cwnd < maxseg) cwnd = maxseg; } else cwnd = maxseg; flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr_log_type_cwndupd(bbr, flight, 0, losses, 10, 0, 0, line); if (bbr->pkt_conservation) { uint32_t time_in; if (TSTMP_GEQ(bbr->r_ctl.rc_rcvtime, bbr->r_ctl.rc_recovery_start)) time_in = bbr->r_ctl.rc_rcvtime - bbr->r_ctl.rc_recovery_start; else time_in = 0; if (time_in >= bbr_get_rtt(bbr, BBR_RTT_PROP)) { /* Clear packet conservation after an rttProp */ bbr->pkt_conservation = 0; } else { if ((flight + bytes_this_ack) > cwnd) cwnd = flight + bytes_this_ack; if (cwnd < get_min_cwnd(bbr)) cwnd = get_min_cwnd(bbr); tp->snd_cwnd = cwnd; bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, 1, target_cwnd, th->th_ack, line); return; } } } else bbr->bbr_prev_in_rec = 0; if ((bbr->rc_use_google == 0) && bbr->r_ctl.restrict_growth) { bbr->r_ctl.restrict_growth--; if (bytes_this_ack > maxseg) bytes_this_ack = maxseg; } if (bbr->rc_filled_pipe) { /* * Here we have exited startup and filled the pipe. We will * thus allow the cwnd to shrink to the target. We hit here * mostly. */ uint32_t s_cwnd; meth = 2; s_cwnd = min((cwnd + bytes_this_ack), target_cwnd); if (s_cwnd > cwnd) cwnd = s_cwnd; else if (bbr_cwnd_may_shrink || bbr->rc_use_google || bbr->rc_no_pacing) cwnd = s_cwnd; } else { /* * Here we are still in startup, we increase cwnd by what * has been acked. */ if ((cwnd < target_cwnd) || (bbr->rc_past_init_win == 0)) { meth = 3; cwnd += bytes_this_ack; } else { /* * Method 4 means we are at target so no gain in * startup and past the initial window. 
*/ meth = 4; } } tp->snd_cwnd = max(cwnd, get_min_cwnd(bbr)); bbr_log_type_cwndupd(bbr, saved_bytes, sack_changed, prev_acked, meth, target_cwnd, th->th_ack, line); } static void tcp_bbr_partialack(struct tcpcb *tp) { struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= tp->snd_cwnd) { bbr->r_wanted_output = 1; } } static void bbr_post_recovery(struct tcpcb *tp) { struct tcp_bbr *bbr; uint32_t flight; INP_WLOCK_ASSERT(tp->t_inpcb); bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* * Here we just exit recovery. */ EXIT_RECOVERY(tp->t_flags); /* Lock in our b/w reduction for the specified number of pkt-epochs */ bbr->r_recovery_bw = 0; tp->snd_recover = tp->snd_una; tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); bbr->pkt_conservation = 0; if (bbr->rc_use_google == 0) { /* * For non-google mode lets * go ahead and make sure we clear * the recovery state so if we * bounce back in to recovery we * will do PC. */ bbr->bbr_prev_in_rec = 0; } bbr_log_type_exit_rec(bbr); if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent); bbr_log_type_cwndupd(bbr, 0, 0, 0, 15, 0, 0, __LINE__); } else { /* For probe-rtt case lets fix up its saved_cwnd */ if (bbr->r_ctl.rc_saved_cwnd < bbr->r_ctl.rc_cwnd_on_ent) { bbr->r_ctl.rc_saved_cwnd = bbr->r_ctl.rc_cwnd_on_ent; bbr_log_type_cwndupd(bbr, 0, 0, 0, 16, 0, 0, __LINE__); } } flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if ((bbr->rc_use_google == 0) && bbr_do_red) { uint64_t val, lr2use; uint32_t maxseg, newcwnd, acks_inflight, ratio, cwnd; uint32_t *cwnd_p; if (bbr_get_rtt(bbr, BBR_SRTT)) { val = ((uint64_t)bbr_get_rtt(bbr, BBR_RTT_PROP) * (uint64_t)1000); val /= bbr_get_rtt(bbr, BBR_SRTT); ratio = (uint32_t)val; } else ratio = 1000; bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, bbr->r_ctl.recovery_lr, 21, ratio, bbr->r_ctl.rc_red_cwnd_pe, __LINE__); if ((ratio < bbr_do_red) || (bbr_do_red == 0)) goto done; if (((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && bbr_prtt_slam_cwnd) || (bbr_sub_drain_slam_cwnd && (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && bbr->rc_hit_state_1 && (bbr_state_val(bbr) == BBR_SUB_DRAIN)) || ((bbr->rc_bbr_state == BBR_STATE_DRAIN) && bbr_slam_cwnd_in_main_drain)) { /* * Here we must poke at the saved cwnd * as well as the cwnd. */ cwnd = bbr->r_ctl.rc_saved_cwnd; cwnd_p = &bbr->r_ctl.rc_saved_cwnd; } else { cwnd = tp->snd_cwnd; cwnd_p = &tp->snd_cwnd; } maxseg = tp->t_maxseg - bbr->rc_last_options; /* Add the overall lr with the recovery lr */ if (bbr->r_ctl.rc_lost == 0) lr2use = 0; else if (bbr->r_ctl.rc_delivered == 0) lr2use = 1000; else { lr2use = bbr->r_ctl.rc_lost * 1000; lr2use /= bbr->r_ctl.rc_delivered; } lr2use += bbr->r_ctl.recovery_lr; acks_inflight = (flight / (maxseg * 2)); if (bbr_red_scale) { lr2use *= bbr_get_rtt(bbr, BBR_SRTT); lr2use /= bbr_red_scale; if ((bbr_red_growth_restrict) && ((bbr_get_rtt(bbr, BBR_SRTT)/bbr_red_scale) > 1)) bbr->r_ctl.restrict_growth += acks_inflight; } if (lr2use) { val = (uint64_t)cwnd * lr2use; val /= 1000; if (cwnd > val) newcwnd = roundup((cwnd - val), maxseg); else newcwnd = maxseg; } else { val = (uint64_t)cwnd * (uint64_t)bbr_red_mul; val /= (uint64_t)bbr_red_div; newcwnd = roundup((uint32_t)val, maxseg); } /* with standard delayed acks how many acks can I expect? */ if (bbr_drop_limit == 0) { /* * Anticpate how much we will * raise the cwnd based on the acks. 
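			 *
			 * With standard delayed acks roughly one ack arrives
			 * per two segments, so acks_inflight (flight divided
			 * by 2 * maxseg) estimates how many acks are still on
			 * the way; the check below enforces the minimum cwnd
			 * while crediting the growth those pending acks are
			 * expected to contribute.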
*/ if ((newcwnd + (acks_inflight * maxseg)) < get_min_cwnd(bbr)) { /* We do enforce the min (with the acks) */ newcwnd = (get_min_cwnd(bbr) - acks_inflight); } } else { /* * A strict drop limit of N is is inplace */ if (newcwnd < (bbr_drop_limit * maxseg)) { newcwnd = bbr_drop_limit * maxseg; } } /* For the next N acks do we restrict the growth */ *cwnd_p = newcwnd; if (tp->snd_cwnd > newcwnd) tp->snd_cwnd = newcwnd; bbr_log_type_cwndupd(bbr, bbr_red_mul, bbr_red_div, val, 22, (uint32_t)lr2use, bbr_get_rtt(bbr, BBR_SRTT), __LINE__); bbr->r_ctl.rc_red_cwnd_pe = bbr->r_ctl.rc_pkt_epoch; } done: bbr->r_ctl.recovery_lr = 0; if (flight <= tp->snd_cwnd) { bbr->r_wanted_output = 1; } tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); } static void bbr_setup_red_bw(struct tcp_bbr *bbr, uint32_t cts) { bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate); /* Limit the drop in b/w to 1/2 our current filter. */ if (bbr->r_ctl.red_bw > bbr->r_ctl.rc_bbr_cur_del_rate) bbr->r_ctl.red_bw = bbr->r_ctl.rc_bbr_cur_del_rate; if (bbr->r_ctl.red_bw < (get_filter_value(&bbr->r_ctl.rc_delrate) / 2)) bbr->r_ctl.red_bw = get_filter_value(&bbr->r_ctl.rc_delrate) / 2; tcp_bbr_tso_size_check(bbr, cts); } static void bbr_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type, struct bbr_sendmap *rsm) { struct tcp_bbr *bbr; INP_WLOCK_ASSERT(tp->t_inpcb); bbr = (struct tcp_bbr *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: if (!IN_RECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; /* Start a new epoch */ bbr_set_pktepoch(bbr, bbr->r_ctl.rc_rcvtime, __LINE__); if (bbr->rc_lt_is_sampling || bbr->rc_lt_use_bw) { /* * Move forward the lt epoch * so it won't count the truncated * epoch. */ bbr->r_ctl.rc_lt_epoch++; } if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { /* * Just like the policer detection code * if we are in startup we must push * forward the last startup epoch * to hide the truncated PE. */ bbr->r_ctl.rc_bbr_last_startup_epoch++; } bbr->r_ctl.rc_cwnd_on_ent = tp->snd_cwnd; ENTER_RECOVERY(tp->t_flags); bbr->rc_tlp_rtx_out = 0; bbr->r_ctl.recovery_lr = bbr->r_ctl.rc_pkt_epoch_loss_rate; tcp_bbr_tso_size_check(bbr, bbr->r_ctl.rc_rcvtime); if (bbr->rc_inp->inp_in_hpts && ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK) == 0)) { /* * When we enter recovery, we need to restart * any timers. This may mean we gain an agg * early, which will be made up for at the last * rxt out. */ bbr->rc_timer_first = 1; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } /* * Calculate a new cwnd based on to the current * delivery rate with no gain. We get the bdp * without gaining it up like we normally would and * we use the last cur_del_rate. */ if ((bbr->rc_use_google == 0) && (bbr->r_ctl.bbr_rttprobe_gain_val || (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT))) { tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + (tp->t_maxseg - bbr->rc_last_options); if (tp->snd_cwnd < get_min_cwnd(bbr)) { /* We always gate to min cwnd */ tp->snd_cwnd = get_min_cwnd(bbr); } bbr_log_type_cwndupd(bbr, 0, 0, 0, 14, 0, 0, __LINE__); } bbr_log_type_enter_rec(bbr, rsm->r_start); } break; case CC_RTO_ERR: KMOD_TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. 
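		 * The snd_cwnd_prev/snd_ssthresh_prev/snd_recover_prev values
		 * restored below were saved on the first retransmit in
		 * bbr_timeout_rxt() when t_badrxtwin was armed.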
		 */
		bbr_reset_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime);
		if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) {
			tp->snd_cwnd = tp->snd_cwnd_prev;
			tp->snd_ssthresh = tp->snd_ssthresh_prev;
			tp->snd_recover = tp->snd_recover_prev;
			tp->snd_cwnd = max(tp->snd_cwnd, bbr->r_ctl.rc_cwnd_on_ent);
			bbr_log_type_cwndupd(bbr, 0, 0, 0, 13, 0, 0, __LINE__);
		}
		tp->t_badrxtwin = 0;
		break;
	}
}

/*
 * Indicate whether this ack should be delayed. We can delay the ack if
 * the following conditions are met:
 *  - There is no delayed ack timer in progress.
 *  - Our last ack wasn't a 0-sized window. We never want to delay
 *    the ack that opens up a 0-sized window.
 *  - LRO wasn't used for this segment. We make sure by checking that the
 *    segment size is not larger than the MSS.
 *  - Delayed acks are enabled or this is a half-synchronized T/TCP
 *    connection.
 *  - The data being acked is less than a full segment (a stretch ack
 *    of more than a segment should be acked right away).
 *  - nsegs is 1 (if it is more than that we received more than 1 ack).
 */
#define DELAY_ACK(tp, bbr, nsegs)				\
	(((tp->t_flags & TF_RXWIN0SENT) == 0) &&		\
	 ((tp->t_flags & TF_DELACK) == 0) &&			\
	 ((bbr->bbr_segs_rcvd + nsegs) < tp->t_delayed_ack) &&	\
	 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))

/*
 * Return the lowest RSM in the map of
 * packets still in flight that is not acked.
 * This should normally find it on the first entry,
 * since we remove packets from the send
 * map after they are marked ACKED.
 */
static struct bbr_sendmap *
bbr_find_lowest_rsm(struct tcp_bbr *bbr)
{
	struct bbr_sendmap *rsm;

	/*
	 * Walk the time-order transmitted list looking for an rsm that is
	 * not acked. This will be the one that was sent the longest time
	 * ago that is still outstanding.
	 */
	TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_tmap, r_tnext) {
		if (rsm->r_flags & BBR_ACKED) {
			continue;
		}
		goto finish;
	}
finish:
	return (rsm);
}

static struct bbr_sendmap *
bbr_find_high_nonack(struct tcp_bbr *bbr, struct bbr_sendmap *rsm)
{
	struct bbr_sendmap *prsm;

	/*
	 * Walk the sequence-ordered list backward until we arrive at
	 * the highest seq not acked. In theory, when this is called, rsm
	 * should be the last segment (though that is not guaranteed).
	 */
	prsm = rsm;
	TAILQ_FOREACH_REVERSE_FROM(prsm, &bbr->r_ctl.rc_map, bbr_head, r_next) {
		if (prsm->r_flags & (BBR_ACKED | BBR_HAS_FIN)) {
			continue;
		}
		return (prsm);
	}
	return (NULL);
}

/*
 * Returns to the caller the number of microseconds that
 * the packet can be outstanding before we think we
 * should have had an ack returned.
 */
static uint32_t
bbr_calc_thresh_rack(struct tcp_bbr *bbr, uint32_t srtt, uint32_t cts, struct bbr_sendmap *rsm)
{
	/*
	 * lro is the flag we use to determine if we have seen reordering.
	 * If it gets set we have seen reordering. The reorder logic either
	 * works in one of two ways:
	 *
	 * If reorder-fade is configured, then we track the last time we saw
	 * re-ordering occur. If we reach the point where enough time has
	 * passed we no longer consider reordering to be occurring.
	 *
	 * Or if reorder-fade is 0, then once we see reordering we consider
	 * the connection to always be subject to reordering and just set lro
	 * to 1.
	 *
	 * In the end if lro is non-zero we add the extra time for
	 * reordering in.
	 */
	int32_t lro;
	uint32_t thresh, t_rxtcur;

	if (srtt == 0)
		srtt = 1;
	if (bbr->r_ctl.rc_reorder_ts) {
		if (bbr->r_ctl.rc_reorder_fade) {
			if (SEQ_GEQ(cts, bbr->r_ctl.rc_reorder_ts)) {
				lro = cts - bbr->r_ctl.rc_reorder_ts;
				if (lro == 0) {
					/*
					 * No time has passed since the last
					 * reorder, mark it as reordering.
					 */
					lro = 1;
				}
			} else {
				/*
				 * Negative time? The reorder timestamp is
				 * ahead of cts, so treat it as though no
				 * reordering time has accumulated.
*/ lro = 0; } if (lro > bbr->r_ctl.rc_reorder_fade) { /* Turn off reordering seen too */ bbr->r_ctl.rc_reorder_ts = 0; lro = 0; } } else { /* Reodering does not fade */ lro = 1; } } else { lro = 0; } thresh = srtt + bbr->r_ctl.rc_pkt_delay; if (lro) { /* It must be set, if not you get 1/4 rtt */ if (bbr->r_ctl.rc_reorder_shift) thresh += (srtt >> bbr->r_ctl.rc_reorder_shift); else thresh += (srtt >> 2); } else { thresh += 1000; } /* We don't let the rack timeout be above a RTO */ if ((bbr->rc_tp)->t_srtt == 0) t_rxtcur = BBR_INITIAL_RTO; else t_rxtcur = TICKS_2_USEC(bbr->rc_tp->t_rxtcur); if (thresh > t_rxtcur) { thresh = t_rxtcur; } /* And we don't want it above the RTO max either */ if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND); } bbr_log_thresh_choice(bbr, cts, thresh, lro, srtt, rsm, BBR_TO_FRM_RACK); return (thresh); } /* * Return to the caller the amount of time in mico-seconds * that should be used for the TLP timer from the last * send time of this packet. */ static uint32_t bbr_calc_thresh_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t srtt, uint32_t cts) { uint32_t thresh, len, maxseg, t_rxtcur; struct bbr_sendmap *prsm; if (srtt == 0) srtt = 1; if (bbr->rc_tlp_threshold) thresh = srtt + (srtt / bbr->rc_tlp_threshold); else thresh = (srtt * 2); maxseg = tp->t_maxseg - bbr->rc_last_options; /* Get the previous sent packet, if any */ len = rsm->r_end - rsm->r_start; /* 2.1 behavior */ prsm = TAILQ_PREV(rsm, bbr_head, r_tnext); if (prsm && (len <= maxseg)) { /* * Two packets outstanding, thresh should be (2*srtt) + * possible inter-packet delay (if any). */ uint32_t inter_gap = 0; int idx, nidx; idx = rsm->r_rtr_cnt - 1; nidx = prsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { /* Yes it was sent later (or at the same time) */ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; } thresh += inter_gap; } else if (len <= maxseg) { /* * Possibly compensate for delayed-ack. */ uint32_t alt_thresh; alt_thresh = srtt + (srtt / 2) + bbr_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } /* Not above the current RTO */ if (tp->t_srtt == 0) t_rxtcur = BBR_INITIAL_RTO; else t_rxtcur = TICKS_2_USEC(tp->t_rxtcur); bbr_log_thresh_choice(bbr, cts, thresh, t_rxtcur, srtt, rsm, BBR_TO_FRM_TLP); /* Not above an RTO */ if (thresh > t_rxtcur) { thresh = t_rxtcur; } /* Not above a RTO max */ if (thresh > (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND)) { thresh = (((uint32_t)bbr->rc_max_rto_sec) * USECS_IN_SECOND); } /* And now apply the user TLP min */ if (thresh < bbr_tlp_min) { thresh = bbr_tlp_min; } return (thresh); } /* * Return one of three RTTs to use (in microseconds). 
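 * The rtt_type argument selects BBR_RTT_PROP (the filtered rttprop, with a
 * srtt/BBR_INITIAL_RTO fallback when no measurement exists yet),
 * BBR_RTT_PKTRTT (the last pkt-epoch rtt), BBR_RTT_RACK (the last measured
 * rtt plus any ack-delivery delay) or BBR_SRTT (the smoothed rtt from the
 * tcpcb).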
*/ static __inline uint32_t bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type) { uint32_t f_rtt; uint32_t srtt; f_rtt = get_filter_value_small(&bbr->r_ctl.rc_rttprop); if (get_filter_value_small(&bbr->r_ctl.rc_rttprop) == 0xffffffff) { /* We have no rtt at all */ if (bbr->rc_tp->t_srtt == 0) f_rtt = BBR_INITIAL_RTO; else f_rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); /* * Since we don't know how good the rtt is apply a * delayed-ack min */ if (f_rtt < bbr_delayed_ack_time) { f_rtt = bbr_delayed_ack_time; } } /* Take the filter version or last measured pkt-rtt */ if (rtt_type == BBR_RTT_PROP) { srtt = f_rtt; } else if (rtt_type == BBR_RTT_PKTRTT) { if (bbr->r_ctl.rc_pkt_epoch_rtt) { srtt = bbr->r_ctl.rc_pkt_epoch_rtt; } else { /* No pkt rtt yet */ srtt = f_rtt; } } else if (rtt_type == BBR_RTT_RACK) { srtt = bbr->r_ctl.rc_last_rtt; /* We need to add in any internal delay for our timer */ if (bbr->rc_ack_was_delayed) srtt += bbr->r_ctl.rc_ack_hdwr_delay; } else if (rtt_type == BBR_SRTT) { srtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); } else { /* TSNH */ srtt = f_rtt; #ifdef BBR_INVARIANTS panic("Unknown rtt request type %d", rtt_type); #endif } return (srtt); } static int bbr_is_lost(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts) { uint32_t thresh; thresh = bbr_calc_thresh_rack(bbr, bbr_get_rtt(bbr, BBR_RTT_RACK), cts, rsm); if ((cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) >= thresh) { /* It is lost (past time) */ return (1); } return (0); } /* * Return a sendmap if we need to retransmit something. */ static struct bbr_sendmap * bbr_check_recovery_mode(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* * Check to see that we don't need to fall into recovery. We will * need to do so if our oldest transmit is past the time we should * have had an ack. */ struct bbr_sendmap *rsm; int32_t idx; if (TAILQ_EMPTY(&bbr->r_ctl.rc_map)) { /* Nothing outstanding that we know of */ return (NULL); } rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if (rsm == NULL) { /* Nothing in the transmit map */ return (NULL); } if (tp->t_flags & TF_SENTFIN) { /* Fin restricted, don't find anything once a fin is sent */ return (NULL); } if (rsm->r_flags & BBR_ACKED) { /* * Ok the first one is acked (this really should not happen * since we remove the from the tmap once they are acked) */ rsm = bbr_find_lowest_rsm(bbr); if (rsm == NULL) return (NULL); } idx = rsm->r_rtr_cnt - 1; if (SEQ_LEQ(cts, rsm->r_tim_lastsent[idx])) { /* Send timestamp is the same or less? can't be ready */ return (NULL); } /* Get our RTT time */ if (bbr_is_lost(bbr, rsm, cts) && ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || (rsm->r_flags & BBR_SACK_PASSED))) { if ((rsm->r_flags & BBR_MARKED_LOST) == 0) { rsm->r_flags |= BBR_MARKED_LOST; bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; } bbr_cong_signal(tp, NULL, CC_NDUPACK, rsm); #ifdef BBR_INVARIANTS if ((rsm->r_end - rsm->r_start) == 0) panic("tp:%p bbr:%p rsm:%p length is 0?", tp, bbr, rsm); #endif return (rsm); } return (NULL); } /* * RACK Timer, here we simply do logging and house keeping. * the normal bbr_output_wtime() function will call the * appropriate thing to check if we need to do a RACK retransmit. * We return 1, saying don't proceed with bbr_output_wtime only * when all timers have been stopped (destroyed PCB?). */ static int bbr_timeout_rack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* * This timer simply provides an internal trigger to send out data. 
* The check_recovery_mode call will see if there are needed * retransmissions, if so we will enter fast-recovery. The output * call may or may not do the same thing depending on sysctl * settings. */ uint32_t lost; if (bbr->rc_all_timers_stopped) { return (1); } if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { /* Its not time yet */ return (0); } BBR_STAT_INC(bbr_to_tot); lost = bbr->r_ctl.rc_lost; if (bbr->r_state && (bbr->r_state != tp->t_state)) bbr_set_state(tp, bbr, 0); bbr_log_to_event(bbr, cts, BBR_TO_FRM_RACK); if (bbr->r_ctl.rc_resend == NULL) { /* Lets do the check here */ bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); } if (bbr_policer_call_from_rack_to) bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost)); bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; return (0); } static __inline void bbr_clone_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *nrsm, struct bbr_sendmap *rsm, uint32_t start) { int idx; nrsm->r_start = start; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; /* We don't transfer forward the SYN flag */ nrsm->r_flags &= ~BBR_HAS_SYN; /* We move forward the FIN flag, not that this should happen */ rsm->r_flags &= ~BBR_HAS_FIN; nrsm->r_dupack = rsm->r_dupack; nrsm->r_rtr_bytes = 0; nrsm->r_is_gain = rsm->r_is_gain; nrsm->r_is_drain = rsm->r_is_drain; nrsm->r_delivered = rsm->r_delivered; nrsm->r_ts_valid = rsm->r_ts_valid; nrsm->r_del_ack_ts = rsm->r_del_ack_ts; nrsm->r_del_time = rsm->r_del_time; nrsm->r_app_limited = rsm->r_app_limited; nrsm->r_first_sent_time = rsm->r_first_sent_time; nrsm->r_flight_at_send = rsm->r_flight_at_send; /* We split a piece the lower section looses any just_ret flag. */ nrsm->r_bbr_state = rsm->r_bbr_state; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } rsm->r_end = nrsm->r_start; idx = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); idx /= 8; /* Check if we got too small */ if ((rsm->r_is_smallmap == 0) && ((rsm->r_end - rsm->r_start) <= idx)) { bbr->r_ctl.rc_num_small_maps_alloced++; rsm->r_is_smallmap = 1; } /* Check the new one as well */ if ((nrsm->r_end - nrsm->r_start) <= idx) { bbr->r_ctl.rc_num_small_maps_alloced++; nrsm->r_is_smallmap = 1; } } static int bbr_sack_mergable(struct bbr_sendmap *at, uint32_t start, uint32_t end) { /* * Given a sack block defined by * start and end, and a current postion * at. Return 1 if either side of at * would show that the block is mergable * to that side. A block to be mergable * must have overlap with the start/end * and be in the SACK'd state. 
*/ struct bbr_sendmap *l_rsm; struct bbr_sendmap *r_rsm; /* first get the either side blocks */ l_rsm = TAILQ_PREV(at, bbr_head, r_next); r_rsm = TAILQ_NEXT(at, r_next); if (l_rsm && (l_rsm->r_flags & BBR_ACKED)) { /* Potentially mergeable */ if ((l_rsm->r_end == start) || (SEQ_LT(start, l_rsm->r_end) && SEQ_GT(end, l_rsm->r_end))) { /* * map blk |------| * sack blk |------| * * map blk |------| * sack blk |------| */ return (1); } } if (r_rsm && (r_rsm->r_flags & BBR_ACKED)) { /* Potentially mergeable */ if ((r_rsm->r_start == end) || (SEQ_LT(start, r_rsm->r_start) && SEQ_GT(end, r_rsm->r_start))) { /* * map blk |---------| * sack blk |----| * * map blk |---------| * sack blk |-------| */ return (1); } } return (0); } static struct bbr_sendmap * bbr_merge_rsm(struct tcp_bbr *bbr, struct bbr_sendmap *l_rsm, struct bbr_sendmap *r_rsm) { /* * We are merging two ack'd RSM's, * the l_rsm is on the left (lower seq * values) and the r_rsm is on the right * (higher seq value). The simplest way * to merge these is to move the right * one into the left. I don't think there * is any reason we need to try to find * the oldest (or last oldest retransmitted). */ l_rsm->r_end = r_rsm->r_end; if (l_rsm->r_dupack < r_rsm->r_dupack) l_rsm->r_dupack = r_rsm->r_dupack; if (r_rsm->r_rtr_bytes) l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; if (r_rsm->r_in_tmap) { /* This really should not happen */ TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, r_rsm, r_tnext); } if (r_rsm->r_app_limited) l_rsm->r_app_limited = r_rsm->r_app_limited; /* Now the flags */ if (r_rsm->r_flags & BBR_HAS_FIN) l_rsm->r_flags |= BBR_HAS_FIN; if (r_rsm->r_flags & BBR_TLP) l_rsm->r_flags |= BBR_TLP; if (r_rsm->r_flags & BBR_RWND_COLLAPSED) l_rsm->r_flags |= BBR_RWND_COLLAPSED; if (r_rsm->r_flags & BBR_MARKED_LOST) { /* This really should not happen */ bbr->r_ctl.rc_lost_bytes -= r_rsm->r_end - r_rsm->r_start; } TAILQ_REMOVE(&bbr->r_ctl.rc_map, r_rsm, r_next); if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { /* Transfer the split limit to the map we free */ r_rsm->r_limit_type = l_rsm->r_limit_type; l_rsm->r_limit_type = 0; } bbr_free(bbr, r_rsm); return(l_rsm); } /* * TLP Timer, here we simply setup what segment we want to * have the TLP expire on, the normal bbr_output_wtime() will then * send it out. * * We return 1, saying don't proceed with bbr_output_wtime only * when all timers have been stopped (destroyed PCB?). */ static int bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* * Tail Loss Probe. */ struct bbr_sendmap *rsm = NULL; struct socket *so; uint32_t amm; uint32_t out, avail; uint32_t maxseg; int collapsed_win = 0; if (bbr->rc_all_timers_stopped) { return (1); } if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { /* Its not time yet */ return (0); } if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); return (1); } /* Did we somehow get into persists? */ if (bbr->rc_in_persist) { return (0); } if (bbr->r_state && (bbr->r_state != tp->t_state)) bbr_set_state(tp, bbr, 0); BBR_STAT_INC(bbr_tlp_tot); maxseg = tp->t_maxseg - bbr->rc_last_options; /* * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. 
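	 * Two cases follow: if un-sent data is available (and it fills at
	 * least an MSS, or nodelay is on) we arrange to send new data;
	 * otherwise we pick the last un-acked segment, splitting it down
	 * to one MSS if needed, and retransmit that as the probe.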
*/ so = tp->t_inpcb->inp_socket; avail = sbavail(&so->so_snd); out = ctf_outstanding(tp); if (out > tp->snd_wnd) { /* special case, we need a retransmission */ collapsed_win = 1; goto need_retran; } if (avail > out) { /* New data is available */ amm = avail - out; if (amm > maxseg) { amm = maxseg; } else if ((amm < maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { /* not enough to fill a MTU and no-delay is off */ goto need_retran; } /* Set the send-new override */ if ((out + amm) <= tp->snd_wnd) { bbr->rc_tlp_new_data = 1; } else { goto need_retran; } bbr->r_ctl.rc_tlp_seg_send_cnt = 0; bbr->r_ctl.rc_last_tlp_seq = tp->snd_max; bbr->r_ctl.rc_tlp_send = NULL; /* cap any slots */ BBR_STAT_INC(bbr_tlp_newdata); goto send; } need_retran: /* * Ok we need to arrange the last un-acked segment to be re-sent, or * optionally the first un-acked segment. */ if (collapsed_win == 0) { rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); if (rsm && (BBR_ACKED | BBR_HAS_FIN)) { rsm = bbr_find_high_nonack(bbr, rsm); } if (rsm == NULL) { goto restore; } } else { /* * We must find the last segment * that was acceptable by the client. */ TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { if ((rsm->r_flags & BBR_RWND_COLLAPSED) == 0) { /* Found one */ break; } } if (rsm == NULL) { /* None? if so send the first */ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); if (rsm == NULL) goto restore; } } if ((rsm->r_end - rsm->r_start) > maxseg) { /* * We need to split this the last segment in two. */ struct bbr_sendmap *nrsm; nrsm = bbr_alloc_full_limit(bbr); if (nrsm == NULL) { /* * We can't get memory to split, we can either just * not split it. Or retransmit the whole piece, lets * do the large send (BTLP :-) ). */ goto go_for_it; } bbr_clone_rsm(bbr, nrsm, rsm, (rsm->r_end - maxseg)); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~BBR_HAS_FIN); rsm = nrsm; } go_for_it: bbr->r_ctl.rc_tlp_send = rsm; bbr->rc_tlp_rtx_out = 1; if (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) { bbr->r_ctl.rc_tlp_seg_send_cnt++; tp->t_rxtshift++; } else { bbr->r_ctl.rc_last_tlp_seq = rsm->r_start; bbr->r_ctl.rc_tlp_seg_send_cnt = 1; } send: if (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend) { /* * Can't [re]/transmit a segment we have retranmitted the * max times. We need the retransmit timer to take over. */ restore: bbr->rc_tlp_new_data = 0; bbr->r_ctl.rc_tlp_send = NULL; if (rsm) rsm->r_flags &= ~BBR_TLP; BBR_STAT_INC(bbr_tlp_retran_fail); return (0); } else if (rsm) { rsm->r_flags |= BBR_TLP; } if (rsm && (rsm->r_start == bbr->r_ctl.rc_last_tlp_seq) && (bbr->r_ctl.rc_tlp_seg_send_cnt > bbr_tlp_max_resend)) { /* * We have retransmitted to many times for TLP. Switch to * the regular RTO timer */ goto restore; } bbr_log_to_event(bbr, cts, BBR_TO_FRM_TLP); bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } /* * Delayed ack Timer, here we simply need to setup the * ACK_NOW flag and remove the DELACK flag. From there * the output routine will send the ack out. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). 
*/ static int bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { if (bbr->rc_all_timers_stopped) { return (1); } bbr_log_to_event(bbr, cts, BBR_TO_FRM_DELACK); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; KMOD_TCPSTAT_INC(tcps_delack); bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; return (0); } /* * Here we send a KEEP-ALIVE like probe to the * peer, we do not send data. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { struct tcptemp *t_template; int32_t retval = 1; if (bbr->rc_all_timers_stopped) { return (1); } if (bbr->rc_in_persist == 0) return (0); KASSERT(tp->t_inpcb != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); /* * Persistence timer into zero window. Force a byte to be output, if * possible. */ bbr_log_to_event(bbr, cts, BBR_TO_FRM_PERSIST); bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; KMOD_TCPSTAT_INC(tcps_persisttimeo); /* * Have we exceeded the user specified progress time? */ if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } /* * Hack: if the peer is dead/unreachable, we do not time out if the * window is closed. After a full backoff, drop the connection if * the idle time (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { KMOD_TCPSTAT_INC(tcps_persistdrop); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } if ((sbavail(&bbr->rc_inp->inp_socket->so_snd) == 0) && tp->snd_una == tp->snd_max) { bbr_exit_persist(tp, bbr, cts, __LINE__); retval = 0; goto out; } /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. */ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { KMOD_TCPSTAT_INC(tcps_persistdrop); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } t_template = tcpip_maketemplate(bbr->rc_inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); /* This sends an ack */ if (tp->t_flags & TF_DELACK) tp->t_flags &= ~TF_DELACK; free(t_template, M_TEMP); } if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; bbr_start_hpts_timer(bbr, tp, cts, 3, 0, 0); out: return (retval); } /* * If a keepalive goes off, we had no other timers * happening. We always return 1 here since this * routine either drops the connection or sends * out a segment with respond. */ static int bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { struct tcptemp *t_template; struct inpcb *inp; if (bbr->rc_all_timers_stopped) { return (1); } bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; bbr_log_to_event(bbr, cts, BBR_TO_FRM_KEEP); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. 
*/ KMOD_TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response if the peer is * up and reachable: either an ACK if the connection is * still alive, or an RST if the peer has closed the * connection due to timeout or reboot. Using sequence * number tp->snd_una-1 causes the transmitted zero-length * segment to lie outside the receive window; by the * protocol spec, this requires the correspondent TCP to * respond. */ KMOD_TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } } bbr_start_hpts_timer(bbr, tp, cts, 4, 0, 0); return (1); dropit: KMOD_TCPSTAT_INC(tcps_keepdrops); tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); return (1); } /* * Retransmit helper function, clear up all the ack * flags and take care of important book keeping. */ static void bbr_remxt_tmr(struct tcpcb *tp) { /* * The retransmit timer went off, all sack'd blocks must be * un-acked. */ struct bbr_sendmap *rsm, *trsm = NULL; struct tcp_bbr *bbr; uint32_t cts, lost; bbr = (struct tcp_bbr *)tp->t_fb_ptr; cts = tcp_get_usecs(&bbr->rc_tv); lost = bbr->r_ctl.rc_lost; if (bbr->r_state && (bbr->r_state != tp->t_state)) bbr_set_state(tp, bbr, 0); TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { if (rsm->r_flags & BBR_ACKED) { uint32_t old_flags; rsm->r_dupack = 0; if (rsm->r_in_tmap == 0) { /* We must re-add it back to the tlist */ if (trsm == NULL) { TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext); } else { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, trsm, rsm, r_tnext); } rsm->r_in_tmap = 1; } old_flags = rsm->r_flags; rsm->r_flags |= BBR_RXT_CLEARED; rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS); bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__); } else { if ((tp->t_state < TCPS_ESTABLISHED) && (rsm->r_start == tp->snd_una)) { /* * Special case for TCP FO. Where * we sent more data beyond the snd_max. * We don't mark that as lost and stop here. */ break; } if ((rsm->r_flags & BBR_MARKED_LOST) == 0) { bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; } if (bbr_marks_rxt_sack_passed) { /* * With this option, we will rack out * in 1ms increments the rest of the packets. */ rsm->r_flags |= BBR_SACK_PASSED | BBR_MARKED_LOST; rsm->r_flags &= ~BBR_WAS_SACKPASS; } else { /* * With this option we only mark them lost * and remove all sack'd markings. We will run * another RXT or a TLP. This will cause * us to eventually send more based on what * ack's come in. 
*/ rsm->r_flags |= BBR_MARKED_LOST; rsm->r_flags &= ~BBR_WAS_SACKPASS; rsm->r_flags &= ~BBR_SACK_PASSED; } } trsm = rsm; } bbr->r_ctl.rc_resend = TAILQ_FIRST(&bbr->r_ctl.rc_map); /* Clear the count (we just un-acked them) */ bbr_log_to_event(bbr, cts, BBR_TO_FRM_TMR); bbr->rc_tlp_new_data = 0; bbr->r_ctl.rc_tlp_seg_send_cnt = 0; /* zap the behindness on a rxt */ bbr->r_ctl.rc_hptsi_agg_delay = 0; bbr->r_agg_early_set = 0; bbr->r_ctl.rc_agg_early = 0; bbr->rc_tlp_rtx_out = 0; bbr->r_ctl.rc_sacked = 0; bbr->r_ctl.rc_sacklast = NULL; bbr->r_timer_override = 1; bbr_lt_bw_sampling(bbr, cts, (bbr->r_ctl.rc_lost > lost)); } /* * Re-transmit timeout! If we drop the PCB we will return 1, otherwise * we will setup to retransmit the lowest seq number outstanding. */ static int bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { int32_t rexmt; int32_t retval = 0; bool isipv6; bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; if (bbr->rc_all_timers_stopped) { return (1); } if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_una == tp->snd_max)) { /* Nothing outstanding .. nothing to do */ return (0); } /* * Retransmission timer went off. Message has not been acked within * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ if (ctf_progress_timeout_check(tp, true)) { retval = 1; bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT); goto out; } bbr_remxt_tmr(tp); if ((bbr->r_ctl.rc_resend == NULL) || ((bbr->r_ctl.rc_resend->r_flags & BBR_RWND_COLLAPSED) == 0)) { /* * If the rwnd collapsed on * the one we are retransmitting * it does not count against the * rxt count. */ tp->t_rxtshift++; } if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; KMOD_TCPSTAT_INC(tcps_timeoutdrop); retval = 1; tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); tcp_set_inp_to_drop(bbr->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; } if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be limited * to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can be * recovered if this turns out to be a "bad" retransmit. A * retransmit is considered "bad" if an ACK for this segment * is received within RTT/2 interval; the assumption here is * that the ACK was already in flight. See "On Estimating * End-to-End Network Path Properties" by Allman and Paxson * for more details. 
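		 * If such an ACK does arrive inside t_badrxtwin, the
		 * CC_RTO_ERR case in bbr_cong_signal() restores these saved
		 * values.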
*/ tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; if (!IN_RECOVERY(tp->t_flags)) { tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else { tp->t_flags &= ~TF_PREVVALID; } tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; } else { tp->snd_cwnd = tp->t_maxseg - bbr->rc_last_options; tp->t_flags &= ~TF_PREVVALID; } KMOD_TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = USEC_2_TICKS(BBR_INITIAL_RTO) * tcp_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000)); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple * of packets and process straight to FIN. In that case we won't * catch ESTABLISHED state. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; #else isipv6 = false; #endif if (((V_tcp_pmtud_blackhole_detect == 1) || (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && ((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1))) { /* * Idea here is that at each stage of mtu probe (usually, * 1448 -> 1188 -> 524) should be given 2 chances to recover * before further clamping down. 'tp->t_rxtshift % 2 == 0' * should take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: - * Disable Path MTU Discovery (IP "DF" bit). - * Reduce MTU to lower value than what we negotiated * with peer. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* * Record that we may have found a black * hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the * default in an attempt to retransmit. */ #ifdef INET6 isipv6 = bbr->r_is_v6; if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole * and we restore the previous MSS and blackhole * detection flags. 
The limit '6' is determined by * giving each probe stage (1448, 1188, 524) 2 * chances to recover. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); } } } /* * Disable RFC1323 and SACK if we haven't got any response to our * third SYN to work-around some broken terminal servers (most of * which have hopefully been retired) that have bad VJ header * compression code which trashes TCP segments containing * unknown-to-them TCP options. */ if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current retransmit * times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if (bbr->r_is_v6) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; out: return (retval); } static int bbr_process_timers(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, uint8_t hpts_calling) { int32_t ret = 0; int32_t timers = (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK); if (timers == 0) { return (0); } if (tp->t_state == TCPS_LISTEN) { /* no timers on listen sockets */ if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) return (0); return (1); } if (TSTMP_LT(cts, bbr->r_ctl.rc_timer_exp)) { uint32_t left; if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { ret = -1; bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling); return (0); } if (hpts_calling == 0) { ret = -2; bbr_log_to_processing(bbr, cts, ret, 0, hpts_calling); return (0); } /* * Ok our timer went off early and we are not paced false * alarm, go back to sleep. */ left = bbr->r_ctl.rc_timer_exp - cts; ret = -3; bbr_log_to_processing(bbr, cts, ret, left, hpts_calling); tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(left)); return (1); } bbr->rc_tmr_stopped = 0; bbr->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; if (timers & PACE_TMR_DELACK) { ret = bbr_timeout_delack(tp, bbr, cts); } else if (timers & PACE_TMR_PERSIT) { ret = bbr_timeout_persist(tp, bbr, cts); } else if (timers & PACE_TMR_RACK) { bbr->r_ctl.rc_tlp_rxt_last_time = cts; ret = bbr_timeout_rack(tp, bbr, cts); } else if (timers & PACE_TMR_TLP) { bbr->r_ctl.rc_tlp_rxt_last_time = cts; ret = bbr_timeout_tlp(tp, bbr, cts); } else if (timers & PACE_TMR_RXT) { bbr->r_ctl.rc_tlp_rxt_last_time = cts; ret = bbr_timeout_rxt(tp, bbr, cts); } else if (timers & PACE_TMR_KEEP) { ret = bbr_timeout_keepalive(tp, bbr, cts); } bbr_log_to_processing(bbr, cts, ret, timers, hpts_calling); return (ret); } static void bbr_timer_cancel(struct tcp_bbr *bbr, int32_t line, uint32_t cts) { if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { uint8_t hpts_removed = 0; if (bbr->rc_inp->inp_in_hpts && (bbr->rc_timer_first == 1)) { /* * If we are canceling timer's when we have the * timer ahead of the output being paced. We also * must remove ourselves from the hpts. 
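			 * Since the pacer slot was already charged when we
			 * went onto the hpts, we also shave the time spent
			 * waiting so far (cts - rc_pacer_started) off
			 * rc_last_delay_val and restart the pacer clock at cts.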
*/ hpts_removed = 1; tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); if (bbr->r_ctl.rc_last_delay_val) { /* Update the last hptsi delay too */ uint32_t time_since_send; if (TSTMP_GT(cts, bbr->rc_pacer_started)) time_since_send = cts - bbr->rc_pacer_started; else time_since_send = 0; if (bbr->r_ctl.rc_last_delay_val > time_since_send) { /* Cut down our slot time */ bbr->r_ctl.rc_last_delay_val -= time_since_send; } else { bbr->r_ctl.rc_last_delay_val = 0; } bbr->rc_pacer_started = cts; } } bbr->rc_timer_first = 0; bbr_log_to_cancel(bbr, line, cts, hpts_removed); bbr->rc_tmr_stopped = bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK; bbr->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); } } static void bbr_timer_stop(struct tcpcb *tp, uint32_t timer_type) { struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; bbr->rc_all_timers_stopped = 1; return; } /* * stop all timers always returning 0. */ static int bbr_stopall(struct tcpcb *tp) { return (0); } static void bbr_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) { return; } /* * return true if a bbr timer (rack or tlp) is active. */ static int bbr_timer_active(struct tcpcb *tp, uint32_t timer_type) { return (0); } static uint32_t bbr_get_earliest_send_outstanding(struct tcp_bbr *bbr, struct bbr_sendmap *u_rsm, uint32_t cts) { struct bbr_sendmap *rsm; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if ((rsm == NULL) || (u_rsm == rsm)) return (cts); return(rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); } static void bbr_update_rsm(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts, uint32_t pacing_time) { int32_t idx; rsm->r_rtr_cnt++; rsm->r_dupack = 0; if (rsm->r_rtr_cnt > BBR_NUM_OF_RETRANS) { rsm->r_rtr_cnt = BBR_NUM_OF_RETRANS; rsm->r_flags |= BBR_OVERMAX; } if (rsm->r_flags & BBR_RWND_COLLAPSED) { /* Take off the collapsed flag at rxt */ rsm->r_flags &= ~BBR_RWND_COLLAPSED; } if (rsm->r_flags & BBR_MARKED_LOST) { /* We have retransmitted, its no longer lost */ rsm->r_flags &= ~BBR_MARKED_LOST; bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; } if (rsm->r_flags & BBR_RXT_CLEARED) { /* * We hit a RXT timer on it and * we cleared the "acked" flag. * We now have it going back into * flight, we can remove the cleared * flag and possibly do accounting on * this piece. 
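	 * The accounting in question appears to be the rc_holes_rxt and
	 * r_rtr_bytes updates just below, which only count retransmissions
	 * that were not TLP probes.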
*/ rsm->r_flags &= ~BBR_RXT_CLEARED; } if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & BBR_TLP) == 0)) { bbr->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = cts; rsm->r_pacing_delay = pacing_time; rsm->r_delivered = bbr->r_ctl.rc_delivered; rsm->r_ts_valid = bbr->rc_ts_valid; if (bbr->rc_ts_valid) rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts; if (bbr->r_ctl.r_app_limited_until) rsm->r_app_limited = 1; else rsm->r_app_limited = 0; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) rsm->r_bbr_state = bbr_state_val(bbr); else rsm->r_bbr_state = 8; if (rsm->r_flags & BBR_ACKED) { /* Problably MTU discovery messing with us */ uint32_t old_flags; old_flags = rsm->r_flags; rsm->r_flags &= ~BBR_ACKED; bbr_log_type_rsmclear(bbr, cts, rsm, old_flags, __LINE__); bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); if (bbr->r_ctl.rc_sacked == 0) bbr->r_ctl.rc_sacklast = NULL; } if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); } TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (rsm->r_flags & BBR_SACK_PASSED) { /* We have retransmitted due to the SACK pass */ rsm->r_flags &= ~BBR_SACK_PASSED; rsm->r_flags |= BBR_WAS_SACKPASS; } rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts); rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) { rsm->r_is_gain = 1; rsm->r_is_drain = 0; } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) { rsm->r_is_drain = 1; rsm->r_is_gain = 0; } else { rsm->r_is_drain = 0; rsm->r_is_gain = 0; } rsm->r_del_time = bbr->r_ctl.rc_del_time; /* TEMP GOOGLE CODE */ } /* * Returns 0, or the sequence where we stopped * updating. We also update the lenp to be the amount * of data left. */ static uint32_t bbr_update_entry(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t cts, int32_t *lenp, uint32_t pacing_time) { /* * We (re-)transmitted starting at rsm->r_start for some length * (possibly less than r_end. */ struct bbr_sendmap *nrsm; uint32_t c_end; int32_t len; len = *lenp; c_end = rsm->r_start + len; if (SEQ_GEQ(c_end, rsm->r_end)) { /* * We retransmitted the whole piece or more than the whole * slopping into the next rsm. */ bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); if (c_end == rsm->r_end) { *lenp = 0; return (0); } else { int32_t act_len; /* Hangs over the end return whats left */ act_len = rsm->r_end - rsm->r_start; *lenp = (len - act_len); return (rsm->r_end); } /* We don't get out of this block. */ } /* * Here we retransmitted less than the whole thing which means we * have to split this into what was transmitted and what was not. */ nrsm = bbr_alloc_full_limit(bbr); if (nrsm == NULL) { *lenp = 0; return (0); } /* * So here we are going to take the original rsm and make it what we * retransmitted. nrsm will be the tail portion we did not * retransmit. For example say the chunk was 1, 11 (10 bytes). And * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to * 1, 6 and the new piece will be 6, 11. 
*/ bbr_clone_rsm(bbr, nrsm, rsm, c_end); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); nrsm->r_dupack = 0; if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~BBR_HAS_FIN); bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); *lenp = 0; return (0); } static uint64_t bbr_get_hardware_rate(struct tcp_bbr *bbr) { uint64_t bw; bw = bbr_get_bw(bbr); bw *= (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]; bw /= (uint64_t)BBR_UNIT; return(bw); } static void bbr_setup_less_of_rate(struct tcp_bbr *bbr, uint32_t cts, uint64_t act_rate, uint64_t rate_wanted) { /* * We could not get a full gains worth * of rate. */ if (get_filter_value(&bbr->r_ctl.rc_delrate) >= act_rate) { /* we can't even get the real rate */ uint64_t red; bbr->skip_gain = 1; bbr->gain_is_limited = 0; red = get_filter_value(&bbr->r_ctl.rc_delrate) - act_rate; if (red) filter_reduce_by(&bbr->r_ctl.rc_delrate, red, cts); } else { /* We can use a lower gain */ bbr->skip_gain = 0; bbr->gain_is_limited = 1; } } static void bbr_update_hardware_pacing_rate(struct tcp_bbr *bbr, uint32_t cts) { const struct tcp_hwrate_limit_table *nrte; int error, rate = -1; if (bbr->r_ctl.crte == NULL) return; if ((bbr->rc_inp->inp_route.ro_nh == NULL) || (bbr->rc_inp->inp_route.ro_nh->nh_ifp == NULL)) { /* Lost our routes? */ /* Clear the way for a re-attempt */ bbr->bbr_attempt_hdwr_pace = 0; lost_rate: bbr->gain_is_limited = 0; bbr->skip_gain = 0; bbr->bbr_hdrw_pacing = 0; counter_u64_add(bbr_flows_whdwr_pacing, -1); counter_u64_add(bbr_flows_nohdwr_pacing, 1); tcp_bbr_tso_size_check(bbr, cts); return; } rate = bbr_get_hardware_rate(bbr); nrte = tcp_chg_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp, bbr->rc_inp->inp_route.ro_nh->nh_ifp, rate, (RS_PACING_GEQ|RS_PACING_SUB_OK), &error, NULL); if (nrte == NULL) { goto lost_rate; } if (nrte != bbr->r_ctl.crte) { bbr->r_ctl.crte = nrte; if (error == 0) { BBR_STAT_INC(bbr_hdwr_rl_mod_ok); if (bbr->r_ctl.crte->rate < rate) { /* We have a problem */ bbr_setup_less_of_rate(bbr, cts, bbr->r_ctl.crte->rate, rate); } else { /* We are good */ bbr->gain_is_limited = 0; bbr->skip_gain = 0; } } else { /* A failure should release the tag */ BBR_STAT_INC(bbr_hdwr_rl_mod_fail); bbr->gain_is_limited = 0; bbr->skip_gain = 0; bbr->bbr_hdrw_pacing = 0; } bbr_type_log_hdwr_pacing(bbr, bbr->r_ctl.crte->ptbl->rs_ifp, rate, ((bbr->r_ctl.crte == NULL) ? 0 : bbr->r_ctl.crte->rate), __LINE__, cts, error); } } static void bbr_adjust_for_hw_pacing(struct tcp_bbr *bbr, uint32_t cts) { /* * If we have hardware pacing support * we need to factor that in for our * TSO size. */ const struct tcp_hwrate_limit_table *rlp; uint32_t cur_delay, seg_sz, maxseg, new_tso, delta, hdwr_delay; if ((bbr->bbr_hdrw_pacing == 0) || (IN_RECOVERY(bbr->rc_tp->t_flags)) || (bbr->r_ctl.crte == NULL)) return; if (bbr->hw_pacing_set == 0) { /* Not yet by the hdwr pacing count delay */ return; } if (bbr_hdwr_pace_adjust == 0) { /* No adjustment */ return; } rlp = bbr->r_ctl.crte; if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; else maxseg = BBR_MIN_SEG - bbr->rc_last_options; /* * So lets first get the * time we will take between * TSO sized sends currently without * hardware help. 
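	 *
	 * A worked example with illustrative numbers only: if the software
	 * pacer would wait 45,000 usecs between full TSO bursts (cur_delay)
	 * and the burst is 30 segments with the hardware spacing them
	 * 1,000 usecs apart (hdwr_delay = 30,000), then delta is the
	 * 15,000 usecs of extra gap software pacing adds on top of what
	 * the hardware already provides.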
*/ cur_delay = bbr_get_pacing_delay(bbr, BBR_UNIT, bbr->r_ctl.rc_pace_max_segs, cts, 1); hdwr_delay = bbr->r_ctl.rc_pace_max_segs / maxseg; hdwr_delay *= rlp->time_between; if (cur_delay > hdwr_delay) delta = cur_delay - hdwr_delay; else delta = 0; bbr_log_type_tsosize(bbr, cts, delta, cur_delay, hdwr_delay, (bbr->r_ctl.rc_pace_max_segs / maxseg), 1); if (delta && (delta < (max(rlp->time_between, bbr->r_ctl.bbr_hptsi_segments_delay_tar)))) { /* * Now lets divide by the pacing * time between each segment the * hardware sends rounding up and * derive a bytes from that. We multiply * that by bbr_hdwr_pace_adjust to get * more bang for our buck. * * The goal is to have the software pacer * waiting no more than an additional * pacing delay if we can (without the * compensation i.e. x bbr_hdwr_pace_adjust). */ seg_sz = max(((cur_delay + rlp->time_between)/rlp->time_between), (bbr->r_ctl.rc_pace_max_segs/maxseg)); seg_sz *= bbr_hdwr_pace_adjust; if (bbr_hdwr_pace_floor && (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) { /* Currently hardware paces * out rs_min_seg segments at a time. * We need to make sure we always send at least * a full burst of bbr_hdwr_pace_floor down. */ seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg; } seg_sz *= maxseg; } else if (delta == 0) { /* * The highest pacing rate is * above our b/w gained. This means * we probably are going quite fast at * the hardware highest rate. Lets just multiply * the calculated TSO size by the * multiplier factor (its probably * 4 segments in the default config for * mlx). */ seg_sz = bbr->r_ctl.rc_pace_max_segs * bbr_hdwr_pace_adjust; if (bbr_hdwr_pace_floor && (seg_sz < bbr->r_ctl.crte->ptbl->rs_min_seg)) { /* Currently hardware paces * out rs_min_seg segments at a time. * We need to make sure we always send at least * a full burst of bbr_hdwr_pace_floor down. */ seg_sz = bbr->r_ctl.crte->ptbl->rs_min_seg; } } else { /* * The pacing time difference is so * big that the hardware will * pace out more rapidly then we * really want and then we * will have a long delay. Lets just keep * the same TSO size so its as if * we were not using hdwr pacing (we * just gain a bit of spacing from the * hardware if seg_sz > 1). */ seg_sz = bbr->r_ctl.rc_pace_max_segs; } if (seg_sz > bbr->r_ctl.rc_pace_max_segs) new_tso = seg_sz; else new_tso = bbr->r_ctl.rc_pace_max_segs; if (new_tso >= (PACE_MAX_IP_BYTES-maxseg)) new_tso = PACE_MAX_IP_BYTES - maxseg; if (new_tso != bbr->r_ctl.rc_pace_max_segs) { bbr_log_type_tsosize(bbr, cts, new_tso, 0, bbr->r_ctl.rc_pace_max_segs, maxseg, 0); bbr->r_ctl.rc_pace_max_segs = new_tso; } } static void tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts) { uint64_t bw; uint32_t old_tso = 0, new_tso; uint32_t maxseg, bytes; uint32_t tls_seg=0; /* * Google/linux uses the following algorithm to determine * the TSO size based on the b/w of the link (from Neal Cardwell email 9/27/18): * * bytes = bw_in_bytes_per_second / 1000 * bytes = min(bytes, 64k) * tso_segs = bytes / MSS * if (bw < 1.2Mbs) * min_tso_segs = 1 * else * min_tso_segs = 2 * tso_segs = max(tso_segs, min_tso_segs) * * * Note apply a device specific limit (we apply this in the * tcp_m_copym). * Note that before the initial measurement is made google bursts out * a full iwnd just like new-reno/cubic. * * We do not use this algorithm. Instead we * use a two phased approach: * * if ( bw <= per-tcb-cross-over) * goal_tso = calculate how much with this bw we * can send in goal-time seconds. 
* if (goal_tso > mss) * seg = goal_tso / mss * tso = seg * mss * else * tso = mss * if (tso > per-tcb-max) * tso = per-tcb-max * else if ( bw > 512Mbps) * tso = max-tso (64k/mss) * else * goal_tso = bw / per-tcb-divsor * seg = (goal_tso + mss-1)/mss * tso = seg * mss * * if (tso < per-tcb-floor) * tso = per-tcb-floor * if (tso > per-tcb-utter_max) * tso = per-tcb-utter_max * * Note the default per-tcb-divisor is 1000 (same as google). * the goal cross over is 30Mbps however. To recreate googles * algorithm you need to set: * * cross-over = 23,168,000 bps * goal-time = 18000 * per-tcb-max = 2 * per-tcb-divisor = 1000 * per-tcb-floor = 1 * * This will get you "google bbr" behavior with respect to tso size. * * Note we do set anything TSO size until we are past the initial * window. Before that we gnerally use either a single MSS * or we use the full IW size (so we burst a IW at a time) */ if (bbr->rc_tp->t_maxseg > bbr->rc_last_options) { maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; } else { maxseg = BBR_MIN_SEG - bbr->rc_last_options; } old_tso = bbr->r_ctl.rc_pace_max_segs; if (bbr->rc_past_init_win == 0) { /* * Not enough data has been acknowledged to make a * judgement. Set up the initial TSO based on if we * are sending a full IW at once or not. */ if (bbr->rc_use_google) bbr->r_ctl.rc_pace_max_segs = ((bbr->rc_tp->t_maxseg - bbr->rc_last_options) * 2); else if (bbr->bbr_init_win_cheat) bbr->r_ctl.rc_pace_max_segs = bbr_initial_cwnd(bbr, bbr->rc_tp); else bbr->r_ctl.rc_pace_max_segs = bbr->rc_tp->t_maxseg - bbr->rc_last_options; if (bbr->r_ctl.rc_pace_min_segs != bbr->rc_tp->t_maxseg) bbr->r_ctl.rc_pace_min_segs = bbr->rc_tp->t_maxseg; if (bbr->r_ctl.rc_pace_max_segs == 0) { bbr->r_ctl.rc_pace_max_segs = maxseg; } bbr_log_type_tsosize(bbr, cts, bbr->r_ctl.rc_pace_max_segs, tls_seg, old_tso, maxseg, 0); bbr_adjust_for_hw_pacing(bbr, cts); return; } /** * Now lets set the TSO goal based on our delivery rate in * bytes per second. Note we only do this if * we have acked at least the initial cwnd worth of data. */ bw = bbr_get_bw(bbr); if (IN_RECOVERY(bbr->rc_tp->t_flags) && (bbr->rc_use_google == 0)) { /* We clamp to one MSS in recovery */ new_tso = maxseg; } else if (bbr->rc_use_google) { int min_tso_segs; /* Google considers the gain too */ if (bbr->r_ctl.rc_bbr_hptsi_gain != BBR_UNIT) { bw *= bbr->r_ctl.rc_bbr_hptsi_gain; bw /= BBR_UNIT; } bytes = bw / 1024; if (bytes > (64 * 1024)) bytes = 64 * 1024; new_tso = bytes / maxseg; if (bw < ONE_POINT_TWO_MEG) min_tso_segs = 1; else min_tso_segs = 2; if (new_tso < min_tso_segs) new_tso = min_tso_segs; new_tso *= maxseg; } else if (bbr->rc_no_pacing) { new_tso = (PACE_MAX_IP_BYTES / maxseg) * maxseg; } else if (bw <= bbr->r_ctl.bbr_cross_over) { /* * Calculate the worse case b/w TSO if we are inserting no * more than a delay_target number of TSO's. */ uint32_t tso_len, min_tso; tso_len = bbr_get_pacing_length(bbr, BBR_UNIT, bbr->r_ctl.bbr_hptsi_segments_delay_tar, bw); if (tso_len > maxseg) { new_tso = tso_len / maxseg; if (new_tso > bbr->r_ctl.bbr_hptsi_segments_max) new_tso = bbr->r_ctl.bbr_hptsi_segments_max; new_tso *= maxseg; } else { /* * less than a full sized frame yikes.. long rtt or * low bw? */ min_tso = bbr_minseg(bbr); if ((tso_len > min_tso) && (bbr_all_get_min == 0)) new_tso = rounddown(tso_len, min_tso); else new_tso = min_tso; } } else if (bw > FIVETWELVE_MBPS) { /* * This guy is so fast b/w wise that we can TSO as large as * possible of segments that the NIC will allow. 
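		 * E.g. if PACE_MAX_IP_BYTES allowed 64KB and maxseg were 1448,
		 * this would round down to 45 full segments, i.e. 65,160
		 * bytes per TSO burst.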
*/ new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg); } else { /* * This formula is based on attempting to send a segment or * more every bbr_hptsi_per_second. The default is 1000 * which means you are targeting what you can send every 1ms * based on the peers bw. * * If the number drops to say 500, then you are looking more * at 2ms and you will raise how much we send in a single * TSO thus saving CPU (less bbr_output_wtime() calls). The * trade off of course is you will send more at once and * thus tend to clump up the sends into larger "bursts" * building a queue. */ bw /= bbr->r_ctl.bbr_hptsi_per_second; new_tso = roundup(bw, (uint64_t)maxseg); /* * Gate the floor to match what our lower than 48Mbps * algorithm does. The ceiling (bbr_hptsi_segments_max) thus * becomes the floor for this calculation. */ if (new_tso < (bbr->r_ctl.bbr_hptsi_segments_max * maxseg)) new_tso = (bbr->r_ctl.bbr_hptsi_segments_max * maxseg); } if (bbr->r_ctl.bbr_hptsi_segments_floor && (new_tso < (maxseg * bbr->r_ctl.bbr_hptsi_segments_floor))) new_tso = maxseg * bbr->r_ctl.bbr_hptsi_segments_floor; if (new_tso > PACE_MAX_IP_BYTES) new_tso = rounddown(PACE_MAX_IP_BYTES, maxseg); /* Enforce an utter maximum. */ if (bbr->r_ctl.bbr_utter_max && (new_tso > (bbr->r_ctl.bbr_utter_max * maxseg))) { new_tso = bbr->r_ctl.bbr_utter_max * maxseg; } if (old_tso != new_tso) { /* Only log changes */ bbr_log_type_tsosize(bbr, cts, new_tso, tls_seg, old_tso, maxseg, 0); bbr->r_ctl.rc_pace_max_segs = new_tso; } /* We have hardware pacing! */ bbr_adjust_for_hw_pacing(bbr, cts); } static void bbr_log_output(struct tcp_bbr *bbr, struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t cts, struct mbuf *mb, int32_t * abandon, struct bbr_sendmap *hintrsm, uint32_t delay_calc, struct sockbuf *sb) { struct bbr_sendmap *rsm, *nrsm; register uint32_t snd_max, snd_una; uint32_t pacing_time; /* * Add to the RACK log of packets in flight or retransmitted. If * there is a TS option we will use the TS echoed, if not we will * grab a TS. * * Retransmissions will increment the count and move the ts to its * proper place. Note that if options do not include TS's then we * won't be able to effectively use the ACK for an RTT on a retran. * * Notes about r_start and r_end. Lets consider a send starting at * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next * slot (11). * */ INP_WLOCK_ASSERT(tp->t_inpcb); if (err) { /* * We don't log errors -- we could but snd_max does not * advance in this case either. */ return; } if (th_flags & TH_RST) { /* * We don't log resets and we return immediately from * sending */ *abandon = 1; return; } snd_una = tp->snd_una; if (th_flags & (TH_SYN | TH_FIN) && (hintrsm == NULL)) { /* * The call to bbr_log_output is made before bumping * snd_max. This means we can record one extra byte on a SYN * or FIN if seq_out is adding more on and a FIN is present * (and we are not resending). */ if ((th_flags & TH_SYN) && (tp->iss == seq_out)) len++; if (th_flags & TH_FIN) len++; } if (SEQ_LEQ((seq_out + len), snd_una)) { /* Are sending an old segment to induce an ack (keep-alive)? */ return; } if (SEQ_LT(seq_out, snd_una)) { /* huh? should we panic? 
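 * Rather than panic we clamp to the portion that is still
 * outstanding. Illustrative, hypothetical numbers: seq_out = 1000,
 * len = 500, snd_una = 1200 gives end = 1500, then seq_out = 1200
 * and len = 300, so only the un-acked 300 bytes are logged below.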
*/ uint32_t end; end = seq_out + len; seq_out = snd_una; len = end - seq_out; } snd_max = tp->snd_max; if (len == 0) { /* We don't log zero window probes */ return; } pacing_time = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, len, cts, 1); /* First question is it a retransmission? */ if (seq_out == snd_max) { again: rsm = bbr_alloc(bbr); if (rsm == NULL) { return; } rsm->r_flags = 0; if (th_flags & TH_SYN) rsm->r_flags |= BBR_HAS_SYN; if (th_flags & TH_FIN) rsm->r_flags |= BBR_HAS_FIN; rsm->r_tim_lastsent[0] = cts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = seq_out; rsm->r_end = rsm->r_start + len; rsm->r_dupack = 0; rsm->r_delivered = bbr->r_ctl.rc_delivered; rsm->r_pacing_delay = pacing_time; rsm->r_ts_valid = bbr->rc_ts_valid; if (bbr->rc_ts_valid) rsm->r_del_ack_ts = bbr->r_ctl.last_inbound_ts; rsm->r_del_time = bbr->r_ctl.rc_del_time; if (bbr->r_ctl.r_app_limited_until) rsm->r_app_limited = 1; else rsm->r_app_limited = 0; rsm->r_first_sent_time = bbr_get_earliest_send_outstanding(bbr, rsm, cts); rsm->r_flight_at_send = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); /* * Here we must also add in this rsm since snd_max * is updated after we return from a new send. */ rsm->r_flight_at_send += len; TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) rsm->r_bbr_state = bbr_state_val(bbr); else rsm->r_bbr_state = 8; if (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT) { rsm->r_is_gain = 1; rsm->r_is_drain = 0; } else if (bbr->r_ctl.rc_bbr_hptsi_gain < BBR_UNIT) { rsm->r_is_drain = 1; rsm->r_is_gain = 0; } else { rsm->r_is_drain = 0; rsm->r_is_gain = 0; } return; } /* * If we reach here its a retransmission and we need to find it. */ more: if (hintrsm && (hintrsm->r_start == seq_out)) { rsm = hintrsm; hintrsm = NULL; } else if (bbr->r_ctl.rc_next) { /* We have a hint from a previous run */ rsm = bbr->r_ctl.rc_next; } else { /* No hints sorry */ rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { /* * We used rc_next or hintrsm to retransmit, hopefully the * likely case. */ seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time); if (len == 0) { return; } else { goto more; } } /* Ok it was not the last pointer go through it the hard way. */ TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { if (rsm->r_start == seq_out) { seq_out = bbr_update_entry(tp, bbr, rsm, cts, &len, pacing_time); bbr->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next); if (len == 0) { return; } else { continue; } } if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { /* Transmitted within this piece */ /* * Ok we must split off the front and then let the * update do the rest */ nrsm = bbr_alloc_full_limit(bbr); if (nrsm == NULL) { bbr_update_rsm(tp, bbr, rsm, cts, pacing_time); return; } /* * copy rsm to nrsm and then trim the front of rsm * to not include this part. */ bbr_clone_rsm(bbr, nrsm, rsm, seq_out); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~BBR_HAS_FIN); seq_out = bbr_update_entry(tp, bbr, nrsm, cts, &len, pacing_time); if (len == 0) { return; } } } /* * Hmm not found in map did they retransmit both old and on into the * new? 
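 * If seq_out has reached snd_max the remainder is new data and we
 * simply allocate a fresh map entry (the "again" path above); the
 * other cases point at a map inconsistency and are only flagged
 * when built with BBR_INVARIANTS.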
*/ if (seq_out == tp->snd_max) { goto again; } else if (SEQ_LT(seq_out, tp->snd_max)) { #ifdef BBR_INVARIANTS printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { printf("rsm:%p start:%u end:%u\n", rsm, rsm->r_start, rsm->r_end); } printf("Dump complete\n"); panic("seq_out not found rack:%p tp:%p", bbr, tp); #endif } else { #ifdef BBR_INVARIANTS /* * Hmm beyond sndmax? (only if we are using the new rtt-pack * flag) */ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", seq_out, len, tp->snd_max, tp); #endif } } static void bbr_collapse_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, int32_t rtt) { /* * Collapse timeout back the cum-ack moved. */ tp->t_rxtshift = 0; tp->t_softerror = 0; } static void tcp_bbr_xmit_timer(struct tcp_bbr *bbr, uint32_t rtt_usecs, uint32_t rsm_send_time, uint32_t r_start, uint32_t tsin) { bbr->rtt_valid = 1; bbr->r_ctl.cur_rtt = rtt_usecs; bbr->r_ctl.ts_in = tsin; if (rsm_send_time) bbr->r_ctl.cur_rtt_send_time = rsm_send_time; } static void bbr_make_timestamp_determination(struct tcp_bbr *bbr) { /** * We have in our bbr control: * 1) The timestamp we started observing cum-acks (bbr->r_ctl.bbr_ts_check_tstmp). * 2) Our timestamp indicating when we sent that packet (bbr->r_ctl.rsm->bbr_ts_check_our_cts). * 3) The current timestamp that just came in (bbr->r_ctl.last_inbound_ts) * 4) The time that the packet that generated that ack was sent (bbr->r_ctl.cur_rtt_send_time) * * Now we can calculate the time between the sends by doing: * * delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts * * And the peer's time between receiving them by doing: * * peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp * * We want to figure out if the timestamp values are in msec, 10msec or usec. * We also may find that we can't use the timestamps if say we see * that the peer_delta indicates that though we may have taken 10ms to * pace out the data, it only saw 1ms between the two packets. This would * indicate that somewhere on the path is a batching entity that is giving * out time-slices of the actual b/w. This would mean we could not use * reliably the peers timestamps. * * We expect delta > peer_delta initially. Until we figure out the * timestamp difference which we will store in bbr->r_ctl.bbr_peer_tsratio. * If we place 1000 there then its a ms vs our usec. If we place 10000 there * then its 10ms vs our usec. If the peer is running a usec clock we would * put a 1 there. If the value is faster then ours, we will disable the * use of timestamps (though we could revist this later if we find it to be not * just an isolated one or two flows)). * * To detect the batching middle boxes we will come up with our compensation and * if with it in place, we find the peer is drastically off (by some margin) in * the smaller direction, then we will assume the worst case and disable use of timestamps. * */ uint64_t delta, peer_delta, delta_up; delta = bbr->r_ctl.cur_rtt_send_time - bbr->r_ctl.bbr_ts_check_our_cts; if (delta < bbr_min_usec_delta) { /* * Have not seen a min amount of time * between our send times so we can * make a determination of the timestamp * yet. */ return; } peer_delta = bbr->r_ctl.last_inbound_ts - bbr->r_ctl.bbr_ts_check_tstmp; if (peer_delta < bbr_min_peer_delta) { /* * We may have enough in the form of * our delta but the peers number * has not changed that much. 
It could * be its clock ratio is such that * we need more data (10ms tick) or * there may be other compression scenarios * going on. In any event we need the * spread to be larger. */ return; } /* Ok lets first see which way our delta is going */ if (peer_delta > delta) { /* Very unlikely, the peer without * compensation shows that it saw * the two sends arrive further apart * then we saw then in micro-seconds. */ if (peer_delta < (delta + ((delta * (uint64_t)1000)/ (uint64_t)bbr_delta_percent))) { /* well it looks like the peer is a micro-second clock. */ bbr->rc_ts_clock_set = 1; bbr->r_ctl.bbr_peer_tsratio = 1; } else { bbr->rc_ts_cant_be_used = 1; bbr->rc_ts_clock_set = 1; } return; } /* Ok we know that the peer_delta is smaller than our send distance */ bbr->rc_ts_clock_set = 1; /* First question is it within the percentage that they are using usec time? */ delta_up = (peer_delta * 1000) / (uint64_t)bbr_delta_percent; if ((peer_delta + delta_up) >= delta) { /* Its a usec clock */ bbr->r_ctl.bbr_peer_tsratio = 1; bbr_log_tstmp_validation(bbr, peer_delta, delta); return; } /* Ok if not usec, what about 10usec (though unlikely)? */ delta_up = (peer_delta * 1000 * 10) / (uint64_t)bbr_delta_percent; if (((peer_delta * 10) + delta_up) >= delta) { bbr->r_ctl.bbr_peer_tsratio = 10; bbr_log_tstmp_validation(bbr, peer_delta, delta); return; } /* And what about 100usec (though again unlikely)? */ delta_up = (peer_delta * 1000 * 100) / (uint64_t)bbr_delta_percent; if (((peer_delta * 100) + delta_up) >= delta) { bbr->r_ctl.bbr_peer_tsratio = 100; bbr_log_tstmp_validation(bbr, peer_delta, delta); return; } /* And how about 1 msec (the most likely one)? */ delta_up = (peer_delta * 1000 * 1000) / (uint64_t)bbr_delta_percent; if (((peer_delta * 1000) + delta_up) >= delta) { bbr->r_ctl.bbr_peer_tsratio = 1000; bbr_log_tstmp_validation(bbr, peer_delta, delta); return; } /* Ok if not msec could it be 10 msec? */ delta_up = (peer_delta * 1000 * 10000) / (uint64_t)bbr_delta_percent; if (((peer_delta * 10000) + delta_up) >= delta) { bbr->r_ctl.bbr_peer_tsratio = 10000; return; } /* If we fall down here the clock tick so slowly we can't use it */ bbr->rc_ts_cant_be_used = 1; bbr->r_ctl.bbr_peer_tsratio = 0; bbr_log_tstmp_validation(bbr, peer_delta, delta); } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_bbr_xmit_timer_commit(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts) { int32_t delta; uint32_t rtt, tsin; int32_t rtt_ticks; if (bbr->rtt_valid == 0) /* No valid sample */ return; rtt = bbr->r_ctl.cur_rtt; tsin = bbr->r_ctl.ts_in; if (bbr->rc_prtt_set_ts) { /* * We are to force feed the rttProp filter due * to an entry into PROBE_RTT. This assures * that the times are sync'd between when we * go into PROBE_RTT and the filter expiration. * * Google does not use a true filter, so they do * this implicitly since they only keep one value * and when they enter probe-rtt they update the * value to the newest rtt. */ uint32_t rtt_prop; bbr->rc_prtt_set_ts = 0; rtt_prop = get_filter_value_small(&bbr->r_ctl.rc_rttprop); if (rtt > rtt_prop) filter_increase_by_small(&bbr->r_ctl.rc_rttprop, (rtt - rtt_prop), cts); else apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); } if (bbr->rc_ack_was_delayed) rtt += bbr->r_ctl.rc_ack_hdwr_delay; if (rtt < bbr->r_ctl.rc_lowest_rtt) bbr->r_ctl.rc_lowest_rtt = rtt; bbr_log_rtt_sample(bbr, rtt, tsin); if (bbr->r_init_rtt) { /* * The initial rtt is not-trusted, nuke it and lets get * our first valid measurement in. 
*/ bbr->r_init_rtt = 0; tp->t_srtt = 0; } if ((bbr->rc_ts_clock_set == 0) && bbr->rc_ts_valid) { /* * So we have not yet figured out * what the peers TSTMP value is * in (most likely ms). We need a * series of cum-ack's to determine * this reliably. */ if (bbr->rc_ack_is_cumack) { if (bbr->rc_ts_data_set) { /* Lets attempt to determine the timestamp granularity. */ bbr_make_timestamp_determination(bbr); } else { bbr->rc_ts_data_set = 1; bbr->r_ctl.bbr_ts_check_tstmp = bbr->r_ctl.last_inbound_ts; bbr->r_ctl.bbr_ts_check_our_cts = bbr->r_ctl.cur_rtt_send_time; } } else { /* * We have to have consecutive acks * reset any "filled" state to none. */ bbr->rc_ts_data_set = 0; } } /* Round it up */ rtt_ticks = USEC_2_TICKS((rtt + (USECS_IN_MSEC - 1))); if (rtt_ticks == 0) rtt_ticks = 1; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic is * equivalent to the smoothing algorithm in rfc793 with an * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). * Adjust rtt to origin 0. */ delta = ((rtt_ticks - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); tp->t_srtt += delta; if (tp->t_srtt <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit timer * to smoothed rtt + 4 times the smoothed variance. rttvar * is stored as fixed point with 4 bits after the binary * point (scaled by 16). The following is equivalent to * rfc793 smoothing with an alpha of .75 (rttvar = * rttvar*3/4 + |delta| / 4). This replaces rfc793's * wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); tp->t_rttvar += delta; if (tp->t_rttvar <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. Set the * variance to half the rtt (so our first retransmit happens * at 3*rtt). */ tp->t_srtt = rtt_ticks << TCP_RTT_SHIFT; tp->t_rttvar = rtt_ticks << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } KMOD_TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt_ticks)); #endif /* * the retransmit should happen at rtt + 4 * rttvar. Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 * tick of bias. When we compute the retransmit timer, we want 1/2 * tick of rounding and 1 extra tick because of +-1/2 tick * uncertainty in the firing of the timer. The bias will give us * exactly the 1.5 tick we need. But, because the bias is * statistical, we have to test that we don't drop below the minimum * feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(MSEC_2_TICKS(bbr->r_ctl.rc_min_rto_ms), rtt_ticks + 2), MSEC_2_TICKS(((uint32_t)bbr->rc_max_rto_sec) * 1000)); /* * We received an ack for a packet that wasn't retransmitted; it is * probably safe to discard any error indications we've received * recently. This isn't quite right, but close enough for now (a * route might have failed after we sent a segment, and the return * path might not be symmetrical). 
*/ tp->t_softerror = 0; rtt = (TICKS_2_USEC(bbr->rc_tp->t_srtt) >> TCP_RTT_SHIFT); if (bbr->r_ctl.bbr_smallest_srtt_this_state > rtt) bbr->r_ctl.bbr_smallest_srtt_this_state = rtt; } static void bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t t, uint32_t cts, int ack_type) { /* * For this RSM, we acknowledged the data from a previous * transmission, not the last one we made. This means we did a false * retransmit. */ if (rsm->r_flags & BBR_HAS_FIN) { /* * The sending of the FIN often is multiple sent when we * have everything outstanding ack'd. We ignore this case * since its over now. */ return; } if (rsm->r_flags & BBR_TLP) { /* * We expect TLP's to have this occur often */ bbr->rc_tlp_rtx_out = 0; return; } if (ack_type != BBR_CUM_ACKED) { /* * If it was not a cum-ack we * don't really know for sure since * the timestamp could be from some * other transmission. */ return; } if (rsm->r_flags & BBR_WAS_SACKPASS) { /* * We retransmitted based on a sack and the earlier * retransmission ack'd it - re-ordering is occuring. */ BBR_STAT_INC(bbr_reorder_seen); bbr->r_ctl.rc_reorder_ts = cts; } /* Back down the loss count */ if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; rsm->r_flags &= ~BBR_MARKED_LOST; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } /***** RRS HERE ************************/ /* Do we need to do this??? */ /* bbr_reset_lt_bw_sampling(bbr, cts); */ /***** RRS HERE ************************/ BBR_STAT_INC(bbr_badfr); BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start)); } static void bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line) { bbr->r_ctl.rc_rtt_shrinks = cts; if (bbr_can_force_probertt && (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) && ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) { /* * We should enter probe-rtt its been too long * since we have been there. */ bbr_enter_probe_rtt(bbr, cts, __LINE__); } else bbr_check_probe_rtt_limits(bbr, cts); } static void tcp_bbr_commit_bw(struct tcp_bbr *bbr, uint32_t cts) { uint64_t orig_bw; if (bbr->r_ctl.rc_bbr_cur_del_rate == 0) { /* We never apply a zero measurment */ bbr_log_type_bbrupd(bbr, 20, cts, 0, 0, 0, 0, 0, 0, 0, 0); return; } if (bbr->r_ctl.r_measurement_count < 0xffffffff) bbr->r_ctl.r_measurement_count++; orig_bw = get_filter_value(&bbr->r_ctl.rc_delrate); apply_filter_max(&bbr->r_ctl.rc_delrate, bbr->r_ctl.rc_bbr_cur_del_rate, bbr->r_ctl.rc_pkt_epoch); bbr_log_type_bbrupd(bbr, 21, cts, (uint32_t)orig_bw, (uint32_t)get_filter_value(&bbr->r_ctl.rc_delrate), 0, 0, 0, 0, 0, 0); if (orig_bw && (orig_bw != get_filter_value(&bbr->r_ctl.rc_delrate))) { if (bbr->bbr_hdrw_pacing) { /* * Apply a new rate to the hardware * possibly. 
*/ bbr_update_hardware_pacing_rate(bbr, cts); } bbr_set_state_target(bbr, __LINE__); tcp_bbr_tso_size_check(bbr, cts); if (bbr->r_recovery_bw) { bbr_setup_red_bw(bbr, cts); bbr_log_type_bw_reduce(bbr, BBR_RED_BW_USELRBW); } } else if ((orig_bw == 0) && get_filter_value(&bbr->r_ctl.rc_delrate)) tcp_bbr_tso_size_check(bbr, cts); } static void bbr_nf_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts) { if (bbr->rc_in_persist == 0) { /* We log only when not in persist */ /* Translate to a Bytes Per Second */ uint64_t tim, bw, ts_diff, ts_bw; uint32_t upper, lower, delivered; if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time)) tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time); else tim = 1; /* * Now that we have processed the tim (skipping the sample * or possibly updating the time, go ahead and * calculate the cdr. */ delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered); bw = (uint64_t)delivered; bw *= (uint64_t)USECS_IN_SECOND; bw /= tim; if (bw == 0) { /* We must have a calculatable amount */ return; } upper = (bw >> 32) & 0x00000000ffffffff; lower = bw & 0x00000000ffffffff; /* * If we are using this b/w shove it in now so we * can see in the trace viewer if it gets over-ridden. */ if (rsm->r_ts_valid && bbr->rc_ts_valid && bbr->rc_ts_clock_set && (bbr->rc_ts_cant_be_used == 0) && bbr->rc_use_ts_limit) { ts_diff = max((bbr->r_ctl.last_inbound_ts - rsm->r_del_ack_ts), 1); ts_diff *= bbr->r_ctl.bbr_peer_tsratio; if ((delivered == 0) || (rtt < 1000)) { /* Can't use the ts */ bbr_log_type_bbrupd(bbr, 61, cts, ts_diff, bbr->r_ctl.last_inbound_ts, rsm->r_del_ack_ts, 0, 0, 0, 0, delivered); } else { ts_bw = (uint64_t)delivered; ts_bw *= (uint64_t)USECS_IN_SECOND; ts_bw /= ts_diff; bbr_log_type_bbrupd(bbr, 62, cts, (ts_bw >> 32), (ts_bw & 0xffffffff), 0, 0, 0, 0, ts_diff, delivered); if ((bbr->ts_can_raise) && (ts_bw > bw)) { bbr_log_type_bbrupd(bbr, 8, cts, delivered, ts_diff, (bw >> 32), (bw & 0x00000000ffffffff), 0, 0, 0, 0); bw = ts_bw; } else if (ts_bw && (ts_bw < bw)) { bbr_log_type_bbrupd(bbr, 7, cts, delivered, ts_diff, (bw >> 32), (bw & 0x00000000ffffffff), 0, 0, 0, 0); bw = ts_bw; } } } if (rsm->r_first_sent_time && TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) { uint64_t sbw, sti; /* * We use what was in flight at the time of our * send and the size of this send to figure * out what we have been sending at (amount). * For the time we take from the time of * the send of the first send outstanding * until this send plus this sends pacing * time. This gives us a good calculation * as to the rate we have been sending at. 
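 * Illustrative, hypothetical numbers: r_flight_at_send = 20000
 * bytes over a send interval plus pacing delay (sti) of 10000 usec
 * gives sbw = 20000 * 1000000 / 10000 = 2,000,000 bytes/sec; if
 * that is below the delivery-rate sample we use the smaller value.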
*/ sbw = (uint64_t)(rsm->r_flight_at_send); sbw *= (uint64_t)USECS_IN_SECOND; sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time; sti += rsm->r_pacing_delay; sbw /= sti; if (sbw < bw) { bbr_log_type_bbrupd(bbr, 6, cts, delivered, (uint32_t)sti, (bw >> 32), (uint32_t)bw, rsm->r_first_sent_time, 0, (sbw >> 32), (uint32_t)sbw); bw = sbw; } } /* Use the google algorithm for b/w measurements */ bbr->r_ctl.rc_bbr_cur_del_rate = bw; if ((rsm->r_app_limited == 0) || (bw > get_filter_value(&bbr->r_ctl.rc_delrate))) { tcp_bbr_commit_bw(bbr, cts); bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered, 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time); } } } static void bbr_google_measurement(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts) { if (bbr->rc_in_persist == 0) { /* We log only when not in persist */ /* Translate to a Bytes Per Second */ uint64_t tim, bw; uint32_t upper, lower, delivered; int no_apply = 0; if (TSTMP_GT(bbr->r_ctl.rc_del_time, rsm->r_del_time)) tim = (uint64_t)(bbr->r_ctl.rc_del_time - rsm->r_del_time); else tim = 1; /* * Now that we have processed the tim (skipping the sample * or possibly updating the time, go ahead and * calculate the cdr. */ delivered = (bbr->r_ctl.rc_delivered - rsm->r_delivered); bw = (uint64_t)delivered; bw *= (uint64_t)USECS_IN_SECOND; bw /= tim; if (tim < bbr->r_ctl.rc_lowest_rtt) { bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered, tim, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0); no_apply = 1; } upper = (bw >> 32) & 0x00000000ffffffff; lower = bw & 0x00000000ffffffff; /* * If we are using this b/w shove it in now so we * can see in the trace viewer if it gets over-ridden. */ bbr->r_ctl.rc_bbr_cur_del_rate = bw; /* Gate by the sending rate */ if (rsm->r_first_sent_time && TSTMP_GT(rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)],rsm->r_first_sent_time)) { uint64_t sbw, sti; /* * We use what was in flight at the time of our * send and the size of this send to figure * out what we have been sending at (amount). * For the time we take from the time of * the send of the first send outstanding * until this send plus this sends pacing * time. This gives us a good calculation * as to the rate we have been sending at. */ sbw = (uint64_t)(rsm->r_flight_at_send); sbw *= (uint64_t)USECS_IN_SECOND; sti = rsm->r_tim_lastsent[(rsm->r_rtr_cnt -1)] - rsm->r_first_sent_time; sti += rsm->r_pacing_delay; sbw /= sti; if (sbw < bw) { bbr_log_type_bbrupd(bbr, 6, cts, delivered, (uint32_t)sti, (bw >> 32), (uint32_t)bw, rsm->r_first_sent_time, 0, (sbw >> 32), (uint32_t)sbw); bw = sbw; } if ((sti > tim) && (sti < bbr->r_ctl.rc_lowest_rtt)) { bbr_log_type_bbrupd(bbr, 99, cts, (uint32_t)tim, delivered, (uint32_t)sti, bbr->r_ctl.rc_lowest_rtt, 0, 0, 0, 0); no_apply = 1; } else no_apply = 0; } bbr->r_ctl.rc_bbr_cur_del_rate = bw; if ((no_apply == 0) && ((rsm->r_app_limited == 0) || (bw > get_filter_value(&bbr->r_ctl.rc_delrate)))) { tcp_bbr_commit_bw(bbr, cts); bbr_log_type_bbrupd(bbr, 10, cts, (uint32_t)tim, delivered, 0, 0, 0, 0, bbr->r_ctl.rc_del_time, rsm->r_del_time); } } } static void bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt, uint32_t cts, uint32_t tsin, uint32_t uts, int32_t match, uint32_t rsm_send_time, int32_t ack_type, struct tcpopt *to) { uint64_t old_rttprop; /* Update our delivery time and amount */ bbr->r_ctl.rc_delivered += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_del_time = cts; if (rtt == 0) { /* * 0 means its a retransmit, for now we don't use these for * the rest of BBR. 
*/ return; } if ((bbr->rc_use_google == 0) && (match != BBR_RTT_BY_EXACTMATCH) && (match != BBR_RTT_BY_TIMESTAMP)){ /* * We get a lot of rtt updates, lets not pay attention to * any that are not an exact match. That way we don't have * to worry about timestamps and the whole nonsense of * unsure if its a retransmission etc (if we ever had the * timestamp fixed to always have the last thing sent this * would not be a issue). */ return; } if ((bbr_no_retran && bbr->rc_use_google) && (match != BBR_RTT_BY_EXACTMATCH) && (match != BBR_RTT_BY_TIMESTAMP)){ /* * We only do measurements in google mode * with bbr_no_retran on for sure things. */ return; } /* Only update srtt if we know by exact match */ tcp_bbr_xmit_timer(bbr, rtt, rsm_send_time, rsm->r_start, tsin); if (ack_type == BBR_CUM_ACKED) bbr->rc_ack_is_cumack = 1; else bbr->rc_ack_is_cumack = 0; old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP); /* * Note the following code differs to the original * BBR spec. It calls for <= not <. However after a * long discussion in email with Neal, he acknowledged * that it should be < than so that we will have flows * going into probe-rtt (we were seeing cases where that * did not happen and caused ugly things to occur). We * have added this agreed upon fix to our code base. */ if (rtt < old_rttprop) { /* Update when we last saw a rtt drop */ bbr_log_rtt_shrinks(bbr, cts, 0, rtt, __LINE__, BBR_RTTS_NEWRTT, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } bbr_log_type_bbrrttprop(bbr, rtt, (rsm ? rsm->r_end : 0), uts, cts, match, rsm->r_start, rsm->r_flags); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); if (old_rttprop != bbr_get_rtt(bbr, BBR_RTT_PROP)) { /* * The RTT-prop moved, reset the target (may be a * nop for some states). */ bbr_set_state_target(bbr, __LINE__); if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_NEW_TARGET, 0); else if (old_rttprop < bbr_get_rtt(bbr, BBR_RTT_PROP)) /* It went up */ bbr_check_probe_rtt_limits(bbr, cts); } if ((bbr->rc_use_google == 0) && (match == BBR_RTT_BY_TIMESTAMP)) { /* * We don't do b/w update with * these since they are not really * reliable. */ return; } if (bbr->r_ctl.r_app_limited_until && (bbr->r_ctl.rc_delivered >= bbr->r_ctl.r_app_limited_until)) { /* We are no longer app-limited */ bbr->r_ctl.r_app_limited_until = 0; } if (bbr->rc_use_google) { bbr_google_measurement(bbr, rsm, rtt, cts); } else { bbr_nf_measurement(bbr, rsm, rtt, cts); } } /* * Convert a timestamp that the main stack * uses (milliseconds) into one that bbr uses * (microseconds). Return that converted timestamp. */ static uint32_t bbr_ts_convert(uint32_t cts) { uint32_t sec, msec; sec = cts / MS_IN_USEC; msec = cts - (MS_IN_USEC * sec); return ((sec * USECS_IN_SECOND) + (msec * MS_IN_USEC)); } /* * Return 0 if we did not update the RTT time, return * 1 if we did. */ static int bbr_update_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, uint32_t th_ack) { int32_t i; uint32_t t, uts = 0; if ((rsm->r_flags & BBR_ACKED) || (rsm->r_flags & BBR_WAS_RENEGED) || (rsm->r_flags & BBR_RXT_CLEARED)) { /* Already done */ return (0); } if (rsm->r_rtr_cnt == 1) { /* * Only one transmit. Hopefully the normal case. 
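 * Here the rtt is just the time since that one send, e.g.
 * (hypothetical) cts = 1050000 with r_tim_lastsent[0] = 1000000
 * yields t = 50000 usec.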
*/ if (TSTMP_GT(cts, rsm->r_tim_lastsent[0])) t = cts - rsm->r_tim_lastsent[0]; else t = 1; if ((int)t <= 0) t = 1; bbr->r_ctl.rc_last_rtt = t; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, BBR_RTT_BY_EXACTMATCH, rsm->r_tim_lastsent[0], ack_type, to); return (1); } /* Convert to usecs */ if ((bbr_can_use_ts_for_rtt == 1) && (bbr->rc_use_google == 1) && (ack_type == BBR_CUM_ACKED) && (to->to_flags & TOF_TS) && (to->to_tsecr != 0)) { t = tcp_tv_to_mssectick(&bbr->rc_tv) - to->to_tsecr; if (t < 1) t = 1; t *= MS_IN_USEC; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, 0, BBR_RTT_BY_TIMESTAMP, rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)], ack_type, to); return (1); } uts = bbr_ts_convert(to->to_tsecr); if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0) && (ack_type == BBR_CUM_ACKED) && ((rsm->r_flags & BBR_OVERMAX) == 0)) { /* * Now which timestamp does it match? In this block the ACK * may be coming from a previous transmission. */ uint32_t fudge; fudge = BBR_TIMER_FUDGE; for (i = 0; i < rsm->r_rtr_cnt; i++) { if ((SEQ_GEQ(uts, (rsm->r_tim_lastsent[i] - fudge))) && (SEQ_LEQ(uts, (rsm->r_tim_lastsent[i] + fudge)))) { if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) t = cts - rsm->r_tim_lastsent[i]; else t = 1; if ((int)t <= 0) t = 1; bbr->r_ctl.rc_last_rtt = t; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_TSMATCHING, rsm->r_tim_lastsent[i], ack_type, to); if ((i + 1) < rsm->r_rtr_cnt) { /* Likely */ bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type); } else if (rsm->r_flags & BBR_TLP) { bbr->rc_tlp_rtx_out = 0; } return (1); } } /* Fall through if we can't find a matching timestamp */ } /* * Ok its a SACK block that we retransmitted. or a windows * machine without timestamps. We can tell nothing from the * time-stamp since its not there or the time the peer last * recieved a segment that moved forward its cum-ack point. * * Lets look at the last retransmit and see what we can tell * (with BBR for space we only keep 2 note we have to keep * at least 2 so the map can not be condensed more). */ i = rsm->r_rtr_cnt - 1; if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) t = cts - rsm->r_tim_lastsent[i]; else goto not_sure; if (t < bbr->r_ctl.rc_lowest_rtt) { /* * We retransmitted and the ack came back in less * than the smallest rtt we have observed in the * windowed rtt. We most likey did an improper * retransmit as outlined in 4.2 Step 3 point 2 in * the rack-draft. * * Use the prior transmission to update all the * information as long as there is only one prior * transmission. */ if ((rsm->r_flags & BBR_OVERMAX) == 0) { #ifdef BBR_INVARIANTS if (rsm->r_rtr_cnt == 1) panic("rsm:%p bbr:%p rsm has overmax and only 1 retranmit flags:%x?", rsm, bbr, rsm->r_flags); #endif i = rsm->r_rtr_cnt - 2; if (TSTMP_GT(cts, rsm->r_tim_lastsent[i])) t = cts - rsm->r_tim_lastsent[i]; else t = 1; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET, rsm->r_tim_lastsent[i], ack_type, to); bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type); } else { /* * Too many prior transmissions, just * updated BBR delivered */ not_sure: bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts, BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to); } } else { /* * We retransmitted it and the retransmit did the * job. 
*/ if (rsm->r_flags & BBR_TLP) bbr->rc_tlp_rtx_out = 0; if ((rsm->r_flags & BBR_OVERMAX) == 0) bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_THIS_RETRAN, 0, ack_type, to); else bbr_update_bbr_info(bbr, rsm, 0, cts, to->to_tsecr, uts, BBR_RTT_BY_SOME_RETRAN, 0, ack_type, to); return (1); } return (0); } /* * Mark the SACK_PASSED flag on all entries prior to rsm send wise. */ static void bbr_log_sack_passed(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm) { struct bbr_sendmap *nrsm; nrsm = rsm; TAILQ_FOREACH_REVERSE_FROM(nrsm, &bbr->r_ctl.rc_tmap, bbr_head, r_tnext) { if (nrsm == rsm) { /* Skip orginal segment he is acked */ continue; } if (nrsm->r_flags & BBR_ACKED) { /* Skip ack'd segments */ continue; } if (nrsm->r_flags & BBR_SACK_PASSED) { /* * We found one that is already marked * passed, we have been here before and * so all others below this are marked. */ break; } BBR_STAT_INC(bbr_sack_passed); nrsm->r_flags |= BBR_SACK_PASSED; if (((nrsm->r_flags & BBR_MARKED_LOST) == 0) && bbr_is_lost(bbr, nrsm, bbr->r_ctl.rc_rcvtime)) { bbr->r_ctl.rc_lost += nrsm->r_end - nrsm->r_start; bbr->r_ctl.rc_lost_bytes += nrsm->r_end - nrsm->r_start; nrsm->r_flags |= BBR_MARKED_LOST; } nrsm->r_flags &= ~BBR_WAS_SACKPASS; } } /* * Returns the number of bytes that were * newly ack'd by sack blocks. */ static uint32_t bbr_proc_sack_blk(struct tcpcb *tp, struct tcp_bbr *bbr, struct sackblk *sack, struct tcpopt *to, struct bbr_sendmap **prsm, uint32_t cts) { int32_t times = 0; uint32_t start, end, maxseg, changed = 0; struct bbr_sendmap *rsm, *nrsm; int32_t used_ref = 1; uint8_t went_back = 0, went_fwd = 0; maxseg = tp->t_maxseg - bbr->rc_last_options; start = sack->start; end = sack->end; rsm = *prsm; if (rsm == NULL) used_ref = 0; /* Do we locate the block behind where we last were? */ if (rsm && SEQ_LT(start, rsm->r_start)) { went_back = 1; TAILQ_FOREACH_REVERSE_FROM(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { goto do_rest_ofb; } } } start_at_beginning: went_fwd = 1; /* * Ok lets locate the block where this guy is fwd from rsm (if its * set) */ TAILQ_FOREACH_FROM(rsm, &bbr->r_ctl.rc_map, r_next) { if (SEQ_GEQ(start, rsm->r_start) && SEQ_LT(start, rsm->r_end)) { break; } } do_rest_ofb: if (rsm == NULL) { /* * This happens when we get duplicate sack blocks with the * same end. For example SACK 4: 100 SACK 3: 100 The sort * will not change there location so we would just start at * the end of the first one and get lost. */ if (tp->t_flags & TF_SENTFIN) { /* * Check to see if we have not logged the FIN that * went out. */ nrsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); if (nrsm && (nrsm->r_end + 1) == tp->snd_max) { /* * Ok we did not get the FIN logged. */ nrsm->r_end++; rsm = nrsm; goto do_rest_ofb; } } if (times == 1) { #ifdef BBR_INVARIANTS panic("tp:%p bbr:%p sack:%p to:%p prsm:%p", tp, bbr, sack, to, prsm); #else goto out; #endif } times++; BBR_STAT_INC(bbr_sack_proc_restart); rsm = NULL; goto start_at_beginning; } /* Ok we have an ACK for some piece of rsm */ if (rsm->r_start != start) { /* * Need to split this in two pieces the before and after. 
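 * Illustrative, hypothetical sequence space: if this rsm covers
 * [1000, 3000) and the sack block starts at 2000, we clone off
 * nrsm = [2000, 3000) below, rsm shrinks to [1000, 2000), and the
 * sack processing continues on the cloned (sacked) piece.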
*/ if (bbr_sack_mergable(rsm, start, end)) nrsm = bbr_alloc_full_limit(bbr); else nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* We could not allocate ignore the sack */ struct sackblk blk; blk.start = start; blk.end = end; sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk); goto out; } bbr_clone_rsm(bbr, nrsm, rsm, start); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~BBR_HAS_FIN); rsm = nrsm; } if (SEQ_GEQ(end, rsm->r_end)) { /* * The end of this block is either beyond this guy or right * at this guy. */ if ((rsm->r_flags & BBR_ACKED) == 0) { bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0); changed += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); bbr_log_sack_passed(tp, bbr, rsm); if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; } /* Is Reordering occuring? */ if (rsm->r_flags & BBR_SACK_PASSED) { BBR_STAT_INC(bbr_reorder_seen); bbr->r_ctl.rc_reorder_ts = cts; if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } } rsm->r_flags |= BBR_ACKED; rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST); if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } } bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED); if (end == rsm->r_end) { /* This block only - done */ goto out; } /* There is more not coverend by this rsm move on */ start = rsm->r_end; nrsm = TAILQ_NEXT(rsm, r_next); rsm = nrsm; times = 0; goto do_rest_ofb; } if (rsm->r_flags & BBR_ACKED) { /* Been here done that */ goto out; } /* Ok we need to split off this one at the tail */ if (bbr_sack_mergable(rsm, start, end)) nrsm = bbr_alloc_full_limit(bbr); else nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* failed XXXrrs what can we do but loose the sack info? */ struct sackblk blk; blk.start = start; blk.end = end; sack_filter_reject(&bbr->r_ctl.bbr_sf, &blk); goto out; } /* Clone it */ bbr_clone_rsm(bbr, nrsm, rsm, end); /* The sack block does not cover this guy fully */ rsm->r_flags &= (~BBR_HAS_FIN); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } nrsm->r_dupack = 0; bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_SACKED, 0); bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_SACKED); changed += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); bbr_log_sack_passed(tp, bbr, rsm); /* Is Reordering occuring? 
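 * (A block already carrying BBR_SACK_PASSED was skipped over by a
 * sack for later data before being acked itself, which is our
 * signal that the path reordered the segments.)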
*/ if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; } if (rsm->r_flags & BBR_SACK_PASSED) { BBR_STAT_INC(bbr_reorder_seen); bbr->r_ctl.rc_reorder_ts = cts; if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } } rsm->r_flags &= ~(BBR_TLP|BBR_WAS_RENEGED|BBR_RXT_CLEARED|BBR_MARKED_LOST); rsm->r_flags |= BBR_ACKED; if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } out: if (rsm && (rsm->r_flags & BBR_ACKED)) { /* * Now can we merge this newly acked * block with either the previous or * next block? */ nrsm = TAILQ_NEXT(rsm, r_next); if (nrsm && (nrsm->r_flags & BBR_ACKED)) { /* yep this and next can be merged */ rsm = bbr_merge_rsm(bbr, rsm, nrsm); } /* Now what about the previous? */ nrsm = TAILQ_PREV(rsm, bbr_head, r_next); if (nrsm && (nrsm->r_flags & BBR_ACKED)) { /* yep the previous and this can be merged */ rsm = bbr_merge_rsm(bbr, nrsm, rsm); } } if (used_ref == 0) { BBR_STAT_INC(bbr_sack_proc_all); } else { BBR_STAT_INC(bbr_sack_proc_short); } if (went_fwd && went_back) { BBR_STAT_INC(bbr_sack_search_both); } else if (went_fwd) { BBR_STAT_INC(bbr_sack_search_fwd); } else if (went_back) { BBR_STAT_INC(bbr_sack_search_back); } /* Save off where the next seq is */ if (rsm) bbr->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next); else bbr->r_ctl.rc_sacklast = NULL; *prsm = rsm; return (changed); } static void inline bbr_peer_reneges(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, tcp_seq th_ack) { struct bbr_sendmap *tmap; BBR_STAT_INC(bbr_reneges_seen); tmap = NULL; while (rsm && (rsm->r_flags & BBR_ACKED)) { /* Its no longer sacked, mark it so */ uint32_t oflags; bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); #ifdef BBR_INVARIANTS if (rsm->r_in_tmap) { panic("bbr:%p rsm:%p flags:0x%x in tmap?", bbr, rsm, rsm->r_flags); } #endif oflags = rsm->r_flags; if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } rsm->r_flags &= ~(BBR_ACKED | BBR_SACK_PASSED | BBR_WAS_SACKPASS | BBR_MARKED_LOST); rsm->r_flags |= BBR_WAS_RENEGED; rsm->r_flags |= BBR_RXT_CLEARED; bbr_log_type_rsmclear(bbr, bbr->r_ctl.rc_rcvtime, rsm, oflags, __LINE__); /* Rebuild it into our tmap */ if (tmap == NULL) { TAILQ_INSERT_HEAD(&bbr->r_ctl.rc_tmap, rsm, r_tnext); tmap = rsm; } else { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, tmap, rsm, r_tnext); tmap = rsm; } tmap->r_in_tmap = 1; /* * XXXrrs Delivered? Should we do anything here? * * Of course we don't on a rxt timeout so maybe its ok that * we don't? * * For now lets not. */ rsm = TAILQ_NEXT(rsm, r_next); } /* * Now lets possibly clear the sack filter so we start recognizing * sacks that cover this area. 
*/ sack_filter_clear(&bbr->r_ctl.bbr_sf, th_ack); } static void bbr_log_syn(struct tcpcb *tp, struct tcpopt *to) { struct tcp_bbr *bbr; struct bbr_sendmap *rsm; uint32_t cts; bbr = (struct tcp_bbr *)tp->t_fb_ptr; cts = bbr->r_ctl.rc_rcvtime; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); if (rsm && (rsm->r_flags & BBR_HAS_SYN)) { if ((rsm->r_end - rsm->r_start) <= 1) { /* Log out the SYN completely */ bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } if (bbr->r_ctl.rc_next == rsm) { /* scoot along the marker */ bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map); } if (to != NULL) bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, 0); bbr_free(bbr, rsm); } else { /* There is more (Fast open)? strip out SYN. */ rsm->r_flags &= ~BBR_HAS_SYN; rsm->r_start++; } } } /* * Returns the number of bytes that were * acknowledged by SACK blocks. */ static uint32_t bbr_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, uint32_t *prev_acked) { uint32_t changed, last_seq, entered_recovery = 0; struct tcp_bbr *bbr; struct bbr_sendmap *rsm; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; register uint32_t th_ack; int32_t i, j, k, new_sb, num_sack_blks = 0; uint32_t cts, acked, ack_point, sack_changed = 0; uint32_t p_maxseg, maxseg, p_acked = 0; INP_WLOCK_ASSERT(tp->t_inpcb); if (th->th_flags & TH_RST) { /* We don't log resets */ return (0); } bbr = (struct tcp_bbr *)tp->t_fb_ptr; cts = bbr->r_ctl.rc_rcvtime; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); changed = 0; maxseg = tp->t_maxseg - bbr->rc_last_options; p_maxseg = min(bbr->r_ctl.rc_pace_max_segs, maxseg); th_ack = th->th_ack; if (SEQ_GT(th_ack, tp->snd_una)) { acked = th_ack - tp->snd_una; bbr_log_progress_event(bbr, tp, ticks, PROGRESS_UPDATE, __LINE__); bbr->rc_tp->t_acktime = ticks; } else acked = 0; if (SEQ_LEQ(th_ack, tp->snd_una)) { /* Only sent here for sack processing */ goto proc_sack; } if (rsm && SEQ_GT(th_ack, rsm->r_start)) { changed = th_ack - rsm->r_start; } else if ((rsm == NULL) && ((th_ack - 1) == tp->iss)) { /* * For the SYN incoming case we will not have called * tcp_output for the sending of the SYN, so there will be * no map. All other cases should probably be a panic. */ if ((to->to_flags & TOF_TS) && (to->to_tsecr != 0)) { /* * We have a timestamp that can be used to generate * an initial RTT. */ uint32_t ts, now, rtt; ts = bbr_ts_convert(to->to_tsecr); now = bbr_ts_convert(tcp_tv_to_mssectick(&bbr->rc_tv)); rtt = now - ts; if (rtt < 1) rtt = 1; bbr_log_type_bbrrttprop(bbr, rtt, tp->iss, 0, cts, BBR_RTT_BY_TIMESTAMP, tp->iss, 0); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); changed = 1; bbr->r_wanted_output = 1; goto out; } goto proc_sack; } else if (rsm == NULL) { goto out; } if (changed) { /* * The ACK point is advancing to th_ack, we must drop off * the packets in the rack log and calculate any eligble * RTT's. */ bbr->r_wanted_output = 1; more: if (rsm == NULL) { if (tp->t_flags & TF_SENTFIN) { /* if we send a FIN we will not hav a map */ goto proc_sack; } #ifdef BBR_INVARIANTS panic("No rack map tp:%p for th:%p state:%d bbr:%p snd_una:%u snd_max:%u chg:%d\n", tp, th, tp->t_state, bbr, tp->snd_una, tp->snd_max, changed); #endif goto proc_sack; } } if (SEQ_LT(th_ack, rsm->r_start)) { /* Huh map is missing this */ #ifdef BBR_INVARIANTS printf("Rack map starts at r_start:%u for th_ack:%u huh? 
ts:%d rs:%d bbr:%p\n", rsm->r_start, th_ack, tp->t_state, bbr->r_state, bbr); panic("th-ack is bad bbr:%p tp:%p", bbr, tp); #endif goto proc_sack; } else if (th_ack == rsm->r_start) { /* None here to ack */ goto proc_sack; } /* * Clear the dup ack counter, it will * either be freed or if there is some * remaining we need to start it at zero. */ rsm->r_dupack = 0; /* Now do we consume the whole thing? */ if (SEQ_GEQ(th_ack, rsm->r_end)) { /* Its all consumed. */ uint32_t left; if (rsm->r_flags & BBR_ACKED) { /* * It was acked on the scoreboard -- remove it from * total */ p_acked += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); if (bbr->r_ctl.rc_sacked == 0) bbr->r_ctl.rc_sacklast = NULL; } else { bbr_update_rtt(tp, bbr, rsm, to, cts, BBR_CUM_ACKED, th_ack); if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; } if (rsm->r_flags & BBR_SACK_PASSED) { /* * There are acked segments ACKED on the * scoreboard further up. We are seeing * reordering. */ BBR_STAT_INC(bbr_reorder_seen); bbr->r_ctl.rc_reorder_ts = cts; if (rsm->r_flags & BBR_MARKED_LOST) { bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) /* LT sampling also needs adjustment */ bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; } } rsm->r_flags &= ~BBR_MARKED_LOST; } bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); if (rsm->r_in_tmap) { TAILQ_REMOVE(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } if (bbr->r_ctl.rc_next == rsm) { /* scoot along the marker */ bbr->r_ctl.rc_next = TAILQ_FIRST(&bbr->r_ctl.rc_map); } bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED); /* Adjust the packet counts */ left = th_ack - rsm->r_end; /* Free back to zone */ bbr_free(bbr, rsm); if (left) { rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); goto more; } goto proc_sack; } if (rsm->r_flags & BBR_ACKED) { /* * It was acked on the scoreboard -- remove it from total * for the part being cum-acked. */ p_acked += (rsm->r_end - rsm->r_start); bbr->r_ctl.rc_sacked -= (th_ack - rsm->r_start); if (bbr->r_ctl.rc_sacked == 0) bbr->r_ctl.rc_sacklast = NULL; } else { /* * It was acked up to th_ack point for the first time */ struct bbr_sendmap lrsm; memcpy(&lrsm, rsm, sizeof(struct bbr_sendmap)); lrsm.r_end = th_ack; bbr_update_rtt(tp, bbr, &lrsm, to, cts, BBR_CUM_ACKED, th_ack); } if ((rsm->r_flags & BBR_MARKED_LOST) && ((rsm->r_flags & BBR_ACKED) == 0)) { /* * It was marked lost and partly ack'd now * for the first time. We lower the rc_lost_bytes * and still leave it MARKED. */ bbr->r_ctl.rc_lost_bytes -= th_ack - rsm->r_start; } bbr_isit_a_pkt_epoch(bbr, cts, rsm, __LINE__, BBR_CUM_ACKED); bbr->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; /* adjust packet count */ rsm->r_start = th_ack; proc_sack: /* Check for reneging */ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); if (rsm && (rsm->r_flags & BBR_ACKED) && (th_ack == rsm->r_start)) { /* * The peer has moved snd_una up to the edge of this send, * i.e. one that it had previously acked. The only way that * can be true if the peer threw away data (space issues) * that it had previously sacked (else it would have given * us snd_una up to (rsm->r_end). We need to undo the acked * markings here. * * Note we have to look to make sure th_ack is our * rsm->r_start in case we get an old ack where th_ack is * behind snd_una. 
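 * Illustrative, hypothetical example: rsm covers [1000, 2000) and
 * is marked BBR_ACKED from an earlier sack; a cum-ack of exactly
 * 1000 means the peer has forgotten that sacked range, so we strip
 * the ACKED marking and put the data back on the transmit map.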
*/ bbr_peer_reneges(bbr, rsm, th->th_ack); } if ((to->to_flags & TOF_SACK) == 0) { /* We are done nothing left to log */ goto out; } rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_map, bbr_sendmap, r_next); if (rsm) { last_seq = rsm->r_end; } else { last_seq = tp->snd_max; } /* Sack block processing */ if (SEQ_GT(th_ack, tp->snd_una)) ack_point = th_ack; else ack_point = tp->snd_una; for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, ack_point) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, ack_point) && SEQ_LEQ(sack.end, tp->snd_max)) { if ((bbr->r_ctl.rc_num_small_maps_alloced > bbr_sack_block_limit) && (SEQ_LT(sack.end, last_seq)) && ((sack.end - sack.start) < (p_maxseg / 8))) { /* * Not the last piece and its smaller than * 1/8th of a p_maxseg. We ignore this. */ BBR_STAT_INC(bbr_runt_sacks); continue; } sack_blocks[num_sack_blks] = sack; num_sack_blks++; #ifdef NETFLIX_STATS } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { /* * Its a D-SACK block. */ tcp_record_dsack(sack.start, sack.end); #endif } } if (num_sack_blks == 0) goto out; /* * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. */ new_sb = sack_filter_blks(&bbr->r_ctl.bbr_sf, sack_blocks, num_sack_blks, th->th_ack); ctf_log_sack_filter(bbr->rc_tp, new_sb, sack_blocks); BBR_STAT_ADD(bbr_sack_blocks, num_sack_blks); BBR_STAT_ADD(bbr_sack_blocks_skip, (num_sack_blks - new_sb)); num_sack_blks = new_sb; if (num_sack_blks < 2) { goto do_sack_work; } /* Sort the sacks */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } /* * Now are any of the sack block ends the same (yes some * implememtations send these)? */ again: if (num_sack_blks > 1) { for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (sack_blocks[i].end == sack_blocks[j].end) { /* * Ok these two have the same end we * want the smallest end and then * throw away the larger and start * again. */ if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { /* * The second block covers * more area use that */ sack_blocks[i].start = sack_blocks[j].start; } /* * Now collapse out the dup-sack and * lower the count */ for (k = (j + 1); k < num_sack_blks; k++) { sack_blocks[j].start = sack_blocks[k].start; sack_blocks[j].end = sack_blocks[k].end; j++; } num_sack_blks--; goto again; } } } } do_sack_work: rsm = bbr->r_ctl.rc_sacklast; for (i = 0; i < num_sack_blks; i++) { acked = bbr_proc_sack_blk(tp, bbr, &sack_blocks[i], to, &rsm, cts); if (acked) { bbr->r_wanted_output = 1; changed += acked; sack_changed += acked; } } out: *prev_acked = p_acked; if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) { /* * Ok we have a high probability that we need to go in to * recovery since we have data sack'd */ struct bbr_sendmap *rsm; rsm = bbr_check_recovery_mode(tp, bbr, cts); if (rsm) { /* Enter recovery */ entered_recovery = 1; bbr->r_wanted_output = 1; /* * When we enter recovery we need to assure we send * one packet. 
*/ if (bbr->r_ctl.rc_resend == NULL) { bbr->r_ctl.rc_resend = rsm; } } } if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { /* * See if we need to rack-retransmit anything if so set it * up as the thing to resend assuming something else is not * already in that position. */ if (bbr->r_ctl.rc_resend == NULL) { bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); } } /* * We return the amount that changed via sack, this is used by the * ack-received code to augment what was changed between th_ack <-> * snd_una. */ return (sack_changed); } static void bbr_strike_dupack(struct tcp_bbr *bbr) { struct bbr_sendmap *rsm; rsm = TAILQ_FIRST(&bbr->r_ctl.rc_tmap); if (rsm && (rsm->r_dupack < 0xff)) { rsm->r_dupack++; if (rsm->r_dupack >= DUP_ACK_THRESHOLD) bbr->r_wanted_output = 1; } } /* * Return value of 1, we do not need to call bbr_process_data(). * return value of 0, bbr_process_data can be called. * For ret_val if its 0 the TCB is locked and valid, if its non-zero * its unlocked and probably unsafe to touch the TCB. */ static int bbr_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val) { int32_t ourfinisacked = 0; int32_t acked_amount; uint16_t nsegs; int32_t acked; uint32_t lost, sack_changed = 0; struct mbuf *mfree; struct tcp_bbr *bbr; uint32_t prev_acked = 0; bbr = (struct tcp_bbr *)tp->t_fb_ptr; lost = bbr->r_ctl.rc_lost; nsegs = max(1, m->m_pkthdr.lro_nsegs); if (SEQ_GT(th->th_ack, tp->snd_max)) { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); bbr->r_wanted_output = 1; return (1); } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { /* Process the ack */ if (bbr->rc_in_persist) tp->t_rxtshift = 0; if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) bbr_strike_dupack(bbr); sack_changed = bbr_log_ack(tp, to, th, &prev_acked); } bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, (bbr->r_ctl.rc_lost > lost)); if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* * Old ack, behind the last one rcv'd or a duplicate ack * with SACK info. */ if (th->th_ack == tp->snd_una) { bbr_ack_received(tp, bbr, th, 0, sack_changed, prev_acked, __LINE__, 0); if (bbr->r_state == TCPS_SYN_SENT) { /* * Special case on where we sent SYN. When * the SYN-ACK is processed in syn_sent * state it bumps the snd_una. This causes * us to hit here even though we did ack 1 * byte. * * Go through the nothing left case so we * send data. */ goto nothing_left; } } return (0); } /* * If we reach this point, ACK is not a duplicate, i.e., it ACKs * something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our SYN has * been ACK'd (so connection is now fully synchronized). Go * to non-starred state, increment snd_una for ACK of SYN, * and check if we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs); KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK arrives * within our recovery window, then it was a mistake to do the * retransmit in the first place. Recover our original cwnd and * ssthresh, and proceed to transmit where we left off. 
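 * Concretely that is the t_rxtshift == 1 plus t_badrxtwin window
 * test below; when the ack races our single RTO retransmit we
 * signal CC_RTO_ERR so the cwnd/ssthresh reduction is undone.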
*/ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); } SOCKBUF_LOCK(&so->so_snd); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; mfree = sbcut_locked(&so->so_snd, acked_amount); SOCKBUF_UNLOCK(&so->so_snd); tp->t_flags |= TF_WAKESOW; m_freem(mfree); if (SEQ_GT(th->th_ack, tp->snd_una)) { bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp)); } tp->snd_una = th->th_ack; bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, (bbr->r_ctl.rc_lost - lost)); if (IN_RECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover) && (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_bbr_partialack(tp); } else { bbr_post_recovery(tp); } } if (SEQ_GT(tp->snd_una, tp->snd_recover)) { tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tp->snd_nxt = tp->snd_max; } if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ nothing_left: bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__); if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) bbr->rc_tp->t_acktime = 0; if ((sbused(&so->so_snd) == 0) && (tp->t_flags & TF_SENTFIN)) { ourfinisacked = 1; } bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); if (bbr->rc_in_persist == 0) { bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime; } sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime); /* * We invalidate the last ack here since we * don't want to transfer forward the time * for our sum's calculations. */ if ((tp->t_state >= TCPS_FIN_WAIT_1) && (sbavail(&so->so_snd) == 0) && (tp->t_flags2 & TF2_DROP_AF_DATA)) { /* * The socket was gone and the peer sent data, time * to reset him. */ *ret_val = 1; tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); /* tcp_close will kill the inp pre-log the Reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); BBR_STAT_INC(bbr_dropped_af_data); return (1); } /* Set need output so persist might get set */ bbr->r_wanted_output = 1; } if (ofia) *ofia = ourfinisacked; return (0); } static void bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line) { if (bbr->rc_in_persist == 0) { bbr_timer_cancel(bbr, __LINE__, cts); bbr->r_ctl.rc_last_delay_val = 0; tp->t_rxtshift = 0; bbr->rc_in_persist = 1; bbr->r_ctl.rc_went_idle_time = cts; /* We should be capped when rw went to 0 but just in case */ bbr_log_type_pesist(bbr, cts, 0, line, 1); /* Time freezes for the state, so do the accounting now */ if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { uint32_t time_in; time_in = cts - bbr->r_ctl.rc_bbr_state_time; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { int32_t idx; idx = bbr_state_val(bbr); counter_u64_add(bbr_state_time[(idx + 5)], time_in); } else { counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } } bbr->r_ctl.rc_bbr_state_time = cts; } } static void bbr_restart_after_idle(struct tcp_bbr *bbr, uint32_t cts, uint32_t idle_time) { /* * Note that if idle time does not exceed our * threshold, we do nothing continuing the state * transitions we were last walking through. */ if (idle_time >= bbr_idle_restart_threshold) { if (bbr->rc_use_idle_restart) { bbr->rc_bbr_state = BBR_STATE_IDLE_EXIT; /* * Set our target using BBR_UNIT, so * we increase at a dramatic rate but * we stop when we get the pipe * full again for our current b/w estimate. 
*/ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; bbr_set_state_target(bbr, __LINE__); /* Now setup our gains to ramp up */ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; bbr_log_type_statechange(bbr, cts, __LINE__); } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { bbr_substate_change(bbr, cts, __LINE__, 1); } } } static void bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line) { uint32_t idle_time; if (bbr->rc_in_persist == 0) return; idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time); bbr->rc_in_persist = 0; bbr->rc_hit_state_1 = 0; bbr->r_ctl.rc_del_time = cts; /* * We invalidate the last ack here since we * don't want to transfer forward the time * for our sum's calculations. */ if (bbr->rc_inp->inp_in_hpts) { tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); bbr->rc_timer_first = 0; bbr->r_ctl.rc_hpts_flags = 0; bbr->r_ctl.rc_last_delay_val = 0; bbr->r_ctl.rc_hptsi_agg_delay = 0; bbr->r_agg_early_set = 0; bbr->r_ctl.rc_agg_early = 0; } bbr_log_type_pesist(bbr, cts, idle_time, line, 0); if (idle_time >= bbr_rtt_probe_time) { /* * This qualifies as a RTT_PROBE session since we drop the * data outstanding to nothing and waited more than * bbr_rtt_probe_time. */ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_PERSIST, 0); bbr->r_ctl.last_in_probertt = bbr->r_ctl.rc_rtt_shrinks = cts; } tp->t_rxtshift = 0; /* * If in probeBW and we have persisted more than an RTT lets do * special handling. */ /* Force a time based epoch */ bbr_set_epoch(bbr, cts, __LINE__); /* * Setup the lost so we don't count anything against the guy * we have been stuck with during persists. */ bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; /* Time un-freezes for the state */ bbr->r_ctl.rc_bbr_state_time = cts; if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) || (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT)) { /* * If we are going back to probe-bw * or probe_rtt, we may need to possibly * do a fast restart. */ bbr_restart_after_idle(bbr, cts, idle_time); } } static void bbr_collapsed_window(struct tcp_bbr *bbr) { /* * Now we must walk the * send map and divide the * ones left stranded. These * guys can't cause us to abort * the connection and are really * "unsent". However if a buggy * client actually did keep some * of the data i.e. collapsed the win * and refused to ack and then opened * the win and acked that data. We would * get into an ack war, the simplier * method then of just pretending we * did not send those segments something * won't work. */ struct bbr_sendmap *rsm, *nrsm; tcp_seq max_seq; uint32_t maxseg; int can_split = 0; int fnd = 0; maxseg = bbr->rc_tp->t_maxseg - bbr->rc_last_options; max_seq = bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd; bbr_log_type_rwnd_collapse(bbr, max_seq, 1, 0); TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { /* Find the first seq past or at maxseq */ if (rsm->r_flags & BBR_RWND_COLLAPSED) rsm->r_flags &= ~BBR_RWND_COLLAPSED; if (SEQ_GEQ(max_seq, rsm->r_start) && SEQ_GEQ(rsm->r_end, max_seq)) { fnd = 1; break; } } bbr->rc_has_collapsed = 0; if (!fnd) { /* Nothing to do strange */ return; } /* * Now can we split? * * We don't want to split if splitting * would generate too many small segments * less we let an attacker fragment our * send_map and leave us out of memory. */ if ((max_seq != rsm->r_start) && (max_seq != rsm->r_end)){ /* can we split? 
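 *
 * A split is allowed when both resulting pieces would be at least
 * maxseg/8 bytes, or, failing that, while the number of small map
 * entries already allocated is still below bbr_sack_block_limit.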
*/ int res1, res2; res1 = max_seq - rsm->r_start; res2 = rsm->r_end - max_seq; if ((res1 >= (maxseg/8)) && (res2 >= (maxseg/8))) { /* No small pieces here */ can_split = 1; } else if (bbr->r_ctl.rc_num_small_maps_alloced < bbr_sack_block_limit) { /* We are under the limit */ can_split = 1; } } /* Ok do we need to split this rsm? */ if (max_seq == rsm->r_start) { /* It's this guy no split required */ nrsm = rsm; } else if (max_seq == rsm->r_end) { /* It's the next one no split required. */ nrsm = TAILQ_NEXT(rsm, r_next); if (nrsm == NULL) { /* Huh? */ return; } } else if (can_split && SEQ_LT(max_seq, rsm->r_end)) { /* yep we need to split it */ nrsm = bbr_alloc_limit(bbr, BBR_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* failed XXXrrs what can we do mark the whole? */ nrsm = rsm; goto no_split; } /* Clone it */ bbr_log_type_rwnd_collapse(bbr, max_seq, 3, 0); bbr_clone_rsm(bbr, nrsm, rsm, max_seq); TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&bbr->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } } else { /* * Split not allowed just start here just * use this guy. */ nrsm = rsm; } no_split: BBR_STAT_INC(bbr_collapsed_win); /* reuse fnd as a count */ fnd = 0; TAILQ_FOREACH_FROM(nrsm, &bbr->r_ctl.rc_map, r_next) { nrsm->r_flags |= BBR_RWND_COLLAPSED; fnd++; bbr->rc_has_collapsed = 1; } bbr_log_type_rwnd_collapse(bbr, max_seq, 4, fnd); } static void bbr_un_collapse_window(struct tcp_bbr *bbr) { struct bbr_sendmap *rsm; int cleared = 0; TAILQ_FOREACH_REVERSE(rsm, &bbr->r_ctl.rc_map, bbr_head, r_next) { if (rsm->r_flags & BBR_RWND_COLLAPSED) { /* Clear the flag */ rsm->r_flags &= ~BBR_RWND_COLLAPSED; cleared++; } else break; } bbr_log_type_rwnd_collapse(bbr, (bbr->rc_tp->snd_una + bbr->rc_tp->snd_wnd), 0, cleared); bbr->rc_has_collapsed = 0; } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { /* * Update window information. Don't look at window if no ACK: TAC's * send garbage on first SYN. */ uint16_t nsegs; int32_t tfo_syn; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); nsegs = max(1, m->m_pkthdr.lro_nsegs); if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) KMOD_TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; bbr->r_wanted_output = 1; } else if (thflags & TH_ACK) { if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; } } if (tp->snd_wnd < ctf_outstanding(tp)) /* The peer collapsed its window on us */ bbr_collapsed_window(bbr); else if (bbr->rc_has_collapsed) bbr_un_collapse_window(bbr); /* Was persist timer active and now we have window space? 
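 *
 * The window counts as open again once snd_wnd reaches
 * min(rc_high_rwnd/2, bbr_minseg(bbr)); at that point we exit
 * persist and force an output pass so the timers are restarted.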
*/ if ((bbr->rc_in_persist != 0) && (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr)))) { /* * Make the rate persist at end of persist mode if idle long * enough */ bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); /* Make sure we output to start the timer */ bbr->r_wanted_output = 1; } /* Do we need to enter persist? */ if ((bbr->rc_in_persist == 0) && (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_max == tp->snd_una) && sbavail(&tp->t_inpcb->inp_socket->so_snd) && (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { /* No send window.. we must enter persist */ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); } if (tp->t_flags2 & TF2_DROP_AF_DATA) { m_freem(m); return (0); } /* * We don't support urgent data but * drag along the up just to make sure * if there is a stack switch no one * is surprised. */ tp->rcv_up = tp->rcv_nxt; INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing * queue, and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data is * presented to the user (this happens in tcp_usrreq.c, case * PRU_RCVD). If a FIN has already been received on this connection * then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly * queue with control block tp. Set thflags to whether * reassembly now includes a segment with FIN. This handles * the common case inline (segment is the next to be * received on an established connection, and the queue is * empty), avoiding linkage into and removal from the queue * and repetition of various conversions. Set DELACK for * segments received in order, but ack immediately when * segments are out of order (so fast retransmit can work). 
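 *
 * In the in-sequence case below the mbuf chain is appended straight
 * to the receive socket buffer (after the optional shared-limit
 * check under NETFLIX_SB_LIMITS); everything else goes through
 * tcp_reass() and is acknowledged immediately via TF_ACKNOW.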
*/ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { #ifdef NETFLIX_SB_LIMITS u_int mcnt, appended; if (so->so_rcv.sb_shlim) { mcnt = m_memcnt(m); appended = 0; if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, CFO_NOSLEEP, NULL) == false) { counter_u64_add(tcp_sb_shlim_fails, 1); m_freem(m); return (0); } } #endif if (DELAY_ACK(tp, bbr, nsegs) || tfo_syn) { bbr->bbr_segs_rcvd += max(1, nsegs); tp->t_flags |= TF_DELACK; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } else { bbr->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } thflags = th->th_flags & TH_FIN; KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else #ifdef NETFLIX_SB_LIMITS appended = #endif sbappendstream_locked(&so->so_rcv, m, 0); SOCKBUF_UNLOCK(&so->so_rcv); tp->t_flags |= TF_WAKESOR; #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim && appended != mcnt) counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); #endif } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs when * trimming from the head. */ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. */ tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. */ tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { tcp_update_dsack_list(tp, save_start, save_start + tlen); } } } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know that the * connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* The socket upcall is handled by socantrcvmore. */ tp->t_flags &= ~TF_WAKESOR; /* * If connection is half-synchronized (ie NEEDSYN * flag on) then delay ACK, so it may be piggybacked * when SYN is sent. Otherwise, since we received a * FIN then no more input can be expected, send ACK * now. */ if (tp->t_flags & TF_NEEDSYN) { tp->t_flags |= TF_DELACK; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } else { tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES enter the * CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been * acked so enter the CLOSING state. 
*/ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the * other standard timers. */ case TCPS_FIN_WAIT_2: bbr->rc_timer_first = 1; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); INP_WLOCK_ASSERT(tp->t_inpcb); tcp_twstart(tp); return (1); } } /* * Return any desired output. */ if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > ctf_outstanding(tp))) { bbr->r_wanted_output = 1; } INP_WLOCK_ASSERT(tp->t_inpcb); return (0); } /* * Here nothing is really faster, its just that we * have broken out the fast-data path also just like * the fast-ack. Return 1 if we processed the packet * return 0 if you need to take the "slow-path". */ static int bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt) { uint16_t nsegs; int32_t newsize = 0; /* automatic sockbuf scaling */ struct tcp_bbr *bbr; #ifdef NETFLIX_SB_LIMITS u_int mcnt, appended; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* On the hpts and we would have called output */ bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if (bbr->r_ctl.rc_resend != NULL) { return (0); } if (tiwin && tiwin != tp->snd_wnd) { return (0); } if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { return (0); } if (__predict_false((to->to_flags & TOF_TS) && (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { return (0); } if (__predict_false((th->th_ack != tp->snd_una))) { return (0); } if (__predict_false(tlen > sbspace(&so->so_rcv))) { return (0); } if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * This is a pure, in-sequence data packet with nothing on the * reassembly queue and we have enough buffer space to take it. */ nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim) { mcnt = m_memcnt(m); appended = 0; if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, CFO_NOSLEEP, NULL) == false) { counter_u64_add(tcp_sb_shlim_fails, 1); m_freem(m); return (1); } } #endif /* Clean receiver SACK report if present */ if (tp->rcv_numsacks) tcp_clean_sackreport(tp); KMOD_TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } /* * Pull snd_wl1 up to prevent seq wrap relative to th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; KMOD_TCPSTAT_ADD(tcps_rcvpack, (int)nsegs); KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. 
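 * The candidate size was computed by tcp_autorcvbuf() above.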
Give up when limit is * reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ #ifdef NETFLIX_SB_LIMITS appended = #endif sbappendstream_locked(&so->so_rcv, m, 0); ctf_calc_rwin(so, tp); } SOCKBUF_UNLOCK(&so->so_rcv); tp->t_flags |= TF_WAKESOR; #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim && mcnt != appended) counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); #endif if (DELAY_ACK(tp, bbr, nsegs)) { bbr->bbr_segs_rcvd += max(1, nsegs); tp->t_flags |= TF_DELACK; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } else { bbr->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; } return (1); } /* * This subfunction is used to try to highly optimize the * fast path. We again allow window updates that are * in sequence to remain in the fast-path. We also add * in the __predict's to attempt to help the compiler. * Note that if we return a 0, then we can *not* process * it and the caller should push the packet into the * slow-path. If we return 1, then all is well and * the packet is fully processed. */ static int bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) { int32_t acked; uint16_t nsegs; uint32_t sack_changed; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif uint32_t prev_acked = 0; struct tcp_bbr *bbr; if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* Old ack, behind (or duplicate to) the last one rcv'd */ return (0); } if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { /* Above what we have sent? */ return (0); } if (__predict_false(tiwin == 0)) { /* zero window */ return (0); } if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { /* We need a SYN or a FIN, unlikely.. */ return (0); } if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { /* Timestamp is behind .. old ack with seq wrap? */ return (0); } if (__predict_false(IN_RECOVERY(tp->t_flags))) { /* Still recovering */ return (0); } bbr = (struct tcp_bbr *)tp->t_fb_ptr; if (__predict_false(bbr->r_ctl.rc_resend != NULL)) { /* We are retransmitting */ return (0); } if (__predict_false(bbr->rc_in_persist != 0)) { /* In persist mode */ return (0); } if (bbr->r_ctl.rc_sacked) { /* We have sack holes on our scoreboard */ return (0); } /* Ok if we reach here, we can process a fast-ack */ nsegs = max(1, m->m_pkthdr.lro_nsegs); sack_changed = bbr_log_ack(tp, to, th, &prev_acked); /* * We never detect loss in fast ack [we can't * have a sack and can't be in recovery so * we always pass 0 (nothing detected)]. */ bbr_lt_bw_sampling(bbr, bbr->r_ctl.rc_rcvtime, 0); /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } /* Do we need to exit persists? */ if ((bbr->rc_in_persist != 0) && (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr)))) { bbr_exit_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); bbr->r_wanted_output = 1; } /* Do we need to enter persists? 
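 * We do so only when the connection is established, everything we
 * have sent is acked (snd_max == snd_una), there is queued data
 * that exceeds the peer's window, and snd_wnd has fallen below
 * min(rc_high_rwnd/2, bbr_minseg(bbr)).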
*/ if ((bbr->rc_in_persist == 0) && (tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_max == tp->snd_una) && sbavail(&tp->t_inpcb->inp_socket->so_snd) && (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { /* No send window.. we must enter persist */ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); } /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = bbr->r_ctl.rc_rcvtime; tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ KMOD_TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) bbr_cong_signal(tp, th, CC_RTO_ERR, NULL); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies during the SYN+ACK * phase, ignore timestamps of 0 or we could calculate a huge RTT * and blow up the retransmit timer. */ acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif KMOD_TCPSTAT_ADD(tcps_rcvackpack, (int)nsegs); KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(th->th_ack, tp->snd_una)) bbr_collapse_rtt(tp, bbr, TCP_REXMTVAL(tp)); tp->snd_una = th->th_ack; if (tp->snd_wnd < ctf_outstanding(tp)) /* The peer collapsed its window on us */ bbr_collapsed_window(bbr); else if (bbr->rc_has_collapsed) bbr_un_collapse_window(bbr); if (SEQ_GT(tp->snd_una, tp->snd_recover)) { tp->snd_recover = tp->snd_una; } bbr_ack_received(tp, bbr, th, acked, sack_changed, prev_acked, __LINE__, 0); /* * Pull snd_wl2 up to prevent seq wrap relative to th_ack. */ tp->snd_wl2 = th->th_ack; m_freem(m); /* * If all outstanding data are acked, stop retransmit timer, * otherwise restart timer using current (possibly backed-off) * value. If process is waiting for space, wakeup/selwakeup/signal. * If data are ready to send, let tcp_output decide between more * output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif /* Wake up the socket if we have room to write more */ tp->t_flags |= TF_WAKESOW; if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_CLEAR, __LINE__); if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) bbr->rc_tp->t_acktime = 0; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); if (bbr->rc_in_persist == 0) { bbr->r_ctl.rc_went_idle_time = bbr->r_ctl.rc_rcvtime; } sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); bbr_log_ack_clear(bbr, bbr->r_ctl.rc_rcvtime); /* * We invalidate the last ack here since we * don't want to transfer forward the time * for our sum's calculations. */ bbr->r_wanted_output = 1; } if (sbavail(&so->so_snd)) { bbr->r_wanted_output = 1; } return (1); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. 
*/ static int bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t todrop; int32_t ourfinisacked = 0; struct tcp_bbr *bbr; int32_t ret_val = 0; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); /* * If the state is SYN_SENT: if seg contains an ACK, but not for our * SYN, drop the input. if seg contains a RST, then drop the * connection. if seg does not contain SYN, then drop it. Otherwise * this is an acceptable SYN segment initialize tp->rcv_nxt and * tp->irs if seg contains ack then advance tp->snd_una. BRR does * not support ECN so we will not say we are capable. if SYN has * been acked change to ESTABLISHED else SYN_RCVD state arrange for * segment to be acked (eventually) continue processing rest of * data/controls, beginning with URG */ if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); ctf_do_drop(m, tp); return (1); } if (thflags & TH_RST) { ctf_do_drop(m, tp); return (1); } if (!(thflags & TH_SYN)) { ctf_do_drop(m, tp); return (1); } tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { int tfo_partial = 0; KMOD_TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial = 1; } /* * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. */ if (DELAY_ACK(tp, bbr, 1) && tlen != 0 && !tfo_partial) { bbr->bbr_segs_rcvd += 1; tp->t_flags |= TF_DELACK; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); } else { bbr->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; } if (SEQ_GT(th->th_ack, tp->iss)) { /* * The SYN is acked * handle it specially. */ bbr_log_syn(tp, to); } if (SEQ_GT(th->th_ack, tp->snd_una)) { /* * We advance snd_una for the * fast open case. If th_ack is * acknowledging data beyond * snd_una we can't just call * ack-processing since the * data stream in our send-map * will start at snd_una + 1 (one * beyond the SYN). If its just * equal we don't need to do that * and there is no send_map. */ tp->snd_una++; } /* * Received in SYN_SENT[*] state. Transitions: * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => simultaneous * open. If segment contains CC option and there is a * cached CC, apply TAO test. If it succeeds, connection is * * half-synchronized. 
Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If * there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_state_change(tp, TCPS_SYN_RECEIVED); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. If data, * trim to stay within window, dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. If the * remote host used T/TCP to validate the SYN, our data will be * ACK'd; if so, enter normal data segment processing in the middle * of step 5, ack processing. Otherwise, goto step 6. */ if (thflags & TH_ACK) { if ((to->to_flags & TOF_TS) != 0) { uint32_t t, rtt; t = tcp_tv_to_mssectick(&bbr->rc_tv); if (TSTMP_GEQ(t, to->to_tsecr)) { rtt = t - to->to_tsecr; if (rtt == 0) { rtt = 1; } rtt *= MS_IN_USEC; tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime); } } if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) return (ret_val); /* We may have changed to FIN_WAIT_1 above */ if (tp->t_state == TCPS_FIN_WAIT_1) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now * acknowledged then enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then * closing user can proceed. Starting the * timer is contrary to the specification, * but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and * use a compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the only valid * packets are the initial SYN, a retransmit/copy of the * initial SYN (possibly with a subset of the original * data), a valid ACK, a FIN, or a RST. 
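 *
 * Accordingly, a SYN|ACK in this state draws a reset below, a bare
 * SYN is silently dropped while a retransmit/TLP/RACK timer is
 * still pending, and a segment carrying none of ACK, FIN or RST is
 * dropped as well.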
*/ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (bbr->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (bbr->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { ctf_do_drop(m, NULL); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { ctf_do_drop(m, NULL); return (0); } } if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know the * sequence numbers haven't wrapped. This is a partial fix for the * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } tp->snd_wnd = tiwin; /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (IS_FASTOPEN(tp->t_flags)) { cc_conn_init(tp); } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } KMOD_TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } /* * ok for the first time in lets see if we can use the ts to figure * out what the initial RTT was. 
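 *
 * The sample is (current ms tick - tsecr), clamped to a minimum of
 * one, converted to usec via MS_IN_USEC, and then fed both to the
 * transmit timer machinery and to the rc_rttprop min filter.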
*/ if ((to->to_flags & TOF_TS) != 0) { uint32_t t, rtt; t = tcp_tv_to_mssectick(&bbr->rc_tv); if (TSTMP_GEQ(t, to->to_tsecr)) { rtt = t - to->to_tsecr; if (rtt == 0) { rtt = 1; } rtt *= MS_IN_USEC; tcp_bbr_xmit_timer(bbr, rtt, 0, 0, 0); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, bbr->r_ctl.rc_rcvtime); } } /* Drop off any SYN in the send map (probably not there) */ if (thflags & TH_ACK) bbr_log_syn(tp, to); if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) cc_conn_init(tp); } /* * Account for the ACK of our SYN prior to * regular ACK processing below, except for * simultaneous SYN, which is handled later. */ if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) tp->snd_una++; /* * If segment contains data or ACK, will call tcp_reass() later; if * not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (tp->t_state == TCPS_FIN_WAIT_1) { /* We could have went to FIN_WAIT_1 (or EST) above */ /* * In FIN_WAIT_1 STATE in addition to the processing for the * ESTABLISHED state if our FIN is now acknowledged then * enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then closing * user can proceed. Starting the timer is contrary * to the specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { struct tcp_bbr *bbr; int32_t ret_val; /* * Header prediction: check for the two common cases of a * uni-directional data xfer. If the packet has no control flags, * is in-sequence, the window didn't change and we're not * retransmitting, it's a candidate. If the length is zero and the * ack moved forward, we're the sender side of the xfer. Just free * the data acked & wake any higher level process that was blocked * waiting for space. If the length is non-zero and the ack didn't * move, we're the receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data toc The socket * buffer and note that we need a delayed ack. 
Make sure that the * hidden state-flags are also off. Since we check for * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. */ bbr = (struct tcp_bbr *)tp->t_fb_ptr; if (bbr->r_ctl.rc_delivered < (4 * tp->t_maxseg)) { /* * If we have delived under 4 segments increase the initial * window if raised by the peer. We use this to determine * dynamic and static rwnd's at the end of a connection. */ bbr->r_ctl.rc_init_rwnd = max(tiwin, tp->snd_wnd); } if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) && __predict_true(SEGQ_EMPTY(tp)) && __predict_true(th->th_seq == tp->rcv_nxt)) { if (tlen == 0) { if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt, iptos)) { return (0); } } else { if (bbr_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt)) { return (0); } } } ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. 
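 *
 * A non-zero return from bbr_process_ack() means the TCB may
 * already be unlocked and gone, so we just hand back ret_val;
 * otherwise we also run the progress-timeout check before passing
 * the segment on to bbr_process_data().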
*/ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } /* State changes only happen in bbr_process_data() */ return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { struct tcp_bbr *bbr; int32_t ret_val; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. 
*/ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static int bbr_check_data_after_close(struct mbuf *m, struct tcp_bbr *bbr, struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so) { if (bbr->rc_allow_data_af_clo == 0) { close_now: tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); /* tcp_close will kill the inp pre-log the Reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); KMOD_TCPSTAT_INC(tcps_rcvafterclose); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); return (1); } if (sbavail(&so->so_snd) == 0) goto close_now; /* Ok we allow data that is ignored and a followup reset */ tp->rcv_nxt = th->th_seq + *tlen; tp->t_flags2 |= TF2_DROP_AF_DATA; bbr->r_wanted_output = 1; *tlen = 0; return (0); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { /* * We call a new function now so we might continue and setup * to reset at all data being ack'd. */ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. 
*/ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { /* * If we can't receive any more data, then closing user can * proceed. Starting the timer is contrary to the * specification, but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { /* * We call a new function now so we might continue and setup * to reset at all data being ack'd. */ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 
3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { tcp_twstart(tp); m_freem(m); return (1); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { /* * We call a new function now so we might continue and setup * to reset at all data being ack'd. */ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 
3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * case TCPS_LAST_ACK: Ack processing. */ if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { tp = tcp_close(tp); ctf_do_drop(m, tp); return (1); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCB is still * locked. */ static int bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ourfinisacked = 0; int32_t ret_val; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } INP_WLOCK_ASSERT(tp->t_inpcb); if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then we may RST the other end depending on the outcome * of bbr_check_data_after_close. */ if ((so->so_state & SS_NOFDREF) && tlen) { /* * We call a new function now so we might continue and setup * to reset at all data being ack'd. */ if (bbr_check_data_after_close(m, bbr, tp, &tlen, th, so)) return (1); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 
2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ INP_WLOCK_ASSERT(tp->t_inpcb); if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); bbr->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ INP_WLOCK_ASSERT(tp->t_inpcb); if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } INP_WLOCK_ASSERT(tp->t_inpcb); return (bbr_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static void bbr_stop_all_timers(struct tcpcb *tp) { struct tcp_bbr *bbr; /* * Assure no timers are running. 
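 *
 * If the persist timer is currently armed, note that in
 * rc_in_persist before it is suspended, so the persist condition
 * is not silently lost.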
*/ if (tcp_timer_active(tp, TT_PERSIST)) { /* We enter in persists, set the flag appropriately */ bbr = (struct tcp_bbr *)tp->t_fb_ptr; bbr->rc_in_persist = 1; } tcp_timer_suspend(tp, TT_PERSIST); tcp_timer_suspend(tp, TT_REXMT); tcp_timer_suspend(tp, TT_KEEP); tcp_timer_suspend(tp, TT_DELACK); } static void bbr_google_mode_on(struct tcp_bbr *bbr) { bbr->rc_use_google = 1; bbr->rc_no_pacing = 0; bbr->r_ctl.bbr_google_discount = bbr_google_discount; bbr->r_use_policer = bbr_policer_detection_enabled; bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10); bbr->bbr_use_rack_cheat = 0; bbr->r_ctl.rc_incr_tmrs = 0; bbr->r_ctl.rc_inc_tcp_oh = 0; bbr->r_ctl.rc_inc_ip_oh = 0; bbr->r_ctl.rc_inc_enet_oh = 0; reset_time(&bbr->r_ctl.rc_delrate, BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT); reset_time_small(&bbr->r_ctl.rc_rttprop, (11 * USECS_IN_SECOND)); tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv)); } static void bbr_google_mode_off(struct tcp_bbr *bbr) { bbr->rc_use_google = 0; bbr->r_ctl.bbr_google_discount = 0; bbr->no_pacing_until = bbr_no_pacing_until; bbr->r_use_policer = 0; if (bbr->no_pacing_until) bbr->rc_no_pacing = 1; else bbr->rc_no_pacing = 0; if (bbr_use_rack_resend_cheat) bbr->bbr_use_rack_cheat = 1; else bbr->bbr_use_rack_cheat = 0; if (bbr_incr_timers) bbr->r_ctl.rc_incr_tmrs = 1; else bbr->r_ctl.rc_incr_tmrs = 0; if (bbr_include_tcp_oh) bbr->r_ctl.rc_inc_tcp_oh = 1; else bbr->r_ctl.rc_inc_tcp_oh = 0; if (bbr_include_ip_oh) bbr->r_ctl.rc_inc_ip_oh = 1; else bbr->r_ctl.rc_inc_ip_oh = 0; if (bbr_include_enet_oh) bbr->r_ctl.rc_inc_enet_oh = 1; else bbr->r_ctl.rc_inc_enet_oh = 0; bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; reset_time(&bbr->r_ctl.rc_delrate, bbr_num_pktepo_for_del_limit); reset_time_small(&bbr->r_ctl.rc_rttprop, (bbr_filter_len_sec * USECS_IN_SECOND)); tcp_bbr_tso_size_check(bbr, tcp_get_usecs(&bbr->rc_tv)); } /* * Return 0 on success, non-zero on failure * which indicates the error (usually no memory). */ static int bbr_init(struct tcpcb *tp) { struct tcp_bbr *bbr = NULL; struct inpcb *inp; uint32_t cts; tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO)); if (tp->t_fb_ptr == NULL) { /* * We need to allocate memory but cant. The INP and INP_INFO * locks and they are recusive (happens during setup. 
So a * scheme to drop the locks fails :( * */ return (ENOMEM); } bbr = (struct tcp_bbr *)tp->t_fb_ptr; bbr->rtt_valid = 0; inp = tp->t_inpcb; inp->inp_flags2 |= INP_CANNOT_DO_ECN; inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; TAILQ_INIT(&bbr->r_ctl.rc_map); TAILQ_INIT(&bbr->r_ctl.rc_free); TAILQ_INIT(&bbr->r_ctl.rc_tmap); bbr->rc_tp = tp; if (tp->t_inpcb) { bbr->rc_inp = tp->t_inpcb; } cts = tcp_get_usecs(&bbr->rc_tv); tp->t_acktime = 0; bbr->rc_allow_data_af_clo = bbr_ignore_data_after_close; bbr->r_ctl.rc_reorder_fade = bbr_reorder_fade; bbr->rc_tlp_threshold = bbr_tlp_thresh; bbr->r_ctl.rc_reorder_shift = bbr_reorder_thresh; bbr->r_ctl.rc_pkt_delay = bbr_pkt_delay; bbr->r_ctl.rc_min_to = bbr_min_to; bbr->rc_bbr_state = BBR_STATE_STARTUP; bbr->r_ctl.bbr_lost_at_state = 0; bbr->r_ctl.rc_lost_at_startup = 0; bbr->rc_all_timers_stopped = 0; bbr->r_ctl.rc_bbr_lastbtlbw = 0; bbr->r_ctl.rc_pkt_epoch_del = 0; bbr->r_ctl.rc_pkt_epoch = 0; bbr->r_ctl.rc_lowest_rtt = 0xffffffff; bbr->r_ctl.rc_bbr_hptsi_gain = bbr_high_gain; bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain; bbr->r_ctl.rc_went_idle_time = cts; bbr->rc_pacer_started = cts; bbr->r_ctl.rc_pkt_epoch_time = cts; bbr->r_ctl.rc_rcvtime = cts; bbr->r_ctl.rc_bbr_state_time = cts; bbr->r_ctl.rc_del_time = cts; bbr->r_ctl.rc_tlp_rxt_last_time = cts; bbr->r_ctl.last_in_probertt = cts; bbr->skip_gain = 0; bbr->gain_is_limited = 0; bbr->no_pacing_until = bbr_no_pacing_until; if (bbr->no_pacing_until) bbr->rc_no_pacing = 1; if (bbr_use_google_algo) { bbr->rc_no_pacing = 0; bbr->rc_use_google = 1; bbr->r_ctl.bbr_google_discount = bbr_google_discount; bbr->r_use_policer = bbr_policer_detection_enabled; } else { bbr->rc_use_google = 0; bbr->r_ctl.bbr_google_discount = 0; bbr->r_use_policer = 0; } if (bbr_ts_limiting) bbr->rc_use_ts_limit = 1; else bbr->rc_use_ts_limit = 0; if (bbr_ts_can_raise) bbr->ts_can_raise = 1; else bbr->ts_can_raise = 0; if (V_tcp_delack_enabled == 1) tp->t_delayed_ack = 2; else if (V_tcp_delack_enabled == 0) tp->t_delayed_ack = 0; else if (V_tcp_delack_enabled < 100) tp->t_delayed_ack = V_tcp_delack_enabled; else tp->t_delayed_ack = 2; if (bbr->rc_use_google == 0) bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; else bbr->r_ctl.rc_probertt_int = (USECS_IN_SECOND * 10); bbr->r_ctl.rc_min_rto_ms = bbr_rto_min_ms; bbr->rc_max_rto_sec = bbr_rto_max_sec; bbr->rc_init_win = bbr_def_init_win; if (tp->t_flags & TF_REQ_TSTMP) bbr->rc_last_options = TCP_TS_OVERHEAD; bbr->r_ctl.rc_pace_max_segs = tp->t_maxseg - bbr->rc_last_options; bbr->r_ctl.rc_high_rwnd = tp->snd_wnd; bbr->r_init_rtt = 1; counter_u64_add(bbr_flows_nohdwr_pacing, 1); if (bbr_allow_hdwr_pacing) bbr->bbr_hdw_pace_ena = 1; else bbr->bbr_hdw_pace_ena = 0; if (bbr_sends_full_iwnd) bbr->bbr_init_win_cheat = 1; else bbr->bbr_init_win_cheat = 0; bbr->r_ctl.bbr_utter_max = bbr_hptsi_utter_max; bbr->r_ctl.rc_drain_pg = bbr_drain_gain; bbr->r_ctl.rc_startup_pg = bbr_high_gain; bbr->rc_loss_exit = bbr_exit_startup_at_loss; bbr->r_ctl.bbr_rttprobe_gain_val = bbr_rttprobe_gain; bbr->r_ctl.bbr_hptsi_per_second = bbr_hptsi_per_second; bbr->r_ctl.bbr_hptsi_segments_delay_tar = bbr_hptsi_segments_delay_tar; bbr->r_ctl.bbr_hptsi_segments_max = bbr_hptsi_segments_max; bbr->r_ctl.bbr_hptsi_segments_floor = bbr_hptsi_segments_floor; bbr->r_ctl.bbr_hptsi_bytes_min = bbr_hptsi_bytes_min; bbr->r_ctl.bbr_cross_over = bbr_cross_over; bbr->r_ctl.rc_rtt_shrinks = cts; if (bbr->rc_use_google) { setup_time_filter(&bbr->r_ctl.rc_delrate, FILTER_TYPE_MAX, BBR_NUM_RTTS_FOR_GOOG_DEL_LIMIT); 
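/*
 * rc_delrate keeps a windowed maximum of the measured delivery rate,
 * while rc_rttprop (set up just below) keeps a windowed minimum of
 * the round-trip time; in google mode the rtt-prop window is held at
 * 11 seconds instead of the bbr_filter_len_sec used otherwise.
 */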
setup_time_filter_small(&bbr->r_ctl.rc_rttprop, FILTER_TYPE_MIN, (11 * USECS_IN_SECOND)); } else { setup_time_filter(&bbr->r_ctl.rc_delrate, FILTER_TYPE_MAX, bbr_num_pktepo_for_del_limit); setup_time_filter_small(&bbr->r_ctl.rc_rttprop, FILTER_TYPE_MIN, (bbr_filter_len_sec * USECS_IN_SECOND)); } bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_INIT, 0); if (bbr_uses_idle_restart) bbr->rc_use_idle_restart = 1; else bbr->rc_use_idle_restart = 0; bbr->r_ctl.rc_bbr_cur_del_rate = 0; bbr->r_ctl.rc_initial_hptsi_bw = bbr_initial_bw_bps; if (bbr_resends_use_tso) bbr->rc_resends_use_tso = 1; #ifdef NETFLIX_PEAKRATE tp->t_peakrate_thr = tp->t_maxpeakrate; #endif if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct bbr_sendmap *rsm; rsm = bbr_alloc(bbr); if (rsm == NULL) { uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; return (ENOMEM); } rsm->r_flags = BBR_OVERMAX; rsm->r_tim_lastsent[0] = cts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = tp->snd_una; rsm->r_end = tp->snd_max; rsm->r_dupack = 0; rsm->r_delivered = bbr->r_ctl.rc_delivered; rsm->r_ts_valid = 0; rsm->r_del_ack_ts = tp->ts_recent; rsm->r_del_time = cts; if (bbr->r_ctl.r_app_limited_until) rsm->r_app_limited = 1; else rsm->r_app_limited = 0; TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) rsm->r_bbr_state = bbr_state_val(bbr); else rsm->r_bbr_state = 8; } if (bbr_use_rack_resend_cheat && (bbr->rc_use_google == 0)) bbr->bbr_use_rack_cheat = 1; if (bbr_incr_timers && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_incr_tmrs = 1; if (bbr_include_tcp_oh && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_inc_tcp_oh = 1; if (bbr_include_ip_oh && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_inc_ip_oh = 1; if (bbr_include_enet_oh && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_inc_enet_oh = 1; bbr_log_type_statechange(bbr, cts, __LINE__); if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_srtt)) { uint32_t rtt; rtt = (TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); } /* announce the settings and state */ bbr_log_settings_change(bbr, BBR_RECOVERY_LOWRTT); tcp_bbr_tso_size_check(bbr, cts); /* * Now call the generic function to start a timer. This will place * the TCB on the hptsi wheel if a timer is needed with appropriate * flags. */ bbr_stop_all_timers(tp); bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0); return (0); } /* * Return 0 if we can accept the connection. Return * non-zero if we can't handle the connection. A EAGAIN * means you need to wait until the connection is up. * a EADDRNOTAVAIL means we can never handle the connection * (no SACK). */ static int bbr_handoff_ok(struct tcpcb *tp) { if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { /* Sure no problem though it may not stick */ return (0); } if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) { /* * We really don't know you have to get to ESTAB or beyond * to tell. */ return (EAGAIN); } if (tp->t_flags & TF_SENTFIN) return (EINVAL); if ((tp->t_flags & TF_SACK_PERMIT) || bbr_sack_not_required) { return (0); } /* * If we reach here we don't do SACK on this connection so we can * never do rack. 
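 * Unlike the EAGAIN case above this is not something that can change
 * later, since SACK permission is negotiated at connection setup, so
 * the caller should treat the refusal as permanent.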
*/ return (EINVAL); } static void bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged) { if (tp->t_fb_ptr) { uint32_t calc; struct tcp_bbr *bbr; struct bbr_sendmap *rsm; bbr = (struct tcp_bbr *)tp->t_fb_ptr; if (bbr->r_ctl.crte) tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); bbr_log_flowend(bbr); bbr->rc_tp = NULL; if (tp->t_inpcb) { /* Backout any flags2 we applied */ tp->t_inpcb->inp_flags2 &= ~INP_CANNOT_DO_ECN; tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; } if (bbr->bbr_hdrw_pacing) counter_u64_add(bbr_flows_whdwr_pacing, -1); else counter_u64_add(bbr_flows_nohdwr_pacing, -1); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); while (rsm) { TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); uma_zfree(bbr_zone, rsm); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); } rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); while (rsm) { TAILQ_REMOVE(&bbr->r_ctl.rc_free, rsm, r_next); uma_zfree(bbr_zone, rsm); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_free); } calc = bbr->r_ctl.rc_high_rwnd - bbr->r_ctl.rc_init_rwnd; if (calc > (bbr->r_ctl.rc_init_rwnd / 10)) BBR_STAT_INC(bbr_dynamic_rwnd); else BBR_STAT_INC(bbr_static_rwnd); bbr->r_ctl.rc_free_cnt = 0; uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } /* Make sure snd_nxt is correctly set */ tp->snd_nxt = tp->snd_max; } static void bbr_set_state(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t win) { switch (tp->t_state) { case TCPS_SYN_SENT: bbr->r_state = TCPS_SYN_SENT; bbr->r_substate = bbr_do_syn_sent; break; case TCPS_SYN_RECEIVED: bbr->r_state = TCPS_SYN_RECEIVED; bbr->r_substate = bbr_do_syn_recv; break; case TCPS_ESTABLISHED: bbr->r_ctl.rc_init_rwnd = max(win, bbr->rc_tp->snd_wnd); bbr->r_state = TCPS_ESTABLISHED; bbr->r_substate = bbr_do_established; break; case TCPS_CLOSE_WAIT: bbr->r_state = TCPS_CLOSE_WAIT; bbr->r_substate = bbr_do_close_wait; break; case TCPS_FIN_WAIT_1: bbr->r_state = TCPS_FIN_WAIT_1; bbr->r_substate = bbr_do_fin_wait_1; break; case TCPS_CLOSING: bbr->r_state = TCPS_CLOSING; bbr->r_substate = bbr_do_closing; break; case TCPS_LAST_ACK: bbr->r_state = TCPS_LAST_ACK; bbr->r_substate = bbr_do_lastack; break; case TCPS_FIN_WAIT_2: bbr->r_state = TCPS_FIN_WAIT_2; bbr->r_substate = bbr_do_fin_wait_2; break; case TCPS_LISTEN: case TCPS_CLOSED: case TCPS_TIME_WAIT: default: break; }; } static void bbr_substate_change(struct tcp_bbr *bbr, uint32_t cts, int32_t line, int dolog) { /* * Now what state are we going into now? Is there adjustments * needed? */ int32_t old_state, old_gain; old_state = bbr_state_val(bbr); old_gain = bbr->r_ctl.rc_bbr_hptsi_gain; if (bbr_state_val(bbr) == BBR_SUB_LEVEL1) { /* Save the lowest srtt we saw in our end of the sub-state */ bbr->rc_hit_state_1 = 0; if (bbr->r_ctl.bbr_smallest_srtt_this_state != 0xffffffff) bbr->r_ctl.bbr_smallest_srtt_state2 = bbr->r_ctl.bbr_smallest_srtt_this_state; } bbr->rc_bbr_substate++; if (bbr->rc_bbr_substate >= BBR_SUBSTATE_COUNT) { /* Cycle back to first state-> gain */ bbr->rc_bbr_substate = 0; } if (bbr_state_val(bbr) == BBR_SUB_GAIN) { /* * We enter the gain(5/4) cycle (possibly less if * shallow buffer detection is enabled) */ if (bbr->skip_gain) { /* * Hardware pacing has set our rate to * the max and limited our b/w just * do level i.e. no gain. 
*/ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_LEVEL1]; } else if (bbr->gain_is_limited && bbr->bbr_hdrw_pacing && bbr->r_ctl.crte) { /* * We can't gain above the hardware pacing * rate which is less than our rate + the gain * calculate the gain needed to reach the hardware * pacing rate.. */ uint64_t bw, rate, gain_calc; bw = bbr_get_bw(bbr); rate = bbr->r_ctl.crte->rate; if ((rate > bw) && (((bw * (uint64_t)bbr_hptsi_gain[BBR_SUB_GAIN]) / (uint64_t)BBR_UNIT) > rate)) { gain_calc = (rate * BBR_UNIT) / bw; if (gain_calc < BBR_UNIT) gain_calc = BBR_UNIT; bbr->r_ctl.rc_bbr_hptsi_gain = (uint16_t)gain_calc; } else { bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN]; } } else bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_GAIN]; if ((bbr->rc_use_google == 0) && (bbr_gain_to_target == 0)) { bbr->r_ctl.rc_bbr_state_atflight = cts; } else bbr->r_ctl.rc_bbr_state_atflight = 0; } else if (bbr_state_val(bbr) == BBR_SUB_DRAIN) { bbr->rc_hit_state_1 = 1; bbr->r_ctl.rc_exta_time_gd = 0; bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if (bbr_state_drain_2_tar) { bbr->r_ctl.rc_bbr_state_atflight = 0; } else bbr->r_ctl.rc_bbr_state_atflight = cts; bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[BBR_SUB_DRAIN]; } else { /* All other cycles hit here 2-7 */ if ((old_state == BBR_SUB_DRAIN) && bbr->rc_hit_state_1) { if (bbr_sub_drain_slam_cwnd && (bbr->rc_use_google == 0) && (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if ((cts - bbr->r_ctl.rc_bbr_state_time) > bbr_get_rtt(bbr, BBR_RTT_PROP)) bbr->r_ctl.rc_exta_time_gd += ((cts - bbr->r_ctl.rc_bbr_state_time) - bbr_get_rtt(bbr, BBR_RTT_PROP)); else bbr->r_ctl.rc_exta_time_gd = 0; if (bbr->r_ctl.rc_exta_time_gd) { bbr->r_ctl.rc_level_state_extra = bbr->r_ctl.rc_exta_time_gd; /* Now chop up the time for each state (div by 7) */ bbr->r_ctl.rc_level_state_extra /= 7; if (bbr_rand_ot && bbr->r_ctl.rc_level_state_extra) { /* Add a randomization */ bbr_randomize_extra_state_time(bbr); } } } bbr->r_ctl.rc_bbr_state_atflight = max(1, cts); bbr->r_ctl.rc_bbr_hptsi_gain = bbr_hptsi_gain[bbr_state_val(bbr)]; } if (bbr->rc_use_google) { bbr->r_ctl.rc_bbr_state_atflight = max(1, cts); } bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain; if (dolog) bbr_log_type_statechange(bbr, cts, line); if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { uint32_t time_in; time_in = cts - bbr->r_ctl.rc_bbr_state_time; if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { counter_u64_add(bbr_state_time[(old_state + 5)], time_in); } else { counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } } bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff; bbr_set_state_target(bbr, __LINE__); if (bbr_sub_drain_slam_cwnd && (bbr->rc_use_google == 0) && (bbr_state_val(bbr) == BBR_SUB_DRAIN)) { /* Slam down the cwnd */ bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; if (bbr_sub_drain_app_limit) { /* Go app limited if we are on a long drain */ bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes))); } bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if (bbr->rc_lt_use_bw) { /* In policed mode we clamp pacing_gain to BBR_UNIT */ bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; } /* Google changes TSO size every cycle */ 
if (bbr->rc_use_google) tcp_bbr_tso_size_check(bbr, cts); bbr->r_ctl.gain_epoch = cts; bbr->r_ctl.rc_bbr_state_time = cts; bbr->r_ctl.substate_pe = bbr->r_ctl.rc_pkt_epoch; } static void bbr_set_probebw_google_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses) { if ((bbr_state_val(bbr) == BBR_SUB_DRAIN) && (google_allow_early_out == 1) && (bbr->r_ctl.rc_flight_at_input <= bbr->r_ctl.rc_target_at_state)) { /* We have reached out target flight size possibly early */ goto change_state; } if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) { return; } if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_get_rtt(bbr, BBR_RTT_PROP)) { /* * Must be a rttProp movement forward before * we can change states. */ return; } if (bbr_state_val(bbr) == BBR_SUB_GAIN) { /* * The needed time has passed but for * the gain cycle extra rules apply: * 1) If we have seen loss, we exit * 2) If we have not reached the target * we stay in GAIN (gain-to-target). */ if (google_consider_lost && losses) goto change_state; if (bbr->r_ctl.rc_target_at_state > bbr->r_ctl.rc_flight_at_input) { return; } } change_state: /* For gain we must reach our target, all others last 1 rttProp */ bbr_substate_change(bbr, cts, __LINE__, 1); } static void bbr_set_probebw_gains(struct tcp_bbr *bbr, uint32_t cts, uint32_t losses) { uint32_t flight, bbr_cur_cycle_time; if (bbr->rc_use_google) { bbr_set_probebw_google_gains(bbr, cts, losses); return; } if (cts == 0) { /* * Never alow cts to be 0 we * do this so we can judge if * we have set a timestamp. */ cts = 1; } if (bbr_state_is_pkt_epoch) bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PKTRTT); else bbr_cur_cycle_time = bbr_get_rtt(bbr, BBR_RTT_PROP); if (bbr->r_ctl.rc_bbr_state_atflight == 0) { if (bbr_state_val(bbr) == BBR_SUB_DRAIN) { flight = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if (bbr_sub_drain_slam_cwnd && bbr->rc_hit_state_1) { /* Keep it slam down */ if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state) { bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if (bbr_sub_drain_app_limit) { /* Go app limited if we are on a long drain */ bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.rc_delivered + flight); } } if (TSTMP_GT(cts, bbr->r_ctl.gain_epoch) && (((cts - bbr->r_ctl.gain_epoch) > bbr_get_rtt(bbr, BBR_RTT_PROP)) || (flight >= bbr->r_ctl.flightsize_at_drain))) { /* * Still here after the same time as * the gain. We need to drain harder * for the next srtt. Reduce by a set amount * the gain drop is capped at DRAIN states * value (88). */ bbr->r_ctl.flightsize_at_drain = flight; if (bbr_drain_drop_mul && bbr_drain_drop_div && (bbr_drain_drop_mul < bbr_drain_drop_div)) { /* Use your specific drop value (def 4/5 = 20%) */ bbr->r_ctl.rc_bbr_hptsi_gain *= bbr_drain_drop_mul; bbr->r_ctl.rc_bbr_hptsi_gain /= bbr_drain_drop_div; } else { /* You get drop of 20% */ bbr->r_ctl.rc_bbr_hptsi_gain *= 4; bbr->r_ctl.rc_bbr_hptsi_gain /= 5; } if (bbr->r_ctl.rc_bbr_hptsi_gain <= bbr_drain_floor) { /* Reduce our gain again to the bottom */ bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1); } bbr_log_exit_gain(bbr, cts, 4); /* * Extend out so we wait another * epoch before dropping again. 
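 * Each pass through here records the current flight in
 * flightsize_at_drain and resets gain_epoch, so the gain is shaved
 * again only after a further rtt-prop has passed or the in-flight
 * data climbs back to the recorded level. The drop itself is
 * bbr_drain_drop_mul/bbr_drain_drop_div when configured (4/5, i.e.
 * 20%, by default) and is floored at bbr_drain_floor.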
*/ bbr->r_ctl.gain_epoch = cts; } if (flight <= bbr->r_ctl.rc_target_at_state) { if (bbr_sub_drain_slam_cwnd && (bbr->rc_use_google == 0) && (bbr->rc_tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); bbr_log_exit_gain(bbr, cts, 3); } } else { /* Its a gain */ if (bbr->r_ctl.rc_lost > bbr->r_ctl.bbr_lost_at_state) { bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); goto change_state; } if ((ctf_outstanding(bbr->rc_tp) >= bbr->r_ctl.rc_target_at_state) || ((ctf_outstanding(bbr->rc_tp) + bbr->rc_tp->t_maxseg - 1) >= bbr->rc_tp->snd_wnd)) { bbr->r_ctl.rc_bbr_state_atflight = max(cts, 1); bbr_log_exit_gain(bbr, cts, 2); } } /** * We fall through and return always one of two things has - * occured. + * occurred. * 1) We are still not at target * * 2) We reached the target and set rc_bbr_state_atflight * which means we no longer hit this block * next time we are called. */ return; } change_state: if (TSTMP_LT(cts, bbr->r_ctl.rc_bbr_state_time)) return; if ((cts - bbr->r_ctl.rc_bbr_state_time) < bbr_cur_cycle_time) { /* Less than a full time-period has passed */ return; } if (bbr->r_ctl.rc_level_state_extra && (bbr_state_val(bbr) > BBR_SUB_DRAIN) && ((cts - bbr->r_ctl.rc_bbr_state_time) < (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) { /* Less than a full time-period + extra has passed */ return; } if (bbr_gain_gets_extra_too && bbr->r_ctl.rc_level_state_extra && (bbr_state_val(bbr) == BBR_SUB_GAIN) && ((cts - bbr->r_ctl.rc_bbr_state_time) < (bbr_cur_cycle_time + bbr->r_ctl.rc_level_state_extra))) { /* Less than a full time-period + extra has passed */ return; } bbr_substate_change(bbr, cts, __LINE__, 1); } static uint32_t bbr_get_a_state_target(struct tcp_bbr *bbr, uint32_t gain) { uint32_t mss, tar; if (bbr->rc_use_google) { /* Google just uses the cwnd target */ tar = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), gain); } else { mss = min((bbr->rc_tp->t_maxseg - bbr->rc_last_options), bbr->r_ctl.rc_pace_max_segs); /* Get the base cwnd with gain rounded to a mss */ tar = roundup(bbr_get_raw_target_cwnd(bbr, bbr_get_bw(bbr), gain), mss); /* Make sure it is within our min */ if (tar < get_min_cwnd(bbr)) return (get_min_cwnd(bbr)); } return (tar); } static void bbr_set_state_target(struct tcp_bbr *bbr, int line) { uint32_t tar, meth; if ((bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) && ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google)) { /* Special case using old probe-rtt method */ tar = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); meth = 1; } else { /* Non-probe-rtt case and reduced probe-rtt */ if ((bbr->rc_bbr_state == BBR_STATE_PROBE_BW) && (bbr->r_ctl.rc_bbr_hptsi_gain > BBR_UNIT)) { /* For gain cycle we use the hptsi gain */ tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain); meth = 2; } else if ((bbr_target_is_bbunit) || bbr->rc_use_google) { /* * If configured, or for google all other states * get BBR_UNIT. */ tar = bbr_get_a_state_target(bbr, BBR_UNIT); meth = 3; } else { /* * Or we set a target based on the pacing gain * for non-google mode and default (non-configured). * Note we don't set a target goal below drain (192). 
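 * That is, once the pacing gain has been pulled under the drain
 * gain we still size the state target as though we were at drain,
 * rather than letting the target shrink further.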
*/ if (bbr->r_ctl.rc_bbr_hptsi_gain < bbr_hptsi_gain[BBR_SUB_DRAIN]) { tar = bbr_get_a_state_target(bbr, bbr_hptsi_gain[BBR_SUB_DRAIN]); meth = 4; } else { tar = bbr_get_a_state_target(bbr, bbr->r_ctl.rc_bbr_hptsi_gain); meth = 5; } } } bbr_log_set_of_state_target(bbr, tar, line, meth); bbr->r_ctl.rc_target_at_state = tar; } static void bbr_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts, int32_t line) { /* Change to probe_rtt */ uint32_t time_in; bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; bbr->r_ctl.flightsize_at_drain = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr->r_ctl.r_app_limited_until = (bbr->r_ctl.flightsize_at_drain + bbr->r_ctl.rc_delivered); /* Setup so we force feed the filter */ if (bbr->rc_use_google || bbr_probertt_sets_rtt) bbr->rc_prtt_set_ts = 1; if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { time_in = cts - bbr->r_ctl.rc_bbr_state_time; counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_ENTERPROBE, 0); bbr->r_ctl.rc_rtt_shrinks = cts; bbr->r_ctl.last_in_probertt = cts; bbr->r_ctl.rc_probertt_srttchktim = cts; bbr->r_ctl.rc_bbr_state_time = cts; bbr->rc_bbr_state = BBR_STATE_PROBE_RTT; /* We need to force the filter to update */ if ((bbr_sub_drain_slam_cwnd) && bbr->rc_hit_state_1 && (bbr->rc_use_google == 0) && (bbr_state_val(bbr) == BBR_SUB_DRAIN)) { if (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_saved_cwnd) bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; } else bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; /* Update the lost */ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; if ((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google){ /* Set to the non-configurable default of 4 (PROBE_RTT_MIN) */ bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; bbr_log_set_of_state_target(bbr, bbr->rc_tp->snd_cwnd, __LINE__, 6); bbr->r_ctl.rc_target_at_state = bbr->rc_tp->snd_cwnd; } else { /* * We bring it down slowly by using a hptsi gain that is * probably 75%. This will slowly float down our outstanding * without tampering with the cwnd. */ bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val; bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; bbr_set_state_target(bbr, __LINE__); if (bbr_prtt_slam_cwnd && (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } } if (ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= bbr->r_ctl.rc_target_at_state) { /* We are at target */ bbr->r_ctl.rc_bbr_enters_probertt = cts; } else { /* We need to come down to reach target before our time begins */ bbr->r_ctl.rc_bbr_enters_probertt = 0; } bbr->r_ctl.rc_pe_of_prtt = bbr->r_ctl.rc_pkt_epoch; BBR_STAT_INC(bbr_enter_probertt); bbr_log_exit_gain(bbr, cts, 0); bbr_log_type_statechange(bbr, cts, line); } static void bbr_check_probe_rtt_limits(struct tcp_bbr *bbr, uint32_t cts) { /* * Sanity check on probe-rtt intervals. * In crazy situations where we are competing * against new-reno flows with huge buffers * our rtt-prop interval could come to dominate * things if we can't get through a full set * of cycles, we need to adjust it. 
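 * The adjustment below takes rtt-prop times a full set of
 * sub-states, rounds that up to a whole second, and if it exceeds
 * the current probe interval it stretches both rc_probertt_int and
 * the rtt-prop filter window; once the measured rtt-prop calms down
 * the values are shrunk back toward the defaults.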
*/ if (bbr_can_adjust_probertt && (bbr->rc_use_google == 0)) { uint16_t val = 0; uint32_t cur_rttp, fval, newval, baseval; /* Are we to small and go into probe-rtt to often? */ baseval = (bbr_get_rtt(bbr, BBR_RTT_PROP) * (BBR_SUBSTATE_COUNT + 1)); cur_rttp = roundup(baseval, USECS_IN_SECOND); fval = bbr_filter_len_sec * USECS_IN_SECOND; if (bbr_is_ratio == 0) { if (fval > bbr_rtt_probe_limit) newval = cur_rttp + (fval - bbr_rtt_probe_limit); else newval = cur_rttp; } else { int mul; mul = fval / bbr_rtt_probe_limit; newval = cur_rttp * mul; } if (cur_rttp > bbr->r_ctl.rc_probertt_int) { bbr->r_ctl.rc_probertt_int = cur_rttp; reset_time_small(&bbr->r_ctl.rc_rttprop, newval); val = 1; } else { /* * No adjustments were made * do we need to shrink it? */ if (bbr->r_ctl.rc_probertt_int > bbr_rtt_probe_limit) { if (cur_rttp <= bbr_rtt_probe_limit) { /* * Things have calmed down lets * shrink all the way to default */ bbr->r_ctl.rc_probertt_int = bbr_rtt_probe_limit; reset_time_small(&bbr->r_ctl.rc_rttprop, (bbr_filter_len_sec * USECS_IN_SECOND)); cur_rttp = bbr_rtt_probe_limit; newval = (bbr_filter_len_sec * USECS_IN_SECOND); val = 2; } else { /* * Well does some adjustment make sense? */ if (cur_rttp < bbr->r_ctl.rc_probertt_int) { /* We can reduce interval time some */ bbr->r_ctl.rc_probertt_int = cur_rttp; reset_time_small(&bbr->r_ctl.rc_rttprop, newval); val = 3; } } } } if (val) bbr_log_rtt_shrinks(bbr, cts, cur_rttp, newval, __LINE__, BBR_RTTS_RESETS_VALUES, val); } } static void bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts) { /* Exit probe-rtt */ if (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd) { tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } bbr_log_exit_gain(bbr, cts, 1); bbr->rc_hit_state_1 = 0; bbr->r_ctl.rc_rtt_shrinks = cts; bbr->r_ctl.last_in_probertt = cts; bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_RTTPROBE, 0); bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered); if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { uint32_t time_in; time_in = cts - bbr->r_ctl.rc_bbr_state_time; counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } if (bbr->rc_filled_pipe) { /* Switch to probe_bw */ bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); bbr->r_ctl.rc_bbr_cwnd_gain = bbr_cwnd_gain; bbr_substate_change(bbr, cts, __LINE__, 0); bbr_log_type_statechange(bbr, cts, __LINE__); } else { /* Back to startup */ bbr->rc_bbr_state = BBR_STATE_STARTUP; bbr->r_ctl.rc_bbr_state_time = cts; /* * We don't want to give a complete free 3 * measurements until we exit, so we use * the number of pe's we were in probe-rtt * to add to the startup_epoch. That way * we will still retain the old state. 
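 * In other words the pkt-epochs spent inside probe-rtt are credited
 * to rc_bbr_last_startup_epoch, so startup resumes with the progress
 * it had already made toward its exit test rather than getting a
 * fresh set of measurement epochs.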
*/ bbr->r_ctl.rc_bbr_last_startup_epoch += (bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_pe_of_prtt); bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; /* Make sure to use the lower pg when shifting back in */ if (bbr->r_ctl.rc_lost && bbr_use_lower_gain_in_startup && (bbr->rc_use_google == 0)) bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower; else bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_startup_pg; bbr->r_ctl.rc_bbr_cwnd_gain = bbr->r_ctl.rc_startup_pg; /* Probably not needed but set it anyway */ bbr_set_state_target(bbr, __LINE__); bbr_log_type_statechange(bbr, cts, __LINE__); bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 0); } bbr_check_probe_rtt_limits(bbr, cts); } static int32_t inline bbr_should_enter_probe_rtt(struct tcp_bbr *bbr, uint32_t cts) { if ((bbr->rc_past_init_win == 1) && (bbr->rc_in_persist == 0) && (bbr_calc_time(cts, bbr->r_ctl.rc_rtt_shrinks) >= bbr->r_ctl.rc_probertt_int)) { return (1); } if (bbr_can_force_probertt && (bbr->rc_in_persist == 0) && (TSTMP_GT(cts, bbr->r_ctl.last_in_probertt)) && ((cts - bbr->r_ctl.last_in_probertt) > bbr->r_ctl.rc_probertt_int)) { return (1); } return (0); } static int32_t bbr_google_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t pkt_epoch) { uint64_t btlbw, gain; if (pkt_epoch == 0) { /* * Need to be on a pkt-epoch to continue. */ return (0); } btlbw = bbr_get_full_bw(bbr); gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; if (btlbw >= gain) { bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3); bbr->r_ctl.rc_bbr_lastbtlbw = btlbw; } if ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) return (1); bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8); return(0); } static int32_t inline bbr_state_startup(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch) { /* Have we gained 25% in the last 3 packet based epoch's? */ uint64_t btlbw, gain; int do_exit; int delta, rtt_gain; if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) && (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { /* * This qualifies as a RTT_PROBE session since we drop the * data outstanding to nothing and waited more than * bbr_rtt_probe_time. */ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } if (bbr_should_enter_probe_rtt(bbr, cts)) { bbr_enter_probe_rtt(bbr, cts, __LINE__); return (0); } if (bbr->rc_use_google) return (bbr_google_startup(bbr, cts, pkt_epoch)); if ((bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) && (bbr_use_lower_gain_in_startup)) { /* Drop to a lower gain 1.5 x since we saw loss */ bbr->r_ctl.rc_bbr_hptsi_gain = bbr_startup_lower; } if (pkt_epoch == 0) { /* * Need to be on a pkt-epoch to continue. */ return (0); } if (bbr_rtt_gain_thresh) { /* * Do we allow a flow to stay * in startup with no loss and no * gain in rtt over a set threshold? 
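 * rtt_gain below is the percentage growth of the per-pkt-epoch rtt
 * over the lowest srtt recorded during startup; while there is no
 * loss and that growth stays under bbr_rtt_gain_thresh we drag the
 * last-startup-epoch marker along so the flow is not pushed out of
 * startup.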
*/ if (bbr->r_ctl.rc_pkt_epoch_rtt && bbr->r_ctl.startup_last_srtt && (bbr->r_ctl.rc_pkt_epoch_rtt > bbr->r_ctl.startup_last_srtt)) { delta = bbr->r_ctl.rc_pkt_epoch_rtt - bbr->r_ctl.startup_last_srtt; rtt_gain = (delta * 100) / bbr->r_ctl.startup_last_srtt; } else rtt_gain = 0; if ((bbr->r_ctl.startup_last_srtt == 0) || (bbr->r_ctl.rc_pkt_epoch_rtt < bbr->r_ctl.startup_last_srtt)) /* First time or new lower value */ bbr->r_ctl.startup_last_srtt = bbr->r_ctl.rc_pkt_epoch_rtt; if ((bbr->r_ctl.rc_lost == 0) && (rtt_gain < bbr_rtt_gain_thresh)) { /* * No loss, and we are under * our gain threhold for * increasing RTT. */ if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch) bbr->r_ctl.rc_bbr_last_startup_epoch++; bbr_log_startup_event(bbr, cts, rtt_gain, delta, bbr->r_ctl.startup_last_srtt, 10); return (0); } } if ((bbr->r_ctl.r_measurement_count == bbr->r_ctl.last_startup_measure) && (bbr->r_ctl.rc_lost_at_startup == bbr->r_ctl.rc_lost) && (!IN_RECOVERY(bbr->rc_tp->t_flags))) { /* * We only assess if we have a new measurment when * we have no loss and are not in recovery. * Drag up by one our last_startup epoch so we will hold * the number of non-gain we have already accumulated. */ if (bbr->r_ctl.rc_bbr_last_startup_epoch < bbr->r_ctl.rc_pkt_epoch) bbr->r_ctl.rc_bbr_last_startup_epoch++; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 9); return (0); } /* Case where we reduced the lost (bad retransmit) */ if (bbr->r_ctl.rc_lost_at_startup > bbr->r_ctl.rc_lost) bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; bbr->r_ctl.last_startup_measure = bbr->r_ctl.r_measurement_count; btlbw = bbr_get_full_bw(bbr); if (bbr->r_ctl.rc_bbr_hptsi_gain == bbr_startup_lower) gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * (uint64_t)bbr_low_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; else gain = ((bbr->r_ctl.rc_bbr_lastbtlbw * (uint64_t)bbr_start_exit) / (uint64_t)100) + bbr->r_ctl.rc_bbr_lastbtlbw; do_exit = 0; if (btlbw > bbr->r_ctl.rc_bbr_lastbtlbw) bbr->r_ctl.rc_bbr_lastbtlbw = btlbw; if (btlbw >= gain) { bbr->r_ctl.rc_bbr_last_startup_epoch = bbr->r_ctl.rc_pkt_epoch; /* Update the lost so we won't exit in next set of tests */ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 3); } if ((bbr->rc_loss_exit && (bbr->r_ctl.rc_lost > bbr->r_ctl.rc_lost_at_startup) && (bbr->r_ctl.rc_pkt_epoch_loss_rate > bbr_startup_loss_thresh)) && ((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS)) { /* * If we had no gain, we had loss and that loss was above * our threshould, the rwnd is not constrained, and we have * had at least 3 packet epochs exit. Note that this is * switched off by sysctl. Google does not do this by the * way. 
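 * The "rwnd is not constrained" part is the check just below: the
 * current flight plus two maximum-sized sends must still fit inside
 * snd_wnd before we take the loss-based exit.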
*/ if ((ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + (2 * max(bbr->r_ctl.rc_pace_max_segs, bbr->rc_tp->t_maxseg))) <= bbr->rc_tp->snd_wnd) { do_exit = 1; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 4); } else { /* Just record an updated loss value */ bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 5); } } else bbr->r_ctl.rc_lost_at_startup = bbr->r_ctl.rc_lost; if (((bbr->r_ctl.rc_pkt_epoch - bbr->r_ctl.rc_bbr_last_startup_epoch) >= BBR_STARTUP_EPOCHS) || do_exit) { /* Return 1 to exit the startup state. */ return (1); } /* Stay in startup */ bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 8); return (0); } static void bbr_state_change(struct tcp_bbr *bbr, uint32_t cts, int32_t epoch, int32_t pkt_epoch, uint32_t losses) { /* - * A tick occured in the rtt epoch do we need to do anything? + * A tick occurred in the rtt epoch do we need to do anything? */ #ifdef BBR_INVARIANTS if ((bbr->rc_bbr_state != BBR_STATE_STARTUP) && (bbr->rc_bbr_state != BBR_STATE_DRAIN) && (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) && (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) && (bbr->rc_bbr_state != BBR_STATE_PROBE_BW)) { /* Debug code? */ panic("Unknown BBR state %d?\n", bbr->rc_bbr_state); } #endif if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { /* Do we exit the startup state? */ if (bbr_state_startup(bbr, cts, epoch, pkt_epoch)) { uint32_t time_in; bbr_log_startup_event(bbr, cts, bbr->r_ctl.rc_bbr_last_startup_epoch, bbr->r_ctl.rc_lost_at_startup, bbr_start_exit, 6); bbr->rc_filled_pipe = 1; bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { time_in = cts - bbr->r_ctl.rc_bbr_state_time; counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } else time_in = 0; if (bbr->rc_no_pacing) bbr->rc_no_pacing = 0; bbr->r_ctl.rc_bbr_state_time = cts; bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.rc_drain_pg; bbr->rc_bbr_state = BBR_STATE_DRAIN; bbr_set_state_target(bbr, __LINE__); if ((bbr->rc_use_google == 0) && bbr_slam_cwnd_in_main_drain) { /* Here we don't have to worry about probe-rtt */ bbr->r_ctl.rc_saved_cwnd = bbr->rc_tp->snd_cwnd; bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } bbr->r_ctl.rc_bbr_cwnd_gain = bbr_high_gain; bbr_log_type_statechange(bbr, cts, __LINE__); if (ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) <= bbr->r_ctl.rc_target_at_state) { /* * Switch to probe_bw if we are already * there */ bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); bbr_substate_change(bbr, cts, __LINE__, 0); bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr_log_type_statechange(bbr, cts, __LINE__); } } } else if (bbr->rc_bbr_state == BBR_STATE_IDLE_EXIT) { uint32_t inflight; struct tcpcb *tp; tp = bbr->rc_tp; inflight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if (inflight >= bbr->r_ctl.rc_target_at_state) { /* We have reached a flight of the cwnd target */ bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; bbr->r_ctl.rc_bbr_cwnd_gain = BBR_UNIT; bbr_set_state_target(bbr, __LINE__); /* * Rig it so we don't do anything crazy and * start fresh with a new randomization. 
*/ bbr->r_ctl.bbr_smallest_srtt_this_state = 0xffffffff; bbr->rc_bbr_substate = BBR_SUB_LEVEL6; bbr_substate_change(bbr, cts, __LINE__, 1); } } else if (bbr->rc_bbr_state == BBR_STATE_DRAIN) { /* Has in-flight reached the bdp (or less)? */ uint32_t inflight; struct tcpcb *tp; tp = bbr->rc_tp; inflight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if ((bbr->rc_use_google == 0) && bbr_slam_cwnd_in_main_drain && (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { /* * Here we don't have to worry about probe-rtt * re-slam it, but keep it slammed down. */ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if (inflight <= bbr->r_ctl.rc_target_at_state) { /* We have drained */ bbr->rc_bbr_state = BBR_STATE_PROBE_BW; bbr->r_ctl.bbr_lost_at_state = bbr->r_ctl.rc_lost; if (SEQ_GT(cts, bbr->r_ctl.rc_bbr_state_time)) { uint32_t time_in; time_in = cts - bbr->r_ctl.rc_bbr_state_time; counter_u64_add(bbr_state_time[bbr->rc_bbr_state], time_in); } if ((bbr->rc_use_google == 0) && bbr_slam_cwnd_in_main_drain && (tp->snd_cwnd < bbr->r_ctl.rc_saved_cwnd)) { /* Restore the cwnd */ tp->snd_cwnd = bbr->r_ctl.rc_saved_cwnd; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } /* Setup probe-rtt has being done now RRS-HERE */ bbr->r_ctl.rc_rtt_shrinks = cts; bbr->r_ctl.last_in_probertt = cts; bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_LEAVE_DRAIN, 0); /* Randomly pick a sub-state */ bbr->rc_bbr_substate = bbr_pick_probebw_substate(bbr, cts); bbr_substate_change(bbr, cts, __LINE__, 0); bbr_log_type_statechange(bbr, cts, __LINE__); } } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_RTT) { uint32_t flight; flight = ctf_flight_size(bbr->rc_tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr->r_ctl.r_app_limited_until = (flight + bbr->r_ctl.rc_delivered); if (((bbr->r_ctl.bbr_rttprobe_gain_val == 0) || bbr->rc_use_google) && (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { /* * We must keep cwnd at the desired MSS. */ bbr->rc_tp->snd_cwnd = bbr_rtt_probe_cwndtarg * (bbr->rc_tp->t_maxseg - bbr->rc_last_options); bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } else if ((bbr_prtt_slam_cwnd) && (bbr->rc_tp->snd_cwnd > bbr->r_ctl.rc_target_at_state)) { /* Re-slam it */ bbr->rc_tp->snd_cwnd = bbr->r_ctl.rc_target_at_state; bbr_log_type_cwndupd(bbr, 0, 0, 0, 12, 0, 0, __LINE__); } if (bbr->r_ctl.rc_bbr_enters_probertt == 0) { /* Has outstanding reached our target? */ if (flight <= bbr->r_ctl.rc_target_at_state) { bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_REACHTAR, 0); bbr->r_ctl.rc_bbr_enters_probertt = cts; /* If time is exactly 0, be 1usec off */ if (bbr->r_ctl.rc_bbr_enters_probertt == 0) bbr->r_ctl.rc_bbr_enters_probertt = 1; if (bbr->rc_use_google == 0) { /* - * Restore any lowering that as occured to + * Restore any lowering that as occurred to * reach here */ if (bbr->r_ctl.bbr_rttprobe_gain_val) bbr->r_ctl.rc_bbr_hptsi_gain = bbr->r_ctl.bbr_rttprobe_gain_val; else bbr->r_ctl.rc_bbr_hptsi_gain = BBR_UNIT; } } if ((bbr->r_ctl.rc_bbr_enters_probertt == 0) && (bbr->rc_use_google == 0) && bbr->r_ctl.bbr_rttprobe_gain_val && (((cts - bbr->r_ctl.rc_probertt_srttchktim) > bbr_get_rtt(bbr, bbr_drain_rtt)) || (flight >= bbr->r_ctl.flightsize_at_drain))) { /* * We have doddled with our current hptsi * gain an srtt and have still not made it * to target, or we have increased our flight. 
* Lets reduce the gain by xx% * flooring the reduce at DRAIN (based on * mul/div) */ int red; bbr->r_ctl.flightsize_at_drain = flight; bbr->r_ctl.rc_probertt_srttchktim = cts; red = max((bbr->r_ctl.bbr_rttprobe_gain_val / 10), 1); if ((bbr->r_ctl.rc_bbr_hptsi_gain - red) > max(bbr_drain_floor, 1)) { /* Reduce our gain again */ bbr->r_ctl.rc_bbr_hptsi_gain -= red; bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG, 0); } else if (bbr->r_ctl.rc_bbr_hptsi_gain > max(bbr_drain_floor, 1)) { /* one more chance before we give up */ bbr->r_ctl.rc_bbr_hptsi_gain = max(bbr_drain_floor, 1); bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_SHRINK_PG_FINAL, 0); } else { /* At the very bottom */ bbr->r_ctl.rc_bbr_hptsi_gain = max((bbr_drain_floor-1), 1); } } } if (bbr->r_ctl.rc_bbr_enters_probertt && (TSTMP_GT(cts, bbr->r_ctl.rc_bbr_enters_probertt)) && ((cts - bbr->r_ctl.rc_bbr_enters_probertt) >= bbr_rtt_probe_time)) { /* Time to exit probe RTT normally */ bbr_exit_probe_rtt(bbr->rc_tp, bbr, cts); } } else if (bbr->rc_bbr_state == BBR_STATE_PROBE_BW) { if ((bbr->rc_tp->snd_una == bbr->rc_tp->snd_max) && (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { /* * This qualifies as a RTT_PROBE session since we * drop the data outstanding to nothing and waited * more than bbr_rtt_probe_time. */ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } if (bbr_should_enter_probe_rtt(bbr, cts)) { bbr_enter_probe_rtt(bbr, cts, __LINE__); } else { bbr_set_probebw_gains(bbr, cts, losses); } } } static void bbr_check_bbr_for_state(struct tcp_bbr *bbr, uint32_t cts, int32_t line, uint32_t losses) { int32_t epoch = 0; if ((cts - bbr->r_ctl.rc_rcv_epoch_start) >= bbr_get_rtt(bbr, BBR_RTT_PROP)) { bbr_set_epoch(bbr, cts, line); /* At each epoch doe lt bw sampling */ epoch = 1; } bbr_state_change(bbr, cts, epoch, bbr->rc_is_pkt_epoch_now, losses); } static int bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv) { int32_t thflags, retval; uint32_t cts, lcts; uint32_t tiwin; struct tcpopt to; struct tcp_bbr *bbr; struct bbr_sendmap *rsm; struct timeval ltv; int32_t did_out = 0; int32_t in_recovery; uint16_t nsegs; int32_t prev_state; uint32_t lost; nsegs = max(1, m->m_pkthdr.lro_nsegs); bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* add in our stats */ kern_prefetch(bbr, &prev_state); prev_state = 0; thflags = th->th_flags; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either alocked or unlocked, as the * caller may have unnecessarily acquired a write lock due to a * race. */ INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); tp->t_rcvtime = ticks; /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. 
*/ tiwin = th->th_win << tp->snd_scale; #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif if (m->m_flags & M_TSTMP) { /* Prefer the hardware timestamp if present */ struct timespec ts; mbuf_tstmp2timespec(m, &ts); bbr->rc_tv.tv_sec = ts.tv_sec; bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); } else if (m->m_flags & M_TSTMP_LRO) { /* Next the arrival timestamp */ struct timespec ts; mbuf_tstmp2timespec(m, &ts); bbr->rc_tv.tv_sec = ts.tv_sec; bbr->rc_tv.tv_usec = ts.tv_nsec / 1000; bbr->r_ctl.rc_rcvtime = cts = tcp_tv_to_usectick(&bbr->rc_tv); } else { /* * Ok just get the current time. */ bbr->r_ctl.rc_rcvtime = lcts = cts = tcp_get_usecs(&bbr->rc_tv); } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop * the segment, unless it is a RST segment or missing timestamps are * tolerated. * See section 3.2 of RFC 7323. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { retval = 0; goto done_with_input; } /* * If echoed timestamp is later than the current time, fall back to * non RFC1323 RTT calculation. Normalize timestamp if syncookies * were used when this connection was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_tv_to_mssectick(&bbr->rc_tv))) to.to_tsecr = 0; } /* * If its the first time in we need to take care of options and * verify we can do SACK for rack! */ if (bbr->r_state == 0) { /* * Process options only when we get SYN/ACK back. The SYN * case for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. XXX * this is traditional behavior, may need to be cleaned up. */ if (bbr->rc_inp == NULL) { bbr->rc_inp = tp->t_inpcb; } /* * We need to init rc_inp here since its not init'd when * bbr_init is called */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } else tp->t_flags &= ~TF_REQ_SCALE; /* * Initial send window. It will be updated with the * next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if ((to.to_flags & TOF_TS) && (tp->t_flags & TF_REQ_TSTMP)) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_tv_to_mssectick(&bbr->rc_tv); } else tp->t_flags &= ~TF_REQ_TSTMP; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if (to.to_flags & TOF_FASTOPEN) { uint16_t mss; if (to.to_flags & TOF_MSS) mss = to.to_mss; else if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) mss = TCP6_MSS; else mss = TCP_MSS; tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else tcp_fastopen_disable_path(tp); } } /* * At this point we are at the initial call. Here we decide * if we are doing RACK or not. We do this by seeing if * TF_SACK_PERMIT is set, if not rack is *not* possible and * we switch to the default code. 
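 * On that switch the segment is replayed through the new stack's
 * tfb_tcp_do_segment and we return 1, so bbr_do_segment() (which
 * only unlocks on a 0 return) does not touch the inp again.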
*/ if ((tp->t_flags & TF_SACK_PERMIT) == 0) { /* Bail */ tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, tlen, iptos); return (1); } /* Set the flag */ bbr->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); sack_filter_clear(&bbr->r_ctl.bbr_sf, th->th_ack); } if (thflags & TH_ACK) { /* Track ack types */ if (to.to_flags & TOF_SACK) BBR_STAT_INC(bbr_acks_with_sacks); else BBR_STAT_INC(bbr_plain_acks); } /* * This is the one exception case where we set the rack state * always. All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK). */ if (thflags & TH_FIN) tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); if (bbr->r_state != tp->t_state) bbr_set_state(tp, bbr, tiwin); if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = bbr->r_state; bbr->rc_ack_was_delayed = 0; lost = bbr->r_ctl.rc_lost; bbr->rc_is_pkt_epoch_now = 0; if (m->m_flags & (M_TSTMP|M_TSTMP_LRO)) { /* Get the real time into lcts and figure the real delay */ lcts = tcp_get_usecs(<v); if (TSTMP_GT(lcts, cts)) { bbr->r_ctl.rc_ack_hdwr_delay = lcts - cts; bbr->rc_ack_was_delayed = 1; if (TSTMP_GT(bbr->r_ctl.rc_ack_hdwr_delay, bbr->r_ctl.highest_hdwr_delay)) bbr->r_ctl.highest_hdwr_delay = bbr->r_ctl.rc_ack_hdwr_delay; } else { bbr->r_ctl.rc_ack_hdwr_delay = 0; bbr->rc_ack_was_delayed = 0; } } else { bbr->r_ctl.rc_ack_hdwr_delay = 0; bbr->rc_ack_was_delayed = 0; } bbr_log_ack_event(bbr, th, &to, tlen, nsegs, cts, nxt_pkt, m); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { retval = 0; m_freem(m); goto done_with_input; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } in_recovery = IN_RECOVERY(tp->t_flags); if (tiwin > bbr->r_ctl.rc_high_rwnd) bbr->r_ctl.rc_high_rwnd = tiwin; #ifdef BBR_INVARIANTS if ((tp->t_inpcb->inp_flags & INP_DROPPED) || (tp->t_inpcb->inp_flags2 & INP_FREED)) { panic("tp:%p bbr:%p given a dropped inp:%p", tp, bbr, tp->t_inpcb); } #endif bbr->r_ctl.rc_flight_at_input = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); bbr->rtt_valid = 0; if (to.to_flags & TOF_TS) { bbr->rc_ts_valid = 1; bbr->r_ctl.last_inbound_ts = to.to_tsval; } else { bbr->rc_ts_valid = 0; bbr->r_ctl.last_inbound_ts = 0; } retval = (*bbr->r_substate) (m, th, so, tp, &to, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt, iptos); #ifdef BBR_INVARIANTS if ((retval == 0) && (tp->t_inpcb == NULL)) { panic("retval:%d tp:%p t_inpcb:NULL state:%d", retval, tp, prev_state); } #endif if (nxt_pkt == 0) BBR_STAT_INC(bbr_rlock_left_ret0); else BBR_STAT_INC(bbr_rlock_left_ret1); if (retval == 0) { /* * If retval is 1 the tcb is unlocked and most likely the tp * is gone. 
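 * On a 0 return we still hold the lock, so it is safe below to
 * commit rtt samples, run the pkt-epoch and state machinery, and
 * then either call the output path directly (when a sub-state asked
 * for it via r_wanted_output) or (re)arm an hpts timer.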
*/ INP_WLOCK_ASSERT(tp->t_inpcb); tcp_bbr_xmit_timer_commit(bbr, tp, cts); if (bbr->rc_is_pkt_epoch_now) bbr_set_pktepoch(bbr, cts, __LINE__); bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost)); if (nxt_pkt == 0) { if (bbr->r_wanted_output != 0) { bbr->rc_output_starts_timer = 0; did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } else bbr_start_hpts_timer(bbr, tp, cts, 6, 0, 0); } if ((nxt_pkt == 0) && ((bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && (SEQ_GT(tp->snd_max, tp->snd_una) || (tp->t_flags & TF_DELACK) || ((V_tcp_always_keepalive || bbr->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)))) { /* * We could not send (probably in the hpts but * stopped the timer)? */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && (bbr->rc_inp->inp_in_hpts) && (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* * keep alive not needed if we are hptsi * output yet */ ; } else { if (bbr->rc_inp->inp_in_hpts) { tcp_hpts_remove(bbr->rc_inp, HPTS_REMOVE_OUTPUT); if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && (TSTMP_GT(lcts, bbr->rc_pacer_started))) { uint32_t del; del = lcts - bbr->rc_pacer_started; if (bbr->r_ctl.rc_last_delay_val > del) { BBR_STAT_INC(bbr_force_timer_start); bbr->r_ctl.rc_last_delay_val -= del; bbr->rc_pacer_started = lcts; } else { /* We are late */ bbr->r_ctl.rc_last_delay_val = 0; BBR_STAT_INC(bbr_force_output); (void)tp->t_fb->tfb_tcp_output(tp); } } } bbr_start_hpts_timer(bbr, tp, cts, 8, bbr->r_ctl.rc_last_delay_val, 0); } } else if ((bbr->rc_output_starts_timer == 0) && (nxt_pkt == 0)) { /* Do we have the correct timer running? */ bbr_timer_audit(tp, bbr, lcts, &so->so_snd); } /* Do we have a new state */ if (bbr->r_state != tp->t_state) bbr_set_state(tp, bbr, tiwin); done_with_input: bbr_log_doseg_done(bbr, cts, nxt_pkt, did_out); if (did_out) bbr->r_wanted_output = 0; #ifdef BBR_INVARIANTS if (tp->t_inpcb == NULL) { panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", did_out, retval, tp, prev_state); } #endif } return (retval); } static void bbr_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { struct timeval tv; int retval; /* First lets see if we have old packets */ if (tp->t_in_pkt) { if (ctf_do_queued_segments(so, tp, 1)) { m_freem(m); return; } } if (m->m_flags & M_TSTMP_LRO) { tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; } else { /* Should not be should we kassert instead? */ tcp_get_usecs(&tv); } retval = bbr_do_segment_nounlock(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv); if (retval == 0) { tcp_handle_wakeup(tp, so); INP_WUNLOCK(tp->t_inpcb); } } /* * Return how much data can be sent without violating the * cwnd or rwnd. */ static inline uint32_t bbr_what_can_we_send(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t sendwin, uint32_t avail, int32_t sb_offset, uint32_t cts) { uint32_t len; if (ctf_outstanding(tp) >= tp->snd_wnd) { /* We never want to go over our peers rcv-window */ len = 0; } else { uint32_t flight; flight = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)); if (flight >= sendwin) { /* * We have in flight what we are allowed by cwnd (if * it was rwnd blocking it would have hit above out * >= tp->snd_wnd). 
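 * Otherwise len starts as the room left under sendwin and is then
 * clipped to the peer's receive window and to the data actually
 * sitting in the socket buffer past sb_offset.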
*/ return (0); } len = sendwin - flight; if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { /* We would send too much (beyond the rwnd) */ len = tp->snd_wnd - ctf_outstanding(tp); } if ((len + sb_offset) > avail) { /* * We don't have that much in the SB, how much is * there? */ len = avail - sb_offset; } } return (len); } static inline void bbr_do_error_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error) { #ifdef NETFLIX_STATS KMOD_TCPSTAT_INC(tcps_sndpack_error); KMOD_TCPSTAT_ADD(tcps_sndbyte_error, len); #endif } static inline void bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, int32_t len, int32_t error) { if (error) { bbr_do_error_accounting(tp, bbr, rsm, len, error); return; } if (rsm) { if (rsm->r_flags & BBR_TLP) { /* * TLP should not count in retran count, but in its * own bin */ #ifdef NETFLIX_STATS tp->t_sndtlppack++; tp->t_sndtlpbyte += len; KMOD_TCPSTAT_INC(tcps_tlpresends); KMOD_TCPSTAT_ADD(tcps_tlpresend_bytes, len); #endif } else { /* Retransmit */ tp->t_sndrexmitpack++; KMOD_TCPSTAT_INC(tcps_sndrexmitpack); KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif } /* * Logs in 0 - 8, 8 is all non probe_bw states 0-7 is * sub-state */ counter_u64_add(bbr_state_lost[rsm->r_bbr_state], len); if (bbr->rc_bbr_state != BBR_STATE_PROBE_BW) { /* Non probe_bw log in 1, 2, or 4. */ counter_u64_add(bbr_state_resend[bbr->rc_bbr_state], len); } else { /* * Log our probe state 3, and log also 5-13 to show * us the recovery sub-state for the send. This * means that 3 == (5+6+7+8+9+10+11+12+13) */ counter_u64_add(bbr_state_resend[BBR_STATE_PROBE_BW], len); counter_u64_add(bbr_state_resend[(bbr_state_val(bbr) + 5)], len); } /* Place in both 16's the totals of retransmitted */ counter_u64_add(bbr_state_lost[16], len); counter_u64_add(bbr_state_resend[16], len); /* Place in 17's the total sent */ counter_u64_add(bbr_state_resend[17], len); counter_u64_add(bbr_state_lost[17], len); } else { /* New sends */ KMOD_TCPSTAT_INC(tcps_sndpack); KMOD_TCPSTAT_ADD(tcps_sndbyte, len); /* Place in 17's the total sent */ counter_u64_add(bbr_state_resend[17], len); counter_u64_add(bbr_state_lost[17], len); #ifdef STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } } static void bbr_cwnd_limiting(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t in_level) { if (bbr->rc_filled_pipe && bbr_target_cwnd_mult_limit && (bbr->rc_use_google == 0)) { /* * Limit the cwnd to not be above N x the target plus whats * is outstanding. The target is based on the current b/w * estimate. */ uint32_t target; target = bbr_get_target_cwnd(bbr, bbr_get_bw(bbr), BBR_UNIT); target += ctf_outstanding(tp); target *= bbr_target_cwnd_mult_limit; if (tp->snd_cwnd > target) tp->snd_cwnd = target; bbr_log_type_cwndupd(bbr, 0, 0, 0, 10, 0, 0, __LINE__); } } static int bbr_window_update_needed(struct tcpcb *tp, struct socket *so, uint32_t recwin, int32_t maxseg) { /* * "adv" is the amount we could increase the window, taking into * account that we are limited by TCP_MAXWIN << tp->rcv_scale. */ int32_t adv; int32_t oldwin; adv = recwin; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); if (adv > oldwin) adv -= oldwin; else { /* We can't increase the window */ adv = 0; } } else oldwin = 0; /* * If the new window size ends up being the same as or less * than the old size when it is scaled, then don't force * a window update. 
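 * The comparison is made on the right-shifted values because that is
 * the granularity the peer actually sees in the advertised window
 * field.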
*/ if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) return (0); if (adv >= (2 * maxseg) && (adv >= (so->so_rcv.sb_hiwat / 4) || recwin <= (so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * maxseg)) { return (1); } if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) return (1); return (0); } /* * Return 0 on success and a errno on failure to send. * Note that a 0 return may not mean we sent anything * if the TCB was on the hpts. A non-zero return * does indicate the error we got from ip[6]_output. */ static int bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv) { struct socket *so; int32_t len; uint32_t cts; uint32_t recwin, sendwin; int32_t sb_offset; int32_t flags, abandon, error = 0; struct tcp_log_buffer *lgb = NULL; struct mbuf *m; struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize = 0; uint32_t if_hw_tsomax = 0; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif struct tcp_bbr *bbr; struct tcphdr *th; #ifdef NETFLIX_TCPOUDP struct udphdr *udp = NULL; #endif u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef NETFLIX_TCPOUDP unsigned ulen; #endif uint32_t bbr_seq; uint32_t delay_calc=0; uint8_t doing_tlp = 0; uint8_t local_options; #ifdef BBR_INVARIANTS uint8_t doing_retran_from = 0; uint8_t picked_up_retran = 0; #endif uint8_t wanted_cookie = 0; uint8_t more_to_rxt=0; int32_t prefetch_so_done = 0; int32_t prefetch_rsm = 0; uint32_t what_we_can = 0; uint32_t tot_len = 0; uint32_t rtr_cnt = 0; uint32_t maxseg, pace_max_segs, p_maxseg; int32_t csum_flags; int32_t hw_tls; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif volatile int32_t sack_rxmit; struct bbr_sendmap *rsm = NULL; int32_t tso, mtu; struct tcpopt to; int32_t slot = 0; struct inpcb *inp; struct sockbuf *sb; uint32_t hpts_calling; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif uint8_t app_limited = BBR_JR_SENT_DATA; uint8_t filled_all = 0; bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* We take a cache hit here */ memcpy(&bbr->rc_tv, tv, sizeof(struct timeval)); cts = tcp_tv_to_usectick(&bbr->rc_tv); inp = bbr->rc_inp; so = inp->inp_socket; sb = &so->so_snd; if (sb->sb_flags & SB_TLS_IFNET) hw_tls = 1; else hw_tls = 0; kern_prefetch(sb, &maxseg); maxseg = tp->t_maxseg - bbr->rc_last_options; if (bbr_minseg(bbr) < maxseg) { tcp_bbr_tso_size_check(bbr, cts); } /* Remove any flags that indicate we are pacing on the inp */ pace_max_segs = bbr->r_ctl.rc_pace_max_segs; p_maxseg = min(maxseg, pace_max_segs); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif #ifdef INET6 if (bbr->r_state) { /* Use the cache line loaded if possible */ isipv6 = bbr->r_is_v6; } else { isipv6 = (inp->inp_vflag & INP_IPV6) != 0; } #endif if (((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && inp->inp_in_hpts) { /* * We are on the hpts for some timer but not hptsi output. * Possibly remove from the hpts so we can send/recv etc. */ if ((tp->t_flags & TF_ACKNOW) == 0) { /* * No immediate demand right now to send an ack, but * the user may have read, making room for new data * (a window update). If so we may want to cancel * whatever timer is running (KEEP/DEL-ACK?) and * continue to send out a window update. Or we may * have gotten more data into the socket buffer to * send. 
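 * The window computed for that purpose is simply the free receive
 * buffer space, floored at zero and capped by the largest value the
 * negotiated window scale can carry, as in this standalone sketch
 * (illustrative only; the constant and name are invented):
 */
#include <stdint.h>

/* Sketch: clamp free receive-buffer space to an advertisable window. */
static inline uint32_t
sketch_recwin(long sb_space, int rcv_scale)
{
	uint32_t recwin, maxwin;

	recwin = (sb_space > 0) ? (uint32_t)sb_space : 0;
	maxwin = 65535u << rcv_scale;	/* RFC 7323 caps rcv_scale at 14 */
	return (recwin < maxwin ? recwin : maxwin);
}
/*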
*/ recwin = lmin(lmax(sbspace(&so->so_rcv), 0), (long)TCP_MAXWIN << tp->rcv_scale); if ((bbr_window_update_needed(tp, so, recwin, maxseg) == 0) && ((tcp_outflags[tp->t_state] & TH_RST) == 0) && ((sbavail(sb) + ((tcp_outflags[tp->t_state] & TH_FIN) ? 1 : 0)) <= (tp->snd_max - tp->snd_una))) { /* * Nothing new to send and no window update * is needed to send. Lets just return and * let the timer-run off. */ return (0); } } tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); bbr_timer_cancel(bbr, __LINE__, cts); } if (bbr->r_ctl.rc_last_delay_val) { /* Calculate a rough delay for early escape to sending */ if (SEQ_GT(cts, bbr->rc_pacer_started)) delay_calc = cts - bbr->rc_pacer_started; if (delay_calc >= bbr->r_ctl.rc_last_delay_val) delay_calc -= bbr->r_ctl.rc_last_delay_val; else delay_calc = 0; } /* Mark that we have called bbr_output(). */ if ((bbr->r_timer_override) || (tp->t_state < TCPS_ESTABLISHED)) { /* Timeouts or early states are exempt */ if (inp->inp_in_hpts) tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); } else if (inp->inp_in_hpts) { if ((bbr->r_ctl.rc_last_delay_val) && (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && delay_calc) { /* * We were being paced for output and the delay has * already exceeded when we were supposed to be * called, lets go ahead and pull out of the hpts * and call output. */ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_LATE], 1); bbr->r_ctl.rc_last_delay_val = 0; tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); } else if (tp->t_state == TCPS_CLOSED) { bbr->r_ctl.rc_last_delay_val = 0; tcp_hpts_remove(inp, HPTS_REMOVE_OUTPUT); } else { /* * On the hpts, you shall not pass! even if ACKNOW * is on, we will when the hpts fires, unless of * course we are overdue. */ counter_u64_add(bbr_out_size[TCP_MSS_ACCT_INPACE], 1); return (0); } } bbr->rc_cwnd_limited = 0; if (bbr->r_ctl.rc_last_delay_val) { /* recalculate the real delay and deal with over/under */ if (SEQ_GT(cts, bbr->rc_pacer_started)) delay_calc = cts - bbr->rc_pacer_started; else delay_calc = 0; if (delay_calc >= bbr->r_ctl.rc_last_delay_val) /* Setup the delay which will be added in */ delay_calc -= bbr->r_ctl.rc_last_delay_val; else { /* * We are early setup to adjust * our slot time. */ uint64_t merged_val; bbr->r_ctl.rc_agg_early += (bbr->r_ctl.rc_last_delay_val - delay_calc); bbr->r_agg_early_set = 1; if (bbr->r_ctl.rc_hptsi_agg_delay) { if (bbr->r_ctl.rc_hptsi_agg_delay >= bbr->r_ctl.rc_agg_early) { /* Nope our previous late cancels out the early */ bbr->r_ctl.rc_hptsi_agg_delay -= bbr->r_ctl.rc_agg_early; bbr->r_agg_early_set = 0; bbr->r_ctl.rc_agg_early = 0; } else { bbr->r_ctl.rc_agg_early -= bbr->r_ctl.rc_hptsi_agg_delay; bbr->r_ctl.rc_hptsi_agg_delay = 0; } } merged_val = bbr->rc_pacer_started; merged_val <<= 32; merged_val |= bbr->r_ctl.rc_last_delay_val; bbr_log_pacing_delay_calc(bbr, inp->inp_hpts_calls, bbr->r_ctl.rc_agg_early, cts, delay_calc, merged_val, bbr->r_agg_early_set, 3); bbr->r_ctl.rc_last_delay_val = 0; BBR_STAT_INC(bbr_early); delay_calc = 0; } } else { /* We were not delayed due to hptsi */ if (bbr->r_agg_early_set) bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; delay_calc = 0; } if (delay_calc) { /* * We had a hptsi delay which means we are falling behind on * sending at the expected rate. Calculate an extra amount * of data we can send, if any, to put us back on track. 
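 * That catch-up amount is folded into the aggregate delay with a
 * saturating add, so a wrapped 32-bit sum cannot masquerade as a small
 * delay.  The idiom, as a standalone sketch (illustrative only):
 */
#include <stdint.h>

/* Sketch: unsigned saturating add; on overflow pin the sum at the max. */
static inline uint32_t
sketch_sat_add32(uint32_t a, uint32_t b)
{
	uint32_t sum = a + b;

	return ((sum < a) ? UINT32_MAX : sum);
}
/*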
*/ if ((bbr->r_ctl.rc_hptsi_agg_delay + delay_calc) < bbr->r_ctl.rc_hptsi_agg_delay) bbr->r_ctl.rc_hptsi_agg_delay = 0xffffffff; else bbr->r_ctl.rc_hptsi_agg_delay += delay_calc; } sendwin = min(tp->snd_wnd, tp->snd_cwnd); if ((tp->snd_una == tp->snd_max) && (bbr->rc_bbr_state != BBR_STATE_IDLE_EXIT) && (sbavail(sb))) { /* * Ok we have been idle with nothing outstanding * we possibly need to start fresh with either a new * suite of states or a fast-ramp up. */ bbr_restart_after_idle(bbr, cts, bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time)); } /* * Now was there a hptsi delay where we are behind? We only count * being behind if: a) We are not in recovery. b) There was a delay. * c) We had room to send something. * */ hpts_calling = inp->inp_hpts_calls; inp->inp_hpts_calls = 0; if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (bbr_process_timers(tp, bbr, cts, hpts_calling)) { counter_u64_add(bbr_out_size[TCP_MSS_ACCT_ATIMER], 1); return (0); } } bbr->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; if (hpts_calling && (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { bbr->r_ctl.rc_last_delay_val = 0; } bbr->r_timer_override = 0; bbr->r_wanted_output = 0; /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && ((tp->t_state == TCPS_SYN_RECEIVED) || (tp->t_state == TCPS_SYN_SENT)) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ (tp->t_rxtshift == 0)) { /* not a retransmit */ len = 0; goto just_return_nolock; } /* * Before sending anything check for a state update. For hpts * calling without input this is important. If its input calling * then this was already done. */ if (bbr->rc_use_google == 0) bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_max. BBR in general does not pay much attention to snd_nxt * for historic reasons the persist timer still uses it. This means * we have to look at it. All retransmissions that are not persits * use the rsm that needs to be sent so snd_nxt is ignored. At the * end of this routine we pull snd_nxt always up to snd_max. */ doing_tlp = 0; #ifdef BBR_INVARIANTS doing_retran_from = picked_up_retran = 0; #endif error = 0; tso = 0; slot = 0; mtu = 0; sendwin = min(tp->snd_wnd, tp->snd_cwnd); sb_offset = tp->snd_max - tp->snd_una; flags = tcp_outflags[tp->t_state]; sack_rxmit = 0; len = 0; rsm = NULL; if (flags & TH_RST) { SOCKBUF_LOCK(sb); goto send; } recheck_resend: while (bbr->r_ctl.rc_free_cnt < bbr_min_req_free) { /* We need to always have one in reserve */ rsm = bbr_alloc(bbr); if (rsm == NULL) { error = ENOMEM; /* Lie to get on the hpts */ tot_len = tp->t_maxseg; if (hpts_calling) /* Retry in a ms */ slot = 1001; goto just_return_nolock; } TAILQ_INSERT_TAIL(&bbr->r_ctl.rc_free, rsm, r_next); bbr->r_ctl.rc_free_cnt++; rsm = NULL; } /* What do we send, a resend? 
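 * The selection below follows a fixed priority: a recovery
 * retransmission first, then a queued tail-loss probe, and only then
 * new data from the socket buffer.  Schematically (a standalone sketch
 * with invented names, not the kernel's types):
 */
enum sketch_send_kind { SKETCH_SEND_RETRAN, SKETCH_SEND_TLP, SKETCH_SEND_NEW };

/* Sketch: mirrors the resend -> TLP -> new-data ordering used below. */
static inline enum sketch_send_kind
sketch_pick_send(const void *resend, const void *tlp_send)
{
	if (resend != NULL)
		return (SKETCH_SEND_RETRAN);
	if (tlp_send != NULL)
		return (SKETCH_SEND_TLP);
	return (SKETCH_SEND_NEW);
}
/*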
*/ if (bbr->r_ctl.rc_resend == NULL) { /* Check for rack timeout */ bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts); if (bbr->r_ctl.rc_resend) { #ifdef BBR_INVARIANTS picked_up_retran = 1; #endif bbr_cong_signal(tp, NULL, CC_NDUPACK, bbr->r_ctl.rc_resend); } } if (bbr->r_ctl.rc_resend) { rsm = bbr->r_ctl.rc_resend; #ifdef BBR_INVARIANTS doing_retran_from = 1; #endif /* Remove any TLP flags its a RACK or T-O */ rsm->r_flags &= ~BBR_TLP; bbr->r_ctl.rc_resend = NULL; if (SEQ_LT(rsm->r_start, tp->snd_una)) { #ifdef BBR_INVARIANTS panic("Huh, tp:%p bbr:%p rsm:%p start:%u < snd_una:%u\n", tp, bbr, rsm, rsm->r_start, tp->snd_una); goto recheck_resend; #else /* TSNH */ rsm = NULL; goto recheck_resend; #endif } rtr_cnt++; if (rsm->r_flags & BBR_HAS_SYN) { /* Only retransmit a SYN by itself */ len = 0; if ((flags & TH_SYN) == 0) { /* Huh something is wrong */ rsm->r_start++; if (rsm->r_start == rsm->r_end) { /* Clean it up, somehow we missed the ack? */ bbr_log_syn(tp, NULL); } else { /* TFO with data? */ rsm->r_flags &= ~BBR_HAS_SYN; len = rsm->r_end - rsm->r_start; } } else { /* Retransmitting SYN */ rsm = NULL; SOCKBUF_LOCK(sb); goto send; } } else len = rsm->r_end - rsm->r_start; if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) { len = maxseg; more_to_rxt = 1; } sb_offset = rsm->r_start - tp->snd_una; if (len > 0) { sack_rxmit = 1; KMOD_TCPSTAT_INC(tcps_sack_rexmits); KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, maxseg)); } else { /* I dont think this can happen */ rsm = NULL; goto recheck_resend; } BBR_STAT_INC(bbr_resends_set); } else if (bbr->r_ctl.rc_tlp_send) { /* * Tail loss probe */ doing_tlp = 1; rsm = bbr->r_ctl.rc_tlp_send; bbr->r_ctl.rc_tlp_send = NULL; sack_rxmit = 1; len = rsm->r_end - rsm->r_start; rtr_cnt++; if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) len = maxseg; if (SEQ_GT(tp->snd_una, rsm->r_start)) { #ifdef BBR_INVARIANTS panic("tp:%p bbc:%p snd_una:%u rsm:%p r_start:%u", tp, bbr, tp->snd_una, rsm, rsm->r_start); #else /* TSNH */ rsm = NULL; goto recheck_resend; #endif } sb_offset = rsm->r_start - tp->snd_una; BBR_STAT_INC(bbr_tlp_set); } /* * Enforce a connection sendmap count limit if set * as long as we are not retransmiting. */ if ((rsm == NULL) && (V_tcp_map_entries_limit > 0) && (bbr->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { BBR_STAT_INC(bbr_alloc_limited); if (!bbr->alloc_limit_reported) { bbr->alloc_limit_reported = 1; BBR_STAT_INC(bbr_alloc_limited_conns); } goto just_return_nolock; } #ifdef BBR_INVARIANTS if (rsm && SEQ_LT(rsm->r_start, tp->snd_una)) { panic("tp:%p bbr:%p rsm:%p sb_offset:%u len:%u", tp, bbr, rsm, sb_offset, len); } #endif /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN && (rsm == NULL)) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; if (rsm && (rsm->r_flags & BBR_HAS_FIN)) { /* we are retransmitting the fin */ len--; if (len) { /* * When retransmitting data do *not* include the * FIN. This could happen from a TLP probe if we * allowed data with a FIN. */ flags &= ~TH_FIN; } } else if (rsm) { if (flags & TH_FIN) flags &= ~TH_FIN; } if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { void *end_rsm; end_rsm = TAILQ_LAST_FAST(&bbr->r_ctl.rc_tmap, bbr_sendmap, r_tnext); if (end_rsm) kern_prefetch(end_rsm, &prefetch_rsm); prefetch_rsm = 1; } SOCKBUF_LOCK(sb); /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a * negative length. 
This can also occur when TCP opens up its * congestion window while receiving additional duplicate acks after * fast-retransmit because TCP will reset snd_nxt to snd_max after * the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will be * set to snd_una, the sb_offset will be 0, and the length may wind * up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. */ if (sack_rxmit == 0) { uint32_t avail; avail = sbavail(sb); if (SEQ_GT(tp->snd_max, tp->snd_una)) sb_offset = tp->snd_max - tp->snd_una; else sb_offset = 0; if (bbr->rc_tlp_new_data) { /* TLP is forcing out new data */ uint32_t tlplen; doing_tlp = 1; tlplen = maxseg; if (tlplen > (uint32_t)(avail - sb_offset)) { tlplen = (uint32_t)(avail - sb_offset); } if (tlplen > tp->snd_wnd) { len = tp->snd_wnd; } else { len = tlplen; } bbr->rc_tlp_new_data = 0; } else { what_we_can = len = bbr_what_can_we_send(tp, bbr, sendwin, avail, sb_offset, cts); if ((len < p_maxseg) && (bbr->rc_in_persist == 0) && (ctf_outstanding(tp) >= (2 * p_maxseg)) && ((avail - sb_offset) >= p_maxseg)) { /* * We are not completing whats in the socket * buffer (i.e. there is at least a segment * waiting to send) and we have 2 or more * segments outstanding. There is no sense * of sending a little piece. Lets defer and * and wait until we can send a whole * segment. */ len = 0; } if (bbr->rc_in_persist) { /* * We are in persists, figure out if * a retransmit is available (maybe the previous * persists we sent) or if we have to send new * data. */ rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); if (rsm) { len = rsm->r_end - rsm->r_start; if (rsm->r_flags & BBR_HAS_FIN) len--; if ((bbr->rc_resends_use_tso == 0) && (len > maxseg)) len = maxseg; if (len > 1) BBR_STAT_INC(bbr_persist_reneg); /* * XXXrrs we could force the len to * 1 byte here to cause the chunk to * split apart.. but that would then * mean we always retransmit it as * one byte even after the window * opens. */ sack_rxmit = 1; sb_offset = rsm->r_start - tp->snd_una; } else { /* * First time through in persists or peer * acked our one byte. Though we do have * to have something in the sb. */ len = 1; sb_offset = 0; if (avail == 0) len = 0; } } } } if (prefetch_so_done == 0) { kern_prefetch(so, &prefetch_so_done); prefetch_so_done = 1; } /* * Lop off SYN bit if it has already been sent. However, if this is * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && (rsm == NULL) && SEQ_GT(tp->snd_max, tp->snd_una)) { if (tp->t_state != TCPS_SYN_RECEIVED) flags &= ~TH_SYN; /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; sb_offset--, len++; if (sbavail(sb) == 0) len = 0; } else if ((flags & TH_SYN) && rsm) { /* * Subtract one from the len for the SYN being * retransmitted. */ len--; } /* * Be careful not to send data and/or FIN on SYN segments. This * measure is needed to prevent interoperability problems with not * fully conformant TCP implementations. 
*/ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * - When retransmitting SYN on an actively created socket * - When sending a zero-length cookie (cookie request) on an * actively created socket * - When the socket is in the CLOSED state (RST is being sent) */ if (IS_FASTOPEN(tp->t_flags) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) { len = 0; sack_rxmit = 0; rsm = NULL; } /* Without fast-open there should never be data sent on a SYN */ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) len = 0; if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been * called to retransmit, len will be < 0. Otherwise, window * shrank after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back to (closed) * window, and set the persist timer if it isn't already * going. If the window didn't close completely, just wait * for an ACK. * * We also do a general check here to ensure that we will * set the persist timer when we have data to send, but a * 0-byte window. This makes sure the persist timer is set * even if the packet hits one of the "goto send" lines * below. */ len = 0; if ((tp->snd_wnd == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (tp->snd_una == tp->snd_max) && (sb_offset < (int)sbavail(sb))) { /* * Not enough room in the rwnd to send * a paced segment out. */ bbr_enter_persist(tp, bbr, cts, __LINE__); } } else if ((rsm == NULL) && (doing_tlp == 0) && (len < bbr->r_ctl.rc_pace_max_segs)) { /* * We are not sending a full segment for * some reason. Should we not send anything (think * sws or persists)? */ if ((tp->snd_wnd < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (len < (int)(sbavail(sb) - sb_offset))) { /* * Here the rwnd is less than * the pacing size, this is not a retransmit, * we are established and * the send is not the last in the socket buffer * lets not send, and possibly enter persists. */ len = 0; if (tp->snd_max == tp->snd_una) bbr_enter_persist(tp, bbr, cts, __LINE__); } else if ((tp->snd_cwnd >= bbr->r_ctl.rc_pace_max_segs) && (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) && (len < (int)(sbavail(sb) - sb_offset)) && (len < bbr_minseg(bbr))) { /* * Here we are not retransmitting, and * the cwnd is not so small that we could * not send at least a min size (rxt timer * not having gone off), We have 2 segments or * more already in flight, its not the tail end * of the socket buffer and the cwnd is blocking * us from sending out minimum pacing segment size. * Lets not send anything. */ bbr->rc_cwnd_limited = 1; len = 0; } else if (((tp->snd_wnd - ctf_outstanding(tp)) < min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) && (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) > (2 * maxseg)) && (len < (int)(sbavail(sb) - sb_offset)) && (TCPS_HAVEESTABLISHED(tp->t_state))) { /* * Here we have a send window but we have * filled it up and we can't send another pacing segment. * We also have in flight more than 2 segments * and we are not completing the sb i.e. we allow * the last bytes of the sb to go out even if * its not a full pacing segment. */ len = 0; } } /* len will be >= 0 after this point. 
*/ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, sendwin); /* * */ if (bbr->rc_in_persist && len && (rsm == NULL) && (len < min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs))) { /* * We are in persist, not doing a retransmit and don't have enough space * yet to send a full TSO. So is it at the end of the sb * if so we need to send else nuke to 0 and don't send. */ int sbleft; if (sbavail(sb) > sb_offset) sbleft = sbavail(sb) - sb_offset; else sbleft = 0; if (sbleft >= min((bbr->r_ctl.rc_high_rwnd/2), bbr->r_ctl.rc_pace_max_segs)) { /* not at end of sb lets not send */ len = 0; } } /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP * options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per * generated segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() * does the right thing below to provide length of just ip options * and thus checking for ipoptlen is enough to decide if ip options * are present. */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(inp); else #endif if (inp->inp_options) ipoptlen = inp->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); #endif /* INET */ #endif /* IPSEC */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && (len > maxseg) && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && ipoptlen == 0) tso = 1; recwin = lmin(lmax(sbspace(&so->so_rcv), 0), (long)TCP_MAXWIN << tp->rcv_scale); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) - This is the last * buffer in a write()/send() and we are either idle or running * NODELAY - we've timed out (e.g. persist timer) - we have more * then 1/2 the maximum send window's worth of data (receiver may be * limited the window size) - we need to retransmit */ if (rsm) goto send; if (len) { if (sack_rxmit) goto send; if (len >= p_maxseg) goto send; /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause us * to flush a buffer queued with moretocome. XXX * */ if (((tp->t_flags & TF_MORETOCOME) == 0) && /* normal case */ ((tp->t_flags & TF_NODELAY) || ((uint32_t)len + (uint32_t)sb_offset) >= sbavail(&so->so_snd)) && (tp->t_flags & TF_NOPUSH) == 0) { goto send; } if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ goto send; } if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { goto send; } } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. 
Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read from * the receive buffer, no matter how small, causes a window update * to be sent. We also should avoid sending a flurry of window * updates when the socket buffer had queued a lot of data and the * application is doing small reads. * * Prevent a flurry of pointless window updates by only sending an * update when we can increase the advertized window by more than * 1/4th of the socket buffer capacity. When the buffer is getting * full or is very small be more aggressive and send an update * whenever we can increase by two mss sized segments. In all other * situations the ACK's to new incoming data will carry further * window increases. * * Don't send an independent window update if a delayed ACK is * pending (it will get piggy-backed on it) or the remote side * already has done a half-close and won't send more data. Skip * this if the connection is in T/TCP half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* Check to see if we should do a window update */ if (bbr_window_update_needed(tp, so, recwin, maxseg)) goto send; } /* * Send if we owe the peer an ACK, RST, SYN. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) { goto send; } if (flags & TH_RST) { /* Always send a RST if one is due */ goto send; } if ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0) { goto send; } /* * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0)) { goto send; } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(sb); just_return_nolock: if (tot_len) slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) slot = 0; if (tot_len == 0) { if ((ctf_outstanding(tp) + min((bbr->r_ctl.rc_high_rwnd/2), bbr_minseg(bbr))) >= tp->snd_wnd) { BBR_STAT_INC(bbr_rwnd_limited); app_limited = BBR_JR_RWND_LIMITED; bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); if ((bbr->rc_in_persist == 0) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_max == tp->snd_una) && sbavail(&tp->t_inpcb->inp_socket->so_snd)) { /* No send window.. we must enter persist */ bbr_enter_persist(tp, bbr, bbr->r_ctl.rc_rcvtime, __LINE__); } } else if (ctf_outstanding(tp) >= sbavail(sb)) { BBR_STAT_INC(bbr_app_limited); app_limited = BBR_JR_APP_LIMITED; bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); } else if ((ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + p_maxseg) >= tp->snd_cwnd) { BBR_STAT_INC(bbr_cwnd_limited); app_limited = BBR_JR_CWND_LIMITED; bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes))); bbr->rc_cwnd_limited = 1; } else { BBR_STAT_INC(bbr_app_limited); app_limited = BBR_JR_APP_LIMITED; bbr_cwnd_limiting(tp, bbr, ctf_outstanding(tp)); } bbr->r_ctl.rc_hptsi_agg_delay = 0; bbr->r_agg_early_set = 0; bbr->r_ctl.rc_agg_early = 0; bbr->r_ctl.rc_last_delay_val = 0; } else if (bbr->rc_use_google == 0) bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); /* Are we app limited? */ if ((app_limited == BBR_JR_APP_LIMITED) || (app_limited == BBR_JR_RWND_LIMITED)) { /** * We are application limited. 
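 * In other words the application, not the congestion window, is what
 * bounds the sending rate, so bandwidth samples remain suspect until
 * everything currently in flight has been delivered.  A standalone
 * sketch of that bookkeeping (illustrative only, invented names):
 */
#include <stdint.h>

/* Sketch: delivered-byte watermark up to which samples are app-limited. */
static inline uint64_t
sketch_app_limited_until(uint64_t delivered, uint32_t bytes_in_flight)
{
	return (delivered + bytes_in_flight);
}

/* Sketch: a rate sample taken before the watermark stays app-limited. */
static inline int
sketch_sample_is_app_limited(uint64_t sample_delivered, uint64_t limit)
{
	return (sample_delivered < limit);
}
/*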
*/ bbr->r_ctl.r_app_limited_until = (ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) + bbr->r_ctl.rc_delivered); } if (tot_len == 0) counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1); /* Dont update the time if we did not send */ bbr->r_ctl.rc_last_delay_val = 0; bbr->rc_output_starts_timer = 1; bbr_start_hpts_timer(bbr, tp, cts, 9, slot, tot_len); bbr_log_type_just_return(bbr, cts, tot_len, hpts_calling, app_limited, p_maxseg, len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ tp->snd_nxt = tp->snd_max; } return (error); send: if (doing_tlp == 0) { /* * Data not a TLP, and its not the rxt firing. If it is the * rxt firing, we want to leave the tlp_in_progress flag on * so we don't send another TLP. It has to be a rack timer * or normal send (response to acked data) to clear the tlp * in progress flag. */ bbr->rc_tlp_in_progress = 0; bbr->rc_tlp_rtx_out = 0; } else { /* * Its a TLP. */ bbr->rc_tlp_in_progress = 1; } bbr_timer_cancel(bbr, __LINE__, cts); if (rsm == NULL) { if (sbused(sb) > 0) { /* * This is sub-optimal. We only send a stand alone * FIN on its own segment. */ if (flags & TH_FIN) { flags &= ~TH_FIN; if ((len == 0) && ((tp->t_flags & TF_ACKNOW) == 0)) { /* Lets not send this */ slot = 0; goto just_return; } } } } else { /* * We do *not* send a FIN on a retransmit if it has data. * The if clause here where len > 1 should never come true. */ if ((len > 0) && (((rsm->r_flags & BBR_HAS_FIN) == 0) && (flags & TH_FIN))) { flags &= ~TH_FIN; len--; } } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { if ((tp->snd_una == tp->snd_max) && (bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time) >= bbr_rtt_probe_time)) { /* * This qualifies as a RTT_PROBE session since we * drop the data outstanding to nothing and waited * more than bbr_rtt_probe_time. */ bbr_log_rtt_shrinks(bbr, cts, 0, 0, __LINE__, BBR_RTTS_WASIDLE, 0); bbr_set_reduced_rtt(bbr, cts, __LINE__); } if (len >= maxseg) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options unless TCP * set not to do any options. NOTE: we assume that the IP/TCP header * plus TCP options always fit in a single mbuf, leaving room for a * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) * + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else #endif hdrlen = sizeof(struct tcpiphdr); /* * Compute options for segment. We only have to care about SYN and * established connection segments. Options for SYN-ACK segments * are handled in TCP syncache. */ to.to_flags = 0; local_options = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { to.to_mss = tcp_mssopt(&inp->inp_inc); #ifdef NETFLIX_TCPOUDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. 
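 * Whatever option set survives these rules, the TCP header must remain
 * a multiple of four bytes and fit the 4-bit data offset field, as in
 * this standalone sketch (illustrative only; the constants and name
 * are invented):
 */

#define	SKETCH_TCP_HDRLEN	20u	/* fixed TCP header, no options */
#define	SKETCH_TCP_MAXOLEN	40u	/* at most 60 - 20 option bytes */

/* Sketch: pad the option bytes to 32 bits and derive the data offset. */
static inline unsigned
sketch_tcp_data_offset(unsigned optlen)
{
	unsigned padded = (optlen + 3u) & ~3u;

	if (padded > SKETCH_TCP_MAXOLEN)
		padded = SKETCH_TCP_MAXOLEN;
	return ((SKETCH_TCP_HDRLEN + padded) >> 2);	/* units of 4 bytes */
}
/*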
*/ if (IS_FASTOPEN(tp->t_flags) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } } } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = tcp_tv_to_mssectick(&bbr->rc_tv) + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; local_options += TCPOLEN_TIMESTAMP + 2; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_tv_to_mssectick(&bbr->rc_tv); /* Selective ACK's. */ if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += (optlen = tcp_addoptions(&to, opt)); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } #ifdef NETFLIX_TCPOUDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ SOCKBUF_UNLOCK(&so->so_snd); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); } #endif #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if (bbr->rc_last_options != local_options) { /* * Cache the options length this generally does not change * on a connection. We use this to calculate TSO. */ bbr->rc_last_options = local_options; } maxseg = tp->t_maxseg - (ipoptlen + optlen); p_maxseg = min(maxseg, pace_max_segs); /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. Clear the FIN bit because we * cut off the tail of the segment. 
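 * A short standalone sketch of the per-segment budget involved here
 * (illustrative only, invented names): option bytes shrink what fits
 * in each segment, and a TSO burst is trimmed to whole segments unless
 * it drains the socket buffer.
 */
#include <stdint.h>

/* Sketch: payload that fits one segment once option bytes are counted. */
static inline uint32_t
sketch_effective_maxseg(uint32_t t_maxseg, uint32_t optlen, uint32_t ipoptlen)
{
	if (optlen + ipoptlen >= t_maxseg)
		return (0);	/* headers leave no room for payload */
	return (t_maxseg - (optlen + ipoptlen));
}

/* Sketch: trim a TSO send to a whole number of segments. */
static inline uint32_t
sketch_trim_tso_len(uint32_t len, uint32_t maxseg, int empties_sockbuf)
{
	if (!empties_sockbuf && (len % maxseg) != 0)
		len -= len % maxseg;
	return (len);
}
/*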
*/ if (len > maxseg) { if (len != 0 && (flags & TH_FIN)) { flags &= ~TH_FIN; } if (tso) { uint32_t moff; int32_t max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { len = max_len; } } /* * Prevent the last segment from being fractional * unless the send sockbuf can be emptied: */ if ((sb_offset + len) < sbavail(sb)) { moff = len % (uint32_t)maxseg; if (moff != 0) { len -= moff; } } /* * In case there are too many small fragments don't * use TSO: */ if (len <= maxseg) { len = maxseg; tso = 0; } } else { /* Not doing TSO */ if (optlen + ipoptlen >= tp->t_maxseg) { /* * Since we don't have enough space to put * the IP header chain and the TCP header in * one packet as required by RFC 7112, don't * send it. Also ensure that at least one * byte of the payload can be put into the * TCP segment. */ SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; sack_rxmit = 0; goto out; } len = maxseg; } } else { /* Not doing TSO */ if_hw_tsomaxsegcount = 0; tso = 0; } KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); #ifdef DIAGNOSTIC #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); #endif /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further * down. */ #ifdef BBR_INVARIANTS if (sack_rxmit) { if (SEQ_LT(rsm->r_start, tp->snd_una)) { panic("RSM:%p TP:%p bbr:%p start:%u is < snd_una:%u", rsm, tp, bbr, rsm->r_start, tp->snd_una); } } #endif KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); if ((len == 0) && (flags & TH_FIN) && (sbused(sb))) { /* * We have outstanding data, don't send a fin by itself!. */ slot = 0; goto just_return; } /* * Grab a header mbuf, attaching a copy of data to be transmitted, * and initialize the header from the template for sends on this * connection. */ if (len) { uint32_t moff; uint32_t orig_len; /* * We place a limit on sending with hptsi. */ if ((rsm == NULL) && len > pace_max_segs) len = pace_max_segs; if (len <= maxseg) tso = 0; #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { BBR_STAT_INC(bbr_failed_mbuf_aloc); bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); SOCKBUF_UNLOCK(sb); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf to the * sb_offset in the socket buffer chain. */ if ((sb_offset > sbavail(sb)) || ((len + sb_offset) > sbavail(sb))) { #ifdef BBR_INVARIANTS if ((len + sb_offset) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) panic("tp:%p bbr:%p len:%u sb_offset:%u sbavail:%u rsm:%p %u:%u:%u", tp, bbr, len, sb_offset, sbavail(sb), rsm, doing_retran_from, picked_up_retran, doing_tlp); #endif /* * In this messed up situation we have two choices, * a) pretend the send worked, and just start timers * and what not (not good since that may lead us * back here a lot). b) Send the lowest segment * in the map. c) Drop the connection. 
Lets do * which if it continues to happen will lead to * via timeouts. */ BBR_STAT_INC(bbr_offset_recovery); rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); sb_offset = 0; if (rsm == NULL) { sack_rxmit = 0; len = sbavail(sb); } else { sack_rxmit = 1; if (rsm->r_start != tp->snd_una) { /* * Things are really messed up, * is the only thing to do. */ BBR_STAT_INC(bbr_offset_drop); tcp_set_inp_to_drop(inp, EFAULT); SOCKBUF_UNLOCK(sb); (void)m_free(m); return (0); } len = rsm->r_end - rsm->r_start; } if (len > sbavail(sb)) len = sbavail(sb); if (len > maxseg) len = maxseg; } mb = sbsndptr_noadv(sb, sb_offset, &moff); if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); if (rsm == NULL) sbsndptr_adv(sb, mb, len); m->m_len += len; } else { struct sockbuf *msb; if (rsm) msb = NULL; else msb = sb; #ifdef BBR_INVARIANTS if ((len + moff) > (sbavail(sb) + ((flags & (TH_FIN | TH_SYN)) ? 1 : 0))) { if (rsm) { panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u rsm:%p snd_una:%u rsm_start:%u flg:%x %u:%u:%u sr:%d ", tp, bbr, len, moff, sbavail(sb), rsm, tp->snd_una, rsm->r_flags, rsm->r_start, doing_retran_from, picked_up_retran, doing_tlp, sack_rxmit); } else { panic("tp:%p bbr:%p len:%u moff:%u sbavail:%u sb_offset:%u snd_una:%u", tp, bbr, len, moff, sbavail(sb), sb_offset, tp->snd_una); } } #endif orig_len = len; m->m_next = tcp_m_copym( mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, ((rsm == NULL) ? hw_tls : 0) #ifdef NETFLIX_COPY_ARGS , &filled_all #endif ); if (len <= maxseg) { /* * Must have ran out of mbufs for the copy * shorten it to no longer need tso. Lets * not put on sendalot since we are low on * mbufs. */ tso = 0; } if (m->m_next == NULL) { SOCKBUF_UNLOCK(sb); (void)m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } #ifdef BBR_INVARIANTS if (tso && len < maxseg) { panic("tp:%p tso on, but len:%d < maxseg:%d", tp, len, maxseg); } if (tso && if_hw_tsomaxsegcount) { int32_t seg_cnt = 0; struct mbuf *foo; foo = m; while (foo) { seg_cnt++; foo = foo->m_next; } if (seg_cnt > if_hw_tsomaxsegcount) { panic("seg_cnt:%d > max:%d", seg_cnt, if_hw_tsomaxsegcount); } } #endif /* * If we're sending everything we've got, set PUSH. (This * will keep happy those implementations which only give * data to the user when a buffer fills or a PUSH comes in.) 
*/ if (sb_offset + len == sbused(sb) && sbused(sb) && !(flags & TH_SYN)) { flags |= TH_PUSH; } SOCKBUF_UNLOCK(sb); } else { SOCKBUF_UNLOCK(sb); if (tp->t_flags & TF_ACKNOW) KMOD_TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) KMOD_TCPSTAT_INC(tcps_sndctrl); else KMOD_TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { BBR_STAT_INC(bbr_failed_mbuf_aloc); bbr_log_enobuf_jmp(bbr, len, cts, __LINE__, len, 0, 0); error = ENOBUFS; /* Fudge the send time since we could not send */ sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(sb); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else { #endif th = (struct tcphdr *)(ip6 + 1); #ifdef NETFLIX_TCPOUDP } #endif tcpip_fillheaders(inp, #ifdef NETFLIX_TCPOUDP tp->t_port, #endif ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, #ifdef NETFLIX_TCPOUDP tp->t_port, #endif ip, th); } /* * If we are doing retransmissions, then snd_nxt will not reflect * the first unsent octet. For ACK only packets, we do not want the * sequence number of the retransmitted packet, we want the sequence * number of the next unsent octet. So, if there is no data (and no * SYN or FIN), use snd_max instead of snd_nxt when filling in * ti_seq. But if we are in persist state, snd_max might reflect * one byte beyond the right edge of the window, so use snd_nxt in * that case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len && ((flags & (TH_FIN | TH_SYN | TH_RST)) == 0)) { /* New data (including new persists) */ th->th_seq = htonl(tp->snd_max); bbr_seq = tp->snd_max; } else if (flags & TH_SYN) { /* Syn's always send from iss */ th->th_seq = htonl(tp->iss); bbr_seq = tp->iss; } else if (flags & TH_FIN) { if (flags & TH_FIN && tp->t_flags & TF_SENTFIN) { /* * If we sent the fin already its 1 minus * snd_max */ th->th_seq = (htonl(tp->snd_max - 1)); bbr_seq = (tp->snd_max - 1); } else { /* First time FIN use snd_max */ th->th_seq = htonl(tp->snd_max); bbr_seq = tp->snd_max; } } else if (flags & TH_RST) { /* * For a Reset send the last cum ack in sequence * (this like any other choice may still generate a * challenge ack, if a ack-update packet is in * flight). */ th->th_seq = htonl(tp->snd_una); bbr_seq = tp->snd_una; } else { /* * len == 0 and not persist we use snd_max, sending * an ack unless we have sent the fin then its 1 * minus. */ /* * XXXRRS Question if we are in persists and we have * nothing outstanding to send and we have not sent * a FIN, we will send an ACK. 
In such a case it * might be better to send (tp->snd_una - 1) which * would force the peer to ack. */ if (tp->t_flags & TF_SENTFIN) { th->th_seq = htonl(tp->snd_max - 1); bbr_seq = (tp->snd_max - 1); } else { th->th_seq = htonl(tp->snd_max); bbr_seq = tp->snd_max; } } } else { /* All retransmits use the rsm to guide the send */ th->th_seq = htonl(rsm->r_start); bbr_seq = rsm->r_start; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. */ if ((flags & TH_RST) || ((recwin < (so->so_rcv.sb_hiwat / 4) && recwin < maxseg))) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (tp->rcv_adv - tp->rcv_nxt)) recwin = (tp->rcv_adv - tp->rcv_nxt); if (recwin > TCP_MAXWIN << tp->rcv_scale) recwin = TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a or * ) segment itself is never scaled. The case is * handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else { /* Avoid shrinking window with window scaling. */ recwin = roundup2(recwin, 1 << tp->rcv_scale); th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); } /* * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 * window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is * attempting to read more data than can be buffered prior to * transmitting on the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; /* * We don't support urgent data, but drag along * the pointer in case of a stack switch. */ tp->snd_up = tp->snd_una; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. NOTE: since TCP options buffer doesn't * point into mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ goto out; } } #endif /* * Put TCP length in extended header, and then checksum extended * header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. 
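 * Likewise the transport checksum is only seeded with the pseudo-header
 * sum here; the NIC, or the IP layer's software fallback, folds in the
 * header and payload later.  For reference, a minimal standalone RFC
 * 1071 ones'-complement sum looks like this (illustrative only, not the
 * kernel's in_cksum):
 */
#include <stddef.h>
#include <stdint.h>

/* Sketch: 16-bit ones'-complement checksum over a byte buffer. */
static uint16_t
sketch_in_cksum(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	while (len > 1) {
		sum += ((uint32_t)buf[0] << 8) | buf[1];
		buf += 2;
		len -= 2;
	}
	if (len == 1)
		sum += (uint32_t)buf[0] << 8;	/* pad the odd final byte */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)~sum);
}
/*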
*/ #ifdef NETFLIX_TCPOUDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { #endif csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); #ifdef NETFLIX_TCPOUDP } #endif } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { #ifdef NETFLIX_TCPOUDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { #endif csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); #ifdef NETFLIX_TCPOUDP } #endif /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. The TCP pseudo * header checksum is always provided. XXX: Fixme: This is currently * not the case for IPv6. */ if (tso) { KASSERT(len > maxseg, ("%s: len:%d <= tso_segsz:%d", __func__, len, maxseg)); m->m_pkthdr.csum_flags |= CSUM_TSO; csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = maxseg; } KASSERT(len + hdrlen == m_length(m, NULL), ("%s: mbuf chain different than expected: %d + %u != %u", __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TC_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + * (th->th_off << 2) */ ); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ /* Log to the black box */ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); /* Record info on type of transmission */ log.u_bbr.flex1 = bbr->r_ctl.rc_hptsi_agg_delay; log.u_bbr.flex2 = (bbr->r_recovery_bw << 3); log.u_bbr.flex3 = maxseg; log.u_bbr.flex4 = delay_calc; /* Encode filled_all into the upper flex5 bit */ log.u_bbr.flex5 = bbr->rc_past_init_win; log.u_bbr.flex5 <<= 1; log.u_bbr.flex5 |= bbr->rc_no_pacing; log.u_bbr.flex5 <<= 29; if (filled_all) log.u_bbr.flex5 |= 0x80000000; log.u_bbr.flex5 |= tp->t_maxseg; log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs; log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr); /* lets poke in the low and the high here for debugging */ log.u_bbr.pkts_out = bbr->rc_tp->t_maxseg; if (rsm || sack_rxmit) { if (doing_tlp) log.u_bbr.flex8 = 2; else log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, &log, false, NULL, NULL, 0, tv); } else { lgb = NULL; } /* * Fill in IP length and desired time to live and send to IP level. * There should be a better way to handle ttl and tos; we could keep * them in the template, but need a way to checksum without them. 
*/ /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. Also, * desired default hop limit might be changed via Neighbor * Discovery. */ ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace * probes. ip6_output() will set it properly; it's supposed * to include the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, inp->in6p_outputopts, &inp->inp_route6, ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) mtu = inp->inp_route6.ro_nh->nh_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (isipv6) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every * packet. This might not be the best thing to do according * to RFC3390 Section 2. However the tcp hostcache migitates * the problem so it affects only the first tcp connection * with a host. * * NB: Don't set DF on small MTU/MSS to have a safe * fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; if (tp->t_port == 0 || len < V_tcp_minmss) { ip->ip_off |= htons(IP_DF); } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); error = ip_output(m, inp->inp_options, &inp->inp_route, ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) mtu = inp->inp_route.ro_nh->nh_mtu; } #endif /* INET */ out: if (lgb) { lgb->tlb_errno = error; lgb = NULL; } /* * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max. */ if (error == 0) { if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) tcp_clean_dsack_blocks(tp); /* We sent an ack clear the bbr_segs_rcvd count */ bbr->output_error_seen = 0; bbr->oerror_cnt = 0; bbr->bbr_segs_rcvd = 0; if (len == 0) counter_u64_add(bbr_out_size[TCP_MSS_ACCT_SNDACK], 1); /* Do accounting for new sends */ if ((len > 0) && (rsm == NULL)) { int idx; if (tp->snd_una == tp->snd_max) { /* * Special case to match google, when * nothing is in flight the delivered * time does get updated to the current * time (see tcp_rate_bsd.c). 
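 * That delivered-time bookkeeping is what later turns ACKs into
 * bandwidth samples.  Schematically, with bytes and microseconds as
 * plain integers (a standalone sketch, invented names):
 */
#include <stdint.h>

/* Sketch: delivery-rate sample in bytes per second from two snapshots. */
static inline uint64_t
sketch_delivery_rate(uint64_t delivered_now, uint64_t delivered_then,
    uint32_t now_usecs, uint32_t then_usecs)
{
	uint32_t interval = now_usecs - then_usecs;

	if (interval == 0)
		return (0);		/* no usable sample yet */
	return ((delivered_now - delivered_then) * 1000000 / interval);
}
/*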
*/ bbr->r_ctl.rc_del_time = cts; } if (len >= maxseg) { idx = (len / maxseg) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(bbr_out_size[(TCP_MSS_ACCT_ATIMER - 1)], 1); else counter_u64_add(bbr_out_size[idx], 1); } else { /* smaller than a MSS */ idx = len / (bbr_hptsi_bytes_min - bbr->rc_last_options); if (idx >= TCP_MSS_SMALL_MAX_SIZE_DIV) idx = (TCP_MSS_SMALL_MAX_SIZE_DIV - 1); counter_u64_add(bbr_out_size[(idx + TCP_MSS_SMALL_SIZE_OFF)], 1); } } } abandon = 0; /* * We must do the send accounting before we log the output, * otherwise the state of the rsm could change and we account to the * wrong bucket. */ if (len > 0) { bbr_do_send_accounting(tp, bbr, rsm, len, error); if (error == 0) { if (tp->snd_una == tp->snd_max) bbr->r_ctl.rc_tlp_rxt_last_time = cts; } } bbr_log_output(bbr, tp, &to, len, bbr_seq, (uint8_t) flags, error, cts, mb, &abandon, rsm, 0, sb); if (abandon) { /* * If bbr_log_output destroys the TCB or sees a TH_RST being * sent we should hit this condition. */ return (0); } if (bbr->rc_in_persist == 0) { /* * Advance snd_nxt over sequence space of this segment. */ if (error) /* We don't log or do anything with errors */ goto skip_upd; if (tp->snd_una == tp->snd_max && (len || (flags & (TH_SYN | TH_FIN)))) { /* * Update the time we just added data since none was * outstanding. */ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__); bbr->rc_tp->t_acktime = ticks; } if (flags & (TH_SYN | TH_FIN) && (rsm == NULL)) { if (flags & TH_SYN) { /* * Smack the snd_max to iss + 1 * if its a FO we will add len below. */ tp->snd_max = tp->iss + 1; } if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) { tp->snd_max++; tp->t_flags |= TF_SENTFIN; } } if (sack_rxmit == 0) tp->snd_max += len; skip_upd: if ((error == 0) && len) tot_len += len; } else { /* Persists case */ int32_t xlen = len; if (error) goto nomore; if (flags & TH_SYN) ++xlen; if ((flags & TH_FIN) && ((tp->t_flags & TF_SENTFIN) == 0)) { ++xlen; tp->t_flags |= TF_SENTFIN; } if (xlen && (tp->snd_una == tp->snd_max)) { /* * Update the time we just added data since none was * outstanding. */ bbr_log_progress_event(bbr, tp, ticks, PROGRESS_START, __LINE__); bbr->rc_tp->t_acktime = ticks; } if (sack_rxmit == 0) tp->snd_max += xlen; tot_len += (len + optlen + ipoptlen); } nomore: if (error) { /* * Failures do not advance the seq counter above. For the * case of ENOBUFS we will fall out and become ack-clocked. * capping the cwnd at the current flight. * Everything else will just have to retransmit with the timer * (no pacer). */ SOCKBUF_UNLOCK_ASSERT(sb); BBR_STAT_INC(bbr_saw_oerr); /* Clear all delay/early tracks */ bbr->r_ctl.rc_hptsi_agg_delay = 0; bbr->r_ctl.rc_agg_early = 0; bbr->r_agg_early_set = 0; bbr->output_error_seen = 1; if (bbr->oerror_cnt < 0xf) bbr->oerror_cnt++; if (bbr_max_net_error_cnt && (bbr->oerror_cnt >= bbr_max_net_error_cnt)) { /* drop the session */ tcp_set_inp_to_drop(inp, ENETDOWN); } switch (error) { case ENOBUFS: /* * Make this guy have to get ack's to send * more but lets make sure we don't * slam him below a T-O (1MSS). 
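 * On repeated output errors the pacing slot also grows exponentially
 * with the error count, so a struggling interface is retried ever more
 * slowly.  A standalone sketch of that backoff (illustrative only; the
 * base, cap and names are invented):
 */
#include <stdint.h>

/* Sketch: exponential pacer backoff, doubling per consecutive error. */
static inline uint32_t
sketch_error_backoff_usecs(uint32_t base_usecs, unsigned error_cnt,
    uint32_t cap_usecs)
{
	uint64_t slot;

	if (error_cnt > 15)
		error_cnt = 15;		/* the error counter saturates */
	slot = (uint64_t)base_usecs << error_cnt;
	return ((slot > cap_usecs) ? cap_usecs : (uint32_t)slot);
}
/*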
*/ if (bbr->rc_bbr_state != BBR_STATE_PROBE_RTT) { tp->snd_cwnd = ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes)) - maxseg; if (tp->snd_cwnd < maxseg) tp->snd_cwnd = maxseg; } slot = (bbr_error_base_paceout + 1) << bbr->oerror_cnt; BBR_STAT_INC(bbr_saw_enobuf); if (bbr->bbr_hdrw_pacing) counter_u64_add(bbr_hdwr_pacing_enobuf, 1); else counter_u64_add(bbr_nohdwr_pacing_enobuf, 1); /* * Here even in the enobuf's case we want to do our * state update. The reason being we may have been * called by the input function. If so we have had * things change. */ error = 0; goto enobufs; case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. If TSO was active we either got an * interface without TSO capabilits or TSO was * turned off. If we obtained mtu from ip_output() * then update it and try again. */ /* Turn on tracing (or try to) */ { int old_maxseg; old_maxseg = tp->t_maxseg; BBR_STAT_INC(bbr_saw_emsgsiz); bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, csum_flags, tso, cts); if (mtu != 0) tcp_mss_update(tp, -1, mtu, NULL, NULL); if (old_maxseg <= tp->t_maxseg) { /* Huh it did not shrink? */ tp->t_maxseg = old_maxseg - 40; bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts); } /* * Nuke all other things that can interfere * with slot */ if ((tot_len + len) && (len >= tp->t_maxseg)) { slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, (tot_len + len), cts, 0); if (slot < bbr_error_base_paceout) slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; } else slot = (bbr_error_base_paceout + 2) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; bbr_start_hpts_timer(bbr, tp, cts, 10, slot, tot_len); return (error); } case EPERM: tp->t_softerror = error; /* Fall through */ case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: case ENETUNREACH: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; } /* FALLTHROUGH */ default: slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt; bbr->rc_output_starts_timer = 1; bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0); return (error); } #ifdef STATS } else if (((tp->t_flags & TF_GPUTINPROG) == 0) && len && (rsm == NULL) && (bbr->rc_in_persist == 0)) { tp->gput_seq = bbr_seq; tp->gput_ack = bbr_seq + min(sbavail(&so->so_snd) - sb_offset, sendwin); tp->gput_ts = cts; tp->t_flags |= TF_GPUTINPROG; #endif } KMOD_TCPSTAT_INC(tcps_sndtotal); if ((bbr->bbr_hdw_pace_ena) && (bbr->bbr_attempt_hdwr_pace == 0) && (bbr->rc_past_init_win) && (bbr->rc_bbr_state != BBR_STATE_STARTUP) && (get_filter_value(&bbr->r_ctl.rc_delrate)) && (inp->inp_route.ro_nh && inp->inp_route.ro_nh->nh_ifp)) { /* * We are past the initial window and * have at least one measurement so we * could use hardware pacing if its available. * We have an interface and we have not attempted * to setup hardware pacing, lets try to now. */ uint64_t rate_wanted; int err = 0; rate_wanted = bbr_get_hardware_rate(bbr); bbr->bbr_attempt_hdwr_pace = 1; bbr->r_ctl.crte = tcp_set_pacing_rate(bbr->rc_tp, inp->inp_route.ro_nh->nh_ifp, rate_wanted, (RS_PACING_GEQ|RS_PACING_SUB_OK), &err, NULL); if (bbr->r_ctl.crte) { bbr_type_log_hdwr_pacing(bbr, bbr->r_ctl.crte->ptbl->rs_ifp, rate_wanted, bbr->r_ctl.crte->rate, __LINE__, cts, err); BBR_STAT_INC(bbr_hdwr_rl_add_ok); counter_u64_add(bbr_flows_nohdwr_pacing, -1); counter_u64_add(bbr_flows_whdwr_pacing, 1); bbr->bbr_hdrw_pacing = 1; /* Now what is our gain status? 
*/ if (bbr->r_ctl.crte->rate < rate_wanted) { /* We have a problem */ bbr_setup_less_of_rate(bbr, cts, bbr->r_ctl.crte->rate, rate_wanted); } else { /* We are good */ bbr->gain_is_limited = 0; bbr->skip_gain = 0; } tcp_bbr_tso_size_check(bbr, cts); } else { bbr_type_log_hdwr_pacing(bbr, inp->inp_route.ro_nh->nh_ifp, rate_wanted, 0, __LINE__, cts, err); BBR_STAT_INC(bbr_hdwr_rl_add_fail); } } if (bbr->bbr_hdrw_pacing) { /* * Worry about cases where the route * changes or something happened that we * lost our hardware pacing possibly during * the last ip_output call. */ if (inp->inp_snd_tag == NULL) { /* A change during ip output disabled hw pacing? */ bbr->bbr_hdrw_pacing = 0; } else if ((inp->inp_route.ro_nh == NULL) || (inp->inp_route.ro_nh->nh_ifp != inp->inp_snd_tag->ifp)) { /* * We had an interface or route change, * detach from the current hdwr pacing * and setup to re-attempt next go * round. */ bbr->bbr_hdrw_pacing = 0; bbr->bbr_attempt_hdwr_pace = 0; tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); tcp_bbr_tso_size_check(bbr, cts); } } /* * Data sent (as far as we can tell). If this advertises a larger * window than any other segment, then remember the size of the * advertised window. Any pending ACK has now been sent. */ if (SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; if ((error == 0) && (bbr->r_ctl.rc_pace_max_segs > tp->t_maxseg) && (doing_tlp == 0) && (tso == 0) && (len > 0) && ((flags & TH_RST) == 0) && ((flags & TH_SYN) == 0) && (IN_RECOVERY(tp->t_flags) == 0) && (bbr->rc_in_persist == 0) && (tot_len < bbr->r_ctl.rc_pace_max_segs)) { /* * For non-tso we need to goto again until we have sent out * enough data to match what we are hptsi out every hptsi * interval. */ if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ tp->snd_nxt = tp->snd_max; } if (rsm != NULL) { rsm = NULL; goto skip_again; } rsm = NULL; sack_rxmit = 0; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); goto again; } skip_again: if ((error == 0) && (flags & TH_FIN)) tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); if ((error == 0) && (flags & TH_RST)) tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) { /* * Calculate/Re-Calculate the hptsi slot in usecs based on * what we have sent so far */ slot = bbr_get_pacing_delay(bbr, bbr->r_ctl.rc_bbr_hptsi_gain, tot_len, cts, 0); if (bbr->rc_no_pacing) slot = 0; } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: if (bbr->rc_use_google == 0) bbr_check_bbr_for_state(bbr, cts, __LINE__, 0); bbr_cwnd_limiting(tp, bbr, ctf_flight_size(tp, (bbr->r_ctl.rc_sacked + bbr->r_ctl.rc_lost_bytes))); bbr->rc_output_starts_timer = 1; if (bbr->bbr_use_rack_cheat && (more_to_rxt || ((bbr->r_ctl.rc_resend = bbr_check_recovery_mode(tp, bbr, cts)) != NULL))) { /* Rack cheats and shotguns out all rxt's 1ms apart */ if (slot > 1000) slot = 1000; } if (bbr->bbr_hdrw_pacing && (bbr->hw_pacing_set == 0)) { /* * We don't change the tso size until some number of sends * to give the hardware commands time to get down * to the interface. */ bbr->r_ctl.bbr_hdwr_cnt_noset_snt++; if (bbr->r_ctl.bbr_hdwr_cnt_noset_snt >= bbr_hdwr_pacing_delay_cnt) { bbr->hw_pacing_set = 1; tcp_bbr_tso_size_check(bbr, cts); } } bbr_start_hpts_timer(bbr, tp, cts, 12, slot, tot_len); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* Make sure snd_nxt is drug up */ tp->snd_nxt = tp->snd_max; } return (error); } /* * See bbr_output_wtime() for return values. 
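 * bbr_output() is a thin wrapper: it samples the current time with
 * tcp_get_usecs() and hands the call off to bbr_output_wtime().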
*/ static int bbr_output(struct tcpcb *tp) { int32_t ret; struct timeval tv; struct tcp_bbr *bbr; NET_EPOCH_ASSERT(); bbr = (struct tcp_bbr *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); (void)tcp_get_usecs(&tv); ret = bbr_output_wtime(tp, &tv); return (ret); } static void bbr_mtu_chg(struct tcpcb *tp) { struct tcp_bbr *bbr; struct bbr_sendmap *rsm, *frsm = NULL; uint32_t maxseg; /* * The MTU has changed. a) Clear the sack filter. b) Mark everything * over the current size as SACK_PASS so a retransmit will occur. */ bbr = (struct tcp_bbr *)tp->t_fb_ptr; maxseg = tp->t_maxseg - bbr->rc_last_options; sack_filter_clear(&bbr->r_ctl.bbr_sf, tp->snd_una); TAILQ_FOREACH(rsm, &bbr->r_ctl.rc_map, r_next) { /* Don't mess with ones acked (by sack?) */ if (rsm->r_flags & BBR_ACKED) continue; if ((rsm->r_end - rsm->r_start) > maxseg) { /* * We mark sack-passed on all the previous large * sends we did. This will force them to retransmit. */ rsm->r_flags |= BBR_SACK_PASSED; if (((rsm->r_flags & BBR_MARKED_LOST) == 0) && bbr_is_lost(bbr, rsm, bbr->r_ctl.rc_rcvtime)) { bbr->r_ctl.rc_lost_bytes += rsm->r_end - rsm->r_start; bbr->r_ctl.rc_lost += rsm->r_end - rsm->r_start; rsm->r_flags |= BBR_MARKED_LOST; } if (frsm == NULL) frsm = rsm; } } if (frsm) { bbr->r_ctl.rc_resend = frsm; } } /* * bbr_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ static int bbr_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr) { struct epoch_tracker et; int32_t error = 0, optval; switch (sopt->sopt_name) { case TCP_RACK_PACE_MAX_SEG: case TCP_RACK_MIN_TO: case TCP_RACK_REORD_THRESH: case TCP_RACK_REORD_FADE: case TCP_RACK_TLP_THRESH: case TCP_RACK_PKT_DELAY: case TCP_BBR_ALGORITHM: case TCP_BBR_TSLIMITS: case TCP_BBR_IWINTSO: case TCP_BBR_RECFORCE: case TCP_BBR_STARTUP_PG: case TCP_BBR_DRAIN_PG: case TCP_BBR_RWND_IS_APP: case TCP_BBR_PROBE_RTT_INT: case TCP_BBR_PROBE_RTT_GAIN: case TCP_BBR_PROBE_RTT_LEN: case TCP_BBR_STARTUP_LOSS_EXIT: case TCP_BBR_USEDEL_RATE: case TCP_BBR_MIN_RTO: case TCP_BBR_MAX_RTO: case TCP_BBR_PACE_PER_SEC: case TCP_DELACK: case TCP_BBR_PACE_DEL_TAR: case TCP_BBR_SEND_IWND_IN_TSO: case TCP_BBR_EXTRA_STATE: case TCP_BBR_UTTER_MAX_TSO: case TCP_BBR_MIN_TOPACEOUT: case TCP_BBR_FLOOR_MIN_TSO: case TCP_BBR_TSTMP_RAISES: case TCP_BBR_POLICER_DETECT: case TCP_BBR_USE_RACK_CHEAT: case TCP_DATA_AFTER_CLOSE: case TCP_BBR_HDWR_PACE: case TCP_BBR_PACE_SEG_MAX: case TCP_BBR_PACE_SEG_MIN: case TCP_BBR_PACE_CROSS: case TCP_BBR_PACE_OH: #ifdef NETFLIX_PEAKRATE case TCP_MAXPEAKRATE: #endif case TCP_BBR_TMR_PACE_OH: case TCP_BBR_RACK_RTT_USE: case TCP_BBR_RETRAN_WTSO: break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) return (error); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); } tp = intotcpcb(inp); bbr = (struct tcp_bbr *)tp->t_fb_ptr; switch (sopt->sopt_name) { case TCP_BBR_PACE_PER_SEC: BBR_OPTS_INC(tcp_bbr_pace_per_sec); bbr->r_ctl.bbr_hptsi_per_second = optval; break; case TCP_BBR_PACE_DEL_TAR: BBR_OPTS_INC(tcp_bbr_pace_del_tar); bbr->r_ctl.bbr_hptsi_segments_delay_tar = optval; break; case TCP_BBR_PACE_SEG_MAX: BBR_OPTS_INC(tcp_bbr_pace_seg_max); bbr->r_ctl.bbr_hptsi_segments_max = optval; break; case 
TCP_BBR_PACE_SEG_MIN: BBR_OPTS_INC(tcp_bbr_pace_seg_min); bbr->r_ctl.bbr_hptsi_bytes_min = optval; break; case TCP_BBR_PACE_CROSS: BBR_OPTS_INC(tcp_bbr_pace_cross); bbr->r_ctl.bbr_cross_over = optval; break; case TCP_BBR_ALGORITHM: BBR_OPTS_INC(tcp_bbr_algorithm); if (optval && (bbr->rc_use_google == 0)) { /* Turn on the google mode */ bbr_google_mode_on(bbr); if ((optval > 3) && (optval < 500)) { /* * Must be at least greater than .3% * and must be less than 50.0%. */ bbr->r_ctl.bbr_google_discount = optval; } } else if ((optval == 0) && (bbr->rc_use_google == 1)) { /* Turn off the google mode */ bbr_google_mode_off(bbr); } break; case TCP_BBR_TSLIMITS: BBR_OPTS_INC(tcp_bbr_tslimits); if (optval == 1) bbr->rc_use_ts_limit = 1; else if (optval == 0) bbr->rc_use_ts_limit = 0; else error = EINVAL; break; case TCP_BBR_IWINTSO: BBR_OPTS_INC(tcp_bbr_iwintso); if ((optval >= 0) && (optval < 128)) { uint32_t twin; bbr->rc_init_win = optval; twin = bbr_initial_cwnd(bbr, tp); if ((bbr->rc_past_init_win == 0) && (twin > tp->snd_cwnd)) tp->snd_cwnd = twin; else error = EBUSY; } else error = EINVAL; break; case TCP_BBR_STARTUP_PG: BBR_OPTS_INC(tcp_bbr_startup_pg); if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) { bbr->r_ctl.rc_startup_pg = optval; if (bbr->rc_bbr_state == BBR_STATE_STARTUP) { bbr->r_ctl.rc_bbr_hptsi_gain = optval; } } else error = EINVAL; break; case TCP_BBR_DRAIN_PG: BBR_OPTS_INC(tcp_bbr_drain_pg); if ((optval > 0) && (optval < BBR_MAX_GAIN_VALUE)) bbr->r_ctl.rc_drain_pg = optval; else error = EINVAL; break; case TCP_BBR_PROBE_RTT_LEN: BBR_OPTS_INC(tcp_bbr_probertt_len); if (optval <= 1) reset_time_small(&bbr->r_ctl.rc_rttprop, (optval * USECS_IN_SECOND)); else error = EINVAL; break; case TCP_BBR_PROBE_RTT_GAIN: BBR_OPTS_INC(tcp_bbr_probertt_gain); if (optval <= BBR_UNIT) bbr->r_ctl.bbr_rttprobe_gain_val = optval; else error = EINVAL; break; case TCP_BBR_PROBE_RTT_INT: BBR_OPTS_INC(tcp_bbr_probe_rtt_int); if (optval > 1000) bbr->r_ctl.rc_probertt_int = optval; else error = EINVAL; break; case TCP_BBR_MIN_TOPACEOUT: BBR_OPTS_INC(tcp_bbr_topaceout); if (optval == 0) { bbr->no_pacing_until = 0; bbr->rc_no_pacing = 0; } else if (optval <= 0x00ff) { bbr->no_pacing_until = optval; if ((bbr->r_ctl.rc_pkt_epoch < bbr->no_pacing_until) && (bbr->rc_bbr_state == BBR_STATE_STARTUP)){ /* Turn on no pacing */ bbr->rc_no_pacing = 1; } } else error = EINVAL; break; case TCP_BBR_STARTUP_LOSS_EXIT: BBR_OPTS_INC(tcp_bbr_startup_loss_exit); bbr->rc_loss_exit = optval; break; case TCP_BBR_USEDEL_RATE: error = EINVAL; break; case TCP_BBR_MIN_RTO: BBR_OPTS_INC(tcp_bbr_min_rto); bbr->r_ctl.rc_min_rto_ms = optval; break; case TCP_BBR_MAX_RTO: BBR_OPTS_INC(tcp_bbr_max_rto); bbr->rc_max_rto_sec = optval; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ BBR_OPTS_INC(tcp_rack_min_to); bbr->r_ctl.rc_min_to = optval; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ BBR_OPTS_INC(tcp_rack_reord_thresh); if ((optval > 0) && (optval < 31)) bbr->r_ctl.rc_reorder_shift = optval; else error = EINVAL; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ BBR_OPTS_INC(tcp_rack_reord_fade); bbr->r_ctl.rc_reorder_fade = optval; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. 
srtt+(srtt/N) */ BBR_OPTS_INC(tcp_rack_tlp_thresh); if (optval) bbr->rc_tlp_threshold = optval; else error = EINVAL; break; case TCP_BBR_USE_RACK_CHEAT: BBR_OPTS_INC(tcp_use_rackcheat); if (bbr->rc_use_google) { error = EINVAL; break; } BBR_OPTS_INC(tcp_rack_cheat); if (optval) bbr->bbr_use_rack_cheat = 1; else bbr->bbr_use_rack_cheat = 0; break; case TCP_BBR_FLOOR_MIN_TSO: BBR_OPTS_INC(tcp_utter_max_tso); if ((optval >= 0) && (optval < 40)) bbr->r_ctl.bbr_hptsi_segments_floor = optval; else error = EINVAL; break; case TCP_BBR_UTTER_MAX_TSO: BBR_OPTS_INC(tcp_utter_max_tso); if ((optval >= 0) && (optval < 0xffff)) bbr->r_ctl.bbr_utter_max = optval; else error = EINVAL; break; case TCP_BBR_EXTRA_STATE: BBR_OPTS_INC(tcp_extra_state); if (optval) bbr->rc_use_idle_restart = 1; else bbr->rc_use_idle_restart = 0; break; case TCP_BBR_SEND_IWND_IN_TSO: BBR_OPTS_INC(tcp_iwnd_tso); if (optval) { bbr->bbr_init_win_cheat = 1; if (bbr->rc_past_init_win == 0) { uint32_t cts; cts = tcp_get_usecs(&bbr->rc_tv); tcp_bbr_tso_size_check(bbr, cts); } } else bbr->bbr_init_win_cheat = 0; break; case TCP_BBR_HDWR_PACE: BBR_OPTS_INC(tcp_hdwr_pacing); if (optval){ bbr->bbr_hdw_pace_ena = 1; bbr->bbr_attempt_hdwr_pace = 0; } else { bbr->bbr_hdw_pace_ena = 0; #ifdef RATELIMIT if (bbr->bbr_hdrw_pacing) { bbr->bbr_hdrw_pacing = 0; in_pcbdetach_txrtlmt(bbr->rc_inp); } #endif } break; case TCP_DELACK: BBR_OPTS_INC(tcp_delack); if (optval < 100) { if (optval == 0) /* off */ tp->t_delayed_ack = 0; else if (optval == 1) /* on which is 2 */ tp->t_delayed_ack = 2; else /* higher than 2 and less than 100 */ tp->t_delayed_ack = optval; if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; NET_EPOCH_ENTER(et); bbr_output(tp); NET_EPOCH_EXIT(et); } } else error = EINVAL; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
rack-rtt + reord + N */ BBR_OPTS_INC(tcp_rack_pkt_delay); bbr->r_ctl.rc_pkt_delay = optval; break; #ifdef NETFLIX_PEAKRATE case TCP_MAXPEAKRATE: BBR_OPTS_INC(tcp_maxpeak); error = tcp_set_maxpeakrate(tp, optval); if (!error) tp->t_peakrate_thr = tp->t_maxpeakrate; break; #endif case TCP_BBR_RETRAN_WTSO: BBR_OPTS_INC(tcp_retran_wtso); if (optval) bbr->rc_resends_use_tso = 1; else bbr->rc_resends_use_tso = 0; break; case TCP_DATA_AFTER_CLOSE: BBR_OPTS_INC(tcp_data_ac); if (optval) bbr->rc_allow_data_af_clo = 1; else bbr->rc_allow_data_af_clo = 0; break; case TCP_BBR_POLICER_DETECT: BBR_OPTS_INC(tcp_policer_det); if (bbr->rc_use_google == 0) error = EINVAL; else if (optval) bbr->r_use_policer = 1; else bbr->r_use_policer = 0; break; case TCP_BBR_TSTMP_RAISES: BBR_OPTS_INC(tcp_ts_raises); if (optval) bbr->ts_can_raise = 1; else bbr->ts_can_raise = 0; break; case TCP_BBR_TMR_PACE_OH: BBR_OPTS_INC(tcp_pacing_oh_tmr); if (bbr->rc_use_google) { error = EINVAL; } else { if (optval) bbr->r_ctl.rc_incr_tmrs = 1; else bbr->r_ctl.rc_incr_tmrs = 0; } break; case TCP_BBR_PACE_OH: BBR_OPTS_INC(tcp_pacing_oh); if (bbr->rc_use_google) { error = EINVAL; } else { if (optval > (BBR_INCL_TCP_OH| BBR_INCL_IP_OH| BBR_INCL_ENET_OH)) { error = EINVAL; break; } if (optval & BBR_INCL_TCP_OH) bbr->r_ctl.rc_inc_tcp_oh = 1; else bbr->r_ctl.rc_inc_tcp_oh = 0; if (optval & BBR_INCL_IP_OH) bbr->r_ctl.rc_inc_ip_oh = 1; else bbr->r_ctl.rc_inc_ip_oh = 0; if (optval & BBR_INCL_ENET_OH) bbr->r_ctl.rc_inc_enet_oh = 1; else bbr->r_ctl.rc_inc_enet_oh = 0; } break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } #ifdef NETFLIX_STATS tcp_log_socket_option(tp, sopt->sopt_name, optval, error); #endif INP_WUNLOCK(inp); return (error); } /* * return 0 on success, error-num on failure */ static int bbr_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_bbr *bbr) { int32_t error, optval; /* * Because all our options are either boolean or an int, we can just * pull everything into optval and then unlock and copy. If we ever * add a option that is not a int, then this will have quite an * impact to this routine. 
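 * The pattern below is thus: pick the requested value into optval
 * while holding the inpcb lock, drop the lock, and copy the single
 * int out with sooptcopyout().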
*/ switch (sopt->sopt_name) { case TCP_BBR_PACE_PER_SEC: optval = bbr->r_ctl.bbr_hptsi_per_second; break; case TCP_BBR_PACE_DEL_TAR: optval = bbr->r_ctl.bbr_hptsi_segments_delay_tar; break; case TCP_BBR_PACE_SEG_MAX: optval = bbr->r_ctl.bbr_hptsi_segments_max; break; case TCP_BBR_MIN_TOPACEOUT: optval = bbr->no_pacing_until; break; case TCP_BBR_PACE_SEG_MIN: optval = bbr->r_ctl.bbr_hptsi_bytes_min; break; case TCP_BBR_PACE_CROSS: optval = bbr->r_ctl.bbr_cross_over; break; case TCP_BBR_ALGORITHM: optval = bbr->rc_use_google; break; case TCP_BBR_TSLIMITS: optval = bbr->rc_use_ts_limit; break; case TCP_BBR_IWINTSO: optval = bbr->rc_init_win; break; case TCP_BBR_STARTUP_PG: optval = bbr->r_ctl.rc_startup_pg; break; case TCP_BBR_DRAIN_PG: optval = bbr->r_ctl.rc_drain_pg; break; case TCP_BBR_PROBE_RTT_INT: optval = bbr->r_ctl.rc_probertt_int; break; case TCP_BBR_PROBE_RTT_LEN: optval = (bbr->r_ctl.rc_rttprop.cur_time_limit / USECS_IN_SECOND); break; case TCP_BBR_PROBE_RTT_GAIN: optval = bbr->r_ctl.bbr_rttprobe_gain_val; break; case TCP_BBR_STARTUP_LOSS_EXIT: optval = bbr->rc_loss_exit; break; case TCP_BBR_USEDEL_RATE: error = EINVAL; break; case TCP_BBR_MIN_RTO: optval = bbr->r_ctl.rc_min_rto_ms; break; case TCP_BBR_MAX_RTO: optval = bbr->rc_max_rto_sec; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ optval = bbr->r_ctl.rc_pace_max_segs; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ optval = bbr->r_ctl.rc_min_to; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ optval = bbr->r_ctl.rc_reorder_shift; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ optval = bbr->r_ctl.rc_reorder_fade; break; case TCP_BBR_USE_RACK_CHEAT: /* Do we use the rack cheat for rxt */ optval = bbr->bbr_use_rack_cheat; break; case TCP_BBR_FLOOR_MIN_TSO: optval = bbr->r_ctl.bbr_hptsi_segments_floor; break; case TCP_BBR_UTTER_MAX_TSO: optval = bbr->r_ctl.bbr_utter_max; break; case TCP_BBR_SEND_IWND_IN_TSO: /* Do we send TSO size segments initially */ optval = bbr->bbr_init_win_cheat; break; case TCP_BBR_EXTRA_STATE: optval = bbr->rc_use_idle_restart; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ optval = bbr->rc_tlp_threshold; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. rack-rtt + reord + N */ optval = bbr->r_ctl.rc_pkt_delay; break; case TCP_BBR_RETRAN_WTSO: optval = bbr->rc_resends_use_tso; break; case TCP_DATA_AFTER_CLOSE: optval = bbr->rc_allow_data_af_clo; break; case TCP_DELACK: optval = tp->t_delayed_ack; break; case TCP_BBR_HDWR_PACE: optval = bbr->bbr_hdw_pace_ena; break; case TCP_BBR_POLICER_DETECT: optval = bbr->r_use_policer; break; case TCP_BBR_TSTMP_RAISES: optval = bbr->ts_can_raise; break; case TCP_BBR_TMR_PACE_OH: optval = bbr->r_ctl.rc_incr_tmrs; break; case TCP_BBR_PACE_OH: optval = 0; if (bbr->r_ctl.rc_inc_tcp_oh) optval |= BBR_INCL_TCP_OH; if (bbr->r_ctl.rc_inc_ip_oh) optval |= BBR_INCL_IP_OH; if (bbr->r_ctl.rc_inc_enet_oh) optval |= BBR_INCL_ENET_OH; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyout(sopt, &optval, sizeof optval); return (error); } /* * return 0 on success, error-num on failure */ static int bbr_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int32_t error = EINVAL; struct tcp_bbr *bbr; bbr = (struct tcp_bbr *)tp->t_fb_ptr; if (bbr == NULL) { /* Huh? 
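 * No BBR state is attached to this tcpcb, so we cannot service the
 * option; unlock and fail with EINVAL.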
*/ goto out; } if (sopt->sopt_dir == SOPT_SET) { return (bbr_set_sockopt(so, sopt, inp, tp, bbr)); } else if (sopt->sopt_dir == SOPT_GET) { return (bbr_get_sockopt(so, sopt, inp, tp, bbr)); } out: INP_WUNLOCK(inp); return (error); } static int bbr_pru_options(struct tcpcb *tp, int flags) { if (flags & PRUS_OOB) return (EOPNOTSUPP); return (0); } struct tcp_function_block __tcp_bbr = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = bbr_output, .tfb_do_queued_segments = ctf_do_queued_segments, .tfb_do_segment_nounlock = bbr_do_segment_nounlock, .tfb_tcp_do_segment = bbr_do_segment, .tfb_tcp_ctloutput = bbr_ctloutput, .tfb_tcp_fb_init = bbr_init, .tfb_tcp_fb_fini = bbr_fini, .tfb_tcp_timer_stop_all = bbr_stopall, .tfb_tcp_timer_activate = bbr_timer_activate, .tfb_tcp_timer_active = bbr_timer_active, .tfb_tcp_timer_stop = bbr_timer_stop, .tfb_tcp_rexmit_tmr = bbr_remxt_tmr, .tfb_tcp_handoff_ok = bbr_handoff_ok, .tfb_tcp_mtu_chg = bbr_mtu_chg, .tfb_pru_options = bbr_pru_options, }; static const char *bbr_stack_names[] = { __XSTRING(STACKNAME), #ifdef STACKALIAS __XSTRING(STACKALIAS), #endif }; static bool bbr_mod_inited = false; static int tcp_addbbr(module_t mod, int32_t type, void *data) { int32_t err = 0; int num_stacks; switch (type) { case MOD_LOAD: printf("Attempting to load " __XSTRING(MODNAME) "\n"); bbr_zone = uma_zcreate(__XSTRING(MODNAME) "_map", sizeof(struct bbr_sendmap), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); bbr_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", sizeof(struct tcp_bbr), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); sysctl_ctx_init(&bbr_sysctl_ctx); bbr_sysctl_root = SYSCTL_ADD_NODE(&bbr_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp), OID_AUTO, #ifdef STACKALIAS __XSTRING(STACKALIAS), #else __XSTRING(STACKNAME), #endif CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); if (bbr_sysctl_root == NULL) { printf("Failed to add sysctl node\n"); err = EFAULT; goto free_uma; } bbr_init_sysctls(); num_stacks = nitems(bbr_stack_names); err = register_tcp_functions_as_names(&__tcp_bbr, M_WAITOK, bbr_stack_names, &num_stacks); if (err) { printf("Failed to register %s stack name for " "%s module\n", bbr_stack_names[num_stacks], __XSTRING(MODNAME)); sysctl_ctx_free(&bbr_sysctl_ctx); free_uma: uma_zdestroy(bbr_zone); uma_zdestroy(bbr_pcb_zone); bbr_counter_destroy(); printf("Failed to register " __XSTRING(MODNAME) " module err:%d\n", err); return (err); } tcp_lro_reg_mbufq(); bbr_mod_inited = true; printf(__XSTRING(MODNAME) " is now available\n"); break; case MOD_QUIESCE: err = deregister_tcp_functions(&__tcp_bbr, true, false); break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_bbr, false, true); if (err == EBUSY) break; if (bbr_mod_inited) { uma_zdestroy(bbr_zone); uma_zdestroy(bbr_pcb_zone); sysctl_ctx_free(&bbr_sysctl_ctx); bbr_counter_destroy(); printf(__XSTRING(MODNAME) " is now no longer available\n"); bbr_mod_inited = false; } tcp_lro_dereg_mbufq(); err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t tcp_bbr = { .name = __XSTRING(MODNAME), .evhand = tcp_addbbr, .priv = 0 }; MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_bbr, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c index f5b0de7cbc59..5bea540c6ab6 100644 --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -1,14979 +1,14979 @@ /*- * Copyright (c) 2016-2020 Netflix, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_ratelimit.h" #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #ifdef STATS #include #include #include /* Must come after qmath.h and tree.h */ #else #include #endif #include #include #include #include #include #include #include #include #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #define TCPOUTFLAGS #include #include #include #include #include #include #include #include #include #include #include #ifdef NETFLIX_SHARED_CWND #include #endif #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef INET6 #include #endif #include #if defined(IPSEC) || defined(IPSEC_SUPPORT) #include #include #endif /* IPSEC */ #include #include #include #ifdef MAC #include #endif #include "sack_filter.h" #include "tcp_rack.h" #include "rack_bbr_common.h" uma_zone_t rack_zone; uma_zone_t rack_pcb_zone; #ifndef TICKS2SBT #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) #endif struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; #define CUM_ACKED 1 #define SACKED 2 /* * The RACK module incorporates a number of * TCP ideas that have been put out into the IETF * over the last few years: * - Matt Mathis's Rate Halving which slowly drops * the congestion window so that the ack clock can * be maintained during a recovery. * - Yuchung Cheng's RACK TCP (for which its named) that * will stop us using the number of dup acks and instead * use time as the gage of when we retransmit. * - Reorder Detection of RFC4737 and the Tail-Loss probe draft * of Dukkipati et.al. * RACK depends on SACK, so if an endpoint arrives that * cannot do SACK the state machine below will shuttle the * connection back to using the "default" TCP stack that is * in FreeBSD. 
* * To implement RACK the original TCP stack was first decomposed * into a functional state machine with individual states * for each of the possible TCP connection states. The do_segment * function's role in life is to mandate that the connection supports SACK * initially and then assure that the RACK state matches the connection * state before calling the state's do_segment function. Each * state is simplified due to the fact that the original do_segment * has been decomposed and we *know* what state we are in (no * switches on the state) and all tests for SACK are gone. This * greatly simplifies what each state does. * * TCP output is also overwritten with a new version since it * must maintain the new rack scoreboard. * */ static int32_t rack_tlp_thresh = 1; static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */ static int32_t rack_tlp_use_greater = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 * - 60 seconds */ /* Attack threshold detections */ static uint32_t rack_highest_sack_thresh_seen = 0; static uint32_t rack_highest_move_thresh_seen = 0; static int32_t rack_pkt_delay = 1; static int32_t rack_early_recovery = 1; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; static int32_t rack_enable_shared_cwnd = 0; static int32_t rack_limits_scwnd = 1; static int32_t rack_enable_mqueue_for_nonpaced = 0; static int32_t rack_disable_prr = 0; static int32_t use_rack_rr = 1; static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */ static int32_t rack_persist_min = 250; /* 250ms */ static int32_t rack_persist_max = 2000; /* 2 seconds */ static int32_t rack_sack_not_required = 0; /* set to one to allow non-sack to use rack */ static int32_t rack_default_init_window = 0; /* Use system default */ static int32_t rack_limit_time_with_srtt = 0; static int32_t rack_hw_pace_adjust = 0; /* * Currently regular TCP has an rto_min of 30ms; * the backoff goes 12 times, so that ends up * being a total of 122.850 seconds before a * connection is killed.
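 * (That is, with a 30ms base and the timeout doubling on each of
 * the 12 backoffs, the total time waited is roughly
 * 0.030 * (2^12 - 1) = 122.85 seconds.)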
*/ static uint32_t rack_def_data_window = 20; static uint32_t rack_goal_bdp = 2; static uint32_t rack_min_srtts = 1; static uint32_t rack_min_measure_usec = 0; static int32_t rack_tlp_min = 10; static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ static int32_t rack_rto_max = 4000; /* 4 seconds */ static const int32_t rack_free_cache = 2; static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 0; static int32_t rack_delayed_ack_time = 200; /* 200ms */ static int32_t rack_slot_reduction = 4; static int32_t rack_wma_divisor = 8; /* For WMA calculation */ static int32_t rack_cwnd_block_ends_measure = 0; static int32_t rack_rwnd_block_ends_measure = 0; static int32_t rack_lower_cwnd_at_tlp = 0; static int32_t rack_use_proportional_reduce = 0; static int32_t rack_proportional_rate = 10; static int32_t rack_tlp_max_resend = 2; static int32_t rack_limited_retran = 0; static int32_t rack_always_send_oldest = 0; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; static uint16_t rack_per_of_gp_ss = 250; /* 250 % slow-start */ static uint16_t rack_per_of_gp_ca = 200; /* 200 % congestion-avoidance */ static uint16_t rack_per_of_gp_rec = 200; /* 200 % of bw */ /* Probertt */ static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */ static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */ static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */ static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */ static uint16_t rack_atexit_prtt = 130; /* Clamp to 100% on exit prtt if non highly buffered path */ static uint32_t rack_max_drain_wait = 2; /* How man gp srtt's before we give up draining */ static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */ static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */ static uint32_t rack_probertt_use_min_rtt_exit = 0; static uint32_t rack_probe_rtt_sets_cwnd = 0; static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */ static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in us */ static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */ static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */ static uint32_t rack_min_probertt_hold = 200000; /* Equal to delayed ack time */ static uint32_t rack_probertt_filter_life = 10000000; static uint32_t rack_probertt_lower_within = 10; static uint32_t rack_min_rtt_movement = 250; /* Must move at least 250 useconds to count as a lowering */ static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */ static int32_t rack_probertt_clear_is = 1; static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */ static int32_t rack_hbp_thresh = 3; /* what is the divisor max_rtt/min_rtt to decided a hbp */ /* Part of pacing */ static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */ /* Timely information */ /* Combine these two gives the range of 'no change' to bw */ /* ie the up/down provide the upper and lower bound */ static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */ static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */ static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */ static int32_t rack_gp_rtt_minmul = 1; /* minrtt + (minrtt/mindiv) is lower 
rtt */ static int32_t rack_gp_rtt_mindiv = 4; /* minrtt + (minrtt * minmul/mindiv) is lower rtt */ static int32_t rack_gp_decrease_per = 20; /* 20% decrease in multipler */ static int32_t rack_gp_increase_per = 2; /* 2% increase in multipler */ static int32_t rack_per_lower_bound = 50; /* Don't allow to drop below this multiplier */ static int32_t rack_per_upper_bound_ss = 0; /* Don't allow SS to grow above this */ static int32_t rack_per_upper_bound_ca = 0; /* Don't allow CA to grow above this */ static int32_t rack_do_dyn_mul = 0; /* Are the rack gp multipliers dynamic */ static int32_t rack_gp_no_rec_chg = 1; /* Prohibit recovery from reducing it's multiplier */ static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a value (6)? */ static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */ static int32_t rack_timely_max_push_drop = 3; /* Three round of pushing */ static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */ static int32_t rack_use_max_for_nobackoff = 0; static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */ static int32_t rack_timely_no_stopping = 0; static int32_t rack_down_raise_thresh = 100; static int32_t rack_req_segs = 1; /* Weird delayed ack mode */ static int32_t rack_use_imac_dack = 0; /* Rack specific counters */ counter_u64_t rack_badfr; counter_u64_t rack_badfr_bytes; counter_u64_t rack_rtm_prr_retran; counter_u64_t rack_rtm_prr_newdata; counter_u64_t rack_timestamp_mismatch; counter_u64_t rack_reorder_seen; counter_u64_t rack_paced_segments; counter_u64_t rack_unpaced_segments; counter_u64_t rack_calc_zero; counter_u64_t rack_calc_nonzero; counter_u64_t rack_saw_enobuf; counter_u64_t rack_saw_enetunreach; counter_u64_t rack_per_timer_hole; /* Tail loss probe counters */ counter_u64_t rack_tlp_tot; counter_u64_t rack_tlp_newdata; counter_u64_t rack_tlp_retran; counter_u64_t rack_tlp_retran_bytes; counter_u64_t rack_tlp_retran_fail; counter_u64_t rack_to_tot; counter_u64_t rack_to_arm_rack; counter_u64_t rack_to_arm_tlp; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; counter_u64_t rack_to_alloc_limited; counter_u64_t rack_alloc_limited_conns; counter_u64_t rack_split_limited; counter_u64_t rack_sack_proc_all; counter_u64_t rack_sack_proc_short; counter_u64_t rack_sack_proc_restart; counter_u64_t rack_sack_attacks_detected; counter_u64_t rack_sack_attacks_reversed; counter_u64_t rack_sack_used_next_merge; counter_u64_t rack_sack_splits; counter_u64_t rack_sack_used_prev_merge; counter_u64_t rack_sack_skipped_acked; counter_u64_t rack_ack_total; counter_u64_t rack_express_sack; counter_u64_t rack_sack_total; counter_u64_t rack_move_none; counter_u64_t rack_move_some; counter_u64_t rack_used_tlpmethod; counter_u64_t rack_used_tlpmethod2; counter_u64_t rack_enter_tlp_calc; counter_u64_t rack_input_idle_reduces; counter_u64_t rack_collapsed_win; counter_u64_t rack_tlp_does_nada; counter_u64_t rack_try_scwnd; /* Temp CPU counters */ counter_u64_t rack_find_high; counter_u64_t rack_progress_drops; counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; static void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); static int rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); static int 
rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type); static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused); static void rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); static void rack_counter_destroy(void); static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); static void rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line); static void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); static void rack_dtor(void *mem, int32_t size, void *arg); static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts); static void rack_log_alt_to_to_cancel(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, uint32_t flex5, uint32_t flex6, uint16_t flex7, uint8_t mod); static void rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm); static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm); static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); static void rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, int line); static uint32_t rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); static int32_t rack_handoff_ok(struct tcpcb *tp); static int32_t rack_init(struct tcpcb *tp); static void rack_init_sysctls(void); static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th); static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts); static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm); static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); static int32_t rack_output(struct tcpcb *tp); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two); static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); static void rack_remxt_tmr(struct tcpcb *tp); static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); static int32_t rack_stopall(struct tcpcb *tp); static void rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t 
delta); static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); static int32_t tcp_addrack(module_t mod, int32_t type, void *data); static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); static int rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); static int rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); static int rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); static int rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); static int rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); int32_t rack_clear_counter=0; static int sysctl_rack_clear(SYSCTL_HANDLER_ARGS) { uint32_t stat; int32_t error; error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); if (error || req->newptr == NULL) return error; error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); if (error) return (error); if (stat == 1) { #ifdef INVARIANTS printf("Clearing RACK counters\n"); #endif counter_u64_zero(rack_badfr); counter_u64_zero(rack_badfr_bytes); counter_u64_zero(rack_rtm_prr_retran); counter_u64_zero(rack_rtm_prr_newdata); 
counter_u64_zero(rack_timestamp_mismatch); counter_u64_zero(rack_reorder_seen); counter_u64_zero(rack_tlp_tot); counter_u64_zero(rack_tlp_newdata); counter_u64_zero(rack_tlp_retran); counter_u64_zero(rack_tlp_retran_bytes); counter_u64_zero(rack_tlp_retran_fail); counter_u64_zero(rack_to_tot); counter_u64_zero(rack_to_arm_rack); counter_u64_zero(rack_to_arm_tlp); counter_u64_zero(rack_paced_segments); counter_u64_zero(rack_calc_zero); counter_u64_zero(rack_calc_nonzero); counter_u64_zero(rack_unpaced_segments); counter_u64_zero(rack_saw_enobuf); counter_u64_zero(rack_saw_enetunreach); counter_u64_zero(rack_per_timer_hole); counter_u64_zero(rack_to_alloc_hard); counter_u64_zero(rack_to_alloc_emerg); counter_u64_zero(rack_sack_proc_all); counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); counter_u64_zero(rack_to_alloc_limited); counter_u64_zero(rack_alloc_limited_conns); counter_u64_zero(rack_split_limited); counter_u64_zero(rack_find_high); counter_u64_zero(rack_sack_attacks_detected); counter_u64_zero(rack_sack_attacks_reversed); counter_u64_zero(rack_sack_used_next_merge); counter_u64_zero(rack_sack_used_prev_merge); counter_u64_zero(rack_sack_splits); counter_u64_zero(rack_sack_skipped_acked); counter_u64_zero(rack_ack_total); counter_u64_zero(rack_express_sack); counter_u64_zero(rack_sack_total); counter_u64_zero(rack_move_none); counter_u64_zero(rack_move_some); counter_u64_zero(rack_used_tlpmethod); counter_u64_zero(rack_used_tlpmethod2); counter_u64_zero(rack_enter_tlp_calc); counter_u64_zero(rack_progress_drops); counter_u64_zero(rack_tlp_does_nada); counter_u64_zero(rack_try_scwnd); counter_u64_zero(rack_collapsed_win); } rack_clear_counter = 0; return (0); } static void rack_init_sysctls(void) { struct sysctl_oid *rack_counters; struct sysctl_oid *rack_attack; struct sysctl_oid *rack_pacing; struct sysctl_oid *rack_timely; struct sysctl_oid *rack_timers; struct sysctl_oid *rack_tlp; struct sysctl_oid *rack_misc; struct sysctl_oid *rack_measure; struct sysctl_oid *rack_probertt; rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sack_attack", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Rack Sack Attack Counters and Controls"); rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "stats", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Rack Counters"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, &rack_rate_sample_method , USE_RTT_LOW, "What method should we use for rate sampling 0=high, 1=low "); /* Probe rtt related controls */ rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "probertt", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "ProbeRTT related Controls"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "exit_per_hpb", CTLFLAG_RW, &rack_atexit_prtt_hbp, 130, "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, &rack_atexit_prtt, 130, "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "gp_per_mul", CTLFLAG_RW, &rack_per_of_gp_probertt, 60, "What percentage of goodput do we pace at in probertt"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, 
"gp_per_reduce", CTLFLAG_RW, &rack_per_of_gp_probertt_reduce, 10, "What percentage of goodput do we reduce every gp_srtt"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "gp_per_low", CTLFLAG_RW, &rack_per_of_gp_lowthresh, 40, "What percentage of goodput do we allow the multiplier to fall to"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "time_between", CTLFLAG_RW, & rack_time_between_probertt, 96000000, "How many useconds between the lowest rtt falling must past before we enter probertt"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "safety", CTLFLAG_RW, &rack_probe_rtt_safety_val, 2000000, "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "sets_cwnd", CTLFLAG_RW, &rack_probe_rtt_sets_cwnd, 0, "Do we set the cwnd too (if always_lower is on)"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, &rack_max_drain_wait, 2, "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, &rack_must_drain, 1, "We must drain this many gp_srtt's waiting for flight to reach goal"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, &rack_probertt_use_min_rtt_entry, 1, "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, &rack_probertt_use_min_rtt_exit, 0, "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "length_div", CTLFLAG_RW, &rack_probertt_gpsrtt_cnt_div, 0, "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "length_mul", CTLFLAG_RW, &rack_probertt_gpsrtt_cnt_mul, 0, "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "holdtim_at_target", CTLFLAG_RW, &rack_min_probertt_hold, 200000, "What is the minimum time we hold probertt at target"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "filter_life", CTLFLAG_RW, &rack_probertt_filter_life, 10000000, "What is the time for the filters life in useconds"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "lower_within", CTLFLAG_RW, &rack_probertt_lower_within, 10, "If the rtt goes lower within this percentage of the time, go into probe-rtt"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "must_move", CTLFLAG_RW, &rack_min_rtt_movement, 250, "How much is the minimum movement in rtt to count as a drop for probertt purposes"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "clear_is_cnts", CTLFLAG_RW, &rack_probertt_clear_is, 1, "Do we clear I/S counts on exiting probe-rtt"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, &rack_max_drain_hbp, 1, "How many extra drain gpsrtt's do we get in highly buffered paths"); SYSCTL_ADD_S32(&rack_sysctl_ctx, 
SYSCTL_CHILDREN(rack_probertt), OID_AUTO, "hbp_threshold", CTLFLAG_RW, &rack_hbp_thresh, 3, "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); /* Pacing related sysctls */ rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "pacing", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Pacing related Controls"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "max_pace_over", CTLFLAG_RW, &rack_max_per_above, 30, "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "pace_to_one", CTLFLAG_RW, &rack_pace_one_seg, 0, "Do we allow low b/w pacing of 1MSS instead of two"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "limit_wsrtt", CTLFLAG_RW, &rack_limit_time_with_srtt, 0, "Do we limit pacing time based on srtt"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "init_win", CTLFLAG_RW, &rack_default_init_window, 0, "Do we have a rack initial window 0 = system default"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW, &rack_hw_pace_adjust, 0, "What percentage do we raise the MSS by (11 = 1.1%)"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "gp_per_ss", CTLFLAG_RW, &rack_per_of_gp_ss, 250, "If non zero, what percentage of goodput to pace at in slow start"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "gp_per_ca", CTLFLAG_RW, &rack_per_of_gp_ca, 150, "If non zero, what percentage of goodput to pace at in congestion avoidance"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "gp_per_rec", CTLFLAG_RW, &rack_per_of_gp_rec, 200, "If non zero, what percentage of goodput to pace at in recovery"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "pace_max_seg", CTLFLAG_RW, &rack_hptsi_segments, 40, "What size is the max for TSO segments in pacing and burst mitigation"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "burst_reduces", CTLFLAG_RW, &rack_slot_reduction, 4, "When doing only burst mitigation what is the reduce divisor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "use_pacing", CTLFLAG_RW, &rack_pace_every_seg, 0, "If set we use pacing, if clear we use only the original burst mitigation"); rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "timely", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Rack Timely RTT Controls"); /* Timely based GP dynmics */ SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "upper", CTLFLAG_RW, &rack_gp_per_bw_mul_up, 2, "Rack timely upper range for equal b/w (in percentage)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "lower", CTLFLAG_RW, &rack_gp_per_bw_mul_down, 4, "Rack timely lower range for equal b/w (in percentage)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "rtt_max_mul", CTLFLAG_RW, &rack_gp_rtt_maxmul, 3, "Rack timely multipler of lowest rtt for rtt_max"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "rtt_min_div", CTLFLAG_RW, &rack_gp_rtt_mindiv, 4, "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "rtt_min_mul", CTLFLAG_RW, &rack_gp_rtt_minmul, 1, "Rack 
timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "decrease", CTLFLAG_RW, &rack_gp_decrease_per, 20, "Rack timely decrease percentage of our GP multiplication factor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "increase", CTLFLAG_RW, &rack_gp_increase_per, 2, "Rack timely increase perentage of our GP multiplication factor"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "lowerbound", CTLFLAG_RW, &rack_per_lower_bound, 50, "Rack timely lowest percentage we allow GP multiplier to fall to"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "upperboundss", CTLFLAG_RW, &rack_per_upper_bound_ss, 0, "Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "upperboundca", CTLFLAG_RW, &rack_per_upper_bound_ca, 0, "Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "dynamicgp", CTLFLAG_RW, &rack_do_dyn_mul, 0, "Rack timely do we enable dynmaic timely goodput by default"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "no_rec_red", CTLFLAG_RW, &rack_gp_no_rec_chg, 1, "Rack timely do we prohibit the recovery multiplier from being lowered"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "red_clear_cnt", CTLFLAG_RW, &rack_timely_dec_clear, 6, "Rack timely what threshold do we count to before another boost during b/w decent"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "max_push_rise", CTLFLAG_RW, &rack_timely_max_push_rise, 3, "Rack timely how many times do we push up with b/w increase"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "max_push_drop", CTLFLAG_RW, &rack_timely_max_push_drop, 3, "Rack timely how many times do we push back on b/w decent"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "min_segs", CTLFLAG_RW, &rack_timely_min_segs, 4, "Rack timely when setting the cwnd what is the min num segments"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "noback_max", CTLFLAG_RW, &rack_use_max_for_nobackoff, 0, "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "interim_timely_only", CTLFLAG_RW, &rack_timely_int_timely_only, 0, "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "nonstop", CTLFLAG_RW, &rack_timely_no_stopping, 0, "Rack timely don't stop increase"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, &rack_down_raise_thresh, 100, "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timely), OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, &rack_req_segs, 1, "Bottom dragging if not these many segments outstanding and room"); /* TLP and Rack related parameters */ rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlp", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TLP and Rack related Controls"); SYSCTL_ADD_S32(&rack_sysctl_ctx, 
SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "use_rrr", CTLFLAG_RW, &use_rack_rr, 1, "Do we use Rack Rapid Recovery"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, &rack_non_rxt_use_cr, 0, "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlpmethod", CTLFLAG_RW, &rack_tlp_threshold_use, TLP_USE_TWO_ONE, "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "limit", CTLFLAG_RW, &rack_tlp_limit, 2, "How many TLP's can be sent without sending new data"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "use_greater", CTLFLAG_RW, &rack_tlp_use_greater, 1, "Should we use the rack_rtt time if its greater than srtt"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlpminto", CTLFLAG_RW, &rack_tlp_min, 10, "TLP minimum timeout per the specification (10ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "send_oldest", CTLFLAG_RW, &rack_always_send_oldest, 0, "Should we always send the oldest TLP and RACK-TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "rack_tlimit", CTLFLAG_RW, &rack_limited_retran, 0, "How many times can a rack timeout drive out sends"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlp_retry", CTLFLAG_RW, &rack_tlp_max_resend, 2, "How many times does TLP retry a single segment or multiple with no ACK"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, &rack_lower_cwnd_at_tlp, 0, "When a TLP completes a retran should we enter recovery"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "reorder_thresh", CTLFLAG_RW, &rack_reorder_thresh, 2, "What factor for rack will be added when seeing reordering (shift right)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, &rack_tlp_thresh, 1, "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "reorder_fade", CTLFLAG_RW, &rack_reorder_fade, 0, "Does reorder detection fade, if so how many ms (0 means never)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); /* Timer related controls */ rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "timers", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Timer related controls"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "persmin", CTLFLAG_RW, &rack_persist_min, 250, "What is the minimum time in milliseconds between persists"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "persmax", CTLFLAG_RW, &rack_persist_max, 2000, "What is the largest delay in milliseconds between persists"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "delayed_ack", CTLFLAG_RW, &rack_delayed_ack_time, 200, "Delayed ack time (200ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "minrto", CTLFLAG_RW, &rack_rto_min, 0, "Minimum RTO in ms -- set with caution below 1000 due to TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "maxrto", CTLFLAG_RW, &rack_rto_max, 0, "Maxiumum RTO 
in ms -- should be at least as large as min_rto"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "minto", CTLFLAG_RW, &rack_min_to, 1, "Minimum rack timeout in milliseconds"); /* Measure controls */ rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "measure", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Measure related controls"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_measure), OID_AUTO, "wma_divisor", CTLFLAG_RW, &rack_wma_divisor, 8, "When doing b/w calculation what is the divisor for the WMA"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_measure), OID_AUTO, "end_cwnd", CTLFLAG_RW, &rack_cwnd_block_ends_measure, 0, "Does a cwnd just-return end the measurement window (app limited)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_measure), OID_AUTO, "end_rwnd", CTLFLAG_RW, &rack_rwnd_block_ends_measure, 0, "Does an rwnd just-return end the measurement window (app limited -- not persists)"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_measure), OID_AUTO, "min_target", CTLFLAG_RW, &rack_def_data_window, 20, "What is the minimum target window (in mss) for a GP measurement"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_measure), OID_AUTO, "goal_bdp", CTLFLAG_RW, &rack_goal_bdp, 2, "What is the goal BDP to measure"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_measure), OID_AUTO, "min_srtts", CTLFLAG_RW, &rack_min_srtts, 1, "What is the minimum number of SRTT's a GP measurement must span"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_measure), OID_AUTO, "min_measure_tim", CTLFLAG_RW, &rack_min_measure_usec, 0, "What is the minimum time for a measurement, if 0 this is off"); /* Misc rack controls */ rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "misc", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Misc related controls"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "shared_cwnd", CTLFLAG_RW, &rack_enable_shared_cwnd, 0, "Should RACK try to use the shared cwnd on connections where allowed"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, &rack_limits_scwnd, 1, "Should RACK place low end time limits on the shared cwnd feature"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, &rack_enable_mqueue_for_nonpaced, 0, "Should RACK use mbuf queuing for non-paced connections"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "iMac_dack", CTLFLAG_RW, &rack_use_imac_dack, 0, "Should RACK try to emulate iMac delayed ack"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "no_prr", CTLFLAG_RW, &rack_disable_prr, 0, "Should RACK not use prr and only pace (must have pacing on)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "bb_verbose", CTLFLAG_RW, &rack_verbose_logging, 0, "Should RACK black box logging be verbose"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "data_after_close", CTLFLAG_RW, &rack_ignore_data_after_close, 1, "Do we hold off sending a RST until all pending data is ack'd"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "no_sack_needed", CTLFLAG_RW, &rack_sack_not_required, 0, "Do we allow rack to run on connections not supporting SACK"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, &rack_use_proportional_reduce, 0, "Should we proportionally 
reduce cwnd based on the number of losses"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "recovery_prop", CTLFLAG_RW, &rack_proportional_rate, 10, "What percent reduction per loss"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "prr_sendalot", CTLFLAG_RW, &rack_send_a_lot_in_prr, 1, "Send a lot in prr"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "earlyrecovery", CTLFLAG_RW, &rack_early_recovery, 1, "Do we do early recovery with rack"); /* Sack Attacker detection stuff */ SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "detect_highsackratio", CTLFLAG_RW, &rack_highest_sack_thresh_seen, 0, "Highest sack to ack ratio seen"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, &rack_highest_move_thresh_seen, 0, "Highest move to non-move ratio seen"); rack_ack_total = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "acktotal", CTLFLAG_RD, &rack_ack_total, "Total number of Ack's"); rack_express_sack = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "exp_sacktotal", CTLFLAG_RD, &rack_express_sack, "Total number of express Sack's"); rack_sack_total = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "sacktotal", CTLFLAG_RD, &rack_sack_total, "Total number of SACKs"); rack_move_none = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "move_none", CTLFLAG_RD, &rack_move_none, "Total number of SACK index reuse of positions under threshold"); rack_move_some = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "move_some", CTLFLAG_RD, &rack_move_some, "Total number of SACK index reuse of positions over threshold"); rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "attacks", CTLFLAG_RD, &rack_sack_attacks_detected, "Total number of SACK attackers that had sack disabled"); rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "reversed", CTLFLAG_RD, &rack_sack_attacks_reversed, "Total number of SACK attackers that were later determined to be false positives"); rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "nextmerge", CTLFLAG_RD, &rack_sack_used_next_merge, "Total number of times we used the next merge"); rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "prevmerge", CTLFLAG_RD, &rack_sack_used_prev_merge, "Total number of times we used the prev merge"); /* Counters */ rack_badfr = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "badfr", CTLFLAG_RD, &rack_badfr, "Total number of bad FRs"); rack_badfr_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "badfr_bytes", CTLFLAG_RD, &rack_badfr_bytes, "Total bytes of bad FRs"); rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prrsndret", 
CTLFLAG_RD, &rack_rtm_prr_retran, "Total number of prr based retransmits"); rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prrsndnew", CTLFLAG_RD, &rack_rtm_prr_newdata, "Total number of prr based new transmits"); rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tsnf", CTLFLAG_RD, &rack_timestamp_mismatch, "Total number of timestamps that we could not find the reported ts"); rack_find_high = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "findhigh", CTLFLAG_RD, &rack_find_high, "Total number of FIN causing find-high"); rack_reorder_seen = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "reordering", CTLFLAG_RD, &rack_reorder_seen, "Total number of times we added delay due to reordering"); rack_tlp_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_to_total", CTLFLAG_RD, &rack_tlp_tot, "Total number of tail loss probe expirations"); rack_tlp_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_new", CTLFLAG_RD, &rack_tlp_newdata, "Total number of tail loss probe sending new data"); rack_tlp_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran", CTLFLAG_RD, &rack_tlp_retran, "Total number of tail loss probe sending retransmitted data"); rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, &rack_tlp_retran_bytes, "Total bytes of tail loss probe sending retransmitted data"); rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, &rack_tlp_retran_fail, "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); rack_to_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "rack_to_tot", CTLFLAG_RD, &rack_to_tot, "Total number of times the rack to expired"); rack_to_arm_rack = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "arm_rack", CTLFLAG_RD, &rack_to_arm_rack, "Total number of times the rack timer armed"); rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "arm_tlp", CTLFLAG_RD, &rack_to_arm_tlp, "Total number of times the tlp timer armed"); rack_calc_zero = counter_u64_alloc(M_WAITOK); rack_calc_nonzero = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "calc_zero", CTLFLAG_RD, &rack_calc_zero, "Total number of times pacing time worked out to zero"); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "calc_nonzero", CTLFLAG_RD, &rack_calc_nonzero, "Total number of times pacing time worked out to non-zero"); rack_paced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "paced", CTLFLAG_RD, &rack_paced_segments, "Total number of times a segment send 
caused hptsi"); rack_unpaced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "unpaced", CTLFLAG_RD, &rack_unpaced_segments, "Total number of times a segment did not cause hptsi"); rack_saw_enobuf = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "saw_enobufs", CTLFLAG_RD, &rack_saw_enobuf, "Total number of times a segment did not cause hptsi"); rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "saw_enetunreach", CTLFLAG_RD, &rack_saw_enetunreach, "Total number of times a segment did not cause hptsi"); rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allocs", CTLFLAG_RD, &rack_to_alloc, "Total allocations of tracking structures"); rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allochard", CTLFLAG_RD, &rack_to_alloc_hard, "Total allocations done with sleeping the hard way"); rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allocemerg", CTLFLAG_RD, &rack_to_alloc_emerg, "Total allocations done from emergency cache"); rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "alloc_limited", CTLFLAG_RD, &rack_to_alloc_limited, "Total allocations dropped due to limit"); rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, &rack_alloc_limited_conns, "Connections with allocations dropped due to limit"); rack_split_limited = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "split_limited", CTLFLAG_RD, &rack_split_limited, "Split allocations dropped due to limit"); rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_long", CTLFLAG_RD, &rack_sack_proc_all, "Total times we had to walk whole list for sack processing"); rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_restart", CTLFLAG_RD, &rack_sack_proc_restart, "Total times we had to walk whole list due to a restart"); rack_sack_proc_short = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_short", CTLFLAG_RD, &rack_sack_proc_short, "Total times we took shortcut for sack processing"); rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, &rack_enter_tlp_calc, "Total times we called calc-tlp"); rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "hit_tlp_method", CTLFLAG_RD, &rack_used_tlpmethod, "Total number of runt sacks"); rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, &rack_used_tlpmethod2, "Total number of times we hit TLP method 2"); rack_sack_skipped_acked = 
counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "skipacked", CTLFLAG_RD, &rack_sack_skipped_acked, "Total number of times we skipped previously sacked"); rack_sack_splits = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), OID_AUTO, "ofsplit", CTLFLAG_RD, &rack_sack_splits, "Total number of times we did the old fashion tree split"); rack_progress_drops = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prog_drops", CTLFLAG_RD, &rack_progress_drops, "Total number of progress drops"); rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, &rack_input_idle_reduces, "Total number of idle reductions on input"); rack_collapsed_win = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "collapsed_win", CTLFLAG_RD, &rack_collapsed_win, "Total number of collapsed windows"); rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_nada", CTLFLAG_RD, &rack_tlp_does_nada, "Total number of nada tlp calls"); rack_try_scwnd = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tried_scwnd", CTLFLAG_RD, &rack_try_scwnd, "Total number of scwnd attempts"); rack_per_timer_hole = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "timer_hole", CTLFLAG_RD, &rack_per_timer_hole, "Total persists start in timer hole"); COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "opts", CTLFLAG_RD, rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); SYSCTL_ADD_PROC(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); } static __inline int rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) { if (SEQ_GEQ(b->r_start, a->r_start) && SEQ_LT(b->r_start, a->r_end)) { /* * The entry b is within the * block a. i.e.: * a -- |-------------| * b -- |----| * * b -- |------| * * b -- |-----------| */ return (0); } else if (SEQ_GEQ(b->r_start, a->r_end)) { /* * b falls as either the next * sequence block after a so a * is said to be smaller than b. * i.e: * a -- |------| * b -- |--------| * or * b -- |-----| */ return (1); } /* * Whats left is where a is * larger than b. i.e: * a -- |-------| * b -- |---| * or even possibly * b -- |--------------| */ return (-1); } RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); static uint32_t rc_init_window(struct tcp_rack *rack) { uint32_t win; if (rack->rc_init_win == 0) { /* * Nothing set by the user, use the system stack * default. 
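 * Otherwise rc_init_win is a user supplied segment count and is scaled by the fixed MSS below; e.g. (illustrative numbers only) a value of 10 with a 1448 byte MSS works out to a 14480 byte initial window.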
*/ return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); } win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; return(win); } static uint64_t rack_get_fixed_pacing_bw(struct tcp_rack *rack) { if (IN_RECOVERY(rack->rc_tp->t_flags)) return (rack->r_ctl.rc_fixed_pacing_rate_rec); else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) return (rack->r_ctl.rc_fixed_pacing_rate_ss); else return (rack->r_ctl.rc_fixed_pacing_rate_ca); } static uint64_t rack_get_bw(struct tcp_rack *rack) { if (rack->use_fixed_rate) { /* Return the fixed pacing rate */ return (rack_get_fixed_pacing_bw(rack)); } if (rack->r_ctl.gp_bw == 0) { /* * We have yet no b/w measurement, * if we have a user set initial bw * return it. If we don't have that and * we have an srtt, use the tcp IW (10) to * calculate a fictional b/w over the SRTT * which is more or less a guess. Note * we don't use our IW from rack on purpose * so if we have like IW=30, we are not * calculating a "huge" b/w. */ uint64_t bw, srtt; if (rack->r_ctl.init_rate) return (rack->r_ctl.init_rate); /* Has the user set a max peak rate? */ #ifdef NETFLIX_PEAKRATE if (rack->rc_tp->t_maxpeakrate) return (rack->rc_tp->t_maxpeakrate); #endif /* Ok lets come up with the IW guess, if we have a srtt */ if (rack->rc_tp->t_srtt == 0) { /* * Go with old pacing method * i.e. burst mitigation only. */ return (0); } /* Ok lets get the initial TCP win (not racks) */ bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); bw *= (uint64_t)USECS_IN_SECOND; bw /= srtt; return (bw); } else { uint64_t bw; if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { /* Averaging is done, we can return the value */ bw = rack->r_ctl.gp_bw; } else { /* Still doing initial average must calculate */ bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; } #ifdef NETFLIX_PEAKRATE if ((rack->rc_tp->t_maxpeakrate) && (bw > rack->rc_tp->t_maxpeakrate)) { /* The user has set a peak rate to pace at * don't allow us to pace faster than that. */ return (rack->rc_tp->t_maxpeakrate); } #endif return (bw); } } static uint16_t rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) { if (rack->use_fixed_rate) { return (100); } else if (rack->in_probe_rtt && (rsm == NULL)) return(rack->r_ctl.rack_per_of_gp_probertt); else if ((IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.rack_per_of_gp_rec)) { if (rsm) { /* a retransmission always use the recovery rate */ return(rack->r_ctl.rack_per_of_gp_rec); } else if (rack->rack_rec_nonrxt_use_cr) { /* Directed to use the configured rate */ goto configured_rate; } else if (rack->rack_no_prr && (rack->r_ctl.rack_per_of_gp_rec > 100)) { /* No PRR, lets just use the b/w estimate only */ return(100); } else { /* * Here we may have a non-retransmit but we * have no overrides, so just use the recovery * rate (prr is in effect). */ return(rack->r_ctl.rack_per_of_gp_rec); } } configured_rate: /* For the configured rate we look at our cwnd vs the ssthresh */ if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) return (rack->r_ctl.rack_per_of_gp_ss); else return(rack->r_ctl.rack_per_of_gp_ca); } static uint64_t rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) { /* * We allow rack_per_of_gp_xx to dictate our bw rate we want. 
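 * The gain is a percentage: as an illustration (hypothetical numbers), a gain of 150 applied to a measured b/w of 1,000,000 bytes/sec yields a pacing estimate of 1,500,000 bytes/sec below, floored at RACK_MIN_BW.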
*/ uint64_t bw_est; uint64_t gain; gain = (uint64_t)rack_get_output_gain(rack, rsm); bw_est = bw * gain; bw_est /= (uint64_t)100; /* Never fall below the minimum (def 64kbps) */ if (bw_est < RACK_MIN_BW) bw_est = RACK_MIN_BW; return (bw_est); } static void rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; if ((mod != 1) && (rack_verbose_logging == 0)) { /* * We get 3 values currently for mod * 1 - We are retransmitting and this tells the reason. * 2 - We are clearing a dup-ack count. * 3 - We are incrementing a dup-ack count. * * The clear/increment are only logged * if you have BBverbose on. */ return; } memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = tsused; log.u_bbr.flex2 = thresh; log.u_bbr.flex3 = rsm->r_flags; log.u_bbr.flex4 = rsm->r_dupack; log.u_bbr.flex5 = rsm->r_start; log.u_bbr.flex6 = rsm->r_end; log.u_bbr.flex8 = mod; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_SETTINGS_CHG, 0, 0, &log, false, &tv); } } static void rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); log.u_bbr.flex2 = to * 1000; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex7 = rack->rc_in_persist; log.u_bbr.flex8 = which; if (rack->rack_no_prr) log.u_bbr.pkts_out = 0; else log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERSTAR, 0, 0, &log, false, &tv); } } static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.flex2 = rack->rc_rack_rtt; if (rsm == NULL) log.u_bbr.flex3 = 0; else log.u_bbr.flex3 = rsm->r_end - rsm->r_start; if (rack->rack_no_prr) log.u_bbr.flex5 = 0; else log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_RTO, 0, 0, &log, false, &tv); } } static void rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, struct rack_sendmap *rsm, int conf) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; 
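	/* Record the rtt sample (t), its length, the rack rtt bounds and assorted state flags for black box logging. */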
memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = t; log.u_bbr.flex2 = len; log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; log.u_bbr.flex7 = conf; log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; if (rack->rack_no_prr) log.u_bbr.pkts_out = 0; else log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); if (rsm) { log.u_bbr.pkt_epoch = rsm->r_start; log.u_bbr.lost = rsm->r_end; log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; } else { /* Its a SYN */ log.u_bbr.pkt_epoch = rack->rc_tp->iss; log.u_bbr.lost = 0; log.u_bbr.cwnd_gain = 0; } /* Write out general bits of interest rrs here */ log.u_bbr.use_lt_bw = rack->rc_highly_buffered; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->forced_ack; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->in_probe_rtt; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->rc_gp_filled; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRRTT, 0, 0, &log, false, &tv); } } static void rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) { /* * Log the rtt sample we are * applying to the srtt algorithm in * useconds. 
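 * The rtt argument arrives in milliseconds and is scaled to microseconds (rtt * 1000) before being logged.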
*/ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; /* Convert our ms to a microsecond */ memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = rtt * 1000; log.u_bbr.flex2 = rack->r_ctl.ack_count; log.u_bbr.flex3 = rack->r_ctl.sack_count; log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; log.u_bbr.flex8 = rack->sack_attack_disable; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, TCP_LOG_RTT, 0, 0, &log, false, &tv); } } static inline void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) { if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; log.u_bbr.flex2 = tick; log.u_bbr.flex3 = tp->t_maxunacktime; log.u_bbr.flex4 = tp->t_acktime; log.u_bbr.flex8 = event; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, 0, &log, false, &tv); } } static void rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; if (rack->rack_no_prr) log.u_bbr.flex2 = 0; else log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; log.u_bbr.timeStamp = cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, 0, &log, false, tv); } } static void rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; if (rack->rack_no_prr) log.u_bbr.flex5 = 0; else log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex7 = rack->r_wanted_output; log.u_bbr.flex8 = rack->rc_in_persist; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, 0, &log, false, &tv); } } static void rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; uint32_t cts; memset(&log, 0, sizeof(log)); cts = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex3 = 
rack->r_ctl.rc_pace_max_segs; log.u_bbr.flex4 = len; log.u_bbr.flex5 = orig_len; log.u_bbr.flex6 = rack->r_ctl.rc_sacked; log.u_bbr.flex7 = mod; log.u_bbr.flex8 = frm; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv); } } static void rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = reason; if (rack->rack_no_prr) log.u_bbr.flex5 = 0; else log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex7 = hpts_calling; log.u_bbr.flex8 = rack->rc_in_persist; log.u_bbr.lt_epoch = cwnd_to_use; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, tlen, &log, false, &tv); } } static void rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, struct timeval *tv, uint32_t flags_on_entry) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = line; log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; log.u_bbr.flex3 = flags_on_entry; log.u_bbr.flex4 = us_cts; if (rack->rack_no_prr) log.u_bbr.flex5 = 0; else log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex7 = hpts_removed; log.u_bbr.flex8 = 1; log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; log.u_bbr.timeStamp = us_cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, 0, &log, false, tv); } } static void rack_log_alt_to_to_cancel(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, uint32_t flex5, uint32_t flex6, uint16_t flex7, uint8_t mod) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; if (mod == 1) { /* No you can't use 1, its for the real to cancel */ return; } memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.flex1 = flex1; log.u_bbr.flex2 = flex2; log.u_bbr.flex3 = flex3; log.u_bbr.flex4 = flex4; log.u_bbr.flex5 = flex5; log.u_bbr.flex6 = flex6; log.u_bbr.flex7 = flex7; log.u_bbr.flex8 = mod; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, 0, &log, false, &tv); } } static void rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = timers; log.u_bbr.flex2 = ret; log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; log.u_bbr.flex4 = 
rack->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; if (rack->rack_no_prr) log.u_bbr.flex6 = 0; else log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, 0, &log, false, &tv); } } static void rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; if (rack->rack_no_prr) log.u_bbr.flex3 = 0; else log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; log.u_bbr.flex5 = rack->r_ctl.rc_sacked; log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; log.u_bbr.flex8 = frm; log.u_bbr.pkts_out = orig_cwnd; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRUPD, 0, 0, &log, false, &tv); } } #ifdef NETFLIX_EXP_DETECTION static void rack_log_sad(struct tcp_rack *rack, int event) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = rack->r_ctl.sack_count; log.u_bbr.flex2 = rack->r_ctl.ack_count; log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; log.u_bbr.flex6 = tcp_sack_to_ack_thresh; log.u_bbr.pkts_out = tcp_sack_to_move_thresh; log.u_bbr.lt_epoch = (tcp_force_detection << 8); log.u_bbr.lt_epoch |= rack->do_detection; log.u_bbr.applimited = tcp_map_minimum; log.u_bbr.flex7 = rack->sack_attack_disable; log.u_bbr.flex8 = event; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.delivered = tcp_sad_decay_val; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, TCP_SAD_DETECTION, 0, 0, &log, false, &tv); } } #endif static void rack_counter_destroy(void) { counter_u64_free(rack_ack_total); counter_u64_free(rack_express_sack); counter_u64_free(rack_sack_total); counter_u64_free(rack_move_none); counter_u64_free(rack_move_some); counter_u64_free(rack_sack_attacks_detected); counter_u64_free(rack_sack_attacks_reversed); counter_u64_free(rack_sack_used_next_merge); counter_u64_free(rack_sack_used_prev_merge); counter_u64_free(rack_badfr); counter_u64_free(rack_badfr_bytes); counter_u64_free(rack_rtm_prr_retran); counter_u64_free(rack_rtm_prr_newdata); counter_u64_free(rack_timestamp_mismatch); counter_u64_free(rack_find_high); counter_u64_free(rack_reorder_seen); counter_u64_free(rack_tlp_tot); counter_u64_free(rack_tlp_newdata); counter_u64_free(rack_tlp_retran); counter_u64_free(rack_tlp_retran_bytes); counter_u64_free(rack_tlp_retran_fail); counter_u64_free(rack_to_tot); counter_u64_free(rack_to_arm_rack); counter_u64_free(rack_to_arm_tlp); counter_u64_free(rack_calc_zero); counter_u64_free(rack_calc_nonzero); counter_u64_free(rack_paced_segments); counter_u64_free(rack_unpaced_segments); counter_u64_free(rack_saw_enobuf); counter_u64_free(rack_saw_enetunreach); counter_u64_free(rack_to_alloc); 
counter_u64_free(rack_to_alloc_hard); counter_u64_free(rack_to_alloc_emerg); counter_u64_free(rack_to_alloc_limited); counter_u64_free(rack_alloc_limited_conns); counter_u64_free(rack_split_limited); counter_u64_free(rack_sack_proc_all); counter_u64_free(rack_sack_proc_restart); counter_u64_free(rack_sack_proc_short); counter_u64_free(rack_enter_tlp_calc); counter_u64_free(rack_used_tlpmethod); counter_u64_free(rack_used_tlpmethod2); counter_u64_free(rack_sack_skipped_acked); counter_u64_free(rack_sack_splits); counter_u64_free(rack_progress_drops); counter_u64_free(rack_input_idle_reduces); counter_u64_free(rack_collapsed_win); counter_u64_free(rack_tlp_does_nada); counter_u64_free(rack_try_scwnd); counter_u64_free(rack_per_timer_hole); COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); } static struct rack_sendmap * rack_alloc(struct tcp_rack *rack) { struct rack_sendmap *rsm; rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { rack->r_ctl.rc_num_maps_alloced++; counter_u64_add(rack_to_alloc, 1); return (rsm); } if (rack->rc_free_cnt) { counter_u64_add(rack_to_alloc_emerg, 1); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); rack->rc_free_cnt--; return (rsm); } return (NULL); } static struct rack_sendmap * rack_alloc_full_limit(struct tcp_rack *rack) { if ((V_tcp_map_entries_limit > 0) && (rack->do_detection == 0) && (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { counter_u64_add(rack_to_alloc_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; counter_u64_add(rack_alloc_limited_conns, 1); } return (NULL); } return (rack_alloc(rack)); } /* wrapper to allocate a sendmap entry, subject to a specific limit */ static struct rack_sendmap * rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) { struct rack_sendmap *rsm; if (limit_type) { /* currently there is only one limit type */ if (V_tcp_map_split_limit > 0 && (rack->do_detection == 0) && rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { counter_u64_add(rack_split_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; counter_u64_add(rack_alloc_limited_conns, 1); } return (NULL); } } /* allocate and mark in the limit type, if set */ rsm = rack_alloc(rack); if (rsm != NULL && limit_type) { rsm->r_limit_type = limit_type; rack->r_ctl.rc_num_split_allocs++; } return (rsm); } static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) { if (rsm->r_flags & RACK_APP_LIMITED) { if (rack->r_ctl.rc_app_limited_cnt > 0) { rack->r_ctl.rc_app_limited_cnt--; } } if (rsm->r_limit_type) { /* currently there is only one limit type */ rack->r_ctl.rc_num_split_allocs--; } if (rsm == rack->r_ctl.rc_first_appl) { if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_first_appl = NULL; else { /* Follow the next one out */ struct rack_sendmap fe; fe.r_start = rsm->r_nseq_appl; rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); } } if (rsm == rack->r_ctl.rc_resend) rack->r_ctl.rc_resend = NULL; if (rsm == rack->r_ctl.rc_rsm_at_retran) rack->r_ctl.rc_rsm_at_retran = NULL; if (rsm == rack->r_ctl.rc_end_appl) rack->r_ctl.rc_end_appl = NULL; if (rack->r_ctl.rc_tlpsend == rsm) rack->r_ctl.rc_tlpsend = NULL; if (rack->r_ctl.rc_sacklast == rsm) rack->r_ctl.rc_sacklast = NULL; if (rack->rc_free_cnt < rack_free_cache) { memset(rsm, 0, sizeof(struct rack_sendmap)); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); rsm->r_limit_type = 0; 
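		/* The zeroed entry stays on our local free list for reuse instead of going back to the zone. */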
rack->rc_free_cnt++; return; } rack->r_ctl.rc_num_maps_alloced--; uma_zfree(rack_zone, rsm); } static uint32_t rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) { uint64_t srtt, bw, len, tim; uint32_t segsiz, def_len, minl; segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); def_len = rack_def_data_window * segsiz; if (rack->rc_gp_filled == 0) { /* * We have no measurement (IW is in flight?) so * we can only guess using our data_window sysctl * value (usually 100MSS). */ return (def_len); } /* * Now we have a number of factors to consider. * * 1) We have a desired BDP which is usually * at least 2. * 2) We have a minimum number of rtt's usually 1 SRTT * but we allow it too to be more. * 3) We want to make sure a measurement last N useconds (if * we have set rack_min_measure_usec. * * We handle the first concern here by trying to create a data * window of max(rack_def_data_window, DesiredBDP). The * second concern we handle in not letting the measurement * window end normally until at least the required SRTT's * have gone by which is done further below in * rack_enough_for_measurement(). Finally the third concern * we also handle here by calculating how long that time * would take at the current BW and then return the * max of our first calculation and that length. Note * that if rack_min_measure_usec is 0, we don't deal * with concern 3. Also for both Concern 1 and 3 an * application limited period could end the measurement * earlier. * * So lets calculate the BDP with the "known" b/w using * the SRTT has our rtt and then multiply it by the * goal. */ bw = rack_get_bw(rack); srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); len = bw * srtt; len /= (uint64_t)HPTS_USEC_IN_SEC; len *= max(1, rack_goal_bdp); /* Now we need to round up to the nearest MSS */ len = roundup(len, segsiz); if (rack_min_measure_usec) { /* Now calculate our min length for this b/w */ tim = rack_min_measure_usec; minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; if (minl == 0) minl = 1; minl = roundup(minl, segsiz); if (len < minl) len = minl; } /* * Now if we have a very small window we want * to attempt to get the window that is * as small as possible. This happens on * low b/w connections and we don't want to * span huge numbers of rtt's between measurements. * * We basically include 2 over our "MIN window" so * that the measurement can be shortened (possibly) by * an ack'ed packet. */ if (len < def_len) return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); else return (max((uint32_t)len, def_len)); } static int rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack) { uint32_t tim, srtts, segsiz; /* * Has enough time passed for the GP measurement to be valid? */ if ((tp->snd_max == tp->snd_una) || (th_ack == tp->snd_max)){ /* All is acked */ return (1); } if (SEQ_LT(th_ack, tp->gput_seq)) { /* Not enough bytes yet */ return (0); } segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); if (SEQ_LT(th_ack, tp->gput_ack) && ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { /* Not enough bytes yet */ return (0); } if (rack->r_ctl.rc_first_appl && (rack->r_ctl.rc_first_appl->r_start == th_ack)) { /* * We are up to the app limited point * we have to measure irrespective of the time.. */ return (1); } /* Now what about time? 
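	 * The measurement must also span at least rack_min_srtts worth of the current goodput srtt before it is considered complete.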
*/ srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; if (tim >= srtts) { return (1); } /* Nope not even a full SRTT has passed */ return (0); } static void rack_log_timely(struct tcp_rack *rack, uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, uint64_t up_bnd, int line, uint8_t method) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = logged; log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; log.u_bbr.flex2 <<= 4; log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; log.u_bbr.flex2 <<= 4; log.u_bbr.flex2 |= rack->rc_gp_incr; log.u_bbr.flex2 <<= 4; log.u_bbr.flex2 |= rack->rc_gp_bwred; log.u_bbr.flex3 = rack->rc_gp_incr; log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; log.u_bbr.flex7 = rack->rc_gp_bwred; log.u_bbr.flex8 = method; log.u_bbr.cur_del_rate = cur_bw; log.u_bbr.delRate = low_bnd; log.u_bbr.bw_inuse = up_bnd; log.u_bbr.rttProp = rack_get_bw(rack); log.u_bbr.pkt_epoch = line; log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; log.u_bbr.cwnd_gain <<= 1; log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; log.u_bbr.cwnd_gain <<= 1; log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; log.u_bbr.cwnd_gain <<= 1; log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; log.u_bbr.lost = rack->r_ctl.rc_loss_count; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, TCP_TIMELY_WORK, 0, 0, &log, false, &tv); } } static int rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) { /* * Before we increase we need to know if * the estimate just made was less than * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) * * If we already are pacing at a fast enough * rate to push us faster there is no sense of * increasing. * * We first caculate our actual pacing rate (ss or ca multipler * times our cur_bw). * * Then we take the last measured rate and multipy by our * maximum pacing overage to give us a max allowable rate. * * If our act_rate is smaller than our max_allowable rate * then we should increase. Else we should hold steady. * */ uint64_t act_rate, max_allow_rate; if (rack_timely_no_stopping) return (1); if ((cur_bw == 0) || (last_bw_est == 0)) { /* * Initial startup case or * everything is acked case. */ rack_log_timely(rack, mult, cur_bw, 0, 0, __LINE__, 9); return (1); } if (mult <= 100) { /* * We can always pace at or slightly above our rate. */ rack_log_timely(rack, mult, cur_bw, 0, 0, __LINE__, 9); return (1); } act_rate = cur_bw * (uint64_t)mult; act_rate /= 100; max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); max_allow_rate /= 100; if (act_rate < max_allow_rate) { /* * Here the rate we are actually pacing at * is smaller than 10% above our last measurement. * This means we are pacing below what we would * like to try to achieve (plus some wiggle room). */ rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, __LINE__, 9); return (1); } else { /* * Here we are already pacing at least rack_max_per_above(10%) * what we are getting back. 
This indicates most likely * that we are being limited (cwnd/rwnd/app) and can't * get any more b/w. There is no sense of trying to * raise up the pacing rate its not speeding us up * and we already are pacing faster than we are getting. */ rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, __LINE__, 8); return (0); } } static void rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) { /* * When we drag bottom, we want to assure * that no multiplier is below 1.0, if so * we want to restore it to at least that. */ if (rack->r_ctl.rack_per_of_gp_rec < 100) { /* This is unlikely we usually do not touch recovery */ rack->r_ctl.rack_per_of_gp_rec = 100; } if (rack->r_ctl.rack_per_of_gp_ca < 100) { rack->r_ctl.rack_per_of_gp_ca = 100; } if (rack->r_ctl.rack_per_of_gp_ss < 100) { rack->r_ctl.rack_per_of_gp_ss = 100; } } static void rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) { if (rack->r_ctl.rack_per_of_gp_ca > 100) { rack->r_ctl.rack_per_of_gp_ca = 100; } if (rack->r_ctl.rack_per_of_gp_ss > 100) { rack->r_ctl.rack_per_of_gp_ss = 100; } } static void rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) { int32_t calc, logged, plus; logged = 0; if (override) { /* * override is passed when we are * loosing b/w and making one last * gasp at trying to not loose out * to a new-reno flow. */ goto extra_boost; } /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ if (rack->rc_gp_incr && ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { /* * Reset and get 5 strokes more before the boost. Note * that the count is 0 based so we have to add one. */ extra_boost: plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; rack->rc_gp_timely_inc_cnt = 0; } else plus = (uint32_t)rack_gp_increase_per; /* Must be at least 1% increase for true timely increases */ if ((plus < 1) && ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) plus = 1; if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0) && rack_bw_can_be_raised(rack, cur_bw, last_bw_est, rack->r_ctl.rack_per_of_gp_rec)) { /* We have been in recovery ding it too */ calc = rack->r_ctl.rack_per_of_gp_rec + plus; if (calc > 0xffff) calc = 0xffff; logged |= 1; rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; if (rack_per_upper_bound_ss && (rack->rc_dragged_bottom == 0) && (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; } if (rack->rc_gp_saw_ca && (rack->rc_gp_saw_ss == 0) && rack_bw_can_be_raised(rack, cur_bw, last_bw_est, rack->r_ctl.rack_per_of_gp_ca)) { /* In CA */ calc = rack->r_ctl.rack_per_of_gp_ca + plus; if (calc > 0xffff) calc = 0xffff; logged |= 2; rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; if (rack_per_upper_bound_ca && (rack->rc_dragged_bottom == 0) && (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; } if (rack->rc_gp_saw_ss && rack_bw_can_be_raised(rack, cur_bw, last_bw_est, rack->r_ctl.rack_per_of_gp_ss)) { /* In SS */ calc = rack->r_ctl.rack_per_of_gp_ss + plus; if (calc > 0xffff) calc = 0xffff; rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; if (rack_per_upper_bound_ss && (rack->rc_dragged_bottom == 0) && (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; logged |= 4; } if (logged && (rack->rc_gp_incr == 0)){ /* Go into increment mode */ rack->rc_gp_incr = 1; rack->rc_gp_timely_inc_cnt = 
0; } if (rack->rc_gp_incr && logged && (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { rack->rc_gp_timely_inc_cnt++; } rack_log_timely(rack, logged, plus, 0, 0, __LINE__, 1); } static uint32_t rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) { /* * norm_grad = rtt_diff / minrtt; * new_per = curper * (1 - B * norm_grad) * * B = rack_gp_decrease_per (default 10%) * rtt_dif = input var current rtt-diff * curper = input var current percentage * minrtt = from rack filter * */ uint64_t perf; perf = (((uint64_t)curper * ((uint64_t)1000000 - ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * (((uint64_t)rtt_diff * (uint64_t)1000000)/ (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ (uint64_t)1000000)) / (uint64_t)1000000); if (perf > curper) { /* TSNH */ perf = curper - 1; } return ((uint32_t)perf); } static uint32_t rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) { /* * highrttthresh * result = curper * (1 - (B * ( 1 - ------ )) * gp_srtt * * B = rack_gp_decrease_per (default 10%) * highrttthresh = filter_min * rack_gp_rtt_maxmul */ uint64_t perf; uint32_t highrttthresh; highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; perf = (((uint64_t)curper * ((uint64_t)1000000 - ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - ((uint64_t)highrttthresh * (uint64_t)1000000) / (uint64_t)rtt)) / 100)) /(uint64_t)1000000); return (perf); } static void rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) { uint64_t logvar, logvar2, logvar3; uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; if (rack->rc_gp_incr) { /* Turn off increment counting */ rack->rc_gp_incr = 0; rack->rc_gp_timely_inc_cnt = 0; } ss_red = ca_red = rec_red = 0; logged = 0; /* Calculate the reduction value */ if (rtt_diff < 0) { rtt_diff *= -1; } /* Must be at least 1% reduction */ if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { /* We have been in recovery ding it too */ if (timely_says == 2) { new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); if (alt < new_per) val = alt; else val = new_per; } else val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); if (rack->r_ctl.rack_per_of_gp_rec > val) { rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; } else { rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; rec_red = 0; } if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; logged |= 1; } if (rack->rc_gp_saw_ss) { /* Sent in SS */ if (timely_says == 2) { new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); if (alt < new_per) val = alt; else val = new_per; } else val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); if (rack->r_ctl.rack_per_of_gp_ss > new_per) { ss_red = rack->r_ctl.rack_per_of_gp_ss - val; rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; } else { ss_red = new_per; rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; logvar = new_per; logvar <<= 32; logvar |= alt; logvar2 = (uint32_t)rtt; logvar2 <<= 32; logvar2 |= (uint32_t)rtt_diff; logvar3 = rack_gp_rtt_maxmul; logvar3 <<= 32; logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); rack_log_timely(rack, timely_says, 
logvar2, logvar3, logvar, __LINE__, 10); } if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; logged |= 4; } else if (rack->rc_gp_saw_ca) { /* Sent in CA */ if (timely_says == 2) { new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); if (alt < new_per) val = alt; else val = new_per; } else val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); if (rack->r_ctl.rack_per_of_gp_ca > val) { ca_red = rack->r_ctl.rack_per_of_gp_ca - val; rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val; } else { rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; ca_red = 0; logvar = new_per; logvar <<= 32; logvar |= alt; logvar2 = (uint32_t)rtt; logvar2 <<= 32; logvar2 |= (uint32_t)rtt_diff; logvar3 = rack_gp_rtt_maxmul; logvar3 <<= 32; logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); rack_log_timely(rack, timely_says, logvar2, logvar3, logvar, __LINE__, 10); } if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; logged |= 2; } if (rack->rc_gp_timely_dec_cnt < 0x7) { rack->rc_gp_timely_dec_cnt++; if (rack_timely_dec_clear && (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) rack->rc_gp_timely_dec_cnt = 0; } logvar = ss_red; logvar <<= 32; logvar |= ca_red; rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, __LINE__, 2); } static void rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, uint32_t rtt, uint32_t line, uint8_t reas) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = line; log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; log.u_bbr.flex5 = rtt; log.u_bbr.flex6 = rack->rc_highly_buffered; log.u_bbr.flex6 <<= 1; log.u_bbr.flex6 |= rack->forced_ack; log.u_bbr.flex6 <<= 1; log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; log.u_bbr.flex6 <<= 1; log.u_bbr.flex6 |= rack->in_probe_rtt; log.u_bbr.flex6 <<= 1; log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; log.u_bbr.flex8 = reas; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.delRate = rack_get_bw(rack); log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; log.u_bbr.cur_del_rate <<= 32; log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); log.u_bbr.rttProp = us_cts; log.u_bbr.rttProp <<= 32; log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_RTT_SHRINKS, 0, 0, &log, false, &rack->r_ctl.act_rcv_time); } } static void rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) { uint64_t bwdp; bwdp = rack_get_bw(rack); bwdp 
*= (uint64_t)rtt; bwdp /= (uint64_t)HPTS_USEC_IN_SEC; rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) { /* * A window protocol must be able to have 4 packets * outstanding as the floor in order to function * (especially considering delayed ack :D). */ rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); } } static void rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) { /** * ProbeRTT is a bit different in rack_pacing than in * BBR. It is like BBR in that it uses the lowering of * the RTT as a signal that we saw something new and * counts from there for how long between. But it is * different in that its quite simple. It does not * play with the cwnd and wait until we get down * to N segments outstanding and hold that for * 200ms. Instead it just sets the pacing reduction * rate to a set percentage (70 by default) and hold * that for a number of recent GP Srtt's. */ uint32_t segsiz; if (rack->rc_gp_dyn_mul == 0) return; if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { /* We are idle */ return; } if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { /* * Stop the goodput now, the idea here is * that future measurements with in_probe_rtt * won't register if they are not greater so * we want to get what info (if any) is available * now. */ rack_do_goodput_measurement(rack->rc_tp, rack, rack->rc_tp->snd_una, __LINE__); } rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; rack->r_ctl.rc_time_probertt_entered = us_cts; segsiz = min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); rack->in_probe_rtt = 1; rack->measure_saw_probe_rtt = 1; rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_probertt_starts = 0; rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; if (rack_probertt_use_min_rtt_entry) rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); else rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), __LINE__, RACK_RTTS_ENTERPROBE); } static void rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) { struct rack_sendmap *rsm; uint32_t segsiz; segsiz = min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); rack->in_probe_rtt = 0; if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { /* * Stop the goodput now, the idea here is * that future measurements with in_probe_rtt * won't register if they are not greater so * we want to get what info (if any) is available * now. */ rack_do_goodput_measurement(rack->rc_tp, rack, rack->rc_tp->snd_una, __LINE__); } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { /* * We don't have enough data to make a measurement. * So lets just stop and start here after exiting * probe-rtt. We probably are not interested in * the results anyway. */ rack->rc_tp->t_flags &= ~TF_GPUTINPROG; } /* * Measurements through the current snd_max are going * to be limited by the slower pacing rate. * * We need to mark these as app-limited so we * don't collapse the b/w. 
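 * (These sends went out at the reduced probe-rtt pacing rate, so
 * counting them as normal goodput samples would drag the b/w
 * estimate down; the RACK_APP_LIMITED marking below keeps them
 * from being treated as real bandwidth evidence.)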
*/ rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; else { /* * Go out to the end app limited and mark * this new one as next and move the end_appl up * to this guy. */ if (rack->r_ctl.rc_end_appl) rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; rack->r_ctl.rc_end_appl = rsm; } rsm->r_flags |= RACK_APP_LIMITED; rack->r_ctl.rc_app_limited_cnt++; } /* * Now, we need to examine our pacing rate multipliers. * If its under 100%, we need to kick it back up to * 100%. We also don't let it be over our "max" above * the actual rate i.e. 100% + rack_clamp_atexit_prtt. * Note setting clamp_atexit_prtt to 0 has the effect * of setting CA/SS to 100% always at exit (which is * the default behavior). */ if (rack_probertt_clear_is) { rack->rc_gp_incr = 0; rack->rc_gp_bwred = 0; rack->rc_gp_timely_inc_cnt = 0; rack->rc_gp_timely_dec_cnt = 0; } /* Do we do any clamping at exit? */ if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; } if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; } /* * Lets set rtt_diff to 0, so that we will get a "boost" * after exiting. */ rack->r_ctl.rc_rtt_diff = 0; /* Clear all flags so we start fresh */ rack->rc_tp->t_bytes_acked = 0; rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; /* * If configured to, set the cwnd and ssthresh to * our targets. */ if (rack_probe_rtt_sets_cwnd) { uint64_t ebdp; uint32_t setto; /* Set ssthresh so we get into CA once we hit our target */ if (rack_probertt_use_min_rtt_exit == 1) { /* Set to min rtt */ rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); } else if (rack_probertt_use_min_rtt_exit == 2) { /* Set to current gp rtt */ rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); } else if (rack_probertt_use_min_rtt_exit == 3) { /* Set to entry gp rtt */ rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_entry_gp_rtt); } else { uint64_t sum; uint32_t setval; sum = rack->r_ctl.rc_entry_gp_rtt; sum *= 10; sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); if (sum >= 20) { /* * A highly buffered path needs * cwnd space for timely to work. * Lets set things up as if * we are heading back here again. */ setval = rack->r_ctl.rc_entry_gp_rtt; } else if (sum >= 15) { /* * Lets take the smaller of the * two since we are just somewhat * buffered. */ setval = rack->r_ctl.rc_gp_srtt; if (setval > rack->r_ctl.rc_entry_gp_rtt) setval = rack->r_ctl.rc_entry_gp_rtt; } else { /* * Here we are not highly buffered * and should pick the min we can to * keep from causing loss. 
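 * (sum is rc_entry_gp_rtt scaled by 10 and divided by the current
 * gp_srtt, so sum >= 20 means the rtt at entry was at least twice
 * the current one (deep buffers), 15-19 means somewhat buffered,
 * and anything less falls through to the filtered min rtt here.)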
*/ setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); } rack_set_prtt_target(rack, segsiz, setval); } if (rack_probe_rtt_sets_cwnd > 1) { /* There is a percentage here to boost */ ebdp = rack->r_ctl.rc_target_probertt_flight; ebdp *= rack_probe_rtt_sets_cwnd; ebdp /= 100; setto = rack->r_ctl.rc_target_probertt_flight + ebdp; } else setto = rack->r_ctl.rc_target_probertt_flight; rack->rc_tp->snd_cwnd = roundup(setto, segsiz); if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { /* Enforce a min */ rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; } /* If we set in the cwnd also set the ssthresh point so we are in CA */ rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); } rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), __LINE__, RACK_RTTS_EXITPROBE); /* Clear times last so log has all the info */ rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; rack->r_ctl.rc_time_probertt_entered = us_cts; rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_of_last_probertt = us_cts; } static void rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) { /* Check in on probe-rtt */ if (rack->rc_gp_filled == 0) { /* We do not do p-rtt unless we have gp measurements */ return; } if (rack->in_probe_rtt) { uint64_t no_overflow; uint32_t endtime, must_stay; if (rack->r_ctl.rc_went_idle_time && ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { /* * We went idle during prtt, just exit now. */ rack_exit_probertt(rack, us_cts); } else if (rack_probe_rtt_safety_val && TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { /* * Probe RTT safety value triggered! */ rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), __LINE__, RACK_RTTS_SAFETY); rack_exit_probertt(rack, us_cts); } /* Calculate the max we will wait */ endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); if (rack->rc_highly_buffered) endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); /* Calculate the min we must wait */ must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && TSTMP_LT(us_cts, endtime)) { uint32_t calc; /* Do we lower more? 
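 * Each full gp_srtt we have sat in probe-rtt without reaching the
 * target flight drops the applied probertt pacing percentage by
 * another rack_per_of_gp_probertt_reduce below its base value,
 * floored at rack_per_of_gp_lowthresh.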
*/ no_exit: if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) calc = us_cts - rack->r_ctl.rc_time_probertt_entered; else calc = 0; calc /= max(rack->r_ctl.rc_gp_srtt, 1); if (calc) { /* Maybe */ calc *= rack_per_of_gp_probertt_reduce; rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; /* Limit it too */ if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; } /* We must reach target or the time set */ return; } if (rack->r_ctl.rc_time_probertt_starts == 0) { if ((TSTMP_LT(us_cts, must_stay) && rack->rc_highly_buffered) || (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight)) { /* We are not past the must_stay time */ goto no_exit; } rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), __LINE__, RACK_RTTS_REACHTARGET); rack->r_ctl.rc_time_probertt_starts = us_cts; if (rack->r_ctl.rc_time_probertt_starts == 0) rack->r_ctl.rc_time_probertt_starts = 1; /* Restore back to our rate we want to pace at in prtt */ rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; } /* * Setup our end time, some number of gp_srtts plus 200ms. */ no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * (uint64_t)rack_probertt_gpsrtt_cnt_mul); if (rack_probertt_gpsrtt_cnt_div) endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); else endtime = 0; endtime += rack_min_probertt_hold; endtime += rack->r_ctl.rc_time_probertt_starts; if (TSTMP_GEQ(us_cts, endtime)) { /* yes, exit probertt */ rack_exit_probertt(rack, us_cts); } } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { /* Go into probertt, its been too long since we went lower */ rack_enter_probertt(rack, us_cts); } } static void rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, uint32_t rtt, int32_t rtt_diff) { uint64_t cur_bw, up_bnd, low_bnd, subfr; uint32_t losses; if ((rack->rc_gp_dyn_mul == 0) || (rack->use_fixed_rate) || (rack->in_probe_rtt) || (rack->rc_always_pace == 0)) { /* No dynamic GP multipler in play */ return; } losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; cur_bw = rack_get_bw(rack); /* Calculate our up and down range */ up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; up_bnd /= 100; up_bnd += rack->r_ctl.last_gp_comp_bw; subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; subfr /= 100; low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { /* * This is the case where our RTT is above * the max target and we have been configured * to just do timely no bonus up stuff in that case. * * There are two configurations, set to 1, and we * just do timely if we are over our max. If its * set above 1 then we slam the multipliers down * to 100 and then decrement per timely. */ rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, __LINE__, 3); if (rack->r_ctl.rc_no_push_at_mrtt > 1) rack_validate_multipliers_at_or_below_100(rack); rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); } else if ((last_bw_est < low_bnd) && !losses) { /* * We are decreasing this is a bit complicated this * means we are loosing ground. This could be * because another flow entered and we are competing * for b/w with it. 
This will push the RTT up which * makes timely unusable unless we want to get shoved * into a corner and just be backed off (the age * old problem with delay based CC). * * On the other hand if it was a route change we * would like to stay somewhat contained and not * blow out the buffers. */ rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, __LINE__, 3); rack->r_ctl.last_gp_comp_bw = cur_bw; if (rack->rc_gp_bwred == 0) { /* Go into reduction counting */ rack->rc_gp_bwred = 1; rack->rc_gp_timely_dec_cnt = 0; } if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || (timely_says == 0)) { /* * Push another time with a faster pacing * to try to gain back (we include override to * get a full raise factor). */ if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || (timely_says == 0) || (rack_down_raise_thresh == 0)) { /* * Do an override up in b/w if we were * below the threshold or if the threshold * is zero we always do the raise. */ rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); } else { /* Log it stays the same */ rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, __LINE__, 11); } rack->rc_gp_timely_dec_cnt++; /* We are not incrementing really no-count */ rack->rc_gp_incr = 0; rack->rc_gp_timely_inc_cnt = 0; } else { /* * Lets just use the RTT * information and give up * pushing. */ goto use_timely; } } else if ((timely_says != 2) && !losses && (last_bw_est > up_bnd)) { /* * We are increasing b/w lets keep going, updating * our b/w and ignoring any timely input, unless * of course we are at our max raise (if there is one). */ rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, __LINE__, 3); rack->r_ctl.last_gp_comp_bw = cur_bw; if (rack->rc_gp_saw_ss && rack_per_upper_bound_ss && (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { /* * In cases where we can't go higher * we should just use timely. */ goto use_timely; } if (rack->rc_gp_saw_ca && rack_per_upper_bound_ca && (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { /* * In cases where we can't go higher * we should just use timely. */ goto use_timely; } rack->rc_gp_bwred = 0; rack->rc_gp_timely_dec_cnt = 0; /* You get a set number of pushes if timely is trying to reduce */ if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); } else { /* Log it stays the same */ rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, __LINE__, 12); } return; } else { /* * We are staying between the lower and upper range bounds * so use timely to decide. 
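 * (Either last_bw_est landed inside [low_bnd, up_bnd], or losses
 * or a timely override kept us out of the pure b/w-driven branches
 * above; in both cases the RTT-based verdict below decides whether
 * the multipliers move up or down.)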
*/ rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, __LINE__, 3); use_timely: if (timely_says) { rack->rc_gp_incr = 0; rack->rc_gp_timely_inc_cnt = 0; if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && !losses && (last_bw_est < low_bnd)) { /* We are loosing ground */ rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); rack->rc_gp_timely_dec_cnt++; /* We are not incrementing really no-count */ rack->rc_gp_incr = 0; rack->rc_gp_timely_inc_cnt = 0; } else rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); } else { rack->rc_gp_bwred = 0; rack->rc_gp_timely_dec_cnt = 0; rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); } } } static int32_t rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) { int32_t timely_says; uint64_t log_mult, log_rtt_a_diff; log_rtt_a_diff = rtt; log_rtt_a_diff <<= 32; log_rtt_a_diff |= (uint32_t)rtt_diff; if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul)) { /* Reduce the b/w multipler */ timely_says = 2; log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; log_mult <<= 32; log_mult |= prev_rtt; rack_log_timely(rack, timely_says, log_mult, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 4); } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / max(rack_gp_rtt_mindiv , 1)))) { /* Increase the b/w multipler */ log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / max(rack_gp_rtt_mindiv , 1)); log_mult <<= 32; log_mult |= prev_rtt; timely_says = 0; rack_log_timely(rack, timely_says, log_mult , get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 5); } else { /* * Use a gradient to find it the timely gradient * is: * grad = rc_rtt_diff / min_rtt; * * anything below or equal to 0 will be * a increase indication. Anything above * zero is a decrease. Note we take care * of the actual gradient calculation * in the reduction (its not needed for * increase). */ log_mult = prev_rtt; if (rtt_diff <= 0) { /* * Rttdiff is less than zero, increase the * b/w multipler (its 0 or negative) */ timely_says = 0; rack_log_timely(rack, timely_says, log_mult, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); } else { /* Reduce the b/w multipler */ timely_says = 1; rack_log_timely(rack, timely_says, log_mult, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); } } return (timely_says); } static void rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, int line) { uint64_t tim, bytes_ps, ltim, stim, utim; uint32_t segsiz, bytes, reqbytes, us_cts; int32_t gput, new_rtt_diff, timely_says; us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); if (TSTMP_GEQ(us_cts, tp->gput_ts)) tim = us_cts - tp->gput_ts; else tim = 0; if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts)) stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; else stim = 0; /* * Use the larger of the send time or ack time. This prevents us * from being influenced by ack artifacts to come up with too * high of measurement. Note that since we are spanning over many more * bytes in most of our measurements hopefully that is less likely to * occur. 
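 * (utim below is the larger of the two, clamped to at least 1, so
 * the later divisions never see zero; compressed acks can shrink
 * tim, but the send-side stim still bounds the interval.)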
*/ if (tim > stim) utim = max(tim, 1); else utim = max(stim, 1); /* Lets validate utim */ ltim = max(1, (utim/HPTS_USEC_IN_MSEC)); gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); if ((tim == 0) && (stim == 0)) { /* * Invalid measurement time, maybe * all on one ack/one send? */ bytes = 0; bytes_ps = 0; rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 0, 0, 0, 10, __LINE__, NULL); goto skip_measurement; } if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { /* We never made a us_rtt measurement? */ bytes = 0; bytes_ps = 0; rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 0, 0, 0, 10, __LINE__, NULL); goto skip_measurement; } /* * Calculate the maximum possible b/w this connection * could have. We base our calculation on the lowest * rtt we have seen during the measurement and the * largest rwnd the client has given us in that time. This * forms a BDP that is the maximum that we could ever * get to the client. Anything larger is not valid. * * I originally had code here that rejected measurements * where the time was less than 1/2 the latest us_rtt. * But after thinking on that I realized its wrong since * say you had a 150Mbps or even 1Gbps link, and you * were a long way away.. example I am in Europe (100ms rtt) * talking to my 1Gbps link in S.C. Now measuring say 150,000 * bytes my time would be 1.2ms, and yet my rtt would say * the measurement was invalid the time was < 50ms. The * same thing is true for 150Mb (8ms of time). * * A better way I realized is to look at what the maximum * the connection could possibly do. This is gated on * the lowest RTT we have seen and the highest rwnd. * We should in theory never exceed that, if we are * then something on the path is storing up packets * and then feeding them all at once to our endpoint * messing up our measurement. */ rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; if (SEQ_LT(th_ack, tp->gput_seq)) { /* No measurement can be made */ bytes = 0; bytes_ps = 0; rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 0, 0, 0, 10, __LINE__, NULL); goto skip_measurement; } else bytes = (th_ack - tp->gput_seq); bytes_ps = (uint64_t)bytes; /* * Don't measure a b/w for pacing unless we have gotten at least * an initial windows worth of data in this measurement interval. * * Small numbers of bytes get badly influenced by delayed ack and * other artifacts. Note we take the initial window or our * defined minimum GP (defaulting to 10 which hopefully is the * IW). */ if (rack->rc_gp_filled == 0) { /* * The initial estimate is special. We * have blasted out an IW worth of packets * without a real valid ack ts results. We * then setup the app_limited_needs_set flag, * this should get the first ack in (probably 2 * MSS worth) to be recorded as the timestamp. * We thus allow a smaller number of bytes i.e. * IW - 2MSS. */ reqbytes -= (2 * segsiz); /* Also lets fill previous for our first measurement to be neutral */ rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; } if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, rack->r_ctl.rc_app_limited_cnt, 0, 0, 10, __LINE__, NULL); goto skip_measurement; } /* * We now need to calculate the Timely like status so * we can update (possibly) the b/w multipliers. 
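 * rc_rtt_diff is an EWMA of the change in gp_srtt between
 * measurements (1/8 new, 7/8 old), and is left untouched across a
 * probe-rtt where the deliberately lowered pacing rate would skew
 * the gradient negative.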
*/ new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; if (rack->rc_gp_filled == 0) { /* No previous reading */ rack->r_ctl.rc_rtt_diff = new_rtt_diff; } else { if (rack->measure_saw_probe_rtt == 0) { /* * We don't want a probertt to be counted * since it will be negative incorrectly. We * expect to be reducing the RTT when we * pace at a slower rate. */ rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); } } timely_says = rack_make_timely_judgement(rack, rack->r_ctl.rc_gp_srtt, rack->r_ctl.rc_rtt_diff, rack->r_ctl.rc_prev_gp_srtt ); bytes_ps *= HPTS_USEC_IN_SEC; bytes_ps /= utim; if (bytes_ps > rack->r_ctl.last_max_bw) { /* * Something is on path playing * since this b/w is not possible based * on our BDP (highest rwnd and lowest rtt * we saw in the measurement window). * * Another option here would be to * instead skip the measurement. */ rack_log_pacing_delay_calc(rack, bytes, reqbytes, bytes_ps, rack->r_ctl.last_max_bw, 0, 11, __LINE__, NULL); bytes_ps = rack->r_ctl.last_max_bw; } /* We store gp for b/w in bytes per second */ if (rack->rc_gp_filled == 0) { /* Initial measurment */ if (bytes_ps) { rack->r_ctl.gp_bw = bytes_ps; rack->rc_gp_filled = 1; rack->r_ctl.num_avg = 1; rack_set_pace_segments(rack->rc_tp, rack, __LINE__); } else { rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, rack->r_ctl.rc_app_limited_cnt, 0, 0, 10, __LINE__, NULL); } if (rack->rc_inp->inp_in_hpts && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* * Ok we can't trust the pacer in this case * where we transition from un-paced to paced. * Or for that matter when the burst mitigation * was making a wild guess and got it wrong. * Stop the pacer and clear up all the aggregate * delays etc. */ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack->r_ctl.rc_hpts_flags = 0; rack->r_ctl.rc_last_output_to = 0; } } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) { /* Still a small number run an average */ rack->r_ctl.gp_bw += bytes_ps; rack->r_ctl.num_avg++; if (rack->r_ctl.num_avg >= RACK_REQ_AVG) { /* We have collected enought to move forward */ rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg; } } else { /* * We want to take 1/wma of the goodput and add in to 7/8th * of the old value weighted by the srtt. So if your measurement * period is say 2 SRTT's long you would get 1/4 as the * value, if it was like 1/2 SRTT then you would get 1/16th. * * But we must be careful not to take too much i.e. if the * srtt is say 20ms and the measurement is taken over * 400ms our weight would be 400/20 i.e. 20. On the * other hand if we get a measurement over 1ms with a * 10ms rtt we only want to take a much smaller portion. */ uint64_t resid_bw, subpart, addpart, srtt; srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); if (srtt == 0) { /* * Strange why did t_srtt go back to zero? */ if (rack->r_ctl.rc_rack_min_rtt) srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC); else srtt = HPTS_USEC_IN_MSEC; } /* * XXXrrs: Note for reviewers, in playing with * dynamic pacing I discovered this GP calculation * as done originally leads to some undesired results. * Basically you can get longer measurements contributing * too much to the WMA. Thus I changed it if you are doing * dynamic adjustments to only do the aportioned adjustment * if we have a very small (time wise) measurement. Longer * measurements just get there weight (defaulting to 1/8) * add to the WMA. 
We may want to think about changing * this to always do that for both sides i.e. dynamic * and non-dynamic... but considering lots of folks * were playing with this I did not want to change the * calculation per.se. without your thoughts.. Lawerence? * Peter?? */ if (rack->rc_gp_dyn_mul == 0) { subpart = rack->r_ctl.gp_bw * utim; subpart /= (srtt * 8); if (subpart < (rack->r_ctl.gp_bw / 2)) { /* * The b/w update takes no more * away then 1/2 our running total * so factor it in. */ addpart = bytes_ps * utim; addpart /= (srtt * 8); } else { /* * Don't allow a single measurement * to account for more than 1/2 of the * WMA. This could happen on a retransmission * where utim becomes huge compared to * srtt (multiple retransmissions when using * the sending rate which factors in all the * transmissions from the first one). */ subpart = rack->r_ctl.gp_bw / 2; addpart = bytes_ps / 2; } resid_bw = rack->r_ctl.gp_bw - subpart; rack->r_ctl.gp_bw = resid_bw + addpart; } else { if ((utim / srtt) <= 1) { /* * The b/w update was over a small period * of time. The idea here is to prevent a small * measurement time period from counting * too much. So we scale it based on the * time so it attributes less than 1/rack_wma_divisor * of its measurement. */ subpart = rack->r_ctl.gp_bw * utim; subpart /= (srtt * rack_wma_divisor); addpart = bytes_ps * utim; addpart /= (srtt * rack_wma_divisor); } else { /* * The scaled measurement was long * enough so lets just add in the * portion of the measurment i.e. 1/rack_wma_divisor */ subpart = rack->r_ctl.gp_bw / rack_wma_divisor; addpart = bytes_ps / rack_wma_divisor; } if ((rack->measure_saw_probe_rtt == 0) || (bytes_ps > rack->r_ctl.gp_bw)) { /* * For probe-rtt we only add it in * if its larger, all others we just * add in. */ resid_bw = rack->r_ctl.gp_bw - subpart; rack->r_ctl.gp_bw = resid_bw + addpart; } } } /* We do not update any multipliers if we are in or have seen a probe-rtt */ if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) rack_update_multiplier(rack, timely_says, bytes_ps, rack->r_ctl.rc_gp_srtt, rack->r_ctl.rc_rtt_diff); rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, rack_get_bw(rack), 3, line, NULL); /* reset the gp srtt and setup the new prev */ rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; /* Record the lost count for the next measurement */ rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; /* * We restart our diffs based on the gpsrtt in the * measurement window. */ rack->rc_gp_rtt_set = 0; rack->rc_gp_saw_rec = 0; rack->rc_gp_saw_ca = 0; rack->rc_gp_saw_ss = 0; rack->rc_dragged_bottom = 0; skip_measurement: #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* * XXXLAS: This is a temporary hack, and should be * chained off VOI_TCP_GPUT when stats(9) grows an * API to deal with chained VOIs. */ if (tp->t_stats_gput_prev > 0) stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_GPUT_ND, ((gput - tp->t_stats_gput_prev) * 100) / tp->t_stats_gput_prev); #endif tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; /* * Now are we app limited now and there is space from where we * were to where we want to go? * * We don't do the other case i.e. non-applimited here since * the next send will trigger us picking up the missing data. 
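 * (A new measurement is armed here only when at least an initial
 * window's worth of already-sent, app-limited data lies between
 * th_ack and rc_first_appl; anything smaller is left for the next
 * transmit to start normally.)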
*/ if (rack->r_ctl.rc_first_appl && TCPS_HAVEESTABLISHED(tp->t_state) && rack->r_ctl.rc_app_limited_cnt && (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && ((rack->r_ctl.rc_first_appl->r_start - th_ack) > max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { /* * Yep there is enough outstanding to make a measurement here. */ struct rack_sendmap *rsm, fe; tp->t_flags |= TF_GPUTINPROG; rack->r_ctl.rc_gp_lowrtt = 0xffffffff; rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rack->app_limited_needs_set = 0; tp->gput_seq = th_ack; if (rack->in_probe_rtt) rack->measure_saw_probe_rtt = 1; else if ((rack->measure_saw_probe_rtt) && (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) rack->measure_saw_probe_rtt = 0; if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { /* There is a full window to gain info from */ tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); } else { /* We can only measure up to the applimited point */ tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); } /* * Now we need to find the timestamp of the send at tp->gput_seq * for the send based measurement. */ fe.r_start = tp->gput_seq; rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); if (rsm) { /* Ok send-based limit is set */ if (SEQ_LT(rsm->r_start, tp->gput_seq)) { /* * Move back to include the earlier part * so our ack time lines up right (this may * make an overlapping measurement but thats * ok). */ tp->gput_seq = rsm->r_start; } if (rsm->r_flags & RACK_ACKED) tp->gput_ts = rsm->r_ack_arrival; else rack->app_limited_needs_set = 1; rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; } else { /* * If we don't find the rsm due to some * send-limit set the current time, which * basically disables the send-limit. */ rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); } rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, (uint64_t)rsm, tp->gput_ts, rack->r_ctl.rc_app_limited_cnt, 9, __LINE__, NULL); } } /* * CC wrapper hook functions */ static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery) { INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { uint32_t max; max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); if (tp->ccv->bytes_this_ack > max) { tp->ccv->bytes_this_ack = max; } } if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) tp->ccv->flags |= CCF_CWND_LIMITED; else tp->ccv->flags &= ~CCF_CWND_LIMITED; #ifdef STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); #endif if ((tp->t_flags & TF_GPUTINPROG) && rack_enough_for_measurement(tp, rack, th->th_ack)) { /* Measure the Goodput */ rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); #ifdef NETFLIX_PEAKRATE if ((type == CC_ACK) && (tp->t_maxpeakrate)) { /* * We update t_peakrate_thr. This gives us roughly * one update per round trip time. Note * it will only be used if pace_always is off i.e * we don't do this for paced flows. 
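 * (t_peakrate_thr is only consulted further down when
 * rc_always_pace is 0, where it clips snd_cwnd.)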
*/ tcp_update_peakrate_thr(tp); } #endif } if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { tp->t_bytes_acked += tp->ccv->bytes_this_ack; if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; tp->ccv->flags |= CCF_ABC_SENTAWND; } } else { tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); #endif if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; } #ifdef NETFLIX_PEAKRATE /* we enforce max peak rate if it is set and we are not pacing */ if ((rack->rc_always_pace == 0) && tp->t_peakrate_thr && (tp->snd_cwnd > tp->t_peakrate_thr)) { tp->snd_cwnd = tp->t_peakrate_thr; } #endif } static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); /* * If we are doing PRR and have enough * room to send we are pacing and prr * is disabled we will want to see if we * can send data (by setting r_wanted_output to * true). */ if ((rack->r_ctl.rc_prr_sndcnt > 0) || rack->rack_no_prr) rack->r_wanted_output = 1; } static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) { struct tcp_rack *rack; uint32_t orig_cwnd; orig_cwnd = tp->snd_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->rc_not_backing_off == 0) { /* only alert CC if we alerted when we entered */ if (CC_ALGO(tp)->post_recovery != NULL) { tp->ccv->curack = th->th_ack; CC_ALGO(tp)->post_recovery(tp->ccv); } if (tp->snd_cwnd > tp->snd_ssthresh) { /* Drop us down to the ssthresh (1/2 cwnd at loss) */ tp->snd_cwnd = tp->snd_ssthresh; } } if ((rack->rack_no_prr == 0) && (rack->r_ctl.rc_prr_sndcnt > 0)) { /* Suck the next prr cnt back into cwnd */ tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; rack->r_ctl.rc_prr_sndcnt = 0; rack_log_to_prr(rack, 1, 0); } rack_log_to_prr(rack, 14, orig_cwnd); tp->snd_recover = tp->snd_una; EXIT_RECOVERY(tp->t_flags); } static void rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) { struct tcp_rack *rack; INP_WLOCK_ASSERT(tp->t_inpcb); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: tp->t_flags &= ~TF_WASFRECOVERY; tp->t_flags &= ~TF_WASCRECOVERY; if (!IN_FASTRECOVERY(tp->t_flags)) { rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; if (rack->rack_no_prr == 0) { rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); rack_log_to_prr(rack, 2, 0); } rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; tp->snd_recover = tp->snd_max; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_ECN: if (!IN_CONGRECOVERY(tp->t_flags) || /* * Allow ECN reaction on ACK to CWR, if * that data segment was also CE marked. 
*/ SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_CONGRECOVERY(tp->t_flags); KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max + 1; if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; } break; case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); tp->snd_cwnd = ctf_fixed_maxseg(tp); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; case CC_RTO_ERR: KMOD_TCPSTAT_INC(tcps_sndrexmitbad); /* RTO was unnecessary, so reset everything. */ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; if (tp->t_flags & TF_WASFRECOVERY) { ENTER_FASTRECOVERY(tp->t_flags); tp->t_flags &= ~TF_WASFRECOVERY; } if (tp->t_flags & TF_WASCRECOVERY) { ENTER_CONGRECOVERY(tp->t_flags); tp->t_flags &= ~TF_WASCRECOVERY; } tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; break; } /* * If we are below our max rtt, don't * signal the CC control to change things. * instead set it up so that we are in * recovery but not going to back off. */ if (rack->rc_highly_buffered) { /* * Do we use the higher rtt for * our threshold to not backoff (like CDG)? */ uint32_t rtt_mul, rtt_div; if (rack_use_max_for_nobackoff) { rtt_mul = (rack_gp_rtt_maxmul - 1); rtt_div = 1; } else { rtt_mul = rack_gp_rtt_minmul; rtt_div = max(rack_gp_rtt_mindiv , 1); } if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / rtt_div))) { /* below our min threshold */ rack->rc_not_backing_off = 1; ENTER_RECOVERY(rack->rc_tp->t_flags); rack_log_rtt_shrinks(rack, 0, rtt_mul, rtt_div, RACK_RTTS_NOBACKOFF); return; } } rack->rc_not_backing_off = 0; if (CC_ALGO(tp)->cong_signal != NULL) { if (th != NULL) tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } } static inline void rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) { uint32_t i_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef NETFLIX_STATS KMOD_TCPSTAT_INC(tcps_idle_restarts); if (tp->t_state == TCPS_ESTABLISHED) KMOD_TCPSTAT_INC(tcps_idle_estrestarts); #endif if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); if (tp->snd_cwnd == 1) i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ else i_cwnd = rc_init_window(rack); /* * Being idle is no differnt than the initial window. If the cc * clamps it down below the initial window raise it to the initial * window. */ if (tp->snd_cwnd < i_cwnd) { tp->snd_cwnd = i_cwnd; } } /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. * - Delayed acks are enabled or this is a half-synchronized T/TCP * connection. */ #define DELAY_ACK(tp, tlen) \ (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ ((tp->t_flags & TF_DELACK) == 0) && \ (tlen <= tp->t_maxseg) && \ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) static struct rack_sendmap * rack_find_lowest_rsm(struct tcp_rack *rack) { struct rack_sendmap *rsm; /* * Walk the time-order transmitted list looking for an rsm that is * not acked. This will be the one that was sent the longest time * ago that is still outstanding. 
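 * (rc_tmap is kept in transmit order, so the first entry that is
 * not RACK_ACKED is by construction the oldest send still
 * outstanding.)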
*/ TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { if (rsm->r_flags & RACK_ACKED) { continue; } goto finish; } finish: return (rsm); } static struct rack_sendmap * rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *prsm; /* * Walk the sequence order list backward until we hit and arrive at * the highest seq not acked. In theory when this is called it * should be the last segment (which it was not). */ counter_u64_add(rack_find_high, 1); prsm = rsm; RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { continue; } return (prsm); } return (NULL); } static uint32_t rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) { int32_t lro; uint32_t thresh; /* * lro is the flag we use to determine if we have seen reordering. * If it gets set we have seen reordering. The reorder logic either * works in one of two ways: * * If reorder-fade is configured, then we track the last time we saw * re-ordering occur. If we reach the point where enough time as * passed we no longer consider reordering has occuring. * * Or if reorder-face is 0, then once we see reordering we consider * the connection to alway be subject to reordering and just set lro * to 1. * * In the end if lro is non-zero we add the extra time for * reordering in. */ if (srtt == 0) srtt = 1; if (rack->r_ctl.rc_reorder_ts) { if (rack->r_ctl.rc_reorder_fade) { if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { lro = cts - rack->r_ctl.rc_reorder_ts; if (lro == 0) { /* * No time as passed since the last * reorder, mark it as reordering. */ lro = 1; } } else { /* Negative time? */ lro = 0; } if (lro > rack->r_ctl.rc_reorder_fade) { /* Turn off reordering seen too */ rack->r_ctl.rc_reorder_ts = 0; lro = 0; } } else { /* Reodering does not fade */ lro = 1; } } else { lro = 0; } thresh = srtt + rack->r_ctl.rc_pkt_delay; if (lro) { /* It must be set, if not you get 1/4 rtt */ if (rack->r_ctl.rc_reorder_shift) thresh += (srtt >> rack->r_ctl.rc_reorder_shift); else thresh += (srtt >> 2); } else { thresh += 1; } /* We don't let the rack timeout be above a RTO */ if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); } /* And we don't want it above the RTO max either */ if (thresh > rack_rto_max) { thresh = rack_rto_max; } return (thresh); } static uint32_t rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t srtt) { struct rack_sendmap *prsm; uint32_t thresh, len; int segsiz; if (srtt == 0) srtt = 1; if (rack->r_ctl.rc_tlp_threshold) thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); else thresh = (srtt * 2); /* Get the previous sent packet, if any */ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); counter_u64_add(rack_enter_tlp_calc, 1); len = rsm->r_end - rsm->r_start; if (rack->rack_tlp_threshold_use == TLP_USE_ID) { /* Exactly like the ID */ if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. */ counter_u64_add(rack_used_tlpmethod, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { /* 2.1 behavior */ prsm = TAILQ_PREV(rsm, rack_head, r_tnext); if (prsm && (len <= segsiz)) { /* * Two packets outstanding, thresh should be (2*srtt) + * possible inter-packet delay (if any). 
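 * The gap is taken from the last-send timestamps of this rsm and
 * the previous one, and is only added in when this rsm went out at
 * or after the previous one.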
*/ uint32_t inter_gap = 0; int idx, nidx; counter_u64_add(rack_used_tlpmethod, 1); idx = rsm->r_rtr_cnt - 1; nidx = prsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { /* Yes it was sent later (or at the same time) */ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; } thresh += inter_gap; } else if (len <= segsiz) { /* * Possibly compensate for delayed-ack. */ uint32_t alt_thresh; counter_u64_add(rack_used_tlpmethod2, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { /* 2.2 behavior */ if (len <= segsiz) { uint32_t alt_thresh; /* * Compensate for delayed-ack with the d-ack time. */ counter_u64_add(rack_used_tlpmethod, 1); alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; if (alt_thresh > thresh) thresh = alt_thresh; } } /* Not above an RTO */ if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { thresh = TICKS_2_MSEC(tp->t_rxtcur); } /* Not above a RTO max */ if (thresh > rack_rto_max) { thresh = rack_rto_max; } /* Apply user supplied min TLP */ if (thresh < rack_tlp_min) { thresh = rack_tlp_min; } return (thresh); } static uint32_t rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) { /* * We want the rack_rtt which is the * last rtt we measured. However if that * does not exist we fallback to the srtt (which * we probably will never do) and then as a last * resort we use RACK_INITIAL_RTO if no srtt is * yet set. */ if (rack->rc_rack_rtt) return(rack->rc_rack_rtt); else if (tp->t_srtt == 0) return(RACK_INITIAL_RTO); return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); } static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) { /* * Check to see that we don't need to fall into recovery. We will * need to do so if our oldest transmit is past the time we should * have had an ack. */ struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t idx; uint32_t srtt, thresh; rack = (struct tcp_rack *)tp->t_fb_ptr; if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { return (NULL); } rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) return (NULL); if (rsm->r_flags & RACK_ACKED) { rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) return (NULL); } idx = rsm->r_rtr_cnt - 1; srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_rack(rack, srtt, tsused); if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) { return (NULL); } if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { return (NULL); } /* Ok if we reach here we are over-due and this guy can be sent */ if (IN_RECOVERY(tp->t_flags) == 0) { /* * For the one that enters us into recovery record undo * info. */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; } rack_cong_signal(tp, NULL, CC_NDUPACK); return (rsm); } static uint32_t rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) { int32_t t; int32_t tt; uint32_t ret_val; t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], rack_persist_min, rack_persist_max); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; ret_val = (uint32_t)tt; return (ret_val); } static uint32_t rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) { /* * Start the FR timer, we do this based on getting the first one in * the rc_tmap. 
Note that if its NULL we must stop the timer. in all * events we need to stop the running timer (if its running) before * starting the new one. */ uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; uint32_t srtt_cur; int32_t idx; int32_t is_tlp_timer = 0; struct rack_sendmap *rsm; if (rack->t_timers_stopped) { /* All timers have been stopped none are to run */ return (0); } if (rack->rc_in_persist) { /* We can't start any timer in persists */ return (rack_get_persists_timer_val(tp, rack)); } rack->rc_on_min_to = 0; if ((tp->t_state < TCPS_ESTABLISHED) || ((tp->t_flags & TF_SACK_PERMIT) == 0)) goto activate_rxt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if ((rsm == NULL) || sup_rack) { /* Nothing on the send map */ activate_rxt: time_since_sent = 0; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm) { idx = rsm->r_rtr_cnt - 1; if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) tstmp_touse = rsm->r_tim_lastsent[idx]; else tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; if (TSTMP_GT(cts, tstmp_touse)) time_since_sent = cts - tstmp_touse; } if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; to = TICKS_2_MSEC(tp->t_rxtcur); if (to > time_since_sent) to -= time_since_sent; else to = rack->r_ctl.rc_min_to; if (to == 0) to = 1; return (to); } return (0); } if (rsm->r_flags & RACK_ACKED) { rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) { /* No lowest? */ goto activate_rxt; } } if (rack->sack_attack_disable) { /* * We don't want to do * any TLP's if you are an attacker. * Though if you are doing what * is expected you may still have * SACK-PASSED marks. */ goto activate_rxt; } /* Convert from ms to usecs */ if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) == 1) && (rsm->r_flags & RACK_HAS_FIN)) { /* * We don't start a rack timer if all we have is a * FIN outstanding. */ goto activate_rxt; } if ((rack->use_rack_rr == 0) && (IN_RECOVERY(tp->t_flags)) && (rack->rack_no_prr == 0) && (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { /* * We are not cheating, in recovery and * not enough ack's to yet get our next * retransmission out. * * Note that classified attackers do not * get to use the rack-cheat. */ goto activate_tlp; } srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_rack(rack, srtt, cts); idx = rsm->r_rtr_cnt - 1; exp = rsm->r_tim_lastsent[idx] + thresh; if (SEQ_GEQ(exp, cts)) { to = exp - cts; if (to < rack->r_ctl.rc_min_to) { to = rack->r_ctl.rc_min_to; if (rack->r_rr_config == 3) rack->rc_on_min_to = 1; } } else { to = rack->r_ctl.rc_min_to; if (rack->r_rr_config == 3) rack->rc_on_min_to = 1; } } else { /* Ok we need to do a TLP not RACK */ activate_tlp: if ((rack->rc_tlp_in_progress != 0) && (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { /* * The previous send was a TLP and we have sent * N TLP's without sending new data. */ goto activate_rxt; } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with. 
*/ goto activate_rxt; } if (rsm->r_flags & RACK_HAS_FIN) { /* If its a FIN we dont do TLP */ rsm = NULL; goto activate_rxt; } idx = rsm->r_rtr_cnt - 1; time_since_sent = 0; if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) tstmp_touse = rsm->r_tim_lastsent[idx]; else tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; if (TSTMP_GT(cts, tstmp_touse)) time_since_sent = cts - tstmp_touse; is_tlp_timer = 1; if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); srtt = TICKS_2_MSEC(srtt_cur); } else srtt = RACK_INITIAL_RTO; /* * If the SRTT is not keeping up and the * rack RTT has spiked we want to use * the last RTT not the smoothed one. */ if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); if (thresh > time_since_sent) to = thresh - time_since_sent; else { to = rack->r_ctl.rc_min_to; rack_log_alt_to_to_cancel(rack, thresh, /* flex1 */ time_since_sent, /* flex2 */ tstmp_touse, /* flex3 */ rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ rsm->r_tim_lastsent[idx], srtt, idx, 99); } if (to > TCPTV_REXMTMAX) { /* * If the TLP time works out to larger than the max * RTO lets not do TLP.. just RTO. */ goto activate_rxt; } } if (is_tlp_timer == 0) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; } else { rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; } if (to == 0) to = 1; return (to); } static void rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_in_persist == 0) { if (tp->t_flags & TF_GPUTINPROG) { /* * Stop the goodput now, the calling of the * measurement function clears the flag. */ rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); } #ifdef NETFLIX_SHARED_CWND if (rack->r_ctl.rc_scw) { tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); rack->rack_scwnd_is_idle = 1; } #endif rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; rack_timer_cancel(tp, rack, cts, __LINE__); tp->t_rxtshift = 0; rack->rc_in_persist = 1; } } static void rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_inp->inp_in_hpts) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); rack->r_ctl.rc_hpts_flags = 0; } #ifdef NETFLIX_SHARED_CWND if (rack->r_ctl.rc_scw) { tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); rack->rack_scwnd_is_idle = 0; } #endif if (rack->rc_gp_dyn_mul && (rack->use_fixed_rate == 0) && (rack->rc_always_pace)) { /* * Do we count this as if a probe-rtt just * finished? */ uint32_t time_idle, idle_min; time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; idle_min = rack_min_probertt_hold; if (rack_probertt_gpsrtt_cnt_div) { uint64_t extra; extra = (uint64_t)rack->r_ctl.rc_gp_srtt * (uint64_t)rack_probertt_gpsrtt_cnt_mul; extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; idle_min += (uint32_t)extra; } if (time_idle >= idle_min) { /* Yes, we count it as a probe-rtt. 
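 * An idle stretch at least as long as a probe-rtt would have taken
 * is credited as one: the probe-rtt clocks are reset to now, or if
 * we were actually in probe-rtt we simply exit it.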
*/ uint32_t us_cts; us_cts = tcp_get_usecs(NULL); if (rack->in_probe_rtt == 0) { rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; } else { rack_exit_probertt(rack, us_cts); } } } rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; tp->t_rxtshift = 0; rack->r_ctl.rc_agg_delayed = 0; rack->r_early = 0; rack->r_late = 0; rack->r_ctl.rc_agg_early = 0; } static void rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, struct hpts_diag *diag, struct timeval *tv) { if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = diag->p_nxt_slot; log.u_bbr.flex2 = diag->p_cur_slot; log.u_bbr.flex3 = diag->slot_req; log.u_bbr.flex4 = diag->inp_hptsslot; log.u_bbr.flex5 = diag->slot_remaining; log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; /* Hijack other fields as needed */ log.u_bbr.epoch = diag->have_slept; log.u_bbr.lt_epoch = diag->yet_to_sleep; log.u_bbr.pkts_out = diag->co_ret; log.u_bbr.applimited = diag->hpts_sleep_time; log.u_bbr.delivered = diag->p_prev_slot; log.u_bbr.inflight = diag->p_runningtick; log.u_bbr.bw_inuse = diag->wheel_tick; log.u_bbr.rttProp = diag->wheel_cts; log.u_bbr.timeStamp = cts; log.u_bbr.delRate = diag->maxticks; log.u_bbr.cur_del_rate = diag->p_curtick; log.u_bbr.cur_del_rate <<= 32; log.u_bbr.cur_del_rate |= diag->p_lasttick; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_HPTSDIAG, 0, 0, &log, false, tv); } } static void rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t slot, uint32_t tot_len_this_send, int sup_rack) { struct hpts_diag diag; struct inpcb *inp; struct timeval tv; uint32_t delayed_ack = 0; uint32_t hpts_timeout; uint8_t stopped; uint32_t left = 0; uint32_t us_cts; inp = tp->t_inpcb; if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { return; } if (inp->inp_in_hpts) { /* Already on the pacer */ return; } stopped = rack->rc_tmr_stopped; if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { left = rack->r_ctl.rc_timer_exp - cts; } rack->r_ctl.rc_timer_exp = 0; rack->r_ctl.rc_hpts_flags = 0; us_cts = tcp_get_usecs(&tv); /* Now early/late accounting */ if (rack->r_early) { /* * We have a early carry over set, * we can always add more time so we * can always make this compensation. */ slot += rack->r_ctl.rc_agg_early; rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; } if (rack->r_late) { /* * This is harder, we can * compensate some but it * really depends on what * the current pacing time is. */ if (rack->r_ctl.rc_agg_delayed >= slot) { /* * We can't compensate for it all. * And we have to have some time * on the clock. We always have a min * 10 slots (10 x 10 i.e. 100 usecs). 
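 * (When the accumulated lateness is at least the requested slot we
 * still pace out at the HPTS_TICKS_PER_USEC floor and carry the
 * unpaid delay forward in rc_agg_delayed.)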
*/ if (slot <= HPTS_TICKS_PER_USEC) { /* We gain delay */ rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); slot = HPTS_TICKS_PER_USEC; } else { /* We take off some */ rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); slot = HPTS_TICKS_PER_USEC; } } else { slot -= rack->r_ctl.rc_agg_delayed; rack->r_ctl.rc_agg_delayed = 0; /* Make sure we have 100 useconds at minimum */ if (slot < HPTS_TICKS_PER_USEC) { rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; slot = HPTS_TICKS_PER_USEC; } if (rack->r_ctl.rc_agg_delayed == 0) rack->r_late = 0; } } if (slot) { /* We are pacing too */ rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; } hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); #ifdef NETFLIX_EXP_DETECTION if (rack->sack_attack_disable && (slot < tcp_sad_pacing_interval)) { /* * We have a potential attacker on * the line. We have possibly some * (or now) pacing time set. We want to * slow down the processing of sacks by some * amount (if it is an attacker). Set the default * slot for attackers in place (unless the orginal * interval is longer). Its stored in * micro-seconds, so lets convert to msecs. */ slot = tcp_sad_pacing_interval; } #endif if (tp->t_flags & TF_DELACK) { delayed_ack = TICKS_2_MSEC(tcp_delacktime); rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; } if (delayed_ack && ((hpts_timeout == 0) || (delayed_ack < hpts_timeout))) hpts_timeout = delayed_ack; else rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; /* * If no timers are going to run and we will fall off the hptsi * wheel, we resort to a keep-alive timer if its configured. */ if ((hpts_timeout == 0) && (slot == 0)) { if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) { /* * Ok we have no timer (persists, rack, tlp, rxt or * del-ack), we don't have segments being paced. So * all that is left is the keepalive timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { /* Get the established keep-alive time */ hpts_timeout = TP_KEEPIDLE(tp); } else { /* Get the initial setup keep-alive time */ hpts_timeout = TP_KEEPINIT(tp); } rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; if (rack->in_probe_rtt) { /* * We want to instead not wake up a long time from * now but to wake up about the time we would * exit probe-rtt and initiate a keep-alive ack. * This will get us out of probe-rtt and update * our min-rtt. */ hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); } } } if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { /* * RACK, TLP, persists and RXT timers all are restartable * based on actions input .. i.e we received a packet (ack * or sack) and that changes things (rw, or snd_una etc). * Thus we can restart them with a new value. For * keep-alive, delayed_ack we keep track of what was left * and restart the timer with a smaller value. */ if (left < hpts_timeout) hpts_timeout = left; } if (hpts_timeout) { /* * Hack alert for now we can't time-out over 2,147,483 * seconds (a bit more than 596 hours), which is probably ok * :). */ if (hpts_timeout > 0x7ffffffe) hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } if ((rack->rc_gp_filled == 0) && (hpts_timeout < slot) && (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* * We have no good estimate yet for the * old clunky burst mitigation or the * real pacing. And the tlp or rxt is smaller * than the pacing calculation. Lets not * pace that long since we know the calculation * so far is not accurate. 
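 * (Before gp_filled the slot comes from the old burst-mitigation
 * guess, so capping it at the TLP/RXT timeout keeps a bad guess
 * from stalling the connection longer than a retransmit would.)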
*/ slot = hpts_timeout; } rack->r_ctl.last_pacing_time = slot; if (slot) { rack->r_ctl.rc_last_output_to = us_cts + slot; if (rack->rc_always_pace || rack->r_mbuf_queue) { if ((rack->rc_gp_filled == 0) || rack->pacing_longer_than_rtt) { inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); } else { inp->inp_flags2 |= INP_MBUF_QUEUE_READY; if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && (rack->r_rr_config != 3)) inp->inp_flags2 |= INP_DONT_SACK_QUEUE; else inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; } } if ((rack->use_rack_rr) && (rack->r_rr_config < 2) && ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. */ (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } else { (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 1); } } else if (hpts_timeout) { if (rack->rc_always_pace || rack->r_mbuf_queue) { if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { /* For a rack timer, don't wake us */ inp->inp_flags2 |= INP_MBUF_QUEUE_READY; if (rack->r_rr_config != 3) inp->inp_flags2 |= INP_DONT_SACK_QUEUE; else inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; } else { /* All other timers wake us up */ inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; } } (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } else { /* No timer starting */ #ifdef INVARIANTS if (SEQ_GT(tp->snd_max, tp->snd_una)) { panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", tp, rack, tot_len_this_send, cts, slot, hpts_timeout); } #endif } rack->rc_tmr_stopped = 0; if (slot) rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); } /* * RACK Timer, here we simply do logging and house keeping. * the normal rack_output() function will call the * appropriate thing to check if we need to do a RACK retransmit. * We return 1, saying don't proceed with rack_output only * when all timers have been stopped (destroyed PCB?). */ static int rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * This timer simply provides an internal trigger to send out data. * The check_recovery_mode call will see if there are needed * retransmissions, if so we will enter fast-recovery. The output * call may or may not do the same thing depending on sysctl * settings. */ struct rack_sendmap *rsm; int32_t recovery; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } recovery = IN_RECOVERY(tp->t_flags); counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); rack->rc_on_min_to = 0; rsm = rack_check_recovery_mode(tp, cts); rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); if (rsm) { uint32_t rtt; rack->r_ctl.rc_resend = rsm; if (rack->use_rack_rr) { /* * Don't accumulate extra pacing delay * we are allowing the rack timer to * over-ride pacing i.e. rrr takes precedence * if the pacing interval is longer than the rrr * time (in other words we get the min pacing * time versus rrr pacing time). 
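 *
 * That is, the effective delay before the retransmission is
 * min(pacing interval, rrr interval); a minimal sketch (names are
 * stand-ins):
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stdint.h>

static uint32_t
next_send_delay(uint32_t pacing_usecs, uint32_t rrr_usecs)
{
	/* The RACK timer overrides the pacer, so the sooner one wins. */
	return (rrr_usecs < pacing_usecs ? rrr_usecs : pacing_usecs);
}
#endif
/*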
*/ rack->r_timer_override = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; } rtt = rack->rc_rack_rtt; if (rtt == 0) rtt = 1; if (rack->rack_no_prr == 0) { if ((recovery == 0) && (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { /* * The rack-timeout that enter's us into recovery * will force out one MSS and set us up so that we * can do one more send in 2*rtt (transitioning the * rack timeout into a rack-tlp). */ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); rack->r_timer_override = 1; rack_log_to_prr(rack, 3, 0); } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && rack->use_rack_rr) { /* * When a rack timer goes, if the rack rr is * on, arrange it so we can send a full segment * overriding prr (though we pay a price for this * for future new sends). */ rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); rack_log_to_prr(rack, 4, 0); } } } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; if (rsm == NULL) { /* restart a timer and return 1 */ rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); return (1); } return (0); } static __inline void rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, struct rack_sendmap *rsm, uint32_t start) { int idx; nrsm->r_start = start; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_dupack = rsm->r_dupack; nrsm->usec_orig_send = rsm->usec_orig_send; nrsm->r_rtr_bytes = 0; rsm->r_end = nrsm->r_start; nrsm->r_just_ret = rsm->r_just_ret; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } } static struct rack_sendmap * rack_merge_rsm(struct tcp_rack *rack, struct rack_sendmap *l_rsm, struct rack_sendmap *r_rsm) { /* * We are merging two ack'd RSM's, * the l_rsm is on the left (lower seq * values) and the r_rsm is on the right * (higher seq value). The simplest way * to merge these is to move the right * one into the left. I don't think there * is any reason we need to try to find * the oldest (or last oldest retransmitted). */ struct rack_sendmap *rm; l_rsm->r_end = r_rsm->r_end; if (l_rsm->r_dupack < r_rsm->r_dupack) l_rsm->r_dupack = r_rsm->r_dupack; if (r_rsm->r_rtr_bytes) l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; if (r_rsm->r_in_tmap) { /* This really should not happen */ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); r_rsm->r_in_tmap = 0; } /* Now the flags */ if (r_rsm->r_flags & RACK_HAS_FIN) l_rsm->r_flags |= RACK_HAS_FIN; if (r_rsm->r_flags & RACK_TLP) l_rsm->r_flags |= RACK_TLP; if (r_rsm->r_flags & RACK_RWND_COLLAPSED) l_rsm->r_flags |= RACK_RWND_COLLAPSED; if ((r_rsm->r_flags & RACK_APP_LIMITED) && ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { /* * If both are app-limited then let the * free lower the count. If right is app * limited and left is not, transfer. */ l_rsm->r_flags |= RACK_APP_LIMITED; r_rsm->r_flags &= ~RACK_APP_LIMITED; if (r_rsm == rack->r_ctl.rc_first_appl) rack->r_ctl.rc_first_appl = l_rsm; } rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); #ifdef INVARIANTS if (rm != r_rsm) { panic("removing head in rack:%p rsm:%p rm:%p", rack, r_rsm, rm); } #endif if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { /* Transfer the split limit to the map we free */ r_rsm->r_limit_type = l_rsm->r_limit_type; l_rsm->r_limit_type = 0; } rack_free(rack, r_rsm); return(l_rsm); } /* * TLP Timer, here we simply setup what segment we want to * have the TLP expire on, the normal rack_output() will then * send it out. 
* * We return 1, saying don't proceed with rack_output only * when all timers have been stopped (destroyed PCB?). */ static int rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { /* * Tail Loss Probe. */ struct rack_sendmap *rsm = NULL; struct rack_sendmap *insret; struct socket *so; uint32_t amm, old_prr_snd = 0; uint32_t out, avail; int collapsed_win = 0; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { /* Its not time yet */ return (0); } if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); return (1); } /* * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. */ rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); so = tp->t_inpcb->inp_socket; avail = sbavail(&so->so_snd); out = tp->snd_max - tp->snd_una; if (out > tp->snd_wnd) { /* special case, we need a retransmission */ collapsed_win = 1; goto need_retran; } /* * Check our send oldest always settings, and if * there is an oldest to send jump to the need_retran. */ if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) goto need_retran; if (avail > out) { /* New data is available */ amm = avail - out; if (amm > ctf_fixed_maxseg(tp)) { amm = ctf_fixed_maxseg(tp); if ((amm + out) > tp->snd_wnd) { /* We are rwnd limited */ goto need_retran; } } else if (amm < ctf_fixed_maxseg(tp)) { /* not enough to fill a MTU */ goto need_retran; } if (IN_RECOVERY(tp->t_flags)) { /* Unlikely */ if (rack->rack_no_prr == 0) { old_prr_snd = rack->r_ctl.rc_prr_sndcnt; if (out + amm <= tp->snd_wnd) { rack->r_ctl.rc_prr_sndcnt = amm; rack_log_to_prr(rack, 4, 0); } } else goto need_retran; } else { /* Set the send-new override */ if (out + amm <= tp->snd_wnd) rack->r_ctl.rc_tlp_new_data = amm; else goto need_retran; } rack->r_ctl.rc_tlpsend = NULL; counter_u64_add(rack_tlp_newdata, 1); goto send; } need_retran: /* * Ok we need to arrange the last un-acked segment to be re-sent, or * optionally the first un-acked segment. */ if (collapsed_win == 0) { if (rack_always_send_oldest) rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); else { rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { rsm = rack_find_high_nonack(rack, rsm); } } if (rsm == NULL) { counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif goto out; } } else { /* * We must find the last segment * that was acceptable by the client. */ RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { /* Found one */ break; } } if (rsm == NULL) { /* None? if so send the first */ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm == NULL) { counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif goto out; } } } if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { /* * We need to split this the last segment in two. */ struct rack_sendmap *nrsm; nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* * No memory to split, we will just exit and punt * off to the RXT timer. 
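 *
 * Stepping back, the new-data-versus-retransmit choice made earlier in
 * this function reduces to the sketch below; it is simplified and
 * ignores the PRR and rack_always_send_oldest handling (names are
 * stand-ins):
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stdint.h>

enum tlp_probe {
	TLP_SEND_NEW,		/* send up to one MSS of new data */
	TLP_RETRANSMIT_LAST	/* re-send the last unacked segment */
};

static enum tlp_probe
tlp_choose_probe(uint32_t sb_avail, uint32_t outstanding, uint32_t snd_wnd,
    uint32_t maxseg)
{
	uint32_t new_bytes;

	if (outstanding > snd_wnd)		/* collapsed window */
		return (TLP_RETRANSMIT_LAST);
	new_bytes = (sb_avail > outstanding) ? sb_avail - outstanding : 0;
	/* Prefer new data only if a full MSS is available and fits. */
	if (new_bytes >= maxseg && outstanding + maxseg <= snd_wnd)
		return (TLP_SEND_NEW);
	return (TLP_RETRANSMIT_LAST);
}
#endif
/*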
*/ counter_u64_add(rack_tlp_does_nada, 1); goto out; } rack_clone_rsm(rack, nrsm, rsm, (rsm->r_end - ctf_fixed_maxseg(tp))); insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rsm = nrsm; } rack->r_ctl.rc_tlpsend = rsm; send: rack->r_timer_override = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); out: rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } /* * Delayed ack Timer, here we simply need to setup the * ACK_NOW flag and remove the DELACK flag. From there * the output routine will send the ack out. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; KMOD_TCPSTAT_INC(tcps_delack); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; return (0); } /* * Persists timer, here we simply send the * same thing as a keepalive will. * the one byte send. * * We only return 1, saying don't proceed, if all timers * are stopped (destroyed PCB?). */ static int rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { struct tcptemp *t_template; struct inpcb *inp; int32_t retval = 1; inp = tp->t_inpcb; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (rack->rc_in_persist == 0) return (0); if (ctf_progress_timeout_check(tp, false)) { tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); /* * Persistence timer into zero window. Force a byte to be output, if * possible. */ KMOD_TCPSTAT_INC(tcps_persisttimeo); /* * Hack: if the peer is dead/unreachable, we do not time out if the * window is closed. After a full backoff, drop the connection if * the idle time (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { KMOD_TCPSTAT_INC(tcps_persistdrop); retval = 1; tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && tp->snd_una == tp->snd_max) rack_exit_persist(tp, rack, cts); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; /* * If the user has closed the socket then drop a persisting * connection after a much reduced timeout. 
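 *
 * The two drop conditions in this function reduce to the sketch below;
 * the parameter names stand in for the tcp_maxpersistidle / backed-off
 * RTO check above and the TCPTV_PERSMAX check below:
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stdbool.h>
#include <stdint.h>

static bool
persist_should_drop(uint32_t idle_ticks, uint32_t max_backoff_ticks,
    uint32_t closed_cap_ticks, bool probes_exhausted, bool user_closed)
{
	/* Peer unresponsive for as long as a fully backed-off RTO. */
	if (probes_exhausted && idle_ticks >= max_backoff_ticks)
		return (true);
	/* Locally closed sockets get a much shorter cap. */
	if (user_closed && idle_ticks >= closed_cap_ticks)
		return (true);
	return (false);
}
#endif
/*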
*/ if (tp->t_state > TCPS_CLOSE_WAIT && (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { retval = 1; KMOD_TCPSTAT_INC(tcps_persistdrop); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } t_template = tcpip_maketemplate(rack->rc_inp); if (t_template) { /* only set it if we were answered */ if (rack->forced_ack == 0) { rack->forced_ack = 1; rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); } tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); /* This sends an ack */ if (tp->t_flags & TF_DELACK) tp->t_flags &= ~TF_DELACK; free(t_template, M_TEMP); } if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; out: rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); return (retval); } /* * If a keepalive goes off, we had no other timers * happening. We always return 1 here since this * routine either drops the connection or sends * out a segment with respond. */ static int rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { struct tcptemp *t_template; struct inpcb *inp; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. */ KMOD_TCPSTAT_INC(tcps_keeptimeo); if (tp->t_state < TCPS_ESTABLISHED) goto dropit; if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response if the peer is * up and reachable: either an ACK if the connection is * still alive, or an RST if the peer has closed the * connection due to timeout or reboot. Using sequence * number tp->snd_una-1 causes the transmitted zero-length * segment to lie outside the receive window; by the * protocol spec, this requires the correspondent TCP to * respond. */ KMOD_TCPSTAT_INC(tcps_keepprobe); t_template = tcpip_maketemplate(inp); if (t_template) { if (rack->forced_ack == 0) { rack->forced_ack = 1; rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); } tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); free(t_template, M_TEMP); } } rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); return (1); dropit: KMOD_TCPSTAT_INC(tcps_keepdrops); tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); return (1); } /* * Retransmit helper function, clear up all the ack * flags and take care of important book keeping. */ static void rack_remxt_tmr(struct tcpcb *tp) { /* * The retransmit timer went off, all sack'd blocks must be * un-acked. */ struct rack_sendmap *rsm, *trsm = NULL; struct tcp_rack *rack; int32_t cnt = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); /* * Ideally we would like to be able to * mark SACK-PASS on anything not acked here. * However, if we do that we would burst out * all that data 1ms apart. This would be unwise, * so for now we will just let the normal rxt timer * and tlp timer take care of it. 
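 *
 * Conceptually the loop below just invalidates all SACK state so that
 * everything counts as outstanding again; a minimal sketch over a flat
 * array (the real code re-threads the RB tree and the tmap in place):
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stddef.h>
#include <stdint.h>

struct seq_block {
	uint32_t start;		/* first sequence covered */
	uint32_t end;		/* one past the last covered sequence */
	int	 sacked;	/* peer reported this range in a SACK */
};

static void
unsack_all_on_rto(struct seq_block *map, size_t n)
{
	size_t i;

	/* After an RTO every SACKed range is treated as unacked again. */
	for (i = 0; i < n; i++)
		map[i].sacked = 0;
}
#endif
/*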
*/ RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { if (rsm->r_flags & RACK_ACKED) { cnt++; rsm->r_dupack = 0; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); if (rsm->r_in_tmap == 0) { /* We must re-add it back to the tlist */ if (trsm == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); } else { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); } rsm->r_in_tmap = 1; } } trsm = rsm; if (rsm->r_flags & RACK_ACKED) rsm->r_flags |= RACK_WAS_ACKED; rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); } /* Clear the count (we just un-acked them) */ rack->r_ctl.rc_sacked = 0; rack->r_ctl.rc_agg_delayed = 0; rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; rack->r_late = 0; /* Clear the tlp rtx mark */ rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); rack->r_ctl.rc_prr_sndcnt = 0; rack_log_to_prr(rack, 6, 0); rack->r_timer_override = 1; } static void rack_cc_conn_init(struct tcpcb *tp) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; cc_conn_init(tp); /* * We want a chance to stay in slowstart as * we create a connection. TCP spec says that * initially ssthresh is infinite. For our * purposes that is the snd_wnd. */ if (tp->snd_ssthresh < tp->snd_wnd) { tp->snd_ssthresh = tp->snd_wnd; } /* * We also want to assure a IW worth of * data can get inflight. */ if (rc_init_window(rack) < tp->snd_cwnd) tp->snd_cwnd = rc_init_window(rack); } /* * Re-transmit timeout! If we drop the PCB we will return 1, otherwise * we will setup to retransmit the lowest seq number outstanding. */ static int rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { int32_t rexmt; struct inpcb *inp; int32_t retval = 0; bool isipv6; inp = tp->t_inpcb; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } if (ctf_progress_timeout_check(tp, false)) { tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(inp, ETIMEDOUT); return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_una == tp->snd_max)) { /* Nothing outstanding .. nothing to do */ return (0); } /* * Retransmission timer went off. Message has not been acked within * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ rack_remxt_tmr(tp); if ((rack->r_ctl.rc_resend == NULL) || ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { /* * If the rwnd collapsed on * the one we are retransmitting * it does not count against the * rxt count. */ tp->t_rxtshift++; } if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; KMOD_TCPSTAT_INC(tcps_timeoutdrop); retval = 1; tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); tcp_set_inp_to_drop(rack->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; } if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be limited * to 1 segment in cc_conn_init(). */ tp->snd_cwnd = 1; } else if (tp->t_rxtshift == 1) { /* * first retransmit; record ssthresh and cwnd so they can be * recovered if this turns out to be a "bad" retransmit. A * retransmit is considered "bad" if an ACK for this segment * is received within RTT/2 interval; the assumption here is * that the ACK was already in flight. See "On Estimating * End-to-End Network Path Properties" by Allman and Paxson * for more details. 
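 *
 * The deadline recorded just below is now + srtt/2, with srtt kept in
 * scaled fixed point; a minimal sketch of that computation (the shift
 * constant stands in for TCP_RTT_SHIFT):
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stdint.h>

#define SRTT_SHIFT	5	/* stand-in: srtt stored scaled by 2^5 */

static uint32_t
bad_rxt_deadline(uint32_t now_ticks, uint32_t scaled_srtt)
{
	/*
	 * scaled_srtt >> SRTT_SHIFT is srtt in ticks; one extra shift
	 * halves it, giving the RTT/2 window in which an arriving ACK
	 * marks the retransmit as spurious and allows the saved
	 * cwnd/ssthresh to be restored.
	 */
	return (now_ticks + (scaled_srtt >> (SRTT_SHIFT + 1)));
}
#endif
/*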
*/ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else tp->t_flags &= ~TF_WASFRECOVERY; if (IN_CONGRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; KMOD_TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, max(MSEC_2_TICKS(rack_rto_min), rexmt), MSEC_2_TICKS(rack_rto_max)); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if * amount of data we send is very small, we could send it in couple * of packets and process straight to FIN. In that case we won't * catch ESTABLISHED state. */ #ifdef INET6 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; #else isipv6 = false; #endif if (((V_tcp_pmtud_blackhole_detect == 1) || (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && ((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1))) { /* * Idea here is that at each stage of mtu probe (usually, * 1448 -> 1188 -> 524) should be given 2 chances to recover * before further clamping down. 'tp->t_rxtshift % 2 == 0' * should take care of that. */ if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && tp->t_rxtshift % 2 == 0)) { /* * Enter Path MTU Black-hole Detection mechanism: - * Disable Path MTU Discovery (IP "DF" bit). - * Reduce MTU to lower value than what we negotiated * with peer. */ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { /* Record that we may have found a black hole. */ tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; /* Keep track of previous MSS. */ tp->t_pmtud_saved_maxseg = tp->t_maxseg; } /* * Reduce the MSS to blackhole value or to the * default in an attempt to retransmit. */ #ifdef INET6 if (isipv6 && tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else if (isipv6) { /* Use the default MSS. */ tp->t_maxseg = V_tcp_v6mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { /* Use the sysctl tuneable blackhole MSS. */ tp->t_maxseg = V_tcp_pmtud_blackhole_mss; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); } else { /* Use the default MSS. */ tp->t_maxseg = V_tcp_mssdflt; /* * Disable Path MTU Discovery when we switch * to minmss. */ tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); } #endif } else { /* * If further retransmissions are still unsuccessful * with a lowered MTU, maybe this isn't a blackhole * and we restore the previous MSS and blackhole * detection flags. The limit '6' is determined by * giving each probe stage (1448, 1188, 524) 2 * chances to recover. 
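 *
 * That staging reduces to the sketch below; it ignores the
 * TF2_PLPMTU_* capability flags that also gate the real decision
 * (names are stand-ins):
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
enum blackhole_action {
	BH_NONE,	/* keep the current MSS and keep retransmitting */
	BH_CLAMP,	/* move to the next smaller MSS stage */
	BH_RESTORE	/* give up: restore the saved MSS, re-enable PMTUD */
};

static enum blackhole_action
blackhole_step(int rxtshift, int clamped)
{
	/* Stages are tried at shifts 2 and 4, two attempts each. */
	if (rxtshift >= 2 && rxtshift < 6 && (rxtshift % 2) == 0)
		return (BH_CLAMP);
	/* Six failed attempts: the clamp did not help, undo it. */
	if (clamped && rxtshift >= 6)
		return (BH_RESTORE);
	return (BH_NONE);
}
#endif
/*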
*/ if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && (tp->t_rxtshift >= 6)) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; tp->t_maxseg = tp->t_pmtud_saved_maxseg; KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); } } } /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current retransmit * times until then. */ if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { #ifdef INET6 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) in6_losing(tp->t_inpcb); else #endif in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; rack_cong_signal(tp, NULL, CC_RTO); out: return (retval); } static int rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) { int32_t ret = 0; int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); if (timers == 0) { return (0); } if (tp->t_state == TCPS_LISTEN) { /* no timers on listen sockets */ if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) return (0); return (1); } if ((timers & PACE_TMR_RACK) && rack->rc_on_min_to) { /* * For the rack timer when we * are on a min-timeout (which means rrr_conf = 3) * we don't want to check the timer. It may * be going off for a pace and thats ok we * want to send the retransmit (if its ready). * * If its on a normal rack timer (non-min) then * we will check if its expired. */ goto skip_time_check; } if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { uint32_t left; if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { ret = -1; rack_log_to_processing(rack, cts, ret, 0); return (0); } if (hpts_calling == 0) { /* * A user send or queued mbuf (sack) has called us? We * return 0 and let the pacing guards * deal with it if they should or * should not cause a send. */ ret = -2; rack_log_to_processing(rack, cts, ret, 0); return (0); } /* * Ok our timer went off early and we are not paced false * alarm, go back to sleep. */ ret = -3; left = rack->r_ctl.rc_timer_exp - cts; tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); rack_log_to_processing(rack, cts, ret, left); return (1); } skip_time_check: rack->rc_tmr_stopped = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; if (timers & PACE_TMR_DELACK) { ret = rack_timeout_delack(tp, rack, cts); } else if (timers & PACE_TMR_RACK) { rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_tlp(tp, rack, cts); } else if (timers & PACE_TMR_RXT) { rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_rxt(tp, rack, cts); } else if (timers & PACE_TMR_PERSIT) { ret = rack_timeout_persist(tp, rack, cts); } else if (timers & PACE_TMR_KEEP) { ret = rack_timeout_keepalive(tp, rack, cts); } rack_log_to_processing(rack, cts, ret, timers); return (ret); } static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) { struct timeval tv; uint32_t us_cts, flags_on_entry; uint8_t hpts_removed = 0; flags_on_entry = rack->r_ctl.rc_hpts_flags; us_cts = tcp_get_usecs(&tv); if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || ((tp->snd_max - tp->snd_una) == 0))) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; /* If we were not delayed cancel out the flag. 
*/ if ((tp->snd_max - tp->snd_una) == 0) rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); } if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (rack->rc_inp->inp_in_hpts && ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { /* * Canceling timer's when we have no output being * paced. We also must remove ourselves from the * hpts. */ tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); hpts_removed = 1; } rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); } if (hpts_removed == 0) rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); } static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) { return; } static int rack_stopall(struct tcpcb *tp) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; rack->t_timers_stopped = 1; return (0); } static void rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) { return; } static int rack_timer_active(struct tcpcb *tp, uint32_t timer_type) { return (0); } static void rack_stop_all_timers(struct tcpcb *tp) { struct tcp_rack *rack; /* * Assure no timers are running. */ if (tcp_timer_active(tp, TT_PERSIST)) { /* We enter in persists, set the flag appropriately */ rack = (struct tcp_rack *)tp->t_fb_ptr; rack->rc_in_persist = 1; } tcp_timer_suspend(tp, TT_PERSIST); tcp_timer_suspend(tp, TT_REXMT); tcp_timer_suspend(tp, TT_KEEP); tcp_timer_suspend(tp, TT_DELACK); } static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts) { int32_t idx; rsm->r_rtr_cnt++; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); rsm->r_dupack = 0; if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; rsm->r_flags |= RACK_OVERMAX; } if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = ts; if (rsm->r_flags & RACK_ACKED) { /* Problably MTU discovery messing with us */ rsm->r_flags &= ~RACK_ACKED; rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); } if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; if (rsm->r_flags & RACK_SACK_PASSED) { /* We have retransmitted due to the SACK pass */ rsm->r_flags &= ~RACK_SACK_PASSED; rsm->r_flags |= RACK_WAS_SACKPASS; } } static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) { /* * We (re-)transmitted starting at rsm->r_start for some length * (possibly less than r_end. */ struct rack_sendmap *nrsm, *insret; uint32_t c_end; int32_t len; len = *lenp; c_end = rsm->r_start + len; if (SEQ_GEQ(c_end, rsm->r_end)) { /* * We retransmitted the whole piece or more than the whole * slopping into the next rsm. */ rack_update_rsm(tp, rack, rsm, ts); if (c_end == rsm->r_end) { *lenp = 0; return (0); } else { int32_t act_len; /* Hangs over the end return whats left */ act_len = rsm->r_end - rsm->r_start; *lenp = (len - act_len); return (rsm->r_end); } /* We don't get out of this block. */ } /* * Here we retransmitted less than the whole thing which means we * have to split this into what was transmitted and what was not. */ nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* * We can't get memory, so lets not proceed. 
*/ *lenp = 0; return (0); } /* * So here we are going to take the original rsm and make it what we * retransmitted. nrsm will be the tail portion we did not * retransmit. For example say the chunk was 1, 11 (10 bytes). And * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to * 1, 6 and the new piece will be 6, 11. */ rack_clone_rsm(rack, nrsm, rsm, c_end); nrsm->r_dupack = 0; rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); rack_update_rsm(tp, rack, rsm, ts); *lenp = 0; return (0); } static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm, *insret, fe; register uint32_t snd_max, snd_una; /* * Add to the RACK log of packets in flight or retransmitted. If * there is a TS option we will use the TS echoed, if not we will * grab a TS. * * Retransmissions will increment the count and move the ts to its * proper place. Note that if options do not include TS's then we * won't be able to effectively use the ACK for an RTT on a retran. * * Notes about r_start and r_end. Lets consider a send starting at * sequence 1 for 10 bytes. In such an example the r_start would be * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. * This means that r_end is actually the first sequence for the next * slot (11). * */ /* * If err is set what do we do XXXrrs? should we not add the thing? * -- i.e. return if err != 0 or should we pretend we sent it? -- * i.e. proceed with add ** do this for now. */ INP_WLOCK_ASSERT(tp->t_inpcb); if (err) /* * We don't log errors -- we could but snd_max does not * advance in this case either. */ return; if (th_flags & TH_RST) { /* * We don't log resets and we return immediately from * sending */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; snd_una = tp->snd_una; if (SEQ_LEQ((seq_out + len), snd_una)) { /* Are sending an old segment to induce an ack (keep-alive)? */ return; } if (SEQ_LT(seq_out, snd_una)) { /* huh? should we panic? */ uint32_t end; end = seq_out + len; seq_out = snd_una; if (SEQ_GEQ(end, seq_out)) len = end - seq_out; else len = 0; } snd_max = tp->snd_max; if (th_flags & (TH_SYN | TH_FIN)) { /* * The call to rack_log_output is made before bumping * snd_max. This means we can record one extra byte on a SYN * or FIN if seq_out is adding more on and a FIN is present * (and we are not resending). */ if ((th_flags & TH_SYN) && (seq_out == tp->iss)) len++; if (th_flags & TH_FIN) len++; if (SEQ_LT(snd_max, tp->snd_nxt)) { /* * The add/update as not been done for the FIN/SYN * yet. */ snd_max = tp->snd_nxt; } } if (len == 0) { /* We don't log zero window probes */ return; } rack->r_ctl.rc_time_last_sent = ts; if (IN_RECOVERY(tp->t_flags)) { rack->r_ctl.rc_prr_out += len; } /* First question is it a retransmission or new? */ if (seq_out == snd_max) { /* Its new */ again: rsm = rack_alloc(rack); if (rsm == NULL) { /* * Hmm out of memory and the tcb got destroyed while * we tried to wait. 
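 *
 * Stepping back, both the split earlier and the r_start/r_end notes
 * above rely on half-open [start, end) ranges; a minimal sketch of the
 * split (cutting the example [1, 11) at 6 yields [1, 6) and [6, 11)):
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stdint.h>

struct seq_range {
	uint32_t start;		/* first sequence in the block */
	uint32_t end;		/* start + len: first sequence of the next block */
};

static struct seq_range
split_range(struct seq_range *orig, uint32_t cut)
{
	struct seq_range tail;

	/* The original keeps [start, cut); the returned tail is [cut, end). */
	tail.start = cut;
	tail.end = orig->end;
	orig->end = cut;
	return (tail);
}
#endif
/*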
*/ return; } if (th_flags & TH_FIN) { rsm->r_flags = RACK_HAS_FIN; } else { rsm->r_flags = 0; } rsm->r_tim_lastsent[0] = ts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->usec_orig_send = us_cts; if (th_flags & TH_SYN) { /* The data space is one beyond snd_una */ rsm->r_flags |= RACK_HAS_SIN; rsm->r_start = seq_out + 1; rsm->r_end = rsm->r_start + (len - 1); } else { /* Normal case */ rsm->r_start = seq_out; rsm->r_end = rsm->r_start + len; } rsm->r_dupack = 0; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; /* * Special case detection, is there just a single * packet outstanding when we are not in recovery? * * If this is true mark it so. */ if ((IN_RECOVERY(tp->t_flags) == 0) && (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { struct rack_sendmap *prsm; prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); if (prsm) prsm->r_one_out_nr = 1; } return; } /* * If we reach here its a retransmission and we need to find it. */ memset(&fe, 0, sizeof(fe)); more: if (hintrsm && (hintrsm->r_start == seq_out)) { rsm = hintrsm; hintrsm = NULL; } else { /* No hints sorry */ rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { seq_out = rack_update_entry(tp, rack, rsm, ts, &len); if (len == 0) { return; } else { goto more; } } /* Ok it was not the last pointer go through it the hard way. */ refind: fe.r_start = seq_out; rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); if (rsm) { if (rsm->r_start == seq_out) { seq_out = rack_update_entry(tp, rack, rsm, ts, &len); if (len == 0) { return; } else { goto refind; } } if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { /* Transmitted within this piece */ /* * Ok we must split off the front and then let the * update do the rest */ nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { rack_update_rsm(tp, rack, rsm, ts); return; } /* * copy rsm to nrsm and then trim the front of rsm * to not include this part. */ rack_clone_rsm(rack, nrsm, rsm, seq_out); insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); if (len == 0) { return; } else if (len > 0) goto refind; } } /* * Hmm not found in map did they retransmit both old and on into the * new? */ if (seq_out == tp->snd_max) { goto again; } else if (SEQ_LT(seq_out, tp->snd_max)) { #ifdef INVARIANTS printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { printf("rsm:%p start:%u end:%u\n", rsm, rsm->r_start, rsm->r_end); } printf("Dump complete\n"); panic("seq_out not found rack:%p tp:%p", rack, tp); #endif } else { #ifdef INVARIANTS /* * Hmm beyond sndmax? 
(only if we are using the new rtt-pack * flag) */ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", seq_out, len, tp->snd_max, tp); #endif } } /* * Record one of the RTT updates from an ack into * our sample structure. */ static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) { if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; } if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { rack->r_ctl.rack_rs.rs_rtt_highest = rtt; } if (rack->rc_tp->t_flags & TF_GPUTINPROG) { if (us_rtt < rack->r_ctl.rc_gp_lowrtt) rack->r_ctl.rc_gp_lowrtt = us_rtt; if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; } if ((confidence == 1) && ((rsm == NULL) || (rsm->r_just_ret) || (rsm->r_one_out_nr && len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { /* * If the rsm had a just return * hit it then we can't trust the * rtt measurement for buffer deterimination * Note that a confidence of 2, indicates * SACK'd which overrides the r_just_ret or * the r_one_out_nr. If it was a CUM-ACK and * we had only two outstanding, but get an * ack for only 1. Then that also lowers our * confidence. */ confidence = 0; } if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { if (rack->r_ctl.rack_rs.confidence == 0) { /* * We take anything with no current confidence * saved. */ rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; rack->r_ctl.rack_rs.confidence = confidence; rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; } else if (confidence || rack->r_ctl.rack_rs.confidence) { /* * Once we have a confident number, * we can update it with a smaller * value since this confident number * may include the DSACK time until * the next segment (the second one) arrived. */ rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; rack->r_ctl.rack_rs.confidence = confidence; rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; } } rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; rack->r_ctl.rack_rs.rs_rtt_tot += rtt; rack->r_ctl.rack_rs.rs_rtt_cnt++; } /* * Collect new round-trip time estimate * and update averages and current timeout. */ static void tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) { int32_t delta; uint32_t o_srtt, o_var; int32_t hrtt_up = 0; int32_t rtt; if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) /* No valid sample */ return; if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { /* We are to use the lowest RTT seen in a single ack */ rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { /* We are to use the highest RTT seen in a single ack */ rtt = rack->r_ctl.rack_rs.rs_rtt_highest; } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { /* We are to use the average RTT seen in a single ack */ rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); } else { #ifdef INVARIANTS panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); #endif return; } if (rtt == 0) rtt = 1; if (rack->rc_gp_rtt_set == 0) { /* * With no RTT we have to accept * even one we are not confident of. 
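 *
 * The running gp srtt below is an EWMA with gain 1/8
 * (new = 7/8 * old + 1/8 * sample); tp->t_srtt later in this function
 * applies the same idea in scaled fixed point.  A minimal sketch,
 * assuming plain (unscaled) microsecond values:
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stdint.h>

static uint32_t
ewma_eighth(uint32_t srtt, uint32_t sample)
{
	if (srtt == 0)
		return (sample);	/* first sample seeds the average */
	srtt -= srtt / 8;
	srtt += sample / 8;
	return (srtt);
}
#endif
/*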
*/ rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; rack->rc_gp_rtt_set = 1; } else if (rack->r_ctl.rack_rs.confidence) { /* update the running gp srtt */ rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; } if (rack->r_ctl.rack_rs.confidence) { /* * record the low and high for highly buffered path computation, * we only do this if we are confident (not a retransmission). */ if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; hrtt_up = 1; } if (rack->rc_highly_buffered == 0) { /* * Currently once we declare a path has * highly buffered there is no going * back, which may be a problem... */ if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, rack->r_ctl.rc_highest_us_rtt, rack->r_ctl.rc_lowest_us_rtt, RACK_RTTS_SEEHBP); rack->rc_highly_buffered = 1; } } } if ((rack->r_ctl.rack_rs.confidence) || (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { /* * If we are highly confident of it it was * never retransmitted we accept it as the last us_rtt. */ rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; /* The lowest rtt can be set if its was not retransmited */ if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; if (rack->r_ctl.rc_lowest_us_rtt == 0) rack->r_ctl.rc_lowest_us_rtt = 1; } } rack_log_rtt_sample(rack, rtt); o_srtt = tp->t_srtt; o_var = tp->t_rttvar; rack = (struct tcp_rack *)tp->t_fb_ptr; if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the * binary point (i.e., scaled by 8). The following magic is * equivalent to the smoothing algorithm in rfc793 with an * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). * Adjust rtt to origin 0. */ delta = ((rtt - 1) << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); tp->t_srtt += delta; if (tp->t_srtt <= 0) tp->t_srtt = 1; /* * We accumulate a smoothed rtt variance (actually, a * smoothed mean difference), then set the retransmit timer * to smoothed rtt + 4 times the smoothed variance. rttvar * is stored as fixed point with 4 bits after the binary * point (scaled by 16). The following is equivalent to * rfc793 smoothing with an alpha of .75 (rttvar = * rttvar*3/4 + |delta| / 4). This replaces rfc793's * wired-in beta. */ if (delta < 0) delta = -delta; delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); tp->t_rttvar += delta; if (tp->t_rttvar <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { /* * No rtt measurement yet - use the unsmoothed rtt. Set the * variance to half the rtt (so our first retransmit happens * at 3*rtt). */ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } KMOD_TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); #endif tp->t_rxtshift = 0; /* * the retransmit should happen at rtt + 4 * rttvar. Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 * tick of bias. When we compute the retransmit timer, we want 1/2 * tick of rounding and 1 extra tick because of +-1/2 tick * uncertainty in the firing of the timer. The bias will give us * exactly the 1.5 tick we need. 
But, because the bias is * statistical, we have to test that we don't drop below the minimum * feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); tp->t_softerror = 0; } static void rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, uint32_t t, uint32_t cts) { /* * For this RSM, we acknowledged the data from a previous * transmission, not the last one we made. This means we did a false * retransmit. */ struct tcp_rack *rack; if (rsm->r_flags & RACK_HAS_FIN) { /* * The sending of the FIN often is multiple sent when we * have everything outstanding ack'd. We ignore this case * since its over now. */ return; } if (rsm->r_flags & RACK_TLP) { /* * We expect TLP's to have this occur. */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; /* should we undo cc changes and exit recovery? */ if (IN_RECOVERY(tp->t_flags)) { if (rack->r_ctl.rc_rsm_start == rsm->r_start) { /* * Undo what we ratched down and exit recovery if * possible */ EXIT_RECOVERY(tp->t_flags); tp->snd_recover = tp->snd_una; if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; } } if (rsm->r_flags & RACK_WAS_SACKPASS) { /* * We retransmitted based on a sack and the earlier * retransmission ack'd it - re-ordering is occuring. */ counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } counter_u64_add(rack_badfr, 1); counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); } static void rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) { /* * Apply to filter the inbound us-rtt at us_cts. */ uint32_t old_rtt; old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, us_rtt, us_cts); if (rack->r_ctl.last_pacing_time && rack->rc_gp_dyn_mul && (rack->r_ctl.last_pacing_time > us_rtt)) rack->pacing_longer_than_rtt = 1; else rack->pacing_longer_than_rtt = 0; if (old_rtt > us_rtt) { /* We just hit a new lower rtt time */ rack_log_rtt_shrinks(rack, us_cts, old_rtt, __LINE__, RACK_RTTS_NEWRTT); /* * Only count it if its lower than what we saw within our * calculated range. */ if ((old_rtt - us_rtt) > rack_min_rtt_movement) { if (rack_probertt_lower_within && rack->rc_gp_dyn_mul && (rack->use_fixed_rate == 0) && (rack->rc_always_pace)) { /* * We are seeing a new lower rtt very close * to the time that we would have entered probe-rtt. * This is probably due to the fact that a peer flow * has entered probe-rtt. Lets go in now too. 
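 *
 * "Very close" is defined as being within the last
 * rack_probertt_lower_within percent of the interval between probe-rtt
 * rounds; a minimal sketch of that test (names are stand-ins):
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stdbool.h>
#include <stdint.h>

static bool
enter_probertt_early(uint32_t since_last_usecs, uint32_t interval_usecs,
    uint32_t pct_within)
{
	uint32_t slack = (pct_within * interval_usecs) / 100;

	/* Only jump in early when the scheduled entry is nearly due. */
	return (since_last_usecs >= interval_usecs - slack);
}
#endif
/*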
*/ uint32_t val; val = rack_probertt_lower_within * rack_time_between_probertt; val /= 100; if ((rack->in_probe_rtt == 0) && ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { rack_enter_probertt(rack, us_cts); } } rack->r_ctl.rc_lower_rtt_us_cts = us_cts; } } } static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) { int32_t i; uint32_t t, len_acked; if ((rsm->r_flags & RACK_ACKED) || (rsm->r_flags & RACK_WAS_ACKED)) /* Already done */ return (0); if (ack_type == CUM_ACKED) { if (SEQ_GT(th_ack, rsm->r_end)) len_acked = rsm->r_end - rsm->r_start; else len_acked = th_ack - rsm->r_start; } else len_acked = rsm->r_end - rsm->r_start; if (rsm->r_rtr_cnt == 1) { uint32_t us_rtt; t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; if ((int)t <= 0) t = 1; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; if (us_rtt == 0) us_rtt = 1; rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); if (ack_type == SACKED) tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); else { /* * For cum-ack we are only confident if what * is being acked is included in a measurement. * Otherwise it could be an idle period that * includes Delayed-ack time. */ tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); } if ((rsm->r_flags & RACK_TLP) && (!IN_RECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ if (rack->r_ctl.rc_tlp_cwnd_reduce) { rack->r_ctl.rc_rsm_start = tp->snd_max; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure * we send one packet. */ if (rack->rack_no_prr == 0) { rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); rack_log_to_prr(rack, 7, 0); } } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } return (1); } /* * We clear the soft/rxtshift since we got an ack. * There is no assurance we will call the commit() function * so we need to clear these to avoid incorrect handling. */ tp->t_rxtshift = 0; tp->t_softerror = 0; if ((to->to_flags & TOF_TS) && (ack_type == CUM_ACKED) && (to->to_tsecr) && ((rsm->r_flags & RACK_OVERMAX) == 0)) { /* * Now which timestamp does it match? In this block the ACK * must be coming from a previous transmission. 
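 *
 * A minimal sketch of the matching done below: find the transmission
 * whose send time equals the echoed timestamp and take the RTT from
 * it; a match on anything but the newest transmission means the later
 * retransmission(s) were unnecessary (names are stand-ins):
 */
#if 0	/* illustrative sketch; simplified stand-ins, not built with this file */
#include <stdint.h>

static uint32_t
rtt_from_ts_echo(uint32_t now, const uint32_t *sent_ts, int n,
    uint32_t ts_echo, int *match_idx)
{
	uint32_t t;
	int i;

	for (i = 0; i < n; i++) {
		if (sent_ts[i] != ts_echo)
			continue;
		t = now - sent_ts[i];
		if ((int32_t)t <= 0)	/* same-tick send; clamp as above */
			t = 1;
		*match_idx = i;
		return (t);
	}
	*match_idx = -1;		/* no transmission matches the echo */
	return (0);
}
#endif
/*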
*/ for (i = 0; i < rsm->r_rtr_cnt; i++) { if (rsm->r_tim_lastsent[i] == to->to_tsecr) { t = cts - rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if ((i + 1) < rsm->r_rtr_cnt) { /* Likely */ rack_earlier_retran(tp, rsm, t, cts); } if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, rsm->r_rtr_cnt); return (1); } } goto ts_not_found; } else { /* * Ok its a SACK block that we retransmitted. or a windows * machine without timestamps. We can tell nothing from the * time-stamp since its not there or the time the peer last * recieved a segment that moved forward its cum-ack point. */ ts_not_found: i = rsm->r_rtr_cnt - 1; t = cts - rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { /* * We retransmitted and the ack came back in less * than the smallest rtt we have observed. We most * likey did an improper retransmit as outlined in * 4.2 Step 3 point 2 in the rack-draft. */ i = rsm->r_rtr_cnt - 2; t = cts - rsm->r_tim_lastsent[i]; rack_earlier_retran(tp, rsm, t, cts); } else if (rack->r_ctl.rc_rack_min_rtt) { /* * We retransmitted it and the retransmit did the * job. */ if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { rack->r_ctl.rc_rack_min_rtt = t; if (rack->r_ctl.rc_rack_min_rtt == 0) { rack->r_ctl.rc_rack_min_rtt = 1; } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { /* New more recent rack_tmit_time */ rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; rack->rc_rack_rtt = t; } return (1); } } return (0); } /* * Mark the SACK_PASSED flag on all entries prior to rsm send wise. */ static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm) { struct rack_sendmap *nrsm; nrsm = rsm; TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, rack_head, r_tnext) { if (nrsm == rsm) { /* Skip orginal segment he is acked */ continue; } if (nrsm->r_flags & RACK_ACKED) { /* * Skip ack'd segments, though we * should not see these, since tmap * should not have ack'd segments. */ continue; } if (nrsm->r_flags & RACK_SACK_PASSED) { /* * We found one that is already marked * passed, we have been here before and * so all others below this are marked. */ break; } nrsm->r_flags |= RACK_SACK_PASSED; nrsm->r_flags &= ~RACK_WAS_SACKPASS; } } static void rack_need_set_test(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack, int line, int use_which) { if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GEQ(rsm->r_end, tp->gput_seq)) { /* * We were app limited, and this ack * butts up or goes beyond the point where we want * to start our next measurement. We need * to record the new gput_ts as here and * possibly update the start sequence. */ uint32_t seq, ts; if (rsm->r_rtr_cnt > 1) { /* * This is a retransmit, can we * really make any assessment at this * point? We are not really sure of * the timestamp, is it this or the * previous transmission? * * Lets wait for something better that * is not retransmitted. 
*/ return; } seq = tp->gput_seq; ts = tp->gput_ts; rack->app_limited_needs_set = 0; tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); /* Do we start at a new end? */ if ((use_which == RACK_USE_BEG) && SEQ_GEQ(rsm->r_start, tp->gput_seq)) { /* * When we get an ACK that just eats * up some of the rsm, we set RACK_USE_BEG * since whats at r_start (i.e. th_ack) * is left unacked and thats where the * measurement not starts. */ tp->gput_seq = rsm->r_start; rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; } if ((use_which == RACK_USE_END) && SEQ_GEQ(rsm->r_end, tp->gput_seq)) { /* * We use the end when the cumack * is moving forward and completely * deleting the rsm passed so basically * r_end holds th_ack. * * For SACK's we also want to use the end * since this piece just got sacked and * we want to target anything after that * in our measurement. */ tp->gput_seq = rsm->r_end; rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; } if (use_which == RACK_USE_END_OR_THACK) { /* * special case for ack moving forward, * not a sack, we need to move all the * way up to where this ack cum-ack moves * to. */ if (SEQ_GT(th_ack, rsm->r_end)) tp->gput_seq = th_ack; else tp->gput_seq = rsm->r_end; rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; } if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { /* * We moved beyond this guy's range, re-calculate * the new end point. */ if (rack->rc_gp_filled == 0) { tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); } else { tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); } } /* * We are moving the goal post, we may be able to clear the * measure_saw_probe_rtt flag. */ if ((rack->in_probe_rtt == 0) && (rack->measure_saw_probe_rtt) && (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) rack->measure_saw_probe_rtt = 0; rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, seq, tp->gput_seq, 0, 5, line, NULL); if (rack->rc_gp_filled && ((tp->gput_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))))) { /* * There is no sense of continuing this measurement * because its too small to gain us anything we * trust. Skip it and that way we can start a new * measurement quicker. */ rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 0, 0, 0, 6, __LINE__, NULL); tp->t_flags &= ~TF_GPUTINPROG; } } } static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) { uint32_t start, end, changed = 0; struct rack_sendmap stack_map; struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; int32_t used_ref = 1; int moved = 0; start = sack->start; end = sack->end; rsm = *prsm; memset(&fe, 0, sizeof(fe)); do_rest_ofb: if ((rsm == NULL) || (SEQ_LT(end, rsm->r_start)) || (SEQ_GEQ(start, rsm->r_end)) || (SEQ_LT(start, rsm->r_start))) { /* * We are not in the right spot, * find the correct spot in the tree. */ used_ref = 0; fe.r_start = start; rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); moved++; } if (rsm == NULL) { /* TSNH */ goto out; } /* Ok we have an ACK for some piece of this rsm */ if (rsm->r_start != start) { if ((rsm->r_flags & RACK_ACKED) == 0) { /** * Need to split this in two pieces the before and after, * the before remains in the map, the after must be * added. 
In other words we have: * rsm |--------------| * sackblk |-------> * rsm will become * rsm |---| * and nrsm will be the sacked piece * nrsm |----------| * * But before we start down that path lets * see if the sack spans over on top of * the next guy and it is already sacked. */ next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); if (next && (next->r_flags & RACK_ACKED) && SEQ_GEQ(end, next->r_start)) { /** * So the next one is already acked, and * we can thus by hookery use our stack_map * to reflect the piece being sacked and * then adjust the two tree entries moving * the start and ends around. So we start like: * rsm |------------| (not-acked) * next |-----------| (acked) * sackblk |--------> * We want to end like so: * rsm |------| (not-acked) * next |-----------------| (acked) * nrsm |-----| * Where nrsm is a temporary stack piece we * use to update all the gizmos. */ /* Copy up our fudge block */ nrsm = &stack_map; memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); /* Now adjust our tree blocks */ rsm->r_end = start; next->r_start = start; /* Clear out the dup ack count of the remainder */ rsm->r_dupack = 0; rsm->r_just_ret = 0; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); /* Now lets make sure our fudge block is right */ nrsm->r_start = start; /* Now lets update all the stats and such */ rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); changed += (nrsm->r_end - nrsm->r_start); rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); if (nrsm->r_flags & RACK_SACK_PASSED) { counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } /* * Now we want to go up from rsm (the * one left un-acked) to the next one * in the tmap. We do this so when * we walk backwards we include marking * sack-passed on rsm (The one passed in * is skipped since it is generally called * on something sacked before removing it * from the tmap). */ if (rsm->r_in_tmap) { nrsm = TAILQ_NEXT(rsm, r_tnext); /* * Now that we have the next * one walk backwards from there. */ if (nrsm && nrsm->r_in_tmap) rack_log_sack_passed(tp, rack, nrsm); } /* Now are we done? */ if (SEQ_LT(end, next->r_end) || (end == next->r_end)) { /* Done with block */ goto out; } counter_u64_add(rack_sack_used_next_merge, 1); /* Postion for the next block */ start = next->r_end; rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); if (rsm == NULL) goto out; } else { /** * We can't use any hookery here, so we * need to split the map. We enter like * so: * rsm |--------| * sackblk |-----> * We will add the new block nrsm and * that will be the new portion, and then * fall through after reseting rsm. So we * split and look like this: * rsm |----| * sackblk |-----> * nrsm |---| * We then fall through reseting * rsm to nrsm, so the next block * picks it up. */ nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* * failed XXXrrs what can we do but loose the sack * info? 
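 * Bailing out here only drops this one scoreboard update; the
 * un-split rsm stays in the tree un-acked, so a later SACK block
 * or the advancing cum-ack will account for these bytes again.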
*/ goto out; } counter_u64_add(rack_sack_splits, 1); rack_clone_rsm(rack, nrsm, rsm, start); rsm->r_just_ret = 0; insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); /* Position us to point to the new nrsm that starts the sack blk */ rsm = nrsm; } } else { /* Already sacked this piece */ counter_u64_add(rack_sack_skipped_acked, 1); moved++; if (end == rsm->r_end) { /* Done with block */ rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); goto out; } else if (SEQ_LT(end, rsm->r_end)) { /* A partial sack to a already sacked block */ moved++; rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); goto out; } else { /* * The end goes beyond this guy * repostion the start to the * next block. */ start = rsm->r_end; rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); if (rsm == NULL) goto out; } } } if (SEQ_GEQ(end, rsm->r_end)) { /** * The end of this block is either beyond this guy or right * at this guy. I.e.: * rsm --- |-----| * end |-----| * * end |---------| */ if ((rsm->r_flags & RACK_ACKED) == 0) { rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); if (rsm->r_in_tmap) /* should be true */ rack_log_sack_passed(tp, rack, rsm); /* Is Reordering occuring? */ if (rsm->r_flags & RACK_SACK_PASSED) { rsm->r_flags &= ~RACK_SACK_PASSED; counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } } else { counter_u64_add(rack_sack_skipped_acked, 1); moved++; } if (end == rsm->r_end) { /* This block only - done, setup for next */ goto out; } /* * There is more not coverend by this rsm move on * to the next block in the RB tree. */ nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); start = rsm->r_end; rsm = nrsm; if (rsm == NULL) goto out; goto do_rest_ofb; } /** * The end of this sack block is smaller than * our rsm i.e.: * rsm --- |-----| * end |--| */ if ((rsm->r_flags & RACK_ACKED) == 0) { prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); if (prev && (prev->r_flags & RACK_ACKED)) { /** * Goal, we want the right remainder of rsm to shrink * in place and span from (rsm->r_start = end) to rsm->r_end. * We want to expand prev to go all the way * to prev->r_end <- end. * so in the tree we have before: * prev |--------| (acked) * rsm |-------| (non-acked) * sackblk |-| * We churn it so we end up with * prev |----------| (acked) * rsm |-----| (non-acked) * nrsm |-| (temporary) */ nrsm = &stack_map; memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); prev->r_end = end; rsm->r_start = end; /* Now adjust nrsm (stack copy) to be * the one that is the small * piece that was "sacked". */ nrsm->r_end = end; rsm->r_dupack = 0; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); /* * Now nrsm is our new little piece * that is acked (which was merged * to prev). Update the rtt and changed * based on that. Also check for reordering. 
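 *
 * A worked example with made-up sequence numbers: if prev covered
 * [1000,2000), rsm covered [2000,3000) and the sack block ends at
 * end = 2500, then after the adjustments above
 *
 *     prev: [1000,2500)  acked, grew to the right
 *     rsm:  [2500,3000)  still un-acked, shrank
 *     nrsm: [2000,2500)  stack copy, the newly sacked bytes
 *
 * and only nrsm feeds the rtt/reorder accounting just below.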
*/ rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); changed += (nrsm->r_end - nrsm->r_start); rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); if (nrsm->r_flags & RACK_SACK_PASSED) { counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } rsm = prev; counter_u64_add(rack_sack_used_prev_merge, 1); } else { /** * This is the case where our previous * block is not acked either, so we must * split the block in two. */ nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* failed rrs what can we do but loose the sack info? */ goto out; } /** * In this case nrsm becomes * nrsm->r_start = end; * nrsm->r_end = rsm->r_end; * which is un-acked. * * rsm->r_end = nrsm->r_start; * i.e. the remaining un-acked * piece is left on the left * hand side. * * So we start like this * rsm |----------| (not acked) * sackblk |---| * build it so we have * rsm |---| (acked) * nrsm |------| (not acked) */ counter_u64_add(rack_sack_splits, 1); rack_clone_rsm(rack, nrsm, rsm, end); rsm->r_flags &= (~RACK_HAS_FIN); rsm->r_just_ret = 0; insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } nrsm->r_dupack = 0; rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); if (rsm->r_in_tmap) /* should be true */ rack_log_sack_passed(tp, rack, rsm); /* Is Reordering occuring? */ if (rsm->r_flags & RACK_SACK_PASSED) { rsm->r_flags &= ~RACK_SACK_PASSED; counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } } } else if (start != end){ /* * The block was already acked. */ counter_u64_add(rack_sack_skipped_acked, 1); moved++; } out: if (rsm && (rsm->r_flags & RACK_ACKED)) { /* * Now can we merge where we worked * with either the previous or * next block? */ next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); while (next) { if (next->r_flags & RACK_ACKED) { /* yep this and next can be merged */ rsm = rack_merge_rsm(rack, rsm, next); next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); } else break; } /* Now what about the previous? */ prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); while (prev) { if (prev->r_flags & RACK_ACKED) { /* yep the previous and this can be merged */ rsm = rack_merge_rsm(rack, prev, rsm); prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); } else break; } } if (used_ref == 0) { counter_u64_add(rack_sack_proc_all, 1); } else { counter_u64_add(rack_sack_proc_short, 1); } /* Save off the next one for quick reference. */ if (rsm) nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); else nrsm = NULL; *prsm = rack->r_ctl.rc_sacklast = nrsm; /* Pass back the moved. 
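 *
 * ("moved" was bumped each time this block forced a fresh RB_FIND
 * or made us skip an already-sacked piece; the caller folds it
 * into the sack_moved_extra / sack_noextra_move bookkeeping that
 * the SACK-attack heuristics consume.)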
*/ *moved_two = moved; return (changed); } static void inline rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) { struct rack_sendmap *tmap; tmap = NULL; while (rsm && (rsm->r_flags & RACK_ACKED)) { /* Its no longer sacked, mark it so */ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); #ifdef INVARIANTS if (rsm->r_in_tmap) { panic("rack:%p rsm:%p flags:0x%x in tmap?", rack, rsm, rsm->r_flags); } #endif rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); /* Rebuild it into our tmap */ if (tmap == NULL) { TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); tmap = rsm; } else { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); tmap = rsm; } tmap->r_in_tmap = 1; rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); } /* * Now lets possibly clear the sack filter so we start * recognizing sacks that cover this area. */ sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); } static void rack_do_decay(struct tcp_rack *rack) { struct timeval res; #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); #undef timersub rack->r_ctl.input_pkt++; if ((rack->rc_in_persist) || (res.tv_sec >= 1) || (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { /* * Check for decay of non-SAD, * we want all SAD detection metrics to * decay 1/4 per second (or more) passed. */ uint32_t pkt_delta; pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; /* Update our saved tracking values */ rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; /* Now do we escape without decay? */ #ifdef NETFLIX_EXP_DETECTION if (rack->rc_in_persist || (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || (pkt_delta < tcp_sad_low_pps)){ /* * We don't decay idle connections * or ones that have a low input pps. */ return; } /* Decay the counters */ rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, tcp_sad_decay_val); rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, tcp_sad_decay_val); rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, tcp_sad_decay_val); rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, tcp_sad_decay_val); #endif } } static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) { uint32_t changed, entered_recovery = 0; struct tcp_rack *rack; struct rack_sendmap *rsm, *rm; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; register uint32_t th_ack; int32_t i, j, k, num_sack_blks = 0; uint32_t cts, acked, ack_point, sack_changed = 0; int loop_start = 0, moved_two = 0; uint32_t tsused; INP_WLOCK_ASSERT(tp->t_inpcb); if (th->th_flags & TH_RST) { /* We don't log resets */ return; } rack = (struct tcp_rack *)tp->t_fb_ptr; cts = tcp_ts_getticks(); rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); changed = 0; th_ack = th->th_ack; if (rack->sack_attack_disable == 0) rack_do_decay(rack); if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { /* * You only get credit for * MSS and greater (and you get extra * credit for larger cum-ack moves). 
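 *
 * With hypothetical numbers and a 1460 byte MSS: a cum-ack that
 * moves 4380 bytes yields ac = 4380 / 1460 = 3 credits, while an
 * ack of only 800 bytes never reaches this block and adds nothing.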
*/ int ac; ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); rack->r_ctl.ack_count += ac; counter_u64_add(rack_ack_total, ac); } if (rack->r_ctl.ack_count > 0xfff00000) { /* * reduce the number to keep us under * a uint32_t. */ rack->r_ctl.ack_count /= 2; rack->r_ctl.sack_count /= 2; } if (SEQ_GT(th_ack, tp->snd_una)) { rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); tp->t_acktime = ticks; } if (rsm && SEQ_GT(th_ack, rsm->r_start)) changed = th_ack - rsm->r_start; if (changed) { /* * The ACK point is advancing to th_ack, we must drop off * the packets in the rack log and calculate any eligble * RTT's. */ rack->r_wanted_output = 1; more: rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm == NULL) { if ((th_ack - 1) == tp->iss) { /* * For the SYN incoming case we will not * have called tcp_output for the sending of * the SYN, so there will be no map. All * other cases should probably be a panic. */ goto proc_sack; } if (tp->t_flags & TF_SENTFIN) { /* if we send a FIN we will not hav a map */ goto proc_sack; } #ifdef INVARIANTS panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", tp, th, tp->t_state, rack, tp->snd_una, tp->snd_max, tp->snd_nxt, changed); #endif goto proc_sack; } if (SEQ_LT(th_ack, rsm->r_start)) { /* Huh map is missing this */ #ifdef INVARIANTS printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", rsm->r_start, th_ack, tp->t_state, rack->r_state); #endif goto proc_sack; } rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); /* Now do we consume the whole thing? */ if (SEQ_GEQ(th_ack, rsm->r_end)) { /* Its all consumed. */ uint32_t left; uint8_t newly_acked; rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; /* Record the time of highest cumack sent */ rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS if (rm != rsm) { panic("removing head in rack:%p rsm:%p rm:%p", rack, rsm, rm); } #endif if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } newly_acked = 1; if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove * it from total */ rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); newly_acked = 0; } else if (rsm->r_flags & RACK_SACK_PASSED) { /* * There are segments ACKED on the * scoreboard further up. We are seeing * reordering. */ rsm->r_flags &= ~RACK_SACK_PASSED; counter_u64_add(rack_reorder_seen, 1); rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rack->r_ctl.rc_reorder_ts = cts; } left = th_ack - rsm->r_end; if (rack->app_limited_needs_set && newly_acked) rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); /* Free back to zone */ rack_free(rack, rsm); if (left) { goto more; } goto proc_sack; } if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove it from * total for the part being cum-acked. */ rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); } /* * Clear the dup ack count for * the piece that remains. */ rsm->r_dupack = 0; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); if (rsm->r_rtr_bytes) { /* * It was retransmitted adjust the * sack holes for what was acked. 
*/ int ack_am; ack_am = (th_ack - rsm->r_start); if (ack_am >= rsm->r_rtr_bytes) { rack->r_ctl.rc_holes_rxt -= ack_am; rsm->r_rtr_bytes -= ack_am; } } /* * Update where the piece starts and record * the time of send of highest cumack sent. */ rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; rsm->r_start = th_ack; if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); } proc_sack: /* Check for reneging */ rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { /* * The peer has moved snd_una up to * the edge of this send, i.e. one * that it had previously acked. The only * way that can be true if the peer threw * away data (space issues) that it had * previously sacked (else it would have * given us snd_una up to (rsm->r_end). * We need to undo the acked markings here. * * Note we have to look to make sure th_ack is * our rsm->r_start in case we get an old ack * where th_ack is behind snd_una. */ rack_peer_reneges(rack, rsm, th->th_ack); } if ((to->to_flags & TOF_SACK) == 0) { /* We are done nothing left */ goto out; } /* Sack block processing */ if (SEQ_GT(th_ack, tp->snd_una)) ack_point = th_ack; else ack_point = tp->snd_una; for (i = 0; i < to->to_nsacks; i++) { bcopy((to->to_sacks + i * TCPOLEN_SACK), &sack, sizeof(sack)); sack.start = ntohl(sack.start); sack.end = ntohl(sack.end); if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, ack_point) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, ack_point) && SEQ_LEQ(sack.end, tp->snd_max)) { sack_blocks[num_sack_blks] = sack; num_sack_blks++; #ifdef NETFLIX_STATS } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { /* * Its a D-SACK block. */ tcp_record_dsack(sack.start, sack.end); #endif } } /* * Sort the SACK blocks so we can update the rack scoreboard with * just one pass. */ num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); if (num_sack_blks == 0) { /* Nothing to sack (DSACKs?) */ goto out_with_totals; } if (num_sack_blks < 2) { /* Only one, we don't need to sort */ goto do_sack_work; } /* Sort the sacks */ for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { sack = sack_blocks[i]; sack_blocks[i] = sack_blocks[j]; sack_blocks[j] = sack; } } } /* * Now are any of the sack block ends the same (yes some * implementations send these)? */ again: if (num_sack_blks == 0) goto out_with_totals; if (num_sack_blks > 1) { for (i = 0; i < num_sack_blks; i++) { for (j = i + 1; j < num_sack_blks; j++) { if (sack_blocks[i].end == sack_blocks[j].end) { /* * Ok these two have the same end we * want the smallest end and then * throw away the larger and start * again. */ if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { /* * The second block covers * more area use that */ sack_blocks[i].start = sack_blocks[j].start; } /* * Now collapse out the dup-sack and * lower the count */ for (k = (j + 1); k < num_sack_blks; k++) { sack_blocks[j].start = sack_blocks[k].start; sack_blocks[j].end = sack_blocks[k].end; j++; } num_sack_blks--; goto again; } } } } do_sack_work: /* * First lets look to see if * we have retransmitted and * can use the transmit next? 
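 *
 * (The duplicate-end collapse above, with made-up numbers: blocks
 * [150,200) and [100,200) share end 200, so the wider start 100 is
 * kept and the narrower block is squeezed out, leaving one block
 * [100,200) for the scoreboard walk below.)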
*/ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm && SEQ_GT(sack_blocks[0].end, rsm->r_start) && SEQ_LT(sack_blocks[0].start, rsm->r_end)) { /* * We probably did the FR and the next * SACK in continues as we would expect. */ acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); if (acked) { rack->r_wanted_output = 1; changed += acked; sack_changed += acked; } if (num_sack_blks == 1) { /* * This is what we would expect from * a normal implementation to happen * after we have retransmitted the FR, * i.e the sack-filter pushes down * to 1 block and the next to be retransmitted * is the sequence in the sack block (has more * are acked). Count this as ACK'd data to boost * up the chances of recovering any false positives. */ rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); counter_u64_add(rack_express_sack, 1); if (rack->r_ctl.ack_count > 0xfff00000) { /* * reduce the number to keep us under * a uint32_t. */ rack->r_ctl.ack_count /= 2; rack->r_ctl.sack_count /= 2; } goto out_with_totals; } else { /* * Start the loop through the * rest of blocks, past the first block. */ moved_two = 0; loop_start = 1; } } /* Its a sack of some sort */ rack->r_ctl.sack_count++; if (rack->r_ctl.sack_count > 0xfff00000) { /* * reduce the number to keep us under * a uint32_t. */ rack->r_ctl.ack_count /= 2; rack->r_ctl.sack_count /= 2; } counter_u64_add(rack_sack_total, 1); if (rack->sack_attack_disable) { /* An attacker disablement is in place */ if (num_sack_blks > 1) { rack->r_ctl.sack_count += (num_sack_blks - 1); rack->r_ctl.sack_moved_extra++; counter_u64_add(rack_move_some, 1); if (rack->r_ctl.sack_moved_extra > 0xfff00000) { rack->r_ctl.sack_moved_extra /= 2; rack->r_ctl.sack_noextra_move /= 2; } } goto out; } rsm = rack->r_ctl.rc_sacklast; for (i = loop_start; i < num_sack_blks; i++) { acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); if (acked) { rack->r_wanted_output = 1; changed += acked; sack_changed += acked; } if (moved_two) { /* * If we did not get a SACK for at least a MSS and * had to move at all, or if we moved more than our * threshold, it counts against the "extra" move. */ rack->r_ctl.sack_moved_extra += moved_two; counter_u64_add(rack_move_some, 1); } else { /* * else we did not have to move * any more than we would expect. */ rack->r_ctl.sack_noextra_move++; counter_u64_add(rack_move_none, 1); } if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { /* * If the SACK was not a full MSS then * we add to sack_count the number of * MSS's (or possibly more than * a MSS if its a TSO send) we had to skip by. */ rack->r_ctl.sack_count += moved_two; counter_u64_add(rack_sack_total, moved_two); } /* * Now we need to setup for the next * round. First we make sure we won't * exceed the size of our uint32_t on * the various counts, and then clear out * moved_two. */ if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || (rack->r_ctl.sack_noextra_move > 0xfff00000)) { rack->r_ctl.sack_moved_extra /= 2; rack->r_ctl.sack_noextra_move /= 2; } if (rack->r_ctl.sack_count > 0xfff00000) { rack->r_ctl.ack_count /= 2; rack->r_ctl.sack_count /= 2; } moved_two = 0; } out_with_totals: if (num_sack_blks > 1) { /* * You get an extra stroke if * you have more than one sack-blk, this * could be where we are skipping forward * and the sack-filter is still working, or * it could be an attacker constantly * moving us. 
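 *
 * The detection below works on scaled ratios; e.g. hypothetical
 * counters sack_count = 600 and ack_count = 200 give
 * ackratio = 600 * 1000 / 200 = 3000, i.e. three sack updates per
 * ack credit, which is then compared against the configured
 * thresholds (tcp_sack_to_ack_thresh and friends).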
*/ rack->r_ctl.sack_moved_extra++; counter_u64_add(rack_move_some, 1); } out: #ifdef NETFLIX_EXP_DETECTION if ((rack->do_detection || tcp_force_detection) && tcp_sack_to_ack_thresh && tcp_sack_to_move_thresh && ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { /* * We have thresholds set to find * possible attackers and disable sack. * Check them. */ uint64_t ackratio, moveratio, movetotal; /* Log detecting */ rack_log_sad(rack, 1); ackratio = (uint64_t)(rack->r_ctl.sack_count); ackratio *= (uint64_t)(1000); if (rack->r_ctl.ack_count) ackratio /= (uint64_t)(rack->r_ctl.ack_count); else { /* We really should not hit here */ ackratio = 1000; } if ((rack->sack_attack_disable == 0) && (ackratio > rack_highest_sack_thresh_seen)) rack_highest_sack_thresh_seen = (uint32_t)ackratio; movetotal = rack->r_ctl.sack_moved_extra; movetotal += rack->r_ctl.sack_noextra_move; moveratio = rack->r_ctl.sack_moved_extra; moveratio *= (uint64_t)1000; if (movetotal) moveratio /= movetotal; else { /* No moves, thats pretty good */ moveratio = 0; } if ((rack->sack_attack_disable == 0) && (moveratio > rack_highest_move_thresh_seen)) rack_highest_move_thresh_seen = (uint32_t)moveratio; if (rack->sack_attack_disable == 0) { if ((ackratio > tcp_sack_to_ack_thresh) && (moveratio > tcp_sack_to_move_thresh)) { /* Disable sack processing */ rack->sack_attack_disable = 1; if (rack->r_rep_attack == 0) { rack->r_rep_attack = 1; counter_u64_add(rack_sack_attacks_detected, 1); } if (tcp_attack_on_turns_on_logging) { /* * Turn on logging, used for debugging * false positives. */ rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; } /* Clamp the cwnd at flight size */ rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); rack_log_sad(rack, 2); } } else { /* We are sack-disabled check for false positives */ if ((ackratio <= tcp_restoral_thresh) || (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { rack->sack_attack_disable = 0; rack_log_sad(rack, 3); /* Restart counting */ rack->r_ctl.sack_count = 0; rack->r_ctl.sack_moved_extra = 0; rack->r_ctl.sack_noextra_move = 1; rack->r_ctl.ack_count = max(1, (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); if (rack->r_rep_reverse == 0) { rack->r_rep_reverse = 1; counter_u64_add(rack_sack_attacks_reversed, 1); } /* Restore the cwnd */ if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; } } } #endif if (changed) { /* Something changed cancel the rack timer */ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } tsused = tcp_ts_getticks(); rsm = tcp_rack_output(tp, rack, tsused); if ((!IN_RECOVERY(tp->t_flags)) && rsm) { /* Enter recovery */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; entered_recovery = 1; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure we send * one packet. */ if (rack->rack_no_prr == 0) { rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); rack_log_to_prr(rack, 8, 0); } rack->r_timer_override = 1; rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; } else if (IN_RECOVERY(tp->t_flags) && rsm && (rack->r_rr_config == 3)) { /* * Assure we can output and we get no * remembered pace time except the retransmit. 
*/ rack->r_timer_override = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; rack->r_ctl.rc_resend = rsm; } if (IN_RECOVERY(tp->t_flags) && (rack->rack_no_prr == 0) && (entered_recovery == 0)) { /* Deal with PRR here (in recovery only) */ uint32_t pipe, snd_una; rack->r_ctl.rc_prr_delivered += changed; /* Compute prr_sndcnt */ if (SEQ_GT(tp->snd_una, th_ack)) { snd_una = tp->snd_una; } else { snd_una = th_ack; } pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; if (pipe > tp->snd_ssthresh) { long sndcnt; sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; if (rack->r_ctl.rc_prr_recovery_fs > 0) sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; else { rack->r_ctl.rc_prr_sndcnt = 0; rack_log_to_prr(rack, 9, 0); sndcnt = 0; } sndcnt++; if (sndcnt > (long)rack->r_ctl.rc_prr_out) sndcnt -= rack->r_ctl.rc_prr_out; else sndcnt = 0; rack->r_ctl.rc_prr_sndcnt = sndcnt; rack_log_to_prr(rack, 10, 0); } else { uint32_t limit; if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); else limit = 0; if (changed > limit) limit = changed; limit += ctf_fixed_maxseg(tp); if (tp->snd_ssthresh > pipe) { rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); rack_log_to_prr(rack, 11, 0); } else { rack->r_ctl.rc_prr_sndcnt = min(0, limit); rack_log_to_prr(rack, 12, 0); } } if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && ((rack->rc_inp->inp_in_hpts == 0) && ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { /* * If you are pacing output you don't want * to override. */ rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; rack->r_timer_override = 1; } } } static void rack_strike_dupack(struct tcp_rack *rack) { struct rack_sendmap *rsm; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm && (rsm->r_dupack < 0xff)) { rsm->r_dupack++; if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { rack->r_wanted_output = 1; rack->r_timer_override = 1; rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); } else { rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); } } } static void rack_check_bottom_drag(struct tcpcb *tp, struct tcp_rack *rack, struct socket *so, int32_t acked) { uint32_t segsiz, minseg; segsiz = ctf_fixed_maxseg(tp); minseg = segsiz; if (tp->snd_max == tp->snd_una) { /* * We are doing dynamic pacing and we are way * under. Basically everything got acked while * we were still waiting on the pacer to expire. * * This means we need to boost the b/w in * addition to any earlier boosting of * the multipler. */ rack->rc_dragged_bottom = 1; rack_validate_multipliers_at_or_above100(rack); /* * Lets use the segment bytes acked plus * the lowest RTT seen as the basis to * form a b/w estimate. This will be off * due to the fact that the true estimate * should be around 1/2 the time of the RTT * but we can settle for that. */ if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && acked) { uint64_t bw, calc_bw, rtt; rtt = rack->r_ctl.rack_rs.rs_us_rtt; bw = acked; calc_bw = bw * 1000000; calc_bw /= rtt; if (rack->r_ctl.last_max_bw && (rack->r_ctl.last_max_bw < calc_bw)) { /* * If we have a last calculated max bw * enforce it. */ calc_bw = rack->r_ctl.last_max_bw; } /* now plop it in */ if (rack->rc_gp_filled == 0) { if (calc_bw > ONE_POINT_TWO_MEG) { /* * If we have no measurement * don't let us set in more than * 1.2Mbps. If we are still too * low after pacing with this we * will hopefully have a max b/w * available to sanity check things. 
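 *
 * (The estimate above is plain bytes-per-second arithmetic on the
 * microsecond rtt, e.g. hypothetically acked = 29200 bytes with
 * rs_us_rtt = 20000 usec gives calc_bw = 29200 * 1000000 / 20000
 * = 1460000 bytes per second, before any clamping here.)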
*/ calc_bw = ONE_POINT_TWO_MEG; } rack->r_ctl.rc_rtt_diff = 0; rack->r_ctl.gp_bw = calc_bw; rack->rc_gp_filled = 1; rack->r_ctl.num_avg = RACK_REQ_AVG; rack_set_pace_segments(rack->rc_tp, rack, __LINE__); } else if (calc_bw > rack->r_ctl.gp_bw) { rack->r_ctl.rc_rtt_diff = 0; rack->r_ctl.num_avg = RACK_REQ_AVG; rack->r_ctl.gp_bw = calc_bw; rack_set_pace_segments(rack->rc_tp, rack, __LINE__); } else rack_increase_bw_mul(rack, -1, 0, 0, 1); /* * For acks over 1mss we do a extra boost to simulate * where we would get 2 acks (we want 110 for the mul). */ if (acked > segsiz) rack_increase_bw_mul(rack, -1, 0, 0, 1); } else { /* * Huh, this should not be, settle * for just an old increase. */ rack_increase_bw_mul(rack, -1, 0, 0, 1); } } else if ((IN_RECOVERY(tp->t_flags) == 0) && (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), minseg)) && (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= (segsiz * rack_req_segs))) { /* * We are doing dynamic GP pacing and * we have everything except 1MSS or less * bytes left out. We are still pacing away. * And there is data that could be sent, This * means we are inserting delayed ack time in * our measurements because we are pacing too slow. */ rack_validate_multipliers_at_or_above100(rack); rack->rc_dragged_bottom = 1; rack_increase_bw_mul(rack, -1, 0, 0, 1); } } /* * Return value of 1, we do not need to call rack_process_data(). * return value of 0, rack_process_data can be called. * For ret_val if its 0 the TCP is locked, if its non-zero * its unlocked and probably unsafe to touch the TCB. */ static int rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val) { int32_t ourfinisacked = 0; int32_t nsegs, acked_amount; int32_t acked; struct mbuf *mfree; struct tcp_rack *rack; int32_t under_pacing = 0; int32_t recovery = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; if (SEQ_GT(th->th_ack, tp->snd_max)) { ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); rack->r_wanted_output = 1; return (1); } if (rack->rc_gp_filled && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { under_pacing = 1; } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { if (rack->rc_in_persist) tp->t_rxtshift = 0; if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) rack_strike_dupack(rack); rack_log_ack(tp, to, th); } if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* * Old ack, behind (or duplicate to) the last one rcv'd * Note: Should mark reordering is occuring! We should also * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, * 3-3, 4-4 would be reording. As well as ack 1, 3-3 ack 3 */ return (0); } /* * If we reach this point, ACK is not a duplicate, i.e., it ACKs * something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our SYN has * been ACK'd (so connection is now fully synchronized). Go * to non-starred state, increment snd_una for ACK of SYN, * and check if we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. 
*/ } } nsegs = max(1, m->m_pkthdr.lro_nsegs); INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK arrives * within our recovery window, then it was a mistake to do the * retransmit in the first place. Recover our original cwnd and * ssthresh, and proceed to transmit where we left off. */ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) rack_cong_signal(tp, th, CC_RTO_ERR); } if (acked) { /* assure we are not backed off */ tp->t_rxtshift = 0; rack->rc_tlp_in_progress = 0; rack->r_ctl.rc_tlp_cnt_out = 0; /* * If it is the RXT timer we want to * stop it, so we can restart a TLP. */ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); #ifdef NETFLIX_HTTP_LOGGING tcp_http_check_for_comp(rack->rc_tp, th->th_ack); #endif } /* * If we have a timestamp reply, update smoothed round trip time. If * no timestamp is present but transmit timer is running and timed * sequence number was acked, update smoothed round trip time. Since * we now have an rtt measurement, cancel the timer backoff (cf., * Phil Karn's retransmit alg.). Recompute the initial retransmit * timer. * * Some boxes send broken timestamp replies during the SYN+ACK * phase, ignore timestamps of 0 or we could calculate a huge RTT * and blow up the retransmit timer. */ /* * If all outstanding data is acked, stop retransmit timer and * remember to restart (more output or persist). If there is more * data to be acked, restart retransmit timer, using current * (possibly backed-off) value. */ if (acked == 0) { if (ofia) *ofia = ourfinisacked; return (0); } if (rack->r_ctl.rc_early_recovery) { if (IN_RECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover) && (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); recovery = 1; } } } /* * Let the congestion control algorithm update congestion control * related information. This typically means increasing the * congestion window. */ rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); SOCKBUF_LOCK(&so->so_snd); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; mfree = sbcut_locked(&so->so_snd, acked_amount); if ((sbused(&so->so_snd) == 0) && (acked > acked_amount) && (tp->t_state >= TCPS_FIN_WAIT_1) && (tp->t_flags & TF_SENTFIN)) { /* * We must be sure our fin * was sent and acked (we can be * in FIN_WAIT_1 without having * sent the fin). 
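 *
 * The FIN occupies one sequence number but no socket-buffer
 * bytes, so acked exceeding acked_amount with the send buffer
 * drained (e.g. 101 bytes acked vs 100 bytes cut from so_snd)
 * is what tells us the FIN itself was covered by this ack.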
*/ ourfinisacked = 1; } SOCKBUF_UNLOCK(&so->so_snd); tp->t_flags |= TF_WAKESOW; m_freem(mfree); if (rack->r_ctl.rc_early_recovery == 0) { if (IN_RECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover) && (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); } } } tp->snd_una = th->th_ack; if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { tp->snd_nxt = tp->snd_una; } if (under_pacing && (rack->use_fixed_rate == 0) && (rack->in_probe_rtt == 0) && rack->rc_gp_dyn_mul && rack->rc_always_pace) { /* Check if we are dragging bottom */ rack_check_bottom_drag(tp, rack, so, acked); } if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); /* Set need output so persist might get set */ rack->r_wanted_output = 1; sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); if ((tp->t_state >= TCPS_FIN_WAIT_1) && (sbavail(&so->so_snd) == 0) && (tp->t_flags2 & TF2_DROP_AF_DATA)) { /* * The socket was gone and the * peer sent data, time to * reset him. */ *ret_val = 1; /* tcp_close will kill the inp pre-log the Reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); return (1); } } if (ofia) *ofia = ourfinisacked; return (0); } static void rack_collapsed_window(struct tcp_rack *rack) { /* * Now we must walk the * send map and divide the * ones left stranded. These * guys can't cause us to abort * the connection and are really * "unsent". However if a buggy * client actually did keep some * of the data i.e. collapsed the win * and refused to ack and then opened * the win and acked that data. We would * get into an ack war, the simplier * method then of just pretending we * did not send those segments something * won't work. */ struct rack_sendmap *rsm, *nrsm, fe, *insret; tcp_seq max_seq; max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; memset(&fe, 0, sizeof(fe)); fe.r_start = max_seq; /* Find the first seq past or at maxseq */ rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); if (rsm == NULL) { /* Nothing to do strange */ rack->rc_has_collapsed = 0; return; } /* * Now do we need to split at * the collapse point? */ if (SEQ_GT(max_seq, rsm->r_start)) { nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); if (nrsm == NULL) { /* We can't get a rsm, mark all? 
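 * If the allocation fails we fall through with nrsm = rsm, so the
 * whole rsm (not just the part past max_seq) gets flagged -- a
 * conservative fallback.  With made-up numbers: snd_una = 1000 and
 * snd_wnd = 500 give max_seq = 1500; an rsm covering [1200,1800)
 * is normally split at 1500 so that only [1500,1800) and the
 * entries after it are marked RACK_RWND_COLLAPSED.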
*/ nrsm = rsm; goto no_split; } /* Clone it */ rack_clone_rsm(rack, nrsm, rsm, max_seq); insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", nrsm, insret, rack, rsm); } #endif if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } /* * Set in the new RSM as the * collapsed starting point */ rsm = nrsm; } no_split: counter_u64_add(rack_collapsed_win, 1); RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { nrsm->r_flags |= RACK_RWND_COLLAPSED; rack->rc_has_collapsed = 1; } } static void rack_un_collapse_window(struct tcp_rack *rack) { struct rack_sendmap *rsm; RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { if (rsm->r_flags & RACK_RWND_COLLAPSED) rsm->r_flags &= ~RACK_RWND_COLLAPSED; else break; } rack->rc_has_collapsed = 0; } static void rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, int32_t tlen, int32_t tfo_syn) { if (DELAY_ACK(tp, tlen) || tfo_syn) { if (rack->rc_dack_mode && (tlen > 500) && (rack->rc_dack_toggle == 1)) { goto no_delayed_ack; } rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { no_delayed_ack: rack->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; if (rack->rc_dack_mode) { if (tp->t_flags & TF_DELACK) rack->rc_dack_toggle = 1; else rack->rc_dack_toggle = 0; } } } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { /* * Update window information. Don't look at window if no ACK: TAC's * send garbage on first SYN. */ int32_t nsegs; int32_t tfo_syn; struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; INP_WLOCK_ASSERT(tp->t_inpcb); nsegs = max(1, m->m_pkthdr.lro_nsegs); if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) KMOD_TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; rack->r_wanted_output = 1; } else if (thflags & TH_ACK) { if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; } } if (tp->snd_wnd < ctf_outstanding(tp)) /* The peer collapsed the window */ rack_collapsed_window(rack); else if (rack->rc_has_collapsed) rack_un_collapse_window(rack); /* Was persist timer active and now we have window space? */ if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) { rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); tp->snd_nxt = tp->snd_max; /* Make sure we output to start the timer */ rack->r_wanted_output = 1; } /* Do we enter persists? 
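 * We do when the peer's window is smaller than
 * min(rc_high_rwnd / 2, rc_pace_min_segs), everything sent has
 * been acked, the connection is established and the socket buffer
 * holds more data than that window -- e.g. (hypothetically) a peer
 * advertising a 512 byte window against a multi-segment pacing
 * minimum: rather than dribble out runts we arm the persist timer
 * and probe.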
*/ if ((rack->rc_in_persist == 0) && (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_max == tp->snd_una) && sbavail(&tp->t_inpcb->inp_socket->so_snd) && (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { /* * Here the rwnd is less than * the pacing size, we are established, * nothing is outstanding, and there is * data to send. Enter persists. */ tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); } if (tp->t_flags2 & TF2_DROP_AF_DATA) { m_freem(m); return (0); } /* * don't process the URG bit, ignore them drag * along the up. */ tp->rcv_up = tp->rcv_nxt; INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing * queue, and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data is * presented to the user (this happens in tcp_usrreq.c, case * PRU_RCVD). If a FIN has already been received on this connection * then we just ignore the text. */ tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && IS_FASTOPEN(tp->t_flags)); if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_rnxt = tp->rcv_nxt; int save_tlen = tlen; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly * queue with control block tp. Set thflags to whether * reassembly now includes a segment with FIN. This handles * the common case inline (segment is the next to be * received on an established connection, and the queue is * empty), avoiding linkage into and removal from the queue * and repetition of various conversions. Set DELACK for * segments received in order, but ack immediately when * segments are out of order (so fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && SEGQ_EMPTY(tp) && (TCPS_HAVEESTABLISHED(tp->t_state) || tfo_syn)) { #ifdef NETFLIX_SB_LIMITS u_int mcnt, appended; if (so->so_rcv.sb_shlim) { mcnt = m_memcnt(m); appended = 0; if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, CFO_NOSLEEP, NULL) == false) { counter_u64_add(tcp_sb_shlim_fails, 1); m_freem(m); return (0); } } #endif rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } thflags = th->th_flags & TH_FIN; KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else #ifdef NETFLIX_SB_LIMITS appended = #endif sbappendstream_locked(&so->so_rcv, m, 0); SOCKBUF_UNLOCK(&so->so_rcv); tp->t_flags |= TF_WAKESOR; #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim && appended != mcnt) counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); #endif } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually frees any mbufs when * trimming from the head. */ tcp_seq temp = save_start; thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { /* * DSACK actually handled in the fastpath * above. 
*/ RACK_OPTS_INC(tcp_sack_path_1); tcp_update_sack_list(tp, save_start, save_start + save_tlen); } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { if ((tp->rcv_numsacks >= 1) && (tp->sackblks[0].end == save_start)) { /* * Partial overlap, recorded at todrop * above. */ RACK_OPTS_INC(tcp_sack_path_2a); tcp_update_sack_list(tp, tp->sackblks[0].start, tp->sackblks[0].end); } else { RACK_OPTS_INC(tcp_sack_path_2b); tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } } else if (tlen >= save_tlen) { /* Update of sackblks. */ RACK_OPTS_INC(tcp_sack_path_3); tcp_update_dsack_list(tp, save_start, save_start + save_tlen); } else if (tlen > 0) { RACK_OPTS_INC(tcp_sack_path_4); tcp_update_dsack_list(tp, save_start, save_start + tlen); } } } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know that the * connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* The socket upcall is handled by socantrcvmore. */ tp->t_flags &= ~TF_WAKESOR; /* * If connection is half-synchronized (ie NEEDSYN * flag on) then delay ACK, so it may be piggybacked * when SYN is sent. Otherwise, since we received a * FIN then no more input can be expected, send ACK * now. */ if (tp->t_flags & TF_NEEDSYN) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES enter the * CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been * acked so enter the CLOSING state. */ case TCPS_FIN_WAIT_1: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the * other standard timers. */ case TCPS_FIN_WAIT_2: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tcp_twstart(tp); return (1); } } /* * Return any desired output. */ if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { rack->r_wanted_output = 1; } INP_WLOCK_ASSERT(tp->t_inpcb); return (0); } /* * Here nothing is really faster, its just that we * have broken out the fast-data path also just like * the fast-ack. */ static int rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) { int32_t nsegs; int32_t newsize = 0; /* automatic sockbuf scaling */ struct tcp_rack *rack; #ifdef NETFLIX_SB_LIMITS u_int mcnt, appended; #endif #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
*/ if (__predict_false(th->th_seq != tp->rcv_nxt)) { return (0); } if (__predict_false(tp->snd_nxt != tp->snd_max)) { return (0); } if (tiwin && tiwin != tp->snd_wnd) { return (0); } if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { return (0); } if (__predict_false((to->to_flags & TOF_TS) && (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { return (0); } if (__predict_false((th->th_ack != tp->snd_una))) { return (0); } if (__predict_false(tlen > sbspace(&so->so_rcv))) { return (0); } if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } rack = (struct tcp_rack *)tp->t_fb_ptr; /* * This is a pure, in-sequence data packet with nothing on the * reassembly queue and we have enough buffer space to take it. */ nsegs = max(1, m->m_pkthdr.lro_nsegs); #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim) { mcnt = m_memcnt(m); appended = 0; if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, CFO_NOSLEEP, NULL) == false) { counter_u64_add(tcp_sb_shlim_fails, 1); m_freem(m); return (1); } } #endif /* Clean receiver SACK report if present */ if (tp->rcv_numsacks) tcp_clean_sackreport(tp); KMOD_TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; if (tlen && ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && (tp->t_fbyte_in == 0)) { tp->t_fbyte_in = ticks; if (tp->t_fbyte_in == 0) tp->t_fbyte_in = 1; if (tp->t_fbyte_out && tp->t_fbyte_in) tp->t_flags2 |= TF2_FBYTES_COMPLETE; } /* * Pull snd_wl1 up to prevent seq wrap relative to th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif newsize = tcp_autorcvbuf(m, th, so, tp, tlen); /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. Give up when limit is * reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ #ifdef NETFLIX_SB_LIMITS appended = #endif sbappendstream_locked(&so->so_rcv, m, 0); ctf_calc_rwin(so, tp); } SOCKBUF_UNLOCK(&so->so_rcv); tp->t_flags |= TF_WAKESOR; #ifdef NETFLIX_SB_LIMITS if (so->so_rcv.sb_shlim && mcnt != appended) counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); #endif rack_handle_delayed_ack(tp, rack, tlen, 0); if (tp->snd_una == tp->snd_max) sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); return (1); } /* * This subfunction is used to try to highly optimize the * fast path. We again allow window updates that are * in sequence to remain in the fast-path. We also add * in the __predict's to attempt to help the compiler. * Note that if we return a 0, then we can *not* process * it and the caller should push the packet into the * slow-path. */ static int rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) { int32_t acked; int32_t nsegs; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. 
*/ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif int32_t under_pacing = 0; struct tcp_rack *rack; if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* Old ack, behind (or duplicate to) the last one rcv'd */ return (0); } if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { /* Above what we have sent? */ return (0); } if (__predict_false(tp->snd_nxt != tp->snd_max)) { /* We are retransmitting */ return (0); } if (__predict_false(tiwin == 0)) { /* zero window */ return (0); } if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { /* We need a SYN or a FIN, unlikely.. */ return (0); } if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { /* Timestamp is behind .. old ack with seq wrap? */ return (0); } if (__predict_false(IN_RECOVERY(tp->t_flags))) { /* Still recovering */ return (0); } rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->r_ctl.rc_sacked) { /* We have sack holes on our scoreboard */ return (0); } /* Ok if we reach here, we can process a fast-ack */ if (rack->rc_gp_filled && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { under_pacing = 1; } nsegs = max(1, m->m_pkthdr.lro_nsegs); rack_log_ack(tp, to, th); /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } /* Do we exit persists? */ if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) { rack_exit_persist(tp, rack, cts); } /* Do we enter persists? */ if ((rack->rc_in_persist == 0) && (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_max == tp->snd_una) && sbavail(&tp->t_inpcb->inp_socket->so_snd) && (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { /* * Here the rwnd is less than * the pacing size, we are established, * nothing is outstanding, and there is * data to send. Enter persists. */ tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); } /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ KMOD_TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_flags & TF_PREVVALID) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) rack_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies during the SYN+ACK * phase, ignore timestamps of 0 or we could calculate a huge RTT * and blow up the retransmit timer. */ acked = BYTES_THIS_ACK(tp, th); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (acked) { /* assure we are not backed off */ tp->t_rxtshift = 0; rack->rc_tlp_in_progress = 0; rack->r_ctl.rc_tlp_cnt_out = 0; /* * If it is the RXT timer we want to * stop it, so we can restart a TLP. 
*/ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); #ifdef NETFLIX_HTTP_LOGGING tcp_http_check_for_comp(rack->rc_tp, th->th_ack); #endif } /* * Let the congestion control algorithm update congestion control * related information. This typically means increasing the * congestion window. */ rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); tp->snd_una = th->th_ack; if (tp->snd_wnd < ctf_outstanding(tp)) { /* The peer collapsed the window */ rack_collapsed_window(rack); } else if (rack->rc_has_collapsed) rack_un_collapse_window(rack); /* * Pull snd_wl2 up to prevent seq wrap relative to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; m_freem(m); /* ND6_HINT(tp); *//* Some progress has been made. */ /* * If all outstanding data are acked, stop retransmit timer, * otherwise restart timer using current (possibly backed-off) * value. If process is waiting for space, wakeup/selwakeup/signal. * If data are ready to send, let tcp_output decide between more * output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif if (under_pacing && (rack->use_fixed_rate == 0) && (rack->in_probe_rtt == 0) && rack->rc_gp_dyn_mul && rack->rc_always_pace) { /* Check if we are dragging bottom */ rack_check_bottom_drag(tp, rack, so, acked); } if (tp->snd_una == tp->snd_max) { rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } /* Wake up the socket if we have room to write more */ tp->t_flags |= TF_WAKESOW; if (sbavail(&so->so_snd)) { rack->r_wanted_output = 1; } return (1); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; int32_t todrop; int32_t ourfinisacked = 0; struct tcp_rack *rack; ctf_calc_rwin(so, tp); /* * If the state is SYN_SENT: if seg contains an ACK, but not for our * SYN, drop the input. if seg contains a RST, then drop the * connection. if seg does not contain SYN, then drop it. Otherwise * this is an acceptable SYN segment initialize tp->rcv_nxt and * tp->irs if seg contains ack then advance tp->snd_una if seg * contains an ECE and ECN support is enabled, the stream is ECN * capable. if SYN has been acked change to ESTABLISHED else * SYN_RCVD state arrange for segment to be acked (eventually) * continue processing rest of data/controls. 
*/ if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); ctf_do_drop(m, tp); return (1); } if (thflags & TH_RST) { ctf_do_drop(m, tp); return (1); } if (!(thflags & TH_SYN)) { ctf_do_drop(m, tp); return (1); } tp->irs = th->th_seq; tcp_rcvseqinit(tp); rack = (struct tcp_rack *)tp->t_fb_ptr; if (thflags & TH_ACK) { int tfo_partial = 0; KMOD_TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); /* * If not all the data that was sent in the TFO SYN * has been acked, resend the remainder right away. */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { tp->snd_nxt = th->th_ack; tfo_partial = 1; } /* * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { rack->r_wanted_output = 1; tp->t_flags |= TF_ACKNOW; rack->rc_dack_toggle = 0; } if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && (V_tcp_do_ecn == 1)) { tp->t_flags2 |= TF2_ECN_PERMIT; KMOD_TCPSTAT_INC(tcps_ecn_shs); } if (SEQ_GT(th->th_ack, tp->snd_una)) { /* * We advance snd_una for the * fast open case. If th_ack is * acknowledging data beyond * snd_una we can't just call * ack-processing since the * data stream in our send-map * will start at snd_una + 1 (one * beyond the SYN). If its just * equal we don't need to do that * and there is no send_map. */ tp->snd_una++; } /* * Received in SYN_SENT[*] state. Transitions: * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); rack_cc_conn_init(tp); } } else { /* * Received initial SYN in SYN-SENT[*] state => simultaneous * open. If segment contains CC option and there is a * cached CC, apply TAO test. If it succeeds, connection is * * half-synchronized. Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If * there was no CC option, clear cached CC value. */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_state_change(tp, TCPS_SYN_RECEIVED); } INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. If data, * trim to stay within window, dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. If the * remote host used T/TCP to validate the SYN, our data will be * ACK'd; if so, enter normal data segment processing in the middle * of step 5, ack processing. 
Otherwise, goto step 6. */ if (thflags & TH_ACK) { /* For syn-sent we need to possibly update the rtt */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); tcp_rack_xmit_timer_commit(rack, tp); } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) return (ret_val); /* We may have changed to FIN_WAIT_1 above */ if (tp->t_state == TCPS_FIN_WAIT_1) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now * acknowledged then enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then * closing user can proceed. Starting the * timer is contrary to the specification, * but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and * use a compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { struct tcp_rack *rack; int32_t ret_val = 0; int32_t ourfinisacked = 0; ctf_calc_rwin(so, tp); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } rack = (struct tcp_rack *)tp->t_fb_ptr; if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the * only valid packets are the initial SYN, a * retransmit/copy of the initial SYN (possibly with * a subset of the original data), a valid ACK, a * FIN, or a RST. */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { ctf_do_drop(m, NULL); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { ctf_do_drop(m, NULL); return (0); } } if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know the * sequence numbers haven't wrapped. This is a partial fix for the * "LAND" DoS attack. 
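 *
 * (For example, with an assumed irs of 5000 a segment carrying th_seq
 * 4999 cannot belong to this connection, since the peer's ISN is the
 * lowest sequence number it may legitimately use here, and it is
 * dropped with a reset below.)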
*/ if (SEQ_LT(th->th_seq, tp->irs)) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } tp->snd_wnd = tiwin; /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (IS_FASTOPEN(tp->t_flags)) { rack_cc_conn_init(tp); } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } KMOD_TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ if (!IS_FASTOPEN(tp->t_flags)) rack_cc_conn_init(tp); } /* * Account for the ACK of our SYN prior to * regular ACK processing below, except for * simultaneous SYN, which is handled later. */ if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) tp->snd_una++; /* * If segment contains data or ACK, will call tcp_reass() later; if * not, do so now to pass queued data to user. 
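 *
 * (Data can already be sitting on the reassembly queue at this point,
 * for example segments queued while we were still half-synchronized;
 * calling tcp_reass() with a NULL header simply hands any in-order
 * queued data up to the socket now that the handshake has completed.)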
*/ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* For syn-recv we need to possibly update the rtt */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { uint32_t t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); tcp_rack_xmit_timer_commit(rack, tp); } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (tp->t_state == TCPS_FIN_WAIT_1) { /* We could have went to FIN_WAIT_1 (or EST) above */ /* * In FIN_WAIT_1 STATE in addition to the processing for the * ESTABLISHED state if our FIN is now acknowledged then * enter FIN_WAIT_2. */ if (ourfinisacked) { /* * If we can't receive any more data, then closing * user can proceed. Starting the timer is contrary * to the specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; struct tcp_rack *rack; /* * Header prediction: check for the two common cases of a * uni-directional data xfer. If the packet has no control flags, * is in-sequence, the window didn't change and we're not * retransmitting, it's a candidate. If the length is zero and the * ack moved forward, we're the sender side of the xfer. Just free * the data acked & wake any higher level process that was blocked * waiting for space. If the length is non-zero and the ack didn't * move, we're the receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data toc The socket * buffer and note that we need a delayed ack. Make sure that the * hidden state-flags are also off. Since we check for * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. */ rack = (struct tcp_rack *)tp->t_fb_ptr; if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && __predict_true(SEGQ_EMPTY(tp)) && __predict_true(th->th_seq == tp->rcv_nxt)) { if (tlen == 0) { if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { return (0); } } else { if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, tiwin, nxt_pkt, iptos)) { return (0); } } } ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. 
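 *
 * (A worked PAWS example with assumed values: if ts_recent is 1000 and
 * the arriving segment echoes to_tsval 900, TSTMP_LT() is true and the
 * segment looks like it came from an earlier window; ctf_ts_check()
 * below then makes the final call, e.g. a ts_recent that is itself far
 * too old is not trusted to reject the segment.)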
*/ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } /* State changes only happen in rack_process_data() */ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. 
NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static int rack_check_data_after_close(struct mbuf *m, struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) { struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->rc_allow_data_af_clo == 0) { close_now: tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); /* tcp_close will kill the inp pre-log the Reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); tp = tcp_close(tp); KMOD_TCPSTAT_INC(tcps_rcvafterclose); ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); return (1); } if (sbavail(&so->so_snd) == 0) goto close_now; /* Ok we allow data that is ignored and a followup reset */ tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); tp->rcv_nxt = th->th_seq + *tlen; tp->t_flags2 |= TF2_DROP_AF_DATA; rack->r_wanted_output = 1; *tlen = 0; return (0); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. */ static int rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; int32_t ourfinisacked = 0; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. 
*/ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { /* * If we can't receive any more data, then closing user can * proceed. Starting the timer is contrary to the * specification, but if we don't get a FIN we'll hang * forever. * * XXXjl: we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. 
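 *
 * (Callers rely on this convention to know whether the inpcb lock is
 * still held; for instance rack_do_segment() only performs its wakeup
 * and INP_WUNLOCK() when rack_do_segment_nounlock() returns 0.)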
*/ static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; int32_t ourfinisacked = 0; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { tcp_twstart(tp); m_freem(m); return (1); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. 
*/ static int rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; int32_t ourfinisacked = 0; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * case TCPS_LAST_ACK: Ack processing. */ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (ourfinisacked) { tp = tcp_close(tp); ctf_do_drop(m, tp); return (1); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still * locked. 
*/ static int rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; int32_t ourfinisacked = 0; ctf_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* * If new data are received on a connection after the user processes * are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tlen) { if (rack_check_data_after_close(m, tp, &tlen, th, so)) return (1); } /* * If last ACK falls within this segment's sequence numbers, record * its timestamp. NOTE: 1) That the test incorporates suggestions * from the latest proposal of the tcplw@cray.com list (Braden * 1993/04/26). 2) That updating only on newer timestamps interferes * with our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. 3) That we * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + * SEG.Len, This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated Vol. 2 * p.869. In such cases, we can still calculate the RTT correctly * when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN | TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_flags & TF_NEEDSYN) { return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); return (0); } } /* * Ack processing. 
*/ if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } if (sbavail(&so->so_snd)) { if (ctf_progress_timeout_check(tp, true)) { rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, tp, tick, PROGRESS_DROP, __LINE__); tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } static void inline rack_clear_rate_sample(struct tcp_rack *rack) { rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; rack->r_ctl.rack_rs.rs_rtt_cnt = 0; rack->r_ctl.rack_rs.rs_rtt_tot = 0; } static void rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) { uint64_t bw_est, rate_wanted; int chged = 0; uint32_t user_max; user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) chged = 1; rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); if (rack->use_fixed_rate || rack->rc_force_max_seg) { if (user_max != rack->r_ctl.rc_pace_max_segs) chged = 1; } if (rack->rc_force_max_seg) { rack->r_ctl.rc_pace_max_segs = user_max; } else if (rack->use_fixed_rate) { bw_est = rack_get_bw(rack); if ((rack->r_ctl.crte == NULL) || (bw_est != rack->r_ctl.crte->rate)) { rack->r_ctl.rc_pace_max_segs = user_max; } else { /* We are pacing right at the hardware rate */ uint32_t segsiz; segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( tp, bw_est, segsiz, 0, rack->r_ctl.crte, NULL); } } else if (rack->rc_always_pace) { if (rack->r_ctl.gp_bw || #ifdef NETFLIX_PEAKRATE rack->rc_tp->t_maxpeakrate || #endif rack->r_ctl.init_rate) { /* We have a rate of some sort set */ uint32_t orig; bw_est = rack_get_bw(rack); orig = rack->r_ctl.rc_pace_max_segs; rate_wanted = rack_get_output_bw(rack, bw_est, NULL); if (rate_wanted) { /* We have something */ rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rate_wanted, ctf_fixed_maxseg(rack->rc_tp)); } else rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; if (orig != rack->r_ctl.rc_pace_max_segs) chged = 1; } else if ((rack->r_ctl.gp_bw == 0) && (rack->r_ctl.rc_pace_max_segs == 0)) { /* * If we have nothing limit us to bursting * out IW sized pieces. */ chged = 1; rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); } } if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { chged = 1; rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; } if (chged) rack_log_type_hrdwtso(tp, rack, 0, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2); } static int rack_init(struct tcpcb *tp) { struct tcp_rack *rack = NULL; struct rack_sendmap *insret; uint32_t iwin, snt, us_cts; tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); if (tp->t_fb_ptr == NULL) { /* * We need to allocate memory but cant. The INP and INP_INFO * locks and they are recusive (happens during setup. 
So a * scheme to drop the locks fails :( * */ return (ENOMEM); } memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); rack = (struct tcp_rack *)tp->t_fb_ptr; RB_INIT(&rack->r_ctl.rc_mtree); TAILQ_INIT(&rack->r_ctl.rc_free); TAILQ_INIT(&rack->r_ctl.rc_tmap); rack->rc_tp = tp; if (tp->t_inpcb) { rack->rc_inp = tp->t_inpcb; } /* Probably not needed but lets be sure */ rack_clear_rate_sample(rack); rack->r_ctl.rc_reorder_fade = rack_reorder_fade; rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; if (use_rack_rr) rack->use_rack_rr = 1; if (V_tcp_delack_enabled) tp->t_delayed_ack = 1; else tp->t_delayed_ack = 0; if (rack_enable_shared_cwnd) rack->rack_enable_scwnd = 1; rack->rc_user_set_max_segs = rack_hptsi_segments; rack->rc_force_max_seg = 0; if (rack_use_imac_dack) rack->rc_dack_mode = 1; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; rack->r_ctl.rc_prop_rate = rack_proportional_rate; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; rack->r_ctl.rc_early_recovery = rack_early_recovery; rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; rack->r_ctl.rc_highest_us_rtt = 0; if (rack_disable_prr) rack->rack_no_prr = 1; if (rack_gp_no_rec_chg) rack->rc_gp_no_rec_chg = 1; rack->rc_always_pace = rack_pace_every_seg; if (rack_enable_mqueue_for_nonpaced) rack->r_mbuf_queue = 1; else rack->r_mbuf_queue = 0; if (rack->r_mbuf_queue || rack->rc_always_pace) tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; else tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; rack_set_pace_segments(tp, rack, __LINE__); if (rack_limits_scwnd) rack->r_limit_scw = 1; else rack->r_limit_scw = 0; rack->r_ctl.rc_high_rwnd = tp->snd_wnd; rack->r_ctl.cwnd_to_use = tp->snd_cwnd; rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; rack->rack_tlp_threshold_use = rack_tlp_threshold_use; rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; microuptime(&rack->r_ctl.act_rcv_time); rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; rack->r_running_late = 0; rack->r_running_early = 0; rack->rc_init_win = rack_default_init_window; rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; if (rack_do_dyn_mul) { /* When dynamic adjustment is on CA needs to start at 100% */ rack->rc_gp_dyn_mul = 1; if (rack_do_dyn_mul >= 100) rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; } else rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, rack_probertt_filter_life); us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_of_last_probertt = us_cts; rack->r_ctl.rc_time_probertt_starts = 0; /* Do we force on detection? 
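 * (This refers to the experimental SACK-attack detection support: when
 * tcp_force_detection is non-zero, do_detection is switched on for
 * every connection here rather than being left off by default.)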
*/ #ifdef NETFLIX_EXP_DETECTION if (tcp_force_detection) rack->do_detection = 1; else #endif rack->do_detection = 0; if (rack_non_rxt_use_cr) rack->rack_rec_nonrxt_use_cr = 1; if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; rsm = rack_alloc(rack); if (rsm == NULL) { uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; return (ENOMEM); } rsm->r_flags = RACK_OVERMAX; rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = tp->snd_una; if (tp->t_flags & TF_SENTFIN) { rsm->r_end = tp->snd_max - 1; rsm->r_flags |= RACK_HAS_FIN; } else { rsm->r_end = tp->snd_max; } rsm->usec_orig_send = us_cts; rsm->r_dupack = 0; insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", insret, rack, rsm); } #endif TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } /* Cancel the GP measurement in progress */ tp->t_flags &= ~TF_GPUTINPROG; if (SEQ_GT(tp->snd_max, tp->iss)) snt = tp->snd_max - tp->iss; else snt = 0; iwin = rc_init_window(rack); if (snt < iwin) { /* We are not past the initial window * so we need to make sure cwnd is * correct. */ if (tp->snd_cwnd < iwin) tp->snd_cwnd = iwin; /* * If we are within the initial window * we want ssthresh to be unlimited. Setting * it to the rwnd (which the default stack does * and older racks) is not really a good idea * since we want to be in SS and grow both the * cwnd and the rwnd (via dynamic rwnd growth). If * we set it to the rwnd then as the peer grows its * rwnd we will be stuck in CA and never hit SS. * * Its far better to raise it up high (this takes the * risk that there as been a loss already, probably * we should have an indicator in all stacks of loss * but we don't), but considering the normal use this * is a risk worth taking. The consequences of not * hitting SS are far worse than going one more time * into it early on (before we have sent even a IW). * It is highly unlikely that we will have had a loss * before getting the IW out. */ tp->snd_ssthresh = 0xffffffff; } rack_stop_all_timers(tp); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); rack_log_rtt_shrinks(rack, us_cts, 0, __LINE__, RACK_RTTS_INIT); return (0); } static int rack_handoff_ok(struct tcpcb *tp) { if ((tp->t_state == TCPS_CLOSED) || (tp->t_state == TCPS_LISTEN)) { /* Sure no problem though it may not stick */ return (0); } if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) { /* * We really don't know if you support sack, * you have to get to ESTAB or beyond to tell. */ return (EAGAIN); } if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { /* * Rack will only send a FIN after all data is acknowledged. * So in this case we have more data outstanding. We can't * switch stacks until either all data and only the FIN * is left (in which case rack_init() now knows how * to deal with that) all is acknowledged and we * are only left with incoming data, though why you * would want to switch to rack after all data is acknowledged * I have no idea (rrs)! */ return (EAGAIN); } if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ return (0); } /* * If we reach here we don't do SACK on this connection so we can * never do rack. 
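 *
 * (RACK's loss detection is driven off the SACK scoreboard, so unless
 * the rack_sack_not_required override is set, a connection that did
 * not negotiate SACK is refused here with EINVAL and stays on its
 * current stack.)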
*/ return (EINVAL); } static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) { if (tp->t_fb_ptr) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm, *rm; rack = (struct tcp_rack *)tp->t_fb_ptr; #ifdef NETFLIX_SHARED_CWND if (rack->r_ctl.rc_scw) { uint32_t limit; if (rack->r_limit_scw) limit = max(1, rack->r_ctl.rc_lowest_us_rtt); else limit = 0; tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index, limit); rack->r_ctl.rc_scw = NULL; } #endif /* rack does not use force data but other stacks may clear it */ tp->t_flags &= ~TF_FORCEDATA; if (tp->t_inpcb) { tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; } #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS if (rm != rsm) { panic("At fini, rack:%p rsm:%p rm:%p", rack, rsm, rm); } #endif uma_zfree(rack_zone, rsm); } rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); while (rsm) { TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); uma_zfree(rack_zone, rsm); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); } rack->rc_free_cnt = 0; uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } /* Cancel the GP measurement in progress */ tp->t_flags &= ~TF_GPUTINPROG; /* Make sure snd_nxt is correctly set */ tp->snd_nxt = tp->snd_max; } static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) { switch (tp->t_state) { case TCPS_SYN_SENT: rack->r_state = TCPS_SYN_SENT; rack->r_substate = rack_do_syn_sent; break; case TCPS_SYN_RECEIVED: rack->r_state = TCPS_SYN_RECEIVED; rack->r_substate = rack_do_syn_recv; break; case TCPS_ESTABLISHED: rack_set_pace_segments(tp, rack, __LINE__); rack->r_state = TCPS_ESTABLISHED; rack->r_substate = rack_do_established; break; case TCPS_CLOSE_WAIT: rack->r_state = TCPS_CLOSE_WAIT; rack->r_substate = rack_do_close_wait; break; case TCPS_FIN_WAIT_1: rack->r_state = TCPS_FIN_WAIT_1; rack->r_substate = rack_do_fin_wait_1; break; case TCPS_CLOSING: rack->r_state = TCPS_CLOSING; rack->r_substate = rack_do_closing; break; case TCPS_LAST_ACK: rack->r_state = TCPS_LAST_ACK; rack->r_substate = rack_do_lastack; break; case TCPS_FIN_WAIT_2: rack->r_state = TCPS_FIN_WAIT_2; rack->r_substate = rack_do_fin_wait_2; break; case TCPS_LISTEN: case TCPS_CLOSED: case TCPS_TIME_WAIT: default: break; }; } static void rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) { /* * We received an ack, and then did not * call send or were bounced out due to the * hpts was running. Now a timer is up as well, is * it the right timer? */ struct rack_sendmap *rsm; int tmr_up; tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) return; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && (tmr_up == PACE_TMR_RXT)) { /* Should be an RXT */ return; } if (rsm == NULL) { /* Nothing outstanding? */ if (tp->t_flags & TF_DELACK) { if (tmr_up == PACE_TMR_DELACK) /* We are supposed to have delayed ack up and we do */ return; } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { /* * if we hit enobufs then we would expect the possiblity * of nothing outstanding and the RXT up (and the hptsi timer). 
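 *
 * (Put differently, the pairings this audit treats as sane are roughly:
 * persist state with the persist timer, nothing outstanding with a
 * delayed-ACK or keep-alive timer (or an RXT after ENOBUFS), and
 * outstanding data with a RACK, TLP or RXT timer; anything else causes
 * the timer to be torn down and re-armed below.)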
*/ return; } else if (((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)) && (tmr_up == PACE_TMR_KEEP) && (tp->snd_max == tp->snd_una)) { /* We should have keep alive up and we do */ return; } } if (SEQ_GT(tp->snd_max, tp->snd_una) && ((tmr_up == PACE_TMR_TLP) || (tmr_up == PACE_TMR_RACK) || (tmr_up == PACE_TMR_RXT))) { /* * Either a Rack, TLP or RXT is fine if we * have outstanding data. */ return; } else if (tmr_up == PACE_TMR_DELACK) { /* * If the delayed ack was going to go off * before the rtx/tlp/rack timer were going to * expire, then that would be the timer in control. * Note we don't check the time here trusting the * code is correct. */ return; } /* * Ok the timer originally started is not what we want now. * We will force the hpts to be stopped if any, and restart * with the slot set to what was in the saved slot. */ if (rack->rc_inp->inp_in_hpts) { if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { uint32_t us_cts; us_cts = tcp_get_usecs(NULL); if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { rack->r_early = 1; rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); } rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; } tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); } rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); } static int rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv) { int32_t thflags, retval, did_out = 0; int32_t way_out = 0; uint32_t cts; uint32_t tiwin; struct timespec ts; struct tcpopt to; struct tcp_rack *rack; struct rack_sendmap *rsm; int32_t prev_state = 0; uint32_t us_cts; /* * tv passed from common code is from either M_TSTMP_LRO or * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The * rack_pacing stack assumes tv always refers to 'now', so we overwrite * tv here to guarantee that. 
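 *
 * (The arrival time itself is not lost by doing this: just below the
 * mbuf timestamp is re-read into r_ctl.act_rcv_time, so pacing
 * decisions use 'now' while the recorded receive time still reflects
 * when the packet actually arrived.)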
*/ if (m->m_flags & M_TSTMP_LRO) tcp_get_usecs(tv); cts = tcp_tv_to_mssectick(tv); rack = (struct tcp_rack *)tp->t_fb_ptr; if ((m->m_flags & M_TSTMP) || (m->m_flags & M_TSTMP_LRO)) { mbuf_tstmp2timespec(m, &ts); rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; } else rack->r_ctl.act_rcv_time = *tv; kern_prefetch(rack, &prev_state); prev_state = 0; thflags = th->th_flags; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval ltv; #ifdef NETFLIX_HTTP_LOGGING struct http_sendfile_track *http_req; if (SEQ_GT(th->th_ack, tp->snd_una)) { http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); } else { http_req = tcp_http_find_req_for_seq(tp, th->th_ack); } #endif memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; if (rack->rack_no_prr == 0) log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; else log.u_bbr.flex1 = 0; log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; log.u_bbr.flex3 = m->m_flags; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; if (m->m_flags & M_TSTMP) { /* Record the hardware timestamp if present */ mbuf_tstmp2timespec(m, &ts); ltv.tv_sec = ts.tv_sec; ltv.tv_usec = ts.tv_nsec / 1000; log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); } else if (m->m_flags & M_TSTMP_LRO) { /* Record the LRO the arrival timestamp */ mbuf_tstmp2timespec(m, &ts); ltv.tv_sec = ts.tv_sec; ltv.tv_usec = ts.tv_nsec / 1000; log.u_bbr.flex5 = tcp_tv_to_usectick(<v); } log.u_bbr.timeStamp = tcp_get_usecs(<v); /* Log the rcv time */ log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; #ifdef NETFLIX_HTTP_LOGGING log.u_bbr.applimited = tp->t_http_closed; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_http_open; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_http_req; if (http_req) { /* Copy out any client req info */ /* seconds */ log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); /* useconds */ log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); log.u_bbr.rttProp = http_req->timestamp; log.u_bbr.cur_del_rate = http_req->start; if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { log.u_bbr.flex8 |= 1; } else { log.u_bbr.flex8 |= 2; log.u_bbr.bw_inuse = http_req->end; } log.u_bbr.flex6 = http_req->start_seq; if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { log.u_bbr.flex8 |= 4; log.u_bbr.epoch = http_req->end_seq; } } #endif TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, &log, true, <v); } if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { way_out = 4; retval = 0; goto done_with_input; } /* * If a segment with the ACK-bit set arrives in the SYN-SENT state * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return(1); } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? 
TO_SYN : 0); /* * If timestamps were negotiated during SYN/ACK and a * segment without a timestamp is received, silently drop * the segment, unless it is a RST segment or missing timestamps are * tolerated. * See section 3.2 of RFC 7323. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { way_out = 5; retval = 0; goto done_with_input; } /* * Segment received on connection. Reset idle time and keep-alive * timer. XXX: This should be done after segment validation to * ignore broken/spoofed segs. */ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una) && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { counter_u64_add(rack_input_idle_reduces, 1); rack_cc_after_idle(rack, tp); } tp->t_rcvtime = ticks; /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. */ tiwin = th->th_win << tp->snd_scale; #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif if (tiwin > rack->r_ctl.rc_high_rwnd) rack->r_ctl.rc_high_rwnd = tiwin; /* * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move * this to occur after we've validated the segment. */ if (tp->t_flags2 & TF2_ECN_PERMIT) { if (thflags & TH_CWR) { tp->t_flags2 &= ~TF2_ECN_SND_ECE; tp->t_flags |= TF_ACKNOW; } switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags2 |= TF2_ECN_SND_ECE; KMOD_TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: KMOD_TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: KMOD_TCPSTAT_INC(tcps_ecn_ect1); break; } /* Process a packet differently from RFC3168. */ cc_ecnpkt_handler(tp, th, iptos); /* Congestion experienced. */ if (thflags & TH_ECE) { rack_cong_signal(tp, th, CC_ECN); } } /* * If echoed timestamp is later than the current time, fall back to * non RFC1323 RTT calculation. Normalize timestamp if syncookies * were used when this connection was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, cts)) to.to_tsecr = 0; } /* * If its the first time in we need to take care of options and * verify we can do SACK for rack! */ if (rack->r_state == 0) { /* Should be init'd by rack_init() */ KASSERT(rack->rc_inp != NULL, ("%s: rack->rc_inp unexpectedly NULL", __func__)); if (rack->rc_inp == NULL) { rack->rc_inp = tp->t_inpcb; } /* * Process options only when we get SYN/ACK back. The SYN * case for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. XXX * this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { /* Handle parallel SYN for ECN */ if (!(thflags & TH_ACK) && ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { tp->t_flags2 |= TF2_ECN_PERMIT; tp->t_flags2 |= TF2_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_shs); } if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } else tp->t_flags &= ~TF_REQ_SCALE; /* * Initial send window. It will be updated with the * next incoming segment to the scaled value. 
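 *
 * (Window scaling worked example, values assumed: the window field of a
 * SYN is never scaled, so it is taken verbatim just below; once a scale
 * of, say, to_wscale = 7 has been negotiated, a later header window of
 * 512 is expanded earlier in this function as 512 << 7 = 65536 bytes.)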
*/ tp->snd_wnd = th->th_win; if ((to.to_flags & TOF_TS) && (tp->t_flags & TF_REQ_TSTMP)) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = cts; } else tp->t_flags &= ~TF_REQ_TSTMP; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; if (IS_FASTOPEN(tp->t_flags)) { if (to.to_flags & TOF_FASTOPEN) { uint16_t mss; if (to.to_flags & TOF_MSS) mss = to.to_mss; else if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) mss = TCP6_MSS; else mss = TCP_MSS; tcp_fastopen_update_cache(tp, mss, to.to_tfo_len, to.to_tfo_cookie); } else tcp_fastopen_disable_path(tp); } } /* * At this point we are at the initial call. Here we decide * if we are doing RACK or not. We do this by seeing if * TF_SACK_PERMIT is set and the sack-not-required is clear. * The code now does do dup-ack counting so if you don't * switch back you won't get rack & TLP, but you will still * get this stack. */ if ((rack_sack_not_required == 0) && ((tp->t_flags & TF_SACK_PERMIT) == 0)) { tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, tlen, iptos); return (1); } /* Set the flag */ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } if (thflags & TH_FIN) tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); if ((rack->rc_gp_dyn_mul) && (rack->use_fixed_rate == 0) && (rack->rc_always_pace)) { /* Check in on probertt */ rack_check_probe_rtt(rack, us_cts); } if (rack->forced_ack) { uint32_t us_rtt; /* * A persist or keep-alive was forced out, update our * min rtt time. Note we do not worry about lost * retransmissions since KEEP-ALIVES and persists * are usually way long on times of sending (though * if we were really paranoid or worried we could * at least use timestamps if available to validate). */ rack->forced_ack = 0; us_rtt = us_cts - rack->r_ctl.forced_ack_ts; if (us_rtt == 0) us_rtt = 1; rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); rack_apply_updated_usrtt(rack, us_rtt, us_cts); } /* * This is the one exception case where we set the rack state * always. All other times (timers etc) we must have a rack-state * set (so we assure we have done the checks above for SACK). */ rack->r_ctl.rc_rcvtime = cts; if (rack->r_state != tp->t_state) rack_set_state(tp, rack); if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) kern_prefetch(rsm, &prev_state); prev_state = rack->r_state; rack_clear_rate_sample(rack); retval = (*rack->r_substate) (m, th, so, tp, &to, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt, iptos); #ifdef INVARIANTS if ((retval == 0) && (tp->t_inpcb == NULL)) { panic("retval:%d tp:%p t_inpcb:NULL state:%d", retval, tp, prev_state); } #endif if (retval == 0) { /* * If retval is 1 the tcb is unlocked and most likely the tp * is gone. */ INP_WLOCK_ASSERT(tp->t_inpcb); if ((rack->rc_gp_dyn_mul) && (rack->rc_always_pace) && (rack->use_fixed_rate == 0) && rack->in_probe_rtt && (rack->r_ctl.rc_time_probertt_starts == 0)) { /* * If we are going for target, lets recheck before * we output. */ rack_check_probe_rtt(rack, us_cts); } if (rack->set_pacing_done_a_iw == 0) { /* How much has been acked? 
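 *
 * (With an assumed maxseg of 1448 bytes the check below fires once more
 * than 14480 bytes, roughly one default initial window, have been
 * cumulatively acknowledged; at that point set_pacing_done_a_iw is set
 * and rack_set_pace_segments() is invoked once more.)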
*/ if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { /* We have enough to set in the pacing segment size */ rack->set_pacing_done_a_iw = 1; rack_set_pace_segments(tp, rack, __LINE__); } } tcp_rack_xmit_timer_commit(rack, tp); if (nxt_pkt == 0) { if (rack->r_wanted_output != 0) { do_output_now: did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); } if ((nxt_pkt == 0) && ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && (SEQ_GT(tp->snd_max, tp->snd_una) || (tp->t_flags & TF_DELACK) || ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING)))) { /* We could not send (probably in the hpts but stopped the timer earlier)? */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && (rack->rc_inp->inp_in_hpts) && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* keep alive not needed if we are hptsi output yet */ ; } else { int late = 0; if (rack->rc_inp->inp_in_hpts) { if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { us_cts = tcp_get_usecs(NULL); if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { rack->r_early = 1; rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); } else late = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; } tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); } if (late && (did_out == 0)) { /* * We are late in the sending * and we did not call the output * (this probably should not happen). */ goto do_output_now; } rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); } way_out = 1; } else if (nxt_pkt == 0) { /* Do we have the correct timer running? */ rack_timer_audit(tp, rack, &so->so_snd); way_out = 2; } done_with_input: rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); if (did_out) rack->r_wanted_output = 0; #ifdef INVARIANTS if (tp->t_inpcb == NULL) { panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", did_out, retval, tp, prev_state); } #endif } return (retval); } void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { struct timeval tv; /* First lets see if we have old packets */ if (tp->t_in_pkt) { if (ctf_do_queued_segments(so, tp, 1)) { m_freem(m); return; } } if (m->m_flags & M_TSTMP_LRO) { tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; } else { /* Should not be should we kassert instead? */ tcp_get_usecs(&tv); } if(rack_do_segment_nounlock(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv) == 0) { tcp_handle_wakeup(tp, so); INP_WUNLOCK(tp->t_inpcb); } } struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) { struct rack_sendmap *rsm = NULL; int32_t idx; uint32_t srtt = 0, thresh = 0, ts_low = 0; /* Return the next guy to be re-transmitted */ if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { return (NULL); } if (tp->t_flags & TF_SENTFIN) { /* retran the end FIN? 
*/ return (NULL); } /* ok lets look at this one */ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { goto check_it; } rsm = rack_find_lowest_rsm(rack); if (rsm == NULL) { return (NULL); } check_it: if (rsm->r_flags & RACK_ACKED) { return (NULL); } if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && (rsm->r_dupack < DUP_ACK_THRESHOLD)) { /* Its not yet ready */ return (NULL); } srtt = rack_grab_rtt(tp, rack); idx = rsm->r_rtr_cnt - 1; ts_low = rsm->r_tim_lastsent[idx]; thresh = rack_calc_thresh_rack(rack, srtt, tsused); if ((tsused == ts_low) || (TSTMP_LT(tsused, ts_low))) { /* No time since sending */ return (NULL); } if ((tsused - ts_low) < thresh) { /* It has not been long enough yet */ return (NULL); } if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || ((rsm->r_flags & RACK_SACK_PASSED) && (rack->sack_attack_disable == 0))) { /* * We have passed the dup-ack threshold * a SACK has indicated this is missing. * Note that if you are a declared attacker * it is only the dup-ack threshold that * will cause retransmits. */ /* log retransmit reason */ rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); return (rsm); } return (NULL); } static void rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = slot; log.u_bbr.flex2 = len; log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; log.u_bbr.use_lt_bw = rack->app_limited_needs_set; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw = rack->rc_gp_filled; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; log.u_bbr.use_lt_bw <<= 1; log.u_bbr.use_lt_bw |= rack->in_probe_rtt; log.u_bbr.pkt_epoch = line; log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; log.u_bbr.bw_inuse = bw_est; log.u_bbr.delRate = bw; if (rack->r_ctl.gp_bw == 0) log.u_bbr.cur_del_rate = 0; else log.u_bbr.cur_del_rate = rack_get_bw(rack); log.u_bbr.rttProp = len_time; log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { /* We are in slow start */ log.u_bbr.flex7 = 1; } else { /* we are on congestion avoidance */ log.u_bbr.flex7 = 0; } log.u_bbr.flex8 = method; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; log.u_bbr.cwnd_gain <<= 1; log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; log.u_bbr.cwnd_gain <<= 1; log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_HPTSI_CALC, 0, 0, &log, false, &tv); } } static uint32_t rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) { uint32_t new_tso, user_max; user_max = rack->rc_user_set_max_segs * mss; if (rack->rc_force_max_seg) { return (user_max); } if (rack->use_fixed_rate && ((rack->r_ctl.crte == NULL) || (bw != rack->r_ctl.crte->rate))) { /* Use the user mss since we are not exactly matched */ return (user_max); } new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, 
rack_pace_one_seg, rack->r_ctl.crte, NULL); if (new_tso > user_max) new_tso = user_max; return(new_tso); } static void rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, uint64_t rate, uint64_t hw_rate, int line, int error) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.bw_inuse = rate; log.u_bbr.flex5 = line; log.u_bbr.flex6 = error; log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; log.u_bbr.flex8 = rack->use_fixed_rate; log.u_bbr.flex8 <<= 1; log.u_bbr.flex8 |= rack->rack_hdrw_pacing; log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_HDWR_PACE, 0, 0, &log, false, &tv); } } static int32_t pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz) { uint64_t lentim, fill_bw; /* Lets first see if we are full, if so continue with normal rate */ if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) return (slot); if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) return (slot); if (rack->r_ctl.rc_last_us_rtt == 0) return (slot); if (rack->rc_pace_fill_if_rttin_range && (rack->r_ctl.rc_last_us_rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { /* The rtt is huge, N * smallest, lets not fill */ return (slot); } /* * first lets calculate the b/w based on the last us-rtt * and the sndwnd. */ fill_bw = rack->r_ctl.cwnd_to_use; /* Take the rwnd if its smaller */ if (fill_bw > rack->rc_tp->snd_wnd) fill_bw = rack->rc_tp->snd_wnd; fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; /* We are below the min b/w */ if (fill_bw < RACK_MIN_BW) return (slot); /* * Ok fill_bw holds our mythical b/w to fill the cwnd * in a rtt, what does that time wise equate too? */ lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; lentim /= fill_bw; if (lentim < slot) { rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 0, lentim, 12, __LINE__, NULL); return ((int32_t)lentim); } else return (slot); } static int32_t rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) { struct rack_sendmap *lrsm; int32_t slot = 0; int err; if (rack->rc_always_pace == 0) { /* * We use the most optimistic possible cwnd/srtt for * sending calculations. This will make our * calculation anticipate getting more through * quicker then possible. But thats ok we don't want * the peer to have a gap in data sending. */ uint32_t srtt, cwnd, tr_perms = 0; int32_t reduce = 0; old_method: /* * We keep no precise pacing with the old method * instead we use the pacer to mitigate bursts. 
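 * As a rough sketch of the arithmetic below: with a 100000 byte cwnd
 * and a 100 ms srtt, tr_perms comes out to about 1000 bytes per
 * millisecond, so a 10000 byte send yields a slot of about 10 ms
 * (minus any slot reduction) before conversion to microseconds.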
*/ rack->r_ctl.rc_agg_delayed = 0; rack->r_early = 0; rack->r_late = 0; rack->r_ctl.rc_agg_early = 0; if (rack->r_ctl.rc_rack_min_rtt) srtt = rack->r_ctl.rc_rack_min_rtt; else srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); if (rack->r_ctl.rc_rack_largest_cwnd) cwnd = rack->r_ctl.rc_rack_largest_cwnd; else cwnd = rack->r_ctl.cwnd_to_use; tr_perms = cwnd / srtt; if (tr_perms == 0) { tr_perms = ctf_fixed_maxseg(tp); } /* * Calculate how long this will take to drain, if * the calculation comes out to zero, thats ok we * will use send_a_lot to possibly spin around for * more increasing tot_len_this_send to the point * that its going to require a pace, or we hit the * cwnd. Which in that case we are just waiting for * a ACK. */ slot = len / tr_perms; /* Now do we reduce the time so we don't run dry? */ if (slot && rack_slot_reduction) { reduce = (slot / rack_slot_reduction); if (reduce < slot) { slot -= reduce; } else slot = 0; } slot *= HPTS_USEC_IN_MSEC; if (rsm == NULL) { /* * We always consider ourselves app limited with old style * that are not retransmits. This could be the initial * measurement, but thats ok its all setup and specially * handled. If another send leaks out, then that too will * be mark app-limited. */ lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { rack->r_ctl.rc_first_appl = lrsm; lrsm->r_flags |= RACK_APP_LIMITED; rack->r_ctl.rc_app_limited_cnt++; } } rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); } else { uint64_t bw_est, res, lentim, rate_wanted; uint32_t orig_val, srtt, segs, oh; if ((rack->r_rr_config == 1) && rsm) { return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC); } if (rack->use_fixed_rate) { rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); } else if ((rack->r_ctl.init_rate == 0) && #ifdef NETFLIX_PEAKRATE (rack->rc_tp->t_maxpeakrate == 0) && #endif (rack->r_ctl.gp_bw == 0)) { /* no way to yet do an estimate */ bw_est = rate_wanted = 0; } else { bw_est = rack_get_bw(rack); rate_wanted = rack_get_output_bw(rack, bw_est, rsm); } if ((bw_est == 0) || (rate_wanted == 0)) { /* * No way yet to make a b/w estimate or * our raise is set incorrectly. */ goto old_method; } /* We need to account for all the overheads */ segs = (len + segsiz - 1) / segsiz; /* * We need the diff between 1514 bytes (e-mtu with e-hdr) * and how much data we put in each packet. Yes this * means we may be off if we are larger than 1500 bytes * or smaller. But this just makes us more conservative. */ if (ETHERNET_SEGMENT_SIZE > segsiz) oh = ETHERNET_SEGMENT_SIZE - segsiz; else oh = 0; segs *= oh; lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; res = lentim / rate_wanted; slot = (uint32_t)res; orig_val = rack->r_ctl.rc_pace_max_segs; rack_set_pace_segments(rack->rc_tp, rack, __LINE__); /* Did we change the TSO size, if so log it */ if (rack->r_ctl.rc_pace_max_segs != orig_val) rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); if ((rack->rc_pace_to_cwnd) && (rack->in_probe_rtt == 0) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { /* * We want to pace at our rate *or* faster to * fill the cwnd to the max if its not full. */ slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); } if ((rack->rc_inp->inp_route.ro_nh != NULL) && (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { if ((rack->rack_hdw_pace_ena) && (rack->rack_hdrw_pacing == 0) && (rack->rack_attempt_hdwr_pace == 0)) { /* * Lets attempt to turn on hardware pacing * if we can. 
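 * (Best effort, and only attempted once per path, gated by
 * rack_attempt_hdwr_pace: we ask the interface for a rate at least
 * equal to rate_wanted (RS_PACING_GEQ); if no rate entry comes back
 * we simply stay on software pacing with the slot computed above.)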
*/ rack->rack_attempt_hdwr_pace = 1; rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, rack->rc_inp->inp_route.ro_nh->nh_ifp, rate_wanted, RS_PACING_GEQ, &err, NULL); if (rack->r_ctl.crte) { rack->rack_hdrw_pacing = 1; rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rack->rc_tp, rate_wanted, segsiz, 0, rack->r_ctl.crte, NULL); rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, rate_wanted, rack->r_ctl.crte->rate, __LINE__, err); } } else if (rack->rack_hdrw_pacing && (rack->r_ctl.crte->rate != rate_wanted)) { /* Do we need to adjust our rate? */ const struct tcp_hwrate_limit_table *nrte; nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, rack->rc_tp, rack->rc_inp->inp_route.ro_nh->nh_ifp, rate_wanted, RS_PACING_GEQ, &err, NULL); if (nrte == NULL) { /* Lost the rate */ rack->rack_hdrw_pacing = 0; rack_set_pace_segments(rack->rc_tp, rack, __LINE__); } else if (nrte != rack->r_ctl.crte) { rack->r_ctl.crte = nrte; rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rack->rc_tp, rate_wanted, segsiz, 0, rack->r_ctl.crte, NULL); rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, rate_wanted, rack->r_ctl.crte->rate, __LINE__, err); } } } if (rack_limit_time_with_srtt && (rack->use_fixed_rate == 0) && #ifdef NETFLIX_PEAKRATE (rack->rc_tp->t_maxpeakrate == 0) && #endif (rack->rack_hdrw_pacing == 0)) { /* * Sanity check, we do not allow the pacing delay * to be longer than the SRTT of the path. If it is * a slow path, then adding a packet should increase * the RTT and compensate for this i.e. the srtt will * be greater so the allowed pacing time will be greater. * * Note this restriction is not for where a peak rate * is set, we are doing fixed pacing or hardware pacing. */ if (rack->rc_tp->t_srtt) srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); else srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ if (srtt < slot) { rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); slot = srtt; } } rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); } if (slot) counter_u64_add(rack_calc_nonzero, 1); else counter_u64_add(rack_calc_zero, 1); return (slot); } static void rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq startseq, uint32_t sb_offset) { struct rack_sendmap *my_rsm = NULL; struct rack_sendmap fe; if (tp->t_state < TCPS_ESTABLISHED) { /* * We don't start any measurements if we are * not at least established. */ return; } tp->t_flags |= TF_GPUTINPROG; rack->r_ctl.rc_gp_lowrtt = 0xffffffff; rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; tp->gput_seq = startseq; rack->app_limited_needs_set = 0; if (rack->in_probe_rtt) rack->measure_saw_probe_rtt = 1; else if ((rack->measure_saw_probe_rtt) && (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) rack->measure_saw_probe_rtt = 0; if (rack->rc_gp_filled) tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); else { /* Special case initial measurement */ rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); } /* * We take a guess out into the future, * if we have no measurement and no * initial rate, we measure the first * initial-windows worth of data to * speed up getting some GP measurement and * thus start pacing. 
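 * For example, with a 10 segment initial window and a 1460 byte
 * maxseg, gput_ack below lands roughly 14600 bytes past startseq (or
 * MIN_GP_WIN segments, whichever is larger), so the first estimate
 * becomes available as soon as that much data has been acked.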
*/ if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { rack->app_limited_needs_set = 1; tp->gput_ack = startseq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, 0, tp->gput_ts, rack->r_ctl.rc_app_limited_cnt, 9, __LINE__, NULL); return; } if (sb_offset) { /* * We are out somewhere in the sb * can we use the already outstanding data? */ if (rack->r_ctl.rc_app_limited_cnt == 0) { /* * Yes first one is good and in this case * the tp->gput_ts is correctly set based on * the last ack that arrived (no need to * set things up when an ack comes in). */ my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if ((my_rsm == NULL) || (my_rsm->r_rtr_cnt != 1)) { /* retransmission? */ goto use_latest; } } else { if (rack->r_ctl.rc_first_appl == NULL) { /* * If rc_first_appl is NULL * then the cnt should be 0. * This is probably an error, maybe * a KASSERT would be approprate. */ goto use_latest; } /* * If we have a marker pointer to the last one that is * app limited we can use that, but we need to set * things up so that when it gets ack'ed we record * the ack time (if its not already acked). */ rack->app_limited_needs_set = 1; /* * We want to get to the rsm that is either * next with space i.e. over 1 MSS or the one * after that (after the app-limited). */ my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rack->r_ctl.rc_first_appl); if (my_rsm) { if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) /* Have to use the next one */ my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, my_rsm); else { /* Use after the first MSS of it is acked */ tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); goto start_set; } } if ((my_rsm == NULL) || (my_rsm->r_rtr_cnt != 1)) { /* * Either its a retransmit or * the last is the app-limited one. */ goto use_latest; } } tp->gput_seq = my_rsm->r_start; start_set: if (my_rsm->r_flags & RACK_ACKED) { /* * This one has been acked use the arrival ack time */ tp->gput_ts = my_rsm->r_ack_arrival; rack->app_limited_needs_set = 0; } rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, (uint64_t)my_rsm, tp->gput_ts, rack->r_ctl.rc_app_limited_cnt, 9, __LINE__, NULL); return; } use_latest: /* * We don't know how long we may have been * idle or if this is the first-send. Lets * setup the flag so we will trim off * the first ack'd data so we get a true * measurement. */ rack->app_limited_needs_set = 1; tp->gput_ack = startseq + rack_get_measure_window(tp, rack); /* Find this guy so we can pull the send time */ fe.r_start = startseq; my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); if (my_rsm) { rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; if (my_rsm->r_flags & RACK_ACKED) { /* * Unlikely since its probably what was * just transmitted (but I am paranoid). */ tp->gput_ts = my_rsm->r_ack_arrival; rack->app_limited_needs_set = 0; } if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { /* This also is unlikely */ tp->gput_seq = my_rsm->r_start; } } else { /* * TSNH unless we have some send-map limit, * and even at that it should not be hitting * that limit (we should have stopped sending). 
*/ rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); } rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, (uint64_t)my_rsm, tp->gput_ts, rack->r_ctl.rc_app_limited_cnt, 9, __LINE__, NULL); } static inline uint32_t rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, uint32_t avail, int32_t sb_offset) { uint32_t len; uint32_t sendwin; if (tp->snd_wnd > cwnd_to_use) sendwin = cwnd_to_use; else sendwin = tp->snd_wnd; if (ctf_outstanding(tp) >= tp->snd_wnd) { /* We never want to go over our peers rcv-window */ len = 0; } else { uint32_t flight; flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); if (flight >= sendwin) { /* * We have in flight what we are allowed by cwnd (if * it was rwnd blocking it would have hit above out * >= tp->snd_wnd). */ return (0); } len = sendwin - flight; if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { /* We would send too much (beyond the rwnd) */ len = tp->snd_wnd - ctf_outstanding(tp); } if ((len + sb_offset) > avail) { /* * We don't have that much in the SB, how much is * there? */ len = avail - sb_offset; } } return (len); } static int rack_output(struct tcpcb *tp) { struct socket *so; uint32_t recwin; uint32_t sb_offset; int32_t len, flags, error = 0; struct mbuf *m; struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize; int32_t segsiz, minseg; long tot_len_this_send = 0; struct ip *ip = NULL; #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif struct udphdr *udp = NULL; struct tcp_rack *rack; struct tcphdr *th; uint8_t pass = 0; uint8_t mark = 0; uint8_t wanted_cookie = 0; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen, ulen=0; uint32_t rack_seq; #if defined(IPSEC) || defined(IPSEC_SUPPORT) unsigned ipsec_optlen = 0; #endif int32_t idle, sendalot; int32_t sub_from_prr = 0; volatile int32_t sack_rxmit; struct rack_sendmap *rsm = NULL; int32_t tso, mtu; struct tcpopt to; int32_t slot = 0; int32_t sup_rack = 0; uint32_t cts, us_cts, delayed, early; uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; uint32_t cwnd_to_use; int32_t do_a_prefetch; int32_t prefetch_rsm = 0; int32_t orig_len; struct timeval tv; int32_t prefetch_so_done = 0; struct tcp_log_buffer *lgb = NULL; struct inpcb *inp; struct sockbuf *sb; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif uint8_t filled_all = 0; bool hw_tls = false; /* setup and take the cache hits here */ rack = (struct tcp_rack *)tp->t_fb_ptr; inp = rack->rc_inp; so = inp->inp_socket; sb = &so->so_snd; kern_prefetch(sb, &do_a_prefetch); do_a_prefetch = 1; hpts_calling = inp->inp_hpts_calls; hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif /* * For TFO connections in SYN_RECEIVED, only allow the initial * SYN|ACK and those sent by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ return (0); #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ isipv6 = rack->r_is_v6; } else { isipv6 = (inp->inp_vflag & INP_IPV6) != 0; } #endif early = 0; us_cts = tcp_get_usecs(&tv); cts = tcp_tv_to_mssectick(&tv); if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && inp->inp_in_hpts) { /* * We are on the hpts for some timer but not hptsi output. * Remove from the hpts unconditionally. 
*/ rack_timer_cancel(tp, rack, cts, __LINE__); } /* Are we pacing and late? */ if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { /* We are delayed */ delayed = us_cts - rack->r_ctl.rc_last_output_to; } else { delayed = 0; } /* Do the timers, which may override the pacer */ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (rack_process_timers(tp, rack, cts, hpts_calling)) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); return (0); } } if ((rack->r_timer_override) || (delayed) || (tp->t_state < TCPS_ESTABLISHED)) { if (tp->t_inpcb->inp_in_hpts) tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); } else if (tp->t_inpcb->inp_in_hpts) { /* * On the hpts you can't pass even if ACKNOW is on, we will * when the hpts fires. */ counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); return (0); } inp->inp_hpts_calls = 0; /* Finish out both pacing early and late accounting */ if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { early = rack->r_ctl.rc_last_output_to - us_cts; } else early = 0; if (delayed) { rack->r_ctl.rc_agg_delayed += delayed; rack->r_late = 1; } else if (early) { rack->r_ctl.rc_agg_early += early; rack->r_early = 1; } /* Now that early/late accounting is done turn off the flag */ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; rack->r_wanted_output = 0; rack->r_timer_override = 0; /* * For TFO connections in SYN_SENT or SYN_RECEIVED, * only allow the initial SYN or SYN|ACK and those sent * by the retransmit timer. */ if (IS_FASTOPEN(tp->t_flags) && ((tp->t_state == TCPS_SYN_RECEIVED) || (tp->t_state == TCPS_SYN_SENT)) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ (tp->t_rxtshift == 0)) { /* not a retransmit */ cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; goto just_return_nolock; } /* * Determine length of data that should be transmitted, and flags * that will be used. If there is some data or critical controls * (SYN, RST) to send, then transmit; otherwise, investigate * further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) rack_cc_after_idle(rack, tp); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { if (tp->t_flags & TF_MORETOCOME) { tp->t_flags |= TF_LASTIDLE; idle = 0; } } if ((tp->snd_una == tp->snd_max) && rack->r_ctl.rc_went_idle_time && TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { idle = us_cts - rack->r_ctl.rc_went_idle_time; if (idle > rack_min_probertt_hold) { /* Count as a probe rtt */ if (rack->in_probe_rtt == 0) { rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; } else { rack_exit_probertt(rack, us_cts); } } idle = 0; } again: /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. 
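 * Put differently: after an RTO snd_nxt may have been pulled back
 * toward snd_una while snd_max still marks the highest sequence ever
 * sent, and any ranges in between already covered by SACKs need not
 * be sent again.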
*/ sendalot = 0; us_cts = tcp_get_usecs(&tv); cts = tcp_tv_to_mssectick(&tv); tso = 0; mtu = 0; segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); minseg = segsiz; sb_offset = tp->snd_max - tp->snd_una; cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; #ifdef NETFLIX_SHARED_CWND if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && rack->rack_enable_scwnd) { /* We are doing cwnd sharing */ if (rack->rc_gp_filled && (rack->rack_attempted_scwnd == 0) && (rack->r_ctl.rc_scw == NULL) && tp->t_lib) { /* The pcbid is in, lets make an attempt */ counter_u64_add(rack_try_scwnd, 1); rack->rack_attempted_scwnd = 1; rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, &rack->r_ctl.rc_scw_index, segsiz); } if (rack->r_ctl.rc_scw && (rack->rack_scwnd_is_idle == 1) && (rack->rc_in_persist == 0) && sbavail(sb)) { /* we are no longer out of data */ tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); rack->rack_scwnd_is_idle = 0; } if (rack->r_ctl.rc_scw) { /* First lets update and get the cwnd */ rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index, tp->snd_cwnd, tp->snd_wnd, segsiz); } } #endif flags = tcp_outflags[tp->t_state]; while (rack->rc_free_cnt < rack_free_cache) { rsm = rack_alloc(rack); if (rsm == NULL) { if (inp->inp_hpts_calls) /* Retry in a ms */ slot = (1 * HPTS_USEC_IN_MSEC); goto just_return_nolock; } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); rack->rc_free_cnt++; rsm = NULL; } if (inp->inp_hpts_calls) inp->inp_hpts_calls = 0; sack_rxmit = 0; len = 0; rsm = NULL; if (flags & TH_RST) { SOCKBUF_LOCK(sb); goto send; } if (rack->r_ctl.rc_resend) { /* Retransmit timer */ rsm = rack->r_ctl.rc_resend; rack->r_ctl.rc_resend = NULL; rsm->r_flags &= ~RACK_TLP; len = rsm->r_end - rsm->r_start; sack_rxmit = 1; sendalot = 0; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; if (len >= segsiz) len = segsiz; } else if ((rack->rc_in_persist == 0) && ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { /* We have a retransmit that takes precedence */ rsm->r_flags &= ~RACK_TLP; if ((!IN_RECOVERY(tp->t_flags)) && ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { /* Enter recovery if not induced by a time-out */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; rack_cong_signal(tp, NULL, CC_NDUPACK); /* * When we enter recovery we need to assure we send * one packet. */ if (rack->rack_no_prr == 0) { rack->r_ctl.rc_prr_sndcnt = segsiz; rack_log_to_prr(rack, 13, 0); } } #ifdef INVARIANTS if (SEQ_LT(rsm->r_start, tp->snd_una)) { panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", tp, rack, rsm, rsm->r_start, tp->snd_una); } #endif len = rsm->r_end - rsm->r_start; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; /* Can we send it within the PRR boundary? */ if (rack->rack_no_prr == 0) { if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { /* It does not fit */ if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && (rack->r_ctl.rc_prr_sndcnt < segsiz)) { /* * prr is less than a segment, we * have more acks due in besides * what we need to resend. 
Lets not send * to avoid sending small pieces of * what we need to retransmit. */ len = 0; goto just_return_nolock; } len = rack->r_ctl.rc_prr_sndcnt; } } sendalot = 0; if (len >= segsiz) len = segsiz; if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; KMOD_TCPSTAT_INC(tcps_sack_rexmits); KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, segsiz)); counter_u64_add(rack_rtm_prr_retran, 1); } } else if (rack->r_ctl.rc_tlpsend) { /* Tail loss probe */ long cwin; long tlen; doing_tlp = 1; /* * Check if we can do a TLP with a RACK'd packet * this can happen if we are not doing the rack * cheat and we skipped to a TLP and it * went off. */ rsm = rack->r_ctl.rc_tlpsend; rsm->r_flags |= RACK_TLP; rack->r_ctl.rc_tlpsend = NULL; sack_rxmit = 1; tlen = rsm->r_end - rsm->r_start; if (tlen > segsiz) tlen = segsiz; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; cwin = min(tp->snd_wnd, tlen); len = cwin; } /* * Enforce a connection sendmap count limit if set * as long as we are not retransmiting. */ if ((rsm == NULL) && (rack->do_detection == 0) && (V_tcp_map_entries_limit > 0) && (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { counter_u64_add(rack_to_alloc_limited, 1); if (!rack->alloc_limit_reported) { rack->alloc_limit_reported = 1; counter_u64_add(rack_alloc_limited_conns, 1); } goto just_return_nolock; } if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { /* we are retransmitting the fin */ len--; if (len) { /* * When retransmitting data do *not* include the * FIN. This could happen from a TLP probe. */ flags &= ~TH_FIN; } } #ifdef INVARIANTS /* For debugging */ rack->r_ctl.rc_rsm_at_retran = rsm; #endif /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ if (tp->t_flags & TF_NEEDFIN) flags |= TH_FIN; if (tp->t_flags & TF_NEEDSYN) flags |= TH_SYN; if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { void *end_rsm; end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (end_rsm) kern_prefetch(end_rsm, &prefetch_rsm); prefetch_rsm = 1; } SOCKBUF_LOCK(sb); /* * If snd_nxt == snd_max and we have transmitted a FIN, the * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a * negative length. This can also occur when TCP opens up its * congestion window while receiving additional duplicate acks after * fast-retransmit because TCP will reset snd_nxt to snd_max after * the fast-retransmit. * * In the normal retransmit-FIN-only case, however, snd_nxt will be * set to snd_una, the sb_offset will be 0, and the length may wind * up 0. * * If sack_rxmit is true we are retransmitting from the scoreboard * in which case len is already set. 
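 * For new data outside of recovery, the length chosen below works out
 * to roughly min(cwnd, rwnd) minus what is already in flight, clamped
 * to what actually remains in the socket buffer past sb_offset; e.g.
 * a 64 KB effective window with 48 KB outstanding allows at most
 * 16 KB more on this pass.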
*/ if ((sack_rxmit == 0) && (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) { uint32_t avail; avail = sbavail(sb); if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) sb_offset = tp->snd_nxt - tp->snd_una; else sb_offset = 0; if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { if (rack->r_ctl.rc_tlp_new_data) { /* TLP is forcing out new data */ if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); } if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) len = tp->snd_wnd; else len = rack->r_ctl.rc_tlp_new_data; rack->r_ctl.rc_tlp_new_data = 0; new_data_tlp = doing_tlp = 1; } else len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); if (IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) { /* * For prr=off, we need to send only 1 MSS * at a time. We do this because another sack could * be arriving that causes us to send retransmits and * we don't want to be on a long pace due to a larger send * that keeps us from sending out the retransmit. */ len = segsiz; } } else { uint32_t outstanding; /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible so far in the scoreboard. */ outstanding = tp->snd_max - tp->snd_una; if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { if (tp->snd_wnd > outstanding) { len = tp->snd_wnd - outstanding; /* Check to see if we have the data */ if ((sb_offset + len) > avail) { /* It does not all fit */ if (avail > sb_offset) len = avail - sb_offset; else len = 0; } } else len = 0; } else if (avail > sb_offset) len = avail - sb_offset; else len = 0; if (len > 0) { if (len > rack->r_ctl.rc_prr_sndcnt) len = rack->r_ctl.rc_prr_sndcnt; if (len > 0) { sub_from_prr = 1; counter_u64_add(rack_rtm_prr_newdata, 1); } } if (len > segsiz) { /* * We should never send more than a MSS when * retransmitting or sending new data in prr * mode unless the override flag is on. Most * likely the PRR algorithm is not going to * let us send a lot as well :-) */ if (rack->r_ctl.rc_prr_sendalot == 0) len = segsiz; } else if (len < segsiz) { /* * Do we send any? The idea here is if the * send empty's the socket buffer we want to * do it. However if not then lets just wait * for our prr_sndcnt to get bigger. */ long leftinsb; leftinsb = sbavail(sb) - sb_offset; if (leftinsb > len) { /* This send does not empty the sb */ len = 0; } } } } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { /* * If you have not established * and are not doing FAST OPEN * no data please. */ if ((sack_rxmit == 0) && (!IS_FASTOPEN(tp->t_flags))){ len = 0; sb_offset = 0; } } if (prefetch_so_done == 0) { kern_prefetch(so, &prefetch_so_done); prefetch_so_done = 1; } /* * Lop off SYN bit if it has already been sent. However, if this is * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. */ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; } /* * Be careful not to send data and/or FIN on SYN segments. This * measure is needed to prevent interoperability problems with not * fully conformant TCP implementations. 
*/ if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { len = 0; flags &= ~TH_FIN; } /* * On TFO sockets, ensure no data is sent in the following cases: * * - When retransmitting SYN|ACK on a passively-created socket * * - When retransmitting SYN on an actively created socket * * - When sending a zero-length cookie (cookie request) on an * actively created socket * * - When the socket is in the CLOSED state (RST is being sent) */ if (IS_FASTOPEN(tp->t_flags) && (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || ((tp->t_state == TCPS_SYN_SENT) && (tp->t_tfo_client_cookie_len == 0)) || (flags & TH_RST))) { sack_rxmit = 0; len = 0; } /* Without fast-open there should never be data sent on a SYN */ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { tp->snd_nxt = tp->iss; len = 0; } orig_len = len; if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been * called to retransmit, len will be < 0. Otherwise, window * shrank after we sent into it. If window shrank to 0, * cancel pending retransmit, pull snd_nxt back to (closed) * window, and set the persist timer if it isn't already * going. If the window didn't close completely, just wait * for an ACK. * * We also do a general check here to ensure that we will * set the persist timer when we have data to send, but a * 0-byte window. This makes sure the persist timer is set * even if the packet hits one of the "goto send" lines * below. */ len = 0; if ((tp->snd_wnd == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (tp->snd_una == tp->snd_max) && (sb_offset < (int)sbavail(sb))) { tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, cts); } } else if ((rsm == NULL) && ((doing_tlp == 0) || (new_data_tlp == 1)) && (len < rack->r_ctl.rc_pace_max_segs)) { /* * We are not sending a maximum sized segment for * some reason. Should we not send anything (think * sws or persists)? */ if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (len < minseg) && (len < (int)(sbavail(sb) - sb_offset))) { /* * Here the rwnd is less than * the minimum pacing size, this is not a retransmit, * we are established and * the send is not the last in the socket buffer * we send nothing, and we may enter persists * if nothing is outstanding. */ len = 0; if (tp->snd_max == tp->snd_una) { /* * Nothing out we can * go into persists. */ rack_enter_persist(tp, rack, cts); tp->snd_nxt = tp->snd_una; } } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && (len < (int)(sbavail(sb) - sb_offset)) && (len < minseg)) { /* * Here we are not retransmitting, and * the cwnd is not so small that we could * not send at least a min size (rxt timer * not having gone off), We have 2 segments or * more already in flight, its not the tail end * of the socket buffer and the cwnd is blocking * us from sending out a minimum pacing segment size. * Lets not send anything. */ len = 0; } else if (((tp->snd_wnd - ctf_outstanding(tp)) < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && (len < (int)(sbavail(sb) - sb_offset)) && (TCPS_HAVEESTABLISHED(tp->t_state))) { /* * Here we have a send window but we have * filled it up and we can't send another pacing segment. * We also have in flight more than 2 segments * and we are not completing the sb i.e. we allow * the last bytes of the sb to go out even if * its not a full pacing segment. */ len = 0; } } /* len will be >= 0 after this point. 
*/ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). * * TSO may only be used if we are in a pure bulk sending state. The * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP * options prevent using TSO. With TSO the TCP header is the same * (except for the sequence number) for all generated packets. This * makes it impossible to transmit any options which vary per * generated segment or packet. * * IPv4 handling has a clear separation of ip options and ip header * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. */ #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses * of IPsec that way and can actually decide if TSO is ok. */ #ifdef INET6 if (isipv6 && IPSEC_ENABLED(ipv6)) ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); #ifdef INET else #endif #endif /* INET6 */ #ifdef INET if (IPSEC_ENABLED(ipv4)) ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); #endif /* INET */ #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0) tso = 1; { uint32_t outstanding; outstanding = tp->snd_max - tp->snd_una; if (tp->t_flags & TF_SENTFIN) { /* * If we sent a fin, snd_max is 1 higher than * snd_una */ outstanding--; } if (sack_rxmit) { if ((rsm->r_flags & RACK_HAS_FIN) == 0) flags &= ~TH_FIN; } else { if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sbused(sb))) flags &= ~TH_FIN; } } recwin = lmin(lmax(sbspace(&so->so_rcv), 0), (long)TCP_MAXWIN << tp->rcv_scale); /* * Sender silly window avoidance. We transmit under the following * conditions when len is non-zero: * * - We have a full segment (or more with TSO) - This is the last * buffer in a write()/send() and we are either idle or running * NODELAY - we've timed out (e.g. persist timer) - we have more * then 1/2 the maximum send window's worth of data (receiver may be * limited the window size) - we need to retransmit */ if (len) { if (len >= segsiz) { goto send; } /* * NOTE! on localhost connections an 'ack' from the remote * end may occur synchronously with the output and cause us * to flush a buffer queued with moretocome. XXX * */ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && (tp->t_flags & TF_NOPUSH) == 0) { pass = 2; goto send; } if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ pass = 22; goto send; } if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { pass = 4; goto send; } if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ pass = 5; goto send; } if (sack_rxmit) { pass = 6; goto send; } if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && (ctf_outstanding(tp) < (segsiz * 2))) { /* * We have less than two MSS outstanding (delayed ack) * and our rwnd will not let us send a full sized * MSS. 
Lets go ahead and let this small segment * out because we want to try to have at least two * packets inflight to not be caught by delayed ack. */ pass = 12; goto send; } } /* * Sending of standalone window updates. * * Window updates are important when we close our window due to a * full socket buffer and are opening it again after the application * reads data from it. Once the window has opened again and the * remote end starts to send again the ACK clock takes over and * provides the most current window information. * * We must avoid the silly window syndrome whereas every read from * the receive buffer, no matter how small, causes a window update * to be sent. We also should avoid sending a flurry of window * updates when the socket buffer had queued a lot of data and the * application is doing small reads. * * Prevent a flurry of pointless window updates by only sending an * update when we can increase the advertized window by more than * 1/4th of the socket buffer capacity. When the buffer is getting * full or is very small be more aggressive and send an update * whenever we can increase by two mss sized segments. In all other * situations the ACK's to new incoming data will carry further * window increases. * * Don't send an independent window update if a delayed ACK is * pending (it will get piggy-backed on it) or the remote side * already has done a half-close and won't send more data. Skip * this if the connection is in T/TCP half-open state. */ if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && !(tp->t_flags & TF_DELACK) && !TCPS_HAVERCVDFIN(tp->t_state)) { /* * "adv" is the amount we could increase the window, taking * into account that we are limited by TCP_MAXWIN << * tp->rcv_scale. */ int32_t adv; int oldwin; adv = recwin; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { oldwin = (tp->rcv_adv - tp->rcv_nxt); if (adv > oldwin) adv -= oldwin; else { /* We can't increase the window */ adv = 0; } } else oldwin = 0; /* * If the new window size ends up being the same as or less * than the old size when it is scaled, then don't force * a window update. */ if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) goto dontupdate; if (adv >= (int32_t)(2 * segsiz) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || so->so_rcv.sb_hiwat <= 8 * segsiz)) { pass = 7; goto send; } if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { pass = 23; goto send; } } dontupdate: /* * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW * is also a catch-all for the retransmit timer timeout case. */ if (tp->t_flags & TF_ACKNOW) { pass = 8; goto send; } if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { pass = 9; goto send; } /* * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ if ((flags & TH_FIN) && (tp->snd_nxt == tp->snd_una)) { pass = 11; goto send; } /* * No reason to send a segment, just return. */ just_return: SOCKBUF_UNLOCK(sb); just_return_nolock: { int app_limited = CTF_JR_SENT_DATA; if (tot_len_this_send > 0) { /* Make sure snd_nxt is up to max */ if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); } else { int end_window = 0; uint32_t seq = tp->gput_ack; rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm) { /* * Mark the last sent that we just-returned (hinting * that delayed ack may play a role in any rtt measurement). 
*/ rsm->r_just_ret = 1; } counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); rack->r_ctl.rc_agg_delayed = 0; rack->r_early = 0; rack->r_late = 0; rack->r_ctl.rc_agg_early = 0; if ((ctf_outstanding(tp) + min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) >= tp->snd_wnd) { /* We are limited by the rwnd */ app_limited = CTF_JR_RWND_LIMITED; } else if (ctf_outstanding(tp) >= sbavail(sb)) { /* We are limited by whats available -- app limited */ app_limited = CTF_JR_APP_LIMITED; } else if ((idle == 0) && ((tp->t_flags & TF_NODELAY) == 0) && ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && (len < segsiz)) { /* * No delay is not on and the * user is sending less than 1MSS. This * brings out SWS avoidance so we * don't send. Another app-limited case. */ app_limited = CTF_JR_APP_LIMITED; } else if (tp->t_flags & TF_NOPUSH) { /* * The user has requested no push of * the last segment and we are * at the last segment. Another app * limited case. */ app_limited = CTF_JR_APP_LIMITED; } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { /* Its the cwnd */ app_limited = CTF_JR_CWND_LIMITED; } else if (rack->rc_in_persist == 1) { /* We are in persists */ app_limited = CTF_JR_PERSISTS; } else if (IN_RECOVERY(tp->t_flags) && (rack->rack_no_prr == 0) && (rack->r_ctl.rc_prr_sndcnt < segsiz)) { app_limited = CTF_JR_PRR; } else { /* Now why here are we not sending? */ #ifdef NOW #ifdef INVARIANTS panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); #endif #endif app_limited = CTF_JR_ASSESSING; } /* * App limited in some fashion, for our pacing GP * measurements we don't want any gap (even cwnd). * Close down the measurement window. */ if (rack_cwnd_block_ends_measure && ((app_limited == CTF_JR_CWND_LIMITED) || (app_limited == CTF_JR_PRR))) { /* * The reason we are not sending is * the cwnd (or prr). We have been configured * to end the measurement window in * this case. */ end_window = 1; } else if (app_limited == CTF_JR_PERSISTS) { /* * We never end the measurement window * in persists, though in theory we * should be only entering after everything * is acknowledged (so we will probably * never come here). */ end_window = 0; } else if (rack_rwnd_block_ends_measure && (app_limited == CTF_JR_RWND_LIMITED)) { /* * We are rwnd limited and have been * configured to end the measurement * window in this case. */ end_window = 1; } else if (app_limited == CTF_JR_APP_LIMITED) { /* * A true application limited period, we have * ran out of data. */ end_window = 1; } else if (app_limited == CTF_JR_ASSESSING) { /* * In the assessing case we hit the end of * the if/else and had no known reason * This will panic us under invariants.. * * If we get this out in logs we need to * investagate which reason we missed. */ end_window = 1; } if (end_window) { uint8_t log = 0; if ((tp->t_flags & TF_GPUTINPROG) && SEQ_GT(tp->gput_ack, tp->snd_max)) { /* Mark the last packet has app limited */ tp->gput_ack = tp->snd_max; log = 1; } rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; else { /* * Go out to the end app limited and mark * this new one as next and move the end_appl up * to this guy. 
*/ if (rack->r_ctl.rc_end_appl) rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; rack->r_ctl.rc_end_appl = rsm; } rsm->r_flags |= RACK_APP_LIMITED; rack->r_ctl.rc_app_limited_cnt++; } if (log) rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_app_limited_cnt, seq, tp->gput_ack, 0, 0, 4, __LINE__, NULL); } } if (slot) { /* set the rack tcb into the slot N */ counter_u64_add(rack_paced_segments, 1); } else if (tot_len_this_send) { counter_u64_add(rack_unpaced_segments, 1); } /* Check if we need to go into persists or not */ if ((rack->rc_in_persist == 0) && (tp->snd_max == tp->snd_una) && TCPS_HAVEESTABLISHED(tp->t_state) && sbavail(sb) && (sbavail(sb) > tp->snd_wnd) && (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { /* Yes lets make sure to move to persist before timer-start */ rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); } rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); } #ifdef NETFLIX_SHARED_CWND if ((sbavail(sb) == 0) && rack->r_ctl.rc_scw) { tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); rack->rack_scwnd_is_idle = 1; } #endif return (0); send: if ((flags & TH_FIN) && sbavail(sb)) { /* * We do not transmit a FIN * with data outstanding. We * need to make it so all data * is acked first. */ flags &= ~TH_FIN; } /* Enforce stack imposed max seg size if we have one */ if (rack->r_ctl.rc_pace_max_segs && (len > rack->r_ctl.rc_pace_max_segs)) { mark = 1; len = rack->r_ctl.rc_pace_max_segs; } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { if (len >= segsiz) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; } /* * Before ESTABLISHED, force sending of initial options unless TCP * set not to do any options. NOTE: we assume that the IP/TCP header * plus TCP options always fit in a single mbuf, leaving room for a * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) * + optlen <= MCLBYTES */ optlen = 0; #ifdef INET6 if (isipv6) hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else #endif hdrlen = sizeof(struct tcpiphdr); /* * Compute options for segment. We only have to care about SYN and * established connection segments. Options for SYN-ACK segments * are handled in TCP syncache. */ to.to_flags = 0; if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); #ifdef NETFLIX_TCPOUDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; /* * On SYN or SYN|ACK transmits on TFO connections, * only include the TFO option if it is not a * retransmit, as the presence of the TFO option may * have caused the original SYN or SYN|ACK to have * been dropped by a middlebox. */ if (IS_FASTOPEN(tp->t_flags) && (tp->t_rxtshift == 0)) { if (tp->t_state == TCPS_SYN_RECEIVED) { to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = (u_int8_t *)&tp->t_tfo_cookie.server; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; } else if (tp->t_state == TCPS_SYN_SENT) { to.to_tfo_len = tp->t_tfo_client_cookie_len; to.to_tfo_cookie = tp->t_tfo_cookie.client; to.to_flags |= TOF_FASTOPEN; wanted_cookie = 1; /* * If we wind up having more data to * send with the SYN than can fit in * one segment, don't send any more * until the SYN|ACK comes back from * the other end. */ sendalot = 0; } } } /* Window scaling. 
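 * (Offered only on SYNs; e.g. a request_r_scale of 7 tells the peer
 * to left-shift our advertised window by 7 bits, and RFC 7323 allows
 * a shift of up to 14, i.e. windows approaching 1 GB.)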
*/ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { to.to_wscale = tp->request_r_scale; to.to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { to.to_tsval = cts + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ if (flags & TH_SYN) to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->rcv_numsacks > 0) { to.to_flags |= TOF_SACK; to.to_nsacks = tp->rcv_numsacks; to.to_sacks = (u_char *)tp->sackblks; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. */ if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && !(to.to_flags & TOF_FASTOPEN)) len = 0; } #ifdef NETFLIX_TCPOUDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? */ SOCKBUF_UNLOCK(&so->so_snd); return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); } #endif #ifdef INET6 if (isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif if (tp->t_inpcb->inp_options) ipoptlen = tp->t_inpcb->inp_options->m_len - offsetof(struct ipoption, ipopt_list); else ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. Clear the FIN bit because we * cut off the tail of the segment. */ if (len + optlen + ipoptlen > tp->t_maxseg) { if (tso) { uint32_t if_hw_tsomax; uint32_t moff; int32_t max_len; /* extract TSO information */ if_hw_tsomax = tp->t_tsomax; if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; KASSERT(ipoptlen == 0, ("%s: TSO can't do IP options", __func__)); /* * Check if we should limit by maximum payload * length: */ if (if_hw_tsomax != 0) { /* compute maximum TSO length */ max_len = (if_hw_tsomax - hdrlen - max_linkhdr); if (max_len <= 0) { len = 0; } else if (len > max_len) { sendalot = 1; len = max_len; mark = 2; } } /* * Prevent the last segment from being fractional * unless the send sockbuf can be emptied: */ max_len = (tp->t_maxseg - optlen); if ((sb_offset + len) < sbavail(sb)) { moff = len % (u_int)max_len; if (moff != 0) { mark = 3; len -= moff; } } /* * In case there are too many small fragments don't * use TSO: */ if (len <= segsiz) { mark = 4; tso = 0; } /* * Send the FIN in a separate segment after the bulk * sending is done. We don't trust the TSO * implementations to clear the FIN flag on all but * the last segment. */ if (tp->t_flags & TF_NEEDFIN) { sendalot = 4; } } else { mark = 5; if (optlen + ipoptlen >= tp->t_maxseg) { /* * Since we don't have enough space to put * the IP header chain and the TCP header in * one packet as required by RFC 7112, don't * send it. Also ensure that at least one * byte of the payload can be put into the * TCP segment. 
*/ SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; sack_rxmit = 0; goto out; } len = tp->t_maxseg - optlen - ipoptlen; sendalot = 5; } } else { tso = 0; mark = 6; } KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, ("%s: len > IP_MAXPACKET", __func__)); #ifdef DIAGNOSTIC #ifdef INET6 if (max_linkhdr + hdrlen > MCLBYTES) #else if (max_linkhdr + hdrlen > MHLEN) #endif panic("tcphdr too big"); #endif /* * This KASSERT is here to catch edge cases at a well defined place. * Before, those had triggered (random) panic conditions further * down. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); if ((len == 0) && (flags & TH_FIN) && (sbused(sb))) { /* * We have outstanding data, don't send a fin by itself!. */ goto just_return; } /* * Grab a header mbuf, attaching a copy of data to be transmitted, * and initialize the header from the template for sends on this * connection. */ if (len) { uint32_t max_val; uint32_t moff; if (rack->r_ctl.rc_pace_max_segs) max_val = rack->r_ctl.rc_pace_max_segs; else if (rack->rc_user_set_max_segs) max_val = rack->rc_user_set_max_segs * segsiz; else max_val = len; /* * We allow a limit on sending with hptsi. */ if (len > max_val) { mark = 7; len = max_val; } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else #endif m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { SOCKBUF_UNLOCK(sb); error = ENOBUFS; sack_rxmit = 0; goto out; } m->m_data += max_linkhdr; m->m_len = hdrlen; /* * Start the m_copy functions from the closest mbuf to the * sb_offset in the socket buffer chain. */ mb = sbsndptr_noadv(sb, sb_offset, &moff); if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) sbsndptr_adv(sb, mb, len); m->m_len += len; } else { struct sockbuf *msb; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) msb = NULL; else msb = sb; m->m_next = tcp_m_copym( mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, ((rsm == NULL) ? hw_tls : 0) #ifdef NETFLIX_COPY_ARGS , &filled_all #endif ); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy * shorten it to no longer need tso. Lets * not put on sendalot since we are low on * mbufs. */ tso = 0; } if (m->m_next == NULL) { SOCKBUF_UNLOCK(sb); (void)m_free(m); error = ENOBUFS; sack_rxmit = 0; goto out; } } if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { if (rsm && (rsm->r_flags & RACK_TLP)) { /* * TLP should not count in retran count, but * in its own bin */ counter_u64_add(rack_tlp_retran, 1); counter_u64_add(rack_tlp_retran_bytes, len); } else { tp->t_sndrexmitpack++; KMOD_TCPSTAT_INC(tcps_sndrexmitpack); KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); } #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); #endif } else { KMOD_TCPSTAT_INC(tcps_sndpack); KMOD_TCPSTAT_ADD(tcps_sndbyte, len); #ifdef STATS stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, len); #endif } /* * If we're sending everything we've got, set PUSH. (This * will keep happy those implementations which only give * data to the user when a buffer fills or a PUSH comes in.) 
*/ if (sb_offset + len == sbused(sb) && sbused(sb) && !(flags & TH_SYN)) flags |= TH_PUSH; SOCKBUF_UNLOCK(sb); } else { SOCKBUF_UNLOCK(sb); if (tp->t_flags & TF_ACKNOW) KMOD_TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN | TH_FIN | TH_RST)) KMOD_TCPSTAT_INC(tcps_sndctrl); else KMOD_TCPSTAT_INC(tcps_sndwinup); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sack_rxmit = 0; goto out; } #ifdef INET6 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && MHLEN >= hdrlen) { M_ALIGN(m, hdrlen); } else #endif m->m_data += max_linkhdr; m->m_len = hdrlen; } SOCKBUF_UNLOCK_ASSERT(sb); m->m_pkthdr.rcvif = (struct ifnet *)0; #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(inp, #ifdef NETFLIX_TCPOUDP tp->t_port, #endif ip6, th); } else #endif /* INET6 */ { ip = mtod(m, struct ip *); #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif #ifdef NETFLIX_TCPOUDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); udp->uh_dport = tp->t_port; ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); } else #endif th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(inp, #ifdef NETFLIX_TCPOUDP tp->t_port, #endif ip, th); } /* * Fill in fields, remembering maximum advertised window for use in * delaying messages about window sizes. If resending a FIN, be sure * not to use a new sequence number. */ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* * If we are starting a connection, send ECN setup SYN packet. If we * are on a retransmit, we may resend those bits a number of times * as per RFC 3168. */ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { if (tp->t_rxtshift >= 1) { if (tp->t_rxtshift <= V_tcp_ecn_maxretries) flags |= TH_ECE | TH_CWR; } else flags |= TH_ECE | TH_CWR; } /* Handle parallel SYN for ECN */ if ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_flags2 & TF2_ECN_SND_ECE)) { flags |= TH_ECE; tp->t_flags2 &= ~TF2_ECN_SND_ECE; } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags2 & TF2_ECN_PERMIT)) { /* * If the peer has ECN, mark data packets with ECN capable * transmission (ECT). Ignore pure ack packets, * retransmissions. */ if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && (sack_rxmit == 0)) { #ifdef INET6 if (isipv6) ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); else #endif ip->ip_tos |= IPTOS_ECN_ECT0; KMOD_TCPSTAT_INC(tcps_ecn_ect0); /* * Reply with proper ECN notifications. * Only set CWR on new data segments. */ if (tp->t_flags2 & TF2_ECN_SND_CWR) { flags |= TH_CWR; tp->t_flags2 &= ~TF2_ECN_SND_CWR; } } if (tp->t_flags2 & TF2_ECN_SND_ECE) flags |= TH_ECE; } /* * If we are doing retransmissions, then snd_nxt will not reflect * the first unsent octet. For ACK only packets, we do not want the * sequence number of the retransmitted packet, we want the sequence * number of the next unsent octet. So, if there is no data (and no * SYN or FIN), use snd_max instead of snd_nxt when filling in * ti_seq. 
But if we are in persist state, snd_max might reflect * one byte beyond the right edge of the window, so use snd_nxt in * that case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { if (len || (flags & (TH_SYN | TH_FIN)) || rack->rc_in_persist) { th->th_seq = htonl(tp->snd_nxt); rack_seq = tp->snd_nxt; } else if (flags & TH_RST) { /* * For a Reset send the last cum ack in sequence * (this like any other choice may still generate a * challenge ack, if a ack-update packet is in * flight). */ th->th_seq = htonl(tp->snd_una); rack_seq = tp->snd_una; } else { th->th_seq = htonl(tp->snd_max); rack_seq = tp->snd_max; } } else { th->th_seq = htonl(rsm->r_start); rack_seq = rsm->r_start; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. * If a RST segment is sent, advertise a window of zero. */ if (flags & TH_RST) { recwin = 0; } else { if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && recwin < (long)segsiz) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); } /* * According to RFC1323 the window field in a SYN (i.e., a or * ) segment itself is never scaled. The case is * handled in syncache. */ if (flags & TH_SYN) th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else { /* Avoid shrinking window with window scaling. */ recwin = roundup2(recwin, 1 << tp->rcv_scale); th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); } /* * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 * window. This may cause the remote transmitter to stall. This * flag tells soreceive() to disable delayed acknowledgements when * draining the buffer. This can occur if the receiver is * attempting to read more data than can be buffered prior to * transmitting on the connection. */ if (th->th_win == 0) { tp->t_sndzerowin++; tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* * Calculate MD5 signature and put it into the place * determined before. * NOTE: since TCP options buffer doesn't point into * mbuf's data, calculate offset and use it. */ if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { /* * Do not send segment if the calculation of MD5 * digest has failed. */ goto out; } } #endif /* * Put TCP length in extended header, and then checksum extended * header and data. */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #ifdef INET6 if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. 
*/ if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); } /* IP version must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif /* * Enable TSO and specify the size of the segments. The TCP pseudo * header checksum is always provided. XXX: Fixme: This is currently * not the case for IPv6. */ if (tso) { KASSERT(len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; } KASSERT(len + hdrlen == m_length(m, NULL), ("%s: mbuf chain different than expected: %d + %u != %u", __func__, len, hdrlen, m_length(m, NULL))); #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif #ifdef TCPDEBUG /* * Trace. */ if (so->so_options & SO_DEBUG) { u_short save = 0; #ifdef INET6 if (!isipv6) #endif { save = ipov->ih_len; ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + * (th->th_off << 2) */ ); } tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 if (!isipv6) #endif ipov->ih_len = save; } #endif /* TCPDEBUG */ /* We're getting ready to send; log now. */ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; if (rack->rack_no_prr) log.u_bbr.flex1 = 0; else log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; log.u_bbr.flex4 = orig_len; if (filled_all) log.u_bbr.flex5 = 0x80000000; else log.u_bbr.flex5 = 0; /* Save off the early/late values */ log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; log.u_bbr.bw_inuse = rack_get_bw(rack); if (rsm || sack_rxmit) { if (doing_tlp) log.u_bbr.flex8 = 2; else log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); log.u_bbr.flex7 = mark; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.lt_epoch = cwnd_to_use; log.u_bbr.delivered = sendalot; lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, len, &log, false, NULL, NULL, 0, &tv); } else lgb = NULL; /* * Fill in IP length and desired time to live and send to IP level. 
* There should be a better way to handle ttl and tos; we could keep * them in the template, but need a way to checksum without them. */ /* * m->m_pkthdr.len should have been set before cksum calcuration, * because in6_cksum() need it. */ #ifdef INET6 if (isipv6) { /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. Also, * desired default hop limit might be changed via Neighbor * Discovery. */ ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace * probes. ip6_output() will set it properly; it's supposed * to include the option header lengths as well. */ ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) tp->t_flags2 |= TF2_PLPMTU_PMTUD; else tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ error = ip6_output(m, inp->in6p_outputopts, &inp->inp_route6, ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), NULL, NULL, inp); if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) mtu = inp->inp_route6.ro_nh->nh_mtu; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { ip->ip_len = htons(m->m_pkthdr.len); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ /* * If we do path MTU discovery, then we set DF on every * packet. This might not be the best thing to do according * to RFC3390 Section 2. However the tcp hostcache migitates * the problem so it affects only the first tcp connection * with a host. * * NB: Don't set DF on small MTU/MSS to have a safe * fallback. */ if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { tp->t_flags2 |= TF2_PLPMTU_PMTUD; if (tp->t_port == 0 || len < V_tcp_minmss) { ip->ip_off |= htons(IP_DF); } } else { tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; } if (tp->t_state == TCPS_SYN_SENT) TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); error = ip_output(m, inp->inp_options, &inp->inp_route, ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) mtu = inp->inp_route.ro_nh->nh_mtu; } #endif /* INET */ out: if (lgb) { lgb->tlb_errno = error; lgb = NULL; } /* * In transmit state, time the transmission and arrange for the * retransmit. In persist state, just set snd_max. 
*/ if (error == 0) { rack->forced_ack = 0; /* If we send something zap the FA flag */ if (rsm && (doing_tlp == 0)) { /* Set we retransmitted */ rack->rc_gp_saw_rec = 1; } else { if (cwnd_to_use > tp->snd_ssthresh) { /* Set we sent in CA */ rack->rc_gp_saw_ca = 1; } else { /* Set we sent in SS */ rack->rc_gp_saw_ss = 1; } } if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) tcp_clean_dsack_blocks(tp); tot_len_this_send += len; if (len == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (len == 1) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); } else if (len > 1) { int idx; idx = (len / segsiz) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); else counter_u64_add(rack_out_size[idx], 1); } } if (rack->rack_no_prr == 0) { if (sub_from_prr && (error == 0)) { if (rack->r_ctl.rc_prr_sndcnt >= len) rack->r_ctl.rc_prr_sndcnt -= len; else rack->r_ctl.rc_prr_sndcnt = 0; } } sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, pass, rsm, us_cts); if ((error == 0) && (len > 0) && (tp->snd_una == tp->snd_max)) rack->r_ctl.rc_tlp_rxt_last_time = cts; /* Now are we in persists? */ if (rack->rc_in_persist == 0) { tcp_seq startseq = tp->snd_nxt; /* Track our lost count */ if (rsm && (doing_tlp == 0)) rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; /* * Advance snd_nxt over sequence space of this segment. */ if (error) /* We don't log or do anything with errors */ goto nomore; if (doing_tlp == 0) { if (rsm == NULL) { /* * Not a retransmission of some * sort, new data is going out so * clear our TLP count and flag. */ rack->rc_tlp_in_progress = 0; rack->r_ctl.rc_tlp_cnt_out = 0; } } else { /* * We have just sent a TLP, mark that it is true * and make sure our in progress is set so we * continue to check the count. */ rack->rc_tlp_in_progress = 1; rack->r_ctl.rc_tlp_cnt_out++; } if (flags & (TH_SYN | TH_FIN)) { if (flags & TH_SYN) tp->snd_nxt++; if (flags & TH_FIN) { tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } } /* In the ENOBUFS case we do *not* update snd_max */ if (sack_rxmit) goto nomore; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { if (tp->snd_una == tp->snd_max) { /* * Update the time we just added data since * none was outstanding. */ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and * not currently timing anything. * This is only relevant in case of switching back to * the base stack. */ if (tp->t_rtttime == 0) { tp->t_rtttime = ticks; tp->t_rtseq = startseq; KMOD_TCPSTAT_INC(tcps_segstimed); } if (len && ((tp->t_flags & TF_GPUTINPROG) == 0)) rack_start_gp_measurement(tp, rack, startseq, sb_offset); } } else { /* * Persist case, update snd_max but since we are in persist * mode (no window) we do not update snd_nxt. */ int32_t xlen = len; if (error) goto nomore; if (flags & TH_SYN) ++xlen; if (flags & TH_FIN) { ++xlen; tp->t_flags |= TF_SENTFIN; } /* In the ENOBUFS case we do *not* update snd_max */ if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { if (tp->snd_una == tp->snd_max) { /* * Update the time we just added data since * none was outstanding. 
*/ rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt + len; } } nomore: if (error) { rack->r_ctl.rc_agg_delayed = 0; rack->r_early = 0; rack->r_late = 0; rack->r_ctl.rc_agg_early = 0; SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ /* * Failures do not advance the seq counter above. For the * case of ENOBUFS we will fall out and retry in 1ms with * the hpts. Everything else will just have to retransmit * with the timer. * * In any case, we do not want to loop around for another * send without a good reason. */ sendalot = 0; switch (error) { case EPERM: tp->t_softerror = error; return (error); case ENOBUFS: if (slot == 0) { /* * Pace us right away to retry in a some * time */ slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); if (rack->rc_enobuf < 126) rack->rc_enobuf++; if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) { slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC; } if (slot < (10 * HPTS_USEC_IN_MSEC)) slot = 10 * HPTS_USEC_IN_MSEC; } counter_u64_add(rack_saw_enobuf, 1); error = 0; goto enobufs; case EMSGSIZE: /* * For some reason the interface we used initially * to send segments changed to another or lowered * its MTU. If TSO was active we either got an * interface without TSO capabilits or TSO was * turned off. If we obtained mtu from ip_output() * then update it and try again. */ if (tso) tp->t_flags &= ~TF_TSO; if (mtu != 0) { tcp_mss_update(tp, -1, mtu, NULL, NULL); goto again; } slot = 10 * HPTS_USEC_IN_MSEC; rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); return (error); case ENETUNREACH: counter_u64_add(rack_saw_enetunreach, 1); case EHOSTDOWN: case EHOSTUNREACH: case ENETDOWN: if (TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; } /* FALLTHROUGH */ default: slot = 10 * HPTS_USEC_IN_MSEC; rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); return (error); } } else { rack->rc_enobuf = 0; } KMOD_TCPSTAT_INC(tcps_sndtotal); /* * Data sent (as far as we can tell). If this advertises a larger * window than any other segment, then remember the size of the * advertised window. Any pending ACK has now been sent. */ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: /* Assure when we leave that snd_nxt will point to top */ if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; if (sendalot) { /* Do we need to turn off sendalot? */ if (rack->r_ctl.rc_pace_max_segs && (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { /* We hit our max. */ sendalot = 0; } else if ((rack->rc_user_set_max_segs) && (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { /* We hit the user defined max */ sendalot = 0; } } if ((error == 0) && (flags & TH_FIN)) tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); if (flags & TH_RST) { /* * We don't send again after sending a RST. */ slot = 0; sendalot = 0; if (error == 0) tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { /* * Get our pacing rate, if an error - * occured in sending (ENOBUF) we would + * occurred in sending (ENOBUF) we would * hit the else if with slot preset. Other * errors return. */ slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); } if (rsm && rack->use_rack_rr) { /* Its a retransmit and we use the rack cheat? 
*/ if ((slot == 0) || (rack->rc_always_pace == 0) || (rack->r_rr_config == 1)) { /* * We have no pacing set or we * are using old-style rack or * we are overriden to use the old 1ms pacing. */ slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; } } if (slot) { /* set the rack tcb into the slot N */ counter_u64_add(rack_paced_segments, 1); } else if (sendalot) { if (len) counter_u64_add(rack_unpaced_segments, 1); sack_rxmit = 0; goto again; } else if (len) { counter_u64_add(rack_unpaced_segments, 1); } rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); return (error); } static void rack_update_seg(struct tcp_rack *rack) { uint32_t orig_val; orig_val = rack->r_ctl.rc_pace_max_segs; rack_set_pace_segments(rack->rc_tp, rack, __LINE__); if (orig_val != rack->r_ctl.rc_pace_max_segs) rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); } /* * rack_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. */ static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { struct epoch_tracker et; uint64_t val; int32_t error = 0, optval; uint16_t ca, ss; switch (sopt->sopt_name) { case TCP_RACK_PROP_RATE: /* URL:prop_rate */ case TCP_RACK_PROP : /* URL:prop */ case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ case TCP_RACK_PACE_REDUCE: /* Not used */ /* Pacing related ones */ case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ case TCP_BBR_IWINTSO: /* URL:tso_iwin */ case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ case TCP_RACK_RR_CONF: /* URL:rrr_conf */ case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ /* End pacing related */ case TCP_DELACK: case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ case TCP_RACK_MIN_TO: /* URL:min_to */ case TCP_RACK_EARLY_SEG: /* URL:early_seg */ case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ case TCP_RACK_REORD_FADE: /* URL:reord_fade */ case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ case TCP_RACK_TLP_USE: /* URL:tlp_use */ case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ case TCP_RACK_DO_DETECTION: /* URL:detect */ case TCP_NO_PRR: /* URL:noprr */ case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ case TCP_DATA_AFTER_CLOSE: case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ case TCP_RACK_PROFILE: /* URL:profile */ break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) return (error); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); 
return (ECONNRESET); } tp = intotcpcb(inp); rack = (struct tcp_rack *)tp->t_fb_ptr; switch (sopt->sopt_name) { case TCP_RACK_PROFILE: RACK_OPTS_INC(tcp_profile); if (optval == 1) { /* pace_always=1 */ rack->rc_always_pace = 1; tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; /* scwnd=1 */ rack->rack_enable_scwnd = 1; /* dynamic=100 */ rack->rc_gp_dyn_mul = 1; rack->r_ctl.rack_per_of_gp_ca = 100; /* rrr_conf=3 */ rack->r_rr_config = 3; /* npush=2 */ rack->r_ctl.rc_no_push_at_mrtt = 2; /* fillcw=1 */ rack->rc_pace_to_cwnd = 1; rack->rc_pace_fill_if_rttin_range = 0; rack->rtt_limit_mul = 0; /* noprr=1 */ rack->rack_no_prr = 1; /* lscwnd=1 */ rack->r_limit_scw = 1; } else if (optval == 2) { /* pace_always=1 */ rack->rc_always_pace = 1; tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; /* scwnd=1 */ rack->rack_enable_scwnd = 1; /* dynamic=100 */ rack->rc_gp_dyn_mul = 1; rack->r_ctl.rack_per_of_gp_ca = 100; /* rrr_conf=3 */ rack->r_rr_config = 3; /* npush=2 */ rack->r_ctl.rc_no_push_at_mrtt = 2; /* fillcw=1 */ rack->rc_pace_to_cwnd = 1; rack->rc_pace_fill_if_rttin_range = 0; rack->rtt_limit_mul = 0; /* noprr=1 */ rack->rack_no_prr = 1; /* lscwnd=0 */ rack->r_limit_scw = 0; } break; case TCP_SHARED_CWND_TIME_LIMIT: RACK_OPTS_INC(tcp_lscwnd); if (optval) rack->r_limit_scw = 1; else rack->r_limit_scw = 0; break; case TCP_RACK_PACE_TO_FILL: RACK_OPTS_INC(tcp_fillcw); if (optval == 0) rack->rc_pace_to_cwnd = 0; else rack->rc_pace_to_cwnd = 1; if ((optval >= rack_gp_rtt_maxmul) && rack_gp_rtt_maxmul && (optval < 0xf)) { rack->rc_pace_fill_if_rttin_range = 1; rack->rtt_limit_mul = optval; } else { rack->rc_pace_fill_if_rttin_range = 0; rack->rtt_limit_mul = 0; } break; case TCP_RACK_NO_PUSH_AT_MAX: RACK_OPTS_INC(tcp_npush); if (optval == 0) rack->r_ctl.rc_no_push_at_mrtt = 0; else if (optval < 0xff) rack->r_ctl.rc_no_push_at_mrtt = optval; else error = EINVAL; break; case TCP_SHARED_CWND_ENABLE: RACK_OPTS_INC(tcp_rack_scwnd); if (optval == 0) rack->rack_enable_scwnd = 0; else rack->rack_enable_scwnd = 1; break; case TCP_RACK_MBUF_QUEUE: /* Now do we use the LRO mbuf-queue feature */ RACK_OPTS_INC(tcp_rack_mbufq); if (optval) rack->r_mbuf_queue = 1; else rack->r_mbuf_queue = 0; if (rack->r_mbuf_queue || rack->rc_always_pace) tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; else tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; break; case TCP_RACK_NONRXT_CFG_RATE: RACK_OPTS_INC(tcp_rack_cfg_rate); if (optval == 0) rack->rack_rec_nonrxt_use_cr = 0; else rack->rack_rec_nonrxt_use_cr = 1; break; case TCP_NO_PRR: RACK_OPTS_INC(tcp_rack_noprr); if (optval == 0) rack->rack_no_prr = 0; else rack->rack_no_prr = 1; break; case TCP_TIMELY_DYN_ADJ: RACK_OPTS_INC(tcp_timely_dyn); if (optval == 0) rack->rc_gp_dyn_mul = 0; else { rack->rc_gp_dyn_mul = 1; if (optval >= 100) { /* * If the user sets something 100 or more * its the gp_ca value. 
*/ rack->r_ctl.rack_per_of_gp_ca = optval; } } break; case TCP_RACK_DO_DETECTION: RACK_OPTS_INC(tcp_rack_do_detection); if (optval == 0) rack->do_detection = 0; else rack->do_detection = 1; break; case TCP_RACK_PROP_RATE: if ((optval <= 0) || (optval >= 100)) { error = EINVAL; break; } RACK_OPTS_INC(tcp_rack_prop_rate); rack->r_ctl.rc_prop_rate = optval; break; case TCP_RACK_TLP_USE: if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { error = EINVAL; break; } RACK_OPTS_INC(tcp_tlp_use); rack->rack_tlp_threshold_use = optval; break; case TCP_RACK_PROP: /* RACK proportional rate reduction (bool) */ RACK_OPTS_INC(tcp_rack_prop); rack->r_ctl.rc_prop_reduce = optval; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ RACK_OPTS_INC(tcp_rack_tlp_reduce); rack->r_ctl.rc_tlp_cwnd_reduce = optval; break; case TCP_RACK_EARLY_RECOV: /* Should recovery happen early (bool) */ RACK_OPTS_INC(tcp_rack_early_recov); rack->r_ctl.rc_early_recovery = optval; break; /* Pacing related ones */ case TCP_RACK_PACE_ALWAYS: /* * zero is old rack method, 1 is new * method using a pacing rate. */ RACK_OPTS_INC(tcp_rack_pace_always); if (optval > 0) rack->rc_always_pace = 1; else rack->rc_always_pace = 0; if (rack->r_mbuf_queue || rack->rc_always_pace) tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; else tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; /* A rate may be set irate or other, if so set seg size */ rack_update_seg(rack); break; case TCP_BBR_RACK_INIT_RATE: RACK_OPTS_INC(tcp_initial_rate); val = optval; /* Change from kbits per second to bytes per second */ val *= 1000; val /= 8; rack->r_ctl.init_rate = val; if (rack->rc_init_win != rack_default_init_window) { uint32_t win, snt; /* * Options don't always get applied * in the order you think. So in order * to assure we update a cwnd we need * to check and see if we are still * where we should raise the cwnd. */ win = rc_init_window(rack); if (SEQ_GT(tp->snd_max, tp->iss)) snt = tp->snd_max - tp->iss; else snt = 0; if ((snt < win) && (tp->snd_cwnd < win)) tp->snd_cwnd = win; } if (rack->rc_always_pace) rack_update_seg(rack); break; case TCP_BBR_IWINTSO: RACK_OPTS_INC(tcp_initial_win); if (optval && (optval <= 0xff)) { uint32_t win, snt; rack->rc_init_win = optval; win = rc_init_window(rack); if (SEQ_GT(tp->snd_max, tp->iss)) snt = tp->snd_max - tp->iss; else snt = 0; if ((snt < win) && (tp->t_srtt | #ifdef NETFLIX_PEAKRATE tp->t_maxpeakrate | #endif rack->r_ctl.init_rate)) { /* * We are not past the initial window * and we have some bases for pacing, * so we need to possibly adjust up * the cwnd. Note even if we don't set * the cwnd, its still ok to raise the rc_init_win * which can be used coming out of idle when we * would have a rate. 
*/ if (tp->snd_cwnd < win) tp->snd_cwnd = win; } if (rack->rc_always_pace) rack_update_seg(rack); } else error = EINVAL; break; case TCP_RACK_FORCE_MSEG: RACK_OPTS_INC(tcp_rack_force_max_seg); if (optval) rack->rc_force_max_seg = 1; else rack->rc_force_max_seg = 0; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments size in a pace in bytes */ RACK_OPTS_INC(tcp_rack_max_seg); rack->rc_user_set_max_segs = optval; rack_set_pace_segments(tp, rack, __LINE__); break; case TCP_RACK_PACE_RATE_REC: /* Set the fixed pacing rate in Bytes per second ca */ RACK_OPTS_INC(tcp_rack_pace_rate_rec); rack->r_ctl.rc_fixed_pacing_rate_rec = optval; if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) rack->r_ctl.rc_fixed_pacing_rate_ca = optval; if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) rack->r_ctl.rc_fixed_pacing_rate_ss = optval; rack->use_fixed_rate = 1; rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, rack->r_ctl.rc_fixed_pacing_rate_ca, rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, __LINE__, NULL); break; case TCP_RACK_PACE_RATE_SS: /* Set the fixed pacing rate in Bytes per second ca */ RACK_OPTS_INC(tcp_rack_pace_rate_ss); rack->r_ctl.rc_fixed_pacing_rate_ss = optval; if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) rack->r_ctl.rc_fixed_pacing_rate_ca = optval; if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) rack->r_ctl.rc_fixed_pacing_rate_rec = optval; rack->use_fixed_rate = 1; rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, rack->r_ctl.rc_fixed_pacing_rate_ca, rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, __LINE__, NULL); break; case TCP_RACK_PACE_RATE_CA: /* Set the fixed pacing rate in Bytes per second ca */ RACK_OPTS_INC(tcp_rack_pace_rate_ca); rack->r_ctl.rc_fixed_pacing_rate_ca = optval; if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) rack->r_ctl.rc_fixed_pacing_rate_ss = optval; if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) rack->r_ctl.rc_fixed_pacing_rate_rec = optval; rack->use_fixed_rate = 1; rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, rack->r_ctl.rc_fixed_pacing_rate_ca, rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, __LINE__, NULL); break; case TCP_RACK_GP_INCREASE_REC: RACK_OPTS_INC(tcp_gp_inc_rec); rack->r_ctl.rack_per_of_gp_rec = optval; rack_log_pacing_delay_calc(rack, rack->r_ctl.rack_per_of_gp_ss, rack->r_ctl.rack_per_of_gp_ca, rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, __LINE__, NULL); break; case TCP_RACK_GP_INCREASE_CA: RACK_OPTS_INC(tcp_gp_inc_ca); ca = optval; if (ca < 100) { /* * We don't allow any reduction * over the GP b/w. */ error = EINVAL; break; } rack->r_ctl.rack_per_of_gp_ca = ca; rack_log_pacing_delay_calc(rack, rack->r_ctl.rack_per_of_gp_ss, rack->r_ctl.rack_per_of_gp_ca, rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, __LINE__, NULL); break; case TCP_RACK_GP_INCREASE_SS: RACK_OPTS_INC(tcp_gp_inc_ss); ss = optval; if (ss < 100) { /* * We don't allow any reduction * over the GP b/w. 
*/ error = EINVAL; break; } rack->r_ctl.rack_per_of_gp_ss = ss; rack_log_pacing_delay_calc(rack, rack->r_ctl.rack_per_of_gp_ss, rack->r_ctl.rack_per_of_gp_ca, rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, __LINE__, NULL); break; case TCP_RACK_RR_CONF: RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); if (optval && optval <= 3) rack->r_rr_config = optval; else rack->r_rr_config = 0; break; case TCP_BBR_HDWR_PACE: RACK_OPTS_INC(tcp_hdwr_pacing); if (optval){ if (rack->rack_hdrw_pacing == 0) { rack->rack_hdw_pace_ena = 1; rack->rack_attempt_hdwr_pace = 0; } else error = EALREADY; } else { rack->rack_hdw_pace_ena = 0; #ifdef RATELIMIT if (rack->rack_hdrw_pacing) { rack->rack_hdrw_pacing = 0; in_pcbdetach_txrtlmt(rack->rc_inp); } #endif } break; /* End Pacing related ones */ case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ RACK_OPTS_INC(tcp_rack_prr_sendalot); rack->r_ctl.rc_prr_sendalot = optval; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ RACK_OPTS_INC(tcp_rack_min_to); rack->r_ctl.rc_min_to = optval; break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ RACK_OPTS_INC(tcp_rack_early_seg); rack->r_ctl.rc_early_recovery_segs = optval; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ RACK_OPTS_INC(tcp_rack_reord_thresh); if ((optval > 0) && (optval < 31)) rack->r_ctl.rc_reorder_shift = optval; else error = EINVAL; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ RACK_OPTS_INC(tcp_rack_reord_fade); rack->r_ctl.rc_reorder_fade = optval; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ RACK_OPTS_INC(tcp_rack_tlp_thresh); if (optval) rack->r_ctl.rc_tlp_threshold = optval; else error = EINVAL; break; case TCP_BBR_USE_RACK_RR: RACK_OPTS_INC(tcp_rack_rr); if (optval) rack->use_rack_rr = 1; else rack->use_rack_rr = 0; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. rack-rtt + reord + N */ RACK_OPTS_INC(tcp_rack_pkt_delay); rack->r_ctl.rc_pkt_delay = optval; break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ error = EINVAL; break; case TCP_RACK_IDLE_REDUCE_HIGH: error = EINVAL; break; case TCP_DELACK: if (optval == 0) tp->t_delayed_ack = 0; else tp->t_delayed_ack = 1; if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; NET_EPOCH_ENTER(et); rack_output(tp); NET_EPOCH_EXIT(et); } break; case TCP_BBR_RACK_RTT_USE: if ((optval != USE_RTT_HIGH) && (optval != USE_RTT_LOW) && (optval != USE_RTT_AVG)) error = EINVAL; else rack->r_ctl.rc_rate_sample_method = optval; break; case TCP_DATA_AFTER_CLOSE: if (optval) rack->rc_allow_data_af_clo = 1; else rack->rc_allow_data_af_clo = 0; break; case TCP_RACK_PACE_REDUCE: /* sysctl only now */ error = EINVAL; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } #ifdef NETFLIX_STATS tcp_log_socket_option(tp, sopt->sopt_name, optval, error); #endif INP_WUNLOCK(inp); return (error); } static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error, optval; uint64_t val; /* * Because all our options are either boolean or an int, we can just * pull everything into optval and then unlock and copy. If we ever * add a option that is not a int, then this will have quite an * impact to this routine. 
*/ error = 0; switch (sopt->sopt_name) { case TCP_RACK_PROFILE: /* You cannot retrieve a profile, its write only */ error = EINVAL; break; case TCP_RACK_PACE_TO_FILL: optval = rack->rc_pace_to_cwnd; break; case TCP_RACK_NO_PUSH_AT_MAX: optval = rack->r_ctl.rc_no_push_at_mrtt; break; case TCP_SHARED_CWND_ENABLE: optval = rack->rack_enable_scwnd; break; case TCP_RACK_NONRXT_CFG_RATE: optval = rack->rack_rec_nonrxt_use_cr; break; case TCP_NO_PRR: optval = rack->rack_no_prr; break; case TCP_RACK_DO_DETECTION: optval = rack->do_detection; break; case TCP_RACK_MBUF_QUEUE: /* Now do we use the LRO mbuf-queue feature */ optval = rack->r_mbuf_queue; break; case TCP_TIMELY_DYN_ADJ: optval = rack->rc_gp_dyn_mul; break; case TCP_BBR_IWINTSO: optval = rack->rc_init_win; break; case TCP_RACK_PROP_RATE: optval = rack->r_ctl.rc_prop_rate; break; case TCP_RACK_PROP: /* RACK proportional rate reduction (bool) */ optval = rack->r_ctl.rc_prop_reduce; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ optval = rack->r_ctl.rc_tlp_cwnd_reduce; break; case TCP_RACK_EARLY_RECOV: /* Should recovery happen early (bool) */ optval = rack->r_ctl.rc_early_recovery; break; case TCP_RACK_PACE_REDUCE: /* RACK Hptsi reduction factor (divisor) */ error = EINVAL; break; case TCP_BBR_RACK_INIT_RATE: val = rack->r_ctl.init_rate; /* convert to kbits per sec */ val *= 8; val /= 1000; optval = (uint32_t)val; break; case TCP_RACK_FORCE_MSEG: optval = rack->rc_force_max_seg; break; case TCP_RACK_PACE_MAX_SEG: /* Max segments in a pace */ optval = rack->rc_user_set_max_segs; break; case TCP_RACK_PACE_ALWAYS: /* Use the always pace method */ optval = rack->rc_always_pace; break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ optval = rack->r_ctl.rc_prr_sendalot; break; case TCP_RACK_MIN_TO: /* Minimum time between rack t-o's in ms */ optval = rack->r_ctl.rc_min_to; break; case TCP_RACK_EARLY_SEG: /* If early recovery max segments */ optval = rack->r_ctl.rc_early_recovery_segs; break; case TCP_RACK_REORD_THRESH: /* RACK reorder threshold (shift amount) */ optval = rack->r_ctl.rc_reorder_shift; break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ optval = rack->r_ctl.rc_reorder_fade; break; case TCP_BBR_USE_RACK_RR: /* Do we use the rack cheat for rxt */ optval = rack->use_rack_rr; break; case TCP_RACK_RR_CONF: optval = rack->r_rr_config; break; case TCP_BBR_HDWR_PACE: optval = rack->rack_hdw_pace_ena; break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. srtt+(srtt/N) */ optval = rack->r_ctl.rc_tlp_threshold; break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
rack-rtt + reord + N */ optval = rack->r_ctl.rc_pkt_delay; break; case TCP_RACK_TLP_USE: optval = rack->rack_tlp_threshold_use; break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ error = EINVAL; break; case TCP_RACK_IDLE_REDUCE_HIGH: error = EINVAL; break; case TCP_RACK_PACE_RATE_CA: optval = rack->r_ctl.rc_fixed_pacing_rate_ca; break; case TCP_RACK_PACE_RATE_SS: optval = rack->r_ctl.rc_fixed_pacing_rate_ss; break; case TCP_RACK_PACE_RATE_REC: optval = rack->r_ctl.rc_fixed_pacing_rate_rec; break; case TCP_RACK_GP_INCREASE_SS: optval = rack->r_ctl.rack_per_of_gp_ca; break; case TCP_RACK_GP_INCREASE_CA: optval = rack->r_ctl.rack_per_of_gp_ss; break; case TCP_BBR_RACK_RTT_USE: optval = rack->r_ctl.rc_rate_sample_method; break; case TCP_DELACK: optval = tp->t_delayed_ack; break; case TCP_DATA_AFTER_CLOSE: optval = rack->rc_allow_data_af_clo; break; case TCP_SHARED_CWND_TIME_LIMIT: optval = rack->r_limit_scw; break; default: return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } INP_WUNLOCK(inp); if (error == 0) { error = sooptcopyout(sopt, &optval, sizeof optval); } return (error); } static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) { int32_t error = EINVAL; struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack == NULL) { /* Huh? */ goto out; } if (sopt->sopt_dir == SOPT_SET) { return (rack_set_sockopt(so, sopt, inp, tp, rack)); } else if (sopt->sopt_dir == SOPT_GET) { return (rack_get_sockopt(so, sopt, inp, tp, rack)); } out: INP_WUNLOCK(inp); return (error); } static int rack_pru_options(struct tcpcb *tp, int flags) { if (flags & PRUS_OOB) return (EOPNOTSUPP); return (0); } static struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, .tfb_do_queued_segments = ctf_do_queued_segments, .tfb_do_segment_nounlock = rack_do_segment_nounlock, .tfb_tcp_do_segment = rack_do_segment, .tfb_tcp_ctloutput = rack_ctloutput, .tfb_tcp_fb_init = rack_init, .tfb_tcp_fb_fini = rack_fini, .tfb_tcp_timer_stop_all = rack_stopall, .tfb_tcp_timer_activate = rack_timer_activate, .tfb_tcp_timer_active = rack_timer_active, .tfb_tcp_timer_stop = rack_timer_stop, .tfb_tcp_rexmit_tmr = rack_remxt_tmr, .tfb_tcp_handoff_ok = rack_handoff_ok, .tfb_pru_options = rack_pru_options, }; static const char *rack_stack_names[] = { __XSTRING(STACKNAME), #ifdef STACKALIAS __XSTRING(STACKALIAS), #endif }; static int rack_ctor(void *mem, int32_t size, void *arg, int32_t how) { memset(mem, 0, size); return (0); } static void rack_dtor(void *mem, int32_t size, void *arg) { } static bool rack_mod_inited = false; static int tcp_addrack(module_t mod, int32_t type, void *data) { int32_t err = 0; int num_stacks; switch (type) { case MOD_LOAD: rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", sizeof(struct rack_sendmap), rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", sizeof(struct tcp_rack), rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); sysctl_ctx_init(&rack_sysctl_ctx); rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_net_inet_tcp), OID_AUTO, #ifdef STACKALIAS __XSTRING(STACKALIAS), #else __XSTRING(STACKNAME), #endif CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); if (rack_sysctl_root == NULL) { printf("Failed to add sysctl node\n"); err = EFAULT; goto free_uma; } rack_init_sysctls(); num_stacks = nitems(rack_stack_names); err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, rack_stack_names, 
&num_stacks); if (err) { printf("Failed to register %s stack name for " "%s module\n", rack_stack_names[num_stacks], __XSTRING(MODNAME)); sysctl_ctx_free(&rack_sysctl_ctx); free_uma: uma_zdestroy(rack_zone); uma_zdestroy(rack_pcb_zone); rack_counter_destroy(); printf("Failed to register rack module -- err:%d\n", err); return (err); } tcp_lro_reg_mbufq(); rack_mod_inited = true; break; case MOD_QUIESCE: err = deregister_tcp_functions(&__tcp_rack, true, false); break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_rack, false, true); if (err == EBUSY) break; if (rack_mod_inited) { uma_zdestroy(rack_zone); uma_zdestroy(rack_pcb_zone); sysctl_ctx_free(&rack_sysctl_ctx); rack_counter_destroy(); rack_mod_inited = false; } tcp_lro_dereg_mbufq(); err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t tcp_rack = { .name = __XSTRING(MODNAME), .evhand = tcp_addrack, .priv = 0 }; MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h index c45b7c75cd56..04c5c4940e10 100644 --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -1,482 +1,482 @@ /*- * Copyright (c) 2016-2020 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #ifndef _NETINET_TCP_RACK_H_ #define _NETINET_TCP_RACK_H_ #define RACK_ACKED 0x0001/* The remote endpoint acked this */ -#define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order - not used */ +#define RACK_TO_MIXED 0x0002/* A timeout occurred that mixed the send order - not used */ #define RACK_DEFERRED 0x0004/* We can't use this for RTT calc - not used */ #define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ #define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ #define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */ #define RACK_HAS_FIN 0x0040/* segment is sent with fin */ #define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ #define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */ #define RACK_APP_LIMITED 0x0200/* We went app limited after this send */ #define RACK_WAS_ACKED 0x0400/* a RTO undid the ack, but it already had a rtt calc done */ #define RACK_HAS_SIN 0x0800/* SIN is on this guy */ #define RACK_NUM_OF_RETRANS 3 #define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */ #define RACK_REQ_AVG 4 /* Must be less than 256 */ struct rack_sendmap { uint32_t r_start; /* Sequence number of the segment */ uint32_t r_end; /* End seq, this is 1 beyond actually */ TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time * sent */ uint16_t r_flags; /* Flags as defined above */ uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; uint32_t usec_orig_send; /* time of orginal send in useconds */ uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */ uint32_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */ uint8_t r_dupack; /* Dup ack count */ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ uint8_t r_limit_type; /* is this entry counted against a limit? */ uint8_t r_just_ret : 1, /* After sending, the next pkt was just returned, i.e. limited */ r_one_out_nr : 1, /* Special case 1 outstanding and not in recovery */ r_avail : 6; uint8_t r_resv[36]; }; RB_HEAD(rack_rb_tree_head, rack_sendmap); TAILQ_HEAD(rack_head, rack_sendmap); #define RACK_LIMIT_TYPE_SPLIT 1 /* * We use the rate sample structure to * assist in single sack/ack rate and rtt * calculation. In the future we will expand * this in BBR to do forward rate sample * b/w estimation. */ #define RACK_RTT_EMPTY 0x00000001 /* Nothing yet stored in RTT's */ #define RACK_RTT_VALID 0x00000002 /* We have at least one valid RTT */ struct rack_rtt_sample { uint32_t rs_flags; uint32_t rs_rtt_lowest; uint32_t rs_rtt_highest; uint32_t rs_rtt_cnt; uint32_t rs_us_rtt; int32_t confidence; uint64_t rs_rtt_tot; uint16_t rs_us_rtrcnt; }; #define RACK_LOG_TYPE_ACK 0x01 #define RACK_LOG_TYPE_OUT 0x02 #define RACK_LOG_TYPE_TO 0x03 #define RACK_LOG_TYPE_ALLOC 0x04 #define RACK_LOG_TYPE_FREE 0x05 struct rack_log { union { struct rack_sendmap *rsm; /* For alloc/free */ uint64_t sb_acc;/* For out/ack or t-o */ }; uint32_t th_seq; uint32_t th_ack; uint32_t snd_una; uint32_t snd_nxt; /* th_win for TYPE_ACK */ uint32_t snd_max; uint32_t blk_start[4]; uint32_t blk_end[4]; uint8_t type; uint8_t n_sackblks; uint16_t len; /* Timeout T3=1, TLP=2, RACK=3 */ }; /* * Magic numbers for logging timeout events if the * logging is enabled. 
*/ #define RACK_TO_FRM_TMR 1 #define RACK_TO_FRM_TLP 2 #define RACK_TO_FRM_RACK 3 #define RACK_TO_FRM_KEEP 4 #define RACK_TO_FRM_PERSIST 5 #define RACK_TO_FRM_DELACK 6 struct rack_opts_stats { uint64_t tcp_rack_prop_rate; uint64_t tcp_rack_prop; uint64_t tcp_rack_tlp_reduce; uint64_t tcp_rack_early_recov; uint64_t tcp_rack_pace_always; uint64_t tcp_rack_pace_reduce; uint64_t tcp_rack_max_seg; uint64_t tcp_rack_prr_sendalot; uint64_t tcp_rack_min_to; uint64_t tcp_rack_early_seg; uint64_t tcp_rack_reord_thresh; uint64_t tcp_rack_reord_fade; uint64_t tcp_rack_tlp_thresh; uint64_t tcp_rack_pkt_delay; uint64_t tcp_rack_tlp_inc_var; uint64_t tcp_tlp_use; uint64_t tcp_rack_idle_reduce; uint64_t tcp_rack_idle_reduce_high; uint64_t rack_no_timer_in_hpts; uint64_t tcp_rack_min_pace_seg; uint64_t tcp_rack_pace_rate_ca; uint64_t tcp_rack_rr; uint64_t tcp_rack_do_detection; uint64_t tcp_rack_rrr_no_conf_rate; uint64_t tcp_initial_rate; uint64_t tcp_initial_win; uint64_t tcp_hdwr_pacing; uint64_t tcp_gp_inc_ss; uint64_t tcp_gp_inc_ca; uint64_t tcp_gp_inc_rec; uint64_t tcp_rack_force_max_seg; uint64_t tcp_rack_pace_rate_ss; uint64_t tcp_rack_pace_rate_rec; /* Temp counters for dsack */ uint64_t tcp_sack_path_1; uint64_t tcp_sack_path_2a; uint64_t tcp_sack_path_2b; uint64_t tcp_sack_path_3; uint64_t tcp_sack_path_4; /* non temp counters */ uint64_t tcp_rack_scwnd; uint64_t tcp_rack_noprr; uint64_t tcp_rack_cfg_rate; uint64_t tcp_timely_dyn; uint64_t tcp_rack_mbufq; uint64_t tcp_fillcw; uint64_t tcp_npush; uint64_t tcp_lscwnd; uint64_t tcp_profile; }; /* RTT shrink reasons */ #define RACK_RTTS_INIT 0 #define RACK_RTTS_NEWRTT 1 #define RACK_RTTS_EXITPROBE 2 #define RACK_RTTS_ENTERPROBE 3 #define RACK_RTTS_REACHTARGET 4 #define RACK_RTTS_SEEHBP 5 #define RACK_RTTS_NOBACKOFF 6 #define RACK_RTTS_SAFETY 7 #define RACK_USE_BEG 1 #define RACK_USE_END 2 #define RACK_USE_END_OR_THACK 3 #define TLP_USE_ID 1 /* Internet draft behavior */ #define TLP_USE_TWO_ONE 2 /* Use 2.1 behavior */ #define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */ #define RACK_MIN_BW 8000 /* 64kbps in Bps */ #define MIN_GP_WIN 6 /* We need at least 6 MSS in a GP measurement */ #ifdef _KERNEL #define RACK_OPTS_SIZE (sizeof(struct rack_opts_stats)/sizeof(uint64_t)) extern counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; #define RACK_OPTS_ADD(name, amm) counter_u64_add(rack_opts_arry[(offsetof(struct rack_opts_stats, name)/sizeof(uint64_t))], (amm)) #define RACK_OPTS_INC(name) RACK_OPTS_ADD(name, 1) #endif /* * As we get each SACK we wade through the * rc_map and mark off what is acked. * We also increment rc_sacked as well. * * We also pay attention to missing entries * based on the time and possibly mark them * for retransmit. If we do and we are not already * in recovery we enter recovery. In doing * so we claer prr_delivered/holes_rxt and prr_sent_dur_rec. * We also setup rc_next/rc_snd_nxt/rc_send_end so * we will know where to send from. When not in * recovery rc_next will be NULL and rc_snd_nxt should * equal snd_max. * * Whenever we retransmit from recovery we increment * rc_holes_rxt as we retran a block and mark it as retransmitted * with the time it was sent. During non-recovery sending we * add to our map and note the time down of any send expanding * the rc_map at the tail and moving rc_snd_nxt up with snd_max. * * In recovery during SACK/ACK processing if a chunk has * been retransmitted and it is now acked, we decrement rc_holes_rxt. 
* When we retransmit from the scoreboard we use * rc_next and rc_snd_nxt/rc_send_end to help us * find what needs to be retran. * * To calculate pipe we simply take (snd_max - snd_una) + rc_holes_rxt * This gets us the effect of RFC6675 pipe, counting twice for * bytes retransmitted. */ #define TT_RACK_FR_TMR 0x2000 /* * Locking for the rack control block. * a) Locked by INP_WLOCK * b) Locked by the hpts-mutex * */ #define RACK_GP_HIST 4 /* How much goodput history do we maintain? */ struct rack_control { /* Second cache line 0x40 from tcp_rack */ struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */ struct rack_head rc_tmap; /* List in transmit order Lock(a) */ struct rack_sendmap *rc_tlpsend; /* Remembered place for * tlp_sending Lock(a) */ struct rack_sendmap *rc_resend; /* something we have been asked to * resend */ uint32_t input_pkt; uint32_t saved_input_pkt; uint32_t rc_hpts_flags; uint32_t rc_fixed_pacing_rate_ca; uint32_t rc_fixed_pacing_rate_rec; uint32_t rc_fixed_pacing_rate_ss; uint32_t cwnd_to_use; /* The cwnd in use */ uint32_t rc_timer_exp; /* If a timer ticks of expiry */ uint32_t rc_rack_min_rtt; /* lowest RTT seen Lock(a) */ uint32_t rc_rack_largest_cwnd; /* Largest CWND we have seen Lock(a) */ /* Third Cache line 0x80 */ struct rack_head rc_free; /* Allocation array */ uint32_t rc_time_last_sent; /* Time we last sent some data and * logged it Lock(a). */ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ uint32_t rc_tlp_new_data; /* we need to send new-data on a TLP * Lock(a) */ uint32_t rc_prr_out; /* bytes sent during recovery Lock(a) */ uint32_t rc_prr_recovery_fs; /* recovery fs point Lock(a) */ uint32_t rc_prr_sndcnt; /* Prr sndcnt Lock(a) */ uint32_t rc_sacked; /* Tot sacked on scoreboard Lock(a) */ uint32_t xxx_rc_last_tlp_seq; /* Last tlp sequence Lock(a) */ uint32_t rc_prr_delivered; /* during recovery prr var Lock(a) */ uint16_t rc_tlp_cnt_out; /* count of times we have sent a TLP without new data */ uint16_t xxx_rc_tlp_seg_send_cnt; /* Number of times we have TLP sent * rc_last_tlp_seq Lock(a) */ uint32_t rc_loss_count; /* How many bytes have been retransmitted * Lock(a) */ uint32_t rc_reorder_fade; /* Socket option value Lock(a) */ /* Forth cache line 0xc0 */ /* Times */ uint32_t rc_rack_tmit_time; /* Rack transmit time Lock(a) */ uint32_t rc_holes_rxt; /* Tot retraned from scoreboard Lock(a) */ /* Variables to track bad retransmits and recover */ uint32_t rc_rsm_start; /* RSM seq number we retransmitted Lock(a) */ uint32_t rc_cwnd_at; /* cwnd at the retransmit Lock(a) */ uint32_t rc_ssthresh_at;/* ssthresh at the retransmit Lock(a) */ uint32_t rc_num_maps_alloced; /* Number of map blocks (sacks) we * have allocated */ uint32_t rc_rcvtime; /* When we last received data */ uint32_t rc_num_split_allocs; /* num split map entries allocated */ uint32_t rc_last_output_to; uint32_t rc_went_idle_time; struct rack_sendmap *rc_sacklast; /* sack remembered place * Lock(a) */ struct rack_sendmap *rc_rsm_at_retran; /* Debug variable kept for * cache line alignment * Lock(a) */ struct rack_sendmap *rc_first_appl; /* Pointer to first app limited */ struct rack_sendmap *rc_end_appl; /* Pointer to last app limited */ /* Cache line split 0x100 */ struct sack_filter rack_sf; /* Cache line split 0x140 */ /* Flags for various things */ uint32_t last_pacing_time; uint32_t rc_pace_max_segs; uint32_t rc_pace_min_segs; uint32_t rc_app_limited_cnt; uint16_t rack_per_of_gp_ss; /* 100 = 100%, so from 65536 = 655 x bw */ uint16_t rack_per_of_gp_ca; /* 100 = 
100%, so from 65536 = 655 x bw */ uint16_t rack_per_of_gp_rec; /* 100 = 100%, so from 65536 = 655 x bw, 0=off */ uint16_t rack_per_of_gp_probertt; /* 100 = 100%, so from 65536 = 655 x bw, 0=off */ uint32_t rc_high_rwnd; uint32_t ack_count; uint32_t sack_count; uint32_t sack_noextra_move; uint32_t sack_moved_extra; struct rack_rtt_sample rack_rs; const struct tcp_hwrate_limit_table *crte; uint32_t rc_agg_early; uint32_t rc_agg_delayed; uint32_t rc_tlp_rxt_last_time; uint32_t rc_saved_cwnd; uint32_t rc_gp_output_ts; uint32_t rc_gp_cumack_ts; struct timeval act_rcv_time; struct timeval rc_last_time_decay; /* SAD time decay happened here */ uint64_t gp_bw; uint64_t init_rate; #ifdef NETFLIX_SHARED_CWND struct shared_cwnd *rc_scw; #endif uint64_t last_gp_comp_bw; uint64_t last_max_bw; /* Our calculated max b/w last */ struct time_filter_small rc_gp_min_rtt; int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */ uint32_t rc_gp_srtt; /* Current GP srtt */ uint32_t rc_prev_gp_srtt; /* Previous RTT */ uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */ uint32_t rc_loss_at_start; /* At measurement window where was our lost value */ uint32_t forced_ack_ts; uint32_t rc_lower_rtt_us_cts; /* Time our GP rtt was last lowered */ uint32_t rc_time_probertt_entered; uint32_t rc_time_probertt_starts; uint32_t rc_lowest_us_rtt; uint32_t rc_highest_us_rtt; uint32_t rc_last_us_rtt; uint32_t rc_time_of_last_probertt; uint32_t rc_target_probertt_flight; uint32_t rc_probertt_sndmax_atexit; /* Highest sent to in probe-rtt */ uint32_t rc_gp_lowrtt; /* Lowest rtt seen during GPUT measurement */ uint32_t rc_gp_high_rwnd; /* Highest rwnd seen during GPUT measurement */ int32_t rc_scw_index; uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */ uint8_t num_avg; /* average count before we go to normal decay */ uint8_t rc_prop_rate; /* Socket option value Lock(a) */ uint8_t rc_prop_reduce; /* Socket option value Lock(a) */ uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ uint8_t rc_early_recovery; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ uint8_t rc_min_to; /* Socket option value Lock(a) */ uint8_t rc_rate_sample_method; uint8_t rc_gp_hist_idx; }; #define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */ #define RACK_MINRTT_FILTER_TIM 10 /* Seconds */ #ifdef _KERNEL struct tcp_rack { /* First cache line 0x00 */ TAILQ_ENTRY(tcp_rack) r_hpts; /* hptsi queue next Lock(b) */ int32_t(*r_substate) (struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, struct tcpopt *, int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ struct inpcb *rc_inp; /* The inpcb Lock(a) */ uint32_t rc_free_cnt; /* Number of free entries on the rc_free list * Lock(a) */ uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */ uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */ rtt_limit_mul : 4, /* muliply this by low rtt */ r_limit_scw : 1, r_avail_bits : 10; /* Available */ uint16_t rc_user_set_max_segs; /* Socket option value Lock(a) */ uint16_t forced_ack : 1, rc_gp_incr : 1, rc_gp_bwred : 1, rc_gp_timely_inc_cnt : 3, rc_gp_timely_dec_cnt : 3, rc_not_backing_off: 1, rc_highly_buffered: 1, /* The path is highly buffered */ rc_dragged_bottom: 1, rc_dack_mode : 
1, /* Mac O/S emulation of d-ack */ rc_dack_toggle : 1, /* For Mac O/S emulation of d-ack */ pacing_longer_than_rtt : 1, rc_gp_filled : 1; uint8_t r_state; /* Current rack state Lock(a) */ uint8_t rc_tmr_stopped : 7, t_timers_stopped : 1; uint8_t rc_enobuf : 7, /* count of enobufs on connection provides */ rc_on_min_to : 1; uint8_t r_timer_override : 1, /* hpts override Lock(a) */ r_is_v6 : 1, /* V6 pcb Lock(a) */ rc_in_persist : 1, rc_tlp_in_progress : 1, rc_always_pace : 1, /* Socket option value Lock(a) */ rc_pace_to_cwnd : 1, rc_pace_fill_if_rttin_range : 1, xxx_avail_bits : 1; uint8_t app_limited_needs_set : 1, use_fixed_rate : 1, rc_has_collapsed : 1, r_rep_attack : 1, r_rep_reverse : 1, rack_hdrw_pacing : 1, /* We are doing Hardware pacing */ rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? */ rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */ uint8_t rack_tlp_threshold_use : 3, /* only 1, 2 and 3 used so far */ rack_rec_nonrxt_use_cr : 1, rack_enable_scwnd : 1, rack_attempted_scwnd : 1, rack_no_prr : 1, rack_scwnd_is_idle : 1; uint8_t rc_allow_data_af_clo: 1, delayed_ack : 1, set_pacing_done_a_iw : 1, use_rack_rr : 1, alloc_limit_reported : 1, sack_attack_disable : 1, do_detection : 1, rc_force_max_seg : 1; uint8_t rack_cwnd_limited : 1, r_early : 1, r_late : 1, r_running_early : 1, r_running_late : 1, r_wanted_output: 1, r_rr_config : 2; uint16_t rc_init_win : 8, rc_gp_rtt_set : 1, rc_gp_dyn_mul : 1, rc_gp_saw_rec : 1, rc_gp_saw_ca : 1, rc_gp_saw_ss : 1, rc_gp_no_rec_chg : 1, in_probe_rtt : 1, measure_saw_probe_rtt : 1; /* Cache line 2 0x40 */ struct rack_control r_ctl; } __aligned(CACHE_LINE_SIZE); #endif #endif diff --git a/tools/test/stress2/misc/nullfs17.sh b/tools/test/stress2/misc/nullfs17.sh index 121225f67c00..c7e447117969 100755 --- a/tools/test/stress2/misc/nullfs17.sh +++ b/tools/test/stress2/misc/nullfs17.sh @@ -1,84 +1,84 @@ #!/bin/sh # # Copyright (c) 2016 EMC Corp. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # Variation of nullfs.sh # "panic: LK_RETRY set with incompatible flags (0x202400) or -# an error occured (11)" seen. +# an error occurred (11)" seen. # https://people.freebsd.org/~pho/stress/log/matt001.txt [ `id -u ` -ne 0 ] && echo "Must be root!" && exit 1 . 
../default.cfg mounts=15 # Number of parallel scripts : ${nullfs_srcdir:=/tmp} : ${nullfs_dstdir:=$mntpoint} CONT=/tmp/nullfs17.continue if [ $# -eq 0 ]; then for i in `jot $mounts`; do [ ! -d ${nullfs_dstdir}$i ] && mkdir ${nullfs_dstdir}$i mount | grep -q " ${nullfs_dstdir}$i " && umount ${nullfs_dstdir}$i done # start the parallel tests for i in `jot $mounts`; do ./$0 $i & ./$0 find $i & done wait for i in `jot $mounts`; do umount ${nullfs_dstdir}$i > /dev/null 2>&1 done exit 0 else if [ $1 = find ]; then while [ -f $CONT ]; do find ${nullfs_dstdir}$2 -type f -maxdepth 2 -ls > \ /dev/null 2>&1 done else # The test: Parallel mount and unmounts touch $CONT start=`date '+%s'` while [ `date '+%s'` -lt $((start + 300)) ]; do m=$1 mount_nullfs $nullfs_srcdir ${nullfs_dstdir}$m > \ /dev/null 2>&1 opt=`[ $(( m % 2 )) -eq 0 ] && echo -f` while mount | grep -q ${nullfs_dstdir}$m; do umount $opt ${nullfs_dstdir}$m > \ /dev/null 2>&1 done done rm -f $CONT fi fi diff --git a/usr.bin/ar/write.c b/usr.bin/ar/write.c index 24bac2f25b92..6ca20a372fdb 100644 --- a/usr.bin/ar/write.c +++ b/usr.bin/ar/write.c @@ -1,952 +1,952 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Kai Wang * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "ar.h" #define _ARMAG_LEN 8 /* length of ar magic string */ #define _ARHDR_LEN 60 /* length of ar header */ #define _INIT_AS_CAP 128 /* initial archive string table size */ #define _INIT_SYMOFF_CAP (256*(sizeof(uint64_t))) /* initial so table size */ #define _INIT_SYMNAME_CAP 1024 /* initial sn table size */ #define _MAXNAMELEN_SVR4 15 /* max member name length in svr4 variant */ #define _TRUNCATE_LEN 15 /* number of bytes to keep for member name */ static void add_to_ar_str_table(struct bsdar *bsdar, const char *name); static void add_to_ar_sym_table(struct bsdar *bsdar, const char *name); static struct ar_obj *create_obj_from_file(struct bsdar *bsdar, const char *name, time_t mtime); static void create_symtab_entry(struct bsdar *bsdar, void *maddr, size_t size); static void free_obj(struct bsdar *bsdar, struct ar_obj *obj); static void insert_obj(struct bsdar *bsdar, struct ar_obj *obj, struct ar_obj *pos); static void prefault_buffer(const char *buf, size_t s); static void read_objs(struct bsdar *bsdar, const char *archive, int checkargv); static void write_archive(struct bsdar *bsdar, char mode); static void write_cleanup(struct bsdar *bsdar); static void write_data(struct bsdar *bsdar, struct archive *a, const void *buf, size_t s); static void write_objs(struct bsdar *bsdar); void ar_mode_d(struct bsdar *bsdar) { write_archive(bsdar, 'd'); } void ar_mode_m(struct bsdar *bsdar) { write_archive(bsdar, 'm'); } void ar_mode_q(struct bsdar *bsdar) { write_archive(bsdar, 'q'); } void ar_mode_r(struct bsdar *bsdar) { write_archive(bsdar, 'r'); } void ar_mode_s(struct bsdar *bsdar) { write_archive(bsdar, 's'); } void ar_mode_A(struct bsdar *bsdar) { write_archive(bsdar, 'A'); } /* * Create object from file, return created obj upon success, or NULL * when an error occurs or the member is not newer than existing * one while -u is specified. */ static struct ar_obj * create_obj_from_file(struct bsdar *bsdar, const char *name, time_t mtime) { struct ar_obj *obj; struct stat sb; const char *bname; char *tmpname; if (name == NULL) return (NULL); obj = malloc(sizeof(struct ar_obj)); if (obj == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "malloc failed"); if ((obj->fd = open(name, O_RDONLY, 0)) < 0) { bsdar_warnc(bsdar, errno, "can't open file: %s", name); free(obj); return (NULL); } tmpname = strdup(name); if (tmpname == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "strdup failed"); if ((bname = basename(tmpname)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "basename failed"); if (bsdar->options & AR_TR && strlen(bname) > _TRUNCATE_LEN) { if ((obj->name = malloc(_TRUNCATE_LEN + 1)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "malloc failed"); (void)strncpy(obj->name, bname, _TRUNCATE_LEN); obj->name[_TRUNCATE_LEN] = '\0'; } else if ((obj->name = strdup(bname)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "strdup failed"); free(tmpname); if (fstat(obj->fd, &sb) < 0) { bsdar_warnc(bsdar, errno, "can't fstat file: %s", obj->name); goto giveup; } if (!S_ISREG(sb.st_mode)) { bsdar_warnc(bsdar, 0, "%s is not an ordinary file", obj->name); goto giveup; } /* * When option '-u' is specified and member is not newer than the * existing one, the replace will not happen. However, if mtime == 0, * which indicates that this is to "replace a nonexistent member", * the replace will proceed regardless of '-u'.
*/ if (mtime != 0 && bsdar->options & AR_U && sb.st_mtime <= mtime) goto giveup; /* * When option '-D' is specified, mtime and UID / GID from the file - * will be replaced with 0, and file mode with 644. This ensures that + * will be replaced with 0, and file mode with 644. This ensures that * checksums will match for two archives containing the exact same * files. */ if (bsdar->options & AR_D) { obj->uid = 0; obj->gid = 0; obj->mtime = 0; obj->md = S_IFREG | 0644; } else { obj->uid = sb.st_uid; obj->gid = sb.st_gid; obj->mtime = sb.st_mtime; obj->md = sb.st_mode; } obj->size = sb.st_size; obj->dev = sb.st_dev; obj->ino = sb.st_ino; if (obj->size == 0) { obj->maddr = NULL; return (obj); } if ((obj->maddr = mmap(NULL, obj->size, PROT_READ, MAP_PRIVATE, obj->fd, (off_t)0)) == MAP_FAILED) { bsdar_warnc(bsdar, errno, "can't mmap file: %s", obj->name); goto giveup; } if (close(obj->fd) < 0) bsdar_errc(bsdar, EX_SOFTWARE, errno, "close failed: %s", obj->name); return (obj); giveup: if (close(obj->fd) < 0) bsdar_errc(bsdar, EX_SOFTWARE, errno, "close failed: %s", obj->name); free(obj->name); free(obj); return (NULL); } /* * Free object itself and its associated allocations. */ static void free_obj(struct bsdar *bsdar, struct ar_obj *obj) { if (obj->fd == -1) free(obj->maddr); else if (obj->maddr != NULL && munmap(obj->maddr, obj->size)) bsdar_warnc(bsdar, errno, "can't munmap file: %s", obj->name); free(obj->name); free(obj); } /* * Insert obj to the tail, or before/after the pos obj. */ static void insert_obj(struct bsdar *bsdar, struct ar_obj *obj, struct ar_obj *pos) { if (obj == NULL) bsdar_errc(bsdar, EX_SOFTWARE, 0, "try to insert a null obj"); if (pos == NULL || obj == pos) /* * If the object to move happens to be the position obj, * or if there is not a pos obj, move it to tail. */ goto tail; if (bsdar->options & AR_B) { TAILQ_INSERT_BEFORE(pos, obj, objs); return; } if (bsdar->options & AR_A) { TAILQ_INSERT_AFTER(&bsdar->v_obj, pos, obj, objs); return; } tail: TAILQ_INSERT_TAIL(&bsdar->v_obj, obj, objs); } /* * Read objects from archive into v_obj list. Note that checkargv is * set when read_objs is used to read objects from the target of * ADDLIB command (ar script mode), in this case argv array possibly * specifies the members ADDLIB want. */ static void read_objs(struct bsdar *bsdar, const char *archive, int checkargv) { struct archive *a; struct archive_entry *entry; struct ar_obj *obj; const char *name; const char *bname; char *buff; char **av; size_t size; int i, r, find; if ((a = archive_read_new()) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, 0, "archive_read_new failed"); archive_read_support_format_ar(a); AC(archive_read_open_filename(a, archive, DEF_BLKSZ)); for (;;) { r = archive_read_next_header(a, &entry); if (r == ARCHIVE_FATAL) bsdar_errc(bsdar, EX_DATAERR, archive_errno(a), "%s", archive_error_string(a)); if (r == ARCHIVE_EOF) break; if (r == ARCHIVE_WARN || r == ARCHIVE_RETRY) bsdar_warnc(bsdar, archive_errno(a), "%s", archive_error_string(a)); if (r == ARCHIVE_RETRY) { bsdar_warnc(bsdar, 0, "Retrying..."); continue; } name = archive_entry_pathname(entry); /* * skip pseudo members. */ if (strcmp(name, "/") == 0 || strcmp(name, "//") == 0) continue; /* * If checkargv is set, only read those members specified * in argv. 
*/ if (checkargv && bsdar->argc > 0) { find = 0; for(i = 0; i < bsdar->argc; i++) { av = &bsdar->argv[i]; if (*av == NULL) continue; if ((bname = basename(*av)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "basename failed"); if (strcmp(bname, name) != 0) continue; *av = NULL; find = 1; break; } if (!find) continue; } size = archive_entry_size(entry); if (size > 0) { if ((buff = malloc(size)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "malloc failed"); if (archive_read_data(a, buff, size) != (ssize_t)size) { bsdar_warnc(bsdar, archive_errno(a), "%s", archive_error_string(a)); free(buff); continue; } } else buff = NULL; obj = malloc(sizeof(struct ar_obj)); if (obj == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "malloc failed"); obj->maddr = buff; if ((obj->name = strdup(name)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "strdup failed"); obj->size = size; obj->uid = archive_entry_uid(entry); obj->gid = archive_entry_gid(entry); obj->md = archive_entry_mode(entry); obj->mtime = archive_entry_mtime(entry); obj->dev = 0; obj->ino = 0; /* * Objects from archive have obj->fd set to -1, * for the ease of cleaning up. */ obj->fd = -1; TAILQ_INSERT_TAIL(&bsdar->v_obj, obj, objs); } AC(archive_read_close(a)); AC(archive_read_free(a)); } /* * Determine the constitution of resulting archive. */ static void write_archive(struct bsdar *bsdar, char mode) { struct ar_obj *nobj, *obj, *obj_temp, *pos; struct stat sb; const char *bname; char **av; int i; TAILQ_INIT(&bsdar->v_obj); nobj = NULL; pos = NULL; memset(&sb, 0, sizeof(sb)); /* * Test if the specified archive exists, to figure out * whether we are creating one here. */ if (stat(bsdar->filename, &sb) != 0) { if (errno != ENOENT) { bsdar_warnc(bsdar, 0, "stat %s failed", bsdar->filename); return; } /* We do not create archive in mode 'd', 'm' and 's'. */ if (mode != 'r' && mode != 'q') { bsdar_warnc(bsdar, 0, "%s: no such file", bsdar->filename); return; } /* Issue a warning if -c is not specified when creating. */ if (!(bsdar->options & AR_C)) bsdar_warnc(bsdar, 0, "creating %s", bsdar->filename); goto new_archive; } /* * First read members from existing archive. */ read_objs(bsdar, bsdar->filename, 0); /* * For mode 's', no member will be moved, deleted or replaced. */ if (mode == 's') goto write_objs; /* * For mode 'q', we don't need to adjust existing members either. * Also, -a, -b and -i are ignored in this mode. New members are * always inserted at tail. */ if (mode == 'q') goto new_archive; /* * Mode 'A' adds the contents of another archive to the tail of * current archive. Note that mode 'A' is a special mode for the * ADDLIB command of the ar script mode. Currently there is no * access to this function from the ar command line mode. */ if (mode == 'A') { /* * Read objects from the target archive of ADDLIB command. * If there are members specified in argv, read those members * only, otherwise the entire archive will be read. */ read_objs(bsdar, bsdar->addlib, 1); goto write_objs; } /* * Try to find the position member specified by user. */ if (bsdar->options & AR_A || bsdar->options & AR_B) { TAILQ_FOREACH(obj, &bsdar->v_obj, objs) { if (strcmp(obj->name, bsdar->posarg) == 0) { pos = obj; break; } } /* * If can't find `pos' specified by user, * silently insert objects at tail. 
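The insert-before/insert-after/tail fallback used by insert_obj() above is built on the sys/queue.h TAILQ macros. A minimal, self-contained sketch of the same pattern, with illustrative names that are not taken from ar:

#include <sys/queue.h>
#include <stdio.h>

struct node {
	int			val;
	TAILQ_ENTRY(node)	link;
};
TAILQ_HEAD(nodelist, node);

/* Insert n before or after pos, or at the tail when there is no pos. */
static void
place(struct nodelist *head, struct node *n, struct node *pos, int before)
{
	if (pos == NULL)
		TAILQ_INSERT_TAIL(head, n, link);
	else if (before)
		TAILQ_INSERT_BEFORE(pos, n, link);
	else
		TAILQ_INSERT_AFTER(head, pos, n, link);
}

int
main(void)
{
	struct nodelist head = TAILQ_HEAD_INITIALIZER(head);
	struct node a = { .val = 1 }, b = { .val = 2 }, c = { .val = 3 };
	struct node *np;

	place(&head, &a, NULL, 0);	/* list: 1 */
	place(&head, &b, &a, 1);	/* list: 2 1 */
	place(&head, &c, &a, 0);	/* list: 2 1 3 */
	TAILQ_FOREACH(np, &head, link)
		printf("%d ", np->val);
	printf("\n");
	return (0);
}

Note that TAILQ_INSERT_BEFORE does not take the list head, because each entry keeps a back pointer; that is why insert_obj() only needs the head for the after and tail cases.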
*/ if (pos == NULL) bsdar->options &= ~(AR_A | AR_B); } for (i = 0; i < bsdar->argc; i++) { av = &bsdar->argv[i]; TAILQ_FOREACH_SAFE(obj, &bsdar->v_obj, objs, obj_temp) { if ((bname = basename(*av)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "basename failed"); if (bsdar->options & AR_TR) { if (strncmp(bname, obj->name, _TRUNCATE_LEN)) continue; } else if (strcmp(bname, obj->name) != 0) continue; if (mode == 'r') { /* * if the new member is not qualified * to replace the old one, skip it. */ nobj = create_obj_from_file(bsdar, *av, obj->mtime); if (nobj == NULL) goto skip_obj; } if (bsdar->options & AR_V) (void)fprintf(stdout, "%c - %s\n", mode, *av); TAILQ_REMOVE(&bsdar->v_obj, obj, objs); if (mode == 'd' || mode == 'r') free_obj(bsdar, obj); if (mode == 'm') insert_obj(bsdar, obj, pos); if (mode == 'r') insert_obj(bsdar, nobj, pos); skip_obj: *av = NULL; break; } } new_archive: /* * When operating in mode 'r', directly add those user specified * objects which do not exist in current archive. When operating * in mode 'q', all objects specified in command line args are * appended to the archive, without comparing with existing ones. */ for (i = 0; i < bsdar->argc; i++) { av = &bsdar->argv[i]; if (*av != NULL && (mode == 'r' || mode == 'q')) { nobj = create_obj_from_file(bsdar, *av, 0); if (nobj != NULL) insert_obj(bsdar, nobj, pos); if (bsdar->options & AR_V && nobj != NULL) (void)fprintf(stdout, "a - %s\n", *av); *av = NULL; } } write_objs: write_objs(bsdar); write_cleanup(bsdar); } /* * Memory cleaning up. */ static void write_cleanup(struct bsdar *bsdar) { struct ar_obj *obj, *obj_temp; TAILQ_FOREACH_SAFE(obj, &bsdar->v_obj, objs, obj_temp) { TAILQ_REMOVE(&bsdar->v_obj, obj, objs); free_obj(bsdar, obj); } free(bsdar->as); free(bsdar->s_so); free(bsdar->s_sn); bsdar->as = NULL; bsdar->s_so = NULL; bsdar->s_so_max = 0; bsdar->s_sn = NULL; } /* * Fault in the buffer prior to writing as a workaround for poor performance * due to interaction with kernel fs deadlock avoidance code. See the comment * above vn_io_fault_doio() in sys/kern/vfs_vnops.c for details of the issue. */ static void prefault_buffer(const char *buf, size_t s) { volatile const char *p; size_t page_size; if (s == 0) return; page_size = sysconf(_SC_PAGESIZE); for (p = buf; p < buf + s; p += page_size) *p; /* * Ensure we touch the last page as well, in case the buffer is not * page-aligned. */ *(volatile const char *)(buf + s - 1); } /* * Wrapper for archive_write_data(). */ static void write_data(struct bsdar *bsdar, struct archive *a, const void *buf, size_t s) { ssize_t written; prefault_buffer(buf, s); while (s > 0) { written = archive_write_data(a, buf, s); if (written < 0) bsdar_errc(bsdar, EX_SOFTWARE, archive_errno(a), "%s", archive_error_string(a)); buf = (const char *)buf + written; s -= written; } } /* * Write the resulting archive members. */ static void write_objs(struct bsdar *bsdar) { struct ar_obj *obj; struct archive *a; struct archive_entry *entry; size_t s_sz; /* size of archive symbol table. */ size_t pm_sz; /* size of pseudo members */ size_t w_sz; /* size of words in symbol table */ size_t i; uint64_t nr; uint32_t nr32; if (elf_version(EV_CURRENT) == EV_NONE) bsdar_errc(bsdar, EX_SOFTWARE, 0, "ELF library initialization failed: %s", elf_errmsg(-1)); bsdar->rela_off = 0; /* Create archive symbol table and archive string table, if need. 
*/ TAILQ_FOREACH(obj, &bsdar->v_obj, objs) { if (!(bsdar->options & AR_SS) && obj->maddr != NULL) create_symtab_entry(bsdar, obj->maddr, obj->size); if (strlen(obj->name) > _MAXNAMELEN_SVR4) add_to_ar_str_table(bsdar, obj->name); bsdar->rela_off += _ARHDR_LEN + obj->size + obj->size % 2; } /* * Pad the symbol name string table. It is treated specially because * symbol name table should be padded by a '\0', not the common '\n' * for other members. The size of sn table includes the pad bit. */ if (bsdar->s_cnt != 0 && bsdar->s_sn_sz % 2 != 0) bsdar->s_sn[bsdar->s_sn_sz++] = '\0'; /* * Archive string table is padded by a "\n" as the normal members. * The difference is that the size of archive string table counts - * in the pad bit, while normal members' size fileds do not. + * in the pad bit, while normal members' size fields do not. */ if (bsdar->as != NULL && bsdar->as_sz % 2 != 0) bsdar->as[bsdar->as_sz++] = '\n'; /* * If there is a symbol table, calculate the size of pseudo members, * convert previously stored relative offsets to absolute ones, and * then make them Big Endian. * * absolute_offset = htobe32(relative_offset + size_of_pseudo_members) */ w_sz = sizeof(uint32_t); if (bsdar->s_cnt != 0) { s_sz = (bsdar->s_cnt + 1) * sizeof(uint32_t) + bsdar->s_sn_sz; pm_sz = _ARMAG_LEN + (_ARHDR_LEN + s_sz); if (bsdar->as != NULL) pm_sz += _ARHDR_LEN + bsdar->as_sz; /* Use the 64-bit word size format if necessary. */ if (bsdar->s_so_max > UINT32_MAX - pm_sz) { w_sz = sizeof(uint64_t); pm_sz -= s_sz; s_sz = (bsdar->s_cnt + 1) * sizeof(uint64_t) + bsdar->s_sn_sz; pm_sz += s_sz; /* Convert to big-endian. */ for (i = 0; i < bsdar->s_cnt; i++) bsdar->s_so[i] = htobe64(bsdar->s_so[i] + pm_sz); } else { /* * Convert to big-endian and shuffle in-place to * the front of the allocation. XXX UB */ for (i = 0; i < bsdar->s_cnt; i++) ((uint32_t *)(bsdar->s_so))[i] = htobe32(bsdar->s_so[i] + pm_sz); } } if ((a = archive_write_new()) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, 0, "archive_write_new failed"); archive_write_set_format_ar_svr4(a); AC(archive_write_open_filename(a, bsdar->filename)); /* * write the archive symbol table, if there is one. * If options -s is explicitly specified or we are invoked * as ranlib, write the symbol table even if it is empty. */ if ((bsdar->s_cnt != 0 && !(bsdar->options & AR_SS)) || bsdar->options & AR_S) { entry = archive_entry_new(); if (entry == NULL) bsdar_errc(bsdar, EX_SOFTWARE, 0, "archive_entry_new failed"); if (w_sz == sizeof(uint64_t)) archive_entry_copy_pathname(entry, "/SYM64/"); else archive_entry_copy_pathname(entry, "/"); if ((bsdar->options & AR_D) == 0) archive_entry_set_mtime(entry, time(NULL), 0); archive_entry_set_size(entry, (bsdar->s_cnt + 1) * w_sz + bsdar->s_sn_sz); AC(archive_write_header(a, entry)); if (w_sz == sizeof(uint64_t)) { nr = htobe64(bsdar->s_cnt); write_data(bsdar, a, &nr, sizeof(nr)); } else { nr32 = htobe32((uint32_t)bsdar->s_cnt); write_data(bsdar, a, &nr32, sizeof(nr32)); } write_data(bsdar, a, bsdar->s_so, w_sz * bsdar->s_cnt); write_data(bsdar, a, bsdar->s_sn, bsdar->s_sn_sz); archive_entry_free(entry); } /* write the archive string table, if any. */ if (bsdar->as != NULL) { entry = archive_entry_new(); if (entry == NULL) bsdar_errc(bsdar, EX_SOFTWARE, 0, "archive_entry_new failed"); archive_entry_copy_pathname(entry, "//"); archive_entry_set_size(entry, bsdar->as_sz); AC(archive_write_header(a, entry)); write_data(bsdar, a, bsdar->as, bsdar->as_sz); archive_entry_free(entry); } /* write normal members. 
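The relative-to-absolute offset conversion described in the comment above is easier to follow with concrete numbers. A throwaway sketch of the 32-bit case: the three symbols, their relative offsets and the 20-byte name table are made-up values, 8 and 60 are _ARMAG_LEN and _ARHDR_LEN, and the optional long-name string table is ignored here:

#include <sys/endian.h>	/* htobe32(); <endian.h> on Linux */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Offsets of three members, relative to the first real member. */
	uint32_t rel_off[3] = { 0, 120, 240 };
	size_t s_cnt = 3, s_sn_sz = 20;

	/* Symbol-table pseudo member: count word, one word per symbol, names. */
	size_t s_sz = (s_cnt + 1) * sizeof(uint32_t) + s_sn_sz;
	/* Bytes in front of the first real member: magic, member header, table. */
	size_t pm_sz = 8 + 60 + s_sz;

	for (size_t i = 0; i < s_cnt; i++)
		printf("0x%08x\n", (unsigned)htobe32(rel_off[i] + pm_sz));
	return (0);
}

With these numbers pm_sz is 104, so the stored words are the big-endian encodings of 104, 224 and 344.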
*/ TAILQ_FOREACH(obj, &bsdar->v_obj, objs) { entry = archive_entry_new(); if (entry == NULL) bsdar_errc(bsdar, EX_SOFTWARE, 0, "archive_entry_new failed"); archive_entry_copy_pathname(entry, obj->name); archive_entry_set_uid(entry, obj->uid); archive_entry_set_gid(entry, obj->gid); archive_entry_set_mode(entry, obj->md); archive_entry_set_size(entry, obj->size); archive_entry_set_mtime(entry, obj->mtime, 0); archive_entry_set_dev(entry, obj->dev); archive_entry_set_ino(entry, obj->ino); archive_entry_set_filetype(entry, AE_IFREG); AC(archive_write_header(a, entry)); write_data(bsdar, a, obj->maddr, obj->size); archive_entry_free(entry); } AC(archive_write_close(a)); AC(archive_write_free(a)); } /* * Extract global symbols from ELF binary members. */ static void create_symtab_entry(struct bsdar *bsdar, void *maddr, size_t size) { Elf *e; Elf_Scn *scn; GElf_Shdr shdr; GElf_Sym sym; Elf_Data *data; char *name; size_t n, shstrndx; int elferr, tabndx, len, i; if ((e = elf_memory(maddr, size)) == NULL) { bsdar_warnc(bsdar, 0, "elf_memory() failed: %s", elf_errmsg(-1)); return; } if (elf_kind(e) != ELF_K_ELF) { /* Silently ignore non-elf member. */ elf_end(e); return; } if (elf_getshstrndx(e, &shstrndx) == 0) { bsdar_warnc(bsdar, EX_SOFTWARE, 0, "elf_getshstrndx failed: %s", elf_errmsg(-1)); elf_end(e); return; } tabndx = -1; scn = NULL; while ((scn = elf_nextscn(e, scn)) != NULL) { if (gelf_getshdr(scn, &shdr) != &shdr) { bsdar_warnc(bsdar, 0, "elf_getshdr failed: %s", elf_errmsg(-1)); continue; } if ((name = elf_strptr(e, shstrndx, shdr.sh_name)) == NULL) { bsdar_warnc(bsdar, 0, "elf_strptr failed: %s", elf_errmsg(-1)); continue; } if (strcmp(name, ".strtab") == 0) { tabndx = elf_ndxscn(scn); break; } } elferr = elf_errno(); if (elferr != 0) bsdar_warnc(bsdar, 0, "elf_nextscn failed: %s", elf_errmsg(elferr)); if (tabndx == -1) { bsdar_warnc(bsdar, 0, "can't find .strtab section"); elf_end(e); return; } scn = NULL; while ((scn = elf_nextscn(e, scn)) != NULL) { if (gelf_getshdr(scn, &shdr) != &shdr) { bsdar_warnc(bsdar, EX_SOFTWARE, 0, "elf_getshdr failed: %s", elf_errmsg(-1)); continue; } if (shdr.sh_type != SHT_SYMTAB) continue; data = NULL; n = 0; while (n < shdr.sh_size && (data = elf_getdata(scn, data)) != NULL) { len = data->d_size / shdr.sh_entsize; for (i = 0; i < len; i++) { if (gelf_getsym(data, i, &sym) != &sym) { bsdar_warnc(bsdar, EX_SOFTWARE, 0, "gelf_getsym failed: %s", elf_errmsg(-1)); continue; } /* keep only global or weak symbols */ if (GELF_ST_BIND(sym.st_info) != STB_GLOBAL && GELF_ST_BIND(sym.st_info) != STB_WEAK) continue; /* keep only defined symbols */ if (sym.st_shndx == SHN_UNDEF) continue; if ((name = elf_strptr(e, tabndx, sym.st_name)) == NULL) { bsdar_warnc(bsdar, EX_SOFTWARE, 0, "elf_strptr failed: %s", elf_errmsg(-1)); continue; } add_to_ar_sym_table(bsdar, name); } } } elferr = elf_errno(); if (elferr != 0) bsdar_warnc(bsdar, EX_SOFTWARE, 0, "elf_nextscn failed: %s", elf_errmsg(elferr)); elf_end(e); } /* * Append to the archive string table buffer. */ static void add_to_ar_str_table(struct bsdar *bsdar, const char *name) { if (bsdar->as == NULL) { bsdar->as_cap = _INIT_AS_CAP; bsdar->as_sz = 0; if ((bsdar->as = malloc(bsdar->as_cap)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "malloc failed"); } /* * The space required for holding one member name in as table includes: * strlen(name) + (1 for '/') + (1 for '\n') + (possibly 1 for padding). 
*/ while (bsdar->as_sz + strlen(name) + 3 > bsdar->as_cap) { bsdar->as_cap *= 2; bsdar->as = realloc(bsdar->as, bsdar->as_cap); if (bsdar->as == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "realloc failed"); } strncpy(&bsdar->as[bsdar->as_sz], name, strlen(name)); bsdar->as_sz += strlen(name); bsdar->as[bsdar->as_sz++] = '/'; bsdar->as[bsdar->as_sz++] = '\n'; } /* * Append to the archive symbol table buffer. */ static void add_to_ar_sym_table(struct bsdar *bsdar, const char *name) { if (bsdar->s_so == NULL) { if ((bsdar->s_so = malloc(_INIT_SYMOFF_CAP)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "malloc failed"); bsdar->s_so_cap = _INIT_SYMOFF_CAP; bsdar->s_cnt = 0; } if (bsdar->s_sn == NULL) { if ((bsdar->s_sn = malloc(_INIT_SYMNAME_CAP)) == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "malloc failed"); bsdar->s_sn_cap = _INIT_SYMNAME_CAP; bsdar->s_sn_sz = 0; } if (bsdar->s_cnt * sizeof(uint64_t) >= bsdar->s_so_cap) { bsdar->s_so_cap *= 2; bsdar->s_so = realloc(bsdar->s_so, bsdar->s_so_cap); if (bsdar->s_so == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "realloc failed"); } bsdar->s_so[bsdar->s_cnt] = bsdar->rela_off; if ((uint64_t)bsdar->rela_off > bsdar->s_so_max) bsdar->s_so_max = (uint64_t)bsdar->rela_off; bsdar->s_cnt++; /* * The space required for holding one symbol name in sn table includes: * strlen(name) + (1 for '\n') + (possibly 1 for padding). */ while (bsdar->s_sn_sz + strlen(name) + 2 > bsdar->s_sn_cap) { bsdar->s_sn_cap *= 2; bsdar->s_sn = realloc(bsdar->s_sn, bsdar->s_sn_cap); if (bsdar->s_sn == NULL) bsdar_errc(bsdar, EX_SOFTWARE, errno, "realloc failed"); } strncpy(&bsdar->s_sn[bsdar->s_sn_sz], name, strlen(name)); bsdar->s_sn_sz += strlen(name); bsdar->s_sn[bsdar->s_sn_sz++] = '\0'; } diff --git a/usr.bin/fmt/fmt.c b/usr.bin/fmt/fmt.c index be196c26977a..967de1309e9e 100644 --- a/usr.bin/fmt/fmt.c +++ b/usr.bin/fmt/fmt.c @@ -1,786 +1,786 @@ /* $OpenBSD: fmt.c,v 1.21 2004/04/01 23:14:19 tedu Exp $ */ /* Sensible version of fmt * * Syntax: fmt [ options ] [ goal [ max ] ] [ filename ... ] * * Since the documentation for the original fmt is so poor, here * is an accurate description of what this one does. It's usually * the same. The *mechanism* used may differ from that suggested * here. Note that we are *not* entirely compatible with fmt, * because fmt gets so many things wrong. * * 1. Tabs are expanded, assuming 8-space tab stops. * If the `-t ' option is given, we assume -space * tab stops instead. * Trailing blanks are removed from all lines. * x\b == nothing, for any x other than \b. * Other control characters are simply stripped. This * includes \r. * 2. Each line is split into leading whitespace and * everything else. Maximal consecutive sequences of * lines with the same leading whitespace are considered * to form paragraphs, except that a blank line is always * a paragraph to itself. * If the `-p' option is given then the first line of a * paragraph is permitted to have indentation different * from that of the other lines. * If the `-m' option is given then a line that looks * like a mail message header, if it is not immediately * preceded by a non-blank non-message-header line, is * taken to start a new paragraph, which also contains * any subsequent lines with non-empty leading whitespace. * Unless the `-n' option is given, lines beginning with * a . (dot) are not formatted. * 3. 
The "everything else" is split into words; a word * includes its trailing whitespace, and a word at the * end of a line is deemed to be followed by a single * space, or two spaces if it ends with a sentence-end * character. (See the `-d' option for how to change that.) * If the `-s' option has been given, then a word's trailing * whitespace is replaced by what it would have had if it * had occurred at end of line. * 4. Each paragraph is sent to standard output as follows. * We output the leading whitespace, and then enough words * to make the line length as near as possible to the goal * without exceeding the maximum. (If a single word would * exceed the maximum, we output that anyway.) Of course * the trailing whitespace of the last word is ignored. * We then emit a newline and start again if there are any * words left. * Note that for a blank line this translates as "We emit * a newline". * If the `-l ' option is given, then leading whitespace * is modified slightly: spaces are replaced by a tab. * Indented paragraphs (see above under `-p') make matters * more complicated than this suggests. Actually every paragraph * has two `leading whitespace' values; the value for the first * line, and the value for the most recent line. (While processing * the first line, the two are equal. When `-p' has not been * given, they are always equal.) The leading whitespace * actually output is that of the first line (for the first * line of *output*) or that of the most recent line (for * all other lines of output). * When `-m' has been given, message header paragraphs are * taken as having first-leading-whitespace empty and * subsequent-leading-whitespace two spaces. * * Multiple input files are formatted one at a time, so that a file * never ends in the middle of a line. * * There's an alternative mode of operation, invoked by giving * the `-c' option. In that case we just center every line, * and most of the other options are ignored. This should * really be in a separate program, but we must stay compatible * with old `fmt'. * * QUERY: Should `-m' also try to do the right thing with quoted text? * QUERY: `-b' to treat backslashed whitespace as old `fmt' does? * QUERY: Option meaning `never join lines'? * QUERY: Option meaning `split in mid-word to avoid overlong lines'? * (Those last two might not be useful, since we have `fold'.) * * Differences from old `fmt': * * - We have many more options. Options that aren't understood * generate a lengthy usage message, rather than being * treated as filenames. * - Even with `-m', our handling of message headers is * significantly different. (And much better.) * - We don't treat `\ ' as non-word-breaking. * - Downward changes of indentation start new paragraphs * for us, as well as upward. (I think old `fmt' behaves * in the way it does in order to allow indented paragraphs, * but this is a broken way of making indented paragraphs * behave right.) * - Given the choice of going over or under |goal_length| * by the same amount, we go over; old `fmt' goes under. * - We treat `?' as ending a sentence, and not `:'. Old `fmt' * does the reverse. * - We return approved return codes. Old `fmt' returns * 1 for some errors, and *the number of unopenable files* * when that was all that went wrong. * - We have fewer crashes and more helpful error messages. * - We don't turn spaces into tabs at starts of lines unless * specifically requested. * - New `fmt' is somewhat smaller and slightly faster than * old `fmt'. * * Bugs: * * None known. There probably are some, though. 
* * Portability: * * I believe this code to be pretty portable. It does require * that you have `getopt'. If you need to include "getopt.h" * for this (e.g., if your system didn't come with `getopt' * and you installed it yourself) then you should arrange for * NEED_getopt_h to be #defined. * * Everything here should work OK even on nasty 16-bit * machines and nice 64-bit ones. However, it's only really * been tested on my FreeBSD machine. Your mileage may vary. */ /* Copyright (c) 1997 Gareth McCaughan. All rights reserved. * * Redistribution and use of this code, in source or binary forms, * with or without modification, are permitted subject to the following * conditions: * * - Redistribution of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * - If you distribute modified source code it must also include * a notice saying that it has been modified, and giving a brief * description of what changes have been made. * * Disclaimer: I am not responsible for the results of using this code. * If it formats your hard disc, sends obscene messages to * your boss and kills your children then that's your problem * not mine. I give absolutely no warranty of any sort as to * what the program will do, and absolutely refuse to be held * liable for any consequences of your using it. * Thank you. Have a nice day. */ /* RCS change log: * Revision 1.5 1998/03/02 18:02:21 gjm11 * Minor changes for portability. * * Revision 1.4 1997/10/01 11:51:28 gjm11 * Repair broken indented-paragraph handling. * Add mail message header stuff. * Improve comments and layout. * Make usable with non-BSD systems. * Add revision display to usage message. * * Revision 1.3 1997/09/30 16:24:47 gjm11 * Add copyright notice, rcsid string and log message. * * Revision 1.2 1997/09/30 16:13:39 gjm11 * Add options: -d , -l , -p, -s, -t , -h . * Parse options with `getopt'. Clean up code generally. * Make comments more accurate. * * Revision 1.1 1997/09/30 11:29:57 gjm11 * Initial revision */ #ifndef lint static const char copyright[] = "Copyright (c) 1997 Gareth McCaughan. All rights reserved.\n"; #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include /* Something that, we hope, will never be a genuine line length, * indentation etc. */ #define SILLY ((size_t)-1) /* I used to use |strtoul| for this, but (1) not all systems have it * and (2) it's probably better to use |strtol| to detect negative * numbers better. * If |fussyp==0| then we don't complain about non-numbers * (returning 0 instead), but we do complain about bad numbers. */ static size_t get_positive(const char *s, const char *err_mess, int fussyP) { char *t; long result = strtol(s, &t, 0); if (*t) { if (fussyP) goto Lose; else return 0; } if (result <= 0) { Lose: errx(EX_USAGE, "%s", err_mess); } return (size_t)result; } static size_t get_nonnegative(const char *s, const char *err_mess, int fussyP) { char *t; long result = strtol(s, &t, 0); if (*t) { if (fussyP) goto Lose; else return 0; } if (result < 0) { Lose: errx(EX_USAGE, "%s", err_mess); } return (size_t)result; } /* Global variables */ static int centerP = 0; /* Try to center lines? */ static size_t goal_length = 0; /* Target length for output lines */ static size_t max_length = 0; /* Maximum length for output lines */ static int coalesce_spaces_P = 0; /* Coalesce multiple whitespace -> ' ' ? */ static int allow_indented_paragraphs = 0; /* Can first line have diff. 
ind.? */ static int tab_width = 8; /* Number of spaces per tab stop */ static size_t output_tab_width = 8; /* Ditto, when squashing leading spaces */ static const wchar_t *sentence_enders = L".?!"; /* Double-space after these */ static int grok_mail_headers = 0; /* treat embedded mail headers magically? */ static int format_troff = 0; /* Format troff? */ static int n_errors = 0; /* Number of failed files. Return on exit. */ static wchar_t *output_buffer = NULL; /* Output line will be built here */ static size_t x; /* Horizontal position in output line */ static size_t x0; /* Ditto, ignoring leading whitespace */ static size_t output_buffer_length = 0; static size_t pending_spaces; /* Spaces to add before next word */ static int output_in_paragraph = 0; /* Any of current para written out yet? */ /* Prototypes */ static void process_named_file(const char *); static void process_stream(FILE *, const char *); static size_t indent_length(const wchar_t *, size_t); static int might_be_header(const wchar_t *); static void new_paragraph(size_t, size_t); static void output_word(size_t, size_t, const wchar_t *, size_t, size_t); static void output_indent(size_t); static void center_stream(FILE *, const char *); static wchar_t *get_line(FILE *, size_t *); static void *xrealloc(void *, size_t); #define XMALLOC(x) xrealloc(0,x) /* Here is perhaps the right place to mention that this code is * all in top-down order. Hence, |main| comes first. */ int main(int argc, char *argv[]) { int ch; /* used for |getopt| processing */ wchar_t *tmp; size_t len; const char *src; (void)setlocale(LC_CTYPE, ""); /* 1. Grok parameters. */ while ((ch = getopt(argc, argv, "0123456789cd:hl:mnpst:w:")) != -1) switch (ch) { case 'c': centerP = 1; format_troff = 1; continue; case 'd': src = optarg; len = mbsrtowcs(NULL, &src, 0, NULL); if (len == (size_t)-1) err(EX_USAGE, "bad sentence-ending character set"); tmp = XMALLOC((len + 1) * sizeof(wchar_t)); mbsrtowcs(tmp, &src, len + 1, NULL); sentence_enders = tmp; continue; case 'l': output_tab_width = get_nonnegative(optarg, "output tab width must be non-negative", 1); continue; case 'm': grok_mail_headers = 1; continue; case 'n': format_troff = 1; continue; case 'p': allow_indented_paragraphs = 1; continue; case 's': coalesce_spaces_P = 1; continue; case 't': tab_width = get_positive(optarg, "tab width must be positive", 1); continue; case 'w': goal_length = get_positive(optarg, "width must be positive", 1); max_length = goal_length; continue; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* * XXX this is not a stylistically approved use of * getopt() */ if (goal_length == 0) { char *p; p = argv[optind - 1]; if (p[0] == '-' && p[1] == ch && !p[2]) goal_length = get_positive(++p, "width must be nonzero", 1); else goal_length = get_positive(argv[optind] + 1, "width must be nonzero", 1); max_length = goal_length; } continue; case 'h': default: fprintf(stderr, "usage: fmt [-cmps] [-d chars] [-l num] [-t num]\n" " [-w width | -width | goal [maximum]] [file ...]\n" "Options: -c center each line instead of formatting\n" " -d double-space after at line end\n" " -l turn each spaces at start of line into a tab\n" " -m try to make sure mail header lines stay separate\n" " -n format lines beginning with a dot\n" " -p allow indented paragraphs\n" " -s coalesce whitespace inside lines\n" " -t have tabs every columns\n" " -w set maximum width to \n" " goal set target width to goal\n"); exit(ch == 'h' ? 
0 : EX_USAGE); } argc -= optind; argv += optind; /* [ goal [ maximum ] ] */ if (argc > 0 && goal_length == 0 && (goal_length = get_positive(*argv, "goal length must be positive", 0)) != 0) { --argc; ++argv; if (argc > 0 && (max_length = get_positive(*argv, "max length must be positive", 0)) != 0) { --argc; ++argv; if (max_length < goal_length) errx(EX_USAGE, "max length must be >= goal length"); } } if (goal_length == 0) goal_length = 65; if (max_length == 0) max_length = goal_length + 10; if (max_length >= SIZE_T_MAX / sizeof(wchar_t)) errx(EX_USAGE, "max length too large"); /* really needn't be longer */ output_buffer = XMALLOC((max_length + 1) * sizeof(wchar_t)); /* 2. Process files. */ if (argc > 0) { while (argc-- > 0) process_named_file(*argv++); } else { process_stream(stdin, "standard input"); } /* We're done. */ return n_errors ? EX_NOINPUT : 0; } /* Process a single file, given its name. */ static void process_named_file(const char *name) { FILE *f = fopen(name, "r"); if (!f) { warn("%s", name); ++n_errors; } else { process_stream(f, name); if (ferror(f)) { warn("%s", name); ++n_errors; } fclose(f); } } /* Types of mail header continuation lines: */ typedef enum { hdr_ParagraphStart = -1, hdr_NonHeader = 0, hdr_Header = 1, hdr_Continuation = 2 } HdrType; /* Process a stream. This is where the real work happens, * except that centering is handled separately. */ static void process_stream(FILE *stream, const char *name) { size_t last_indent = SILLY; /* how many spaces in last indent? */ size_t para_line_number = 0; /* how many lines already read in this para? */ size_t first_indent = SILLY; /* indentation of line 0 of paragraph */ HdrType prev_header_type = hdr_ParagraphStart; /* ^-- header_type of previous line; -1 at para start */ wchar_t *line; size_t length; if (centerP) { center_stream(stream, name); return; } while ((line = get_line(stream, &length)) != NULL) { size_t np = indent_length(line, length); { HdrType header_type = hdr_NonHeader; if (grok_mail_headers && prev_header_type != hdr_NonHeader) { if (np == 0 && might_be_header(line)) header_type = hdr_Header; else if (np > 0 && prev_header_type > hdr_NonHeader) header_type = hdr_Continuation; } /* * We need a new paragraph if and only if: this line * is blank, OR it's a troff request (and we don't * format troff), OR it's a mail header, OR it's not * a mail header AND the last line was one, OR the * indentation has changed AND the line isn't a mail * header continuation line AND this isn't the * second line of an indented paragraph. */ if (length == 0 || (line[0] == '.' && !format_troff) || header_type == hdr_Header || (header_type == hdr_NonHeader && prev_header_type > hdr_NonHeader) || (np != last_indent && header_type != hdr_Continuation && (!allow_indented_paragraphs || para_line_number != 1))) { new_paragraph(output_in_paragraph ? last_indent : first_indent, np); para_line_number = 0; first_indent = np; last_indent = np; if (header_type == hdr_Header) last_indent = 2; /* for cont. lines */ if (length == 0 || (line[0] == '.' && !format_troff)) { if (length == 0) putwchar('\n'); else wprintf(L"%.*ls\n", (int)length, line); prev_header_type = hdr_ParagraphStart; continue; } } else { /* * If this is an indented paragraph other * than a mail header continuation, set * |last_indent|. 
*/ if (np != last_indent && header_type != hdr_Continuation) last_indent = np; } prev_header_type = header_type; } { size_t n = np; while (n < length) { /* Find word end and count spaces after it */ size_t word_length = 0, space_length = 0; while (n + word_length < length && line[n + word_length] != ' ') ++word_length; space_length = word_length; while (n + space_length < length && line[n + space_length] == ' ') ++space_length; /* Send the word to the output machinery. */ output_word(first_indent, last_indent, line + n, word_length, space_length - word_length); n += space_length; } } ++para_line_number; } new_paragraph(output_in_paragraph ? last_indent : first_indent, 0); if (ferror(stream)) { warn("%s", name); ++n_errors; } } /* How long is the indent on this line? */ static size_t indent_length(const wchar_t *line, size_t length) { size_t n = 0; while (n < length && *line++ == ' ') ++n; return n; } /* Might this line be a mail header? * We deem a line to be a possible header if it matches the * Perl regexp /^[A-Z][-A-Za-z0-9]*:\s/. This is *not* the same * as in RFC whatever-number-it-is; we want to be gratuitously * conservative to avoid mangling ordinary civilised text. */ static int might_be_header(const wchar_t *line) { if (!iswupper(*line++)) return 0; while (*line && (iswalnum(*line) || *line == '-')) ++line; return (*line == ':' && iswspace(line[1])); } /* Begin a new paragraph with an indent of |indent| spaces. */ static void new_paragraph(size_t old_indent, size_t indent) { if (output_buffer_length) { if (old_indent > 0) output_indent(old_indent); wprintf(L"%.*ls\n", (int)output_buffer_length, output_buffer); } x = indent; x0 = 0; output_buffer_length = 0; pending_spaces = 0; output_in_paragraph = 0; } /* Output spaces or tabs for leading indentation. */ static void output_indent(size_t n_spaces) { if (output_tab_width) { while (n_spaces >= output_tab_width) { putwchar('\t'); n_spaces -= output_tab_width; } } while (n_spaces-- > 0) putwchar(' '); } /* Output a single word, or add it to the buffer. * indent0 and indent1 are the indents to use on the first and subsequent * lines of a paragraph. They'll often be the same, of course. */ static void output_word(size_t indent0, size_t indent1, const wchar_t *word, size_t length, size_t spaces) { size_t new_x; size_t indent = output_in_paragraph ? indent1 : indent0; size_t width; const wchar_t *p; int cwidth; for (p = word, width = 0; p < &word[length]; p++) width += (cwidth = wcwidth(*p)) > 0 ? cwidth : 1; new_x = x + pending_spaces + width; /* * If either |spaces==0| (at end of line) or |coalesce_spaces_P| * (squashing internal whitespace), then add just one space; except * that if the last character was a sentence-ender we actually add * two spaces. */ if (coalesce_spaces_P || spaces == 0) spaces = wcschr(sentence_enders, word[length - 1]) ? 2 : 1; if (new_x <= goal_length) { /* * After adding the word we still aren't at the goal length, - * so clearly we add it to the buffer rather than outputing + * so clearly we add it to the buffer rather than outputting * it. */ wmemset(output_buffer + output_buffer_length, L' ', pending_spaces); x0 += pending_spaces; x += pending_spaces; output_buffer_length += pending_spaces; wmemcpy(output_buffer + output_buffer_length, word, length); x0 += width; x += width; output_buffer_length += length; pending_spaces = spaces; } else { /* * Adding the word takes us past the goal. 
Print the * line-so-far, and the word too iff either (1) the lsf is * empty or (2) that makes us nearer the goal but doesn't * take us over the limit, or (3) the word on its own takes * us over the limit. In case (3) we put a newline in * between. */ if (indent > 0) output_indent(indent); wprintf(L"%.*ls", (int)output_buffer_length, output_buffer); if (x0 == 0 || (new_x <= max_length && new_x - goal_length <= goal_length - x)) { wprintf(L"%*ls", (int)pending_spaces, L""); goto write_out_word; } else { /* * If the word takes us over the limit on its own, * just spit it out and don't bother buffering it. */ if (indent + width > max_length) { putwchar('\n'); if (indent > 0) output_indent(indent); write_out_word: wprintf(L"%.*ls", (int)length, word); x0 = 0; x = indent1; pending_spaces = 0; output_buffer_length = 0; } else { wmemcpy(output_buffer, word, length); x0 = width; x = width + indent1; pending_spaces = spaces; output_buffer_length = length; } } putwchar('\n'); output_in_paragraph = 1; } } /* Process a stream, but just center its lines rather than trying to * format them neatly. */ static void center_stream(FILE *stream, const char *name) { wchar_t *line, *p; size_t length; size_t width; int cwidth; while ((line = get_line(stream, &length)) != NULL) { size_t l = length; while (l > 0 && iswspace(*line)) { ++line; --l; } length = l; for (p = line, width = 0; p < &line[length]; p++) width += (cwidth = wcwidth(*p)) > 0 ? cwidth : 1; l = width; while (l < goal_length) { putwchar(' '); l += 2; } wprintf(L"%.*ls\n", (int)length, line); } if (ferror(stream)) { warn("%s", name); ++n_errors; } } /* Get a single line from a stream. Expand tabs, strip control * characters and trailing whitespace, and handle backspaces. * Return the address of the buffer containing the line, and * put the length of the line in |lengthp|. * This can cope with arbitrarily long lines, and with lines * without terminating \n. * If there are no characters left or an error happens, we * return 0. * Don't confuse |spaces_pending| here with the global * |pending_spaces|. */ static wchar_t * get_line(FILE *stream, size_t *lengthp) { static wchar_t *buf = NULL; static size_t length = 0; size_t len = 0; wint_t ch; size_t spaces_pending = 0; int troff = 0; size_t col = 0; int cwidth; if (buf == NULL) { length = 100; buf = XMALLOC(length * sizeof(wchar_t)); } while ((ch = getwc(stream)) != '\n' && ch != WEOF) { if (len + spaces_pending == 0 && ch == '.' && !format_troff) troff = 1; if (ch == ' ') ++spaces_pending; else if (troff || iswprint(ch)) { while (len + spaces_pending >= length) { length *= 2; buf = xrealloc(buf, length * sizeof(wchar_t)); } while (spaces_pending > 0) { --spaces_pending; buf[len++] = ' '; col++; } buf[len++] = ch; col += (cwidth = wcwidth(ch)) > 0 ? cwidth : 1; } else if (ch == '\t') spaces_pending += tab_width - (col + spaces_pending) % tab_width; else if (ch == '\b') { if (len) --len; if (col) --col; } } *lengthp = len; return (len > 0 || ch != WEOF) ? buf : 0; } /* (Re)allocate some memory, exiting with an error if we can't. */ static void * xrealloc(void *ptr, size_t nbytes) { void *p = realloc(ptr, nbytes); if (p == NULL) errx(EX_OSERR, "out of memory"); return p; }
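Both output_word() and center_stream() above measure display width with the same wcwidth() idiom, charging one column for control or zero-width characters. Isolated as a sketch (not part of fmt itself):

#include <locale.h>
#include <stdio.h>
#include <wchar.h>

/* Display width of a wide string, at least one column per character. */
static size_t
display_width(const wchar_t *s, size_t len)
{
	size_t width = 0;
	int cw;

	for (size_t i = 0; i < len; i++)
		width += (cw = wcwidth(s[i])) > 0 ? cw : 1;
	return (width);
}

int
main(void)
{
	wchar_t line[] = L"fmt(1)";

	(void)setlocale(LC_CTYPE, "");
	printf("%zu\n", display_width(line, wcslen(line)));
	return (0);
}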