TCP: let IP-level socket options reach the TCP stack; add TCP_FUNCTION_ALIAS.

sys/netinet/tcp.h
    Define the TCP_FUNCTION_ALIAS socket option (8193).
sys/netinet/tcp_usrreq.c
    Split tcp_ctloutput() into tcp_ctloutput_set() and tcp_ctloutput_get().
    After the IP layer has handled a set request for IPV6_TCLASS,
    IPV6_USE_MIN_MTU, IP_TOS or IP_TTL, the request is also passed to the
    stack's tfb_tcp_ctloutput method.  A request that fails at the IP level
    is returned to the caller and is not passed to the stack.  The last hunk
    (its enclosing function is outside the hunk) applies tcp6_use_min_mtu()
    for IPV6_USE_MIN_MTU and returns 0 for all IP-level options.
sys/netinet/tcp_subr.c
    Add find_tcp_function_alias() and tcp6_use_min_mtu().
sys/netinet/tcp_stacks/rack.c
    Update the IP header at r_ctl.fsb.tcp_ip_hdr when the IPv6 flow info,
    the IPv4 TOS or the IPv4 TTL is set.
sys/netinet/tcp_stacks/bbr.c
    Hand IP-level options to tcp_default_ctloutput().
sys/netinet/tcp_var.h
    Declare the two new functions.

Note: indentation of context lines was reconstructed; apply with "patch -l"
if whitespace does not match the target tree.

Index: sys/netinet/tcp.h
===================================================================
--- sys/netinet/tcp.h
+++ sys/netinet/tcp.h
@@ -210,6 +210,7 @@
 #define	TCP_PCAP_OUT	2048	/* number of output packets to keep */
 #define	TCP_PCAP_IN	4096	/* number of input packets to keep */
 #define	TCP_FUNCTION_BLK 8192	/* Set the tcp function pointers to the specified stack */
+#define	TCP_FUNCTION_ALIAS 8193	/* Get the current tcp function pointer name alias */
 /* Options for Rack and BBR */
 #define	TCP_REUSPORT_LB_NUMA   1026	/* set listen socket numa domain */
 #define TCP_RACK_MBUF_QUEUE   1050 /* Do we allow mbuf queuing if supported */
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -14253,6 +14253,12 @@
 	struct epoch_tracker et;
 	int32_t error = 0, optval;
 
+	switch (sopt->sopt_level) {
+	case IPPROTO_IPV6:
+	case IPPROTO_IP:
+		return (tcp_default_ctloutput(so, sopt, inp, tp));
+	}
+
 	switch (sopt->sopt_name) {
 	case TCP_RACK_PACE_MAX_SEG:
 	case TCP_RACK_MIN_TO:
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -20244,9 +20244,55 @@
 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
 {
+#ifdef INET6
+	struct ip6_hdr *ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
+#endif
+#ifdef INET
+	struct ip *ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
+#endif
 	uint64_t loptval;
 	int32_t error = 0, optval;
 
+	switch (sopt->sopt_level) {
+#ifdef INET6
+	case IPPROTO_IPV6:
+		MPASS(inp->inp_vflag & INP_IPV6PROTO);
+		switch (sopt->sopt_name) {
+		case IPV6_USE_MIN_MTU:
+			tcp6_use_min_mtu(tp);
+			break;
+		case IPV6_TCLASS:
+			/*
+			 * The DSCP codepoint has changed, update the fsb.
+			 */
+			ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
+			    (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK);
+			break;
+		}
+		INP_WUNLOCK(inp);
+		return (0);
+#endif
+#ifdef INET
+	case IPPROTO_IP:
+		switch (sopt->sopt_name) {
+		case IP_TOS:
+			/*
+			 * The DSCP codepoint has changed, update the fsb.
+			 */
+			ip->ip_tos = rack->rc_inp->inp_ip_tos;
+			break;
+		case IP_TTL:
+			/*
+			 * The TTL has changed, update the fsb.
+			 */
+			ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
+			break;
+		}
+		INP_WUNLOCK(inp);
+		return (0);
+#endif
+	}
+
 	switch (sopt->sopt_name) {
 	case TCP_RACK_TLP_REDUCE:		/* URL:tlp_reduce */
 	/* Pacing related ones */
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -473,6 +473,42 @@
 	return(rblk);
 }
 
+/*
+ * Find an alias for the given tcp_function_block: a registered name that
+ * refers to the same block but differs from its tfb_tcp_block_name.  The
+ * name, or an empty string if there is none, is stored in fs.  Returns 1
+ * if an alias was found and 0 otherwise.
+ */
+int
+find_tcp_function_alias(struct tcp_function_block *blk,
+    struct tcp_function_set *fs)
+{
+	struct tcp_function *f;
+	int found;
+
+	found = 0;
+	rw_rlock(&tcp_function_lock);
+	TAILQ_FOREACH(f, &t_functions, tf_next) {
+		if ((f->tf_fb == blk) &&
+		    (strncmp(f->tf_name, blk->tfb_tcp_block_name,
+		        TCP_FUNCTION_NAME_LEN_MAX) != 0)) {
+			/* Matching function block with different name. */
+			strncpy(fs->function_set_name, f->tf_name,
+			    TCP_FUNCTION_NAME_LEN_MAX);
+			found = 1;
+			break;
+		}
+	}
+	/* Null terminate the string appropriately. */
+	if (found) {
+		fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
+	} else {
+		fs->function_set_name[0] = '\0';
+	}
+	rw_runlock(&tcp_function_lock);
+	return (found);
+}
+
 static struct tcp_function_block *
 find_and_ref_tcp_default_fb(void)
 {
@@ -3528,6 +3564,41 @@
 
 	return (maxmtu);
 }
+
+/*
+ * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack.
+ *
+ * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag.
+ * The right place to do that is ip6_setpktopt() that has just been
+ * executed.  By the way it just filled ip6po_minmtu for us.
+ */
+void
+tcp6_use_min_mtu(struct tcpcb *tp)
+{
+	struct inpcb *inp = tp->t_inpcb;
+
+	INP_WLOCK_ASSERT(inp);
+	/*
+	 * In case of the IPV6_USE_MIN_MTU socket
+	 * option, set the INC_IPV6MINMTU flag to announce
+	 * a corresponding MSS during the initial
+	 * handshake.  If the TCP connection is not in
+	 * the front states, just reduce the MSS being
+	 * used.  This avoids the sending of TCP
+	 * segments which will be fragmented at the
+	 * IPv6 layer.
+	 */
+	inp->inp_inc.inc_flags |= INC_IPV6MINMTU;
+	if ((tp->t_state >= TCPS_SYN_SENT) &&
+	    (inp->inp_inc.inc_flags & INC_ISIPV6)) {
+		struct ip6_pktopts *opt;
+
+		opt = inp->in6p_outputopts;
+		if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL &&
+		    tp->t_maxseg > TCP6_MSS)
+			tp->t_maxseg = TCP6_MSS;
+	}
+}
 #endif /* INET6 */
 
 /*
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1729,88 +1729,90 @@
 } while(0)
 #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
 
-int
-tcp_ctloutput(struct socket *so, struct sockopt *sopt)
+/*
+ * Handle the SOPT_SET direction of tcp_ctloutput().  IP-level options are
+ * first applied by ip6_ctloutput()/ip_ctloutput(); the few that also affect
+ * TCP, and all TCP-level options other than TCP_FUNCTION_BLK, are then
+ * handed to the stack's tfb_tcp_ctloutput method with the inpcb locked.
+ */
+static int
+tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
 {
-	int	error;
-	struct	inpcb *inp;
-	struct	tcpcb *tp;
-	struct	tcp_function_block *blk;
-	struct	tcp_function_set fsn;
+	struct	tcpcb *tp = intotcpcb(inp);
+	int error = 0;
+
+	MPASS(sopt->sopt_dir == SOPT_SET);
 
-	error = 0;
-	inp = sotoinpcb(so);
-	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 	if (sopt->sopt_level != IPPROTO_TCP) {
 #ifdef INET6
-		if (inp->inp_vflag & INP_IPV6PROTO) {
-			error = ip6_ctloutput(so, sopt);
-			/*
-			 * In case of the IPV6_USE_MIN_MTU socket option,
-			 * the INC_IPV6MINMTU flag to announce a corresponding
-			 * MSS during the initial handshake.
-			 * If the TCP connection is not in the front states,
-			 * just reduce the MSS being used.
-			 * This avoids the sending of TCP segments which will
-			 * be fragmented at the IPv6 layer.
-			 */
-			if ((error == 0) &&
-			    (sopt->sopt_dir == SOPT_SET) &&
-			    (sopt->sopt_level == IPPROTO_IPV6) &&
-			    (sopt->sopt_name == IPV6_USE_MIN_MTU)) {
-				INP_WLOCK(inp);
-				if ((inp->inp_flags &
-				    (INP_TIMEWAIT | INP_DROPPED))) {
-					INP_WUNLOCK(inp);
-					return (ECONNRESET);
-				}
-				inp->inp_inc.inc_flags |= INC_IPV6MINMTU;
-				tp = intotcpcb(inp);
-				if ((tp->t_state >= TCPS_SYN_SENT) &&
-				    (inp->inp_inc.inc_flags & INC_ISIPV6)) {
-					struct ip6_pktopts *opt;
-
-					opt = inp->in6p_outputopts;
-					if ((opt != NULL) &&
-					    (opt->ip6po_minmtu ==
-					    IP6PO_MINMTU_ALL)) {
-						if (tp->t_maxseg > TCP6_MSS) {
-							tp->t_maxseg = TCP6_MSS;
-						}
-					}
-				}
-				INP_WUNLOCK(inp);
-			}
-		}
-#endif /* INET6 */
+		if (inp->inp_vflag & INP_IPV6PROTO)
+			error = ip6_ctloutput(inp->inp_socket, sopt);
+#endif
 #if defined(INET6) && defined(INET)
 		else
 #endif
 #ifdef INET
-		{
-			error = ip_ctloutput(so, sopt);
-		}
+			error = ip_ctloutput(inp->inp_socket, sopt);
 #endif
-		return (error);
-	}
-	INP_WLOCK(inp);
-	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
-		INP_WUNLOCK(inp);
-		return (ECONNRESET);
-	}
-	tp = intotcpcb(inp);
-	/*
-	 * Protect the TCP option TCP_FUNCTION_BLK so
-	 * that a sub-function can *never* overwrite this.
-	 */
-	if ((sopt->sopt_dir == SOPT_SET) &&
-	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
-		INP_WUNLOCK(inp);
-		error = sooptcopyin(sopt, &fsn, sizeof fsn,
-		    sizeof fsn);
+		/*
+		 * When an IP-level socket option affects TCP, pass control
+		 * down to stack tfb_tcp_ctloutput, otherwise return what
+		 * IP level returned.  A request that failed at IP level is
+		 * never passed down, so that its error reaches the caller.
+		 */
+		if (error != 0)
+			return (error);
+		switch (sopt->sopt_level) {
+#ifdef INET6
+		case IPPROTO_IPV6:
+			if ((inp->inp_vflag & INP_IPV6PROTO) == 0)
+				return (error);
+			switch (sopt->sopt_name) {
+			case IPV6_TCLASS:
+				/* Notify tcp stacks that care (e.g. RACK). */
+				break;
+			case IPV6_USE_MIN_MTU:
+				/* Update t_maxseg accordingly. */
+				break;
+			default:
+				return (error);
+			}
+			break;
+#endif
+#ifdef INET
+		case IPPROTO_IP:
+			switch (sopt->sopt_name) {
+			case IP_TOS:
+			case IP_TTL:
+				/* Notify tcp stacks that care (e.g. RACK). */
+				break;
+			default:
+				return (error);
+			}
+			break;
+#endif
+		default:
+			return (error);
+		}
+	} else if (sopt->sopt_name == TCP_FUNCTION_BLK) {
+		/*
+		 * Protect the TCP option TCP_FUNCTION_BLK so
+		 * that a sub-function can *never* overwrite this.
+		 */
+		struct tcp_function_set fsn;
+		struct tcp_function_block *blk;
+
+		error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
 		if (error)
 			return (error);
-		INP_WLOCK_RECHECK(inp);
+
+		INP_WLOCK(inp);
+		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+			INP_WUNLOCK(inp);
+			return (ECONNRESET);
+		}
+		tp = intotcpcb(inp);
+
 		blk = find_and_ref_tcp_functions(&fsn);
 		if (blk == NULL) {
 			INP_WUNLOCK(inp);
@@ -1875,7 +1877,7 @@
 					if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
 						/* Fall back failed, drop the connection */
 						INP_WUNLOCK(inp);
-						soabort(so);
+						soabort(inp->inp_socket);
 						return(error);
 					}
 				}
@@ -1893,18 +1895,93 @@
 err_out:
 		INP_WUNLOCK(inp);
 		return (error);
-	} else if ((sopt->sopt_dir == SOPT_GET) &&
-	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
-		strncpy(fsn.function_set_name, tp->t_fb->tfb_tcp_block_name,
-		    TCP_FUNCTION_NAME_LEN_MAX);
-		fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
+	}
+
+	INP_WLOCK(inp);
+	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+		INP_WUNLOCK(inp);
+		return (ECONNRESET);
+	}
+	tp = intotcpcb(inp);
+
+	/* Pass in the INP locked, caller must unlock it. */
+	return (tp->t_fb->tfb_tcp_ctloutput(inp->inp_socket, sopt, inp, tp));
+}
+
+/*
+ * Handle the SOPT_GET direction of tcp_ctloutput().  IP-level options are
+ * answered by the IP layer alone; TCP_FUNCTION_BLK and TCP_FUNCTION_ALIAS
+ * are answered here, and everything else goes to the stack's
+ * tfb_tcp_ctloutput method with the inpcb locked.
+ */
+static int
+tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt)
+{
+	int	error = 0;
+	struct	tcpcb *tp;
+
+	MPASS(sopt->sopt_dir == SOPT_GET);
+
+	if (sopt->sopt_level != IPPROTO_TCP) {
+#ifdef INET6
+		if (inp->inp_vflag & INP_IPV6PROTO)
+			error = ip6_ctloutput(inp->inp_socket, sopt);
+#endif /* INET6 */
+#if defined(INET6) && defined(INET)
+		else
+#endif
+#ifdef INET
+			error = ip_ctloutput(inp->inp_socket, sopt);
+#endif
+		return (error);
+	}
+	INP_WLOCK(inp);
+	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+		INP_WUNLOCK(inp);
+		return (ECONNRESET);
+	}
+	tp = intotcpcb(inp);
+	if (((sopt->sopt_name == TCP_FUNCTION_BLK) ||
+	    (sopt->sopt_name == TCP_FUNCTION_ALIAS))) {
+		struct tcp_function_set fsn;
+
+		if (sopt->sopt_name == TCP_FUNCTION_ALIAS) {
+			memset(&fsn, 0, sizeof(fsn));
+			find_tcp_function_alias(tp->t_fb, &fsn);
+		} else {
+			strncpy(fsn.function_set_name,
+			    tp->t_fb->tfb_tcp_block_name,
+			    TCP_FUNCTION_NAME_LEN_MAX);
+			fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
+		}
 		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &fsn, sizeof fsn);
 		return (error);
 	}
-	/* Pass in the INP locked, called must unlock it */
-	return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
+
+	/* Pass in the INP locked, caller must unlock it. */
+	return (tp->t_fb->tfb_tcp_ctloutput(inp->inp_socket, sopt, inp, tp));
+}
+
+/*
+ * Socket option entry point: dispatch on the direction of the request.
+ * Any direction other than SOPT_SET or SOPT_GET panics.
+ */
+int
+tcp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct	inpcb *inp;
+
+	inp = sotoinpcb(so);
+	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
+
+	if (sopt->sopt_dir == SOPT_SET)
+		return (tcp_ctloutput_set(inp, sopt));
+	else if (sopt->sopt_dir == SOPT_GET)
+		return (tcp_ctloutput_get(inp, sopt));
+	else
+		panic("%s: sopt_dir %d", __func__, sopt->sopt_dir);
 }
 
 /*
@@ -1963,6 +2040,27 @@
 #endif
 	size_t	len;
 
+	INP_WLOCK_ASSERT(inp);
+
+	switch (sopt->sopt_level) {
+#ifdef INET6
+	case IPPROTO_IPV6:
+		MPASS(inp->inp_vflag & INP_IPV6PROTO);
+		switch (sopt->sopt_name) {
+		case IPV6_USE_MIN_MTU:
+			tcp6_use_min_mtu(tp);
+			/* FALLTHROUGH */
+		}
+		INP_WUNLOCK(inp);
+		return (0);
+#endif
+#ifdef INET
+	case IPPROTO_IP:
+		INP_WUNLOCK(inp);
+		return (0);
+#endif
+	}
+
 	/*
 	 * For TCP_CCALGOOPT forward the control to CC module, for both
 	 * SOPT_SET and SOPT_GET.
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -1020,6 +1020,7 @@
 int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
     bool force);
 struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
+int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs);
 void tcp_switch_back_to_default(struct tcpcb *tp);
 struct tcp_function_block *
 find_and_ref_tcp_fb(struct tcp_function_block *fs);
@@ -1052,6 +1053,7 @@
 
 uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
 uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
+void	 tcp6_use_min_mtu(struct tcpcb *);
 u_int	 tcp_maxseg(const struct tcpcb *);
 u_int	 tcp_fixed_maxseg(const struct tcpcb *);
 void	 tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,