Index: head/sys/modules/Makefile =================================================================== --- head/sys/modules/Makefile +++ head/sys/modules/Makefile @@ -346,6 +346,7 @@ ${_syscons} \ sysvipc \ ${_ti} \ + tcp/fastpath \ tests/framework \ tests/callout_test \ tl \ Index: head/sys/netinet/tcp.h =================================================================== --- head/sys/netinet/tcp.h +++ head/sys/netinet/tcp.h @@ -167,7 +167,7 @@ #define TCP_KEEPCNT 1024 /* L,N number of keepalives before close */ #define TCP_PCAP_OUT 2048 /* number of output packets to keep */ #define TCP_PCAP_IN 4096 /* number of input packets to keep */ - +#define TCP_FUNCTION_BLK 8192 /* Set/get the tcp function pointers (stack) used by the socket; the option value is a struct tcp_function_set */ /* Start of reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR @@ -245,5 +245,11 @@ u_int32_t __tcpi_pad[26]; /* Padding. */ }; #endif +#define TCP_FUNCTION_NAME_LEN_MAX 32 /* size of the char arrays that hold a function block (stack) name */ + /* Value of the TCP_FUNCTION_BLK socket option; also used inside the kernel as the key for by-name lookups of a function block. */ +struct tcp_function_set { + char function_set_name[TCP_FUNCTION_NAME_LEN_MAX]; /* stack name; compared with strcmp() against the names of the registered blocks */ + uint32_t pcbcnt; /* filled in on get with the block's tfb_refcnt; the set path only looks at the name */ +}; #endif /* !_NETINET_TCP_H_ */ Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c +++ head/sys/netinet/tcp_input.c @@ -230,23 +230,6 @@ #define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); -static void tcp_dooptions(struct tcpopt *, u_char *, int, int); -static void tcp_do_segment(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int, uint8_t, - int); -static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, - struct tcpcb *, int, int); -static void tcp_pulloutofband(struct socket *, - struct tcphdr *, struct mbuf *, int); -static void tcp_xmit_timer(struct tcpcb *, int); -static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); -static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th, - uint16_t type); -static void inline cc_conn_init(struct tcpcb *tp);
-static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); -static void inline hhook_run_tcp_est_in(struct tcpcb *tp, - struct tcphdr *th, struct tcpopt *to); - /* * TCP statistics are stored in an "array" of counter(9)s. */ @@ -272,7 +255,7 @@ /* * Wrapper for the TCP established input helper hook. */ -static void inline +void hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) { struct tcp_hhook_data hhook_data; @@ -290,7 +273,7 @@ /* * CC wrapper hook functions */ -static void inline +void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type) { INP_WLOCK_ASSERT(tp->t_inpcb); @@ -322,7 +305,7 @@ } } -static void inline +void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; @@ -446,7 +429,7 @@ } } -static void inline +void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th) { INP_WLOCK_ASSERT(tp->t_inpcb); @@ -601,9 +584,6 @@ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ int ti_locked; -#define TI_UNLOCKED 1 -#define TI_RLOCKED 2 - #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -1175,7 +1155,7 @@ * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); @@ -1421,7 +1401,7 @@ * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. 
*/ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); + tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); @@ -1476,7 +1456,7 @@ return (IPPROTO_DONE); } -static void +void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) @@ -1788,7 +1768,7 @@ tp->t_rxtcur); sowwakeup(so); if (sbavail(&so->so_snd)) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); goto check_delack; } } else if (th->th_ack == tp->snd_una && @@ -1907,7 +1887,7 @@ tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); } goto check_delack; } @@ -2522,7 +2502,7 @@ } } else tp->snd_cwnd += tp->t_maxseg; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; @@ -2556,12 +2536,12 @@ tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); @@ -2608,7 +2588,7 @@ (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && @@ -3074,7 +3054,7 @@ * Return any desired output. 
*/ if (needoutput || (tp->t_flags & TF_ACKNOW)) - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); check_delack: KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", @@ -3122,7 +3102,7 @@ ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; @@ -3168,7 +3148,7 @@ * The mbuf must still include the original packet header. * tp may be NULL. */ -static void +void tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int tlen, int rstreason) { @@ -3231,7 +3211,7 @@ /* * Parse TCP options and place in tcpopt. */ -static void +void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; @@ -3325,7 +3305,7 @@ * It is still reflected in the segment length for * sequencing purposes. */ -static void +void tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) { @@ -3358,7 +3338,7 @@ * Collect new round-trip time estimate * and update averages and current timeout. */ -static void +void tcp_xmit_timer(struct tcpcb *tp, int rtt) { int delta; @@ -3738,7 +3718,7 @@ * By setting snd_nxt to ti_ack, this forces retransmission timer to * be started again. 
*/ -static void +void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) { tcp_seq onxt = tp->snd_nxt; @@ -3755,7 +3735,7 @@ */ tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th); tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; Index: head/sys/netinet/tcp_sack.c =================================================================== --- head/sys/netinet/tcp_sack.c +++ head/sys/netinet/tcp_sack.c @@ -599,7 +599,7 @@ if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); } #if 0 Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c +++ head/sys/netinet/tcp_subr.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #ifdef INET6 #include @@ -125,6 +126,8 @@ VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif +struct rwlock tcp_function_lock; /* read-held to walk t_functions or read tcp_func_set_ptr, write-held to change either */ + static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { @@ -236,6 +239,179 @@ void *ip4hdr, const void *ip6hdr); static void tcp_timer_discard(struct tcpcb *, uint32_t); + /* The built-in function block, registered by tcp_init(). The initializers are positional and follow the member order of struct tcp_function_block. */ +static struct tcp_function_block tcp_def_funcblk = { + "default", /* tfb_tcp_block_name */ + tcp_output, /* tfb_tcp_output */ + tcp_do_segment, /* tfb_tcp_do_segment */ + tcp_default_ctloutput, /* tfb_tcp_ctloutput */ + NULL, /* tfb_tcp_fb_init */ + NULL, /* tfb_tcp_fb_fini */ + NULL, /* tfb_tcp_timer_stop_all */ + NULL, /* tfb_tcp_timers_left */ + NULL, /* tfb_tcp_timer_activate */ + NULL, /* tfb_tcp_timer_active */ + NULL, /* tfb_tcp_timer_stop */ + 0, /* tfb_refcnt */ + 0 /* tfb_flags */ +}; + +struct tcp_funchead t_functions; /* list of registered function blocks */ +static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk; /* current default block; copied into tp->t_fb when a tcpcb is set up */ + /* Find a registered block by name. Takes no lock and no reference itself; the callers visible here hold tcp_function_lock around the call. Returns NULL when no name matches. */ +static struct tcp_function_block * +find_tcp_functions_locked(struct tcp_function_set *fs) +{ + struct tcp_function *f; + struct tcp_function_block *blk=NULL; + + TAILQ_FOREACH(f, &t_functions, tf_next) { + if (strcmp(f->tf_fb->tfb_tcp_block_name, fs->function_set_name) == 0) { + blk = f->tf_fb; + break; + } + } + return(blk); +} + /* Check by pointer that blk is on the registered list. Returns blk when it is, NULL otherwise; if s is not NULL the matching list node is stored through it. Like the by-name variant it takes no lock and no reference itself. */ +static struct tcp_function_block * +find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function
**s) +{ + struct tcp_function_block *rblk=NULL; + struct tcp_function *f; + + TAILQ_FOREACH(f, &t_functions, tf_next) { + if (f->tf_fb == blk) { + rblk = blk; + if (s) { + *s = f; + } + break; + } + } + return (rblk); +} + /* By-name lookup that takes tcp_function_lock (read) itself and, on a hit, acquires a reference on the block before unlocking. Returns NULL when the name is not registered. */ +struct tcp_function_block * +find_and_ref_tcp_functions(struct tcp_function_set *fs) +{ + struct tcp_function_block *blk; + + rw_rlock(&tcp_function_lock); + blk = find_tcp_functions_locked(fs); + if (blk) + refcount_acquire(&blk->tfb_refcnt); + rw_runlock(&tcp_function_lock); + return(blk); +} + /* Same as find_and_ref_tcp_functions(), except that the block is identified by pointer instead of by name. */ +struct tcp_function_block * +find_and_ref_tcp_fb(struct tcp_function_block *blk) +{ + struct tcp_function_block *rblk; + + rw_rlock(&tcp_function_lock); + rblk = find_tcp_fb_locked(blk, NULL); + if (rblk) + refcount_acquire(&rblk->tfb_refcnt); + rw_runlock(&tcp_function_lock); + return(rblk); +} + /* Handler of the functions_default sysctl: reports the name of the current default block and, when a new value is written, makes the registered block of that name the default. Returns ENOENT if that block is unknown or flagged TCP_FUNC_BEING_REMOVED. */ + +static int +sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) +{ + int error=ENOENT; + struct tcp_function_set fs; + struct tcp_function_block *blk; + + memset(&fs, 0, sizeof(fs)); + rw_rlock(&tcp_function_lock); + blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL); + if (blk) { + /* Found him */ + strcpy(fs.function_set_name, blk->tfb_tcp_block_name); + fs.pcbcnt = blk->tfb_refcnt; + } + rw_runlock(&tcp_function_lock); + error = sysctl_handle_string(oidp, fs.function_set_name, + sizeof(fs.function_set_name), req); /* presumably leaves a newly written value in fs.function_set_name, which is what the lookup below uses */ + + /* Check for error or no change */ + if (error != 0 || req->newptr == NULL) + return(error); + + rw_wlock(&tcp_function_lock); + blk = find_tcp_functions_locked(&fs); + if ((blk == NULL) || + (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { + error = ENOENT; + goto done; + } + tcp_func_set_ptr = blk; /* changes the default only; no reference is taken for it */ +done: + rw_wunlock(&tcp_function_lock); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, + CTLTYPE_STRING | CTLFLAG_RW, + NULL, 0, sysctl_net_inet_default_tcp_functions, "A", + "Set/get the default TCP functions"); + /* Handler of the read-only functions_available sysctl: formats a header line followed by one text line per registered block (name, default marker, reference count) and hands the result to sysctl_handle_string(). Returns EOVERFLOW if the formatted lines do not fit in the buffer. */ +static int +sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) +{ + int error, cnt, linesz; +
struct tcp_function *f; + char *buffer, *cp; + size_t bufsz, outsz; + + cnt = 0; + rw_rlock(&tcp_function_lock); + TAILQ_FOREACH(f, &t_functions, tf_next) { + cnt++; + } + rw_runlock(&tcp_function_lock); + + bufsz = (cnt+2) * (TCP_FUNCTION_NAME_LEN_MAX + 12) + 1; /* sized from the count taken above; the lock is dropped before the list is walked again below */ + buffer = malloc(bufsz, M_TEMP, M_WAITOK); + + error = 0; + cp = buffer; + + linesz = snprintf(cp, bufsz, "\n%-32s%c %s\n", "Stack", 'D', "PCB count"); /* header line */ + cp += linesz; + bufsz -= linesz; + outsz = linesz; + + rw_rlock(&tcp_function_lock); + TAILQ_FOREACH(f, &t_functions, tf_next) { + linesz = snprintf(cp, bufsz, "%-32s%c %u\n", + f->tf_fb->tfb_tcp_block_name, + (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ', + f->tf_fb->tfb_refcnt); /* one line per block; an asterisk marks the current default */ + if (linesz >= bufsz) { /* the line did not fit into what is left of the buffer */ + error = EOVERFLOW; + break; + } + cp += linesz; + bufsz -= linesz; + outsz += linesz; + } + rw_runlock(&tcp_function_lock); + if (error == 0) + error = sysctl_handle_string(oidp, buffer, outsz + 1, req); + free(buffer, M_TEMP); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, + CTLTYPE_STRING|CTLFLAG_RD, + NULL, 0, sysctl_net_inet_list_available, "A", + "list available TCP Function sets"); + /* * Target size of TCP PCB hash tables. Must be a power of two. * @@ -263,6 +439,8 @@ #define V_tcpcb_zone VNET(tcpcb_zone) MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); +MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); + static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) @@ -311,6 +489,96 @@ return (hashsize); } +int +register_tcp_functions(struct tcp_function_block *blk, int wait) +{ + struct tcp_function_block *lblk; + struct tcp_function *n; + struct tcp_function_set fs; + + if ((blk->tfb_tcp_output == NULL) || + (blk->tfb_tcp_do_segment == NULL) || + (blk->tfb_tcp_ctloutput == NULL) || + (strlen(blk->tfb_tcp_block_name) == 0)) { + /* + * These functions are required and you + * need a name.
+ */ + return (EINVAL); + } + if (blk->tfb_tcp_timer_stop_all || + blk->tfb_tcp_timers_left || + blk->tfb_tcp_timer_activate || + blk->tfb_tcp_timer_active || + blk->tfb_tcp_timer_stop) { + /* + * If you define one timer function you + * must have them all. + */ + if ((blk->tfb_tcp_timer_stop_all == NULL) || + (blk->tfb_tcp_timers_left == NULL) || + (blk->tfb_tcp_timer_activate == NULL) || + (blk->tfb_tcp_timer_active == NULL) || + (blk->tfb_tcp_timer_stop == NULL)) { + return (EINVAL); + } + } + n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); /* wait is passed straight through as the malloc flag; tcp_init() passes M_WAITOK */ + if (n == NULL) { + return (ENOMEM); + } + n->tf_fb = blk; /* the list node only points at the caller's block, the block is not copied */ + strcpy(fs.function_set_name, blk->tfb_tcp_block_name); /* both arrays are TCP_FUNCTION_NAME_LEN_MAX bytes, so this relies on the block name being NUL-terminated */ + rw_wlock(&tcp_function_lock); + lblk = find_tcp_functions_locked(&fs); + if (lblk) { + /* Duplicate name space not allowed */ + rw_wunlock(&tcp_function_lock); + free(n, M_TCPFUNCTIONS); + return (EALREADY); + } + refcount_init(&blk->tfb_refcnt, 0); /* a newly registered block starts with no users and no flags */ + blk->tfb_flags = 0; + TAILQ_INSERT_TAIL(&t_functions, n, tf_next); + rw_wunlock(&tcp_function_lock); + return(0); +} + /* Remove blk from the registered list. Returns EPERM for a block named "default", EBUSY when blk is the current default or still has references (in the latter case it is also flagged TCP_FUNC_BEING_REMOVED), ENOENT when blk is not on the list, and 0 once its list node has been unlinked and freed. blk itself is not freed here. */ +int +deregister_tcp_functions(struct tcp_function_block *blk) +{ + struct tcp_function_block *lblk; + struct tcp_function *f; + int error=ENOENT; + + if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { + /* You can't un-register the default */ + return (EPERM); + } + rw_wlock(&tcp_function_lock); + if (blk == tcp_func_set_ptr) { + /* You can't free the current default */ + rw_wunlock(&tcp_function_lock); + return (EBUSY); + } + if (blk->tfb_refcnt) { + /* Still tcb attached, mark it.
NOTE(review): the flag is set before blk is confirmed to be on the list, and nothing visible here clears it again - confirm that is intended. */ + blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; + rw_wunlock(&tcp_function_lock); + return (EBUSY); + } + lblk = find_tcp_fb_locked(blk, &f); + if (lblk) { + /* Found */ + TAILQ_REMOVE(&t_functions, f, tf_next); + f->tf_fb = NULL; + free(f, M_TCPFUNCTIONS); + error = 0; + } + rw_wunlock(&tcp_function_lock); + return (error); +} + void tcp_init(void) { @@ -325,7 +593,10 @@ if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); - + /* Setup the tcp function block list. NOTE(review): t_functions and tcp_function_lock are plain globals; if tcp_init() can run more than once (for example once per vnet) this would re-initialize them - confirm. */ + TAILQ_INIT(&t_functions); + rw_init_flags(&tcp_function_lock, "tcp_func_lock" , 0); + register_tcp_functions(&tcp_def_funcblk, M_WAITOK); /* return value is not checked */ hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { @@ -768,7 +1039,13 @@ tp->ccv = &tm->ccv; tp->ccv->type = IPPROTO_TCP; tp->ccv->ccvc.tcp = tp; - + rw_rlock(&tcp_function_lock); /* pick up the current default block and take this tcpcb's reference on it under the lock */ + tp->t_fb = tcp_func_set_ptr; + refcount_acquire(&tp->t_fb->tfb_refcnt); + rw_runlock(&tcp_function_lock); + if (tp->t_fb->tfb_tcp_fb_init) { + (*tp->t_fb->tfb_tcp_fb_init)(tp); + } /* * Use the current system default CC algorithm.
*/ @@ -779,12 +1056,18 @@ if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) { + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp); + refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } tp->osd = &tm->osd; if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) { + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp); + refcount_release(&tp->t_fb->tfb_refcnt); uma_zfree(V_tcpcb_zone, tm); return (NULL); } @@ -925,7 +1208,7 @@ if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); @@ -960,6 +1243,10 @@ tcp_timer_stop(tp, TT_KEEP); tcp_timer_stop(tp, TT_2MSL); tcp_timer_stop(tp, TT_DELACK); + if (tp->t_fb->tfb_tcp_timer_stop_all) { + /* Call the stop-all function of the methods */ + tp->t_fb->tfb_tcp_timer_stop_all(tp); + } /* * If we got enough samples through the srtt filter, @@ -1044,6 +1331,14 @@ inp->inp_ppcb = NULL; if ((tp->t_timers->tt_flags & TT_MASK) == 0) { /* We own the last reference on tcpcb, let's free it. */ + if ((tp->t_fb->tfb_tcp_timers_left) && + (tp->t_fb->tfb_tcp_timers_left(tp))) { + /* Some fb timers left running! */ + return; + } + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp); + refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); released = in_pcbrele_wlocked(inp); @@ -1105,6 +1400,14 @@ tp->t_timers->tt_flags &= ~timer_type; if ((tp->t_timers->tt_flags & TT_MASK) == 0) { /* We own the last reference on this tcpcb, let's free it. */ + if ((tp->t_fb->tfb_tcp_timers_left) && + (tp->t_fb->tfb_tcp_timers_left(tp))) { + /* Some fb timers left running! 
*/ + goto leave; + } + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp); + refcount_release(&tp->t_fb->tfb_refcnt); tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { @@ -1113,6 +1416,7 @@ return; } } +leave: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); @@ -1865,7 +2169,7 @@ tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); } #ifdef INET Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c +++ head/sys/netinet/tcp_syncache.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -626,6 +627,7 @@ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { + struct tcp_function_block *blk; struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; @@ -817,6 +819,26 @@ tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); + blk = sototcpcb(lso)->t_fb; + if (blk != tp->t_fb) { + /* + * Our parents t_fb was not the default, + * we need to release our ref on tp->t_fb and + * pickup one on the new entry. 
+ */ + struct tcp_function_block *rblk; + + rblk = find_and_ref_tcp_fb(blk); + KASSERT(rblk != NULL, + ("cannot find blk %p out of syncache?", blk)); + if (tp->t_fb->tfb_tcp_fb_fini) + (*tp->t_fb->tfb_tcp_fb_fini)(tp); + refcount_release(&tp->t_fb->tfb_refcnt); + tp->t_fb = rblk; + if (tp->t_fb->tfb_tcp_fb_init) { + (*tp->t_fb->tfb_tcp_fb_init)(tp); + } + } tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; Index: head/sys/netinet/tcp_timer.c =================================================================== --- head/sys/netinet/tcp_timer.c +++ head/sys/netinet/tcp_timer.c @@ -292,7 +292,7 @@ tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); CURVNET_RESTORE(); } @@ -543,7 +543,7 @@ } tcp_setpersist(tp); tp->t_flags |= TF_FORCEDATA; - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; out: @@ -798,7 +798,7 @@ cc_cong_signal(tp, NULL, CC_RTO); - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); out: #ifdef TCPDEBUG @@ -858,6 +858,10 @@ f_reset = TT_2MSL_RST; break; default: + if (tp->t_fb->tfb_tcp_timer_activate) { + tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); + return; + } panic("tp %p bad timer_type %#x", tp, timer_type); } if (delta == 0) { @@ -904,6 +908,9 @@ t_callout = &tp->t_timers->tt_2msl; break; default: + if (tp->t_fb->tfb_tcp_timer_active) { + return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); + } panic("tp %p bad timer_type %#x", tp, timer_type); } return callout_active(t_callout); @@ -945,6 +952,14 @@ f_reset = TT_2MSL_RST; break; default: + if (tp->t_fb->tfb_tcp_timer_stop) { + /* + * XXXrrs we need to look at this with the + * stop case below (flags). 
+ */ + tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); + return; + } panic("tp %p bad timer_type %#x", tp, timer_type); } Index: head/sys/netinet/tcp_usrreq.c =================================================================== --- head/sys/netinet/tcp_usrreq.c +++ head/sys/netinet/tcp_usrreq.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -509,7 +510,7 @@ goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); INP_WUNLOCK(inp); @@ -579,7 +580,7 @@ (error = tcp_offload_connect(so, nam)) == 0) goto out; #endif - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); goto out; } #endif @@ -597,7 +598,7 @@ goto out; #endif tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_CONNECT); @@ -773,7 +774,7 @@ socantsendmore(so); tcp_usrclosed(tp); if (!(inp->inp_flags & INP_DROPPED)) - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_SHUTDOWN); @@ -809,7 +810,7 @@ tcp_offload_rcvd(tp); else #endif - tcp_output(tp); + tp->t_fb->tfb_tcp_output(tp); out: TCPDEBUG2(PRU_RCVD); @@ -911,7 +912,7 @@ !(flags & PRUS_NOTREADY)) { if (flags & PRUS_MORETOCOME) tp->t_flags |= TF_MORETOCOME; - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); if (flags & PRUS_MORETOCOME) tp->t_flags &= ~TF_MORETOCOME; } @@ -961,7 +962,7 @@ tp->snd_up = tp->snd_una + sbavail(&so->so_snd); if (!(flags & PRUS_NOTREADY)) { tp->t_flags |= TF_FORCEDATA; - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); tp->t_flags &= ~TF_FORCEDATA; } } @@ -997,7 +998,7 @@ error = sbready(&so->so_snd, m, count); SOCKBUF_UNLOCK(&so->so_snd); if (error == 0) - error = tcp_output(tp); + error = tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(inp); return (error); @@ -1349,13 +1350,11 @@ int tcp_ctloutput(struct socket *so, struct sockopt *sopt) 
{
-	int error, opt, optval;
-	u_int ui;
+	int error;
 	struct inpcb *inp;
 	struct tcpcb *tp;
-	struct tcp_info ti;
-	char buf[TCP_CA_NAME_MAX];
-	struct cc_algo *algo;
+	struct tcp_function_block *blk;
+	struct tcp_function_set fsn;
 
 	error = 0;
 	inp = sotoinpcb(so);
@@ -1383,7 +1382,115 @@
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
+	tp = intotcpcb(inp);
+	/*
+	 * TCP_FUNCTION_BLK is handled here, ahead of the
+	 * stack's own tfb_tcp_ctloutput, so that a
+	 * sub-function can *never* overwrite this.
+	 *
+	 * SOPT_SET moves the connection to the named stack;
+	 * it is refused with EINVAL once the tcpcb has left
+	 * TCPS_CLOSED and with ENOENT when the name is
+	 * unknown or the stack is being removed.  SOPT_GET
+	 * reports the current stack's name and reference
+	 * count.
+	 */
+	if ((sopt->sopt_dir == SOPT_SET) &&
+	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+		INP_WUNLOCK(inp);
+		error = sooptcopyin(sopt, &fsn, sizeof fsn,
+		    sizeof fsn);
+		if (error)
+			return (error);
+		/*
+		 * The name comes from userland and is compared
+		 * with strcmp(); make sure it is terminated.
+		 */
+		fsn.function_set_name[sizeof(fsn.function_set_name) - 1] = '\0';
+		INP_WLOCK_RECHECK(inp);
+		if (tp->t_state != TCPS_CLOSED) {
+			/*
+			 * The user has advanced the state
+			 * past the initial point, we can't
+			 * switch since we are down the road
+			 * and a new set of functions may
+			 * not be compatible.
+			 */
+			INP_WUNLOCK(inp);
+			return (EINVAL);
+		}
+		blk = find_and_ref_tcp_functions(&fsn);
+		if (blk == NULL) {
+			INP_WUNLOCK(inp);
+			return (ENOENT);
+		}
+		if (tp->t_fb != blk) {
+			if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+				refcount_release(&blk->tfb_refcnt);
+				INP_WUNLOCK(inp);
+				return (ENOENT);
+			}
+			/*
+			 * Release the old refcnt, the
+			 * lookup acquires a ref on the
+			 * new one.
+			 */
+			if (tp->t_fb->tfb_tcp_fb_fini)
+				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			refcount_release(&tp->t_fb->tfb_refcnt);
+			tp->t_fb = blk;
+			if (tp->t_fb->tfb_tcp_fb_init) {
+				(*tp->t_fb->tfb_tcp_fb_init)(tp);
+			}
+		} else {
+			/*
+			 * Already on this stack: the tcpcb keeps
+			 * the reference it holds, so give back
+			 * the extra one taken by the lookup.
+			 */
+			refcount_release(&blk->tfb_refcnt);
+		}
+#ifdef TCP_OFFLOAD
+		if (tp->t_flags & TF_TOE) {
+			tcp_offload_ctloutput(tp, sopt->sopt_dir,
+			    sopt->sopt_name);
+		}
+#endif
+		INP_WUNLOCK(inp);
+		return (error);
+	} else if ((sopt->sopt_dir == SOPT_GET) &&
+	    (sopt->sopt_name == TCP_FUNCTION_BLK)) {
+		/*
+		 * Clear the reply first: fsn lives on the stack
+		 * and everything after the name's terminator
+		 * would otherwise be copied out uninitialized.
+		 */
+		memset(&fsn, 0, sizeof(fsn));
+		strlcpy(fsn.function_set_name,
+		    tp->t_fb->tfb_tcp_block_name,
+		    sizeof(fsn.function_set_name));
+		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
+		INP_WUNLOCK(inp);
+		error = sooptcopyout(sopt, &fsn, sizeof fsn);
+		return (error);
+	}
+	/* Pass in the INP locked, callee must unlock it */
+	return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
+}
 
+/*
+ * Socket option handling of the default stack.  Called
+ * from tcp_ctloutput() with the inpcb locked.
+ */
+int
+tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
+{
+	int error, opt, optval;
+	u_int ui;
+	struct tcp_info ti;
+	struct cc_algo *algo;
+	char buf[TCP_CA_NAME_MAX];
+
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
@@ -1451,7 +1558,7 @@
 			else if (tp->t_flags & TF_NOPUSH) {
 				tp->t_flags &= ~TF_NOPUSH;
 				if (TCPS_HAVEESTABLISHED(tp->t_state))
-					error = tcp_output(tp);
+					error = tp->t_fb->tfb_tcp_output(tp);
 			}
 			goto unlock_and_done;
 
@@ -1770,7 +1877,7 @@
 		sbflush(&so->so_rcv);
 		tcp_usrclosed(tp);
 		if (!(inp->inp_flags & INP_DROPPED))
-			tcp_output(tp);
+			tp->t_fb->tfb_tcp_output(tp);
 	}
 }
 
Index: head/sys/netinet/tcp_var.h
===================================================================
--- head/sys/netinet/tcp_var.h
+++ head/sys/netinet/tcp_var.h
@@ -89,6 +89,52 @@
 
 #define tcp6cb tcpcb /* for KAME src sync over BSD*'s */
 
+/*
+ * TODO: We yet need to brave plowing in
+ * to tcp_input() and the pru_usrreq() block.
+ * Right now these go to the old standards which
+ * are somewhat ok, but in the long term may
+ * need to be changed. If we do tackle tcp_input()
+ * then we need to get rid of the tcp_do_segment()
+ * function below.
+ */ +/* Flags for tcp functions */ +#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ +struct tcpcb; +struct inpcb; +struct sockopt; +struct socket; + /* A TCP function block ("stack"): a name plus the entry points that the rest of the code calls through tp->t_fb. register_tcp_functions() rejects a block that lacks tfb_tcp_output, tfb_tcp_do_segment, tfb_tcp_ctloutput or a non-empty name. */ +struct tcp_function_block { + char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; /* name the block is registered and looked up under */ + int (*tfb_tcp_output)(struct tcpcb *); /* called where this change replaces direct tcp_output() calls */ + void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, + int, int, uint8_t, + int); /* called from tcp_input.c in place of tcp_do_segment(), with the same arguments */ + int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp); /* receives every socket option that tcp_ctloutput() does not handle itself; entered with the inpcb locked and has to unlock it */ + /* Optional memory allocation/free routine */ + void (*tfb_tcp_fb_init)(struct tcpcb *); /* if set, called when a tcpcb starts using this block */ + void (*tfb_tcp_fb_fini)(struct tcpcb *); /* if set, called before a tcpcb drops its reference on this block */ + /* Optional timers, must define all if you define one */ + int (*tfb_tcp_timer_stop_all)(struct tcpcb *); /* called after the built-in TT_* timers have been stopped */ + int (*tfb_tcp_timers_left)(struct tcpcb *); /* a non-zero return postpones freeing the tcpcb */ + void (*tfb_tcp_timer_activate)(struct tcpcb *, + uint32_t, u_int); /* this hook and the two below are used for timer_type values the built-in switch does not know */ + int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); + void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); + volatile uint32_t tfb_refcnt; /* references held on the block, one per tcpcb whose t_fb points at it; reported to userland as pcbcnt */ + uint32_t tfb_flags; /* TCP_FUNC_* flags */ +}; + /* List node that puts one registered tcp_function_block on t_functions. */ +struct tcp_function { + TAILQ_ENTRY(tcp_function) tf_next; + struct tcp_function_block *tf_fb; +}; + +TAILQ_HEAD(tcp_funchead, tcp_function); /* head type of the registered-block list */ + /* * Tcp control block, one per tcp; fields: * Organized for 16 byte cacheline efficiency. @@ -207,9 +253,10 @@ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ u_int t_pmtud_saved_maxopd; /* pre-blackhole MSS */ u_int t_flags2; /* More tcpcb flags storage */ - uint32_t t_ispare[8]; /* 5 UTO, 3 TBD */ - void *t_pspare2[4]; /* 1 TCP_SIGNATURE, 3 TBD */ + struct tcp_function_block *t_fb;/* TCP function call block */ + void *t_fb_ptr; /* Pointer to t_fb specific data */ + void *t_pspare2[2]; /* 1 TCP_SIGNATURE, 1 TBD */ #if defined(_KERNEL) && defined(TCPPCAP) struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets.
*/ @@ -534,6 +581,8 @@ #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL +#define TI_UNLOCKED 1 /* values of the ti_locked argument of tcp_do_segment(); previously private to tcp_input.c */ +#define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ @@ -684,7 +733,32 @@ int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); void tcp_reass_global_init(void); void tcp_reass_flush(struct tcpcb *); +void tcp_dooptions(struct tcpopt *, u_char *, int, int); /* the tcp_input.c helpers declared from here on were static before this change */ +void tcp_dropwithreset(struct mbuf *, struct tcphdr *, + struct tcpcb *, int, int); +void tcp_pulloutofband(struct socket *, + struct tcphdr *, struct mbuf *, int); +void tcp_xmit_timer(struct tcpcb *, int); +void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); +void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, + uint16_t type); +void cc_conn_init(struct tcpcb *tp); +void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); +void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); /* only moved: the same prototype is removed further down in this header */ +void hhook_run_tcp_est_in(struct tcpcb *tp, + struct tcphdr *th, struct tcpopt *to); + int tcp_input(struct mbuf **, int *, int); +void tcp_do_segment(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, int, int, uint8_t, + int); /* no longer static: it is the tfb_tcp_do_segment of the default function block */ + /* TCP function block registry, implemented in tcp_subr.c. */ +int register_tcp_functions(struct tcp_function_block *blk, int wait); /* returns 0, EINVAL, ENOMEM or EALREADY; wait is used as the malloc wait flag */ +int deregister_tcp_functions(struct tcp_function_block *blk); /* returns 0, EPERM, EBUSY or ENOENT */ +struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); /* lookup by name; NULL if not registered, otherwise returned with a reference taken */ +struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk); /* lookup by pointer; same reference rule */ +int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* tfb_tcp_ctloutput of the default function block */ + u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, @@ -752,8 +826,6 @@ u_long tcp_seq_subtract(u_long, u_long ); int tcp_compute_pipe(struct tcpcb *); -void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); - static
inline void tcp_fields_to_host(struct tcphdr *th) { Index: head/sys/netinet/toecore.c =================================================================== --- head/sys/netinet/toecore.c +++ head/sys/netinet/toecore.c @@ -509,7 +509,7 @@ KASSERT(!(tp->t_flags & TF_TOE), ("%s: tp %p still offloaded.", __func__, tp)); tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); - (void) tcp_output(tp); + (void) tp->t_fb->tfb_tcp_output(tp); } else { INP_INFO_RLOCK_ASSERT(&V_tcbinfo);