Allow switching TCP stacks on a connection that has left TCPS_CLOSED.

Summary of what this patch does:
  * tfb_tcp_fb_fini gains an int argument.  Callers that free the tcpcb
    pass 1; callers that only swap the function block pass 0.
  * A new optional tfb_tcp_handoff_ok callback lets the target stack
    accept (return 0) or refuse (return an errno) a switch requested
    through the function-block socket option once t_state is no longer
    TCPS_CLOSED.  Without the callback the switch still fails with EINVAL.
  * The function-block lookup now happens before the state check, and the
    reference it takes is released on every early-return path.

NOTE(review): the line breaks and tab indentation below were reconstructed
from a whitespace-collapsed copy of this patch.  Verify the context lines
against the tree, or apply with "patch -l", before committing.

Index: tcp_subr.c
===================================================================
--- tcp_subr.c
+++ tcp_subr.c
@@ -1200,7 +1200,7 @@
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
 			if (tp->t_fb->tfb_tcp_fb_fini)
-				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
@@ -1209,7 +1209,7 @@
 	tp->osd = &tm->osd;
 	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
 		if (tp->t_fb->tfb_tcp_fb_fini)
-			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		uma_zfree(V_tcpcb_zone, tm);
 		return (NULL);
@@ -1483,7 +1483,7 @@
 	if (tp->t_timers->tt_draincnt == 0) {
 		/* We own the last reference on tcpcb, let's free it. */
 		if (tp->t_fb->tfb_tcp_fb_fini)
-			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_inpcb = NULL;
 		uma_zfree(V_tcpcb_zone, tp);
@@ -1512,7 +1512,7 @@
 	if (tp->t_timers->tt_draincnt == 0) {
 		/* We own the last reference on this tcpcb, let's free it. */
 		if (tp->t_fb->tfb_tcp_fb_fini)
-			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_inpcb = NULL;
 		uma_zfree(V_tcpcb_zone, tp);
Index: tcp_syncache.c
===================================================================
--- tcp_syncache.c
+++ tcp_syncache.c
@@ -842,7 +842,7 @@
 			KASSERT(rblk != NULL,
 			    ("cannot find blk %p out of syncache?", blk));
 			if (tp->t_fb->tfb_tcp_fb_fini)
-				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			tp->t_fb = rblk;
 			if (tp->t_fb->tfb_tcp_fb_init) {
Index: tcp_usrreq.c
===================================================================
--- tcp_usrreq.c
+++ tcp_usrreq.c
@@ -1420,41 +1420,60 @@
 		if (error)
 			return (error);
 		INP_WLOCK_RECHECK(inp);
+		blk = find_and_ref_tcp_functions(&fsn);
+		if (blk == NULL) {
+			INP_WUNLOCK(inp);
+			return (ENOENT);
+		}
+		if (tp->t_fb == blk) {
+			/* You already have this */
+			refcount_release(&blk->tfb_refcnt);
+			INP_WUNLOCK(inp);
+			return (0);
+		}
 		if (tp->t_state != TCPS_CLOSED) {
+			error = EINVAL;
 			/*
 			 * The user has advanced the state
-			 * past the initial point, we can't
-			 * switch since we are down the road
-			 * and a new set of functions may
-			 * not be compatibile.
+			 * past the initial point, we may not
+			 * be able to switch.
 			 */
-			INP_WUNLOCK(inp);
-			return(EINVAL);
+			if (blk->tfb_tcp_handoff_ok != NULL) {
+				/*
+				 * Does the stack provide a
+				 * query mechanism, if so it may
+				 * still be possible?
+				 */
+				error = (*blk->tfb_tcp_handoff_ok)(tp);
+			}
+			if (error) {
+				refcount_release(&blk->tfb_refcnt);
+				INP_WUNLOCK(inp);
+				return (error);
+			}
 		}
-		blk = find_and_ref_tcp_functions(&fsn);
-		if (blk == NULL) {
+		if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+			refcount_release(&blk->tfb_refcnt);
 			INP_WUNLOCK(inp);
 			return (ENOENT);
 		}
-		if (tp->t_fb != blk) {
-			if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
-				refcount_release(&blk->tfb_refcnt);
-				INP_WUNLOCK(inp);
-				return (ENOENT);
-			}
+		/*
+		 * Release the old refcnt, the
+		 * lookup acquired a ref on the
+		 * new one already.
+		 */
+		if (tp->t_fb->tfb_tcp_fb_fini) {
 			/*
-			 * Release the old refcnt, the
-			 * lookup acquires a ref on the
-			 * new one.
+			 * Tell the stack to clean up with 0, i.e.
+			 * the tcb is not going away.
 			 */
-			if (tp->t_fb->tfb_tcp_fb_fini)
-				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
-			refcount_release(&tp->t_fb->tfb_refcnt);
-			tp->t_fb = blk;
-			if (tp->t_fb->tfb_tcp_fb_init) {
-				(*tp->t_fb->tfb_tcp_fb_init)(tp);
-			}
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 		}
+		refcount_release(&tp->t_fb->tfb_refcnt);
+		tp->t_fb = blk;
+		if (tp->t_fb->tfb_tcp_fb_init) {
+			(*tp->t_fb->tfb_tcp_fb_init)(tp);
+		}
 #ifdef TCP_OFFLOAD
 		if (tp->t_flags & TF_TOE) {
 			tcp_offload_ctloutput(tp, sopt->sopt_dir,
Index: tcp_var.h
===================================================================
--- tcp_var.h
+++ tcp_var.h
@@ -116,6 +116,18 @@
  * does not know your callbacks you must provide a
  * stop_all function that loops through and calls
  * tcp_timer_stop() with each of your defined timers.
+ * Adding a tfb_tcp_handoff_ok function allows the socket
+ * option to change stacks to query you even if the
+ * connection is in a later stage. You return 0 to
+ * say you can take over and run your stack, you return
+ * non-zero (an error number) to say no you can't.
+ * If the function is undefined you can only change
+ * in the early states (before connect or listen).
+ * tfb_tcp_fb_fini takes a flag that tells the old
+ * stack whether the tcb is being destroyed or not.
+ * A one in the flag means the TCB is being destroyed,
+ * a zero indicates it is transitioning to another
+ * stack (via socket option).
  */
 struct tcp_function_block {
 	char	tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
@@ -128,7 +140,7 @@
 			    struct inpcb *inp, struct tcpcb *tp);
 	/* Optional memory allocation/free routine */
 	void	(*tfb_tcp_fb_init)(struct tcpcb *);
-	void	(*tfb_tcp_fb_fini)(struct tcpcb *);
+	void	(*tfb_tcp_fb_fini)(struct tcpcb *, int);
 	/* Optional timers, must define all if you define one */
 	int	(*tfb_tcp_timer_stop_all)(struct tcpcb *);
 	void	(*tfb_tcp_timer_activate)(struct tcpcb *,
@@ -136,6 +148,7 @@
 	int	(*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_rexmit_tmr)(struct tcpcb *);
+	int	(*tfb_tcp_handoff_ok)(struct tcpcb *);
 	volatile uint32_t tfb_refcnt;
 	uint32_t  tfb_flags;
 };