Index: sys/dev/cxgbe/cxgbei/icl_cxgbei.c
===================================================================
--- sys/dev/cxgbe/cxgbei/icl_cxgbei.c
+++ sys/dev/cxgbe/cxgbei/icl_cxgbei.c
@@ -697,7 +697,7 @@
 		    ISCSI_DATA_DIGEST_SIZE;
 	}
 	so->so_options |= SO_NO_DDP;
-	toep->ulp_mode = ULP_MODE_ISCSI;
+	toep->params.ulp_mode = ULP_MODE_ISCSI;
 	toep->ulpcb = icc;

 	send_iscsi_flowc_wr(icc->sc, toep, ci->max_tx_pdu_len);
Index: sys/dev/cxgbe/iw_cxgbe/qp.c
===================================================================
--- sys/dev/cxgbe/iw_cxgbe/qp.c
+++ sys/dev/cxgbe/iw_cxgbe/qp.c
@@ -1415,7 +1415,7 @@
 	ret = c4iw_wait_for_reply(rdev, &ep->com.wr_wait, ep->hwtid,
 	    qhp->wq.sq.qid, ep->com.so, __func__);

-	toep->ulp_mode = ULP_MODE_RDMA;
+	toep->params.ulp_mode = ULP_MODE_RDMA;
 	free_ird(rhp, qhp->attr.max_ird);

 	return ret;
Index: sys/dev/cxgbe/t4_main.c
===================================================================
--- sys/dev/cxgbe/t4_main.c
+++ sys/dev/cxgbe/t4_main.c
@@ -6229,9 +6229,9 @@
 	    "(-1 = default, 0 = reno, 1 = tahoe, 2 = newreno, "
 	    "3 = highspeed)");

-	sc->tt.sndbuf = 256 * 1024;
+	sc->tt.sndbuf = -1;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW,
-	    &sc->tt.sndbuf, 0, "max hardware send buffer size");
+	    &sc->tt.sndbuf, 0, "hardware send buffer");

 	sc->tt.ddp = 0;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp",
@@ -6239,7 +6239,7 @@
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_zcopy", CTLFLAG_RW,
 	    &sc->tt.ddp, 0, "Enable zero-copy aio_read(2)");

-	sc->tt.rx_coalesce = 1;
+	sc->tt.rx_coalesce = -1;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce", CTLFLAG_RW,
 	    &sc->tt.rx_coalesce, 0, "receive coalescing");

@@ -6251,7 +6251,7 @@
 	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tls_rx_ports,
 	    "I", "TCP ports that use inline TLS+TOE RX");

-	sc->tt.tx_align = 1;
+	sc->tt.tx_align = -1;
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align", CTLFLAG_RW,
 	    &sc->tt.tx_align, 0, "chop and align payload");
Index: sys/dev/cxgbe/tom/t4_connect.c
===================================================================
--- sys/dev/cxgbe/tom/t4_connect.c
+++ sys/dev/cxgbe/tom/t4_connect.c
@@ -102,7 +102,7 @@
 	make_established(toep, be32toh(cpl->snd_isn) - 1,
 	    be32toh(cpl->rcv_isn) - 1, cpl->tcp_opt);

-	if (toep->ulp_mode == ULP_MODE_TLS)
+	if (ulp_mode(toep) == ULP_MODE_TLS)
 		tls_establish(toep);

 done:
@@ -165,96 +165,6 @@
 	return (0);
 }

-/*
- * Options2 for active open.
- */
-static uint32_t
-calc_opt2a(struct socket *so, struct toepcb *toep,
-    const struct offload_settings *s)
-{
-	struct tcpcb *tp = so_sototcpcb(so);
-	struct port_info *pi = toep->vi->pi;
-	struct adapter *sc = pi->adapter;
-	uint32_t opt2 = 0;
-
-	/*
-	 * rx flow control, rx coalesce, congestion control, and tx pace are all
-	 * explicitly set by the driver.  On T5+ the ISS is also set by the
-	 * driver to the value picked by the kernel.
-	 */
-	if (is_t4(sc)) {
-		opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
-		opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
-	} else {
-		opt2 |= F_T5_OPT_2_VALID;	/* all 4 valid */
-		opt2 |= F_T5_ISS;		/* ISS provided in CPL */
-	}
-
-	if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT)))
-		opt2 |= F_SACK_EN;
-
-	if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP)))
-		opt2 |= F_TSTAMPS_EN;
-
-	if (tp->t_flags & TF_REQ_SCALE)
-		opt2 |= F_WND_SCALE_EN;
-
-	if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1))
-		opt2 |= F_CCTRL_ECN;
-
-	/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
-
-	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
-
-	/* These defaults are subject to ULP specific fixups later. */
-	opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
-
-	opt2 |= V_PACE(0);
-
-	if (s->cong_algo >= 0)
-		opt2 |= V_CONG_CNTRL(s->cong_algo);
-	else if (sc->tt.cong_algorithm >= 0)
-		opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
-	else {
-		struct cc_algo *cc = CC_ALGO(tp);
-
-		if (strcasecmp(cc->name, "reno") == 0)
-			opt2 |= V_CONG_CNTRL(CONG_ALG_RENO);
-		else if (strcasecmp(cc->name, "tahoe") == 0)
-			opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
-		if (strcasecmp(cc->name, "newreno") == 0)
-			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
-		if (strcasecmp(cc->name, "highspeed") == 0)
-			opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED);
-		else {
-			/*
-			 * Use newreno in case the algorithm selected by the
-			 * host stack is not supported by the hardware.
-			 */
-			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
-		}
-	}
-
-	if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce))
-		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
-
-	/* Note that ofld_rxq is already set according to s->rxq. */
-	opt2 |= F_RSS_QUEUE_VALID;
-	opt2 |= V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id);
-
-#ifdef USE_DDP_RX_FLOW_CONTROL
-	if (toep->ulp_mode == ULP_MODE_TCPDDP)
-		opt2 |= F_RX_FC_DDP;
-#endif
-
-	if (toep->ulp_mode == ULP_MODE_TLS) {
-		opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
-		opt2 |= F_RX_FC_DISABLE;
-	}
-
-	return (htobe32(opt2));
-}
-
 void
 t4_init_connect_cpl_handlers(void)
 {
@@ -322,7 +232,7 @@
 	struct wrqe *wr = NULL;
 	struct ifnet *rt_ifp = rt->rt_ifp;
 	struct vi_info *vi;
-	int mtu_idx, rscale, qid_atid, rc, isipv6, txqid, rxqid;
+	int qid_atid, rc, isipv6;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	int reason;
@@ -353,18 +263,7 @@
 	if (!settings.offload)
 		DONT_OFFLOAD_ACTIVE_OPEN(EPERM);

-	if (settings.txq >= 0 && settings.txq < vi->nofldtxq)
-		txqid = settings.txq;
-	else
-		txqid = arc4random() % vi->nofldtxq;
-	txqid += vi->first_ofld_txq;
-	if (settings.rxq >= 0 && settings.rxq < vi->nofldrxq)
-		rxqid = settings.rxq;
-	else
-		rxqid = arc4random() % vi->nofldrxq;
-	rxqid += vi->first_ofld_rxq;
-
-	toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT | M_ZERO);
+	toep = alloc_toepcb(vi, M_NOWAIT);
 	if (toep == NULL)
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);

@@ -377,27 +276,16 @@
 	if (toep->l2te == NULL)
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);

+	toep->vnet = so->so_vnet;
+	init_conn_params(vi, &settings, &inp->inp_inc, so, NULL,
+	    toep->l2te->idx, &toep->params);
+	init_toepcb(vi, toep);
+
 	isipv6 = nam->sa_family == AF_INET6;
 	wr = alloc_wrqe(act_open_cpl_size(sc, isipv6), toep->ctrlq);
 	if (wr == NULL)
 		DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM);

-	toep->vnet = so->so_vnet;
-	set_ulp_mode(toep, select_ulp_mode(so, sc, &settings));
-	SOCKBUF_LOCK(&so->so_rcv);
-	toep->opt0_rcv_bufsize = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
-	SOCKBUF_UNLOCK(&so->so_rcv);
-
-	/*
-	 * The kernel sets request_r_scale based on sb_max whereas we need to
-	 * take hardware's MAX_RCV_WND into account too.  This is normally a
-	 * no-op as MAX_RCV_WND is much larger than the default sb_max.
-	 */
-	if (tp->t_flags & TF_REQ_SCALE)
-		rscale = tp->request_r_scale = select_rcv_wscale();
-	else
-		rscale = 0;
-	mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, &settings);
 	qid_atid = V_TID_QID(toep->ofld_rxq->iq.abs_id) | V_TID_TID(toep->tid) |
 	    V_TID_COOKIE(CPL_COOKIE_TOM);

@@ -438,9 +326,13 @@
 		cpl->peer_port = inp->inp_fport;
 		cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0];
 		cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8];
-		cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
-		    toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
-		cpl->opt2 = calc_opt2a(so, toep, &settings);
+		cpl->opt0 = calc_options0(vi, &toep->params);
+		cpl->opt2 = calc_options2(vi, &toep->params);
+
+		CTR6(KTR_CXGBE,
+		    "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x",
+		    __func__, toep->tid, toep, inp, be64toh(cpl->opt0),
+		    be32toh(cpl->opt2));
 	} else {
 		struct cpl_act_open_req *cpl = wrtod(wr);
 		struct cpl_t5_act_open_req *cpl5 = (void *)cpl;
@@ -467,13 +359,14 @@
 		    qid_atid));
 		inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port,
 		    &cpl->peer_ip, &cpl->peer_port);
-		cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale,
-		    toep->opt0_rcv_bufsize, toep->ulp_mode, &settings);
-		cpl->opt2 = calc_opt2a(so, toep, &settings);
-	}
+		cpl->opt0 = calc_options0(vi, &toep->params);
+		cpl->opt2 = calc_options2(vi, &toep->params);

-	CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__,
-	    toep->tid, tcpstates[tp->t_state], toep, inp);
+		CTR6(KTR_CXGBE,
+		    "%s: atid %u, toep %p, inp %p, opt0 %#016lx, opt2 %#08x",
+		    __func__, toep->tid, toep, inp, be64toh(cpl->opt0),
+		    be32toh(cpl->opt2));
+	}

 	offload_socket(so, toep);
 	rc = t4_l2t_send(sc, wr, toep->l2te);
Index: sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- sys/dev/cxgbe/tom/t4_cpl_io.c
+++ sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -77,7 +77,7 @@
 static void t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);

 void
-send_flowc_wr(struct toepcb *toep, struct flowc_tx_params *ftxp)
+send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
 {
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
@@ -91,17 +91,17 @@
 	KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
 	    ("%s: flowc for tid %u sent already", __func__, toep->tid));

-	if (ftxp != NULL)
+	if (tp != NULL)
 		nparams = 8;
 	else
 		nparams = 6;
-	if (toep->ulp_mode == ULP_MODE_TLS)
+	if (ulp_mode(toep) == ULP_MODE_TLS)
 		nparams++;
 	if (toep->tls.fcplenmax != 0)
 		nparams++;
-	if (toep->tc_idx != -1) {
-		MPASS(toep->tc_idx >= 0 &&
-		    toep->tc_idx < sc->chip_params->nsched_cls);
+	if (toep->params.tc_idx != -1) {
+		MPASS(toep->params.tc_idx >= 0 &&
+		    toep->params.tc_idx < sc->chip_params->nsched_cls);
 		nparams++;
 	}

@@ -133,30 +133,23 @@
 	FLOWC_PARAM(CH, pi->tx_chan);
 	FLOWC_PARAM(PORT, pi->tx_chan);
 	FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
-	if (ftxp) {
-		uint32_t sndbuf = min(ftxp->snd_space, sc->tt.sndbuf);
-
-		FLOWC_PARAM(SNDNXT, ftxp->snd_nxt);
-		FLOWC_PARAM(RCVNXT, ftxp->rcv_nxt);
-		FLOWC_PARAM(SNDBUF, sndbuf);
-		FLOWC_PARAM(MSS, ftxp->mss);
-
-		CTR6(KTR_CXGBE,
-		    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
-		    __func__, toep->tid, ftxp->mss, sndbuf, ftxp->snd_nxt,
-		    ftxp->rcv_nxt);
-	} else {
-		FLOWC_PARAM(SNDBUF, 512);
-		FLOWC_PARAM(MSS, 512);
-
-		CTR2(KTR_CXGBE, "%s: tid %u", __func__, toep->tid);
+	FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
+	FLOWC_PARAM(MSS, toep->params.emss);
+	if (tp) {
+		FLOWC_PARAM(SNDNXT, tp->snd_nxt);
+		FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
 	}
-	if (toep->ulp_mode == ULP_MODE_TLS)
-		FLOWC_PARAM(ULP_MODE, toep->ulp_mode);
+	CTR6(KTR_CXGBE,
+	    "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
+	    __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
+	    tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);
+
+	if (ulp_mode(toep) == ULP_MODE_TLS)
+		FLOWC_PARAM(ULP_MODE, ulp_mode(toep));
 	if (toep->tls.fcplenmax != 0)
 		FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax);
-	if (toep->tc_idx != -1)
-		FLOWC_PARAM(SCHEDCLASS, toep->tc_idx);
+	if (toep->params.tc_idx != -1)
+		FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
 #undef FLOWC_PARAM

 	KASSERT(paramidx == nparams, ("nparams mismatch"));
@@ -197,7 +190,7 @@
 		MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls);
 	}

-	if (toep->tc_idx != tc_idx) {
+	if (toep->params.tc_idx != tc_idx) {
 		struct wrqe *wr;
 		struct fw_flowc_wr *flowc;
 		int nparams = 1, flowclen, flowclen16;
@@ -236,9 +229,9 @@
 		t4_wrq_tx(sc, wr);
 	}

-	if (toep->tc_idx >= 0)
-		t4_release_cl_rl(sc, port_id, toep->tc_idx);
-	toep->tc_idx = tc_idx;
+	if (toep->params.tc_idx >= 0)
+		t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
+	toep->params.tc_idx = tc_idx;

 	return (0);
 }
@@ -313,30 +306,30 @@

 	INP_LOCK_ASSERT(inp);

-	toep->tcp_opt = opt;
-	toep->mtu_idx = G_TCPOPT_MSS(opt);
-	tp->t_maxseg = sc->params.mtus[toep->mtu_idx];
+	toep->params.mtu_idx = G_TCPOPT_MSS(opt);
+	tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);

-	toep->emss = tp->t_maxseg;
+	toep->params.emss = tp->t_maxseg;
 	if (G_TCPOPT_TSTAMP(opt)) {
+		toep->params.tstamp = 1;
+		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
 		tp->t_flags |= TF_RCVD_TSTMP;	/* timestamps ok */
 		tp->ts_recent = 0;		/* hmmm */
 		tp->ts_recent_age = tcp_ts_getticks();
-		toep->emss -= TCPOLEN_TSTAMP_APPA;
-	}
+	} else
+		toep->params.tstamp = 0;

-	CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u",
-	    __func__, toep->tid, toep->mtu_idx,
-	    sc->params.mtus[G_TCPOPT_MSS(opt)], tp->t_maxseg, toep->emss);
-
-	if (G_TCPOPT_SACK(opt))
+	if (G_TCPOPT_SACK(opt)) {
+		toep->params.sack = 1;
 		tp->t_flags |= TF_SACK_PERMIT;	/* should already be set */
-	else
+	} else {
+		toep->params.sack = 0;
 		tp->t_flags &= ~TF_SACK_PERMIT;	/* sack disallowed by peer */
+	}

 	if (G_TCPOPT_WSCALE_OK(opt))
 		tp->t_flags |= TF_RCVD_SCALE;
@@ -346,7 +339,13 @@
 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
 		tp->rcv_scale = tp->request_r_scale;
 		tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
-	}
+	} else
+		toep->params.wscale = 0;
+
+	CTR6(KTR_CXGBE,
+	    "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
+	    toep->tid, toep->params.mtu_idx, toep->params.emss,
+	    toep->params.tstamp, toep->params.sack, toep->params.wscale);
 }

 /*
@@ -361,9 +360,7 @@
 	struct inpcb *inp = toep->inp;
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
-	long bufsize;
 	uint16_t tcpopt = be16toh(opt);
-	struct flowc_tx_params ftxp;

 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_SYN_SENT ||
@@ -379,7 +376,7 @@
 	tp->irs = irs;
 	tcp_rcvseqinit(tp);
-	tp->rcv_wnd = (u_int)toep->opt0_rcv_bufsize << 10;
+	tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
 	tp->rcv_adv += tp->rcv_wnd;
 	tp->last_ack_sent = tp->rcv_nxt;

@@ -390,20 +387,8 @@
 	tp->snd_max = iss + 1;

 	assign_rxopt(tp, tcpopt);
+	send_flowc_wr(toep, tp);

-	SOCKBUF_LOCK(&so->so_snd);
-	if (so->so_snd.sb_flags & SB_AUTOSIZE && V_tcp_do_autosndbuf)
-		bufsize = V_tcp_autosndbuf_max;
-	else
-		bufsize = sbspace(&so->so_snd);
-	SOCKBUF_UNLOCK(&so->so_snd);
-
-	ftxp.snd_nxt = tp->snd_nxt;
-	ftxp.rcv_nxt = tp->rcv_nxt;
-	ftxp.snd_space = bufsize;
-	ftxp.mss = toep->emss;
-	send_flowc_wr(toep, &ftxp);
-
 	soisconnected(so);
 }
@@ -459,7 +444,7 @@
 	SOCKBUF_LOCK_ASSERT(sb);
 	rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
-	if (toep->ulp_mode == ULP_MODE_TLS) {
+	if (ulp_mode(toep) == ULP_MODE_TLS) {
 		if (toep->tls.rcv_over >= rx_credits) {
 			toep->tls.rcv_over -= rx_credits;
 			rx_credits = 0;
@@ -578,7 +563,7 @@

 static inline void
 write_tx_wr(void *dst, struct toepcb *toep, unsigned int immdlen,
-    unsigned int plen, uint8_t credits, int shove, int ulp_submode, int txalign)
+    unsigned int plen, uint8_t credits, int shove, int ulp_submode)
 {
 	struct fw_ofld_tx_data_wr *txwr = dst;

@@ -586,20 +571,18 @@
 	    V_FW_WR_IMMDLEN(immdlen));
 	txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
 	    V_FW_WR_LEN16(credits));
-	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(toep->ulp_mode) |
+	txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) |
 	    V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
 	txwr->plen = htobe32(plen);

-	if (txalign > 0) {
-		struct tcpcb *tp = intotcpcb(toep->inp);
-
-		if (plen < 2 * toep->emss)
+	if (toep->params.tx_align > 0) {
+		if (plen < 2 * toep->params.emss)
 			txwr->lsodisable_to_flags |=
 			    htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
 		else
 			txwr->lsodisable_to_flags |=
 			    htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
-				(tp->t_flags & TF_NODELAY ? 0 :
+				(toep->params.nagle == 0 ? 0 :
 				F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
 	}
 }
@@ -694,11 +677,11 @@
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));

-	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
-	    toep->ulp_mode == ULP_MODE_TCPDDP ||
-	    toep->ulp_mode == ULP_MODE_TLS ||
-	    toep->ulp_mode == ULP_MODE_RDMA,
-	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
+	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
+	    ulp_mode(toep) == ULP_MODE_TCPDDP ||
+	    ulp_mode(toep) == ULP_MODE_TLS ||
+	    ulp_mode(toep) == ULP_MODE_RDMA,
+	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

 #ifdef VERBOSE_TRACES
 	CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
@@ -837,8 +820,7 @@
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
-			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0,
-			    sc->tt.tx_align);
+			write_tx_wr(txwr, toep, plen, plen, credits, shove, 0);
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
@@ -856,8 +838,7 @@
 			}
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
-			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0,
-			    sc->tt.tx_align);
+			write_tx_wr(txwr, toep, 0, plen, credits, shove, 0);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf);
 			if (wr_len & 0xf) {
@@ -877,7 +858,7 @@
 		    toep->tx_nocompl >= toep->tx_total / 4)
 			compl = 1;

-		if (compl || toep->ulp_mode == ULP_MODE_RDMA) {
+		if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
 			txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
 			toep->tx_nocompl = 0;
 			toep->plen_nocompl = 0;
@@ -951,8 +932,8 @@
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
-	KASSERT(toep->ulp_mode == ULP_MODE_ISCSI,
-	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
+	KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
+	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));

 	if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 		return;
@@ -1035,7 +1016,7 @@
 			txwr = wrtod(wr);
 			credits = howmany(wr->wr_len, 16);
 			write_tx_wr(txwr, toep, plen, adjusted_plen, credits,
-			    shove, ulp_submode, sc->tt.tx_align);
+			    shove, ulp_submode);
 			m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
 			nsegs = 0;
 		} else {
@@ -1053,7 +1034,7 @@
 			txwr = wrtod(wr);
 			credits = howmany(wr_len, 16);
 			write_tx_wr(txwr, toep, 0, adjusted_plen, credits,
-			    shove, ulp_submode, sc->tt.tx_align);
+			    shove, ulp_submode);
 			write_tx_sgl(txwr + 1, sndptr, m, nsegs, max_nsegs_1mbuf);
 			if (wr_len & 0xf) {
@@ -1119,7 +1100,7 @@
 	    ("%s: inp %p dropped.", __func__, inp));
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));

-	if (toep->ulp_mode == ULP_MODE_ISCSI)
+	if (ulp_mode(toep) == ULP_MODE_ISCSI)
 		t4_push_pdus(sc, toep, 0);
 	else if (tls_tx_key(toep))
 		t4_push_tls_records(sc, toep, 0);
@@ -1145,7 +1126,7 @@
 	toep->flags |= TPF_SEND_FIN;
 	if (tp->t_state >= TCPS_ESTABLISHED) {
-		if (toep->ulp_mode == ULP_MODE_ISCSI)
+		if (ulp_mode(toep) == ULP_MODE_ISCSI)
 			t4_push_pdus(sc, toep, 0);
 		else if (tls_tx_key(toep))
 			t4_push_tls_records(sc, toep, 0);
@@ -1232,7 +1213,7 @@
 	so = inp->inp_socket;
 	socantrcvmore(so);

-	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		DDP_LOCK(toep);
 		if (__predict_false(toep->ddp.flags &
 		    (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
@@ -1240,7 +1221,7 @@
 		DDP_UNLOCK(toep);
 	}

-	if (toep->ulp_mode != ULP_MODE_RDMA) {
+	if (ulp_mode(toep) != ULP_MODE_RDMA) {
 		KASSERT(tp->rcv_nxt == be32toh(cpl->rcv_nxt),
 		    ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 		    be32toh(cpl->rcv_nxt)));
@@ -1551,14 +1532,14 @@
 	tp->rcv_nxt += len;
 	if (tp->rcv_wnd < len) {
-		KASSERT(toep->ulp_mode == ULP_MODE_RDMA,
+		KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
 		    ("%s: negative window size", __func__));
 	}

 	tp->rcv_wnd -= len;
 	tp->t_rcvtime = ticks;

-	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_LOCK(toep);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
@@ -1569,7 +1550,7 @@
 		    __func__, tid, len);
 		m_freem(m);
 		SOCKBUF_UNLOCK(sb);
-		if (toep->ulp_mode == ULP_MODE_TCPDDP)
+		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 			DDP_UNLOCK(toep);
 		INP_WUNLOCK(inp);

@@ -1600,7 +1581,7 @@
 		sb->sb_flags &= ~SB_AUTOSIZE;
 	}

-	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;

 		if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
@@ -1643,7 +1624,7 @@
 		tp->rcv_adv += rx_credits;
 	}

-	if (toep->ulp_mode == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
 	    sbavail(sb) != 0) {
 		CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 		    tid);
@@ -1651,7 +1632,7 @@
 	}
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
-	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		DDP_UNLOCK(toep);

 	INP_WUNLOCK(inp);
@@ -1761,7 +1742,7 @@
 #endif
 	toep->flags &= ~TPF_TX_SUSPENDED;
 	CURVNET_SET(toep->vnet);
-	if (toep->ulp_mode == ULP_MODE_ISCSI)
+	if (ulp_mode(toep) == ULP_MODE_ISCSI)
 		t4_push_pdus(sc, toep, plen);
 	else if (tls_tx_key(toep))
 		t4_push_tls_records(sc, toep, plen);
@@ -1774,7 +1755,7 @@
 		SOCKBUF_LOCK(sb);
 		sbu = sbused(sb);
-		if (toep->ulp_mode == ULP_MODE_ISCSI) {
+		if (ulp_mode(toep) == ULP_MODE_ISCSI) {

 			if (__predict_false(sbu > 0)) {
 				/*
Index: sys/dev/cxgbe/tom/t4_ddp.c
===================================================================
--- sys/dev/cxgbe/tom/t4_ddp.c
+++ sys/dev/cxgbe/tom/t4_ddp.c
@@ -767,7 +767,7 @@
 		    __func__, vld, tid, toep);
 	}

-	if (toep->ulp_mode == ULP_MODE_ISCSI) {
+	if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 		t4_cpl_handler[CPL_RX_ISCSI_DDP](iq, rss, m);
 		return (0);
 	}
Index: sys/dev/cxgbe/tom/t4_listen.c
===================================================================
--- sys/dev/cxgbe/tom/t4_listen.c
+++ sys/dev/cxgbe/tom/t4_listen.c
@@ -348,7 +348,7 @@
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
-	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
+	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
 	struct wrqe *wr;
 	struct fw_flowc_wr *flowc;
 	struct cpl_abort_req *req;
@@ -368,8 +368,8 @@
 		return;	/* abort already in progress */
 	synqe->flags |= TPF_ABORT_SHUTDOWN;

-	ofld_txq = &sc->sge.ofld_txq[synqe->txqid];
-	ofld_rxq = &sc->sge.ofld_rxq[synqe->rxqid];
+	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
+	ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];

 	/* The wrqe will have two WRs - a flowc followed by an abort_req */
 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
@@ -836,7 +836,7 @@
 {
 	struct listen_ctx *lctx = synqe->lctx;
 	struct inpcb *inp = lctx->inp;
-	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
+	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
 	int ntids;

 	INP_WLOCK_ASSERT(inp);
@@ -887,7 +887,7 @@

 	INP_WLOCK(inp);

-	ofld_txq = &sc->sge.ofld_txq[synqe->txqid];
+	ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];

 	/*
 	 * If we'd initiated an abort earlier the reply to it is responsible for
@@ -962,28 +962,6 @@
 	synqe->flags |= TPF_SYNQE_EXPANDED;
 }

-static inline void
-save_qids_in_synqe(struct synq_entry *synqe, struct vi_info *vi,
-    struct offload_settings *s)
-{
-	uint32_t txqid, rxqid;
-
-	if (s->txq >= 0 && s->txq < vi->nofldtxq)
-		txqid = s->txq;
-	else
-		txqid = arc4random() % vi->nofldtxq;
-	txqid += vi->first_ofld_txq;
-
-	if (s->rxq >= 0 && s->rxq < vi->nofldrxq)
-		rxqid = s->rxq;
-	else
-		rxqid = arc4random() % vi->nofldrxq;
-	rxqid += vi->first_ofld_rxq;
-
-	synqe->txqid = txqid;
-	synqe->rxqid = rxqid;
-}
-
 static void
 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
 {
@@ -1006,95 +984,6 @@
 		to->to_flags |= TOF_SACKPERM;
 }

-/*
- * Options2 for passive open.
- */
-static uint32_t
-calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
-    const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode,
-    struct cc_algo *cc, const struct offload_settings *s)
-{
-	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
-	uint32_t opt2 = 0;
-
-	/*
-	 * rx flow control, rx coalesce, congestion control, and tx pace are all
-	 * explicitly set by the driver.  On T5+ the ISS is also set by the
-	 * driver to the value picked by the kernel.
-	 */
-	if (is_t4(sc)) {
-		opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
-		opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
-	} else {
-		opt2 |= F_T5_OPT_2_VALID;	/* all 4 valid */
-		opt2 |= F_T5_ISS;		/* ISS provided in CPL */
-	}
-
-	if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323)))
-		opt2 |= F_SACK_EN;
-
-	if (tcpopt->tstamp &&
-	    (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
-		opt2 |= F_TSTAMPS_EN;
-
-	if (tcpopt->wsf < 15 && V_tcp_do_rfc1323)
-		opt2 |= F_WND_SCALE_EN;
-
-	if (th->th_flags & (TH_ECE | TH_CWR) &&
-	    (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
-		opt2 |= F_CCTRL_ECN;
-
-	/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
-
-	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
-
-	/* These defaults are subject to ULP specific fixups later. */
-	opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
-
-	opt2 |= V_PACE(0);
-
-	if (s->cong_algo >= 0)
-		opt2 |= V_CONG_CNTRL(s->cong_algo);
-	else if (sc->tt.cong_algorithm >= 0)
-		opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
-	else {
-		if (strcasecmp(cc->name, "reno") == 0)
-			opt2 |= V_CONG_CNTRL(CONG_ALG_RENO);
-		else if (strcasecmp(cc->name, "tahoe") == 0)
-			opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
-		if (strcasecmp(cc->name, "newreno") == 0)
-			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
-		if (strcasecmp(cc->name, "highspeed") == 0)
-			opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED);
-		else {
-			/*
-			 * Use newreno in case the algorithm selected by the
-			 * host stack is not supported by the hardware.
-			 */
-			opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
-		}
-	}
-
-	if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce))
-		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
-
-	/* Note that ofld_rxq is already set according to s->rxq. */
-	opt2 |= F_RSS_QUEUE_VALID;
-	opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id);
-
-#ifdef USE_DDP_RX_FLOW_CONTROL
-	if (ulp_mode == ULP_MODE_TCPDDP)
-		opt2 |= F_RX_FC_DDP;
-#endif
-
-	if (ulp_mode == ULP_MODE_TLS) {
-		opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
-		opt2 |= F_RX_FC_DISABLE;
-	}
-
-	return (htobe32(opt2));
-}
-
 static void
 pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
     struct in_conninfo *inc, struct tcphdr *th)
@@ -1189,7 +1078,7 @@
 {
 	struct wrqe *wr;
 	struct cpl_pass_accept_rpl *rpl;
-	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
+	struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];

 	wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
 	    sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
@@ -1385,6 +1274,9 @@
 		}
 		atomic_store_int(&synqe->ok_to_respond, 0);

+		init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
+		    &synqe->params);
+
 		/*
 		 * If all goes well t4_syncache_respond will get called during
 		 * syncache_add.  Note that syncache_add releases the pcb lock.
@@ -1395,27 +1287,12 @@
 		if (atomic_load_int(&synqe->ok_to_respond) > 0) {
 			uint64_t opt0;
 			uint32_t opt2;
-			u_int wnd;
-			int rscale, mtu_idx, rx_credits;

-			mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
-			rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
-			wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
-			wnd = min(wnd, MAX_RCV_WND);
-			rx_credits = min(wnd >> 10, M_RCV_BUFSIZ);
+			opt0 = calc_options0(vi, &synqe->params);
+			opt2 = calc_options2(vi, &synqe->params);

-			save_qids_in_synqe(synqe, vi, &settings);
-			synqe->ulp_mode = select_ulp_mode(so, sc, &settings);
-
-			opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits,
-			    synqe->ulp_mode, &settings);
-			opt2 = calc_opt2p(sc, pi, synqe->rxqid, &cpl->tcpopt, &th,
-			    synqe->ulp_mode, CC_ALGO(intotcpcb(inp)), &settings);
-
 			insert_tid(sc, tid, synqe, ntids);
 			synqe->tid = tid;
-			synqe->l2e_idx = e->idx;
-			synqe->rcv_bufsize = rx_credits;
 			synqe->syn = m;
 			m = NULL;
@@ -1427,8 +1304,8 @@
 			}

 			CTR6(KTR_CXGBE,
-			    "%s: stid %u, tid %u, lctx %p, synqe %p, mode %d, SYNACK",
-			    __func__, stid, tid, lctx, synqe, synqe->ulp_mode);
+			    "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
+			    __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
 	} else
 		REJECT_PASS_ACCEPT_REQ(false);
@@ -1540,18 +1417,19 @@
 		return (0);
 	}

-	KASSERT(synqe->rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
+	KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
 	    ("%s: CPL arrived on unexpected rxq. %d %d", __func__,
-	    synqe->rxqid, (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
+	    synqe->params.rxq_idx,
+	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));

-	toep = alloc_toepcb(vi, synqe->txqid, synqe->rxqid, M_NOWAIT);
+	toep = alloc_toepcb(vi, M_NOWAIT);
 	if (toep == NULL)
 		goto reset;

 	toep->tid = tid;
-	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
+	toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
 	toep->vnet = lctx->vnet;
-	set_ulp_mode(toep, synqe->ulp_mode);
-	toep->opt0_rcv_bufsize = synqe->rcv_bufsize;
+	bcopy(&synqe->params, &toep->params, sizeof(toep->params));
+	init_toepcb(vi, toep);

 	MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
 	MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
Index: sys/dev/cxgbe/tom/t4_tls.c
===================================================================
--- sys/dev/cxgbe/tom/t4_tls.c
+++ sys/dev/cxgbe/tom/t4_tls.c
@@ -590,7 +590,7 @@
 		    "KEY_WRITE_TX", uk_ctx->proto_ver);

 	if (G_KEY_GET_LOC(uk_ctx->l_p_key) == KEY_WRITE_RX &&
-	    toep->ulp_mode != ULP_MODE_TLS)
+	    ulp_mode(toep) != ULP_MODE_TLS)
 		return (EOPNOTSUPP);

 	/* Don't copy the 'tx' and 'rx' fields. */
@@ -788,7 +788,7 @@
 		INP_WUNLOCK(inp);
 		break;
 	case TCP_TLSOM_CLR_TLS_TOM:
-		if (toep->ulp_mode == ULP_MODE_TLS) {
+		if (ulp_mode(toep) == ULP_MODE_TLS) {
 			CTR2(KTR_CXGBE, "%s: tid %d CLR_TLS_TOM", __func__,
 			    toep->tid);
 			tls_clr_ofld_mode(toep);
@@ -797,7 +797,7 @@
 		INP_WUNLOCK(inp);
 		break;
 	case TCP_TLSOM_CLR_QUIES:
-		if (toep->ulp_mode == ULP_MODE_TLS) {
+		if (ulp_mode(toep) == ULP_MODE_TLS) {
 			CTR2(KTR_CXGBE, "%s: tid %d CLR_QUIES", __func__,
 			    toep->tid);
 			tls_clr_quiesce(toep);
@@ -820,7 +820,7 @@
 		 */
 		optval = TLS_TOM_NONE;
 		if (can_tls_offload(td_adapter(toep->td))) {
-			switch (toep->ulp_mode) {
+			switch (ulp_mode(toep)) {
 			case ULP_MODE_NONE:
 			case ULP_MODE_TCPDDP:
 				optval = TLS_TOM_TXONLY;
@@ -853,7 +853,7 @@
 	tls_ofld->key_location = TLS_SFO_WR_CONTEXTLOC_DDR;
 	tls_ofld->rx_key_addr = -1;
 	tls_ofld->tx_key_addr = -1;
-	if (toep->ulp_mode == ULP_MODE_TLS)
+	if (ulp_mode(toep) == ULP_MODE_TLS)
 		callout_init_mtx(&tls_ofld->handshake_timer,
 		    &tls_handshake_lock, 0);
 }
@@ -882,7 +882,7 @@
 tls_uninit_toep(struct toepcb *toep)
 {

-	if (toep->ulp_mode == ULP_MODE_TLS)
+	if (ulp_mode(toep) == ULP_MODE_TLS)
 		tls_stop_handshake_timer(toep);
 	clear_tls_keyid(toep);
 }
@@ -1097,9 +1097,9 @@
 	KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 	    ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
-	KASSERT(toep->ulp_mode == ULP_MODE_NONE ||
-	    toep->ulp_mode == ULP_MODE_TCPDDP || toep->ulp_mode == ULP_MODE_TLS,
-	    ("%s: ulp_mode %u for toep %p", __func__, toep->ulp_mode, toep));
+	KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
+	    ulp_mode(toep) == ULP_MODE_TCPDDP || ulp_mode(toep) == ULP_MODE_TLS,
+	    ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 	KASSERT(tls_tx_key(toep),
 	    ("%s: TX key not set for toep %p", __func__, toep));
Index: sys/dev/cxgbe/tom/t4_tom.h
===================================================================
--- sys/dev/cxgbe/tom/t4_tom.h
+++ sys/dev/cxgbe/tom/t4_tom.h
@@ -34,6 +34,7 @@
 #define __T4_TOM_H__
 #include
 #include "common/t4_hw.h"
+#include "common/t4_msg.h"
 #include "tom/t4_tls.h"

 #define LISTEN_HASH_SIZE 32
@@ -86,6 +87,31 @@
 struct sockopt;
 struct offload_settings;

+/*
+ * Connection parameters for an offloaded connection.  These are mostly (but not
+ * all) hardware TOE parameters.
+ */
+struct conn_params {
+	int8_t rx_coalesce;
+	int8_t cong_algo;
+	int8_t tc_idx;
+	int8_t tstamp;
+	int8_t sack;
+	int8_t nagle;
+	int8_t keepalive;
+	int8_t wscale;
+	int8_t ecn;
+	int8_t mtu_idx;
+	int8_t ulp_mode;
+	int8_t tx_align;
+	int16_t txq_idx;	/* ofld_txq = &sc->sge.ofld_txq[txq_idx] */
+	int16_t rxq_idx;	/* ofld_rxq = &sc->sge.ofld_rxq[rxq_idx] */
+	int16_t l2t_idx;
+	uint16_t emss;
+	uint16_t opt0_bufsize;
+	u_int sndbuf;		/* controls TP tx pages */
+};
+
 struct ofld_tx_sdesc {
 	uint32_t plen;		/* payload length */
 	uint8_t tx_credits;	/* firmware tx credits (unit is 16B) */
@@ -161,7 +187,6 @@
 	struct l2t_entry *l2te;	/* L2 table entry used by this connection */
 	struct clip_entry *ce;	/* CLIP table entry used by this tid */
 	int tid;		/* Connection identifier */
-	int tc_idx;		/* traffic class that this tid is bound to */

 	/* tx credit handling */
 	u_int tx_total;		/* total tx WR credits (in 16B units) */
@@ -169,12 +194,8 @@
 	u_int tx_nocompl;	/* tx WR credits since last compl request */
 	u_int plen_nocompl;	/* payload since last compl request */

-	uint16_t opt0_rcv_bufsize;	/* XXX: save full opt0/opt2 for later? */
-	uint16_t mtu_idx;
-	uint16_t emss;
-	uint16_t tcp_opt;
+	struct conn_params params;

-	u_int ulp_mode;	/* ULP mode */
 	void *ulpcb;
 	void *ulpcb2;
 	struct mbufq ulp_pduq;	/* PDUs waiting to be sent out. */
@@ -195,17 +216,17 @@
 	struct ofld_tx_sdesc txsd[];
 };

+static inline int
+ulp_mode(struct toepcb *toep)
+{
+
+	return (toep->params.ulp_mode);
+}
+
 #define	DDP_LOCK(toep)		mtx_lock(&(toep)->ddp.lock)
 #define	DDP_UNLOCK(toep)	mtx_unlock(&(toep)->ddp.lock)
 #define	DDP_ASSERT_LOCKED(toep)	mtx_assert(&(toep)->ddp.lock, MA_OWNED)

-struct flowc_tx_params {
-	uint32_t snd_nxt;
-	uint32_t rcv_nxt;
-	unsigned int snd_space;
-	unsigned int mss;
-};
-
 /*
  * Compressed state for embryonic connections for a listener.
 */
@@ -219,13 +240,10 @@
 	uint32_t iss;
 	uint32_t irs;
 	uint32_t ts;
-	uint16_t txqid;
-	uint16_t rxqid;
-	uint16_t l2e_idx;
-	uint16_t ulp_mode;
-	uint16_t rcv_bufsize;
 	__be16 tcp_opt;	/* from cpl_pass_establish */
 	struct toepcb *toep;
+
+	struct conn_params params;
 };

 /* listen_ctx flags */
@@ -324,7 +342,8 @@
 }

 /* t4_tom.c */
-struct toepcb *alloc_toepcb(struct vi_info *, int, int, int);
+struct toepcb *alloc_toepcb(struct vi_info *, int);
+int init_toepcb(struct vi_info *, struct toepcb *);
 struct toepcb *hold_toepcb(struct toepcb *);
 void free_toepcb(struct toepcb *);
 void offload_socket(struct socket *, struct toepcb *);
@@ -334,16 +353,14 @@
 void *lookup_tid(struct adapter *, int);
 void update_tid(struct adapter *, int, void *);
 void remove_tid(struct adapter *, int, int);
-int find_best_mtu_idx(struct adapter *, struct in_conninfo *,
-    struct offload_settings *);
 u_long select_rcv_wnd(struct socket *);
 int select_rcv_wscale(void);
-uint64_t calc_opt0(struct socket *, struct vi_info *, struct l2t_entry *,
-    int, int, int, int, struct offload_settings *);
+void init_conn_params(struct vi_info *, struct offload_settings *,
+    struct in_conninfo *, struct socket *, const struct tcp_options *, int16_t,
+    struct conn_params *cp);
+__be64 calc_options0(struct vi_info *, struct conn_params *);
+__be32 calc_options2(struct vi_info *, struct conn_params *);
 uint64_t select_ntuple(struct vi_info *, struct l2t_entry *);
-int select_ulp_mode(struct socket *, struct adapter *,
-    struct offload_settings *);
-void set_ulp_mode(struct toepcb *, int);
 int negative_advice(int);
 int add_tid_to_history(struct adapter *, u_int);
@@ -375,7 +392,7 @@
 void t4_init_cpl_io_handlers(void);
 void t4_uninit_cpl_io_handlers(void);
 void send_abort_rpl(struct adapter *, struct sge_wrq *, int , int);
-void send_flowc_wr(struct toepcb *, struct flowc_tx_params *);
+void send_flowc_wr(struct toepcb *, struct tcpcb *);
 void send_reset(struct adapter *, struct toepcb *, uint32_t);
 int send_rx_credits(struct adapter *, struct toepcb *, int);
 void send_rx_modulate(struct adapter *, struct toepcb *);
@@ -388,8 +405,8 @@
 int t4_send_rst(struct toedev *, struct tcpcb *);
 void t4_set_tcb_field(struct adapter *, struct sge_wrq *, struct toepcb *,
     uint16_t, uint64_t, uint64_t, int, int);
-void t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop);
-void t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop);
+void t4_push_frames(struct adapter *, struct toepcb *, int);
+void t4_push_pdus(struct adapter *, struct toepcb *, int);

 /* t4_ddp.c */
 int t4_init_ppod_region(struct ppod_region *, struct t4_range *, u_int,
Index: sys/dev/cxgbe/tom/t4_tom.c
===================================================================
--- sys/dev/cxgbe/tom/t4_tom.c
+++ sys/dev/cxgbe/tom/t4_tom.c
@@ -48,6 +48,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -64,6 +65,7 @@
 #include
 #include
 #include
+#include

 #ifdef TCP_OFFLOAD
 #include "common/common.h"
@@ -104,7 +106,7 @@
 static void reclaim_wr_resources(void *, int);

 struct toepcb *
-alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, int flags)
+alloc_toepcb(struct vi_info *vi, int flags)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
@@ -127,16 +129,6 @@
 	txsd_total = tx_credits /
 	    howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);

-	KASSERT(txqid >= vi->first_ofld_txq &&
-	    txqid < vi->first_ofld_txq + vi->nofldtxq,
-	    ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi,
-		vi->first_ofld_txq, vi->nofldtxq));
-
-	KASSERT(rxqid >= vi->first_ofld_rxq &&
-	    rxqid < vi->first_ofld_rxq + vi->nofldrxq,
-	    ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi,
-		vi->first_ofld_rxq, vi->nofldrxq));
-
 	len = offsetof(struct toepcb, txsd) +
 	    txsd_total * sizeof(struct ofld_tx_sdesc);

@@ -147,12 +139,9 @@
 	refcount_init(&toep->refcount, 1);
 	toep->td = sc->tom_softc;
 	toep->vi = vi;
-	toep->tc_idx = -1;
+	toep->tid = -1;
 	toep->tx_total = tx_credits;
 	toep->tx_credits = tx_credits;
-	toep->ofld_txq = &sc->sge.ofld_txq[txqid];
-	toep->ofld_rxq = &sc->sge.ofld_rxq[rxqid];
-	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
 	mbufq_init(&toep->ulp_pduq, INT_MAX);
 	mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
 	toep->txsd_total = txsd_total;
@@ -164,6 +153,42 @@
 	return (toep);
 }

+/*
+ * Initialize a toepcb after its params have been filled out.
+ */
+int
+init_toepcb(struct vi_info *vi, struct toepcb *toep)
+{
+	struct conn_params *cp = &toep->params;
+	struct port_info *pi = vi->pi;
+	struct adapter *sc = pi->adapter;
+	struct tx_cl_rl_params *tc;
+
+	if (cp->tc_idx >= 0 && cp->tc_idx < sc->chip_params->nsched_cls) {
+		tc = &pi->sched_params->cl_rl[cp->tc_idx];
+		mtx_lock(&sc->tc_lock);
+		if (tc->flags & CLRL_ERR) {
+			log(LOG_ERR,
+			    "%s: failed to associate traffic class %u with tid %u\n",
+			    device_get_nameunit(vi->dev), cp->tc_idx,
+			    toep->tid);
+			cp->tc_idx = -1;
+		} else {
+			tc->refcount++;
+		}
+		mtx_unlock(&sc->tc_lock);
+	}
+	toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx];
+	toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx];
+	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
+
+	tls_init_toep(toep);
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
+		ddp_init_toep(toep);
+
+	return (0);
+}
+
 struct toepcb *
 hold_toepcb(struct toepcb *toep)
 {
@@ -184,7 +209,7 @@
 	KASSERT(!(toep->flags & TPF_CPL_PENDING), ("%s: CPL pending", __func__));

-	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		ddp_uninit_toep(toep);
 	tls_uninit_toep(toep);
 	free(toep, M_CXGBE);
@@ -291,7 +316,7 @@
 	MPASS(mbufq_len(&toep->ulp_pduq) == 0);
 	MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);
 #ifdef INVARIANTS
-	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		ddp_assert_empty(toep);
 #endif
 	MPASS(TAILQ_EMPTY(&toep->aiotx_jobq));
@@ -307,8 +332,8 @@
 	if (toep->ce)
 		t4_release_lip(sc, toep->ce);

-	if (toep->tc_idx != -1)
-		t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->tc_idx);
+	if (toep->params.tc_idx != -1)
+		t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx);

 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
@@ -378,9 +403,9 @@
 	case TCP_NODELAY:
 		if (tp->t_state != TCPS_ESTABLISHED)
 			break;
+		toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
 		t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
-		    V_TF_NAGLE(1), V_TF_NAGLE(tp->t_flags & TF_NODELAY ? 0 : 1),
-		    0, 0);
+		    V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0);
 		break;
 	default:
 		break;
@@ -798,7 +823,7 @@
 	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
 	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);

-	if (toep->ulp_mode == ULP_MODE_TCPDDP)
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		release_ddp_resources(toep);
 	toep->inp = NULL;
 	toep->flags &= ~TPF_CPL_PENDING;
@@ -854,7 +879,7 @@
 * account for any TCP options so the effective MSS (only payload, no headers or
 * options) could be different.
 */
-int
+static int
 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc,
     struct offload_settings *s)
 {
@@ -907,41 +932,97 @@
 	return (wscale);
 }

-/*
- * socket so could be a listening socket too.
- */
-uint64_t
-calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e,
-    int mtu_idx, int rscale, int rx_credits, int ulp_mode,
-    struct offload_settings *s)
+__be64
+calc_options0(struct vi_info *vi, struct conn_params *cp)
 {
-	int keepalive;
-	uint64_t opt0;
+	uint64_t opt0 = 0;

-	MPASS(so != NULL);
-	MPASS(vi != NULL);
-	KASSERT(rx_credits <= M_RCV_BUFSIZ,
-	    ("%s: rcv_bufsiz too high", __func__));
+	opt0 |= F_TCAM_BYPASS;

-	opt0 = F_TCAM_BYPASS | V_WND_SCALE(rscale) | V_MSS_IDX(mtu_idx) |
-	    V_ULP_MODE(ulp_mode) | V_RCV_BUFSIZ(rx_credits) |
-	    V_L2T_IDX(e->idx) | V_SMAC_SEL(vi->smt_idx) |
-	    V_TX_CHAN(vi->pi->tx_chan);
+	MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE);
+	opt0 |= V_WND_SCALE(cp->wscale);

-	keepalive = tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE;
-	opt0 |= V_KEEP_ALIVE(keepalive != 0);
+	MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS);
+	opt0 |= V_MSS_IDX(cp->mtu_idx);

-	if (s->nagle < 0) {
-		struct inpcb *inp = sotoinpcb(so);
-		struct tcpcb *tp = intotcpcb(inp);
+	MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE);
+	opt0 |= V_ULP_MODE(cp->ulp_mode);

-		opt0 |= V_NAGLE((tp->t_flags & TF_NODELAY) == 0);
-	} else
-		opt0 |= V_NAGLE(s->nagle != 0);
+	MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ);
+	opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize);

-	return htobe64(opt0);
+	MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->pi->adapter->vres.l2t.size);
+	opt0 |= V_L2T_IDX(cp->l2t_idx);
+
+	opt0 |= V_SMAC_SEL(vi->smt_idx);
+	opt0 |= V_TX_CHAN(vi->pi->tx_chan);
+
+	MPASS(cp->keepalive == 0 || cp->keepalive == 1);
+	opt0 |= V_KEEP_ALIVE(cp->keepalive);
+
+	MPASS(cp->nagle == 0 || cp->nagle == 1);
+	opt0 |= V_NAGLE(cp->nagle);
+
+	return (htobe64(opt0));
 }

+__be32
+calc_options2(struct vi_info *vi, struct conn_params *cp)
+{
+	uint32_t opt2 = 0;
+	struct port_info *pi = vi->pi;
+	struct adapter *sc = pi->adapter;
+
+	/*
+	 * rx flow control, rx coalesce, congestion control, and tx pace are all
+	 * explicitly set by the driver.  On T5+ the ISS is also set by the
+	 * driver to the value picked by the kernel.
+	 */
+	if (is_t4(sc)) {
+		opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
+		opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
+	} else {
+		opt2 |= F_T5_OPT_2_VALID;	/* all 4 valid */
+		opt2 |= F_T5_ISS;		/* ISS provided in CPL */
+	}
+
+	MPASS(cp->sack == 0 || cp->sack == 1);
+	opt2 |= V_SACK_EN(cp->sack);
+
+	MPASS(cp->tstamp == 0 || cp->tstamp == 1);
+	opt2 |= V_TSTAMPS_EN(cp->tstamp);
+
+	if (cp->wscale > 0)
+		opt2 |= F_WND_SCALE_EN;
+
+	MPASS(cp->ecn == 0 || cp->ecn == 1);
+	opt2 |= V_CCTRL_ECN(cp->ecn);
+
+	/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
+
+	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
+	opt2 |= V_PACE(0);
+	opt2 |= F_RSS_QUEUE_VALID;
+	opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id);
+
+	MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL);
+	opt2 |= V_CONG_CNTRL(cp->cong_algo);
+
+	MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1);
+	if (cp->rx_coalesce == 1)
+		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
+
+	opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
+#ifdef USE_DDP_RX_FLOW_CONTROL
+	if (cp->ulp_mode == ULP_MODE_TCPDDP)
+		opt2 |= F_RX_FC_DDP;
+#endif
+	if (cp->ulp_mode == ULP_MODE_TLS)
+		opt2 |= F_RX_FC_DISABLE;
+
+	return (htobe32(opt2));
+}
+
 uint64_t
 select_ntuple(struct vi_info *vi, struct l2t_entry *e)
 {
@@ -994,31 +1075,209 @@
 	return (rc);
 }

-int
-select_ulp_mode(struct socket *so, struct adapter *sc,
-    struct offload_settings *s)
+/*
+ * Initialize various connection parameters.
+ */
+void
+init_conn_params(struct vi_info *vi , struct offload_settings *s,
+    struct in_conninfo *inc, struct socket *so,
+    const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp)
 {
+	struct port_info *pi = vi->pi;
+	struct adapter *sc = pi->adapter;
+	struct tom_tunables *tt = &sc->tt;
+	struct inpcb *inp = sotoinpcb(so);
+	struct tcpcb *tp = intotcpcb(inp);
+	u_long wnd;
+	MPASS(s->offload != 0);
+
+	/* Congestion control algorithm */
+	if (s->cong_algo >= 0)
+		cp->cong_algo = s->cong_algo & M_CONG_CNTRL;
+	else if (sc->tt.cong_algorithm >= 0)
+		cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL;
+	else {
+		struct cc_algo *cc = CC_ALGO(tp);
+
+		if (strcasecmp(cc->name, "reno") == 0)
+			cp->cong_algo = CONG_ALG_RENO;
+		else if (strcasecmp(cc->name, "tahoe") == 0)
+			cp->cong_algo = CONG_ALG_TAHOE;
+		if (strcasecmp(cc->name, "newreno") == 0)
+			cp->cong_algo = CONG_ALG_NEWRENO;
+		if (strcasecmp(cc->name, "highspeed") == 0)
+			cp->cong_algo = CONG_ALG_HIGHSPEED;
+		else {
+			/*
+			 * Use newreno in case the algorithm selected by the
+			 * host stack is not supported by the hardware.
+			 */
+			cp->cong_algo = CONG_ALG_NEWRENO;
+		}
+	}
+
+	/* Tx traffic scheduling class. */
+	if (s->sched_class >= 0 &&
+	    s->sched_class < sc->chip_params->nsched_cls) {
+		cp->tc_idx = s->sched_class;
+	} else
+		cp->tc_idx = -1;
+
+	/* Nagle's algorithm. */
+	if (s->nagle >= 0)
+		cp->nagle = s->nagle > 0 ? 1 : 0;
+	else
+		cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
+
+	/* TCP Keepalive. */
+	if (tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE)
+		cp->keepalive = 1;
+	else
+		cp->keepalive = 0;
+
+	/* Optimization that's specific to T5 @ 40G. */
+	if (tt->tx_align >= 0)
+		cp->tx_align = tt->tx_align > 0 ? 1 : 0;
+	else if (chip_id(sc) == CHELSIO_T5 &&
+	    (port_top_speed(pi) > 10 || sc->params.nports > 2))
+		cp->tx_align = 1;
+	else
+		cp->tx_align = 0;
+
+	/* ULP mode. */
 	if (can_tls_offload(sc) &&
 	    (s->tls > 0 || (s->tls < 0 && is_tls_sock(so, sc))))
-		return (ULP_MODE_TLS);
+		cp->ulp_mode = ULP_MODE_TLS;
 	else if (s->ddp > 0 ||
-	    (s->ddp < 0 && sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0))
-		return (ULP_MODE_TCPDDP);
+	    (s->ddp < 0 && sc->tt.ddp && (so_options_get(so) & SO_NO_DDP) == 0))
+		cp->ulp_mode = ULP_MODE_TCPDDP;
 	else
-		return (ULP_MODE_NONE);
-}
+		cp->ulp_mode = ULP_MODE_NONE;

-void
-set_ulp_mode(struct toepcb *toep, int ulp_mode)
-{
+	/* Rx coalescing. */
+	if (s->rx_coalesce >= 0)
+		cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0;
+	else if (cp->ulp_mode == ULP_MODE_TLS)
+		cp->rx_coalesce = 0;
+	else if (tt->rx_coalesce >= 0)
+		cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0;
+	else
+		cp->rx_coalesce = 1;	/* default */

-	CTR4(KTR_CXGBE, "%s: toep %p (tid %d) ulp_mode %d",
-	    __func__, toep, toep->tid, ulp_mode);
-	toep->ulp_mode = ulp_mode;
-	tls_init_toep(toep);
-	if (toep->ulp_mode == ULP_MODE_TCPDDP)
-		ddp_init_toep(toep);
+	/*
+	 * Index in the PMTU table.  This controls the MSS that we announce in
+	 * our SYN initially, but after ESTABLISHED it controls the MSS that we
+	 * use to send data.
+	 */
+	cp->mtu_idx = find_best_mtu_idx(sc, inc, s);
+
+	/* Tx queue for this connection. */
+	if (s->txq >= 0 && s->txq < vi->nofldtxq)
+		cp->txq_idx = s->txq;
+	else
+		cp->txq_idx = arc4random() % vi->nofldtxq;
+	cp->txq_idx += vi->first_ofld_txq;
+
+	/* Rx queue for this connection. */
+	if (s->rxq >= 0 && s->rxq < vi->nofldrxq)
+		cp->rxq_idx = s->rxq;
+	else
+		cp->rxq_idx = arc4random() % vi->nofldrxq;
+	cp->rxq_idx += vi->first_ofld_rxq;
+
+	if (SOLISTENING(so)) {
+		/* Passive open */
+		MPASS(tcpopt != NULL);
+
+		/* TCP timestamp option */
+		if (tcpopt->tstamp &&
+		    (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
+			cp->tstamp = 1;
+		else
+			cp->tstamp = 0;
+
+		/* SACK */
+		if (tcpopt->sack &&
+		    (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack)))
+			cp->sack = 1;
+		else
+			cp->sack = 0;
+
+		/* Receive window scaling. */
+		if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323)
+			cp->wscale = select_rcv_wscale();
+		else
+			cp->wscale = 0;
+
+		/* ECN */
+		if (tcpopt->ecn &&	/* XXX: review. */
+		    (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
+			cp->ecn = 1;
+		else
+			cp->ecn = 0;
+
+		wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
+		cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
+
+		if (tt->sndbuf > 0)
+			cp->sndbuf = tt->sndbuf;
+		else if (so->sol_sbsnd_flags & SB_AUTOSIZE &&
+		    V_tcp_do_autosndbuf)
+			cp->sndbuf = 256 * 1024;
+		else
+			cp->sndbuf = so->sol_sbsnd_hiwat;
+	} else {
+		/* Active open */
+
+		/* TCP timestamp option */
+		if (s->tstamp > 0 ||
+		    (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP)))
+			cp->tstamp = 1;
+		else
+			cp->tstamp = 0;
+
+		/* SACK */
+		if (s->sack > 0 ||
+		    (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT)))
+			cp->sack = 1;
+		else
+			cp->sack = 0;
+
+		/* Receive window scaling */
+		if (tp->t_flags & TF_REQ_SCALE)
+			cp->wscale = select_rcv_wscale();
+		else
+			cp->wscale = 0;
+
+		/* ECN */
+		if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1))
+			cp->ecn = 1;
+		else
+			cp->ecn = 0;
+
+		SOCKBUF_LOCK(&so->so_rcv);
+		wnd = max(select_rcv_wnd(so), MIN_RCV_WND);
+		SOCKBUF_UNLOCK(&so->so_rcv);
+		cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
+
+		if (tt->sndbuf > 0)
+			cp->sndbuf = tt->sndbuf;
+		else {
+			SOCKBUF_LOCK(&so->so_snd);
+			if (so->so_snd.sb_flags & SB_AUTOSIZE &&
+			    V_tcp_do_autosndbuf)
+				cp->sndbuf = 256 * 1024;
+			else
+				cp->sndbuf = so->so_snd.sb_hiwat;
+			SOCKBUF_UNLOCK(&so->so_snd);
+		}
+	}
+
+	cp->l2t_idx = l2t_idx;
+
+	/* This will be initialized on ESTABLISHED. */
+	cp->emss = 0;
 }

 int
@@ -1527,7 +1786,7 @@
 	struct toepcb *toep = tp->t_toe;
 	int error;

-	if (toep->ulp_mode == ULP_MODE_TCPDDP) {
+	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		error = t4_aio_queue_ddp(so, job);
 		if (error != EOPNOTSUPP)
 			return (error);
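
For reference, here is a condensed sketch of how the pieces introduced above fit together on the active-open path.  It is pieced together from the t4_connect.c hunks in this patch and is only an illustration (error handling, CPL allocation, and locking are omitted); it is not itself part of the change.

	/*
	 * Connection parameters are decided once, up front, and saved in
	 * toep->params; queue binding, traffic class, ULP setup, and the
	 * opt0/opt2 words are all derived from that one structure.  Every
	 * name below is taken from the hunks above.
	 */
	toep = alloc_toepcb(vi, M_NOWAIT);	/* no txq/rxq arguments anymore */
	toep->vnet = so->so_vnet;
	init_conn_params(vi, &settings, &inp->inp_inc, so, NULL,
	    toep->l2te->idx, &toep->params);	/* tcpopt is NULL for active open */
	init_toepcb(vi, toep);			/* binds ofld_txq/ofld_rxq, tc_idx, ULP hooks */

	/* opt0/opt2 are now pure functions of the saved parameters. */
	cpl->opt0 = calc_options0(vi, &toep->params);
	cpl->opt2 = calc_options2(vi, &toep->params);

The passive-open path in t4_listen.c mirrors this: init_conn_params() fills synqe->params when the pass-accept request arrives, and the pass-establish handler later bcopy()s those parameters into the new toepcb before calling init_toepcb().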