diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -690,6 +690,10 @@ uint64_t rx_aio_ddp_octets; u_long rx_toe_tls_records; u_long rx_toe_tls_octets; + u_long rx_toe_ddp_octets; + counter_u64_t ddp_buffer_alloc; + counter_u64_t ddp_buffer_reuse; + counter_u64_t ddp_buffer_free; } __aligned(CACHE_LINE_SIZE); static inline struct sge_ofld_rxq * @@ -1344,6 +1348,8 @@ extern int t4_pktc_idx; extern unsigned int t4_qsize_rxq; extern unsigned int t4_qsize_txq; +extern int t4_ddp_rcvbuf_len; +extern unsigned int t4_ddp_rcvbuf_cache; extern device_method_t cxgbe_methods[]; int t4_os_find_pci_capability(struct adapter *, int); diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c --- a/sys/dev/cxgbe/t4_main.c +++ b/sys/dev/cxgbe/t4_main.c @@ -412,6 +412,15 @@ &t4_toe_rexmt_backoff[14], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[15], 0, ""); + +int t4_ddp_rcvbuf_len = 256 * 1024; +SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, ddp_rcvbuf_len, CTLFLAG_RWTUN, + &t4_ddp_rcvbuf_len, 0, "length of each DDP RX buffer"); + +unsigned int t4_ddp_rcvbuf_cache = 4; +SYSCTL_UINT(_hw_cxgbe_toe, OID_AUTO, ddp_rcvbuf_cache, CTLFLAG_RWTUN, + &t4_ddp_rcvbuf_cache, 0, + "maximum number of free DDP RX buffers to cache per connection"); #endif #ifdef DEV_NETMAP @@ -12046,6 +12055,10 @@ ofld_rxq->rx_aio_ddp_octets = 0; ofld_rxq->rx_toe_tls_records = 0; ofld_rxq->rx_toe_tls_octets = 0; + ofld_rxq->rx_toe_ddp_octets = 0; + counter_u64_zero(ofld_rxq->ddp_buffer_alloc); + counter_u64_zero(ofld_rxq->ddp_buffer_reuse); + counter_u64_zero(ofld_rxq->ddp_buffer_free); } #endif diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -4098,6 +4098,9 @@ ofld_rxq->rx_iscsi_ddp_setup_ok = counter_u64_alloc(M_WAITOK); ofld_rxq->rx_iscsi_ddp_setup_error = counter_u64_alloc(M_WAITOK); + ofld_rxq->ddp_buffer_alloc = counter_u64_alloc(M_WAITOK); + ofld_rxq->ddp_buffer_reuse = counter_u64_alloc(M_WAITOK); + ofld_rxq->ddp_buffer_free = counter_u64_alloc(M_WAITOK); add_ofld_rxq_sysctls(&vi->ctx, oid, ofld_rxq); } @@ -4132,6 +4135,9 @@ MPASS(!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED)); counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_ok); counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_error); + counter_u64_free(ofld_rxq->ddp_buffer_alloc); + counter_u64_free(ofld_rxq->ddp_buffer_reuse); + counter_u64_free(ofld_rxq->ddp_buffer_free); bzero(ofld_rxq, sizeof(*ofld_rxq)); } } @@ -4158,6 +4164,18 @@ SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets, "# of payload octets in received TOE TLS records"); + SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, + "rx_toe_ddp_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_ddp_octets, + "# of payload octets received via TCP DDP"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, + "ddp_buffer_alloc", CTLFLAG_RD, &ofld_rxq->ddp_buffer_alloc, + "# of DDP RCV buffers allocated"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, + "ddp_buffer_reuse", CTLFLAG_RD, &ofld_rxq->ddp_buffer_reuse, + "# of DDP RCV buffers reused"); + SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO, + "ddp_buffer_free", CTLFLAG_RD, &ofld_rxq->ddp_buffer_free, + "# of DDP RCV buffers freed"); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "iscsi", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE iSCSI statistics"); diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c --- a/sys/dev/cxgbe/tom/t4_cpl_io.c 
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -1352,8 +1352,6 @@ if (toep->flags & TPF_ABORT_SHUTDOWN) goto done; - so = inp->inp_socket; - socantrcvmore(so); if (ulp_mode(toep) == ULP_MODE_TCPDDP) { DDP_LOCK(toep); if (__predict_false(toep->ddp.flags & @@ -1361,6 +1359,8 @@ handle_ddp_close(toep, tp, cpl->rcv_nxt); DDP_UNLOCK(toep); } + so = inp->inp_socket; + socantrcvmore(so); if (ulp_mode(toep) == ULP_MODE_RDMA || (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) { @@ -1782,7 +1782,8 @@ sbappendstream_locked(sb, m, 0); t4_rcvd_locked(&toep->td->tod, tp); - if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 && + if (ulp_mode(toep) == ULP_MODE_TCPDDP && + (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 && sbavail(sb) != 0) { CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__, tid); diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c --- a/sys/dev/cxgbe/tom/t4_ddp.c +++ b/sys/dev/cxgbe/tom/t4_ddp.c @@ -81,6 +81,10 @@ static void ddp_complete_all(struct toepcb *toep, int error); static void t4_aio_cancel_active(struct kaiocb *job); static void t4_aio_cancel_queued(struct kaiocb *job); +static int t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr, + struct ddp_rcv_buffer *drb); +static int t4_write_page_pods_for_rcvbuf(struct adapter *sc, + struct sge_wrq *wrq, int tid, struct ddp_rcv_buffer *drb); static TAILQ_HEAD(, pageset) ddp_orphan_pagesets; static struct mtx ddp_orphan_pagesets_lock; @@ -89,15 +93,15 @@ #define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN) /* - * A page set holds information about a buffer used for DDP. The page - * set holds resources such as the VM pages backing the buffer (either - * held or wired) and the page pods associated with the buffer. - * Recently used page sets are cached to allow for efficient reuse of - * buffers (avoiding the need to re-fault in pages, hold them, etc.). - * Note that cached page sets keep the backing pages wired. The - * number of wired pages is capped by only allowing for two wired - * pagesets per connection. This is not a perfect cap, but is a - * trade-off for performance. + * A page set holds information about a user buffer used for AIO DDP. + * The page set holds resources such as the VM pages backing the + * buffer (either held or wired) and the page pods associated with the + * buffer. Recently used page sets are cached to allow for efficient + * reuse of buffers (avoiding the need to re-fault in pages, hold + * them, etc.). Note that cached page sets keep the backing pages + * wired. The number of wired pages is capped by only allowing for + * two wired pagesets per connection. This is not a perfect cap, but + * is a trade-off for performance. 
* * If an application ping-pongs two buffers for a connection via * aio_read(2) then those buffers should remain wired and expensive VM @@ -174,8 +178,99 @@ } static void -free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db) +free_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb) { + t4_free_page_pods(&drb->prsv); + contigfree(drb->buf, drb->len, M_CXGBE); + free(drb, M_CXGBE); + counter_u64_add(toep->ofld_rxq->ddp_buffer_free, 1); + free_toepcb(toep); +} + +static void +recycle_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb) +{ + DDP_CACHE_LOCK(toep); + if (!(toep->ddp.flags & DDP_DEAD) && + toep->ddp.cached_count < t4_ddp_rcvbuf_cache) { + TAILQ_INSERT_HEAD(&toep->ddp.cached_buffers, drb, link); + toep->ddp.cached_count++; + DDP_CACHE_UNLOCK(toep); + } else { + DDP_CACHE_UNLOCK(toep); + free_ddp_rcv_buffer(toep, drb); + } +} + +static struct ddp_rcv_buffer * +alloc_cached_ddp_rcv_buffer(struct toepcb *toep) +{ + struct ddp_rcv_buffer *drb; + + DDP_CACHE_LOCK(toep); + if (!TAILQ_EMPTY(&toep->ddp.cached_buffers)) { + drb = TAILQ_FIRST(&toep->ddp.cached_buffers); + TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link); + toep->ddp.cached_count--; + counter_u64_add(toep->ofld_rxq->ddp_buffer_reuse, 1); + } else + drb = NULL; + DDP_CACHE_UNLOCK(toep); + return (drb); +} + +static struct ddp_rcv_buffer * +alloc_ddp_rcv_buffer(struct toepcb *toep, int how) +{ + struct tom_data *td = toep->td; + struct adapter *sc = td_adapter(td); + struct ddp_rcv_buffer *drb; + int error; + + drb = malloc(sizeof(*drb), M_CXGBE, how | M_ZERO); + if (drb == NULL) + return (NULL); + + drb->buf = contigmalloc(t4_ddp_rcvbuf_len, M_CXGBE, how, 0, ~0, + t4_ddp_rcvbuf_len, 0); + if (drb->buf == NULL) { + free(drb, M_CXGBE); + return (NULL); + } + drb->len = t4_ddp_rcvbuf_len; + drb->refs = 1; + + error = t4_alloc_page_pods_for_rcvbuf(&td->pr, drb); + if (error != 0) { + contigfree(drb->buf, drb->len, M_CXGBE); + free(drb, M_CXGBE); + return (NULL); + } + + error = t4_write_page_pods_for_rcvbuf(sc, toep->ctrlq, toep->tid, drb); + if (error != 0) { + t4_free_page_pods(&drb->prsv); + contigfree(drb->buf, drb->len, M_CXGBE); + free(drb, M_CXGBE); + return (NULL); + } + + hold_toepcb(toep); + counter_u64_add(toep->ofld_rxq->ddp_buffer_alloc, 1); + return (drb); +} + +static void +free_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db) +{ + if ((toep->ddp.flags & DDP_RCVBUF) != 0) { + if (db->drb != NULL) + free_ddp_rcv_buffer(toep, db->drb); +#ifdef INVARIANTS + db->drb = NULL; +#endif + return; + } if (db->job) { /* @@ -192,7 +287,7 @@ } if (db->ps) { - free_pageset(td, db->ps); + free_pageset(toep->td, db->ps); #ifdef INVARIANTS db->ps = NULL; #endif @@ -203,11 +298,10 @@ ddp_init_toep(struct toepcb *toep) { - TAILQ_INIT(&toep->ddp.aiojobq); - TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep); toep->ddp.flags = DDP_OK; toep->ddp.active_id = -1; mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF); + mtx_init(&toep->ddp.cache_lock, "t4 ddp cache", NULL, MTX_DEF); } void @@ -215,24 +309,38 @@ { mtx_destroy(&toep->ddp.lock); + mtx_destroy(&toep->ddp.cache_lock); } void release_ddp_resources(struct toepcb *toep) { + struct ddp_rcv_buffer *drb; struct pageset *ps; int i; DDP_LOCK(toep); + DDP_CACHE_LOCK(toep); toep->ddp.flags |= DDP_DEAD; + DDP_CACHE_UNLOCK(toep); for (i = 0; i < nitems(toep->ddp.db); i++) { - free_ddp_buffer(toep->td, &toep->ddp.db[i]); + free_ddp_buffer(toep, &toep->ddp.db[i]); } - while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) { - 
TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link); - free_pageset(toep->td, ps); + if ((toep->ddp.flags & DDP_AIO) != 0) { + while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) { + TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link); + free_pageset(toep->td, ps); + } + ddp_complete_all(toep, 0); + } + if ((toep->ddp.flags & DDP_RCVBUF) != 0) { + DDP_CACHE_LOCK(toep); + while ((drb = TAILQ_FIRST(&toep->ddp.cached_buffers)) != NULL) { + TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link); + free_ddp_rcv_buffer(toep, drb); + } + DDP_CACHE_UNLOCK(toep); } - ddp_complete_all(toep, 0); DDP_UNLOCK(toep); } @@ -242,13 +350,20 @@ { int i; - MPASS(!(toep->ddp.flags & DDP_TASK_ACTIVE)); + MPASS((toep->ddp.flags & (DDP_TASK_ACTIVE | DDP_DEAD)) != DDP_TASK_ACTIVE); for (i = 0; i < nitems(toep->ddp.db); i++) { - MPASS(toep->ddp.db[i].job == NULL); - MPASS(toep->ddp.db[i].ps == NULL); + if ((toep->ddp.flags & DDP_AIO) != 0) { + MPASS(toep->ddp.db[i].job == NULL); + MPASS(toep->ddp.db[i].ps == NULL); + } else + MPASS(toep->ddp.db[i].drb == NULL); } - MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets)); - MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq)); + if ((toep->ddp.flags & DDP_AIO) != 0) { + MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets)); + MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq)); + } + if ((toep->ddp.flags & DDP_RCVBUF) != 0) + MPASS(TAILQ_EMPTY(&toep->ddp.cached_buffers)); } #endif @@ -256,13 +371,18 @@ complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db, unsigned int db_idx) { + struct ddp_rcv_buffer *drb; unsigned int db_flag; toep->ddp.active_count--; if (toep->ddp.active_id == db_idx) { if (toep->ddp.active_count == 0) { - KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL, - ("%s: active_count mismatch", __func__)); + if ((toep->ddp.flags & DDP_AIO) != 0) + KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL, + ("%s: active_count mismatch", __func__)); + else + KASSERT(toep->ddp.db[db_idx ^ 1].drb == NULL, + ("%s: active_count mismatch", __func__)); toep->ddp.active_id = -1; } else toep->ddp.active_id ^= 1; @@ -276,10 +396,18 @@ ("%s: active count mismatch", __func__)); } - db->cancel_pending = 0; - db->job = NULL; - recycle_pageset(toep, db->ps); - db->ps = NULL; + if ((toep->ddp.flags & DDP_AIO) != 0) { + db->cancel_pending = 0; + db->job = NULL; + recycle_pageset(toep, db->ps); + db->ps = NULL; + } else { + drb = db->drb; + if (atomic_fetchadd_int(&drb->refs, -1) == 1) + recycle_ddp_rcv_buffer(toep, drb); + db->drb = NULL; + db->placed = 0; + } db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE; KASSERT(toep->ddp.flags & db_flag, @@ -288,6 +416,47 @@ toep->ddp.flags &= ~db_flag; } +/* Called when m_free drops the last reference. 
*/ +static void +ddp_rcv_mbuf_done(struct mbuf *m) +{ + struct toepcb *toep = m->m_ext.ext_arg1; + struct ddp_rcv_buffer *drb = m->m_ext.ext_arg2; + + recycle_ddp_rcv_buffer(toep, drb); +} + +static void +queue_ddp_rcvbuf_mbuf(struct toepcb *toep, u_int db_idx, u_int len) +{ + struct inpcb *inp = toep->inp; + struct sockbuf *sb; + struct ddp_buffer *db; + struct ddp_rcv_buffer *drb; + struct mbuf *m; + + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) { + printf("%s: failed to allocate mbuf", __func__); + return; + } + m->m_pkthdr.rcvif = toep->vi->ifp; + + db = &toep->ddp.db[db_idx]; + drb = db->drb; + m_extaddref(m, (char *)drb->buf + db->placed, len, &drb->refs, + ddp_rcv_mbuf_done, toep, drb); + m->m_pkthdr.len = len; + m->m_len = len; + + sb = &inp->inp_socket->so_rcv; + SOCKBUF_LOCK_ASSERT(sb); + sbappendstream_locked(sb, m, 0); + + db->placed += len; + toep->ofld_rxq->rx_toe_ddp_octets += len; +} + /* XXX: handle_ddp_data code duplication */ void insert_ddp_data(struct toepcb *toep, uint32_t n) @@ -302,10 +471,12 @@ #ifdef INVARIANTS unsigned int db_flag; #endif + bool ddp_rcvbuf; INP_WLOCK_ASSERT(inp); DDP_ASSERT_LOCKED(toep); + ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0; tp->rcv_nxt += n; #ifndef USE_DDP_RX_FLOW_CONTROL KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__)); @@ -321,6 +492,16 @@ #endif MPASS((toep->ddp.flags & db_flag) != 0); db = &toep->ddp.db[db_idx]; + if (ddp_rcvbuf) { + placed = n; + if (placed > db->drb->len - db->placed) + placed = db->drb->len - db->placed; + if (placed != 0) + queue_ddp_rcvbuf_mbuf(toep, db_idx, placed); + complete_ddp_buffer(toep, db, db_idx); + n -= placed; + continue; + } job = db->job; copied = job->aio_received; placed = n; @@ -423,12 +604,13 @@ static struct wrqe * mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx, - struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask) + struct ppod_reservation *prsv, int offset, uint32_t len, + uint64_t ddp_flags, uint64_t ddp_flags_mask) { struct wrqe *wr; struct work_request_hdr *wrh; struct ulp_txpkt *ulpmc; - int len; + int wrlen; KASSERT(db_idx == 0 || db_idx == 1, ("%s: bad DDP buffer index %d", __func__, db_idx)); @@ -441,21 +623,21 @@ * The ULPTX master commands that follow must all end at 16B boundaries * too so we round up the size to 16. 
*/ - len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) + + wrlen = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) + roundup2(LEN__RX_DATA_ACK_ULP, 16); - wr = alloc_wrqe(len, toep->ctrlq); + wr = alloc_wrqe(wrlen, toep->ctrlq); if (wr == NULL) return (NULL); wrh = wrtod(wr); - INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */ + INIT_ULPTX_WRH(wrh, wrlen, 1, 0); /* atomic */ ulpmc = (struct ulp_txpkt *)(wrh + 1); /* Write the buffer's tag */ ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_BUF0_TAG + db_idx, V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG), - V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag)); + V_TCB_RX_DDP_BUF0_TAG(prsv->prsv_tag)); /* Update the current offset in the DDP buffer and its total length */ if (db_idx == 0) @@ -464,14 +646,14 @@ V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) | V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN), V_TCB_RX_DDP_BUF0_OFFSET(offset) | - V_TCB_RX_DDP_BUF0_LEN(ps->len)); + V_TCB_RX_DDP_BUF0_LEN(len)); else ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_BUF1_OFFSET, V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) | V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32), V_TCB_RX_DDP_BUF1_OFFSET(offset) | - V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32)); + V_TCB_RX_DDP_BUF1_LEN((u64)len << 32)); /* Update DDP flags */ ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS, @@ -484,7 +666,8 @@ } static int -handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) +handle_ddp_data_aio(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, + int len) { uint32_t report = be32toh(ddp_report); unsigned int db_idx; @@ -607,11 +790,239 @@ return (0); } +static bool +queue_ddp_rcvbuf(struct toepcb *toep, struct ddp_rcv_buffer *drb) +{ + struct adapter *sc = td_adapter(toep->td); + struct ddp_buffer *db; + struct wrqe *wr; + uint64_t ddp_flags, ddp_flags_mask; + int buf_flag, db_idx; + + DDP_ASSERT_LOCKED(toep); + + KASSERT((toep->ddp.flags & DDP_DEAD) == 0, ("%s: DDP_DEAD", __func__)); + KASSERT(toep->ddp.active_count < nitems(toep->ddp.db), + ("%s: no empty DDP buffer slot", __func__)); + + /* Determine which DDP buffer to use. */ + if (toep->ddp.db[0].drb == NULL) { + db_idx = 0; + } else { + MPASS(toep->ddp.db[1].drb == NULL); + db_idx = 1; + } + + /* + * Permit PSH to trigger a partial completion without + * invalidating the rest of the buffer, but disable the PUSH + * timer. + */ + ddp_flags = 0; + ddp_flags_mask = 0; + if (db_idx == 0) { + ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PUSH_DISABLE_0(0) | V_TF_DDP_PSHF_ENABLE_0(1) | + V_TF_DDP_BUF0_VALID(1); + ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) | + V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) | + V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1); + buf_flag = DDP_BUF0_ACTIVE; + } else { + ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_1(0) | V_TF_DDP_PSHF_ENABLE_1(1) | + V_TF_DDP_BUF1_VALID(1); + ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) | + V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) | + V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1); + buf_flag = DDP_BUF1_ACTIVE; + } + MPASS((toep->ddp.flags & buf_flag) == 0); + if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) { + MPASS(db_idx == 0); + MPASS(toep->ddp.active_id == -1); + MPASS(toep->ddp.active_count == 0); + ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1); + } + + /* + * The TID for this connection should still be valid. 
If + * DDP_DEAD is set, SBS_CANTRCVMORE should be set, so we + * shouldn't be this far anyway. + */ + wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &drb->prsv, 0, drb->len, + ddp_flags, ddp_flags_mask); + if (wr == NULL) { + recycle_ddp_rcv_buffer(toep, drb); + printf("%s: mk_update_tcb_for_ddp failed\n", __func__); + return (false); + } + +#ifdef VERBOSE_TRACES + CTR(KTR_CXGBE, + "%s: tid %u, scheduling DDP[%d] (flags %#lx/%#lx)", __func__, + toep->tid, db_idx, ddp_flags, ddp_flags_mask); +#endif + /* + * Hold a reference on scheduled buffers that is dropped in + * complete_ddp_buffer. + */ + drb->refs = 1; + + /* Give the chip the go-ahead. */ + t4_wrq_tx(sc, wr); + db = &toep->ddp.db[db_idx]; + db->drb = drb; + toep->ddp.flags |= buf_flag; + toep->ddp.active_count++; + if (toep->ddp.active_count == 1) { + MPASS(toep->ddp.active_id == -1); + toep->ddp.active_id = db_idx; + CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__, + toep->ddp.active_id); + } + return (true); +} + +static int +handle_ddp_data_rcvbuf(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, + int len) +{ + uint32_t report = be32toh(ddp_report); + struct inpcb *inp = toep->inp; + struct tcpcb *tp; + struct socket *so; + struct sockbuf *sb; + struct ddp_buffer *db; + struct ddp_rcv_buffer *drb; + unsigned int db_idx; + bool invalidated; + + db_idx = report & F_DDP_BUF_IDX ? 1 : 0; + + invalidated = (report & F_DDP_INV) != 0; + + INP_WLOCK(inp); + so = inp_inpcbtosocket(inp); + sb = &so->so_rcv; + DDP_LOCK(toep); + + KASSERT(toep->ddp.active_id == db_idx, + ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx, + toep->ddp.active_id, toep->tid)); + db = &toep->ddp.db[db_idx]; + + if (__predict_false(inp->inp_flags & INP_DROPPED)) { + /* + * This can happen due to an administrative tcpdrop(8). + * Just ignore the received data. + */ + CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x", + __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags); + if (invalidated) + complete_ddp_buffer(toep, db, db_idx); + goto out; + } + + tp = intotcpcb(inp); + + /* + * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the + * sequence number of the next byte to receive. The length of + * the data received for this message must be computed by + * comparing the new and old values of rcv_nxt. + * + * For RX_DATA_DDP, len might be non-zero, but it is only the + * length of the most recent DMA. It does not include the + * total length of the data received since the previous update + * for this DDP buffer. rcv_nxt is the sequence number of the + * first received byte from the most recent DMA. 
+ */ + len += be32toh(rcv_nxt) - tp->rcv_nxt; + tp->rcv_nxt += len; + tp->t_rcvtime = ticks; +#ifndef USE_DDP_RX_FLOW_CONTROL + KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__)); + tp->rcv_wnd -= len; +#endif +#ifdef VERBOSE_TRACES + CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__, + toep->tid, db_idx, len, report); +#endif + + /* receive buffer autosize */ + MPASS(toep->vnet == so->so_vnet); + CURVNET_SET(toep->vnet); + SOCKBUF_LOCK(sb); + if (sb->sb_flags & SB_AUTOSIZE && + V_tcp_do_autorcvbuf && + sb->sb_hiwat < V_tcp_autorcvbuf_max && + len > (sbspace(sb) / 8 * 7)) { + struct adapter *sc = td_adapter(toep->td); + unsigned int hiwat = sb->sb_hiwat; + unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc, + V_tcp_autorcvbuf_max); + + if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) + sb->sb_flags &= ~SB_AUTOSIZE; + } + + if (len > 0) { + queue_ddp_rcvbuf_mbuf(toep, db_idx, len); + t4_rcvd_locked(&toep->td->tod, tp); + } + sorwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(sb); + CURVNET_RESTORE(); + + if (invalidated) + complete_ddp_buffer(toep, db, db_idx); + else + KASSERT(db->placed < db->drb->len, + ("%s: full DDP buffer not invalidated", __func__)); + + if (toep->ddp.active_count != nitems(toep->ddp.db)) { + drb = alloc_cached_ddp_rcv_buffer(toep); + if (drb == NULL) + drb = alloc_ddp_rcv_buffer(toep, M_NOWAIT); + if (drb == NULL) + ddp_queue_toep(toep); + else { + if (!queue_ddp_rcvbuf(toep, drb)) { + ddp_queue_toep(toep); + } + } + } +out: + DDP_UNLOCK(toep); + INP_WUNLOCK(inp); + + return (0); +} + +static int +handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) +{ + if ((toep->ddp.flags & DDP_RCVBUF) != 0) + return (handle_ddp_data_rcvbuf(toep, ddp_report, rcv_nxt, len)); + else + return (handle_ddp_data_aio(toep, ddp_report, rcv_nxt, len)); +} + void handle_ddp_indicate(struct toepcb *toep) { DDP_ASSERT_LOCKED(toep); + if ((toep->ddp.flags & DDP_RCVBUF) != 0) { + /* + * Indicates are not meaningful for RCVBUF since + * buffers are activated when the socket option is + * set. + */ + return; + } + MPASS(toep->ddp.active_count == 0); MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0); if (toep->ddp.waiting_count == 0) { @@ -654,6 +1065,8 @@ /* * XXX: This duplicates a lot of code with handle_ddp_data(). 
*/ + KASSERT((toep->ddp.flags & DDP_AIO) != 0, + ("%s: DDP_RCVBUF", __func__)); db_idx = G_COOKIE(cpl->cookie) - CPL_COOKIE_DDP0; MPASS(db_idx < nitems(toep->ddp.db)); INP_WLOCK(inp); @@ -707,6 +1120,8 @@ void handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt) { + struct socket *so = toep->inp->inp_socket; + struct sockbuf *sb = &so->so_rcv; struct ddp_buffer *db; struct kaiocb *job; long copied; @@ -715,14 +1130,19 @@ unsigned int db_flag; #endif int len, placed; + bool ddp_rcvbuf; INP_WLOCK_ASSERT(toep->inp); DDP_ASSERT_LOCKED(toep); + ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0; + /* - 1 is to ignore the byte for FIN */ len = be32toh(rcv_nxt) - tp->rcv_nxt - 1; tp->rcv_nxt += len; + CTR(KTR_CXGBE, "%s: tid %d placed %u bytes before FIN", __func__, + toep->tid, len); while (toep->ddp.active_count > 0) { MPASS(toep->ddp.active_id != -1); db_idx = toep->ddp.active_id; @@ -731,6 +1151,20 @@ #endif MPASS((toep->ddp.flags & db_flag) != 0); db = &toep->ddp.db[db_idx]; + if (ddp_rcvbuf) { + placed = len; + if (placed > db->drb->len - db->placed) + placed = db->drb->len - db->placed; + if (placed != 0) { + SOCKBUF_LOCK(sb); + queue_ddp_rcvbuf_mbuf(toep, db_idx, placed); + sorwakeup_locked(so); + SOCKBUF_UNLOCK_ASSERT(sb); + } + complete_ddp_buffer(toep, db, db_idx); + len -= placed; + continue; + } job = db->job; copied = job->aio_received; placed = len; @@ -758,7 +1192,8 @@ } MPASS(len == 0); - ddp_complete_all(toep, 0); + if ((toep->ddp.flags & DDP_AIO) != 0) + ddp_complete_all(toep, 0); } #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\ @@ -892,6 +1327,7 @@ static void enable_ddp(struct adapter *sc, struct toepcb *toep) { + uint64_t ddp_flags; KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK, ("%s: toep %p has bad ddp_flags 0x%x", @@ -900,13 +1336,16 @@ CTR3(KTR_CXGBE, "%s: tid %u (time %u)", __func__, toep->tid, time_uptime); + ddp_flags = 0; + if ((toep->ddp.flags & DDP_AIO) != 0) + ddp_flags |= V_TF_DDP_BUF0_INDICATE(1) | + V_TF_DDP_BUF1_INDICATE(1); DDP_ASSERT_LOCKED(toep); toep->ddp.flags |= DDP_SC_REQ; t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) | V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) | - V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), - V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0); + V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), ddp_flags, 0, 0); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0); } @@ -1103,6 +1542,19 @@ return (0); } +static int +t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr, + struct ddp_rcv_buffer *drb) +{ + struct ppod_reservation *prsv = &drb->prsv; + + KASSERT(prsv->prsv_nppods == 0, + ("%s: page pods already allocated", __func__)); + + return (t4_alloc_page_pods_for_buf(pr, (vm_offset_t)drb->buf, drb->len, + prsv)); +} + int t4_alloc_page_pods_for_sgl(struct ppod_region *pr, struct ctl_sg_entry *sgl, int entries, struct ppod_reservation *prsv) @@ -1223,7 +1675,6 @@ ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { - /* How many page pods are we writing in this cycle */ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); chunk = PPOD_SZ(n); @@ -1276,6 +1727,96 @@ return (0); } +static int +t4_write_page_pods_for_rcvbuf(struct adapter *sc, struct sge_wrq *wrq, int tid, + struct ddp_rcv_buffer *drb) +{ + struct 
wrqe *wr; + struct ulp_mem_io *ulpmc; + struct ulptx_idata *ulpsc; + struct pagepod *ppod; + int i, j, k, n, chunk, len, ddp_pgsz; + u_int ppod_addr, offset; + uint32_t cmd; + struct ppod_reservation *prsv = &drb->prsv; + struct ppod_region *pr = prsv->prsv_pr; + uintptr_t end_pva, pva; + vm_paddr_t pa; + + MPASS(prsv->prsv_nppods > 0); + + cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE)); + if (is_t4(sc)) + cmd |= htobe32(F_ULP_MEMIO_ORDER); + else + cmd |= htobe32(F_T5_ULP_MEMIO_IMM); + ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)]; + offset = (uintptr_t)drb->buf & PAGE_MASK; + ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask); + pva = trunc_page((uintptr_t)drb->buf); + end_pva = trunc_page((uintptr_t)drb->buf + drb->len - 1); + for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) { + /* How many page pods are we writing in this cycle */ + n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS); + MPASS(n > 0); + chunk = PPOD_SZ(n); + len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16); + + wr = alloc_wrqe(len, wrq); + if (wr == NULL) + return (ENOMEM); /* ok to just bail out */ + ulpmc = wrtod(wr); + + INIT_ULPTX_WR(ulpmc, len, 0, 0); + ulpmc->cmd = cmd; + ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32)); + ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16)); + ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5)); + + ulpsc = (struct ulptx_idata *)(ulpmc + 1); + ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM)); + ulpsc->len = htobe32(chunk); + + ppod = (struct pagepod *)(ulpsc + 1); + for (j = 0; j < n; i++, j++, ppod++) { + ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID | + V_PPOD_TID(tid) | prsv->prsv_tag); + ppod->len_offset = htobe64(V_PPOD_LEN(drb->len) | + V_PPOD_OFST(offset)); + ppod->rsvd = 0; + + for (k = 0; k < nitems(ppod->addr); k++) { + if (pva > end_pva) + ppod->addr[k] = 0; + else { + pa = pmap_kextract(pva); + ppod->addr[k] = htobe64(pa); + pva += ddp_pgsz; + } +#if 0 + CTR5(KTR_CXGBE, + "%s: tid %d ppod[%d]->addr[%d] = %p", + __func__, tid, i, k, + be64toh(ppod->addr[k])); +#endif + } + + /* + * Walk back 1 segment so that the first address in the + * next pod is the same as the last one in the current + * pod. + */ + pva -= ddp_pgsz; + } + + t4_wrq_tx(sc, wr); + } + + MPASS(pva <= end_pva); + + return (0); +} + static struct mbuf * alloc_raw_wr_mbuf(int len) { @@ -1761,6 +2302,7 @@ struct kaiocb *job; DDP_ASSERT_LOCKED(toep); + KASSERT((toep->ddp.flags & DDP_AIO) != 0, ("%s: DDP_RCVBUF", __func__)); while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) { job = TAILQ_FIRST(&toep->ddp.aiojobq); TAILQ_REMOVE(&toep->ddp.aiojobq, job, list); @@ -2143,8 +2685,8 @@ * which will keep it open and keep the TCP PCB attached until * after the job is completed. */ - wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received, - ddp_flags, ddp_flags_mask); + wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &ps->prsv, ps->len, + job->aio_received, ddp_flags, ddp_flags_mask); if (wr == NULL) { recycle_pageset(toep, ps); aio_ddp_requeue_one(toep, job); @@ -2286,7 +2828,6 @@ struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; - /* Ignore writes. */ if (job->uaiocb.aio_lio_opcode != LIO_READ) return (EOPNOTSUPP); @@ -2302,6 +2843,15 @@ DDP_LOCK(toep); + /* + * If DDP is being used for all normal receive, don't use it + * for AIO. + */ + if ((toep->ddp.flags & DDP_RCVBUF) != 0) { + DDP_UNLOCK(toep); + return (EOPNOTSUPP); + } + /* * XXX: Think about possibly returning errors for ENOTCONN, * etc. 
Perhaps the caller would only queue the request @@ -2315,7 +2865,14 @@ panic("new job was cancelled"); TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list); toep->ddp.waiting_count++; - toep->ddp.flags |= DDP_OK; + + if ((toep->ddp.flags & DDP_AIO) == 0) { + toep->ddp.flags |= DDP_AIO; + TAILQ_INIT(&toep->ddp.cached_pagesets); + TAILQ_INIT(&toep->ddp.aiojobq); + TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, + toep); + } /* * Try to handle this request synchronously. If this has @@ -2327,9 +2884,146 @@ return (0); } +static void +ddp_rcvbuf_requeue(struct toepcb *toep) +{ + struct socket *so; + struct sockbuf *sb; + struct inpcb *inp; + struct ddp_rcv_buffer *drb; + + DDP_ASSERT_LOCKED(toep); +restart: + if ((toep->ddp.flags & DDP_DEAD) != 0) { + MPASS(toep->ddp.active_count == 0); + return; + } + + /* If both buffers are active, nothing to do. */ + if (toep->ddp.active_count == nitems(toep->ddp.db)) { + return; + } + + inp = toep->inp; + so = inp->inp_socket; + sb = &so->so_rcv; + + drb = alloc_cached_ddp_rcv_buffer(toep); + DDP_UNLOCK(toep); + + if (drb == NULL) { + drb = alloc_ddp_rcv_buffer(toep, M_WAITOK); + if (drb == NULL) { + printf("%s: failed to allocate buffer\n", __func__); + DDP_LOCK(toep); + return; + } + } + + DDP_LOCK(toep); + if ((toep->ddp.flags & DDP_DEAD) != 0 || + toep->ddp.active_count == nitems(toep->ddp.db)) { + recycle_ddp_rcv_buffer(toep, drb); + return; + } + + /* We will never get anything unless we are or were connected. */ + SOCKBUF_LOCK(sb); + if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { + SOCKBUF_UNLOCK(sb); + recycle_ddp_rcv_buffer(toep, drb); + return; + } + + /* Abort if socket has reported problems or is closed. */ + if (so->so_error != 0 || (sb->sb_state & SBS_CANTRCVMORE) != 0) { + SOCKBUF_UNLOCK(sb); + recycle_ddp_rcv_buffer(toep, drb); + return; + } + SOCKBUF_UNLOCK(sb); + + if (!queue_ddp_rcvbuf(toep, drb)) { + /* + * XXX: Need a way to kick a retry here. + * + * XXX: We know the fixed size needed and could + * preallocate the work request using a blocking + * request at the start of the task to avoid having to + * handle this edge case. + */ + return; + } + goto restart; +} + +static void +ddp_rcvbuf_requeue_task(void *context, int pending) +{ + struct toepcb *toep = context; + + DDP_LOCK(toep); + ddp_rcvbuf_requeue(toep); + toep->ddp.flags &= ~DDP_TASK_ACTIVE; + DDP_UNLOCK(toep); + + free_toepcb(toep); +} + +int +t4_enable_ddp_rcv(struct socket *so, struct toepcb *toep) +{ + struct inpcb *inp = sotoinpcb(so); + struct adapter *sc = td_adapter(toep->td); + + INP_WLOCK(inp); + switch (ulp_mode(toep)) { + case ULP_MODE_TCPDDP: + break; + case ULP_MODE_NONE: + if (set_ddp_ulp_mode(toep)) + break; + /* FALLTHROUGH */ + default: + INP_WUNLOCK(inp); + return (EOPNOTSUPP); + } + INP_WUNLOCK(inp); + + DDP_LOCK(toep); + + /* + * If DDP is being used for AIO already, don't use it for + * normal receive. 
+ */ + if ((toep->ddp.flags & DDP_AIO) != 0) { + DDP_UNLOCK(toep); + return (EOPNOTSUPP); + } + + if ((toep->ddp.flags & DDP_RCVBUF) != 0) { + DDP_UNLOCK(toep); + return (EBUSY); + } + + toep->ddp.flags |= DDP_RCVBUF; + TAILQ_INIT(&toep->ddp.cached_buffers); + enable_ddp(sc, toep); + TASK_INIT(&toep->ddp.requeue_task, 0, ddp_rcvbuf_requeue_task, toep); + ddp_queue_toep(toep); + DDP_UNLOCK(toep); + return (0); +} + void t4_ddp_mod_load(void) { + if (t4_ddp_rcvbuf_len < PAGE_SIZE) + t4_ddp_rcvbuf_len = PAGE_SIZE; + if (t4_ddp_rcvbuf_len > MAX_DDP_BUFFER_SIZE) + t4_ddp_rcvbuf_len = MAX_DDP_BUFFER_SIZE; + if (!powerof2(t4_ddp_rcvbuf_len)) + t4_ddp_rcvbuf_len = 1 << fls(t4_ddp_rcvbuf_len); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl, CPL_COOKIE_DDP0); diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -85,6 +85,8 @@ DDP_BUF1_ACTIVE = (1 << 4), /* buffer 1 in use (not invalidated) */ DDP_TASK_ACTIVE = (1 << 5), /* requeue task is queued / running */ DDP_DEAD = (1 << 6), /* toepcb is shutting down */ + DDP_AIO = (1 << 7), /* DDP used for AIO, not so_rcv */ + DDP_RCVBUF = (1 << 8), /* DDP used for so_rcv, not AIO */ }; struct bio; @@ -156,25 +158,51 @@ #define PS_PPODS_WRITTEN 0x0001 /* Page pods written to the card. */ +struct ddp_rcv_buffer { + TAILQ_ENTRY(ddp_rcv_buffer) link; + void *buf; + struct ppod_reservation prsv; + size_t len; + u_int refs; +}; + struct ddp_buffer { - struct pageset *ps; + union { + /* DDP_AIO fields */ + struct { + struct pageset *ps; + struct kaiocb *job; + int cancel_pending; + }; - struct kaiocb *job; - int cancel_pending; + /* DDP_RCVBUF fields */ + struct { + struct ddp_rcv_buffer *drb; + uint32_t placed; + }; + }; }; +/* + * (a) - DDP_AIO only + * (r) - DDP_RCVBUF only + */ struct ddp_pcb { + struct mtx lock; u_int flags; + int active_id; /* the currently active DDP buffer */ struct ddp_buffer db[2]; - TAILQ_HEAD(, pageset) cached_pagesets; - TAILQ_HEAD(, kaiocb) aiojobq; - u_int waiting_count; + union { + TAILQ_HEAD(, pageset) cached_pagesets; /* (a) */ + TAILQ_HEAD(, ddp_rcv_buffer) cached_buffers; /* (r) */ + }; + TAILQ_HEAD(, kaiocb) aiojobq; /* (a) */ + u_int waiting_count; /* (a) */ u_int active_count; u_int cached_count; - int active_id; /* the currently active DDP buffer */ struct task requeue_task; - struct kaiocb *queueing; - struct mtx lock; + struct kaiocb *queueing; /* (a) */ + struct mtx cache_lock; /* (r) */ }; struct toepcb { @@ -230,6 +258,8 @@ #define DDP_LOCK(toep) mtx_lock(&(toep)->ddp.lock) #define DDP_UNLOCK(toep) mtx_unlock(&(toep)->ddp.lock) #define DDP_ASSERT_LOCKED(toep) mtx_assert(&(toep)->ddp.lock, MA_OWNED) +#define DDP_CACHE_LOCK(toep) mtx_lock(&(toep)->ddp.cache_lock) +#define DDP_CACHE_UNLOCK(toep) mtx_unlock(&(toep)->ddp.cache_lock) /* * Compressed state for embryonic connections for a listener. 
@@ -502,6 +532,7 @@ struct ppod_reservation *, struct ctl_sg_entry *, int, int, struct mbufq *); void t4_free_page_pods(struct ppod_reservation *); int t4_aio_queue_ddp(struct socket *, struct kaiocb *); +int t4_enable_ddp_rcv(struct socket *, struct toepcb *); void t4_ddp_mod_load(void); void t4_ddp_mod_unload(void); void ddp_assert_empty(struct toepcb *); diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -1950,6 +1950,35 @@ return (rc); } +static int +t4_ctloutput_tom(struct socket *so, struct sockopt *sopt) +{ + struct tcpcb *tp = sototcpcb(so); + struct toepcb *toep = tp->t_toe; + int error, optval; + + if (sopt->sopt_level == IPPROTO_TCP && sopt->sopt_name == TCP_USE_DDP) { + if (sopt->sopt_dir != SOPT_SET) + return (EOPNOTSUPP); + + if (sopt->sopt_td != NULL) { + /* Only settable by the kernel. */ + return (EPERM); + } + + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error != 0) + return (error); + + if (optval != 0) + return (t4_enable_ddp_rcv(so, toep)); + else + return (EOPNOTSUPP); + } + return (tcp_ctloutput(so, sopt)); +} + static int t4_aio_queue_tom(struct socket *so, struct kaiocb *job) { @@ -1989,9 +2018,11 @@ t4_tls_mod_load(); bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw)); + toe_protosw.pr_ctloutput = t4_ctloutput_tom; toe_protosw.pr_aio_queue = t4_aio_queue_tom; bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw)); + toe6_protosw.pr_ctloutput = t4_ctloutput_tom; toe6_protosw.pr_aio_queue = t4_aio_queue_tom; return (t4_register_uld(&tom_uld_info));
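Notes appended for review; nothing below is part of the patch itself.

The new TCP_USE_DDP socket option is settable only from the kernel: t4_ctloutput_tom() returns EPERM whenever sopt->sopt_td is non-NULL, so userland setsockopt(2) calls are rejected. A minimal sketch of how an in-kernel consumer might turn the feature on follows. The function name example_enable_toe_ddp() and the choice of sosetopt(9) as the entry point are illustrative assumptions, not something introduced by this patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/*
 * Illustrative only: request DDP-backed receive on an offloaded TCP
 * socket from kernel code.  sopt_td stays NULL so t4_ctloutput_tom()
 * treats the request as kernel-originated (userland callers get EPERM).
 */
static int
example_enable_toe_ddp(struct socket *so)
{
	struct sockopt sopt;
	int one = 1;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = IPPROTO_TCP;
	sopt.sopt_name = TCP_USE_DDP;
	sopt.sopt_val = &one;
	sopt.sopt_valsize = sizeof(one);
	sopt.sopt_td = NULL;		/* kernel-only option */

	return (sosetopt(so, &sopt));
}

Per t4_enable_ddp_rcv(), such a call fails with EOPNOTSUPP if the connection is not (and cannot be switched to) ULP_MODE_TCPDDP or if AIO DDP is already in use on it, and with EBUSY if DDP_RCVBUF was already enabled.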
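The two new tunables bound the per-connection footprint of the receive buffers. Each connection can have at most two buffers programmed into the hardware (one per entry of toep->ddp.db[]) and keeps at most hw.cxgbe.toe.ddp_rcvbuf_cache free buffers of hw.cxgbe.toe.ddp_rcvbuf_len bytes each on its cache list; t4_ddp_mod_load() clamps the length to [PAGE_SIZE, MAX_DDP_BUFFER_SIZE] and rounds a non-power-of-two value up to the next power of two via 1 << fls() (for example, a 160 KiB setting becomes 256 KiB). The helper below is a hypothetical back-of-the-envelope estimate under those rules, not driver code, and it deliberately ignores buffers that are still pinned by mbufs sitting in the socket buffer.

#include <sys/param.h>

/*
 * Hypothetical estimate: contiguous memory one offloaded connection can
 * hold for DDP receive, counting the two hardware-programmed buffers
 * plus the per-connection free-buffer cache.  Assumes the
 * t4_ddp_rcvbuf_* externs from adapter.h are visible.  With the
 * defaults (256 KiB buffers, cache depth 4) this is 6 * 256 KiB =
 * 1.5 MiB.  Buffers still referenced by mbufs in so_rcv are not
 * counted here.
 */
static inline size_t
ddp_rcvbuf_worst_case_bytes(void)
{
	return ((size_t)t4_ddp_rcvbuf_len * (2 + t4_ddp_rcvbuf_cache));
}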
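The zero-copy handoff into the socket buffer relies on the external-storage mbuf API: queue_ddp_rcvbuf_mbuf() wraps each newly placed range of drb->buf with m_extaddref(9), every wrapped mbuf bumps the shared drb->refs counter, and ddp_rcv_mbuf_done() recycles the buffer only once the last reference (including the driver's own, dropped in complete_ddp_buffer()) goes away. The stripped-down sketch below shows that pattern in isolation; every ex_-prefixed name is hypothetical, and the backing storage is plain malloc(9) memory rather than the driver's contigmalloc()ed buffer.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>

struct ex_buf {
	void	*base;		/* backing storage from malloc(9) */
	u_int	 refs;		/* shared by every mbuf carved from base */
};

/* Runs when m_freem() drops the last mbuf referencing the buffer. */
static void
ex_buf_done(struct mbuf *m)
{
	struct ex_buf *eb = m->m_ext.ext_arg1;

	free(eb->base, M_DEVBUF);
	free(eb, M_DEVBUF);
}

/* Hand a sub-range of the buffer to the socket layer without copying. */
static struct mbuf *
ex_wrap_range(struct ex_buf *eb, u_int off, u_int len)
{
	struct mbuf *m;

	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);
	m_extaddref(m, (char *)eb->base + off, len, &eb->refs, ex_buf_done,
	    eb, NULL);
	m->m_pkthdr.len = m->m_len = len;
	return (m);
}

The driver keeps an extra reference of its own (drb->refs is set to 1 when the buffer is handed to the hardware, and dropped in complete_ddp_buffer()), so a DDP buffer can be recycled onto the per-connection cache or freed even while earlier mbufs carved from it are still queued in so_rcv.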