D44001: cxgbe: Support TCP_USE_DDP on offloaded TOE connections (D44001.diff)

diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h
--- a/sys/dev/cxgbe/adapter.h
+++ b/sys/dev/cxgbe/adapter.h
@@ -690,6 +690,10 @@
uint64_t rx_aio_ddp_octets;
u_long rx_toe_tls_records;
u_long rx_toe_tls_octets;
+ u_long rx_toe_ddp_octets;
+ counter_u64_t ddp_buffer_alloc;
+ counter_u64_t ddp_buffer_reuse;
+ counter_u64_t ddp_buffer_free;
} __aligned(CACHE_LINE_SIZE);
static inline struct sge_ofld_rxq *
@@ -1344,6 +1348,8 @@
extern int t4_pktc_idx;
extern unsigned int t4_qsize_rxq;
extern unsigned int t4_qsize_txq;
+extern int t4_ddp_rcvbuf_len;
+extern unsigned int t4_ddp_rcvbuf_cache;
extern device_method_t cxgbe_methods[];
int t4_os_find_pci_capability(struct adapter *, int);
diff --git a/sys/dev/cxgbe/t4_main.c b/sys/dev/cxgbe/t4_main.c
--- a/sys/dev/cxgbe/t4_main.c
+++ b/sys/dev/cxgbe/t4_main.c
@@ -412,6 +412,15 @@
&t4_toe_rexmt_backoff[14], 0, "");
SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN,
&t4_toe_rexmt_backoff[15], 0, "");
+
+int t4_ddp_rcvbuf_len = 256 * 1024;
+SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, ddp_rcvbuf_len, CTLFLAG_RWTUN,
+ &t4_ddp_rcvbuf_len, 0, "length of each DDP RX buffer");
+
+unsigned int t4_ddp_rcvbuf_cache = 4;
+SYSCTL_UINT(_hw_cxgbe_toe, OID_AUTO, ddp_rcvbuf_cache, CTLFLAG_RWTUN,
+ &t4_ddp_rcvbuf_cache, 0,
+ "maximum number of free DDP RX buffers to cache per connection");
#endif
#ifdef DEV_NETMAP
@@ -12046,6 +12055,10 @@
ofld_rxq->rx_aio_ddp_octets = 0;
ofld_rxq->rx_toe_tls_records = 0;
ofld_rxq->rx_toe_tls_octets = 0;
+ ofld_rxq->rx_toe_ddp_octets = 0;
+ counter_u64_zero(ofld_rxq->ddp_buffer_alloc);
+ counter_u64_zero(ofld_rxq->ddp_buffer_reuse);
+ counter_u64_zero(ofld_rxq->ddp_buffer_free);
}
#endif
diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c
--- a/sys/dev/cxgbe/t4_sge.c
+++ b/sys/dev/cxgbe/t4_sge.c
@@ -4098,6 +4098,9 @@
ofld_rxq->rx_iscsi_ddp_setup_ok = counter_u64_alloc(M_WAITOK);
ofld_rxq->rx_iscsi_ddp_setup_error =
counter_u64_alloc(M_WAITOK);
+ ofld_rxq->ddp_buffer_alloc = counter_u64_alloc(M_WAITOK);
+ ofld_rxq->ddp_buffer_reuse = counter_u64_alloc(M_WAITOK);
+ ofld_rxq->ddp_buffer_free = counter_u64_alloc(M_WAITOK);
add_ofld_rxq_sysctls(&vi->ctx, oid, ofld_rxq);
}
@@ -4132,6 +4135,9 @@
MPASS(!(ofld_rxq->iq.flags & IQ_SW_ALLOCATED));
counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_ok);
counter_u64_free(ofld_rxq->rx_iscsi_ddp_setup_error);
+ counter_u64_free(ofld_rxq->ddp_buffer_alloc);
+ counter_u64_free(ofld_rxq->ddp_buffer_reuse);
+ counter_u64_free(ofld_rxq->ddp_buffer_free);
bzero(ofld_rxq, sizeof(*ofld_rxq));
}
}
@@ -4158,6 +4164,18 @@
SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
"rx_toe_tls_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_tls_octets,
"# of payload octets in received TOE TLS records");
+ SYSCTL_ADD_ULONG(ctx, children, OID_AUTO,
+ "rx_toe_ddp_octets", CTLFLAG_RD, &ofld_rxq->rx_toe_ddp_octets,
+ "# of payload octets received via TCP DDP");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
+ "ddp_buffer_alloc", CTLFLAG_RD, &ofld_rxq->ddp_buffer_alloc,
+ "# of DDP RCV buffers allocated");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
+ "ddp_buffer_reuse", CTLFLAG_RD, &ofld_rxq->ddp_buffer_reuse,
+ "# of DDP RCV buffers reused");
+ SYSCTL_ADD_COUNTER_U64(ctx, children, OID_AUTO,
+ "ddp_buffer_free", CTLFLAG_RD, &ofld_rxq->ddp_buffer_free,
+ "# of DDP RCV buffers freed");
oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "iscsi",
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TOE iSCSI statistics");
diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c
--- a/sys/dev/cxgbe/tom/t4_cpl_io.c
+++ b/sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -1352,8 +1352,6 @@
if (toep->flags & TPF_ABORT_SHUTDOWN)
goto done;
- so = inp->inp_socket;
- socantrcvmore(so);
if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
DDP_LOCK(toep);
if (__predict_false(toep->ddp.flags &
@@ -1361,6 +1359,8 @@
handle_ddp_close(toep, tp, cpl->rcv_nxt);
DDP_UNLOCK(toep);
}
+ so = inp->inp_socket;
+ socantrcvmore(so);
if (ulp_mode(toep) == ULP_MODE_RDMA ||
(ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
@@ -1782,7 +1782,8 @@
sbappendstream_locked(sb, m, 0);
t4_rcvd_locked(&toep->td->tod, tp);
- if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
+ if (ulp_mode(toep) == ULP_MODE_TCPDDP &&
+ (toep->ddp.flags & DDP_AIO) != 0 && toep->ddp.waiting_count > 0 &&
sbavail(sb) != 0) {
CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
tid);
diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c
--- a/sys/dev/cxgbe/tom/t4_ddp.c
+++ b/sys/dev/cxgbe/tom/t4_ddp.c
@@ -81,6 +81,10 @@
static void ddp_complete_all(struct toepcb *toep, int error);
static void t4_aio_cancel_active(struct kaiocb *job);
static void t4_aio_cancel_queued(struct kaiocb *job);
+static int t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr,
+ struct ddp_rcv_buffer *drb);
+static int t4_write_page_pods_for_rcvbuf(struct adapter *sc,
+ struct sge_wrq *wrq, int tid, struct ddp_rcv_buffer *drb);
static TAILQ_HEAD(, pageset) ddp_orphan_pagesets;
static struct mtx ddp_orphan_pagesets_lock;
@@ -89,15 +93,15 @@
#define MAX_DDP_BUFFER_SIZE (M_TCB_RX_DDP_BUF0_LEN)
/*
- * A page set holds information about a buffer used for DDP. The page
- * set holds resources such as the VM pages backing the buffer (either
- * held or wired) and the page pods associated with the buffer.
- * Recently used page sets are cached to allow for efficient reuse of
- * buffers (avoiding the need to re-fault in pages, hold them, etc.).
- * Note that cached page sets keep the backing pages wired. The
- * number of wired pages is capped by only allowing for two wired
- * pagesets per connection. This is not a perfect cap, but is a
- * trade-off for performance.
+ * A page set holds information about a user buffer used for AIO DDP.
+ * The page set holds resources such as the VM pages backing the
+ * buffer (either held or wired) and the page pods associated with the
+ * buffer. Recently used page sets are cached to allow for efficient
+ * reuse of buffers (avoiding the need to re-fault in pages, hold
+ * them, etc.). Note that cached page sets keep the backing pages
+ * wired. The number of wired pages is capped by only allowing for
+ * two wired pagesets per connection. This is not a perfect cap, but
+ * is a trade-off for performance.
*
* If an application ping-pongs two buffers for a connection via
* aio_read(2) then those buffers should remain wired and expensive VM
@@ -174,8 +178,99 @@
}
static void
-free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
+free_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
{
+ t4_free_page_pods(&drb->prsv);
+ contigfree(drb->buf, drb->len, M_CXGBE);
+ free(drb, M_CXGBE);
+ counter_u64_add(toep->ofld_rxq->ddp_buffer_free, 1);
+ free_toepcb(toep);
+}
+
+static void
+recycle_ddp_rcv_buffer(struct toepcb *toep, struct ddp_rcv_buffer *drb)
+{
+ DDP_CACHE_LOCK(toep);
+ if (!(toep->ddp.flags & DDP_DEAD) &&
+ toep->ddp.cached_count < t4_ddp_rcvbuf_cache) {
+ TAILQ_INSERT_HEAD(&toep->ddp.cached_buffers, drb, link);
+ toep->ddp.cached_count++;
+ DDP_CACHE_UNLOCK(toep);
+ } else {
+ DDP_CACHE_UNLOCK(toep);
+ free_ddp_rcv_buffer(toep, drb);
+ }
+}
+
+static struct ddp_rcv_buffer *
+alloc_cached_ddp_rcv_buffer(struct toepcb *toep)
+{
+ struct ddp_rcv_buffer *drb;
+
+ DDP_CACHE_LOCK(toep);
+ if (!TAILQ_EMPTY(&toep->ddp.cached_buffers)) {
+ drb = TAILQ_FIRST(&toep->ddp.cached_buffers);
+ TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link);
+ toep->ddp.cached_count--;
+ counter_u64_add(toep->ofld_rxq->ddp_buffer_reuse, 1);
+ } else
+ drb = NULL;
+ DDP_CACHE_UNLOCK(toep);
+ return (drb);
+}
+
+static struct ddp_rcv_buffer *
+alloc_ddp_rcv_buffer(struct toepcb *toep, int how)
+{
+ struct tom_data *td = toep->td;
+ struct adapter *sc = td_adapter(td);
+ struct ddp_rcv_buffer *drb;
+ int error;
+
+ drb = malloc(sizeof(*drb), M_CXGBE, how | M_ZERO);
+ if (drb == NULL)
+ return (NULL);
+
+ drb->buf = contigmalloc(t4_ddp_rcvbuf_len, M_CXGBE, how, 0, ~0,
+ t4_ddp_rcvbuf_len, 0);
+ if (drb->buf == NULL) {
+ free(drb, M_CXGBE);
+ return (NULL);
+ }
+ drb->len = t4_ddp_rcvbuf_len;
+ drb->refs = 1;
+
+ error = t4_alloc_page_pods_for_rcvbuf(&td->pr, drb);
+ if (error != 0) {
+ contigfree(drb->buf, drb->len, M_CXGBE);
+ free(drb, M_CXGBE);
+ return (NULL);
+ }
+
+ error = t4_write_page_pods_for_rcvbuf(sc, toep->ctrlq, toep->tid, drb);
+ if (error != 0) {
+ t4_free_page_pods(&drb->prsv);
+ contigfree(drb->buf, drb->len, M_CXGBE);
+ free(drb, M_CXGBE);
+ return (NULL);
+ }
+
+ hold_toepcb(toep);
+ counter_u64_add(toep->ofld_rxq->ddp_buffer_alloc, 1);
+ return (drb);
+}
+
+static void
+free_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db)
+{
+ if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
+ if (db->drb != NULL)
+ free_ddp_rcv_buffer(toep, db->drb);
+#ifdef INVARIANTS
+ db->drb = NULL;
+#endif
+ return;
+ }
if (db->job) {
/*
@@ -192,7 +287,7 @@
}
if (db->ps) {
- free_pageset(td, db->ps);
+ free_pageset(toep->td, db->ps);
#ifdef INVARIANTS
db->ps = NULL;
#endif
@@ -203,11 +298,10 @@
ddp_init_toep(struct toepcb *toep)
{
- TAILQ_INIT(&toep->ddp.aiojobq);
- TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task, toep);
toep->ddp.flags = DDP_OK;
toep->ddp.active_id = -1;
mtx_init(&toep->ddp.lock, "t4 ddp", NULL, MTX_DEF);
+ mtx_init(&toep->ddp.cache_lock, "t4 ddp cache", NULL, MTX_DEF);
}
void
@@ -215,24 +309,38 @@
{
mtx_destroy(&toep->ddp.lock);
+ mtx_destroy(&toep->ddp.cache_lock);
}
void
release_ddp_resources(struct toepcb *toep)
{
+ struct ddp_rcv_buffer *drb;
struct pageset *ps;
int i;
DDP_LOCK(toep);
+ DDP_CACHE_LOCK(toep);
toep->ddp.flags |= DDP_DEAD;
+ DDP_CACHE_UNLOCK(toep);
for (i = 0; i < nitems(toep->ddp.db); i++) {
- free_ddp_buffer(toep->td, &toep->ddp.db[i]);
+ free_ddp_buffer(toep, &toep->ddp.db[i]);
}
- while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
- TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
- free_pageset(toep->td, ps);
+ if ((toep->ddp.flags & DDP_AIO) != 0) {
+ while ((ps = TAILQ_FIRST(&toep->ddp.cached_pagesets)) != NULL) {
+ TAILQ_REMOVE(&toep->ddp.cached_pagesets, ps, link);
+ free_pageset(toep->td, ps);
+ }
+ ddp_complete_all(toep, 0);
+ }
+ if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
+ DDP_CACHE_LOCK(toep);
+ while ((drb = TAILQ_FIRST(&toep->ddp.cached_buffers)) != NULL) {
+ TAILQ_REMOVE(&toep->ddp.cached_buffers, drb, link);
+ free_ddp_rcv_buffer(toep, drb);
+ }
+ DDP_CACHE_UNLOCK(toep);
}
- ddp_complete_all(toep, 0);
DDP_UNLOCK(toep);
}
@@ -242,13 +350,20 @@
{
int i;
- MPASS(!(toep->ddp.flags & DDP_TASK_ACTIVE));
+ MPASS((toep->ddp.flags & (DDP_TASK_ACTIVE | DDP_DEAD)) != DDP_TASK_ACTIVE);
for (i = 0; i < nitems(toep->ddp.db); i++) {
- MPASS(toep->ddp.db[i].job == NULL);
- MPASS(toep->ddp.db[i].ps == NULL);
+ if ((toep->ddp.flags & DDP_AIO) != 0) {
+ MPASS(toep->ddp.db[i].job == NULL);
+ MPASS(toep->ddp.db[i].ps == NULL);
+ } else
+ MPASS(toep->ddp.db[i].drb == NULL);
+ }
+ if ((toep->ddp.flags & DDP_AIO) != 0) {
+ MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
+ MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
}
- MPASS(TAILQ_EMPTY(&toep->ddp.cached_pagesets));
- MPASS(TAILQ_EMPTY(&toep->ddp.aiojobq));
+ if ((toep->ddp.flags & DDP_RCVBUF) != 0)
+ MPASS(TAILQ_EMPTY(&toep->ddp.cached_buffers));
}
#endif
@@ -256,13 +371,18 @@
complete_ddp_buffer(struct toepcb *toep, struct ddp_buffer *db,
unsigned int db_idx)
{
+ struct ddp_rcv_buffer *drb;
unsigned int db_flag;
toep->ddp.active_count--;
if (toep->ddp.active_id == db_idx) {
if (toep->ddp.active_count == 0) {
- KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
- ("%s: active_count mismatch", __func__));
+ if ((toep->ddp.flags & DDP_AIO) != 0)
+ KASSERT(toep->ddp.db[db_idx ^ 1].job == NULL,
+ ("%s: active_count mismatch", __func__));
+ else
+ KASSERT(toep->ddp.db[db_idx ^ 1].drb == NULL,
+ ("%s: active_count mismatch", __func__));
toep->ddp.active_id = -1;
} else
toep->ddp.active_id ^= 1;
@@ -276,10 +396,18 @@
("%s: active count mismatch", __func__));
}
- db->cancel_pending = 0;
- db->job = NULL;
- recycle_pageset(toep, db->ps);
- db->ps = NULL;
+ if ((toep->ddp.flags & DDP_AIO) != 0) {
+ db->cancel_pending = 0;
+ db->job = NULL;
+ recycle_pageset(toep, db->ps);
+ db->ps = NULL;
+ } else {
+ drb = db->drb;
+ if (atomic_fetchadd_int(&drb->refs, -1) == 1)
+ recycle_ddp_rcv_buffer(toep, drb);
+ db->drb = NULL;
+ db->placed = 0;
+ }
db_flag = db_idx == 1 ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
KASSERT(toep->ddp.flags & db_flag,
@@ -288,6 +416,47 @@
toep->ddp.flags &= ~db_flag;
}
+/* Called when m_free drops the last reference. */
+static void
+ddp_rcv_mbuf_done(struct mbuf *m)
+{
+ struct toepcb *toep = m->m_ext.ext_arg1;
+ struct ddp_rcv_buffer *drb = m->m_ext.ext_arg2;
+
+ recycle_ddp_rcv_buffer(toep, drb);
+}
+
+static void
+queue_ddp_rcvbuf_mbuf(struct toepcb *toep, u_int db_idx, u_int len)
+{
+ struct inpcb *inp = toep->inp;
+ struct sockbuf *sb;
+ struct ddp_buffer *db;
+ struct ddp_rcv_buffer *drb;
+ struct mbuf *m;
+
+ m = m_gethdr(M_NOWAIT, MT_DATA);
+ if (m == NULL) {
+ printf("%s: failed to allocate mbuf", __func__);
+ return;
+ }
+ m->m_pkthdr.rcvif = toep->vi->ifp;
+
+ db = &toep->ddp.db[db_idx];
+ drb = db->drb;
+ m_extaddref(m, (char *)drb->buf + db->placed, len, &drb->refs,
+ ddp_rcv_mbuf_done, toep, drb);
+ m->m_pkthdr.len = len;
+ m->m_len = len;
+
+ sb = &inp->inp_socket->so_rcv;
+ SOCKBUF_LOCK_ASSERT(sb);
+ sbappendstream_locked(sb, m, 0);
+
+ db->placed += len;
+ toep->ofld_rxq->rx_toe_ddp_octets += len;
+}
+
/* XXX: handle_ddp_data code duplication */
void
insert_ddp_data(struct toepcb *toep, uint32_t n)
@@ -302,10 +471,12 @@
#ifdef INVARIANTS
unsigned int db_flag;
#endif
+ bool ddp_rcvbuf;
INP_WLOCK_ASSERT(inp);
DDP_ASSERT_LOCKED(toep);
+ ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0;
tp->rcv_nxt += n;
#ifndef USE_DDP_RX_FLOW_CONTROL
KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
@@ -321,6 +492,16 @@
#endif
MPASS((toep->ddp.flags & db_flag) != 0);
db = &toep->ddp.db[db_idx];
+ if (ddp_rcvbuf) {
+ placed = n;
+ if (placed > db->drb->len - db->placed)
+ placed = db->drb->len - db->placed;
+ if (placed != 0)
+ queue_ddp_rcvbuf_mbuf(toep, db_idx, placed);
+ complete_ddp_buffer(toep, db, db_idx);
+ n -= placed;
+ continue;
+ }
job = db->job;
copied = job->aio_received;
placed = n;
@@ -423,12 +604,13 @@
static struct wrqe *
mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
- struct pageset *ps, int offset, uint64_t ddp_flags, uint64_t ddp_flags_mask)
+ struct ppod_reservation *prsv, int offset, uint32_t len,
+ uint64_t ddp_flags, uint64_t ddp_flags_mask)
{
struct wrqe *wr;
struct work_request_hdr *wrh;
struct ulp_txpkt *ulpmc;
- int len;
+ int wrlen;
KASSERT(db_idx == 0 || db_idx == 1,
("%s: bad DDP buffer index %d", __func__, db_idx));
@@ -441,21 +623,21 @@
* The ULPTX master commands that follow must all end at 16B boundaries
* too so we round up the size to 16.
*/
- len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
+ wrlen = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
roundup2(LEN__RX_DATA_ACK_ULP, 16);
- wr = alloc_wrqe(len, toep->ctrlq);
+ wr = alloc_wrqe(wrlen, toep->ctrlq);
if (wr == NULL)
return (NULL);
wrh = wrtod(wr);
- INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */
+ INIT_ULPTX_WRH(wrh, wrlen, 1, 0); /* atomic */
ulpmc = (struct ulp_txpkt *)(wrh + 1);
/* Write the buffer's tag */
ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
W_TCB_RX_DDP_BUF0_TAG + db_idx,
V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
- V_TCB_RX_DDP_BUF0_TAG(ps->prsv.prsv_tag));
+ V_TCB_RX_DDP_BUF0_TAG(prsv->prsv_tag));
/* Update the current offset in the DDP buffer and its total length */
if (db_idx == 0)
@@ -464,14 +646,14 @@
V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
V_TCB_RX_DDP_BUF0_OFFSET(offset) |
- V_TCB_RX_DDP_BUF0_LEN(ps->len));
+ V_TCB_RX_DDP_BUF0_LEN(len));
else
ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
W_TCB_RX_DDP_BUF1_OFFSET,
V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
V_TCB_RX_DDP_BUF1_OFFSET(offset) |
- V_TCB_RX_DDP_BUF1_LEN((u64)ps->len << 32));
+ V_TCB_RX_DDP_BUF1_LEN((u64)len << 32));
/* Update DDP flags */
ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
@@ -484,7 +666,8 @@
}
static int
-handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
+handle_ddp_data_aio(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt,
+ int len)
{
uint32_t report = be32toh(ddp_report);
unsigned int db_idx;
@@ -607,11 +790,239 @@
return (0);
}
+static bool
+queue_ddp_rcvbuf(struct toepcb *toep, struct ddp_rcv_buffer *drb)
+{
+ struct adapter *sc = td_adapter(toep->td);
+ struct ddp_buffer *db;
+ struct wrqe *wr;
+ uint64_t ddp_flags, ddp_flags_mask;
+ int buf_flag, db_idx;
+
+ DDP_ASSERT_LOCKED(toep);
+
+ KASSERT((toep->ddp.flags & DDP_DEAD) == 0, ("%s: DDP_DEAD", __func__));
+ KASSERT(toep->ddp.active_count < nitems(toep->ddp.db),
+ ("%s: no empty DDP buffer slot", __func__));
+
+ /* Determine which DDP buffer to use. */
+ if (toep->ddp.db[0].drb == NULL) {
+ db_idx = 0;
+ } else {
+ MPASS(toep->ddp.db[1].drb == NULL);
+ db_idx = 1;
+ }
+
+ /*
+ * Permit PSH to trigger a partial completion without
+ * invalidating the rest of the buffer, but disable the PUSH
+ * timer.
+ */
+ ddp_flags = 0;
+ ddp_flags_mask = 0;
+ if (db_idx == 0) {
+ ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
+ V_TF_DDP_PUSH_DISABLE_0(0) | V_TF_DDP_PSHF_ENABLE_0(1) |
+ V_TF_DDP_BUF0_VALID(1);
+ ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE0(1) |
+ V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PSHF_ENABLE_0(1) |
+ V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF0_VALID(1);
+ buf_flag = DDP_BUF0_ACTIVE;
+ } else {
+ ddp_flags |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
+ V_TF_DDP_PUSH_DISABLE_1(0) | V_TF_DDP_PSHF_ENABLE_1(1) |
+ V_TF_DDP_BUF1_VALID(1);
+ ddp_flags_mask |= V_TF_DDP_PSH_NO_INVALIDATE1(1) |
+ V_TF_DDP_PUSH_DISABLE_1(1) | V_TF_DDP_PSHF_ENABLE_1(1) |
+ V_TF_DDP_BUF1_FLUSH(1) | V_TF_DDP_BUF1_VALID(1);
+ buf_flag = DDP_BUF1_ACTIVE;
+ }
+ MPASS((toep->ddp.flags & buf_flag) == 0);
+ if ((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0) {
+ MPASS(db_idx == 0);
+ MPASS(toep->ddp.active_id == -1);
+ MPASS(toep->ddp.active_count == 0);
+ ddp_flags_mask |= V_TF_DDP_ACTIVE_BUF(1);
+ }
+
+ /*
+ * The TID for this connection should still be valid. If
+ * DDP_DEAD is set, SBS_CANTRCVMORE should be set, so we
+ * shouldn't be this far anyway.
+ */
+ wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &drb->prsv, 0, drb->len,
+ ddp_flags, ddp_flags_mask);
+ if (wr == NULL) {
+ recycle_ddp_rcv_buffer(toep, drb);
+ printf("%s: mk_update_tcb_for_ddp failed\n", __func__);
+ return (false);
+ }
+
+#ifdef VERBOSE_TRACES
+ CTR(KTR_CXGBE,
+ "%s: tid %u, scheduling DDP[%d] (flags %#lx/%#lx)", __func__,
+ toep->tid, db_idx, ddp_flags, ddp_flags_mask);
+#endif
+ /*
+ * Hold a reference on scheduled buffers that is dropped in
+ * complete_ddp_buffer.
+ */
+ drb->refs = 1;
+
+ /* Give the chip the go-ahead. */
+ t4_wrq_tx(sc, wr);
+ db = &toep->ddp.db[db_idx];
+ db->drb = drb;
+ toep->ddp.flags |= buf_flag;
+ toep->ddp.active_count++;
+ if (toep->ddp.active_count == 1) {
+ MPASS(toep->ddp.active_id == -1);
+ toep->ddp.active_id = db_idx;
+ CTR2(KTR_CXGBE, "%s: ddp_active_id = %d", __func__,
+ toep->ddp.active_id);
+ }
+ return (true);
+}
+
+static int
+handle_ddp_data_rcvbuf(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt,
+ int len)
+{
+ uint32_t report = be32toh(ddp_report);
+ struct inpcb *inp = toep->inp;
+ struct tcpcb *tp;
+ struct socket *so;
+ struct sockbuf *sb;
+ struct ddp_buffer *db;
+ struct ddp_rcv_buffer *drb;
+ unsigned int db_idx;
+ bool invalidated;
+
+ db_idx = report & F_DDP_BUF_IDX ? 1 : 0;
+
+ invalidated = (report & F_DDP_INV) != 0;
+
+ INP_WLOCK(inp);
+ so = inp_inpcbtosocket(inp);
+ sb = &so->so_rcv;
+ DDP_LOCK(toep);
+
+ KASSERT(toep->ddp.active_id == db_idx,
+ ("completed DDP buffer (%d) != active_id (%d) for tid %d", db_idx,
+ toep->ddp.active_id, toep->tid));
+ db = &toep->ddp.db[db_idx];
+
+ if (__predict_false(inp->inp_flags & INP_DROPPED)) {
+ /*
+ * This can happen due to an administrative tcpdrop(8).
+ * Just ignore the received data.
+ */
+ CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
+ __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
+ if (invalidated)
+ complete_ddp_buffer(toep, db, db_idx);
+ goto out;
+ }
+
+ tp = intotcpcb(inp);
+
+ /*
+ * For RX_DDP_COMPLETE, len will be zero and rcv_nxt is the
+ * sequence number of the next byte to receive. The length of
+ * the data received for this message must be computed by
+ * comparing the new and old values of rcv_nxt.
+ *
+ * For RX_DATA_DDP, len might be non-zero, but it is only the
+ * length of the most recent DMA. It does not include the
+ * total length of the data received since the previous update
+ * for this DDP buffer. rcv_nxt is the sequence number of the
+ * first received byte from the most recent DMA.
+ */
+ len += be32toh(rcv_nxt) - tp->rcv_nxt;
+ tp->rcv_nxt += len;
+ tp->t_rcvtime = ticks;
+#ifndef USE_DDP_RX_FLOW_CONTROL
+ KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
+ tp->rcv_wnd -= len;
+#endif
+#ifdef VERBOSE_TRACES
+ CTR5(KTR_CXGBE, "%s: tid %u, DDP[%d] placed %d bytes (%#x)", __func__,
+ toep->tid, db_idx, len, report);
+#endif
+
+ /* receive buffer autosize */
+ MPASS(toep->vnet == so->so_vnet);
+ CURVNET_SET(toep->vnet);
+ SOCKBUF_LOCK(sb);
+ if (sb->sb_flags & SB_AUTOSIZE &&
+ V_tcp_do_autorcvbuf &&
+ sb->sb_hiwat < V_tcp_autorcvbuf_max &&
+ len > (sbspace(sb) / 8 * 7)) {
+ struct adapter *sc = td_adapter(toep->td);
+ unsigned int hiwat = sb->sb_hiwat;
+ unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
+ V_tcp_autorcvbuf_max);
+
+ if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
+ sb->sb_flags &= ~SB_AUTOSIZE;
+ }
+
+ if (len > 0) {
+ queue_ddp_rcvbuf_mbuf(toep, db_idx, len);
+ t4_rcvd_locked(&toep->td->tod, tp);
+ }
+ sorwakeup_locked(so);
+ SOCKBUF_UNLOCK_ASSERT(sb);
+ CURVNET_RESTORE();
+
+ if (invalidated)
+ complete_ddp_buffer(toep, db, db_idx);
+ else
+ KASSERT(db->placed < db->drb->len,
+ ("%s: full DDP buffer not invalidated", __func__));
+
+ if (toep->ddp.active_count != nitems(toep->ddp.db)) {
+ drb = alloc_cached_ddp_rcv_buffer(toep);
+ if (drb == NULL)
+ drb = alloc_ddp_rcv_buffer(toep, M_NOWAIT);
+ if (drb == NULL)
+ ddp_queue_toep(toep);
+ else {
+ if (!queue_ddp_rcvbuf(toep, drb)) {
+ ddp_queue_toep(toep);
+ }
+ }
+ }
+out:
+ DDP_UNLOCK(toep);
+ INP_WUNLOCK(inp);
+
+ return (0);
+}
+
+static int
+handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
+{
+ if ((toep->ddp.flags & DDP_RCVBUF) != 0)
+ return (handle_ddp_data_rcvbuf(toep, ddp_report, rcv_nxt, len));
+ else
+ return (handle_ddp_data_aio(toep, ddp_report, rcv_nxt, len));
+}
+
void
handle_ddp_indicate(struct toepcb *toep)
{
DDP_ASSERT_LOCKED(toep);
+ if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
+ /*
+ * Indicates are not meaningful for RCVBUF since
+ * buffers are activated when the socket option is
+ * set.
+ */
+ return;
+ }
+
MPASS(toep->ddp.active_count == 0);
MPASS((toep->ddp.flags & (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)) == 0);
if (toep->ddp.waiting_count == 0) {
@@ -654,6 +1065,8 @@
/*
* XXX: This duplicates a lot of code with handle_ddp_data().
*/
+ KASSERT((toep->ddp.flags & DDP_AIO) != 0,
+ ("%s: DDP_RCVBUF", __func__));
db_idx = G_COOKIE(cpl->cookie) - CPL_COOKIE_DDP0;
MPASS(db_idx < nitems(toep->ddp.db));
INP_WLOCK(inp);
@@ -707,6 +1120,8 @@
void
handle_ddp_close(struct toepcb *toep, struct tcpcb *tp, __be32 rcv_nxt)
{
+ struct socket *so = toep->inp->inp_socket;
+ struct sockbuf *sb = &so->so_rcv;
struct ddp_buffer *db;
struct kaiocb *job;
long copied;
@@ -715,14 +1130,19 @@
unsigned int db_flag;
#endif
int len, placed;
+ bool ddp_rcvbuf;
INP_WLOCK_ASSERT(toep->inp);
DDP_ASSERT_LOCKED(toep);
+ ddp_rcvbuf = (toep->ddp.flags & DDP_RCVBUF) != 0;
+
/* - 1 is to ignore the byte for FIN */
len = be32toh(rcv_nxt) - tp->rcv_nxt - 1;
tp->rcv_nxt += len;
+ CTR(KTR_CXGBE, "%s: tid %d placed %u bytes before FIN", __func__,
+ toep->tid, len);
while (toep->ddp.active_count > 0) {
MPASS(toep->ddp.active_id != -1);
db_idx = toep->ddp.active_id;
@@ -731,6 +1151,20 @@
#endif
MPASS((toep->ddp.flags & db_flag) != 0);
db = &toep->ddp.db[db_idx];
+ if (ddp_rcvbuf) {
+ placed = len;
+ if (placed > db->drb->len - db->placed)
+ placed = db->drb->len - db->placed;
+ if (placed != 0) {
+ SOCKBUF_LOCK(sb);
+ queue_ddp_rcvbuf_mbuf(toep, db_idx, placed);
+ sorwakeup_locked(so);
+ SOCKBUF_UNLOCK_ASSERT(sb);
+ }
+ complete_ddp_buffer(toep, db, db_idx);
+ len -= placed;
+ continue;
+ }
job = db->job;
copied = job->aio_received;
placed = len;
@@ -758,7 +1192,8 @@
}
MPASS(len == 0);
- ddp_complete_all(toep, 0);
+ if ((toep->ddp.flags & DDP_AIO) != 0)
+ ddp_complete_all(toep, 0);
}
#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
@@ -892,6 +1327,7 @@
static void
enable_ddp(struct adapter *sc, struct toepcb *toep)
{
+ uint64_t ddp_flags;
KASSERT((toep->ddp.flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
("%s: toep %p has bad ddp_flags 0x%x",
@@ -900,13 +1336,16 @@
CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
__func__, toep->tid, time_uptime);
+ ddp_flags = 0;
+ if ((toep->ddp.flags & DDP_AIO) != 0)
+ ddp_flags |= V_TF_DDP_BUF0_INDICATE(1) |
+ V_TF_DDP_BUF1_INDICATE(1);
DDP_ASSERT_LOCKED(toep);
toep->ddp.flags |= DDP_SC_REQ;
t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_RX_DDP_FLAGS,
V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
- V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
- V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1), 0, 0);
+ V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1), ddp_flags, 0, 0);
t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
V_TF_RCV_COALESCE_ENABLE(1), 0, 0, 0);
}
@@ -1103,6 +1542,19 @@
return (0);
}
+static int
+t4_alloc_page_pods_for_rcvbuf(struct ppod_region *pr,
+ struct ddp_rcv_buffer *drb)
+{
+ struct ppod_reservation *prsv = &drb->prsv;
+
+ KASSERT(prsv->prsv_nppods == 0,
+ ("%s: page pods already allocated", __func__));
+
+ return (t4_alloc_page_pods_for_buf(pr, (vm_offset_t)drb->buf, drb->len,
+ prsv));
+}
+
int
t4_alloc_page_pods_for_sgl(struct ppod_region *pr, struct ctl_sg_entry *sgl,
int entries, struct ppod_reservation *prsv)
@@ -1223,7 +1675,6 @@
ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
-
/* How many page pods are we writing in this cycle */
n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
chunk = PPOD_SZ(n);
@@ -1276,6 +1727,96 @@
return (0);
}
+static int
+t4_write_page_pods_for_rcvbuf(struct adapter *sc, struct sge_wrq *wrq, int tid,
+ struct ddp_rcv_buffer *drb)
+{
+ struct wrqe *wr;
+ struct ulp_mem_io *ulpmc;
+ struct ulptx_idata *ulpsc;
+ struct pagepod *ppod;
+ int i, j, k, n, chunk, len, ddp_pgsz;
+ u_int ppod_addr, offset;
+ uint32_t cmd;
+ struct ppod_reservation *prsv = &drb->prsv;
+ struct ppod_region *pr = prsv->prsv_pr;
+ uintptr_t end_pva, pva;
+ vm_paddr_t pa;
+
+ MPASS(prsv->prsv_nppods > 0);
+
+ cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
+ if (is_t4(sc))
+ cmd |= htobe32(F_ULP_MEMIO_ORDER);
+ else
+ cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
+ ddp_pgsz = 1 << pr->pr_page_shift[G_PPOD_PGSZ(prsv->prsv_tag)];
+ offset = (uintptr_t)drb->buf & PAGE_MASK;
+ ppod_addr = pr->pr_start + (prsv->prsv_tag & pr->pr_tag_mask);
+ pva = trunc_page((uintptr_t)drb->buf);
+ end_pva = trunc_page((uintptr_t)drb->buf + drb->len - 1);
+ for (i = 0; i < prsv->prsv_nppods; ppod_addr += chunk) {
+ /* How many page pods are we writing in this cycle */
+ n = min(prsv->prsv_nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
+ MPASS(n > 0);
+ chunk = PPOD_SZ(n);
+ len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
+
+ wr = alloc_wrqe(len, wrq);
+ if (wr == NULL)
+ return (ENOMEM); /* ok to just bail out */
+ ulpmc = wrtod(wr);
+
+ INIT_ULPTX_WR(ulpmc, len, 0, 0);
+ ulpmc->cmd = cmd;
+ ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
+ ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
+ ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
+
+ ulpsc = (struct ulptx_idata *)(ulpmc + 1);
+ ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
+ ulpsc->len = htobe32(chunk);
+
+ ppod = (struct pagepod *)(ulpsc + 1);
+ for (j = 0; j < n; i++, j++, ppod++) {
+ ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
+ V_PPOD_TID(tid) | prsv->prsv_tag);
+ ppod->len_offset = htobe64(V_PPOD_LEN(drb->len) |
+ V_PPOD_OFST(offset));
+ ppod->rsvd = 0;
+
+ for (k = 0; k < nitems(ppod->addr); k++) {
+ if (pva > end_pva)
+ ppod->addr[k] = 0;
+ else {
+ pa = pmap_kextract(pva);
+ ppod->addr[k] = htobe64(pa);
+ pva += ddp_pgsz;
+ }
+#if 0
+ CTR5(KTR_CXGBE,
+ "%s: tid %d ppod[%d]->addr[%d] = %p",
+ __func__, tid, i, k,
+ be64toh(ppod->addr[k]));
+#endif
+ }
+
+ /*
+ * Walk back 1 segment so that the first address in the
+ * next pod is the same as the last one in the current
+ * pod.
+ */
+ pva -= ddp_pgsz;
+ }
+
+ t4_wrq_tx(sc, wr);
+ }
+
+ MPASS(pva <= end_pva);
+
+ return (0);
+}
+
static struct mbuf *
alloc_raw_wr_mbuf(int len)
{
@@ -1761,6 +2302,7 @@
struct kaiocb *job;
DDP_ASSERT_LOCKED(toep);
+ KASSERT((toep->ddp.flags & DDP_AIO) != 0, ("%s: DDP_RCVBUF", __func__));
while (!TAILQ_EMPTY(&toep->ddp.aiojobq)) {
job = TAILQ_FIRST(&toep->ddp.aiojobq);
TAILQ_REMOVE(&toep->ddp.aiojobq, job, list);
@@ -2143,8 +2685,8 @@
* which will keep it open and keep the TCP PCB attached until
* after the job is completed.
*/
- wr = mk_update_tcb_for_ddp(sc, toep, db_idx, ps, job->aio_received,
- ddp_flags, ddp_flags_mask);
+ wr = mk_update_tcb_for_ddp(sc, toep, db_idx, &ps->prsv, ps->len,
+ job->aio_received, ddp_flags, ddp_flags_mask);
if (wr == NULL) {
recycle_pageset(toep, ps);
aio_ddp_requeue_one(toep, job);
@@ -2286,7 +2828,6 @@
struct tcpcb *tp = intotcpcb(inp);
struct toepcb *toep = tp->t_toe;
-
/* Ignore writes. */
if (job->uaiocb.aio_lio_opcode != LIO_READ)
return (EOPNOTSUPP);
@@ -2302,6 +2843,15 @@
DDP_LOCK(toep);
+ /*
+ * If DDP is being used for all normal receive, don't use it
+ * for AIO.
+ */
+ if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
+ DDP_UNLOCK(toep);
+ return (EOPNOTSUPP);
+ }
+
/*
* XXX: Think about possibly returning errors for ENOTCONN,
* etc. Perhaps the caller would only queue the request
@@ -2315,7 +2865,14 @@
panic("new job was cancelled");
TAILQ_INSERT_TAIL(&toep->ddp.aiojobq, job, list);
toep->ddp.waiting_count++;
- toep->ddp.flags |= DDP_OK;
+
+ if ((toep->ddp.flags & DDP_AIO) == 0) {
+ toep->ddp.flags |= DDP_AIO;
+ TAILQ_INIT(&toep->ddp.cached_pagesets);
+ TAILQ_INIT(&toep->ddp.aiojobq);
+ TASK_INIT(&toep->ddp.requeue_task, 0, aio_ddp_requeue_task,
+ toep);
+ }
/*
* Try to handle this request synchronously. If this has
@@ -2327,9 +2884,146 @@
return (0);
}
+static void
+ddp_rcvbuf_requeue(struct toepcb *toep)
+{
+ struct socket *so;
+ struct sockbuf *sb;
+ struct inpcb *inp;
+ struct ddp_rcv_buffer *drb;
+
+ DDP_ASSERT_LOCKED(toep);
+restart:
+ if ((toep->ddp.flags & DDP_DEAD) != 0) {
+ MPASS(toep->ddp.active_count == 0);
+ return;
+ }
+
+ /* If both buffers are active, nothing to do. */
+ if (toep->ddp.active_count == nitems(toep->ddp.db)) {
+ return;
+ }
+
+ inp = toep->inp;
+ so = inp->inp_socket;
+ sb = &so->so_rcv;
+
+ drb = alloc_cached_ddp_rcv_buffer(toep);
+ DDP_UNLOCK(toep);
+
+ if (drb == NULL) {
+ drb = alloc_ddp_rcv_buffer(toep, M_WAITOK);
+ if (drb == NULL) {
+ printf("%s: failed to allocate buffer\n", __func__);
+ DDP_LOCK(toep);
+ return;
+ }
+ }
+
+ DDP_LOCK(toep);
+ if ((toep->ddp.flags & DDP_DEAD) != 0 ||
+ toep->ddp.active_count == nitems(toep->ddp.db)) {
+ recycle_ddp_rcv_buffer(toep, drb);
+ return;
+ }
+
+ /* We will never get anything unless we are or were connected. */
+ SOCKBUF_LOCK(sb);
+ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
+ SOCKBUF_UNLOCK(sb);
+ recycle_ddp_rcv_buffer(toep, drb);
+ return;
+ }
+
+ /* Abort if socket has reported problems or is closed. */
+ if (so->so_error != 0 || (sb->sb_state & SBS_CANTRCVMORE) != 0) {
+ SOCKBUF_UNLOCK(sb);
+ recycle_ddp_rcv_buffer(toep, drb);
+ return;
+ }
+ SOCKBUF_UNLOCK(sb);
+
+ if (!queue_ddp_rcvbuf(toep, drb)) {
+ /*
+ * XXX: Need a way to kick a retry here.
+ *
+ * XXX: We know the fixed size needed and could
+ * preallocate the work request using a blocking
+ * request at the start of the task to avoid having to
+ * handle this edge case.
+ */
+ return;
+ }
+ goto restart;
+}
+
+static void
+ddp_rcvbuf_requeue_task(void *context, int pending)
+{
+ struct toepcb *toep = context;
+
+ DDP_LOCK(toep);
+ ddp_rcvbuf_requeue(toep);
+ toep->ddp.flags &= ~DDP_TASK_ACTIVE;
+ DDP_UNLOCK(toep);
+
+ free_toepcb(toep);
+}
+
+int
+t4_enable_ddp_rcv(struct socket *so, struct toepcb *toep)
+{
+ struct inpcb *inp = sotoinpcb(so);
+ struct adapter *sc = td_adapter(toep->td);
+
+ INP_WLOCK(inp);
+ switch (ulp_mode(toep)) {
+ case ULP_MODE_TCPDDP:
+ break;
+ case ULP_MODE_NONE:
+ if (set_ddp_ulp_mode(toep))
+ break;
+ /* FALLTHROUGH */
+ default:
+ INP_WUNLOCK(inp);
+ return (EOPNOTSUPP);
+ }
+ INP_WUNLOCK(inp);
+
+ DDP_LOCK(toep);
+
+ /*
+ * If DDP is being used for AIO already, don't use it for
+ * normal receive.
+ */
+ if ((toep->ddp.flags & DDP_AIO) != 0) {
+ DDP_UNLOCK(toep);
+ return (EOPNOTSUPP);
+ }
+
+ if ((toep->ddp.flags & DDP_RCVBUF) != 0) {
+ DDP_UNLOCK(toep);
+ return (EBUSY);
+ }
+
+ toep->ddp.flags |= DDP_RCVBUF;
+ TAILQ_INIT(&toep->ddp.cached_buffers);
+ enable_ddp(sc, toep);
+ TASK_INIT(&toep->ddp.requeue_task, 0, ddp_rcvbuf_requeue_task, toep);
+ ddp_queue_toep(toep);
+ DDP_UNLOCK(toep);
+ return (0);
+}
+
void
t4_ddp_mod_load(void)
{
+ if (t4_ddp_rcvbuf_len < PAGE_SIZE)
+ t4_ddp_rcvbuf_len = PAGE_SIZE;
+ if (t4_ddp_rcvbuf_len > MAX_DDP_BUFFER_SIZE)
+ t4_ddp_rcvbuf_len = MAX_DDP_BUFFER_SIZE;
+ if (!powerof2(t4_ddp_rcvbuf_len))
+ t4_ddp_rcvbuf_len = 1 << fls(t4_ddp_rcvbuf_len);
t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, do_ddp_tcb_rpl,
CPL_COOKIE_DDP0);
diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h
--- a/sys/dev/cxgbe/tom/t4_tom.h
+++ b/sys/dev/cxgbe/tom/t4_tom.h
@@ -85,6 +85,8 @@
DDP_BUF1_ACTIVE = (1 << 4), /* buffer 1 in use (not invalidated) */
DDP_TASK_ACTIVE = (1 << 5), /* requeue task is queued / running */
DDP_DEAD = (1 << 6), /* toepcb is shutting down */
+ DDP_AIO = (1 << 7), /* DDP used for AIO, not so_rcv */
+ DDP_RCVBUF = (1 << 8), /* DDP used for so_rcv, not AIO */
};
struct bio;
@@ -156,25 +158,51 @@
#define PS_PPODS_WRITTEN 0x0001 /* Page pods written to the card. */
-struct ddp_buffer {
- struct pageset *ps;
+struct ddp_rcv_buffer {
+ TAILQ_ENTRY(ddp_rcv_buffer) link;
+ void *buf;
+ struct ppod_reservation prsv;
+ size_t len;
+ u_int refs;
+};
- struct kaiocb *job;
- int cancel_pending;
+struct ddp_buffer {
+ union {
+ /* DDP_AIO fields */
+ struct {
+ struct pageset *ps;
+ struct kaiocb *job;
+ int cancel_pending;
+ };
+
+ /* DDP_RCVBUF fields */
+ struct {
+ struct ddp_rcv_buffer *drb;
+ uint32_t placed;
+ };
+ };
};
+/*
+ * (a) - DDP_AIO only
+ * (r) - DDP_RCVBUF only
+ */
struct ddp_pcb {
+ struct mtx lock;
u_int flags;
+ int active_id; /* the currently active DDP buffer */
struct ddp_buffer db[2];
- TAILQ_HEAD(, pageset) cached_pagesets;
- TAILQ_HEAD(, kaiocb) aiojobq;
- u_int waiting_count;
+ union {
+ TAILQ_HEAD(, pageset) cached_pagesets; /* (a) */
+ TAILQ_HEAD(, ddp_rcv_buffer) cached_buffers; /* (r) */
+ };
+ TAILQ_HEAD(, kaiocb) aiojobq; /* (a) */
+ u_int waiting_count; /* (a) */
u_int active_count;
u_int cached_count;
- int active_id; /* the currently active DDP buffer */
struct task requeue_task;
- struct kaiocb *queueing;
- struct mtx lock;
+ struct kaiocb *queueing; /* (a) */
+ struct mtx cache_lock; /* (r) */
};
struct toepcb {
@@ -230,6 +258,8 @@
#define DDP_LOCK(toep) mtx_lock(&(toep)->ddp.lock)
#define DDP_UNLOCK(toep) mtx_unlock(&(toep)->ddp.lock)
#define DDP_ASSERT_LOCKED(toep) mtx_assert(&(toep)->ddp.lock, MA_OWNED)
+#define DDP_CACHE_LOCK(toep) mtx_lock(&(toep)->ddp.cache_lock)
+#define DDP_CACHE_UNLOCK(toep) mtx_unlock(&(toep)->ddp.cache_lock)
/*
* Compressed state for embryonic connections for a listener.
@@ -502,6 +532,7 @@
struct ppod_reservation *, struct ctl_sg_entry *, int, int, struct mbufq *);
void t4_free_page_pods(struct ppod_reservation *);
int t4_aio_queue_ddp(struct socket *, struct kaiocb *);
+int t4_enable_ddp_rcv(struct socket *, struct toepcb *);
void t4_ddp_mod_load(void);
void t4_ddp_mod_unload(void);
void ddp_assert_empty(struct toepcb *);
diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c
--- a/sys/dev/cxgbe/tom/t4_tom.c
+++ b/sys/dev/cxgbe/tom/t4_tom.c
@@ -1950,6 +1950,35 @@
return (rc);
}
+static int
+t4_ctloutput_tom(struct socket *so, struct sockopt *sopt)
+{
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ int error, optval;
+
+ if (sopt->sopt_level == IPPROTO_TCP && sopt->sopt_name == TCP_USE_DDP) {
+ if (sopt->sopt_dir != SOPT_SET)
+ return (EOPNOTSUPP);
+
+ if (sopt->sopt_td != NULL) {
+ /* Only settable by the kernel. */
+ return (EPERM);
+ }
+
+ error = sooptcopyin(sopt, &optval, sizeof(optval),
+ sizeof(optval));
+ if (error != 0)
+ return (error);
+
+ if (optval != 0)
+ return (t4_enable_ddp_rcv(so, toep));
+ else
+ return (EOPNOTSUPP);
+ }
+ return (tcp_ctloutput(so, sopt));
+}
+
static int
t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
{
@@ -1989,9 +2018,11 @@
t4_tls_mod_load();
bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw));
+ toe_protosw.pr_ctloutput = t4_ctloutput_tom;
toe_protosw.pr_aio_queue = t4_aio_queue_tom;
bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
+ toe6_protosw.pr_ctloutput = t4_ctloutput_tom;
toe6_protosw.pr_aio_queue = t4_aio_queue_tom;
return (t4_register_uld(&tom_uld_info));
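
The TCP_USE_DDP handling added in t4_ctloutput_tom() above is reachable only from kernel context: requests with a non-NULL sopt_td are rejected with EPERM, only SOPT_SET is accepted, and a zero optval returns EOPNOTSUPP. The sketch below shows how an in-kernel consumer might enable DDP receive on an offloaded connection through that path. It is illustrative only and not part of this patch: the function name is hypothetical, and the header list assumes a recent FreeBSD where struct sockopt and sosetopt() are declared via sys/sockopt.h and TCP_USE_DDP is defined in netinet/tcp.h.

/*
 * Illustrative sketch only -- not part of D44001.  Enable DDP for
 * normal receive on an offloaded TOE socket from kernel context.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockopt.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
example_enable_tcp_ddp(struct socket *so)
{
	struct sockopt sopt;
	int optval = 1;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = IPPROTO_TCP;
	sopt.sopt_name = TCP_USE_DDP;
	sopt.sopt_val = &optval;
	sopt.sopt_valsize = sizeof(optval);
	sopt.sopt_td = NULL;	/* kernel request; user threads get EPERM */

	/*
	 * Dispatches through pr_ctloutput (t4_ctloutput_tom for TOE
	 * sockets), which calls t4_enable_ddp_rcv().  Expect EOPNOTSUPP
	 * if the connection is not in a DDP-capable ULP mode or if AIO
	 * DDP is already in use, and EBUSY if DDP receive is already on.
	 */
	return (sosetopt(so, &sopt));
}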