diff --git a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c index cf1032f2a3a2..de8f2547f29a 100644 --- a/sys/dev/cxgbe/cxgbei/icl_cxgbei.c +++ b/sys/dev/cxgbe/cxgbei/icl_cxgbei.c @@ -1,1460 +1,1455 @@ /*- * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2015 Chelsio Communications, Inc. * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * cxgbei implementation of iSCSI Common Layer kobj(9) interface. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_tcb.h" #include "tom/t4_tom.h" #include "cxgbei.h" /* * Use the page pod tag for the TT hash. */ #define TT_HASH(icc, tt) (G_PPOD_TAG(tt) & (icc)->cmp_hash_mask) struct cxgbei_ddp_state { struct ppod_reservation prsv; struct cxgbei_cmp cmp; }; static MALLOC_DEFINE(M_CXGBEI, "cxgbei", "cxgbei(4)"); SYSCTL_NODE(_kern_icl, OID_AUTO, cxgbei, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Chelsio iSCSI offload"); static int first_burst_length = 8192; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, first_burst_length, CTLFLAG_RWTUN, &first_burst_length, 0, "First burst length"); static int max_burst_length = 2 * 1024 * 1024; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, max_burst_length, CTLFLAG_RWTUN, &max_burst_length, 0, "Maximum burst length"); static int sendspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, sendspace, CTLFLAG_RWTUN, &sendspace, 0, "Default send socket buffer size"); static int recvspace = 1048576; SYSCTL_INT(_kern_icl_cxgbei, OID_AUTO, recvspace, CTLFLAG_RWTUN, &recvspace, 0, "Default receive socket buffer size"); static volatile u_int icl_cxgbei_ncons; -#define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) -#define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) -#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) -#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) - static icl_conn_new_pdu_t icl_cxgbei_conn_new_pdu; static icl_conn_pdu_data_segment_length_t icl_cxgbei_conn_pdu_data_segment_length; static icl_conn_pdu_append_data_t icl_cxgbei_conn_pdu_append_data; static icl_conn_pdu_get_data_t icl_cxgbei_conn_pdu_get_data; static icl_conn_pdu_queue_t icl_cxgbei_conn_pdu_queue; static icl_conn_pdu_queue_cb_t icl_cxgbei_conn_pdu_queue_cb; static icl_conn_handoff_t icl_cxgbei_conn_handoff; static icl_conn_free_t icl_cxgbei_conn_free; static icl_conn_close_t icl_cxgbei_conn_close; static icl_conn_task_setup_t icl_cxgbei_conn_task_setup; static icl_conn_task_done_t icl_cxgbei_conn_task_done; static icl_conn_transfer_setup_t icl_cxgbei_conn_transfer_setup; static icl_conn_transfer_done_t icl_cxgbei_conn_transfer_done; static kobj_method_t icl_cxgbei_methods[] = { KOBJMETHOD(icl_conn_new_pdu, icl_cxgbei_conn_new_pdu), KOBJMETHOD(icl_conn_pdu_free, icl_cxgbei_conn_pdu_free), KOBJMETHOD(icl_conn_pdu_data_segment_length, icl_cxgbei_conn_pdu_data_segment_length), KOBJMETHOD(icl_conn_pdu_append_data, icl_cxgbei_conn_pdu_append_data), KOBJMETHOD(icl_conn_pdu_get_data, icl_cxgbei_conn_pdu_get_data), KOBJMETHOD(icl_conn_pdu_queue, icl_cxgbei_conn_pdu_queue), KOBJMETHOD(icl_conn_pdu_queue_cb, icl_cxgbei_conn_pdu_queue_cb), KOBJMETHOD(icl_conn_handoff, icl_cxgbei_conn_handoff), KOBJMETHOD(icl_conn_free, icl_cxgbei_conn_free), KOBJMETHOD(icl_conn_close, icl_cxgbei_conn_close), KOBJMETHOD(icl_conn_task_setup, icl_cxgbei_conn_task_setup), KOBJMETHOD(icl_conn_task_done, icl_cxgbei_conn_task_done), KOBJMETHOD(icl_conn_transfer_setup, icl_cxgbei_conn_transfer_setup), KOBJMETHOD(icl_conn_transfer_done, icl_cxgbei_conn_transfer_done), { 0, 0 } }; DEFINE_CLASS(icl_cxgbei, icl_cxgbei_methods, sizeof(struct icl_cxgbei_conn)); void icl_cxgbei_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); KASSERT(icp->ref_cnt != 0, ("freeing deleted PDU")); MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); m_freem(ip->ip_ahs_mbuf); m_freem(ip->ip_data_mbuf); m_freem(ip->ip_bhs_mbuf); KASSERT(ic != NULL || icp->ref_cnt == 1, ("orphaned PDU has oustanding references")); if (atomic_fetchadd_int(&icp->ref_cnt, -1) != 1) return; free(icp, M_CXGBEI); #ifdef DIAGNOSTIC if (__predict_true(ic != NULL)) refcount_release(&ic->ic_outstanding_pdus); #endif } static void icl_cxgbei_pdu_call_cb(struct icl_pdu *ip) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); if (icp->cb != NULL) icp->cb(ip, icp->error); #ifdef DIAGNOSTIC if (__predict_true(ip->ip_conn != NULL)) refcount_release(&ip->ip_conn->ic_outstanding_pdus); #endif free(icp, M_CXGBEI); } static void icl_cxgbei_pdu_done(struct icl_pdu *ip, int error) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); if (error != 0) icp->error = error; m_freem(ip->ip_ahs_mbuf); ip->ip_ahs_mbuf = NULL; m_freem(ip->ip_data_mbuf); ip->ip_data_mbuf = NULL; m_freem(ip->ip_bhs_mbuf); ip->ip_bhs_mbuf = NULL; /* * All other references to this PDU should have been dropped * by the m_freem() of ip_data_mbuf. */ if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1) icl_cxgbei_pdu_call_cb(ip); else __assert_unreachable(); } static void icl_cxgbei_mbuf_done(struct mbuf *mb) { struct icl_cxgbei_pdu *icp = (struct icl_cxgbei_pdu *)mb->m_ext.ext_arg1; /* * NB: mb_free_mext() might leave ref_cnt as 1 without * decrementing it if it hits the fast path in the ref_cnt * check. */ icl_cxgbei_pdu_call_cb(&icp->ip); } struct icl_pdu * icl_cxgbei_new_pdu(int flags) { struct icl_cxgbei_pdu *icp; struct icl_pdu *ip; struct mbuf *m; icp = malloc(sizeof(*icp), M_CXGBEI, flags | M_ZERO); if (__predict_false(icp == NULL)) return (NULL); icp->icp_signature = CXGBEI_PDU_SIGNATURE; icp->ref_cnt = 1; ip = &icp->ip; m = m_gethdr(flags, MT_DATA); if (__predict_false(m == NULL)) { free(icp, M_CXGBEI); return (NULL); } ip->ip_bhs_mbuf = m; ip->ip_bhs = mtod(m, struct iscsi_bhs *); memset(ip->ip_bhs, 0, sizeof(*ip->ip_bhs)); m->m_len = sizeof(struct iscsi_bhs); m->m_pkthdr.len = m->m_len; return (ip); } void icl_cxgbei_new_pdu_set_conn(struct icl_pdu *ip, struct icl_conn *ic) { ip->ip_conn = ic; #ifdef DIAGNOSTIC refcount_acquire(&ic->ic_outstanding_pdus); #endif } /* * Allocate icl_pdu with empty BHS to fill up by the caller. */ static struct icl_pdu * icl_cxgbei_conn_new_pdu(struct icl_conn *ic, int flags) { struct icl_pdu *ip; ip = icl_cxgbei_new_pdu(flags); if (__predict_false(ip == NULL)) return (NULL); icl_cxgbei_new_pdu_set_conn(ip, ic); return (ip); } static size_t icl_pdu_data_segment_length(const struct icl_pdu *request) { uint32_t len = 0; len += request->ip_bhs->bhs_data_segment_len[0]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[1]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[2]; return (len); } size_t icl_cxgbei_conn_pdu_data_segment_length(struct icl_conn *ic, const struct icl_pdu *request) { return (icl_pdu_data_segment_length(request)); } static struct mbuf * finalize_pdu(struct icl_cxgbei_conn *icc, struct icl_cxgbei_pdu *icp) { struct icl_pdu *ip = &icp->ip; uint8_t ulp_submode, padding; struct mbuf *m, *last; struct iscsi_bhs *bhs; int data_len; /* * Fix up the data segment mbuf first. */ m = ip->ip_data_mbuf; ulp_submode = icc->ulp_submode; if (m != NULL) { last = m_last(m); /* * Round up the data segment to a 4B boundary. Pad with 0 if * necessary. There will definitely be room in the mbuf. */ padding = roundup2(ip->ip_data_len, 4) - ip->ip_data_len; if (padding != 0) { MPASS(padding <= M_TRAILINGSPACE(last)); bzero(mtod(last, uint8_t *) + last->m_len, padding); last->m_len += padding; } } else { MPASS(ip->ip_data_len == 0); ulp_submode &= ~ULP_CRC_DATA; padding = 0; } /* * Now the header mbuf that has the BHS. */ m = ip->ip_bhs_mbuf; MPASS(m->m_pkthdr.len == sizeof(struct iscsi_bhs)); MPASS(m->m_len == sizeof(struct iscsi_bhs)); bhs = ip->ip_bhs; data_len = ip->ip_data_len; if (data_len > icc->ic.ic_max_send_data_segment_length) { struct iscsi_bhs_data_in *bhsdi; int flags; KASSERT(padding == 0, ("%s: ISO with padding %d for icp %p", __func__, padding, icp)); switch (bhs->bhs_opcode) { case ISCSI_BHS_OPCODE_SCSI_DATA_OUT: flags = 1; break; case ISCSI_BHS_OPCODE_SCSI_DATA_IN: flags = 2; break; default: panic("invalid opcode %#x for ISO", bhs->bhs_opcode); } data_len = icc->ic.ic_max_send_data_segment_length; bhsdi = (struct iscsi_bhs_data_in *)bhs; if (bhsdi->bhsdi_flags & BHSDI_FLAGS_F) { /* * Firmware will set F on the final PDU in the * burst. */ flags |= CXGBE_ISO_F; bhsdi->bhsdi_flags &= ~BHSDI_FLAGS_F; } set_mbuf_iscsi_iso(m, true); set_mbuf_iscsi_iso_flags(m, flags); set_mbuf_iscsi_iso_mss(m, data_len); } bhs->bhs_data_segment_len[2] = data_len; bhs->bhs_data_segment_len[1] = data_len >> 8; bhs->bhs_data_segment_len[0] = data_len >> 16; /* * Extract mbuf chain from PDU. */ m->m_pkthdr.len += ip->ip_data_len + padding; m->m_next = ip->ip_data_mbuf; set_mbuf_ulp_submode(m, ulp_submode); ip->ip_bhs_mbuf = NULL; ip->ip_data_mbuf = NULL; ip->ip_bhs = NULL; /* * Drop PDU reference on icp. Additional references might * still be held by zero-copy PDU buffers (ICL_NOCOPY). */ if (atomic_fetchadd_int(&icp->ref_cnt, -1) == 1) icl_cxgbei_pdu_call_cb(ip); return (m); } int icl_cxgbei_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *ip, const void *addr, size_t len, int flags) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); struct mbuf *m, *m_tail; const char *src; MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); KASSERT(len > 0, ("%s: len is %jd", __func__, (intmax_t)len)); m_tail = ip->ip_data_mbuf; if (m_tail != NULL) for (; m_tail->m_next != NULL; m_tail = m_tail->m_next) ; if (flags & ICL_NOCOPY) { m = m_get(flags & ~ICL_NOCOPY, MT_DATA); if (m == NULL) { ICL_WARN("failed to allocate mbuf"); return (ENOMEM); } m->m_flags |= M_RDONLY; m_extaddref(m, __DECONST(char *, addr), len, &icp->ref_cnt, icl_cxgbei_mbuf_done, icp, NULL); m->m_len = len; if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m; ip->ip_data_len = len; } else { m_tail->m_next = m; m_tail = m_tail->m_next; ip->ip_data_len += len; } return (0); } src = (const char *)addr; /* Allocate as jumbo mbufs of size MJUM16BYTES. */ while (len >= MJUM16BYTES) { m = m_getjcl(M_NOWAIT, MT_DATA, 0, MJUM16BYTES); if (__predict_false(m == NULL)) { if ((flags & M_WAITOK) != 0) { /* Fall back to non-jumbo mbufs. */ break; } return (ENOMEM); } memcpy(mtod(m, void *), src, MJUM16BYTES); m->m_len = MJUM16BYTES; if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m_tail = m; ip->ip_data_len = MJUM16BYTES; } else { m_tail->m_next = m; m_tail = m_tail->m_next; ip->ip_data_len += MJUM16BYTES; } src += MJUM16BYTES; len -= MJUM16BYTES; } /* Allocate mbuf chain for the remaining data. */ if (len != 0) { m = m_getm2(NULL, len, flags, MT_DATA, 0); if (__predict_false(m == NULL)) return (ENOMEM); if (ip->ip_data_mbuf == NULL) { ip->ip_data_mbuf = m; ip->ip_data_len = len; } else { m_tail->m_next = m; ip->ip_data_len += len; } for (; m != NULL; m = m->m_next) { m->m_len = min(len, M_SIZE(m)); memcpy(mtod(m, void *), src, m->m_len); src += m->m_len; len -= m->m_len; } MPASS(len == 0); } MPASS(ip->ip_data_len <= max(ic->ic_max_send_data_segment_length, ic->ic_hw_isomax)); return (0); } void icl_cxgbei_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip, size_t off, void *addr, size_t len) { struct icl_cxgbei_pdu *icp = ip_to_icp(ip); if (icp->icp_flags & ICPF_RX_DDP) return; /* data is DDP'ed, no need to copy */ m_copydata(ip->ip_data_mbuf, off, len, addr); } void icl_cxgbei_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip) { icl_cxgbei_conn_pdu_queue_cb(ic, ip, NULL); } void icl_cxgbei_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip, icl_pdu_cb cb) { struct epoch_tracker et; struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct icl_cxgbei_pdu *icp = ip_to_icp(ip); struct socket *so = ic->ic_socket; struct toepcb *toep = icc->toep; struct inpcb *inp; struct mbuf *m; MPASS(ic == ip->ip_conn); MPASS(ip->ip_bhs_mbuf != NULL); /* The kernel doesn't generate PDUs with AHS. */ MPASS(ip->ip_ahs_mbuf == NULL && ip->ip_ahs_len == 0); ICL_CONN_LOCK_ASSERT(ic); icp->cb = cb; /* NOTE: sowriteable without so_snd lock is a mostly harmless race. */ if (ic->ic_disconnecting || so == NULL || !sowriteable(so)) { icl_cxgbei_pdu_done(ip, ENOTCONN); return; } m = finalize_pdu(icc, icp); M_ASSERTPKTHDR(m); MPASS((m->m_pkthdr.len & 3) == 0); /* * Do not get inp from toep->inp as the toepcb might have detached * already. */ inp = sotoinpcb(so); CURVNET_SET(toep->vnet); NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || __predict_false((toep->flags & TPF_ATTACHED) == 0)) m_freem(m); else { mbufq_enqueue(&toep->ulp_pduq, m); t4_push_pdus(icc->sc, toep, 0); } INP_WUNLOCK(inp); NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } static struct icl_conn * icl_cxgbei_new_conn(const char *name, struct mtx *lock) { struct icl_cxgbei_conn *icc; struct icl_conn *ic; refcount_acquire(&icl_cxgbei_ncons); icc = (struct icl_cxgbei_conn *)kobj_create(&icl_cxgbei_class, M_CXGBE, M_WAITOK | M_ZERO); icc->icc_signature = CXGBEI_CONN_SIGNATURE; STAILQ_INIT(&icc->rcvd_pdus); icc->cmp_table = hashinit(64, M_CXGBEI, &icc->cmp_hash_mask); mtx_init(&icc->cmp_lock, "cxgbei_cmp", NULL, MTX_DEF); ic = &icc->ic; ic->ic_lock = lock; #ifdef DIAGNOSTIC refcount_init(&ic->ic_outstanding_pdus, 0); #endif ic->ic_name = name; ic->ic_offload = "cxgbei"; ic->ic_unmapped = false; CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); return (ic); } void icl_cxgbei_conn_free(struct icl_conn *ic) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); CTR2(KTR_CXGBE, "%s: icc %p", __func__, icc); mtx_destroy(&icc->cmp_lock); hashdestroy(icc->cmp_table, M_CXGBEI, icc->cmp_hash_mask); kobj_delete((struct kobj *)icc, M_CXGBE); refcount_release(&icl_cxgbei_ncons); } static int icl_cxgbei_setsockopt(struct icl_conn *ic, struct socket *so, int sspace, int rspace) { struct sockopt opt; int error, one = 1, ss, rs; ss = max(sendspace, sspace); rs = max(recvspace, rspace); error = soreserve(so, ss, rs); if (error != 0) { icl_cxgbei_conn_close(ic); return (error); } SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags |= SB_AUTOSIZE; SOCKBUF_UNLOCK(&so->so_rcv); /* * Disable Nagle. */ bzero(&opt, sizeof(opt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(so, &opt); if (error != 0) { icl_cxgbei_conn_close(ic); return (error); } return (0); } /* * Request/response structure used to find out the adapter offloading a socket. */ struct find_ofld_adapter_rr { struct socket *so; struct adapter *sc; /* result */ }; static void find_offload_adapter(struct adapter *sc, void *arg) { struct find_ofld_adapter_rr *fa = arg; struct socket *so = fa->so; struct tom_data *td = sc->tom_softc; struct tcpcb *tp; struct inpcb *inp; /* Non-TCP were filtered out earlier. */ MPASS(so->so_proto->pr_protocol == IPPROTO_TCP); if (fa->sc != NULL) return; /* Found already. */ if (td == NULL) return; /* TOE not enabled on this adapter. */ inp = sotoinpcb(so); INP_WLOCK(inp); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) == 0) { tp = intotcpcb(inp); if (tp->t_flags & TF_TOE && tp->tod == &td->tod) fa->sc = sc; /* Found. */ } INP_WUNLOCK(inp); } static bool is_memfree(struct adapter *sc) { uint32_t em; em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if ((em & F_EXT_MEM_ENABLE) != 0) return (false); if (is_t5(sc) && (em & F_EXT_MEM1_ENABLE) != 0) return (false); return (true); } /* XXXNP: move this to t4_tom. */ static void send_iscsi_flowc_wr(struct adapter *sc, struct toepcb *toep, int maxlen) { struct wrqe *wr; struct fw_flowc_wr *flowc; const u_int nparams = 1; u_int flowclen; struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq); if (wr == NULL) { /* XXX */ panic("%s: allocation failure.", __func__); } flowc = wrtod(wr); memset(flowc, 0, wr->wr_len); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(nparams)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) | V_FW_WR_FLOWID(toep->tid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_TXDATAPLEN_MAX; flowc->mnemval[0].val = htobe32(maxlen); txsd->tx_credits = howmany(flowclen, 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; t4_wrq_tx(sc, wr); } static void set_ulp_mode_iscsi(struct adapter *sc, struct toepcb *toep, u_int ulp_submode) { uint64_t val; CTR3(KTR_CXGBE, "%s: tid %u, ULP_MODE_ISCSI, submode=%#x", __func__, toep->tid, ulp_submode); val = V_TCB_ULP_TYPE(ULP_MODE_ISCSI) | V_TCB_ULP_RAW(ulp_submode); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_ULP_TYPE, V_TCB_ULP_TYPE(M_TCB_ULP_TYPE) | V_TCB_ULP_RAW(M_TCB_ULP_RAW), val, 0, 0); val = V_TF_RX_FLOW_CONTROL_DISABLE(1ULL); t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS, val, val, 0, 0); } /* * XXXNP: Who is responsible for cleaning up the socket if this returns with an * error? Review all error paths. * * XXXNP: What happens to the socket's fd reference if the operation is * successful, and how does that affect the socket's life cycle? */ int icl_cxgbei_conn_handoff(struct icl_conn *ic, int fd) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct cxgbei_data *ci; struct find_ofld_adapter_rr fa; struct file *fp; struct socket *so; struct inpcb *inp; struct tcpcb *tp; struct toepcb *toep; cap_rights_t rights; u_int max_rx_pdu_len, max_tx_pdu_len; int error, max_iso_pdus; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ICL_CONN_LOCK_ASSERT_NOT(ic); /* * Steal the socket from userland. */ error = fget(curthread, fd, cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, curthread); return (EINVAL); } so = fp->f_data; if (so->so_type != SOCK_STREAM || so->so_proto->pr_protocol != IPPROTO_TCP) { fdrop(fp, curthread); return (EINVAL); } ICL_CONN_LOCK(ic); if (ic->ic_socket != NULL) { ICL_CONN_UNLOCK(ic); fdrop(fp, curthread); return (EBUSY); } ic->ic_disconnecting = false; ic->ic_socket = so; fp->f_ops = &badfileops; fp->f_data = NULL; fdrop(fp, curthread); ICL_CONN_UNLOCK(ic); /* Find the adapter offloading this socket. */ fa.sc = NULL; fa.so = so; t4_iterate(find_offload_adapter, &fa); if (fa.sc == NULL) return (EINVAL); icc->sc = fa.sc; ci = icc->sc->iscsi_ulp_softc; max_rx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_recv_data_segment_length; max_tx_pdu_len = ISCSI_BHS_SIZE + ic->ic_max_send_data_segment_length; if (ic->ic_header_crc32c) { max_rx_pdu_len += ISCSI_HEADER_DIGEST_SIZE; max_tx_pdu_len += ISCSI_HEADER_DIGEST_SIZE; } if (ic->ic_data_crc32c) { max_rx_pdu_len += ISCSI_DATA_DIGEST_SIZE; max_tx_pdu_len += ISCSI_DATA_DIGEST_SIZE; } inp = sotoinpcb(so); INP_WLOCK(inp); tp = intotcpcb(inp); if (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) { INP_WUNLOCK(inp); return (EBUSY); } /* * socket could not have been "unoffloaded" if here. */ MPASS(tp->t_flags & TF_TOE); MPASS(tp->tod != NULL); MPASS(tp->t_toe != NULL); toep = tp->t_toe; MPASS(toep->vi->adapter == icc->sc); if (ulp_mode(toep) != ULP_MODE_NONE) { INP_WUNLOCK(inp); return (EINVAL); } icc->toep = toep; icc->cwt = cxgbei_select_worker_thread(icc); icc->ulp_submode = 0; if (ic->ic_header_crc32c) icc->ulp_submode |= ULP_CRC_HEADER; if (ic->ic_data_crc32c) icc->ulp_submode |= ULP_CRC_DATA; if (icc->sc->tt.iso && chip_id(icc->sc) >= CHELSIO_T5 && !is_memfree(icc->sc)) { max_iso_pdus = CXGBEI_MAX_ISO_PAYLOAD / max_tx_pdu_len; ic->ic_hw_isomax = max_iso_pdus * ic->ic_max_send_data_segment_length; } else max_iso_pdus = 1; toep->params.ulp_mode = ULP_MODE_ISCSI; toep->ulpcb = icc; send_iscsi_flowc_wr(icc->sc, toep, roundup(max_iso_pdus * max_tx_pdu_len, tp->t_maxseg)); set_ulp_mode_iscsi(icc->sc, toep, icc->ulp_submode); INP_WUNLOCK(inp); return (icl_cxgbei_setsockopt(ic, so, max_tx_pdu_len, max_rx_pdu_len)); } void icl_cxgbei_conn_close(struct icl_conn *ic) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct icl_pdu *ip; struct socket *so; struct sockbuf *sb; struct inpcb *inp; struct toepcb *toep = icc->toep; MPASS(icc->icc_signature == CXGBEI_CONN_SIGNATURE); ICL_CONN_LOCK_ASSERT_NOT(ic); ICL_CONN_LOCK(ic); so = ic->ic_socket; if (ic->ic_disconnecting || so == NULL) { CTR4(KTR_CXGBE, "%s: icc %p (disconnecting = %d), so %p", __func__, icc, ic->ic_disconnecting, so); ICL_CONN_UNLOCK(ic); return; } ic->ic_disconnecting = true; #ifdef DIAGNOSTIC KASSERT(ic->ic_outstanding_pdus == 0, ("destroying session with %d outstanding PDUs", ic->ic_outstanding_pdus)); #endif ICL_CONN_UNLOCK(ic); CTR3(KTR_CXGBE, "%s: tid %d, icc %p", __func__, toep ? toep->tid : -1, icc); inp = sotoinpcb(so); sb = &so->so_rcv; INP_WLOCK(inp); if (toep != NULL) { /* NULL if connection was never offloaded. */ toep->ulpcb = NULL; /* Discard PDUs queued for TX. */ mbufq_drain(&toep->ulp_pduq); /* * Wait for the cwt threads to stop processing this * connection. */ SOCKBUF_LOCK(sb); if (icc->rx_flags & RXF_ACTIVE) { volatile u_int *p = &icc->rx_flags; SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); while (*p & RXF_ACTIVE) pause("conclo", 1); INP_WLOCK(inp); SOCKBUF_LOCK(sb); } /* * Discard received PDUs not passed to the iSCSI * layer. */ while (!STAILQ_EMPTY(&icc->rcvd_pdus)) { ip = STAILQ_FIRST(&icc->rcvd_pdus); STAILQ_REMOVE_HEAD(&icc->rcvd_pdus, ip_next); icl_cxgbei_pdu_done(ip, ENOTCONN); } SOCKBUF_UNLOCK(sb); /* * Grab a reference to use when waiting for the final * CPL to be received. If toep->inp is NULL, then * final_cpl_received() has already been called (e.g. * due to the peer sending a RST). */ if (toep->inp != NULL) { toep = hold_toepcb(toep); toep->flags |= TPF_WAITING_FOR_FINAL; } else toep = NULL; } INP_WUNLOCK(inp); ICL_CONN_LOCK(ic); ic->ic_socket = NULL; ICL_CONN_UNLOCK(ic); /* * XXXNP: we should send RST instead of FIN when PDUs held in various * queues were purged instead of delivered reliably but soabort isn't * really general purpose and wouldn't do the right thing here. */ soclose(so); /* * Wait for the socket to fully close. This ensures any * pending received data has been received (and in particular, * any data that would be received by DDP has been handled). * Callers assume that it is safe to free buffers for tasks * and transfers after this function returns. */ if (toep != NULL) { struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep); mtx_lock(lock); while ((toep->flags & TPF_WAITING_FOR_FINAL) != 0) mtx_sleep(toep, lock, PSOCK, "conclo2", 0); mtx_unlock(lock); free_toepcb(toep); } } static void cxgbei_insert_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp, uint32_t tt) { #ifdef INVARIANTS struct cxgbei_cmp *cmp2; #endif cmp->tt = tt; mtx_lock(&icc->cmp_lock); #ifdef INVARIANTS LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, tt)], link) { KASSERT(cmp2->tt != tt, ("%s: duplicate cmp", __func__)); } #endif LIST_INSERT_HEAD(&icc->cmp_table[TT_HASH(icc, tt)], cmp, link); mtx_unlock(&icc->cmp_lock); } struct cxgbei_cmp * cxgbei_find_cmp(struct icl_cxgbei_conn *icc, uint32_t tt) { struct cxgbei_cmp *cmp; mtx_lock(&icc->cmp_lock); LIST_FOREACH(cmp, &icc->cmp_table[TT_HASH(icc, tt)], link) { if (cmp->tt == tt) break; } mtx_unlock(&icc->cmp_lock); return (cmp); } static void cxgbei_rm_cmp(struct icl_cxgbei_conn *icc, struct cxgbei_cmp *cmp) { #ifdef INVARIANTS struct cxgbei_cmp *cmp2; #endif mtx_lock(&icc->cmp_lock); #ifdef INVARIANTS LIST_FOREACH(cmp2, &icc->cmp_table[TT_HASH(icc, cmp->tt)], link) { if (cmp2 == cmp) goto found; } panic("%s: could not find cmp", __func__); found: #endif LIST_REMOVE(cmp, link); mtx_unlock(&icc->cmp_lock); } int icl_cxgbei_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip, struct ccb_scsiio *csio, uint32_t *ittp, void **arg) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct toepcb *toep = icc->toep; struct adapter *sc = icc->sc; struct cxgbei_data *ci = sc->iscsi_ulp_softc; struct ppod_region *pr = &ci->pr; struct cxgbei_ddp_state *ddp; struct ppod_reservation *prsv; struct inpcb *inp; struct mbufq mq; uint32_t itt; int rc = 0; ICL_CONN_LOCK_ASSERT(ic); /* This is for the offload driver's state. Must not be set already. */ MPASS(arg != NULL); MPASS(*arg == NULL); if (ic->ic_disconnecting || ic->ic_socket == NULL) return (ECONNRESET); if ((csio->ccb_h.flags & CAM_DIR_MASK) != CAM_DIR_IN || csio->dxfer_len < ci->ddp_threshold) { no_ddp: /* * No DDP for this I/O. Allocate an ITT (based on the one * passed in) that cannot be a valid hardware DDP tag in the * iSCSI region. */ itt = *ittp & M_PPOD_TAG; itt = V_PPOD_TAG(itt) | pr->pr_invalid_bit; *ittp = htobe32(itt); MPASS(*arg == NULL); /* State is maintained for DDP only. */ if (rc != 0) counter_u64_add( toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1); return (0); } /* * Reserve resources for DDP, update the itt that should be used in the * PDU, and save DDP specific state for this I/O in *arg. */ ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO); if (ddp == NULL) { rc = ENOMEM; goto no_ddp; } prsv = &ddp->prsv; /* XXX add support for all CAM_DATA_ types */ MPASS((csio->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_VADDR); rc = t4_alloc_page_pods_for_buf(pr, (vm_offset_t)csio->data_ptr, csio->dxfer_len, prsv); if (rc != 0) { free(ddp, M_CXGBEI); goto no_ddp; } mbufq_init(&mq, INT_MAX); rc = t4_write_page_pods_for_buf(sc, toep, prsv, (vm_offset_t)csio->data_ptr, csio->dxfer_len, &mq); if (__predict_false(rc != 0)) { mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); goto no_ddp; } /* * Do not get inp from toep->inp as the toepcb might have * detached already. */ inp = sotoinpcb(ic->ic_socket); INP_WLOCK(inp); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) { INP_WUNLOCK(inp); mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); return (ECONNRESET); } mbufq_concat(&toep->ulp_pduq, &mq); INP_WUNLOCK(inp); ddp->cmp.last_datasn = -1; cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag); *ittp = htobe32(prsv->prsv_tag); *arg = prsv; counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1); return (0); } void icl_cxgbei_conn_task_done(struct icl_conn *ic, void *arg) { if (arg != NULL) { struct cxgbei_ddp_state *ddp = arg; cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp); t4_free_page_pods(&ddp->prsv); free(ddp, M_CXGBEI); } } static inline bool ddp_sgl_check(struct ctl_sg_entry *sg, int entries, int xferlen) { int total_len = 0; MPASS(entries > 0); if (((vm_offset_t)sg[--entries].addr & 3U) != 0) return (false); total_len += sg[entries].len; while (--entries >= 0) { if (((vm_offset_t)sg[entries].addr & PAGE_MASK) != 0 || (sg[entries].len % PAGE_SIZE) != 0) return (false); total_len += sg[entries].len; } MPASS(total_len == xferlen); return (true); } /* XXXNP: PDU should be passed in as parameter, like on the initiator. */ #define io_to_request_pdu(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND].ptr) #define io_to_ddp_state(io) ((io)->io_hdr.ctl_private[CTL_PRIV_FRONTEND2].ptr) int icl_cxgbei_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io, uint32_t *tttp, void **arg) { struct icl_cxgbei_conn *icc = ic_to_icc(ic); struct toepcb *toep = icc->toep; struct ctl_scsiio *ctsio = &io->scsiio; struct adapter *sc = icc->sc; struct cxgbei_data *ci = sc->iscsi_ulp_softc; struct ppod_region *pr = &ci->pr; struct cxgbei_ddp_state *ddp; struct ppod_reservation *prsv; struct ctl_sg_entry *sgl, sg_entry; struct inpcb *inp; struct mbufq mq; int sg_entries = ctsio->kern_sg_entries; uint32_t ttt; int xferlen, rc = 0, alias; /* This is for the offload driver's state. Must not be set already. */ MPASS(arg != NULL); MPASS(*arg == NULL); if (ctsio->ext_data_filled == 0) { int first_burst; struct icl_pdu *ip = io_to_request_pdu(io); #ifdef INVARIANTS struct icl_cxgbei_pdu *icp = ip_to_icp(ip); MPASS(icp->icp_signature == CXGBEI_PDU_SIGNATURE); MPASS(ic == ip->ip_conn); MPASS(ip->ip_bhs_mbuf != NULL); #endif first_burst = icl_pdu_data_segment_length(ip); /* * Note that ICL calls conn_transfer_setup even if the first * burst had everything and there's nothing left to transfer. * * NB: The CTL frontend might have provided a buffer * whose length (kern_data_len) is smaller than the * FirstBurstLength of unsolicited data. Treat those * as an empty transfer. */ xferlen = ctsio->kern_data_len; if (xferlen < first_burst || xferlen - first_burst < ci->ddp_threshold) { no_ddp: /* * No DDP for this transfer. Allocate a TTT (based on * the one passed in) that cannot be a valid hardware * DDP tag in the iSCSI region. */ ttt = *tttp & M_PPOD_TAG; ttt = V_PPOD_TAG(ttt) | pr->pr_invalid_bit; *tttp = htobe32(ttt); MPASS(io_to_ddp_state(io) == NULL); if (rc != 0) counter_u64_add( toep->ofld_rxq->rx_iscsi_ddp_setup_error, 1); return (0); } if (sg_entries == 0) { sgl = &sg_entry; sgl->len = xferlen; sgl->addr = (void *)ctsio->kern_data_ptr; sg_entries = 1; } else sgl = (void *)ctsio->kern_data_ptr; if (!ddp_sgl_check(sgl, sg_entries, xferlen)) goto no_ddp; /* * Reserve resources for DDP, update the ttt that should be used * in the PDU, and save DDP specific state for this I/O. */ MPASS(io_to_ddp_state(io) == NULL); ddp = malloc(sizeof(*ddp), M_CXGBEI, M_NOWAIT | M_ZERO); if (ddp == NULL) { rc = ENOMEM; goto no_ddp; } prsv = &ddp->prsv; rc = t4_alloc_page_pods_for_sgl(pr, sgl, sg_entries, prsv); if (rc != 0) { free(ddp, M_CXGBEI); goto no_ddp; } mbufq_init(&mq, INT_MAX); rc = t4_write_page_pods_for_sgl(sc, toep, prsv, sgl, sg_entries, xferlen, &mq); if (__predict_false(rc != 0)) { mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); goto no_ddp; } /* * Do not get inp from toep->inp as the toepcb might * have detached already. */ ICL_CONN_LOCK(ic); if (ic->ic_disconnecting || ic->ic_socket == NULL) { ICL_CONN_UNLOCK(ic); mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); return (ECONNRESET); } inp = sotoinpcb(ic->ic_socket); INP_WLOCK(inp); ICL_CONN_UNLOCK(ic); if ((inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) != 0) { INP_WUNLOCK(inp); mbufq_drain(&mq); t4_free_page_pods(prsv); free(ddp, M_CXGBEI); return (ECONNRESET); } mbufq_concat(&toep->ulp_pduq, &mq); INP_WUNLOCK(inp); ddp->cmp.next_buffer_offset = ctsio->kern_rel_offset + first_burst; ddp->cmp.last_datasn = -1; cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag); *tttp = htobe32(prsv->prsv_tag); io_to_ddp_state(io) = ddp; *arg = ctsio; counter_u64_add(toep->ofld_rxq->rx_iscsi_ddp_setup_ok, 1); return (0); } /* * In the middle of an I/O. A non-NULL page pod reservation indicates * that a DDP buffer is being used for the I/O. */ ddp = io_to_ddp_state(ctsio); if (ddp == NULL) goto no_ddp; prsv = &ddp->prsv; alias = (prsv->prsv_tag & pr->pr_alias_mask) >> pr->pr_alias_shift; alias++; prsv->prsv_tag &= ~pr->pr_alias_mask; prsv->prsv_tag |= alias << pr->pr_alias_shift & pr->pr_alias_mask; ddp->cmp.last_datasn = -1; cxgbei_insert_cmp(icc, &ddp->cmp, prsv->prsv_tag); *tttp = htobe32(prsv->prsv_tag); *arg = ctsio; return (0); } void icl_cxgbei_conn_transfer_done(struct icl_conn *ic, void *arg) { struct ctl_scsiio *ctsio = arg; if (ctsio != NULL) { struct cxgbei_ddp_state *ddp; ddp = io_to_ddp_state(ctsio); MPASS(ddp != NULL); cxgbei_rm_cmp(ic_to_icc(ic), &ddp->cmp); if (ctsio->kern_data_len == ctsio->ext_data_filled || ic->ic_disconnecting) { t4_free_page_pods(&ddp->prsv); free(ddp, M_CXGBEI); io_to_ddp_state(ctsio) = NULL; } } } static void cxgbei_limits(struct adapter *sc, void *arg) { struct icl_drv_limits *idl = arg; struct cxgbei_data *ci; int max_dsl; if (begin_synchronized_op(sc, NULL, HOLD_LOCK, "t4lims") != 0) return; if (uld_active(sc, ULD_ISCSI)) { ci = sc->iscsi_ulp_softc; MPASS(ci != NULL); max_dsl = ci->max_rx_data_len; if (idl->idl_max_recv_data_segment_length > max_dsl) idl->idl_max_recv_data_segment_length = max_dsl; max_dsl = ci->max_tx_data_len; if (idl->idl_max_send_data_segment_length > max_dsl) idl->idl_max_send_data_segment_length = max_dsl; } end_synchronized_op(sc, LOCK_HELD); } static int icl_cxgbei_limits(struct icl_drv_limits *idl) { /* Maximum allowed by the RFC. cxgbei_limits will clip them. */ idl->idl_max_recv_data_segment_length = (1 << 24) - 1; idl->idl_max_send_data_segment_length = (1 << 24) - 1; /* These are somewhat arbitrary. */ idl->idl_max_burst_length = max_burst_length; idl->idl_first_burst_length = first_burst_length; t4_iterate(cxgbei_limits, idl); return (0); } int icl_cxgbei_mod_load(void) { int rc; refcount_init(&icl_cxgbei_ncons, 0); rc = icl_register("cxgbei", false, -100, icl_cxgbei_limits, icl_cxgbei_new_conn); return (rc); } int icl_cxgbei_mod_unload(void) { if (icl_cxgbei_ncons != 0) return (EBUSY); icl_unregister("cxgbei", false); return (0); } #endif diff --git a/sys/dev/iscsi/icl.h b/sys/dev/iscsi/icl.h index 07dcbbf2a0b5..edd43a45ba2e 100644 --- a/sys/dev/iscsi/icl.h +++ b/sys/dev/iscsi/icl.h @@ -1,167 +1,172 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef ICL_H #define ICL_H /* * iSCSI Common Layer. It's used by both the initiator and target to send * and receive iSCSI PDUs. */ #include #include #include #include SYSCTL_DECL(_kern_icl); extern int icl_debug; #define ICL_DEBUG(X, ...) \ do { \ if (icl_debug > 1) \ printf("%s: " X "\n", __func__, ## __VA_ARGS__);\ } while (0) #define ICL_WARN(X, ...) \ do { \ if (icl_debug > 0) { \ printf("WARNING: %s: " X "\n", \ __func__, ## __VA_ARGS__); \ } \ } while (0) struct icl_conn; struct ccb_scsiio; union ctl_io; struct icl_pdu { STAILQ_ENTRY(icl_pdu) ip_next; struct icl_conn *ip_conn; struct iscsi_bhs *ip_bhs; struct mbuf *ip_bhs_mbuf; size_t ip_ahs_len; struct mbuf *ip_ahs_mbuf; size_t ip_data_len; struct mbuf *ip_data_mbuf; /* * When a "large" received PDU represents multiple on-the-wire * PDUs, this is the count of additional on-the-wire PDUs. * For PDUs that match on-the-wire PDUs, this should be set to * zero. */ u_int ip_additional_pdus; /* * User (initiator or provider) private fields. */ void *ip_prv0; void *ip_prv1; }; #define ICL_NOCOPY (1 << 30) struct icl_conn { KOBJ_FIELDS; struct mtx *ic_lock; struct socket *ic_socket; #ifdef DIAGNOSTIC volatile u_int ic_outstanding_pdus; #endif uint32_t ic_max_recv_data_segment_length; uint32_t ic_max_send_data_segment_length; size_t ic_hw_isomax; size_t ic_maxtags; bool ic_header_crc32c; bool ic_data_crc32c; bool ic_disconnecting; bool ic_iser; bool ic_unmapped; const char *ic_name; const char *ic_offload; void (*ic_receive)(struct icl_pdu *); void (*ic_error)(struct icl_conn *); /* * User (initiator or provider) private fields. */ void *ic_prv0; }; +#define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) +#define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) +#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) +#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) + struct icl_drv_limits { int idl_max_recv_data_segment_length; int idl_max_send_data_segment_length; int idl_max_burst_length; int idl_first_burst_length; int spare[4]; }; typedef void (*icl_pdu_cb)(struct icl_pdu *, int error); struct icl_conn *icl_new_conn(const char *offload, bool iser, const char *name, struct mtx *lock); int icl_limits(const char *offload, bool iser, struct icl_drv_limits *idl); int icl_register(const char *offload, bool iser, int priority, int (*limits)(struct icl_drv_limits *), struct icl_conn *(*new_conn)(const char *, struct mtx *)); int icl_unregister(const char *offload, bool rdma); #ifdef ICL_KERNEL_PROXY struct sockaddr; struct icl_listen; /* * Target part. */ struct icl_listen *icl_listen_new(void (*accept_cb)(struct socket *, struct sockaddr *, int)); void icl_listen_free(struct icl_listen *il); int icl_listen_add(struct icl_listen *il, bool rdma, int domain, int socktype, int protocol, struct sockaddr *sa, int portal_id); int icl_listen_remove(struct icl_listen *il, struct sockaddr *sa); /* * Those two are not a public API; only to be used between icl_soft.c * and icl_soft_proxy.c. */ int icl_soft_handoff_sock(struct icl_conn *ic, struct socket *so); int icl_soft_proxy_connect(struct icl_conn *ic, int domain, int socktype, int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa); #endif /* ICL_KERNEL_PROXY */ #endif /* !ICL_H */ diff --git a/sys/dev/iscsi/icl_soft.c b/sys/dev/iscsi/icl_soft.c index 95c4c87dd6f3..37f3911204c4 100644 --- a/sys/dev/iscsi/icl_soft.c +++ b/sys/dev/iscsi/icl_soft.c @@ -1,1599 +1,1594 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2012 The FreeBSD Foundation * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Software implementation of iSCSI Common Layer kobj(9) interface. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ICL_CONN_STATE_BHS 1 #define ICL_CONN_STATE_AHS 2 #define ICL_CONN_STATE_HEADER_DIGEST 3 #define ICL_CONN_STATE_DATA 4 #define ICL_CONN_STATE_DATA_DIGEST 5 struct icl_soft_conn { struct icl_conn ic; /* soft specific stuff goes here. */ STAILQ_HEAD(, icl_pdu) to_send; struct cv send_cv; struct cv receive_cv; struct icl_pdu *receive_pdu; size_t receive_len; int receive_state; bool receive_running; bool check_send_space; bool send_running; }; struct icl_soft_pdu { struct icl_pdu ip; /* soft specific stuff goes here. */ u_int ref_cnt; icl_pdu_cb cb; int error; }; SYSCTL_NODE(_kern_icl, OID_AUTO, soft, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Software iSCSI"); static int coalesce = 1; SYSCTL_INT(_kern_icl_soft, OID_AUTO, coalesce, CTLFLAG_RWTUN, &coalesce, 0, "Try to coalesce PDUs before sending"); static int partial_receive_len = 256 * 1024; SYSCTL_INT(_kern_icl_soft, OID_AUTO, partial_receive_len, CTLFLAG_RWTUN, &partial_receive_len, 0, "Minimum read size for partially received " "data segment"); static int max_data_segment_length = 256 * 1024; SYSCTL_INT(_kern_icl_soft, OID_AUTO, max_data_segment_length, CTLFLAG_RWTUN, &max_data_segment_length, 0, "Maximum data segment length"); static int first_burst_length = 1024 * 1024; SYSCTL_INT(_kern_icl_soft, OID_AUTO, first_burst_length, CTLFLAG_RWTUN, &first_burst_length, 0, "First burst length"); static int max_burst_length = 1024 * 1024; SYSCTL_INT(_kern_icl_soft, OID_AUTO, max_burst_length, CTLFLAG_RWTUN, &max_burst_length, 0, "Maximum burst length"); static int sendspace = 1536 * 1024; SYSCTL_INT(_kern_icl_soft, OID_AUTO, sendspace, CTLFLAG_RWTUN, &sendspace, 0, "Default send socket buffer size"); static int recvspace = 1536 * 1024; SYSCTL_INT(_kern_icl_soft, OID_AUTO, recvspace, CTLFLAG_RWTUN, &recvspace, 0, "Default receive socket buffer size"); static MALLOC_DEFINE(M_ICL_SOFT, "icl_soft", "iSCSI software backend"); static uma_zone_t icl_soft_pdu_zone; static volatile u_int icl_ncons; -#define ICL_CONN_LOCK(X) mtx_lock(X->ic_lock) -#define ICL_CONN_UNLOCK(X) mtx_unlock(X->ic_lock) -#define ICL_CONN_LOCK_ASSERT(X) mtx_assert(X->ic_lock, MA_OWNED) -#define ICL_CONN_LOCK_ASSERT_NOT(X) mtx_assert(X->ic_lock, MA_NOTOWNED) - STAILQ_HEAD(icl_pdu_stailq, icl_pdu); static icl_conn_new_pdu_t icl_soft_conn_new_pdu; static icl_conn_pdu_free_t icl_soft_conn_pdu_free; static icl_conn_pdu_data_segment_length_t icl_soft_conn_pdu_data_segment_length; static icl_conn_pdu_append_data_t icl_soft_conn_pdu_append_data; static icl_conn_pdu_get_data_t icl_soft_conn_pdu_get_data; static icl_conn_pdu_queue_t icl_soft_conn_pdu_queue; static icl_conn_pdu_queue_cb_t icl_soft_conn_pdu_queue_cb; static icl_conn_handoff_t icl_soft_conn_handoff; static icl_conn_free_t icl_soft_conn_free; static icl_conn_close_t icl_soft_conn_close; static icl_conn_task_setup_t icl_soft_conn_task_setup; static icl_conn_task_done_t icl_soft_conn_task_done; static icl_conn_transfer_setup_t icl_soft_conn_transfer_setup; static icl_conn_transfer_done_t icl_soft_conn_transfer_done; #ifdef ICL_KERNEL_PROXY static icl_conn_connect_t icl_soft_conn_connect; #endif static kobj_method_t icl_soft_methods[] = { KOBJMETHOD(icl_conn_new_pdu, icl_soft_conn_new_pdu), KOBJMETHOD(icl_conn_pdu_free, icl_soft_conn_pdu_free), KOBJMETHOD(icl_conn_pdu_data_segment_length, icl_soft_conn_pdu_data_segment_length), KOBJMETHOD(icl_conn_pdu_append_data, icl_soft_conn_pdu_append_data), KOBJMETHOD(icl_conn_pdu_get_data, icl_soft_conn_pdu_get_data), KOBJMETHOD(icl_conn_pdu_queue, icl_soft_conn_pdu_queue), KOBJMETHOD(icl_conn_pdu_queue_cb, icl_soft_conn_pdu_queue_cb), KOBJMETHOD(icl_conn_handoff, icl_soft_conn_handoff), KOBJMETHOD(icl_conn_free, icl_soft_conn_free), KOBJMETHOD(icl_conn_close, icl_soft_conn_close), KOBJMETHOD(icl_conn_task_setup, icl_soft_conn_task_setup), KOBJMETHOD(icl_conn_task_done, icl_soft_conn_task_done), KOBJMETHOD(icl_conn_transfer_setup, icl_soft_conn_transfer_setup), KOBJMETHOD(icl_conn_transfer_done, icl_soft_conn_transfer_done), #ifdef ICL_KERNEL_PROXY KOBJMETHOD(icl_conn_connect, icl_soft_conn_connect), #endif { 0, 0 } }; DEFINE_CLASS(icl_soft, icl_soft_methods, sizeof(struct icl_soft_conn)); static void icl_conn_fail(struct icl_conn *ic) { if (ic->ic_socket == NULL) return; /* * XXX */ ic->ic_socket->so_error = EDOOFUS; (ic->ic_error)(ic); } static void icl_soft_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip) { struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip; KASSERT(isp->ref_cnt == 0, ("freeing active PDU")); m_freem(ip->ip_bhs_mbuf); m_freem(ip->ip_ahs_mbuf); m_freem(ip->ip_data_mbuf); uma_zfree(icl_soft_pdu_zone, isp); #ifdef DIAGNOSTIC refcount_release(&ic->ic_outstanding_pdus); #endif } static void icl_soft_pdu_call_cb(struct icl_pdu *ip) { struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip; if (isp->cb != NULL) isp->cb(ip, isp->error); #ifdef DIAGNOSTIC refcount_release(&ip->ip_conn->ic_outstanding_pdus); #endif uma_zfree(icl_soft_pdu_zone, isp); } static void icl_soft_pdu_done(struct icl_pdu *ip, int error) { struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip; if (error != 0) isp->error = error; m_freem(ip->ip_bhs_mbuf); ip->ip_bhs_mbuf = NULL; m_freem(ip->ip_ahs_mbuf); ip->ip_ahs_mbuf = NULL; m_freem(ip->ip_data_mbuf); ip->ip_data_mbuf = NULL; if (atomic_fetchadd_int(&isp->ref_cnt, -1) == 1) icl_soft_pdu_call_cb(ip); } static void icl_soft_mbuf_done(struct mbuf *mb) { struct icl_soft_pdu *isp = (struct icl_soft_pdu *)mb->m_ext.ext_arg1; icl_soft_pdu_call_cb(&isp->ip); } /* * Allocate icl_pdu with empty BHS to fill up by the caller. */ struct icl_pdu * icl_soft_conn_new_pdu(struct icl_conn *ic, int flags) { struct icl_soft_pdu *isp; struct icl_pdu *ip; #ifdef DIAGNOSTIC refcount_acquire(&ic->ic_outstanding_pdus); #endif isp = uma_zalloc(icl_soft_pdu_zone, flags | M_ZERO); if (isp == NULL) { ICL_WARN("failed to allocate soft PDU"); #ifdef DIAGNOSTIC refcount_release(&ic->ic_outstanding_pdus); #endif return (NULL); } ip = &isp->ip; ip->ip_conn = ic; CTASSERT(sizeof(struct iscsi_bhs) <= MHLEN); ip->ip_bhs_mbuf = m_gethdr(flags, MT_DATA); if (ip->ip_bhs_mbuf == NULL) { ICL_WARN("failed to allocate BHS mbuf"); icl_soft_conn_pdu_free(ic, ip); return (NULL); } ip->ip_bhs = mtod(ip->ip_bhs_mbuf, struct iscsi_bhs *); memset(ip->ip_bhs, 0, sizeof(struct iscsi_bhs)); ip->ip_bhs_mbuf->m_len = sizeof(struct iscsi_bhs); return (ip); } static int icl_pdu_ahs_length(const struct icl_pdu *request) { return (request->ip_bhs->bhs_total_ahs_len * 4); } static size_t icl_pdu_data_segment_length(const struct icl_pdu *request) { uint32_t len = 0; len += request->ip_bhs->bhs_data_segment_len[0]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[1]; len <<= 8; len += request->ip_bhs->bhs_data_segment_len[2]; return (len); } size_t icl_soft_conn_pdu_data_segment_length(struct icl_conn *ic, const struct icl_pdu *request) { return (icl_pdu_data_segment_length(request)); } static void icl_pdu_set_data_segment_length(struct icl_pdu *response, uint32_t len) { response->ip_bhs->bhs_data_segment_len[2] = len; response->ip_bhs->bhs_data_segment_len[1] = len >> 8; response->ip_bhs->bhs_data_segment_len[0] = len >> 16; } static size_t icl_pdu_padding(const struct icl_pdu *ip) { if ((ip->ip_data_len % 4) != 0) return (4 - (ip->ip_data_len % 4)); return (0); } static size_t icl_pdu_size(const struct icl_pdu *response) { size_t len; KASSERT(response->ip_ahs_len == 0, ("responding with AHS")); len = sizeof(struct iscsi_bhs) + response->ip_data_len + icl_pdu_padding(response); if (response->ip_conn->ic_header_crc32c) len += ISCSI_HEADER_DIGEST_SIZE; if (response->ip_data_len != 0 && response->ip_conn->ic_data_crc32c) len += ISCSI_DATA_DIGEST_SIZE; return (len); } static void icl_soft_receive_buf(struct mbuf **r, size_t *rs, void *buf, size_t s) { m_copydata(*r, 0, s, buf); m_adj(*r, s); while ((*r) != NULL && (*r)->m_len == 0) *r = m_free(*r); *rs -= s; } static void icl_pdu_receive_ahs(struct icl_pdu *request, struct mbuf **r, size_t *rs) { request->ip_ahs_len = icl_pdu_ahs_length(request); if (request->ip_ahs_len == 0) return; request->ip_ahs_mbuf = *r; *r = m_split(request->ip_ahs_mbuf, request->ip_ahs_len, M_WAITOK); *rs -= request->ip_ahs_len; } static uint32_t icl_mbuf_to_crc32c(const struct mbuf *m0) { uint32_t digest = 0xffffffff; const struct mbuf *m; for (m = m0; m != NULL; m = m->m_next) digest = calculate_crc32c(digest, mtod(m, const void *), m->m_len); digest = digest ^ 0xffffffff; return (digest); } static int icl_pdu_check_header_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs) { uint32_t received_digest, valid_digest; if (request->ip_conn->ic_header_crc32c == false) return (0); CTASSERT(sizeof(received_digest) == ISCSI_HEADER_DIGEST_SIZE); icl_soft_receive_buf(r, rs, &received_digest, ISCSI_HEADER_DIGEST_SIZE); /* Temporary attach AHS to BHS to calculate header digest. */ request->ip_bhs_mbuf->m_next = request->ip_ahs_mbuf; valid_digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); request->ip_bhs_mbuf->m_next = NULL; if (received_digest != valid_digest) { ICL_WARN("header digest check failed; got 0x%x, " "should be 0x%x", received_digest, valid_digest); return (-1); } return (0); } /* * Return the number of bytes that should be waiting in the receive socket * before icl_pdu_receive_data_segment() gets called. */ static size_t icl_pdu_data_segment_receive_len(const struct icl_pdu *request) { size_t len; len = icl_pdu_data_segment_length(request); if (len == 0) return (0); /* * Account for the parts of data segment already read from * the socket buffer. */ KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); len -= request->ip_data_len; /* * Don't always wait for the full data segment to be delivered * to the socket; this might badly affect performance due to * TCP window scaling. */ if (len > partial_receive_len) { #if 0 ICL_DEBUG("need %zd bytes of data, limiting to %zd", len, partial_receive_len)); #endif len = partial_receive_len; return (len); } /* * Account for padding. Note that due to the way code is written, * the icl_pdu_receive_data_segment() must always receive padding * along with the last part of data segment, because it would be * impossible to tell whether we've already received the full data * segment including padding, or without it. */ if ((len % 4) != 0) len += 4 - (len % 4); #if 0 ICL_DEBUG("need %zd bytes of data", len)); #endif return (len); } static int icl_pdu_receive_data_segment(struct icl_pdu *request, struct mbuf **r, size_t *rs, bool *more_neededp) { struct icl_soft_conn *isc; size_t len, padding = 0; struct mbuf *m; isc = (struct icl_soft_conn *)request->ip_conn; *more_neededp = false; isc->receive_len = 0; len = icl_pdu_data_segment_length(request); if (len == 0) return (0); if ((len % 4) != 0) padding = 4 - (len % 4); /* * Account for already received parts of data segment. */ KASSERT(len > request->ip_data_len, ("len <= request->ip_data_len")); len -= request->ip_data_len; if (len + padding > *rs) { /* * Not enough data in the socket buffer. Receive as much * as we can. Don't receive padding, since, obviously, it's * not the end of data segment yet. */ #if 0 ICL_DEBUG("limited from %zd to %zd", len + padding, *rs - padding)); #endif len = *rs - padding; *more_neededp = true; padding = 0; } /* * Must not try to receive padding without at least one byte * of actual data segment. */ if (len > 0) { m = *r; *r = m_split(m, len + padding, M_WAITOK); *rs -= len + padding; if (request->ip_data_mbuf == NULL) request->ip_data_mbuf = m; else m_cat(request->ip_data_mbuf, m); request->ip_data_len += len; } else ICL_DEBUG("len 0"); if (*more_neededp) isc->receive_len = icl_pdu_data_segment_receive_len(request); return (0); } static int icl_pdu_check_data_digest(struct icl_pdu *request, struct mbuf **r, size_t *rs) { uint32_t received_digest, valid_digest; if (request->ip_conn->ic_data_crc32c == false) return (0); if (request->ip_data_len == 0) return (0); CTASSERT(sizeof(received_digest) == ISCSI_DATA_DIGEST_SIZE); icl_soft_receive_buf(r, rs, &received_digest, ISCSI_DATA_DIGEST_SIZE); /* * Note that ip_data_mbuf also contains padding; since digest * calculation is supposed to include that, we iterate over * the entire ip_data_mbuf chain, not just ip_data_len bytes of it. */ valid_digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); if (received_digest != valid_digest) { ICL_WARN("data digest check failed; got 0x%x, " "should be 0x%x", received_digest, valid_digest); return (-1); } return (0); } /* * Somewhat contrary to the name, this attempts to receive only one * "part" of PDU at a time; call it repeatedly until it returns non-NULL. */ static struct icl_pdu * icl_conn_receive_pdu(struct icl_soft_conn *isc, struct mbuf **r, size_t *rs) { struct icl_conn *ic = &isc->ic; struct icl_pdu *request; size_t len; int error = 0; bool more_needed; if (isc->receive_state == ICL_CONN_STATE_BHS) { KASSERT(isc->receive_pdu == NULL, ("isc->receive_pdu != NULL")); request = icl_soft_conn_new_pdu(ic, M_NOWAIT); if (request == NULL) { ICL_DEBUG("failed to allocate PDU; " "dropping connection"); icl_conn_fail(ic); return (NULL); } isc->receive_pdu = request; } else { KASSERT(isc->receive_pdu != NULL, ("isc->receive_pdu == NULL")); request = isc->receive_pdu; } switch (isc->receive_state) { case ICL_CONN_STATE_BHS: //ICL_DEBUG("receiving BHS"); icl_soft_receive_buf(r, rs, request->ip_bhs, sizeof(struct iscsi_bhs)); /* * We don't enforce any limit for AHS length; * its length is stored in 8 bit field. */ len = icl_pdu_data_segment_length(request); if (len > ic->ic_max_recv_data_segment_length) { ICL_WARN("received data segment " "length %zd is larger than negotiated; " "dropping connection", len); error = EINVAL; break; } isc->receive_state = ICL_CONN_STATE_AHS; isc->receive_len = icl_pdu_ahs_length(request); break; case ICL_CONN_STATE_AHS: //ICL_DEBUG("receiving AHS"); icl_pdu_receive_ahs(request, r, rs); isc->receive_state = ICL_CONN_STATE_HEADER_DIGEST; if (ic->ic_header_crc32c == false) isc->receive_len = 0; else isc->receive_len = ISCSI_HEADER_DIGEST_SIZE; break; case ICL_CONN_STATE_HEADER_DIGEST: //ICL_DEBUG("receiving header digest"); error = icl_pdu_check_header_digest(request, r, rs); if (error != 0) { ICL_DEBUG("header digest failed; " "dropping connection"); break; } isc->receive_state = ICL_CONN_STATE_DATA; isc->receive_len = icl_pdu_data_segment_receive_len(request); break; case ICL_CONN_STATE_DATA: //ICL_DEBUG("receiving data segment"); error = icl_pdu_receive_data_segment(request, r, rs, &more_needed); if (error != 0) { ICL_DEBUG("failed to receive data segment;" "dropping connection"); break; } if (more_needed) break; isc->receive_state = ICL_CONN_STATE_DATA_DIGEST; if (request->ip_data_len == 0 || ic->ic_data_crc32c == false) isc->receive_len = 0; else isc->receive_len = ISCSI_DATA_DIGEST_SIZE; break; case ICL_CONN_STATE_DATA_DIGEST: //ICL_DEBUG("receiving data digest"); error = icl_pdu_check_data_digest(request, r, rs); if (error != 0) { ICL_DEBUG("data digest failed; " "dropping connection"); break; } /* * We've received complete PDU; reset the receive state machine * and return the PDU. */ isc->receive_state = ICL_CONN_STATE_BHS; isc->receive_len = sizeof(struct iscsi_bhs); isc->receive_pdu = NULL; return (request); default: panic("invalid receive_state %d\n", isc->receive_state); } if (error != 0) { /* * Don't free the PDU; it's pointed to by isc->receive_pdu * and will get freed in icl_soft_conn_close(). */ icl_conn_fail(ic); } return (NULL); } static void icl_conn_receive_pdus(struct icl_soft_conn *isc, struct mbuf **r, size_t *rs) { struct icl_conn *ic = &isc->ic; struct icl_pdu *response; for (;;) { if (ic->ic_disconnecting) return; /* * Loop until we have a complete PDU or there is not enough * data in the socket buffer. */ if (*rs < isc->receive_len) { #if 0 ICL_DEBUG("not enough data; have %zd, need %zd", *rs, isc->receive_len); #endif return; } response = icl_conn_receive_pdu(isc, r, rs); if (response == NULL) continue; if (response->ip_ahs_len > 0) { ICL_WARN("received PDU with unsupported " "AHS; opcode 0x%x; dropping connection", response->ip_bhs->bhs_opcode); icl_soft_conn_pdu_free(ic, response); icl_conn_fail(ic); return; } (ic->ic_receive)(response); } } static void icl_receive_thread(void *arg) { struct icl_soft_conn *isc = arg; struct icl_conn *ic = &isc->ic; size_t available, read = 0; struct socket *so; struct mbuf *m, *r = NULL; struct uio uio; int error, flags; so = ic->ic_socket; for (;;) { SOCKBUF_LOCK(&so->so_rcv); if (ic->ic_disconnecting) { SOCKBUF_UNLOCK(&so->so_rcv); break; } /* * Set the low watermark, to be checked by * soreadable() in icl_soupcall_receive() * to avoid unnecessary wakeups until there * is enough data received to read the PDU. */ available = sbavail(&so->so_rcv); if (read + available < isc->receive_len) { so->so_rcv.sb_lowat = isc->receive_len - read; cv_wait(&isc->receive_cv, SOCKBUF_MTX(&so->so_rcv)); so->so_rcv.sb_lowat = so->so_rcv.sb_hiwat + 1; available = sbavail(&so->so_rcv); } SOCKBUF_UNLOCK(&so->so_rcv); if (available == 0) { if (so->so_error != 0) { ICL_DEBUG("connection error %d; " "dropping connection", so->so_error); icl_conn_fail(ic); break; } continue; } memset(&uio, 0, sizeof(uio)); uio.uio_resid = available; flags = MSG_DONTWAIT; error = soreceive(so, NULL, &uio, &m, NULL, &flags); if (error != 0) { ICL_DEBUG("soreceive error %d", error); break; } if (uio.uio_resid != 0) { m_freem(m); ICL_DEBUG("short read"); break; } if (r) m_cat(r, m); else r = m; read += available; icl_conn_receive_pdus(isc, &r, &read); } if (r) m_freem(r); ICL_CONN_LOCK(ic); isc->receive_running = false; cv_signal(&isc->send_cv); ICL_CONN_UNLOCK(ic); kthread_exit(); } static int icl_soupcall_receive(struct socket *so, void *arg, int waitflag) { struct icl_soft_conn *isc; if (!soreadable(so)) return (SU_OK); isc = arg; cv_signal(&isc->receive_cv); return (SU_OK); } static int icl_pdu_finalize(struct icl_pdu *request) { size_t padding, pdu_len; uint32_t digest, zero = 0; int ok; struct icl_conn *ic; ic = request->ip_conn; icl_pdu_set_data_segment_length(request, request->ip_data_len); pdu_len = icl_pdu_size(request); if (ic->ic_header_crc32c) { digest = icl_mbuf_to_crc32c(request->ip_bhs_mbuf); ok = m_append(request->ip_bhs_mbuf, sizeof(digest), (void *)&digest); if (ok != 1) { ICL_WARN("failed to append header digest"); return (1); } } if (request->ip_data_len != 0) { padding = icl_pdu_padding(request); if (padding > 0) { ok = m_append(request->ip_data_mbuf, padding, (void *)&zero); if (ok != 1) { ICL_WARN("failed to append padding"); return (1); } } if (ic->ic_data_crc32c) { digest = icl_mbuf_to_crc32c(request->ip_data_mbuf); ok = m_append(request->ip_data_mbuf, sizeof(digest), (void *)&digest); if (ok != 1) { ICL_WARN("failed to append data digest"); return (1); } } m_cat(request->ip_bhs_mbuf, request->ip_data_mbuf); request->ip_data_mbuf = NULL; } request->ip_bhs_mbuf->m_pkthdr.len = pdu_len; return (0); } static void icl_conn_send_pdus(struct icl_soft_conn *isc, struct icl_pdu_stailq *queue) { struct icl_conn *ic = &isc->ic; struct icl_pdu *request, *request2; struct mbuf *m; struct socket *so; long available, size, size2; int coalesced, error; ICL_CONN_LOCK_ASSERT_NOT(ic); so = ic->ic_socket; SOCKBUF_LOCK(&so->so_snd); /* * Check how much space do we have for transmit. We can't just * call sosend() and retry when we get EWOULDBLOCK or EMSGSIZE, * as it always frees the mbuf chain passed to it, even in case * of error. */ available = sbspace(&so->so_snd); isc->check_send_space = false; /* * Notify the socket upcall that we don't need wakeups * for the time being. */ so->so_snd.sb_lowat = so->so_snd.sb_hiwat + 1; SOCKBUF_UNLOCK(&so->so_snd); while (!STAILQ_EMPTY(queue)) { request = STAILQ_FIRST(queue); size = icl_pdu_size(request); if (available < size) { /* * Set the low watermark, to be checked by * sowriteable() in icl_soupcall_send() * to avoid unnecessary wakeups until there * is enough space for the PDU to fit. */ SOCKBUF_LOCK(&so->so_snd); available = sbspace(&so->so_snd); if (available < size) { #if 1 ICL_DEBUG("no space to send; " "have %ld, need %ld", available, size); #endif so->so_snd.sb_lowat = max(size, so->so_snd.sb_hiwat / 8); SOCKBUF_UNLOCK(&so->so_snd); return; } SOCKBUF_UNLOCK(&so->so_snd); } STAILQ_REMOVE_HEAD(queue, ip_next); error = icl_pdu_finalize(request); if (error != 0) { ICL_DEBUG("failed to finalize PDU; " "dropping connection"); icl_soft_pdu_done(request, EIO); icl_conn_fail(ic); return; } if (coalesce) { m = request->ip_bhs_mbuf; for (coalesced = 1; ; coalesced++) { request2 = STAILQ_FIRST(queue); if (request2 == NULL) break; size2 = icl_pdu_size(request2); if (available < size + size2) break; STAILQ_REMOVE_HEAD(queue, ip_next); error = icl_pdu_finalize(request2); if (error != 0) { ICL_DEBUG("failed to finalize PDU; " "dropping connection"); icl_soft_pdu_done(request, EIO); icl_soft_pdu_done(request2, EIO); icl_conn_fail(ic); return; } while (m->m_next) m = m->m_next; m_cat(m, request2->ip_bhs_mbuf); request2->ip_bhs_mbuf = NULL; request->ip_bhs_mbuf->m_pkthdr.len += size2; size += size2; icl_soft_pdu_done(request2, 0); } #if 0 if (coalesced > 1) { ICL_DEBUG("coalesced %d PDUs into %ld bytes", coalesced, size); } #endif } available -= size; error = sosend(so, NULL, NULL, request->ip_bhs_mbuf, NULL, MSG_DONTWAIT, curthread); request->ip_bhs_mbuf = NULL; /* Sosend consumes the mbuf. */ if (error != 0) { ICL_DEBUG("failed to send PDU, error %d; " "dropping connection", error); icl_soft_pdu_done(request, error); icl_conn_fail(ic); return; } icl_soft_pdu_done(request, 0); } } static void icl_send_thread(void *arg) { struct icl_soft_conn *isc; struct icl_conn *ic; struct icl_pdu_stailq queue; isc = arg; ic = &isc->ic; STAILQ_INIT(&queue); ICL_CONN_LOCK(ic); for (;;) { for (;;) { /* * Populate the local queue from the main one. * This way the icl_conn_send_pdus() can go through * all the queued PDUs without holding any locks. */ if (STAILQ_EMPTY(&queue) || isc->check_send_space) STAILQ_CONCAT(&queue, &isc->to_send); ICL_CONN_UNLOCK(ic); icl_conn_send_pdus(isc, &queue); ICL_CONN_LOCK(ic); /* * The icl_soupcall_send() was called since the last * call to sbspace(); go around; */ if (isc->check_send_space) continue; /* * Local queue is empty, but we still have PDUs * in the main one; go around. */ if (STAILQ_EMPTY(&queue) && !STAILQ_EMPTY(&isc->to_send)) continue; /* * There might be some stuff in the local queue, * which didn't get sent due to not having enough send * space. Wait for socket upcall. */ break; } if (ic->ic_disconnecting) { //ICL_DEBUG("terminating"); break; } cv_wait(&isc->send_cv, ic->ic_lock); } /* * We're exiting; move PDUs back to the main queue, so they can * get freed properly. At this point ordering doesn't matter. */ STAILQ_CONCAT(&isc->to_send, &queue); isc->send_running = false; cv_signal(&isc->send_cv); ICL_CONN_UNLOCK(ic); kthread_exit(); } static int icl_soupcall_send(struct socket *so, void *arg, int waitflag) { struct icl_soft_conn *isc; struct icl_conn *ic; if (!sowriteable(so)) return (SU_OK); isc = arg; ic = &isc->ic; ICL_CONN_LOCK(ic); isc->check_send_space = true; ICL_CONN_UNLOCK(ic); cv_signal(&isc->send_cv); return (SU_OK); } static int icl_soft_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request, const void *addr, size_t len, int flags) { struct icl_soft_pdu *isp = (struct icl_soft_pdu *)request; struct mbuf *mb, *newmb; size_t copylen, off = 0; KASSERT(len > 0, ("len == 0")); if (flags & ICL_NOCOPY) { newmb = m_get(flags & ~ICL_NOCOPY, MT_DATA); if (newmb == NULL) { ICL_WARN("failed to allocate mbuf"); return (ENOMEM); } newmb->m_flags |= M_RDONLY; m_extaddref(newmb, __DECONST(char *, addr), len, &isp->ref_cnt, icl_soft_mbuf_done, isp, NULL); newmb->m_len = len; } else { newmb = m_getm2(NULL, len, flags, MT_DATA, 0); if (newmb == NULL) { ICL_WARN("failed to allocate mbuf for %zd bytes", len); return (ENOMEM); } for (mb = newmb; mb != NULL; mb = mb->m_next) { copylen = min(M_TRAILINGSPACE(mb), len - off); memcpy(mtod(mb, char *), (const char *)addr + off, copylen); mb->m_len = copylen; off += copylen; } KASSERT(off == len, ("%s: off != len", __func__)); } if (request->ip_data_mbuf == NULL) { request->ip_data_mbuf = newmb; request->ip_data_len = len; } else { m_cat(request->ip_data_mbuf, newmb); request->ip_data_len += len; } return (0); } void icl_soft_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip, size_t off, void *addr, size_t len) { m_copydata(ip->ip_data_mbuf, off, len, addr); } static void icl_soft_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip) { icl_soft_conn_pdu_queue_cb(ic, ip, NULL); } static void icl_soft_conn_pdu_queue_cb(struct icl_conn *ic, struct icl_pdu *ip, icl_pdu_cb cb) { struct icl_soft_conn *isc = (struct icl_soft_conn *)ic; struct icl_soft_pdu *isp = (struct icl_soft_pdu *)ip; ICL_CONN_LOCK_ASSERT(ic); isp->ref_cnt++; isp->cb = cb; if (ic->ic_disconnecting || ic->ic_socket == NULL) { ICL_DEBUG("icl_pdu_queue on closed connection"); icl_soft_pdu_done(ip, ENOTCONN); return; } if (!STAILQ_EMPTY(&isc->to_send)) { STAILQ_INSERT_TAIL(&isc->to_send, ip, ip_next); /* * If the queue is not empty, someone else had already * signaled the send thread; no need to do that again, * just return. */ return; } STAILQ_INSERT_TAIL(&isc->to_send, ip, ip_next); cv_signal(&isc->send_cv); } static struct icl_conn * icl_soft_new_conn(const char *name, struct mtx *lock) { struct icl_soft_conn *isc; struct icl_conn *ic; refcount_acquire(&icl_ncons); isc = (struct icl_soft_conn *)kobj_create(&icl_soft_class, M_ICL_SOFT, M_WAITOK | M_ZERO); STAILQ_INIT(&isc->to_send); cv_init(&isc->send_cv, "icl_tx"); cv_init(&isc->receive_cv, "icl_rx"); ic = &isc->ic; ic->ic_lock = lock; #ifdef DIAGNOSTIC refcount_init(&ic->ic_outstanding_pdus, 0); #endif ic->ic_name = name; ic->ic_offload = "None"; ic->ic_unmapped = false; return (ic); } void icl_soft_conn_free(struct icl_conn *ic) { struct icl_soft_conn *isc = (struct icl_soft_conn *)ic; #ifdef DIAGNOSTIC KASSERT(ic->ic_outstanding_pdus == 0, ("destroying session with %d outstanding PDUs", ic->ic_outstanding_pdus)); #endif cv_destroy(&isc->send_cv); cv_destroy(&isc->receive_cv); kobj_delete((struct kobj *)isc, M_ICL_SOFT); refcount_release(&icl_ncons); } static int icl_conn_start(struct icl_conn *ic) { struct icl_soft_conn *isc = (struct icl_soft_conn *)ic; size_t minspace; struct sockopt opt; int error, one = 1; ICL_CONN_LOCK(ic); /* * XXX: Ugly hack. */ if (ic->ic_socket == NULL) { ICL_CONN_UNLOCK(ic); return (EINVAL); } isc->receive_state = ICL_CONN_STATE_BHS; isc->receive_len = sizeof(struct iscsi_bhs); ic->ic_disconnecting = false; ICL_CONN_UNLOCK(ic); /* * For sendspace, this is required because the current code cannot * send a PDU in pieces; thus, the minimum buffer size is equal * to the maximum PDU size. "+4" is to account for possible padding. */ minspace = sizeof(struct iscsi_bhs) + ic->ic_max_send_data_segment_length + ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4; if (sendspace < minspace) { ICL_WARN("kern.icl.sendspace too low; must be at least %zd", minspace); sendspace = minspace; } minspace = sizeof(struct iscsi_bhs) + ic->ic_max_recv_data_segment_length + ISCSI_HEADER_DIGEST_SIZE + ISCSI_DATA_DIGEST_SIZE + 4; if (recvspace < minspace) { ICL_WARN("kern.icl.recvspace too low; must be at least %zd", minspace); recvspace = minspace; } error = soreserve(ic->ic_socket, sendspace, recvspace); if (error != 0) { ICL_WARN("soreserve failed with error %d", error); icl_soft_conn_close(ic); return (error); } ic->ic_socket->so_snd.sb_flags |= SB_AUTOSIZE; ic->ic_socket->so_rcv.sb_flags |= SB_AUTOSIZE; /* * Disable Nagle. */ bzero(&opt, sizeof(opt)); opt.sopt_dir = SOPT_SET; opt.sopt_level = IPPROTO_TCP; opt.sopt_name = TCP_NODELAY; opt.sopt_val = &one; opt.sopt_valsize = sizeof(one); error = sosetopt(ic->ic_socket, &opt); if (error != 0) { ICL_WARN("disabling TCP_NODELAY failed with error %d", error); icl_soft_conn_close(ic); return (error); } /* * Register socket upcall, to get notified about incoming PDUs * and free space to send outgoing ones. */ SOCKBUF_LOCK(&ic->ic_socket->so_snd); soupcall_set(ic->ic_socket, SO_SND, icl_soupcall_send, isc); SOCKBUF_UNLOCK(&ic->ic_socket->so_snd); SOCKBUF_LOCK(&ic->ic_socket->so_rcv); soupcall_set(ic->ic_socket, SO_RCV, icl_soupcall_receive, isc); SOCKBUF_UNLOCK(&ic->ic_socket->so_rcv); /* * Start threads. */ ICL_CONN_LOCK(ic); isc->send_running = isc->receive_running = true; ICL_CONN_UNLOCK(ic); error = kthread_add(icl_send_thread, ic, NULL, NULL, 0, 0, "%stx", ic->ic_name); if (error != 0) { ICL_WARN("kthread_add(9) failed with error %d", error); ICL_CONN_LOCK(ic); isc->send_running = isc->receive_running = false; cv_signal(&isc->send_cv); ICL_CONN_UNLOCK(ic); icl_soft_conn_close(ic); return (error); } error = kthread_add(icl_receive_thread, ic, NULL, NULL, 0, 0, "%srx", ic->ic_name); if (error != 0) { ICL_WARN("kthread_add(9) failed with error %d", error); ICL_CONN_LOCK(ic); isc->receive_running = false; cv_signal(&isc->send_cv); ICL_CONN_UNLOCK(ic); icl_soft_conn_close(ic); return (error); } return (0); } int icl_soft_conn_handoff(struct icl_conn *ic, int fd) { struct file *fp; struct socket *so; cap_rights_t rights; int error; ICL_CONN_LOCK_ASSERT_NOT(ic); #ifdef ICL_KERNEL_PROXY /* * We're transitioning to Full Feature phase, and we don't * really care. */ if (fd == 0) { ICL_CONN_LOCK(ic); if (ic->ic_socket == NULL) { ICL_CONN_UNLOCK(ic); ICL_WARN("proxy handoff without connect"); return (EINVAL); } ICL_CONN_UNLOCK(ic); return (0); } #endif /* * Steal the socket from userland. */ error = fget(curthread, fd, cap_rights_init_one(&rights, CAP_SOCK_CLIENT), &fp); if (error != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, curthread); return (EINVAL); } so = fp->f_data; if (so->so_type != SOCK_STREAM) { fdrop(fp, curthread); return (EINVAL); } ICL_CONN_LOCK(ic); if (ic->ic_socket != NULL) { ICL_CONN_UNLOCK(ic); fdrop(fp, curthread); return (EBUSY); } ic->ic_socket = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; fdrop(fp, curthread); ICL_CONN_UNLOCK(ic); error = icl_conn_start(ic); return (error); } void icl_soft_conn_close(struct icl_conn *ic) { struct icl_soft_conn *isc = (struct icl_soft_conn *)ic; struct icl_pdu *pdu; struct socket *so; /* * Wake up the threads, so they can properly terminate. * Receive thread sleeps on so->so_rcv lock, send on ic->ic_lock. */ ICL_CONN_LOCK(ic); if (!ic->ic_disconnecting) { so = ic->ic_socket; if (so) SOCKBUF_LOCK(&so->so_rcv); ic->ic_disconnecting = true; if (so) SOCKBUF_UNLOCK(&so->so_rcv); } while (isc->receive_running || isc->send_running) { cv_signal(&isc->receive_cv); cv_signal(&isc->send_cv); cv_wait(&isc->send_cv, ic->ic_lock); } /* Some other thread could close the connection same time. */ so = ic->ic_socket; if (so == NULL) { ICL_CONN_UNLOCK(ic); return; } ic->ic_socket = NULL; /* * Deregister socket upcalls. */ ICL_CONN_UNLOCK(ic); SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_upcall != NULL) soupcall_clear(so, SO_SND); SOCKBUF_UNLOCK(&so->so_snd); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_upcall != NULL) soupcall_clear(so, SO_RCV); SOCKBUF_UNLOCK(&so->so_rcv); soclose(so); ICL_CONN_LOCK(ic); if (isc->receive_pdu != NULL) { //ICL_DEBUG("freeing partially received PDU"); icl_soft_conn_pdu_free(ic, isc->receive_pdu); isc->receive_pdu = NULL; } /* * Remove any outstanding PDUs from the send queue. */ while (!STAILQ_EMPTY(&isc->to_send)) { pdu = STAILQ_FIRST(&isc->to_send); STAILQ_REMOVE_HEAD(&isc->to_send, ip_next); icl_soft_pdu_done(pdu, ENOTCONN); } KASSERT(STAILQ_EMPTY(&isc->to_send), ("destroying session with non-empty send queue")); ICL_CONN_UNLOCK(ic); } int icl_soft_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip, struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp) { return (0); } void icl_soft_conn_task_done(struct icl_conn *ic, void *prv) { } int icl_soft_conn_transfer_setup(struct icl_conn *ic, union ctl_io *io, uint32_t *transfer_tag, void **prvp) { return (0); } void icl_soft_conn_transfer_done(struct icl_conn *ic, void *prv) { } static int icl_soft_limits(struct icl_drv_limits *idl) { idl->idl_max_recv_data_segment_length = max_data_segment_length; idl->idl_max_send_data_segment_length = max_data_segment_length; idl->idl_max_burst_length = max_burst_length; idl->idl_first_burst_length = first_burst_length; return (0); } #ifdef ICL_KERNEL_PROXY int icl_soft_conn_connect(struct icl_conn *ic, int domain, int socktype, int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa) { return (icl_soft_proxy_connect(ic, domain, socktype, protocol, from_sa, to_sa)); } int icl_soft_handoff_sock(struct icl_conn *ic, struct socket *so) { int error; ICL_CONN_LOCK_ASSERT_NOT(ic); if (so->so_type != SOCK_STREAM) return (EINVAL); ICL_CONN_LOCK(ic); if (ic->ic_socket != NULL) { ICL_CONN_UNLOCK(ic); return (EBUSY); } ic->ic_socket = so; ICL_CONN_UNLOCK(ic); error = icl_conn_start(ic); return (error); } #endif /* ICL_KERNEL_PROXY */ static int icl_soft_load(void) { int error; icl_soft_pdu_zone = uma_zcreate("icl_soft_pdu", sizeof(struct icl_soft_pdu), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); refcount_init(&icl_ncons, 0); /* * The reason we call this "none" is that to the user, * it's known as "offload driver"; "offload driver: soft" * doesn't make much sense. */ error = icl_register("none", false, 0, icl_soft_limits, icl_soft_new_conn); KASSERT(error == 0, ("failed to register")); #if defined(ICL_KERNEL_PROXY) && 0 /* * Debugging aid for kernel proxy functionality. */ error = icl_register("proxytest", true, 0, icl_soft_limits, icl_soft_new_conn); KASSERT(error == 0, ("failed to register")); #endif return (error); } static int icl_soft_unload(void) { if (icl_ncons != 0) return (EBUSY); icl_unregister("none", false); #if defined(ICL_KERNEL_PROXY) && 0 icl_unregister("proxytest", true); #endif uma_zdestroy(icl_soft_pdu_zone); return (0); } static int icl_soft_modevent(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (icl_soft_load()); case MOD_UNLOAD: return (icl_soft_unload()); default: return (EINVAL); } } moduledata_t icl_soft_data = { "icl_soft", icl_soft_modevent, 0 }; DECLARE_MODULE(icl_soft, icl_soft_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); MODULE_DEPEND(icl_soft, icl, 1, 1, 1); MODULE_VERSION(icl_soft, 1);