Index: projects/bsd_rdma_4_9/sys/dev/iser/icl_iser.c =================================================================== --- projects/bsd_rdma_4_9/sys/dev/iser/icl_iser.c (revision 325890) +++ projects/bsd_rdma_4_9/sys/dev/iser/icl_iser.c (revision 325891) @@ -1,564 +1,564 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "icl_iser.h" SYSCTL_NODE(_kern, OID_AUTO, iser, CTLFLAG_RW, 0, "iSER module"); int iser_debug = 0; SYSCTL_INT(_kern_iser, OID_AUTO, debug, CTLFLAG_RWTUN, &iser_debug, 0, "Enable iser debug messages"); static MALLOC_DEFINE(M_ICL_ISER, "icl_iser", "iSCSI iser backend"); static uma_zone_t icl_pdu_zone; static volatile u_int icl_iser_ncons; struct iser_global ig; static void iser_conn_release(struct icl_conn *ic); static icl_conn_new_pdu_t iser_conn_new_pdu; static icl_conn_pdu_free_t iser_conn_pdu_free; static icl_conn_pdu_data_segment_length_t iser_conn_pdu_data_segment_length; static icl_conn_pdu_append_data_t iser_conn_pdu_append_data; static icl_conn_pdu_queue_t iser_conn_pdu_queue; static icl_conn_handoff_t iser_conn_handoff; static icl_conn_free_t iser_conn_free; static icl_conn_close_t iser_conn_close; static icl_conn_connect_t iser_conn_connect; static icl_conn_task_setup_t iser_conn_task_setup; static icl_conn_task_done_t iser_conn_task_done; static icl_conn_pdu_get_data_t iser_conn_pdu_get_data; static kobj_method_t icl_iser_methods[] = { KOBJMETHOD(icl_conn_new_pdu, iser_conn_new_pdu), KOBJMETHOD(icl_conn_pdu_free, iser_conn_pdu_free), KOBJMETHOD(icl_conn_pdu_data_segment_length, iser_conn_pdu_data_segment_length), KOBJMETHOD(icl_conn_pdu_append_data, iser_conn_pdu_append_data), KOBJMETHOD(icl_conn_pdu_queue, iser_conn_pdu_queue), KOBJMETHOD(icl_conn_handoff, iser_conn_handoff), KOBJMETHOD(icl_conn_free, iser_conn_free), KOBJMETHOD(icl_conn_close, iser_conn_close), KOBJMETHOD(icl_conn_connect, iser_conn_connect), KOBJMETHOD(icl_conn_task_setup, iser_conn_task_setup), KOBJMETHOD(icl_conn_task_done, iser_conn_task_done), KOBJMETHOD(icl_conn_pdu_get_data, iser_conn_pdu_get_data), { 0, 0 } }; DEFINE_CLASS(icl_iser, icl_iser_methods, sizeof(struct iser_conn)); /** * iser_initialize_headers() - Initialize task headers * @pdu: iser pdu * @iser_conn: iser connection * * Notes: * This 
routine may race with iser teardown flow for scsi
 * error handling TMFs. So for TMF we should acquire the
 * state mutex to avoid dereferencing the IB device which
 * may have already been terminated (racing teardown sequence).
 */
int
iser_initialize_headers(struct icl_iser_pdu *pdu, struct iser_conn *iser_conn)
{
	struct iser_tx_desc *tx_desc = &pdu->desc;
	struct iser_device *device = iser_conn->ib_conn.device;
	u64 dma_addr;
	int ret = 0;

	dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
				     ISER_HEADERS_LEN, DMA_TO_DEVICE);
	if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
		ret = -ENOMEM;
		goto out;
	}

	tx_desc->mapped = true;
	tx_desc->dma_addr = dma_addr;
	tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
	tx_desc->tx_sg[0].lkey = device->mr->lkey;

out:
	return (ret);
}

int
iser_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
			  const void *addr, size_t len, int flags)
{
	struct iser_conn *iser_conn = icl_to_iser_conn(ic);

	if (request->ip_bhs->bhs_opcode & ISCSI_BHS_OPCODE_LOGIN_REQUEST ||
	    request->ip_bhs->bhs_opcode & ISCSI_BHS_OPCODE_TEXT_REQUEST) {
		ISER_DBG("copy to login buff");
		memcpy(iser_conn->login_req_buf, addr, len);
		request->ip_data_len = len;
	}

	return (0);
}

void
iser_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
		       size_t off, void *addr, size_t len)
{
	/* If we have received data, copy it to the upper layer's buffer. */
	if (ip->ip_data_mbuf)
		memcpy(addr, ip->ip_data_mbuf + off, len);
}

/*
 * Allocate an icl_pdu with an empty BHS, to be filled in by the caller.
 */
struct icl_pdu *
iser_new_pdu(struct icl_conn *ic, int flags)
{
	struct icl_iser_pdu *iser_pdu;
	struct icl_pdu *ip;
	struct iser_conn *iser_conn = icl_to_iser_conn(ic);

	iser_pdu = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
	if (iser_pdu == NULL) {
		ISER_WARN("failed to allocate %zd bytes", sizeof(*iser_pdu));
		return (NULL);
	}

	iser_pdu->iser_conn = iser_conn;
	ip = &iser_pdu->icl_pdu;
	ip->ip_conn = ic;
	ip->ip_bhs = &iser_pdu->desc.iscsi_header;

	return (ip);
}

struct icl_pdu *
iser_conn_new_pdu(struct icl_conn *ic, int flags)
{
	return (iser_new_pdu(ic, flags));
}

void
iser_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);

	uma_zfree(icl_pdu_zone, iser_pdu);
}

size_t
iser_conn_pdu_data_segment_length(struct icl_conn *ic,
				  const struct icl_pdu *request)
{
	uint32_t len = 0;

	len += request->ip_bhs->bhs_data_segment_len[0];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[1];
	len <<= 8;
	len += request->ip_bhs->bhs_data_segment_len[2];

	return (len);
}
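
/*
 * The BHS carries DataSegmentLength as a 24-bit big-endian value in
 * bhs_data_segment_len[0..2]; e.g. { 0x00, 0x10, 0x00 } decodes to 0x001000,
 * i.e. 4096 bytes.  A minimal stand-alone sketch of the same decoding, for
 * illustration only (this helper is hypothetical and not used by the driver):
 */
#if 0
static uint32_t
iscsi_bhs_dsl(const uint8_t len[3])
{

	return (((uint32_t)len[0] << 16) | ((uint32_t)len[1] << 8) | len[2]);
}
#endif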
ISER_ERR("Failed to send control pdu %p", iser_pdu); } else { ret = iser_send_command(iser_conn, iser_pdu); if (unlikely(ret)) ISER_ERR("Failed to send command pdu %p", iser_pdu); } } static struct icl_conn * iser_new_conn(const char *name, struct mtx *lock) { struct iser_conn *iser_conn; struct icl_conn *ic; refcount_acquire(&icl_iser_ncons); iser_conn = (struct iser_conn *)kobj_create(&icl_iser_class, M_ICL_ISER, M_WAITOK | M_ZERO); if (!iser_conn) { ISER_ERR("failed to allocate iser conn"); refcount_release(&icl_iser_ncons); return (NULL); } cv_init(&iser_conn->up_cv, "iser_cv"); sx_init(&iser_conn->state_mutex, "iser_conn_state_mutex"); mtx_init(&iser_conn->ib_conn.beacon.flush_lock, "flush_lock", NULL, MTX_DEF); cv_init(&iser_conn->ib_conn.beacon.flush_cv, "flush_cv"); mtx_init(&iser_conn->ib_conn.lock, "lock", NULL, MTX_DEF); ic = &iser_conn->icl_conn; ic->ic_lock = lock; ic->ic_name = name; ic->ic_offload = strdup("iser", M_TEMP); ic->ic_iser = true; ic->ic_unmapped = true; return (ic); } void iser_conn_free(struct icl_conn *ic) { struct iser_conn *iser_conn = icl_to_iser_conn(ic); iser_conn_release(ic); cv_destroy(&iser_conn->ib_conn.beacon.flush_cv); mtx_destroy(&iser_conn->ib_conn.beacon.flush_lock); sx_destroy(&iser_conn->state_mutex); cv_destroy(&iser_conn->up_cv); kobj_delete((struct kobj *)iser_conn, M_ICL_ISER); refcount_release(&icl_iser_ncons); } int iser_conn_handoff(struct icl_conn *ic, int fd) { struct iser_conn *iser_conn = icl_to_iser_conn(ic); int error = 0; sx_xlock(&iser_conn->state_mutex); if (iser_conn->state != ISER_CONN_UP) { error = EINVAL; ISER_ERR("iser_conn %p state is %d, teardown started\n", iser_conn, iser_conn->state); goto out; } error = iser_alloc_rx_descriptors(iser_conn, ic->ic_maxtags); if (error) goto out; error = iser_post_recvm(iser_conn, iser_conn->min_posted_rx); if (error) goto post_error; iser_conn->handoff_done = true; sx_xunlock(&iser_conn->state_mutex); return (error); post_error: iser_free_rx_descriptors(iser_conn); out: sx_xunlock(&iser_conn->state_mutex); return (error); } /** * Frees all conn objects */ static void iser_conn_release(struct icl_conn *ic) { struct iser_conn *iser_conn = icl_to_iser_conn(ic); struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_conn *curr, *tmp; mtx_lock(&ig.connlist_mutex); /* * Search for iser connection in global list. * It may not be there in case of failure in connection establishment * stage. */ list_for_each_entry_safe(curr, tmp, &ig.connlist, conn_list) { if (iser_conn == curr) { ISER_WARN("found iser_conn %p", iser_conn); list_del(&iser_conn->conn_list); } } mtx_unlock(&ig.connlist_mutex); /* * In case we reconnecting or removing session, we need to * release IB resources (which is safe to call more than once). */ sx_xlock(&iser_conn->state_mutex); iser_free_ib_conn_res(iser_conn, true); sx_xunlock(&iser_conn->state_mutex); if (ib_conn->cma_id != NULL) { rdma_destroy_id(ib_conn->cma_id); ib_conn->cma_id = NULL; } } void iser_conn_close(struct icl_conn *ic) { struct iser_conn *iser_conn = icl_to_iser_conn(ic); ISER_INFO("closing conn %p", iser_conn); sx_xlock(&iser_conn->state_mutex); /* * In case iser connection is waiting on conditional variable * (state PENDING) and we try to close it before connection establishment, * we need to signal it to continue releasing connection properly. 
*/ if (!iser_conn_terminate(iser_conn) && iser_conn->state == ISER_CONN_PENDING) cv_signal(&iser_conn->up_cv); sx_xunlock(&iser_conn->state_mutex); } int iser_conn_connect(struct icl_conn *ic, int domain, int socktype, int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa) { struct iser_conn *iser_conn = icl_to_iser_conn(ic); struct ib_conn *ib_conn = &iser_conn->ib_conn; int err = 0; iser_conn_release(ic); sx_xlock(&iser_conn->state_mutex); /* the device is known only --after-- address resolution */ ib_conn->device = NULL; iser_conn->handoff_done = false; iser_conn->state = ISER_CONN_PENDING; - ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)iser_conn, + ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler, (void *)iser_conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ib_conn->cma_id)) { err = -PTR_ERR(ib_conn->cma_id); ISER_ERR("rdma_create_id failed: %d", err); goto id_failure; } err = rdma_resolve_addr(ib_conn->cma_id, from_sa, to_sa, 1000); if (err) { ISER_ERR("rdma_resolve_addr failed: %d", err); if (err < 0) err = -err; goto addr_failure; } ISER_DBG("before cv_wait: %p", iser_conn); cv_wait(&iser_conn->up_cv, &iser_conn->state_mutex); ISER_DBG("after cv_wait: %p", iser_conn); if (iser_conn->state != ISER_CONN_UP) { err = EIO; goto addr_failure; } err = iser_alloc_login_buf(iser_conn); if (err) goto addr_failure; sx_xunlock(&iser_conn->state_mutex); mtx_lock(&ig.connlist_mutex); list_add(&iser_conn->conn_list, &ig.connlist); mtx_unlock(&ig.connlist_mutex); return (0); id_failure: ib_conn->cma_id = NULL; addr_failure: sx_xunlock(&iser_conn->state_mutex); return (err); } int iser_conn_task_setup(struct icl_conn *ic, struct icl_pdu *ip, struct ccb_scsiio *csio, uint32_t *task_tagp, void **prvp) { struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip); *prvp = ip; iser_pdu->csio = csio; return (0); } void iser_conn_task_done(struct icl_conn *ic, void *prv) { struct icl_pdu *ip = prv; struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip); struct iser_device *device = iser_pdu->iser_conn->ib_conn.device; struct iser_tx_desc *tx_desc = &iser_pdu->desc; if (iser_pdu->dir[ISER_DIR_IN]) { iser_unreg_rdma_mem(iser_pdu, ISER_DIR_IN); iser_dma_unmap_task_data(iser_pdu, &iser_pdu->data[ISER_DIR_IN], DMA_FROM_DEVICE); } if (iser_pdu->dir[ISER_DIR_OUT]) { iser_unreg_rdma_mem(iser_pdu, ISER_DIR_OUT); iser_dma_unmap_task_data(iser_pdu, &iser_pdu->data[ISER_DIR_OUT], DMA_TO_DEVICE); } if (likely(tx_desc->mapped)) { ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); tx_desc->mapped = false; } iser_pdu_free(ic, ip); } static int iser_limits(struct icl_drv_limits *idl) { idl->idl_max_recv_data_segment_length = 128 * 1024; return (0); } static int icl_iser_load(void) { int error; ISER_DBG("Starting iSER datamover..."); icl_pdu_zone = uma_zcreate("icl_iser_pdu", sizeof(struct icl_iser_pdu), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* FIXME: Check rc */ refcount_init(&icl_iser_ncons, 0); error = icl_register("iser", true, 0, iser_limits, iser_new_conn); KASSERT(error == 0, ("failed to register iser")); memset(&ig, 0, sizeof(struct iser_global)); /* device init is called only after the first addr resolution */ sx_init(&ig.device_list_mutex, "global_device_lock"); INIT_LIST_HEAD(&ig.device_list); mtx_init(&ig.connlist_mutex, "global_conn_lock", NULL, MTX_DEF); INIT_LIST_HEAD(&ig.connlist); sx_init(&ig.close_conns_mutex, "global_close_conns_lock"); return (error); } static int icl_iser_unload(void) { ISER_DBG("Removing iSER datamover..."); if 
(icl_iser_ncons != 0) return (EBUSY); sx_destroy(&ig.close_conns_mutex); mtx_destroy(&ig.connlist_mutex); sx_destroy(&ig.device_list_mutex); icl_unregister("iser", true); uma_zdestroy(icl_pdu_zone); return (0); } static int icl_iser_modevent(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (icl_iser_load()); case MOD_UNLOAD: return (icl_iser_unload()); default: return (EINVAL); } } moduledata_t icl_iser_data = { .name = "icl_iser", .evhand = icl_iser_modevent, .priv = 0 }; DECLARE_MODULE(icl_iser, icl_iser_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE); MODULE_DEPEND(icl_iser, icl, 1, 1, 1); MODULE_DEPEND(icl_iser, ibcore, 1, 1, 1); MODULE_DEPEND(icl_iser, linuxkpi, 1, 1, 1); MODULE_VERSION(icl_iser, 1); Index: projects/bsd_rdma_4_9/sys/dev/iser/icl_iser.h =================================================================== --- projects/bsd_rdma_4_9/sys/dev/iser/icl_iser.h (revision 325890) +++ projects/bsd_rdma_4_9/sys/dev/iser/icl_iser.h (revision 325891) @@ -1,551 +1,549 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef ICL_ISER_H #define ICL_ISER_H /* * iSCSI Common Layer for RDMA. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ISER_DBG(X, ...) \ do { \ if (unlikely(iser_debug > 2)) \ printf("DEBUG: %s: " X "\n", \ __func__, ## __VA_ARGS__); \ } while (0) #define ISER_INFO(X, ...) \ do { \ if (unlikely(iser_debug > 1)) \ printf("INFO: %s: " X "\n", \ __func__, ## __VA_ARGS__); \ } while (0) #define ISER_WARN(X, ...) \ do { \ if (unlikely(iser_debug > 0)) { \ printf("WARNING: %s: " X "\n", \ __func__, ## __VA_ARGS__); \ } \ } while (0) #define ISER_ERR(X, ...) 
\ printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__) #define ISER_VER 0x10 #define ISER_WSV 0x08 #define ISER_RSV 0x04 #define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL #define ISER_BEACON_WRID 0xfffffffffffffffeULL #define SHIFT_4K 12 #define SIZE_4K (1ULL << SHIFT_4K) #define MASK_4K (~(SIZE_4K-1)) /* support up to 512KB in one RDMA */ #define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) #define ISER_DEF_XMIT_CMDS_MAX 256 /* the max RX (recv) WR supported by the iSER QP is defined by * * max_recv_wr = commands_max + recv_beacon */ #define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX + 1) #define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) /* QP settings */ /* Maximal bounds on received asynchronous PDUs */ #define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ #define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */ /* the max TX (send) WR supported by the iSER QP is defined by * * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * * to have at max for SCSI command. The tx posting & completion handling code * * supports -EAGAIN scheme where tx is suspended till the QP has room for more * * send WR. D=8 comes from 64K/8K */ #define ISER_INFLIGHT_DATAOUTS 8 /* the send_beacon increase the max_send_wr by 1 */ #define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \ (1 + ISER_INFLIGHT_DATAOUTS) + \ ISER_MAX_TX_MISC_PDUS + \ ISER_MAX_RX_MISC_PDUS + 1) #define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \ - ISER_MAX_TX_MISC_PDUS \ - ISER_MAX_RX_MISC_PDUS - 1) / \ (1 + ISER_INFLIGHT_DATAOUTS)) #define ISER_WC_BATCH_COUNT 16 #define ISER_SIGNAL_CMD_COUNT 32 /* Maximal QP's recommended per CQ. In case we use more QP's per CQ we might * * encounter a CQ overrun state. */ #define ISCSI_ISER_MAX_CONN 8 #define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) #define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) #define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ ISCSI_ISER_MAX_CONN) #define ISER_ZBVA_NOT_SUPPORTED 0x80 #define ISER_SEND_W_INV_NOT_SUPPORTED 0x40 #define ISCSI_DEF_MAX_RECV_SEG_LEN 8192 #define ISCSI_OPCODE_MASK 0x3f #define icl_to_iser_conn(ic) \ container_of(ic, struct iser_conn, icl_conn) #define icl_to_iser_pdu(ip) \ container_of(ip, struct icl_iser_pdu, icl_pdu) /** * struct iser_hdr - iSER header * * @flags: flags support (zbva, remote_inv) * @rsvd: reserved * @write_stag: write rkey * @write_va: write virtual address * @reaf_stag: read rkey * @read_va: read virtual address */ struct iser_hdr { u8 flags; u8 rsvd[3]; __be32 write_stag; __be64 write_va; __be32 read_stag; __be64 read_va; } __attribute__((packed)); struct iser_cm_hdr { u8 flags; u8 rsvd[3]; } __packed; /* Constant PDU lengths calculations */ #define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE) #define ISER_RECV_DATA_SEG_LEN 128 #define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) #define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) enum iser_conn_state { ISER_CONN_INIT, /* descriptor allocd, no conn */ ISER_CONN_PENDING, /* in the process of being established */ ISER_CONN_UP, /* up and running */ ISER_CONN_TERMINATING, /* in the process of being terminated */ ISER_CONN_DOWN, /* shut down */ ISER_CONN_STATES_NUM }; enum iser_task_status { ISER_TASK_STATUS_INIT = 0, ISER_TASK_STATUS_STARTED, ISER_TASK_STATUS_COMPLETED }; enum iser_data_dir { ISER_DIR_IN = 0, /* to initiator */ ISER_DIR_OUT, /* from initiator */ ISER_DIRS_NUM }; /** * struct 
iser_mem_reg - iSER memory registration info * * @sge: memory region sg element * @rkey: memory region remote key * @mem_h: pointer to registration context (FMR/Fastreg) */ struct iser_mem_reg { struct ib_sge sge; u32 rkey; void *mem_h; }; enum iser_desc_type { ISCSI_TX_CONTROL , ISCSI_TX_SCSI_COMMAND, ISCSI_TX_DATAOUT }; /** * struct iser_data_buf - iSER data buffer * * @sg: pointer to the sg list * @size: num entries of this sg * @data_len: total beffer byte len * @dma_nents: returned by dma_map_sg * @copy_buf: allocated copy buf for SGs unaligned * for rdma which are copied * @orig_sg: pointer to the original sg list (in case * we used a copy) * @sg_single: SG-ified clone of a non SG SC or * unaligned SG */ struct iser_data_buf { struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE]; void *sg; - unsigned int size; + int size; unsigned long data_len; unsigned int dma_nents; char *copy_buf; struct scatterlist *orig_sg; struct scatterlist sg_single; }; /* fwd declarations */ struct iser_conn; struct ib_conn; struct iser_device; /** * struct iser_tx_desc - iSER TX descriptor (for send wr_id) * * @iser_header: iser header * @iscsi_header: iscsi header (bhs) * @type: command/control/dataout * @dma_addr: header buffer dma_address * @tx_sg: sg[0] points to iser/iscsi headers * sg[1] optionally points to either of immediate data * unsolicited data-out or control * @num_sge: number sges used on this TX task * @mapped: indicates if the descriptor is dma mapped */ struct iser_tx_desc { struct iser_hdr iser_header; struct iscsi_bhs iscsi_header __attribute__((packed)); enum iser_desc_type type; u64 dma_addr; struct ib_sge tx_sg[2]; int num_sge; bool mapped; }; #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ sizeof(u64) + sizeof(struct ib_sge))) /** * struct iser_rx_desc - iSER RX descriptor (for recv wr_id) * * @iser_header: iser header * @iscsi_header: iscsi header * @data: received data segment * @dma_addr: receive buffer dma address * @rx_sg: ib_sge of receive buffer * @pad: for sense data TODO: Modify to maximum sense length supported */ struct iser_rx_desc { struct iser_hdr iser_header; struct iscsi_bhs iscsi_header; char data[ISER_RECV_DATA_SEG_LEN]; u64 dma_addr; struct ib_sge rx_sg; char pad[ISER_RX_PAD_SIZE]; } __attribute__((packed)); struct icl_iser_pdu { struct icl_pdu icl_pdu; struct iser_tx_desc desc; struct iser_conn *iser_conn; enum iser_task_status status; struct ccb_scsiio *csio; int command_sent; int dir[ISER_DIRS_NUM]; struct iser_mem_reg rdma_reg[ISER_DIRS_NUM]; struct iser_data_buf data[ISER_DIRS_NUM]; }; /** * struct iser_comp - iSER completion context * * @device: pointer to device handle * @cq: completion queue * @wcs: work completion array * @tq: taskqueue handle * @task: task to run task_fn * @active_qps: Number of active QPs attached * to completion context */ struct iser_comp { struct iser_device *device; struct ib_cq *cq; struct ib_wc wcs[ISER_WC_BATCH_COUNT]; struct taskqueue *tq; struct task task; int active_qps; }; /** * struct iser_device - iSER device handle * * @ib_device: RDMA device * @pd: Protection Domain for this device * @dev_attr: Device attributes container * @mr: Global DMA memory region * @event_handler: IB events handle routine * @ig_list: entry in devices list * @refcount: Reference counter, dominated by open iser connections * @comps_used: Number of completion contexts used, Min between online * cpus and device max completion vectors * @comps: Dinamically allocated array of completion handlers */ struct iser_device { struct ib_device *ib_device; 
struct ib_pd *pd; struct ib_device_attr dev_attr; struct ib_mr *mr; struct ib_event_handler event_handler; struct list_head ig_list; int refcount; int comps_used; struct iser_comp *comps; }; /** * struct iser_reg_resources - Fast registration recources * * @mr: memory region - * @frpl: fast reg page list * @mr_valid: is mr valid indicator */ struct iser_reg_resources { struct ib_mr *mr; - struct ib_fast_reg_page_list *frpl; u8 mr_valid:1; }; /** * struct fast_reg_descriptor - Fast registration descriptor * * @list: entry in connection fastreg pool * @rsc: data buffer registration resources */ struct fast_reg_descriptor { struct list_head list; struct iser_reg_resources rsc; }; /** * struct iser_beacon - beacon to signal all flush errors were drained * * @send: send wr * @recv: recv wr * @flush_lock: protects flush_cv * @flush_cv: condition variable for beacon flush */ struct iser_beacon { union { struct ib_send_wr send; struct ib_recv_wr recv; }; struct mtx flush_lock; struct cv flush_cv; }; /** * struct ib_conn - Infiniband related objects * * @cma_id: rdma_cm connection maneger handle * @qp: Connection Queue-pair * @device: reference to iser device * @comp: iser completion context */ struct ib_conn { struct rdma_cm_id *cma_id; struct ib_qp *qp; int post_recv_buf_count; u8 sig_count; struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; struct iser_device *device; struct iser_comp *comp; struct iser_beacon beacon; struct mtx lock; union { struct { struct ib_fmr_pool *pool; struct iser_page_vec *page_vec; } fmr; struct { struct list_head pool; int pool_size; } fastreg; }; }; struct iser_conn { struct icl_conn icl_conn; struct ib_conn ib_conn; struct cv up_cv; struct list_head conn_list; struct sx state_mutex; enum iser_conn_state state; int qp_max_recv_dtos; int min_posted_rx; u16 max_cmds; char *login_buf; char *login_req_buf, *login_resp_buf; u64 login_req_dma, login_resp_dma; unsigned int rx_desc_head; struct iser_rx_desc *rx_descs; u32 num_rx_descs; bool handoff_done; }; /** * struct iser_global: iSER global context * * @device_list_mutex: protects device_list * @device_list: iser devices global list * @connlist_mutex: protects connlist * @connlist: iser connections global list * @desc_cache: kmem cache for tx dataout * @close_conns_mutex: serializes conns closure */ struct iser_global { struct sx device_list_mutex; struct list_head device_list; struct mtx connlist_mutex; struct list_head connlist; struct sx close_conns_mutex; }; extern struct iser_global ig; extern int iser_debug; void iser_create_send_desc(struct iser_conn *, struct iser_tx_desc *); int iser_post_recvl(struct iser_conn *); int iser_post_recvm(struct iser_conn *, int); int iser_alloc_login_buf(struct iser_conn *iser_conn); void iser_free_login_buf(struct iser_conn *iser_conn); int iser_post_send(struct ib_conn *, struct iser_tx_desc *, bool); void iser_snd_completion(struct iser_tx_desc *, struct ib_conn *); void iser_rcv_completion(struct iser_rx_desc *, unsigned long, struct ib_conn *); void iser_pdu_free(struct icl_conn *, struct icl_pdu *); struct icl_pdu * iser_new_pdu(struct icl_conn *ic, int flags); int iser_alloc_rx_descriptors(struct iser_conn *, int); void iser_free_rx_descriptors(struct iser_conn *); int iser_initialize_headers(struct icl_iser_pdu *, struct iser_conn *); int iser_send_control(struct iser_conn *, struct icl_iser_pdu *); int iser_send_command(struct iser_conn *, struct icl_iser_pdu *); int iser_reg_rdma_mem(struct icl_iser_pdu *, enum iser_data_dir); void iser_unreg_rdma_mem(struct icl_iser_pdu *, 
enum iser_data_dir); int iser_create_fastreg_pool(struct ib_conn *, unsigned); void iser_free_fastreg_pool(struct ib_conn *); int iser_dma_map_task_data(struct icl_iser_pdu *, struct iser_data_buf *, enum iser_data_dir, enum dma_data_direction); int iser_conn_terminate(struct iser_conn *); void iser_free_ib_conn_res(struct iser_conn *, bool); void iser_dma_unmap_task_data(struct icl_iser_pdu *, struct iser_data_buf *, enum dma_data_direction); int iser_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *); #endif /* !ICL_ISER_H */ Index: projects/bsd_rdma_4_9/sys/dev/iser/iser_memory.c =================================================================== --- projects/bsd_rdma_4_9/sys/dev/iser/iser_memory.c (revision 325890) +++ projects/bsd_rdma_4_9/sys/dev/iser/iser_memory.c (revision 325891) @@ -1,348 +1,285 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "icl_iser.h" static struct fast_reg_descriptor * iser_reg_desc_get(struct ib_conn *ib_conn) { struct fast_reg_descriptor *desc; mtx_lock(&ib_conn->lock); desc = list_first_entry(&ib_conn->fastreg.pool, struct fast_reg_descriptor, list); list_del(&desc->list); mtx_unlock(&ib_conn->lock); return (desc); } static void iser_reg_desc_put(struct ib_conn *ib_conn, struct fast_reg_descriptor *desc) { mtx_lock(&ib_conn->lock); list_add(&desc->list, &ib_conn->fastreg.pool); mtx_unlock(&ib_conn->lock); } #define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) /** - * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses - * and returns the length of resulting physical address array (may be less than - * the original due to possible compaction). - * - * we build a "page vec" under the assumption that the SG meets the RDMA - * alignment requirements. Other then the first and last SG elements, all - * the "internal" elements can be compacted into a list whose elements are - * dma addresses of physical pages. The code supports also the weird case - * where --few fragments of the same page-- are present in the SG as - * consecutive elements. Also, it handles one entry SG. 
- */ -static int -iser_sg_to_page_vec(struct iser_data_buf *data, - struct ib_device *ibdev, u64 *pages, - int *offset, int *data_size) -{ - struct scatterlist *sg, *sgl = data->sgl; - u64 start_addr, end_addr, page, chunk_start = 0; - unsigned long total_sz = 0; - unsigned int dma_len; - int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; - - /* compute the offset of first element */ - *offset = (u64) sgl[0].offset & ~MASK_4K; - - new_chunk = 1; - cur_page = 0; - for_each_sg(sgl, sg, data->dma_nents, i) { - start_addr = ib_sg_dma_address(ibdev, sg); - if (new_chunk) - chunk_start = start_addr; - dma_len = ib_sg_dma_len(ibdev, sg); - end_addr = start_addr + dma_len; - total_sz += dma_len; - - /* collect page fragments until aligned or end of SG list */ - if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { - new_chunk = 0; - continue; - } - new_chunk = 1; - - /* address of the first page in the contiguous chunk; - masking relevant for the very first SG entry, - which might be unaligned */ - page = chunk_start & MASK_4K; - do { - pages[cur_page++] = page; - page += SIZE_4K; - } while (page < end_addr); - } - - *data_size = total_sz; - - return (cur_page); -} - -/** * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned * for RDMA sub-list of a scatter-gather list of memory buffers, and returns * the number of entries which are aligned correctly. Supports the case where * consecutive SG elements are actually fragments of the same physcial page. */ static int iser_data_buf_aligned_len(struct iser_data_buf *data, struct ib_device *ibdev) { struct scatterlist *sg, *sgl, *next_sg = NULL; u64 start_addr, end_addr; int i, ret_len, start_check = 0; if (data->dma_nents == 1) return (1); sgl = data->sgl; start_addr = ib_sg_dma_address(ibdev, sgl); for_each_sg(sgl, sg, data->dma_nents, i) { if (start_check && !IS_4K_ALIGNED(start_addr)) break; next_sg = sg_next(sg); if (!next_sg) break; end_addr = start_addr + ib_sg_dma_len(ibdev, sg); start_addr = ib_sg_dma_address(ibdev, next_sg); if (end_addr == start_addr) { start_check = 0; continue; } else start_check = 1; if (!IS_4K_ALIGNED(end_addr)) break; } ret_len = (next_sg) ? i : i+1; return (ret_len); } void iser_dma_unmap_task_data(struct icl_iser_pdu *iser_pdu, struct iser_data_buf *data, enum dma_data_direction dir) { struct ib_device *dev; dev = iser_pdu->iser_conn->ib_conn.device->ib_device; ib_dma_unmap_sg(dev, data->sgl, data->size, dir); } static int iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, struct iser_mem_reg *reg) { struct scatterlist *sg = mem->sgl; reg->sge.lkey = device->mr->lkey; reg->rkey = device->mr->rkey; reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); return (0); } /** * TODO: This should be a verb * iser_ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. * @rkey - the rkey to increment. 
*/ static inline u32 iser_ib_inc_rkey(u32 rkey) { const u32 mask = 0x000000ff; return (((rkey + 1) & mask) | (rkey & ~mask)); } static void iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) { u32 rkey; memset(inv_wr, 0, sizeof(*inv_wr)); inv_wr->opcode = IB_WR_LOCAL_INV; inv_wr->wr_id = ISER_FASTREG_LI_WRID; inv_wr->ex.invalidate_rkey = mr->rkey; rkey = iser_ib_inc_rkey(mr->rkey); ib_update_fast_reg_key(mr, rkey); } static int iser_fast_reg_mr(struct icl_iser_pdu *iser_pdu, struct iser_data_buf *mem, struct iser_reg_resources *rsc, struct iser_mem_reg *reg) { struct ib_conn *ib_conn = &iser_pdu->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; - struct ib_send_wr fastreg_wr, inv_wr; + struct ib_mr *mr = rsc->mr; + struct ib_reg_wr fastreg_wr; + struct ib_send_wr inv_wr; struct ib_send_wr *bad_wr, *wr = NULL; - int ret, offset, size, plen; + int ret, n; /* if there a single dma entry, dma mr suffices */ if (mem->dma_nents == 1) return iser_reg_dma(device, mem, reg); - /* rsc is not null */ - plen = iser_sg_to_page_vec(mem, device->ib_device, - rsc->frpl->page_list, - &offset, &size); - if (plen * SIZE_4K < size) { - ISER_ERR("fast reg page_list too short to hold this SG"); - return (EINVAL); - } - if (!rsc->mr_valid) { - iser_inv_rkey(&inv_wr, rsc->mr); + iser_inv_rkey(&inv_wr, mr); wr = &inv_wr; } + n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K); + if (unlikely(n != mem->size)) { + ISER_ERR("failed to map sg (%d/%d)\n", n, mem->size); + return n < 0 ? n : -EINVAL; + } /* Prepare FASTREG WR */ memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = ISER_FASTREG_LI_WRID; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = rsc->frpl->page_list[0] + offset; - fastreg_wr.wr.fast_reg.page_list = rsc->frpl; - fastreg_wr.wr.fast_reg.page_list_len = plen; - fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K; - fastreg_wr.wr.fast_reg.length = size; - fastreg_wr.wr.fast_reg.rkey = rsc->mr->rkey; - fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); + fastreg_wr.wr.opcode = IB_WR_REG_MR; + fastreg_wr.wr.wr_id = ISER_FASTREG_LI_WRID; + fastreg_wr.wr.num_sge = 0; + fastreg_wr.mr = mr; + fastreg_wr.key = mr->rkey; + fastreg_wr.access = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; if (!wr) - wr = &fastreg_wr; + wr = &fastreg_wr.wr; else - wr->next = &fastreg_wr; + wr->next = &fastreg_wr.wr; ret = ib_post_send(ib_conn->qp, wr, &bad_wr); if (ret) { ISER_ERR("fast registration failed, ret:%d", ret); return (ret); } rsc->mr_valid = 0; - reg->sge.lkey = rsc->mr->lkey; - reg->rkey = rsc->mr->rkey; - reg->sge.addr = rsc->frpl->page_list[0] + offset; - reg->sge.length = size; + reg->sge.lkey = mr->lkey; + reg->rkey = mr->rkey; + reg->sge.addr = mr->iova; + reg->sge.length = mr->length; return (ret); } /** * iser_reg_rdma_mem - Registers memory intended for RDMA, * using Fast Registration WR (if possible) obtaining rkey and va * * returns 0 on success, errno code on failure */ int iser_reg_rdma_mem(struct icl_iser_pdu *iser_pdu, enum iser_data_dir cmd_dir) { struct ib_conn *ib_conn = &iser_pdu->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_pdu->data[cmd_dir]; struct iser_mem_reg *mem_reg = &iser_pdu->rdma_reg[cmd_dir]; struct fast_reg_descriptor *desc = NULL; int err, aligned_len; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != 
mem->dma_nents) { ISER_ERR("bounce buffer is not supported"); return 1; } if (mem->dma_nents != 1) { desc = iser_reg_desc_get(ib_conn); mem_reg->mem_h = desc; } err = iser_fast_reg_mr(iser_pdu, mem, desc ? &desc->rsc : NULL, mem_reg); if (err) goto err_reg; return (0); err_reg: if (desc) iser_reg_desc_put(ib_conn, desc); return (err); } void iser_unreg_rdma_mem(struct icl_iser_pdu *iser_pdu, enum iser_data_dir cmd_dir) { struct iser_mem_reg *reg = &iser_pdu->rdma_reg[cmd_dir]; if (!reg->mem_h) return; iser_reg_desc_put(&iser_pdu->iser_conn->ib_conn, reg->mem_h); reg->mem_h = NULL; } int iser_dma_map_task_data(struct icl_iser_pdu *iser_pdu, struct iser_data_buf *data, enum iser_data_dir iser_dir, enum dma_data_direction dma_dir) { struct ib_device *dev; iser_pdu->dir[iser_dir] = 1; dev = iser_pdu->iser_conn->ib_conn.device->ib_device; data->dma_nents = ib_dma_map_sg(dev, data->sgl, data->size, dma_dir); if (data->dma_nents == 0) { ISER_ERR("dma_map_sg failed"); return (EINVAL); } return (0); } Index: projects/bsd_rdma_4_9/sys/dev/iser/iser_verbs.c =================================================================== --- projects/bsd_rdma_4_9/sys/dev/iser/iser_verbs.c (revision 325890) +++ projects/bsd_rdma_4_9/sys/dev/iser/iser_verbs.c (revision 325891) @@ -1,967 +1,949 @@ /* $FreeBSD$ */ /*- * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "icl_iser.h" static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend"); static int iser_cq_poll_limit = 512; static void iser_cq_event_callback(struct ib_event *cause, void *context) { ISER_ERR("got cq event %d", cause->event); } static void iser_qp_event_callback(struct ib_event *cause, void *context) { ISER_ERR("got qp event %d", cause->event); } static void iser_event_handler(struct ib_event_handler *handler, struct ib_event *event) { ISER_ERR("async event %d on device %s port %d", event->event, event->device->name, event->element.port_num); } /** * is_iser_tx_desc - Indicate if the completion wr_id * is a TX descriptor or not. * @iser_conn: iser connection * @wr_id: completion WR identifier * * Since we cannot rely on wc opcode in FLUSH errors * we must work around it by checking if the wr_id address * falls in the iser connection rx_descs buffer. 
If so
 * it is an RX descriptor, otherwise it is a TX.
 */
static inline bool
is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
{
	void *start = iser_conn->rx_descs;
	u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
	void *end = (void *)((uintptr_t)start + (uintptr_t)len);

	if (start) {
		if (wr_id >= start && wr_id < end)
			return false;
	} else {
		return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf);
	}

	return true;
}

/**
 * iser_handle_comp_error() - Handle error completion
 * @ib_conn: connection RDMA resources
 * @wc: work completion
 *
 * Notes: Update post_recv_buf_count in case of recv error completion.
 * For non-FLUSH error completion we should also notify the iscsi layer
 * that the connection has failed (in case we passed the bind stage).
 */
static void
iser_handle_comp_error(struct ib_conn *ib_conn, struct ib_wc *wc)
{
	void *wr_id = (void *)(uintptr_t)wc->wr_id;
	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
						   ib_conn);

	if (is_iser_tx_desc(iser_conn, wr_id)) {
		ISER_DBG("conn %p got send comp error", iser_conn);
	} else {
		ISER_DBG("conn %p got recv comp error", iser_conn);
		ib_conn->post_recv_buf_count--;
	}

	if (wc->status != IB_WC_WR_FLUSH_ERR)
		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
}

/**
 * iser_handle_wc - handle a single work completion
 * @wc: work completion
 *
 * Soft-IRQ context, work completion can be either
 * SEND or RECV, and can turn out successful or
 * with error (or flush error).
 */
static void
iser_handle_wc(struct ib_wc *wc)
{
	struct ib_conn *ib_conn;
	struct iser_tx_desc *tx_desc;
	struct iser_rx_desc *rx_desc;

	ib_conn = wc->qp->qp_context;
	if (likely(wc->status == IB_WC_SUCCESS)) {
		if (wc->opcode == IB_WC_RECV) {
			rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
			iser_rcv_completion(rx_desc, wc->byte_len, ib_conn);
		} else if (wc->opcode == IB_WC_SEND) {
			tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
			iser_snd_completion(tx_desc, ib_conn);
		} else {
			ISER_ERR("Unknown wc opcode %d", wc->opcode);
		}
	} else {
		struct iser_conn *iser_conn = container_of(ib_conn,
							   struct iser_conn,
							   ib_conn);

		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			ISER_ERR("conn %p wr id %llx status %d vend_err %x",
				 iser_conn, (unsigned long long)wc->wr_id,
				 wc->status, wc->vendor_err);
		} else {
			ISER_DBG("flush error: conn %p wr id %llx", iser_conn,
				 (unsigned long long)wc->wr_id);
		}

		if (wc->wr_id == ISER_BEACON_WRID) {
			/* all flush errors were consumed */
			mtx_lock(&ib_conn->beacon.flush_lock);
			ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn);
			cv_signal(&ib_conn->beacon.flush_cv);
			mtx_unlock(&ib_conn->beacon.flush_lock);
		} else {
			iser_handle_comp_error(ib_conn, wc);
		}
	}
}

static void
iser_cq_tasklet_fn(void *data, int pending)
{
	struct iser_comp *comp = (struct iser_comp *)data;
	struct ib_cq *cq = comp->cq;
	struct ib_wc *const wcs = comp->wcs;
	int completed = 0;
	int i;
	int n;

	while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
		for (i = 0; i < n; i++)
			iser_handle_wc(&wcs[i]);

		completed += n;
		if (completed >= iser_cq_poll_limit)
			break;
	}

	/*
	 * It is assumed here that arming the CQ only once it is empty
	 * would not cause interrupts to be missed.
	 */
	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
}

static void
iser_cq_callback(struct ib_cq *cq, void *cq_context)
{
	struct iser_comp *comp = cq_context;

	taskqueue_enqueue(comp->tq, &comp->task);
}

/**
 * iser_create_device_ib_res - creates Protection Domain (PD), Completion
 * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
 * the adapter.
* * returns 0 on success, -1 on failure */ static int iser_create_device_ib_res(struct iser_device *device) { - struct ib_device_attr *dev_attr = &device->dev_attr; - int ret, i, max_cqe; + struct ib_device *ib_dev = device->ib_device; + int i, max_cqe; - ret = ib_query_device(device->ib_device, dev_attr); - if (ret) { - ISER_ERR("Query device failed for %s", device->ib_device->name); - return (ret); - } - - if (!(dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { ISER_ERR("device %s doesn't support Fastreg, " "can't register memory", device->ib_device->name); return (1); } device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors); device->comps = malloc(device->comps_used * sizeof(*device->comps), M_ISER_VERBS, M_WAITOK | M_ZERO); if (!device->comps) goto comps_err; - max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe); + max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe); ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d", device->comps_used, device->ib_device->name, device->ib_device->num_comp_vectors, max_cqe); - device->pd = ib_alloc_pd(device->ib_device); + device->pd = ib_alloc_pd(device->ib_device, IB_PD_UNSAFE_GLOBAL_RKEY); if (IS_ERR(device->pd)) goto pd_err; for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; + struct ib_cq_init_attr cq_attr = { + .cqe = max_cqe, + .comp_vector = i, + }; comp->device = device; comp->cq = ib_create_cq(device->ib_device, iser_cq_callback, iser_cq_event_callback, (void *)comp, - max_cqe, i); + &cq_attr); if (IS_ERR(comp->cq)) { comp->cq = NULL; goto cq_err; } if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP)) goto cq_err; TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp); comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT, taskqueue_thread_enqueue, &comp->tq); if (!comp->tq) goto tq_err; taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq"); } - device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); + device->mr = device->pd->__internal_mr; if (IS_ERR(device->mr)) goto tq_err; INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, iser_event_handler); if (ib_register_event_handler(&device->event_handler)) goto handler_err; return (0); handler_err: ib_dereg_mr(device->mr); tq_err: for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; if (comp->tq) taskqueue_free(comp->tq); } cq_err: for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; if (comp->cq) ib_destroy_cq(comp->cq); } ib_dealloc_pd(device->pd); pd_err: free(device->comps, M_ISER_VERBS); comps_err: ISER_ERR("failed to allocate an IB resource"); return (1); } /** * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR, * CQ and PD created with the device associated with the adapator. 
*/ static void iser_free_device_ib_res(struct iser_device *device) { int i; for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; taskqueue_free(comp->tq); ib_destroy_cq(comp->cq); comp->cq = NULL; } (void)ib_unregister_event_handler(&device->event_handler); (void)ib_dereg_mr(device->mr); (void)ib_dealloc_pd(device->pd); free(device->comps, M_ISER_VERBS); device->comps = NULL; device->mr = NULL; device->pd = NULL; } static int iser_alloc_reg_res(struct ib_device *ib_device, struct ib_pd *pd, struct iser_reg_resources *res) { int ret; - res->frpl = ib_alloc_fast_reg_page_list(ib_device, - ISCSI_ISER_SG_TABLESIZE + 1); - if (IS_ERR(res->frpl)) { - ret = -PTR_ERR(res->frpl); - ISER_ERR("Failed to allocate fast reg page list err=%d", ret); - return (ret); - } - - res->mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1); + res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, ISCSI_ISER_SG_TABLESIZE + 1); if (IS_ERR(res->mr)) { ret = -PTR_ERR(res->mr); ISER_ERR("Failed to allocate fast reg mr err=%d", ret); - goto fast_reg_mr_failure; + return (ret); } res->mr_valid = 1; return (0); - -fast_reg_mr_failure: - ib_free_fast_reg_page_list(res->frpl); - - return (ret); } static void iser_free_reg_res(struct iser_reg_resources *rsc) { ib_dereg_mr(rsc->mr); - ib_free_fast_reg_page_list(rsc->frpl); } static struct fast_reg_descriptor * iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd) { struct fast_reg_descriptor *desc; int ret; desc = malloc(sizeof(*desc), M_ISER_VERBS, M_WAITOK | M_ZERO); if (!desc) { ISER_ERR("Failed to allocate a new fastreg descriptor"); return (NULL); } ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc); if (ret) { ISER_ERR("failed to allocate reg_resources"); goto err; } return (desc); err: free(desc, M_ISER_VERBS); return (NULL); } /** * iser_create_fmr_pool - Creates FMR pool and page_vector * * returns 0 on success, or errno code on failure */ int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max) { struct iser_device *device = ib_conn->device; struct fast_reg_descriptor *desc; int i; INIT_LIST_HEAD(&ib_conn->fastreg.pool); ib_conn->fastreg.pool_size = 0; for (i = 0; i < cmds_max; i++) { desc = iser_create_fastreg_desc(device->ib_device, device->pd); if (!desc) { ISER_ERR("Failed to create fastreg descriptor"); goto err; } list_add_tail(&desc->list, &ib_conn->fastreg.pool); ib_conn->fastreg.pool_size++; } return (0); err: iser_free_fastreg_pool(ib_conn); return (ENOMEM); } /** * iser_free_fmr_pool - releases the FMR pool and page vec */ void iser_free_fastreg_pool(struct ib_conn *ib_conn) { struct fast_reg_descriptor *desc, *tmp; int i = 0; if (list_empty(&ib_conn->fastreg.pool)) return; ISER_DBG("freeing conn %p fr pool", ib_conn); list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) { list_del(&desc->list); iser_free_reg_res(&desc->rsc); free(desc, M_ISER_VERBS); ++i; } if (i < ib_conn->fastreg.pool_size) ISER_WARN("pool still has %d regions registered", ib_conn->fastreg.pool_size - i); } /** * iser_create_ib_conn_res - Queue-Pair (QP) * * returns 0 on success, 1 on failure */ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) { struct iser_conn *iser_conn; struct iser_device *device; struct ib_device_attr *dev_attr; struct ib_qp_init_attr init_attr; int index, min_index = 0; int ret = -ENOMEM; iser_conn = container_of(ib_conn, struct iser_conn, ib_conn); device = ib_conn->device; dev_attr = &device->dev_attr; mtx_lock(&ig.connlist_mutex); /* select the CQ with the minimal number 
of usages */
	for (index = 0; index < device->comps_used; index++) {
		if (device->comps[index].active_qps <
		    device->comps[min_index].active_qps)
			min_index = index;
	}
	ib_conn->comp = &device->comps[min_index];
	ib_conn->comp->active_qps++;
	mtx_unlock(&ig.connlist_mutex);
	ISER_INFO("cq index %d used for ib_conn %p", min_index, ib_conn);

	memset(&init_attr, 0, sizeof init_attr);
	init_attr.event_handler = iser_qp_event_callback;
	init_attr.qp_context = (void *)ib_conn;
	init_attr.send_cq = ib_conn->comp->cq;
	init_attr.recv_cq = ib_conn->comp->cq;
	init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS;
	init_attr.cap.max_send_sge = 2;
	init_attr.cap.max_recv_sge = 1;
	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	init_attr.qp_type = IB_QPT_RC;

	if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
		init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS;
		iser_conn->max_cmds =
		    ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
	} else {
		init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
		iser_conn->max_cmds =
		    ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
	}
	ISER_DBG("device %s supports max_send_wr %d",
		 device->ib_device->name, dev_attr->max_qp_wr);

	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
	if (ret)
		goto out_err;

	ib_conn->qp = ib_conn->cma_id->qp;
	ISER_DBG("setting conn %p cma_id %p qp %p",
		 ib_conn, ib_conn->cma_id, ib_conn->cma_id->qp);

	return (ret);

out_err:
	mtx_lock(&ig.connlist_mutex);
	ib_conn->comp->active_qps--;
	mtx_unlock(&ig.connlist_mutex);
	ISER_ERR("unable to alloc mem or create resource, err %d", ret);

	return (ret);
}

/**
 * Based on the resolved device node GUID, see if there is an already
 * allocated device for this ib_device. If there is no such device, create one.
 */
static struct iser_device *
iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
{
	struct iser_device *device;

	sx_xlock(&ig.device_list_mutex);

	list_for_each_entry(device, &ig.device_list, ig_list)
		/* find if there's a match using the node GUID */
		if (device->ib_device->node_guid == cma_id->device->node_guid)
			goto inc_refcnt;

	device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO);
	if (device == NULL)
		goto out;

	/* assign the ib_device to this device */
	device->ib_device = cma_id->device;
	/* init the device and link it into ig device list */
	if (iser_create_device_ib_res(device)) {
		free(device, M_ISER_VERBS);
		device = NULL;
		goto out;
	}
	list_add(&device->ig_list, &ig.device_list);

inc_refcnt:
	device->refcount++;
	ISER_INFO("device %p refcount %d", device, device->refcount);
out:
	sx_xunlock(&ig.device_list_mutex);
	return (device);
}

/* if there's no demand for this device, release it */
static void
iser_device_try_release(struct iser_device *device)
{
	sx_xlock(&ig.device_list_mutex);
	device->refcount--;
	ISER_INFO("device %p refcount %d", device, device->refcount);
	if (!device->refcount) {
		iser_free_device_ib_res(device);
		list_del(&device->ig_list);
		free(device, M_ISER_VERBS);
		device = NULL;
	}
	sx_xunlock(&ig.device_list_mutex);
}

/**
 * Called with state mutex held
 **/
static int
iser_conn_state_comp_exch(struct iser_conn *iser_conn,
			  enum iser_conn_state comp,
			  enum iser_conn_state exch)
{
	int ret;

	ret = (iser_conn->state == comp);
	if (ret)
		iser_conn->state = exch;

	return ret;
}
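
/*
 * Usage note for the helper above: atomicity comes from the caller holding
 * state_mutex, and the helper returns non-zero only for the caller that
 * actually performed the transition.  The terminate path below relies on it
 * so that only the first caller moves the connection UP -> TERMINATING, e.g.:
 */
#if 0	/* illustrative sketch only; the real call sits in iser_conn_terminate() */
	if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
				       ISER_CONN_TERMINATING))
		return (0);	/* teardown already in progress */
#endif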

/**
 * iser_free_ib_conn_res - release IB related resources
 * @iser_conn: iser connection struct
 * @destroy: indicator if we need to try to release the
 *           iser device and memory regions pool (only iscsi
 *           shutdown and DEVICE_REMOVAL will use this).
 *
 * This routine is called with the iser state mutex held
 * so the cm_id removal is out of here. It is safe to
 * be invoked multiple times.
 */
void
iser_free_ib_conn_res(struct iser_conn *iser_conn, bool destroy)
{
	struct ib_conn *ib_conn = &iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;

	ISER_INFO("freeing conn %p cma_id %p qp %p",
		  iser_conn, ib_conn->cma_id, ib_conn->qp);

	if (ib_conn->qp != NULL) {
		mtx_lock(&ig.connlist_mutex);
		ib_conn->comp->active_qps--;
		mtx_unlock(&ig.connlist_mutex);
		rdma_destroy_qp(ib_conn->cma_id);
		ib_conn->qp = NULL;
	}

	if (destroy) {
		if (iser_conn->login_buf)
			iser_free_login_buf(iser_conn);

		if (iser_conn->rx_descs)
			iser_free_rx_descriptors(iser_conn);

		if (device != NULL) {
			iser_device_try_release(device);
			ib_conn->device = NULL;
		}
	}
}

/**
 * Triggers the start of the disconnect procedures and waits for them to
 * be done.
 * Called with state mutex held
 */
int
iser_conn_terminate(struct iser_conn *iser_conn)
{
	struct ib_conn *ib_conn = &iser_conn->ib_conn;
	struct ib_send_wr *bad_send_wr;
	struct ib_recv_wr *bad_recv_wr;
	int err = 0;

	/* terminate the iser conn only if the conn state is UP */
	if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
				       ISER_CONN_TERMINATING))
		return (0);

	ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state);

	if (ib_conn->qp == NULL) {
		/* HOW can this be??? */
		ISER_WARN("qp wasn't created");
		return (1);
	}

	/*
	 * TODO: This is a temporary workaround.
	 * We serialize the connection closure using a global lock in order to
	 * receive all posted beacon completions.
	 * Without serialization, in case we open many connections (QPs) on
	 * the same CQ, we might miss beacons because of missing interrupts.
	 */
	sx_xlock(&ig.close_conns_mutex);

	/*
	 * In case we didn't already clean up the cma_id (peer initiated
	 * a disconnection), we need to cause the CMA to change the QP
	 * state to ERROR.
	 */
	if (ib_conn->cma_id) {
		err = rdma_disconnect(ib_conn->cma_id);
		if (err)
			ISER_ERR("Failed to disconnect, conn: 0x%p err %d",
				 iser_conn, err);

		mtx_lock(&ib_conn->beacon.flush_lock);
		memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr));
		ib_conn->beacon.send.wr_id = ISER_BEACON_WRID;
		ib_conn->beacon.send.opcode = IB_WR_SEND;
		/* post an indication that all send flush errors were consumed */
		err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send,
				   &bad_send_wr);
		if (err) {
			ISER_ERR("conn %p failed to post send_beacon", ib_conn);
			mtx_unlock(&ib_conn->beacon.flush_lock);
			goto out;
		}

		ISER_DBG("before send cv_wait: %p", iser_conn);
		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
		ISER_DBG("after send cv_wait: %p", iser_conn);

		memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr));
		ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID;
		/* post an indication that all recv flush errors were consumed */
		err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv,
				   &bad_recv_wr);
		if (err) {
			ISER_ERR("conn %p failed to post recv_beacon", ib_conn);
			mtx_unlock(&ib_conn->beacon.flush_lock);
			goto out;
		}

		ISER_DBG("before recv cv_wait: %p", iser_conn);
		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
		mtx_unlock(&ib_conn->beacon.flush_lock);
		ISER_DBG("after recv cv_wait: %p", iser_conn);
	}
out:
	sx_xunlock(&ig.close_conns_mutex);

	return (1);
}

/**
 * Called with state mutex held
 **/
static void
iser_connect_error(struct rdma_cm_id *cma_id)
{
	struct iser_conn *iser_conn;

	iser_conn = cma_id->context;

	ISER_ERR("conn %p", iser_conn);

	iser_conn->state = ISER_CONN_TERMINATING;
	cv_signal(&iser_conn->up_cv);
}

/**
 * Called with state mutex held
 **/
static void
iser_addr_handler(struct rdma_cm_id *cma_id)
{
	struct iser_device *device;
	struct iser_conn
/**
 * Called with state mutex held
 **/
static void
iser_addr_handler(struct rdma_cm_id *cma_id)
{
	struct iser_device *device;
	struct iser_conn *iser_conn;
	struct ib_conn *ib_conn;
	int ret;

	iser_conn = cma_id->context;

	ib_conn = &iser_conn->ib_conn;
	device = iser_device_find_by_ib_device(cma_id);
	if (!device) {
		ISER_ERR("conn %p device lookup/creation failed", iser_conn);
		iser_connect_error(cma_id);
		return;
	}

	ib_conn->device = device;

	ret = rdma_resolve_route(cma_id, 1000);
	if (ret) {
		ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret);
		iser_connect_error(cma_id);
		return;
	}
}

/**
 * Called with state mutex held
 **/
static void
iser_route_handler(struct rdma_cm_id *cma_id)
{
	struct rdma_conn_param conn_param;
	int ret;
	struct iser_cm_hdr req_hdr;
	struct iser_conn *iser_conn = cma_id->context;
	struct ib_conn *ib_conn = &iser_conn->ib_conn;
	struct iser_device *device = ib_conn->device;

	ret = iser_create_ib_conn_res(ib_conn);
	if (ret)
		goto failure;

	memset(&conn_param, 0, sizeof conn_param);
	conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
	conn_param.retry_count = 7;
	conn_param.rnr_retry_count = 6;
	/*
	 * Initiator depth should not be set, but we keep it set in order
	 * to stay compatible with old targets.
	 */
	conn_param.initiator_depth = 1;

	memset(&req_hdr, 0, sizeof(req_hdr));
	req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
	    ISER_SEND_W_INV_NOT_SUPPORTED);
	conn_param.private_data = (void *)&req_hdr;
	conn_param.private_data_len = sizeof(struct iser_cm_hdr);

	ret = rdma_connect(cma_id, &conn_param);
	if (ret) {
		ISER_ERR("conn %p failure connecting: %d", iser_conn, ret);
		goto failure;
	}

	return;
failure:
	iser_connect_error(cma_id);
}

/**
 * Called with state mutex held
 **/
static void
iser_connected_handler(struct rdma_cm_id *cma_id)
{
	struct iser_conn *iser_conn;
	struct ib_qp_attr attr;
	struct ib_qp_init_attr init_attr;

	iser_conn = cma_id->context;

	(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);

	ISER_INFO("remote qpn:%x my qpn:%x",
	    attr.dest_qp_num, cma_id->qp->qp_num);

	iser_conn->state = ISER_CONN_UP;

	cv_signal(&iser_conn->up_cv);
}

/**
 * Called with state mutex held
 **/
static void
iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy)
{
	struct iser_conn *iser_conn = cma_id->context;

	if (iser_conn_terminate(iser_conn))
		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
}

int
iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
{
	struct iser_conn *iser_conn;
	int ret = 0;

	iser_conn = cma_id->context;
	ISER_INFO("event %d status %d conn %p id %p",
	    event->event, event->status, cma_id->context, cma_id);

	sx_xlock(&iser_conn->state_mutex);
	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		iser_addr_handler(cma_id);
		break;
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		iser_route_handler(cma_id);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		iser_connected_handler(cma_id);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_REJECTED:
		iser_connect_error(cma_id);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		iser_cleanup_handler(cma_id, false);
		break;
	default:
		ISER_ERR("Unexpected RDMA CM event (%d)", event->event);
		break;
	}
	sx_xunlock(&iser_conn->state_mutex);

	return (ret);
}
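
/*
 * Illustrative sketch only -- a hypothetical helper, not part of this
 * driver.  The receive helpers below hand buffers to the HCA by linking
 * ib_recv_wr structures into a NULL-terminated chain and posting the
 * whole batch with a single ib_post_recv() verb; the same chaining idiom
 * in isolation looks like this:
 */
static inline int
iser_example_post_recv_chain(struct ib_qp *qp, struct ib_recv_wr *wrs,
    int count)
{
	struct ib_recv_wr *bad_wr;
	int i;

	if (count == 0)
		return (0);

	/* link the work requests and mark the end of the chain */
	for (i = 0; i < count; i++)
		wrs[i].next = (i == count - 1) ? NULL : &wrs[i + 1];

	return (ib_post_recv(qp, wrs, &bad_wr));
}
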
int
iser_post_recvl(struct iser_conn *iser_conn)
{
	struct ib_recv_wr rx_wr, *rx_wr_failed;
	struct ib_conn *ib_conn = &iser_conn->ib_conn;
	struct ib_sge sge;
	int ib_ret;

	sge.addr = iser_conn->login_resp_dma;
	sge.length = ISER_RX_LOGIN_SIZE;
	sge.lkey = ib_conn->device->mr->lkey;

	rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf;
	rx_wr.sg_list = &sge;
	rx_wr.num_sge = 1;
	rx_wr.next = NULL;

	ib_conn->post_recv_buf_count++;
	ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
	if (ib_ret) {
		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
		ib_conn->post_recv_buf_count--;
	}

	return (ib_ret);
}

int
iser_post_recvm(struct iser_conn *iser_conn, int count)
{
	struct ib_recv_wr *rx_wr, *rx_wr_failed;
	int i, ib_ret;
	struct ib_conn *ib_conn = &iser_conn->ib_conn;
	unsigned int my_rx_head = iser_conn->rx_desc_head;
	struct iser_rx_desc *rx_desc;

	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
		rx_desc = &iser_conn->rx_descs[my_rx_head];
		rx_wr->wr_id = (uintptr_t)rx_desc;
		rx_wr->sg_list = &rx_desc->rx_sg;
		rx_wr->num_sge = 1;
		rx_wr->next = rx_wr + 1;
		my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos;
	}

	rx_wr--;
	rx_wr->next = NULL; /* mark end of work requests list */

	ib_conn->post_recv_buf_count += count;
	ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
	if (ib_ret) {
		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
		ib_conn->post_recv_buf_count -= count;
	} else
		iser_conn->rx_desc_head = my_rx_head;

	return (ib_ret);
}

/**
 * iser_post_send - Initiate a Send DTO operation
 *
 * returns 0 on success, nonzero on failure
 */
int
iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
	       bool signal)
{
	int ib_ret;
	struct ib_send_wr send_wr, *send_wr_failed;

	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
	    tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);

	send_wr.next = NULL;
	send_wr.wr_id = (uintptr_t)tx_desc;
	send_wr.sg_list = tx_desc->tx_sg;
	send_wr.num_sge = tx_desc->num_sge;
	send_wr.opcode = IB_WR_SEND;
	send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;

	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
	if (ib_ret)
		ISER_ERR("ib_post_send failed, ret:%d", ib_ret);

	return (ib_ret);
}
Index: projects/bsd_rdma_4_9/sys/modules/iser/Makefile
===================================================================
--- projects/bsd_rdma_4_9/sys/modules/iser/Makefile (revision 325890)
+++ projects/bsd_rdma_4_9/sys/modules/iser/Makefile (revision 325891)
@@ -1,32 +1,33 @@
# $FreeBSD$

.PATH: ${SRCTOP}/sys/dev/iser/

.include <bsd.own.mk>

KMOD= iser

SRCS= icl_iser.c
SRCS+= iser_initiator.c
SRCS+= iser_memory.c
SRCS+= iser_verbs.c
SRCS+= vnode_if.h
SRCS+= opt_inet.h
SRCS+= opt_inet6.h
SRCS+= opt_cam.h
SRCS+= bus_if.h
SRCS+= device_if.h
SRCS+= icl_conn_if.h

CFLAGS+= -I${SRCTOP}/sys/
CFLAGS+= -I${SYSDIR}/ofed/include
+CFLAGS+= -I${SYSDIR}/ofed/include/uapi
CFLAGS+= -I${SYSDIR}/compat/linuxkpi/common/include
CFLAGS+= -DCONFIG_INFINIBAND_USER_MEM
CFLAGS+= -DINET6 -DINET
CFLAGS+= -fms-extensions
CFLAGS+=-DICL_KERNEL_PROXY

MFILES= kern/bus_if.m kern/device_if.m dev/iscsi/icl_conn_if.m

.include <bsd.kmod.mk>