Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h (revision 323642) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp.h (revision 323643) @@ -1,726 +1,724 @@ #ifndef _SDP_H_ #define _SDP_H_ #define LINUXKPI_PARAM_PREFIX ib_sdp_ #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ofed.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #ifdef SDP_DEBUG #define CONFIG_INFINIBAND_SDP_DEBUG #endif #include "sdp_dbg.h" #undef LIST_HEAD /* From sys/queue.h */ #define LIST_HEAD(name, type) \ struct name { \ struct type *lh_first; /* first element */ \ } /* Interval between successive polls in the Tx routine when polling is used instead of interrupts (in per-core Tx rings) - should be power of 2 */ #define SDP_TX_POLL_MODER 16 #define SDP_TX_POLL_TIMEOUT (HZ / 20) #define SDP_NAGLE_TIMEOUT (HZ / 10) #define SDP_SRCAVAIL_CANCEL_TIMEOUT (HZ * 5) #define SDP_SRCAVAIL_ADV_TIMEOUT (1 * HZ) #define SDP_SRCAVAIL_PAYLOAD_LEN 1 #define SDP_RESOLVE_TIMEOUT 1000 #define SDP_ROUTE_TIMEOUT 1000 #define SDP_RETRY_COUNT 5 #define SDP_KEEPALIVE_TIME (120 * 60 * HZ) #define SDP_FIN_WAIT_TIMEOUT (60 * HZ) /* like TCP_FIN_TIMEOUT */ #define SDP_TX_SIZE 0x40 #define SDP_RX_SIZE 0x40 #define SDP_FMR_SIZE (MIN(0x1000, PAGE_SIZE) / sizeof(u64)) #define SDP_FMR_POOL_SIZE 1024 #define SDP_FMR_DIRTY_SIZE ( SDP_FMR_POOL_SIZE / 4 ) #define SDP_MAX_RDMA_READ_LEN (PAGE_SIZE * (SDP_FMR_SIZE - 2)) /* mb inlined data len - rest will be rx'ed into frags */ #define SDP_HEAD_SIZE (sizeof(struct sdp_bsdh)) /* limit tx payload len, if the sink supports bigger buffers than the source * can handle. * or rx fragment size (limited by sge->length size) */ #define SDP_MAX_PACKET (1 << 16) #define SDP_MAX_PAYLOAD (SDP_MAX_PACKET - SDP_HEAD_SIZE) #define SDP_MAX_RECV_SGES (SDP_MAX_PACKET / MCLBYTES) #define SDP_MAX_SEND_SGES (SDP_MAX_PACKET / MCLBYTES) + 2 #define SDP_NUM_WC 4 #define SDP_DEF_ZCOPY_THRESH 64*1024 #define SDP_MIN_ZCOPY_THRESH PAGE_SIZE #define SDP_MAX_ZCOPY_THRESH 1048576 #define SDP_OP_RECV 0x800000000LL #define SDP_OP_SEND 0x400000000LL #define SDP_OP_RDMA 0x200000000LL #define SDP_OP_NOP 0x100000000LL /* how long (in jiffies) to block sender till tx completion*/ #define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10) #define SDP_AUTO_CONF 0xffff #define AUTO_MOD_DELAY (HZ / 4) struct sdp_mb_cb { __u32 seq; /* Starting sequence number */ struct bzcopy_state *bz; struct rx_srcavail_state *rx_sa; struct tx_srcavail_state *tx_sa; }; #define M_PUSH M_PROTO1 /* Do a 'push'. */ #define M_URG M_PROTO2 /* Mark as urgent (oob). */ #define SDP_SKB_CB(__mb) ((struct sdp_mb_cb *)&((__mb)->cb[0])) #define BZCOPY_STATE(mb) (SDP_SKB_CB(mb)->bz) #define RX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->rx_sa) #define TX_SRCAVAIL_STATE(mb) (SDP_SKB_CB(mb)->tx_sa) #ifndef MIN #define MIN(a, b) (a < b ? 
a : b) #endif #define ring_head(ring) (atomic_read(&(ring).head)) #define ring_tail(ring) (atomic_read(&(ring).tail)) #define ring_posted(ring) (ring_head(ring) - ring_tail(ring)) #define rx_ring_posted(ssk) ring_posted(ssk->rx_ring) #ifdef SDP_ZCOPY #define tx_ring_posted(ssk) (ring_posted(ssk->tx_ring) + \ (ssk->tx_ring.rdma_inflight ? ssk->tx_ring.rdma_inflight->busy : 0)) #else #define tx_ring_posted(ssk) ring_posted(ssk->tx_ring) #endif extern int sdp_zcopy_thresh; extern int rcvbuf_initial_size; extern struct workqueue_struct *rx_comp_wq; extern struct ib_client sdp_client; enum sdp_mid { SDP_MID_HELLO = 0x0, SDP_MID_HELLO_ACK = 0x1, SDP_MID_DISCONN = 0x2, SDP_MID_ABORT = 0x3, SDP_MID_SENDSM = 0x4, SDP_MID_RDMARDCOMPL = 0x6, SDP_MID_SRCAVAIL_CANCEL = 0x8, SDP_MID_CHRCVBUF = 0xB, SDP_MID_CHRCVBUF_ACK = 0xC, SDP_MID_SINKAVAIL = 0xFD, SDP_MID_SRCAVAIL = 0xFE, SDP_MID_DATA = 0xFF, }; enum sdp_flags { SDP_OOB_PRES = 1 << 0, SDP_OOB_PEND = 1 << 1, }; enum { SDP_MIN_TX_CREDITS = 2 }; enum { SDP_ERR_ERROR = -4, SDP_ERR_FAULT = -3, SDP_NEW_SEG = -2, SDP_DO_WAIT_MEM = -1 }; struct sdp_bsdh { u8 mid; u8 flags; __u16 bufs; __u32 len; __u32 mseq; __u32 mseq_ack; } __attribute__((__packed__)); union cma_ip_addr { struct in6_addr ip6; struct { __u32 pad[3]; __u32 addr; } ip4; } __attribute__((__packed__)); /* TODO: too much? Can I avoid having the src/dst and port here? */ struct sdp_hh { struct sdp_bsdh bsdh; u8 majv_minv; u8 ipv_cap; u8 rsvd1; u8 max_adverts; __u32 desremrcvsz; __u32 localrcvsz; __u16 port; __u16 rsvd2; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; u8 rsvd3[IB_CM_REQ_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 48]; } __attribute__((__packed__)); struct sdp_hah { struct sdp_bsdh bsdh; u8 majv_minv; u8 ipv_cap; u8 rsvd1; u8 ext_max_adverts; __u32 actrcvsz; u8 rsvd2[IB_CM_REP_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 8]; } __attribute__((__packed__)); struct sdp_rrch { __u32 len; } __attribute__((__packed__)); struct sdp_srcah { __u32 len; __u32 rkey; __u64 vaddr; } __attribute__((__packed__)); struct sdp_buf { struct mbuf *mb; u64 mapping[SDP_MAX_SEND_SGES]; } __attribute__((__packed__)); struct sdp_chrecvbuf { u32 size; } __attribute__((__packed__)); /* Context used for synchronous zero copy bcopy (BZCOPY) */ struct bzcopy_state { unsigned char __user *u_base; int u_len; int left; int page_cnt; int cur_page; int cur_offset; int busy; struct sdp_sock *ssk; struct page **pages; }; enum rx_sa_flag { RX_SA_ABORTED = 2, }; enum tx_sa_flag { TX_SA_SENDSM = 0x01, TX_SA_CROSS_SEND = 0x02, TX_SA_INTRRUPTED = 0x04, TX_SA_TIMEDOUT = 0x08, TX_SA_ERROR = 0x10, }; struct rx_srcavail_state { /* Advertised buffer stuff */ u32 mseq; u32 used; u32 reported; u32 len; u32 rkey; u64 vaddr; /* Dest buff info */ struct ib_umem *umem; struct ib_pool_fmr *fmr; /* Utility */ u8 busy; enum rx_sa_flag flags; }; struct tx_srcavail_state { /* Data below 'busy' will be reset */ u8 busy; struct ib_umem *umem; struct ib_pool_fmr *fmr; u32 bytes_sent; u32 bytes_acked; enum tx_sa_flag abort_flags; u8 posted; u32 mseq; }; struct sdp_tx_ring { #ifdef SDP_ZCOPY struct rx_srcavail_state *rdma_inflight; #endif struct sdp_buf *buffer; atomic_t head; atomic_t tail; struct ib_cq *cq; atomic_t credits; #define tx_credits(ssk) (atomic_read(&ssk->tx_ring.credits)) struct callout timer; u16 poll_cnt; }; struct sdp_rx_ring { struct sdp_buf *buffer; atomic_t head; atomic_t tail; struct ib_cq *cq; int destroyed; struct rwlock destroyed_lock; }; struct sdp_device { struct ib_pd *pd; - struct ib_mr *mr; struct 
ib_fmr_pool *fmr_pool;
};

struct sdp_moderation {
	unsigned long last_moder_packets;
	unsigned long last_moder_tx_packets;
	unsigned long last_moder_bytes;
	unsigned long last_moder_jiffies;
	int last_moder_time;
	u16 rx_usecs;
	u16 rx_frames;
	u16 tx_usecs;
	u32 pkt_rate_low;
	u16 rx_usecs_low;
	u32 pkt_rate_high;
	u16 rx_usecs_high;
	u16 sample_interval;
	u16 adaptive_rx_coal;
	u32 msg_enable;
	int moder_cnt;
	int moder_time;
};

/* These are flags fields. */
#define	SDP_TIMEWAIT	0x0001		/* In ssk timewait state. */
#define	SDP_DROPPED	0x0002		/* Socket has been dropped. */
#define	SDP_SOCKREF	0x0004		/* Holding a sockref for close. */
#define	SDP_NODELAY	0x0008		/* Disable Nagle. */
#define	SDP_NEEDFIN	0x0010		/* Send a fin on the next tx. */
#define	SDP_DREQWAIT	0x0020		/* Waiting on DREQ. */
#define	SDP_DESTROY	0x0040		/* Being destroyed. */
#define	SDP_DISCON	0x0080		/* rdma_disconnect is owed. */

/* These are oobflags */
#define	SDP_HADOOB	0x0001		/* Had OOB data. */
#define	SDP_HAVEOOB	0x0002		/* Have OOB data. */

struct sdp_sock {
	LIST_ENTRY(sdp_sock) list;
	struct socket *socket;
	struct rdma_cm_id *id;
	struct ib_device *ib_device;
	struct sdp_device *sdp_dev;
	struct ib_qp *qp;
	struct ucred *cred;
	struct callout keep2msl;	/* 2msl and keepalive timer. */
	struct callout nagle_timer;	/* timeout waiting for ack */
	struct ib_ucontext context;
	in_port_t lport;
	in_addr_t laddr;
	in_port_t fport;
	in_addr_t faddr;
	int flags;
	int oobflags;		/* protected by rx lock. */
	int state;
	int softerror;
	int recv_bytes;		/* Bytes per recv. buf including header */
	int xmit_size_goal;
	char iobc;

	struct sdp_rx_ring rx_ring;
	struct sdp_tx_ring tx_ring;
	struct rwlock lock;
	struct mbufq rxctlq;	/* received control packets */

	int qp_active;		/* XXX Flag. */
	int max_sge;
	struct work_struct rx_comp_work;
#define	rcv_nxt(ssk)	atomic_read(&(ssk->rcv_nxt))
	atomic_t rcv_nxt;

	/* SDP specific */
	atomic_t mseq_ack;
#define	mseq_ack(ssk)	(atomic_read(&ssk->mseq_ack))
	unsigned max_bufs;	/* Initial buffers offered by other side */
	unsigned min_bufs;	/* Low water mark to wake senders */

	unsigned long nagle_last_unacked; /* mseq of latest unacked packet */

	atomic_t remote_credits;
#define	remote_credits(ssk)	(atomic_read(&ssk->remote_credits))
	int poll_cq;

	/* SDP slow start */
	int recv_request_head;	/* mark the rx_head when the resize request
				   was received */
	int recv_request;	/* XXX flag if request to resize was received */

	unsigned long tx_packets;
	unsigned long rx_packets;
	unsigned long tx_bytes;
	unsigned long rx_bytes;
	struct sdp_moderation auto_mod;
	struct task shutdown_task;
#ifdef SDP_ZCOPY
	struct tx_srcavail_state *tx_sa;
	struct rx_srcavail_state *rx_sa;
	spinlock_t tx_sa_lock;
	struct delayed_work srcavail_cancel_work;
	int srcavail_cancel_mseq;
	/* ZCOPY data: -1:use global; 0:disable zcopy; >0: zcopy threshold */
	int zcopy_thresh;
#endif
};

#define	sdp_sk(so)	((struct sdp_sock *)(so->so_pcb))

#define	SDP_RLOCK(ssk)		rw_rlock(&(ssk)->lock)
#define	SDP_WLOCK(ssk)		rw_wlock(&(ssk)->lock)
#define	SDP_RUNLOCK(ssk)	rw_runlock(&(ssk)->lock)
#define	SDP_WUNLOCK(ssk)	rw_wunlock(&(ssk)->lock)
#define	SDP_WLOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_WLOCKED)
#define	SDP_RLOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_RLOCKED)
#define	SDP_LOCK_ASSERT(ssk)	rw_assert(&(ssk)->lock, RA_LOCKED)

MALLOC_DECLARE(M_SDP);

static inline void
tx_sa_reset(struct tx_srcavail_state *tx_sa)
{
	memset((void *)&tx_sa->busy, 0,
			sizeof(*tx_sa) - offsetof(typeof(*tx_sa), busy));
}

static inline void
rx_ring_unlock(struct sdp_rx_ring *rx_ring)
{
	rw_runlock(&rx_ring->destroyed_lock);
}
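/*
 * Editorial sketch (not part of the original revision): destroyed_lock
 * acts as a teardown gate.  Completion paths take the read lock via
 * rx_ring_trylock() below and back off once rx_ring_destroy_lock() has
 * published the destroyed flag under the write lock.  A hypothetical
 * poller would use the pair like this:
 *
 *	static void
 *	sdp_rx_poll_example(struct sdp_sock *ssk)
 *	{
 *		if (!rx_ring_trylock(&ssk->rx_ring))
 *			return;		// ring is being torn down
 *		// ... poll ssk->rx_ring.cq and repost receives ...
 *		rx_ring_unlock(&ssk->rx_ring);
 *	}
 */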
static inline int
rx_ring_trylock(struct sdp_rx_ring *rx_ring)
{
	rw_rlock(&rx_ring->destroyed_lock);
	if (rx_ring->destroyed) {
		rx_ring_unlock(rx_ring);
		return 0;
	}
	return 1;
}

static inline void
rx_ring_destroy_lock(struct sdp_rx_ring *rx_ring)
{
	rw_wlock(&rx_ring->destroyed_lock);
	rx_ring->destroyed = 1;
	rw_wunlock(&rx_ring->destroyed_lock);
}

static inline void
sdp_arm_rx_cq(struct sdp_sock *ssk)
{
	sdp_prf(ssk->socket, NULL, "Arming RX cq");
	sdp_dbg_data(ssk->socket, "Arming RX cq\n");

	ib_req_notify_cq(ssk->rx_ring.cq, IB_CQ_NEXT_COMP);
}

static inline void
sdp_arm_tx_cq(struct sdp_sock *ssk)
{
	sdp_prf(ssk->socket, NULL, "Arming TX cq");
	sdp_dbg_data(ssk->socket, "Arming TX cq. credits: %d, posted: %d\n",
		tx_credits(ssk), tx_ring_posted(ssk));

	ib_req_notify_cq(ssk->tx_ring.cq, IB_CQ_NEXT_COMP);
}

/* return the min of:
 * - tx credits
 * - free slots in tx_ring (not including SDP_MIN_TX_CREDITS)
 */
static inline int
tx_slots_free(struct sdp_sock *ssk)
{
	int min_free;

	min_free = MIN(tx_credits(ssk),
			SDP_TX_SIZE - tx_ring_posted(ssk));
	if (min_free < SDP_MIN_TX_CREDITS)
		return 0;

	return min_free - SDP_MIN_TX_CREDITS;
};

/* utilities */
static inline char *
mid2str(int mid)
{
#define ENUM2STR(e) [e] = #e
	static char *mid2str[] = {
		ENUM2STR(SDP_MID_HELLO),
		ENUM2STR(SDP_MID_HELLO_ACK),
		ENUM2STR(SDP_MID_ABORT),
		ENUM2STR(SDP_MID_DISCONN),
		ENUM2STR(SDP_MID_SENDSM),
		ENUM2STR(SDP_MID_RDMARDCOMPL),
		ENUM2STR(SDP_MID_SRCAVAIL_CANCEL),
		ENUM2STR(SDP_MID_CHRCVBUF),
		ENUM2STR(SDP_MID_CHRCVBUF_ACK),
		ENUM2STR(SDP_MID_DATA),
		ENUM2STR(SDP_MID_SRCAVAIL),
		ENUM2STR(SDP_MID_SINKAVAIL),
	};

	if (mid >= ARRAY_SIZE(mid2str))
		return NULL;

	return mid2str[mid];
}

static inline struct mbuf *
sdp_alloc_mb(struct socket *sk, u8 mid, int size, int wait)
{
	struct sdp_bsdh *h;
	struct mbuf *mb;

	MGETHDR(mb, wait, MT_DATA);
	if (mb == NULL)
		return (NULL);
	mb->m_pkthdr.len = mb->m_len = sizeof(struct sdp_bsdh);
	h = mtod(mb, struct sdp_bsdh *);
	h->mid = mid;

	return mb;
}

static inline struct mbuf *
sdp_alloc_mb_data(struct socket *sk, int wait)
{
	return sdp_alloc_mb(sk, SDP_MID_DATA, 0, wait);
}

static inline struct mbuf *
sdp_alloc_mb_disconnect(struct socket *sk, int wait)
{
	return sdp_alloc_mb(sk, SDP_MID_DISCONN, 0, wait);
}

static inline void *
mb_put(struct mbuf *mb, int len)
{
	uint8_t *data;

	data = mb->m_data;
	data += mb->m_len;
	mb->m_len += len;

	return (void *)data;
}

static inline struct mbuf *
sdp_alloc_mb_chrcvbuf_ack(struct socket *sk, int size, int wait)
{
	struct mbuf *mb;
	struct sdp_chrecvbuf *resp_size;

	mb = sdp_alloc_mb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), wait);
	if (mb == NULL)
		return (NULL);
	resp_size = (struct sdp_chrecvbuf *)mb_put(mb, sizeof *resp_size);
	resp_size->size = htonl(size);

	return mb;
}

static inline struct mbuf *
sdp_alloc_mb_srcavail(struct socket *sk, u32 len, u32 rkey, u64 vaddr, int wait)
{
	struct mbuf *mb;
	struct sdp_srcah *srcah;

	mb = sdp_alloc_mb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), wait);
	if (mb == NULL)
		return (NULL);
	srcah = (struct sdp_srcah *)mb_put(mb, sizeof(*srcah));
	srcah->len = htonl(len);
	srcah->rkey = htonl(rkey);
	srcah->vaddr = cpu_to_be64(vaddr);

	return mb;
}

static inline struct mbuf *
sdp_alloc_mb_srcavail_cancel(struct socket *sk, int wait)
{
	return sdp_alloc_mb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, wait);
}

static inline struct mbuf *
sdp_alloc_mb_rdmardcompl(struct socket *sk, u32 len, int wait)
{
	struct mbuf *mb;
	struct sdp_rrch *rrch;

	mb = sdp_alloc_mb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), wait);
	if (mb == NULL)
		return (NULL);
	rrch = (struct sdp_rrch *)mb_put(mb,
sizeof(*rrch)); rrch->len = htonl(len); return mb; } static inline struct mbuf * sdp_alloc_mb_sendsm(struct socket *sk, int wait) { return sdp_alloc_mb(sk, SDP_MID_SENDSM, 0, wait); } static inline int sdp_tx_ring_slots_left(struct sdp_sock *ssk) { return SDP_TX_SIZE - tx_ring_posted(ssk); } static inline int credit_update_needed(struct sdp_sock *ssk) { int c; c = remote_credits(ssk); if (likely(c > SDP_MIN_TX_CREDITS)) c += c/2; return unlikely(c < rx_ring_posted(ssk)) && likely(tx_credits(ssk) > 0) && likely(sdp_tx_ring_slots_left(ssk)); } #define SDPSTATS_COUNTER_INC(stat) #define SDPSTATS_COUNTER_ADD(stat, val) #define SDPSTATS_COUNTER_MID_INC(stat, mid) #define SDPSTATS_HIST_LINEAR(stat, size) #define SDPSTATS_HIST(stat, size) static inline void sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf, enum dma_data_direction dir) { struct ib_device *dev; struct mbuf *mb; int i; dev = ssk->ib_device; for (i = 0, mb = sbuf->mb; mb != NULL; mb = mb->m_next, i++) ib_dma_unmap_single(dev, sbuf->mapping[i], mb->m_len, dir); } /* sdp_main.c */ void sdp_set_default_moderation(struct sdp_sock *ssk); void sdp_start_keepalive_timer(struct socket *sk); void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb); void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk); void sdp_abort(struct socket *sk); struct sdp_sock *sdp_notify(struct sdp_sock *ssk, int error); /* sdp_cma.c */ int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *); /* sdp_tx.c */ int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device); void sdp_tx_ring_destroy(struct sdp_sock *ssk); int sdp_xmit_poll(struct sdp_sock *ssk, int force); void sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb); void sdp_post_sends(struct sdp_sock *ssk, int wait); void sdp_post_keepalive(struct sdp_sock *ssk); /* sdp_rx.c */ void sdp_rx_ring_init(struct sdp_sock *ssk); int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device); void sdp_rx_ring_destroy(struct sdp_sock *ssk); int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size); int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size); void sdp_do_posts(struct sdp_sock *ssk); void sdp_rx_comp_full(struct sdp_sock *ssk); /* sdp_zcopy.c */ struct kiocb; int sdp_sendmsg_zcopy(struct kiocb *iocb, struct socket *sk, struct iovec *iov); int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah); void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack); void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack, u32 bytes_completed); int sdp_handle_rdma_read_cqe(struct sdp_sock *ssk); int sdp_rdma_to_iovec(struct socket *sk, struct iovec *iov, struct mbuf *mb, unsigned long *used); int sdp_post_rdma_rd_compl(struct sdp_sock *ssk, struct rx_srcavail_state *rx_sa); int sdp_post_sendsm(struct socket *sk); void srcavail_cancel_timeout(struct work_struct *work); void sdp_abort_srcavail(struct socket *sk); void sdp_abort_rdma_read(struct socket *sk); int sdp_process_rx(struct sdp_sock *ssk); #endif Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c (revision 323642) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_cma.c (revision 323643) @@ -1,456 +1,455 @@ /* * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * $Id$ */ #include "sdp.h" #define SDP_MAJV_MINV 0x22 SDP_MODPARAM_SINT(sdp_link_layer_ib_only, 1, "Support only link layer of " "type Infiniband"); enum { SDP_HH_SIZE = 76, SDP_HAH_SIZE = 180, }; static void sdp_qp_event_handler(struct ib_event *event, void *data) { } static int sdp_get_max_dev_sge(struct ib_device *dev) { - struct ib_device_attr attr; + struct ib_device_attr *device_attr; static int max_sges = -1; if (max_sges > 0) goto out; - ib_query_device(dev, &attr); - - max_sges = attr.max_sge; + device_attr = &dev->attrs; + max_sges = device_attr->max_sge; out: return max_sges; } static int sdp_init_qp(struct socket *sk, struct rdma_cm_id *id) { struct ib_qp_init_attr qp_init_attr = { .event_handler = sdp_qp_event_handler, .cap.max_send_wr = SDP_TX_SIZE, .cap.max_recv_wr = SDP_RX_SIZE, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, }; struct ib_device *device = id->device; struct sdp_sock *ssk; int rc; sdp_dbg(sk, "%s\n", __func__); ssk = sdp_sk(sk); ssk->max_sge = sdp_get_max_dev_sge(device); sdp_dbg(sk, "Max sges: %d\n", ssk->max_sge); qp_init_attr.cap.max_send_sge = MIN(ssk->max_sge, SDP_MAX_SEND_SGES); sdp_dbg(sk, "Setting max send sge to: %d\n", qp_init_attr.cap.max_send_sge); qp_init_attr.cap.max_recv_sge = MIN(ssk->max_sge, SDP_MAX_RECV_SGES); sdp_dbg(sk, "Setting max recv sge to: %d\n", qp_init_attr.cap.max_recv_sge); ssk->sdp_dev = ib_get_client_data(device, &sdp_client); if (!ssk->sdp_dev) { sdp_warn(sk, "SDP not available on device %s\n", device->name); rc = -ENODEV; goto err_rx; } rc = sdp_rx_ring_create(ssk, device); if (rc) goto err_rx; rc = sdp_tx_ring_create(ssk, device); if (rc) goto err_tx; qp_init_attr.recv_cq = ssk->rx_ring.cq; qp_init_attr.send_cq = ssk->tx_ring.cq; rc = rdma_create_qp(id, ssk->sdp_dev->pd, &qp_init_attr); if (rc) { sdp_warn(sk, "Unable to create QP: %d.\n", rc); goto err_qp; } ssk->qp = id->qp; ssk->ib_device = device; ssk->qp_active = 1; ssk->context.device = device; sdp_dbg(sk, "%s done\n", __func__); return 0; err_qp: sdp_tx_ring_destroy(ssk); err_tx: sdp_rx_ring_destroy(ssk); err_rx: return rc; } static int sdp_connect_handler(struct socket *sk, struct rdma_cm_id *id, struct rdma_cm_event *event) { struct sockaddr_in *src_addr; struct sockaddr_in *dst_addr; struct socket *child; const struct sdp_hh *h; struct sdp_sock *ssk; int rc; sdp_dbg(sk, "%s %p -> 
%p\n", __func__, sdp_sk(sk)->id, id); h = event->param.conn.private_data; SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh); if (!h->max_adverts) return -EINVAL; child = sonewconn(sk, SS_ISCONNECTED); if (!child) return -ENOMEM; ssk = sdp_sk(child); rc = sdp_init_qp(child, id); if (rc) return rc; SDP_WLOCK(ssk); id->context = ssk; ssk->id = id; ssk->socket = child; ssk->cred = crhold(child->so_cred); dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr; src_addr = (struct sockaddr_in *)&id->route.addr.src_addr; ssk->fport = dst_addr->sin_port; ssk->faddr = dst_addr->sin_addr.s_addr; ssk->lport = src_addr->sin_port; ssk->max_bufs = ntohs(h->bsdh.bufs); atomic_set(&ssk->tx_ring.credits, ssk->max_bufs); ssk->min_bufs = tx_credits(ssk) / 4; ssk->xmit_size_goal = ntohl(h->localrcvsz) - sizeof(struct sdp_bsdh); sdp_init_buffers(ssk, rcvbuf_initial_size); ssk->state = TCPS_SYN_RECEIVED; SDP_WUNLOCK(ssk); return 0; } static int sdp_response_handler(struct socket *sk, struct rdma_cm_id *id, struct rdma_cm_event *event) { const struct sdp_hah *h; struct sockaddr_in *dst_addr; struct sdp_sock *ssk; sdp_dbg(sk, "%s\n", __func__); ssk = sdp_sk(sk); SDP_WLOCK(ssk); ssk->state = TCPS_ESTABLISHED; sdp_set_default_moderation(ssk); if (ssk->flags & SDP_DROPPED) { SDP_WUNLOCK(ssk); return 0; } if (sk->so_options & SO_KEEPALIVE) sdp_start_keepalive_timer(sk); h = event->param.conn.private_data; SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh); ssk->max_bufs = ntohs(h->bsdh.bufs); atomic_set(&ssk->tx_ring.credits, ssk->max_bufs); ssk->min_bufs = tx_credits(ssk) / 4; ssk->xmit_size_goal = ntohl(h->actrcvsz) - sizeof(struct sdp_bsdh); ssk->poll_cq = 1; dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr; ssk->fport = dst_addr->sin_port; ssk->faddr = dst_addr->sin_addr.s_addr; soisconnected(sk); SDP_WUNLOCK(ssk); return 0; } static int sdp_connected_handler(struct socket *sk, struct rdma_cm_event *event) { struct sdp_sock *ssk; sdp_dbg(sk, "%s\n", __func__); ssk = sdp_sk(sk); SDP_WLOCK(ssk); ssk->state = TCPS_ESTABLISHED; sdp_set_default_moderation(ssk); if (sk->so_options & SO_KEEPALIVE) sdp_start_keepalive_timer(sk); if ((ssk->flags & SDP_DROPPED) == 0) soisconnected(sk); SDP_WUNLOCK(ssk); return 0; } static int sdp_disconnected_handler(struct socket *sk) { struct sdp_sock *ssk; ssk = sdp_sk(sk); sdp_dbg(sk, "%s\n", __func__); SDP_WLOCK_ASSERT(ssk); if (sdp_sk(sk)->state == TCPS_SYN_RECEIVED) { sdp_connected_handler(sk, NULL); if (rcv_nxt(ssk)) return 0; } return -ECONNRESET; } int sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_conn_param conn_param; struct socket *sk; struct sdp_sock *ssk; struct sdp_hah hah; struct sdp_hh hh; int rc = 0; ssk = id->context; sk = NULL; if (ssk) sk = ssk->socket; if (!ssk || !sk || !ssk->id) { sdp_dbg(sk, "cm_id is being torn down, event %d, ssk %p, sk %p, id %p\n", event->event, ssk, sk, id); return event->event == RDMA_CM_EVENT_CONNECT_REQUEST ? -EINVAL : 0; } sdp_dbg(sk, "%s event %d id %p\n", __func__, event->event, id); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_RESOLVED\n"); if (sdp_link_layer_ib_only && rdma_node_get_transport(id->device->node_type) == RDMA_TRANSPORT_IB && rdma_port_get_link_layer(id->device, id->port_num) != IB_LINK_LAYER_INFINIBAND) { sdp_dbg(sk, "Link layer is: %d. 
Only IB link layer " "is allowed\n", rdma_port_get_link_layer(id->device, id->port_num)); rc = -ENETUNREACH; break; } rc = rdma_resolve_route(id, SDP_ROUTE_TIMEOUT); break; case RDMA_CM_EVENT_ADDR_ERROR: sdp_dbg(sk, "RDMA_CM_EVENT_ADDR_ERROR\n"); rc = -ENETUNREACH; break; case RDMA_CM_EVENT_ROUTE_RESOLVED: sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_RESOLVED : %p\n", id); rc = sdp_init_qp(sk, id); if (rc) break; atomic_set(&sdp_sk(sk)->remote_credits, rx_ring_posted(sdp_sk(sk))); memset(&hh, 0, sizeof hh); hh.bsdh.mid = SDP_MID_HELLO; hh.bsdh.len = htonl(sizeof(struct sdp_hh)); hh.max_adverts = 1; hh.ipv_cap = 0x40; hh.majv_minv = SDP_MAJV_MINV; sdp_init_buffers(sdp_sk(sk), rcvbuf_initial_size); hh.bsdh.bufs = htons(rx_ring_posted(sdp_sk(sk))); hh.localrcvsz = hh.desremrcvsz = htonl(sdp_sk(sk)->recv_bytes); hh.max_adverts = 0x1; sdp_sk(sk)->laddr = ((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr; memset(&conn_param, 0, sizeof conn_param); conn_param.private_data_len = sizeof hh; conn_param.private_data = &hh; conn_param.responder_resources = 4 /* TODO */; conn_param.initiator_depth = 4 /* TODO */; conn_param.retry_count = SDP_RETRY_COUNT; SDP_DUMP_PACKET(NULL, "TX", NULL, &hh.bsdh); rc = rdma_connect(id, &conn_param); break; case RDMA_CM_EVENT_ROUTE_ERROR: sdp_dbg(sk, "RDMA_CM_EVENT_ROUTE_ERROR : %p\n", id); rc = -ETIMEDOUT; break; case RDMA_CM_EVENT_CONNECT_REQUEST: sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_REQUEST\n"); rc = sdp_connect_handler(sk, id, event); if (rc) { sdp_dbg(sk, "Destroying qp\n"); rdma_reject(id, NULL, 0); break; } ssk = id->context; atomic_set(&ssk->remote_credits, rx_ring_posted(ssk)); memset(&hah, 0, sizeof hah); hah.bsdh.mid = SDP_MID_HELLO_ACK; hah.bsdh.bufs = htons(rx_ring_posted(ssk)); hah.bsdh.len = htonl(sizeof(struct sdp_hah)); hah.majv_minv = SDP_MAJV_MINV; hah.ext_max_adverts = 1; /* Doesn't seem to be mandated by spec, but just in case */ hah.actrcvsz = htonl(ssk->recv_bytes); memset(&conn_param, 0, sizeof conn_param); conn_param.private_data_len = sizeof hah; conn_param.private_data = &hah; conn_param.responder_resources = 4 /* TODO */; conn_param.initiator_depth = 4 /* TODO */; conn_param.retry_count = SDP_RETRY_COUNT; SDP_DUMP_PACKET(sk, "TX", NULL, &hah.bsdh); rc = rdma_accept(id, &conn_param); if (rc) { ssk->id = NULL; id->qp = NULL; id->context = NULL; } break; case RDMA_CM_EVENT_CONNECT_RESPONSE: sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_RESPONSE\n"); rc = sdp_response_handler(sk, id, event); if (rc) { sdp_dbg(sk, "Destroying qp\n"); rdma_reject(id, NULL, 0); } else rc = rdma_accept(id, NULL); break; case RDMA_CM_EVENT_CONNECT_ERROR: sdp_dbg(sk, "RDMA_CM_EVENT_CONNECT_ERROR\n"); rc = -ETIMEDOUT; break; case RDMA_CM_EVENT_UNREACHABLE: sdp_dbg(sk, "RDMA_CM_EVENT_UNREACHABLE\n"); rc = -ENETUNREACH; break; case RDMA_CM_EVENT_REJECTED: sdp_dbg(sk, "RDMA_CM_EVENT_REJECTED\n"); rc = -ECONNREFUSED; break; case RDMA_CM_EVENT_ESTABLISHED: sdp_dbg(sk, "RDMA_CM_EVENT_ESTABLISHED\n"); sdp_sk(sk)->laddr = ((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr; rc = sdp_connected_handler(sk, event); break; case RDMA_CM_EVENT_DISCONNECTED: /* This means DREQ/DREP received */ sdp_dbg(sk, "RDMA_CM_EVENT_DISCONNECTED\n"); SDP_WLOCK(ssk); if (ssk->state == TCPS_LAST_ACK) { sdp_cancel_dreq_wait_timeout(ssk); sdp_dbg(sk, "%s: waiting for Infiniband tear down\n", __func__); } ssk->qp_active = 0; SDP_WUNLOCK(ssk); rdma_disconnect(id); SDP_WLOCK(ssk); if (ssk->state != TCPS_TIME_WAIT) { if (ssk->state == TCPS_CLOSE_WAIT) { sdp_dbg(sk, "IB teardown while in " 
"TCPS_CLOSE_WAIT taking reference to " "let close() finish the work\n"); } rc = sdp_disconnected_handler(sk); if (rc) rc = -EPIPE; } SDP_WUNLOCK(ssk); break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: sdp_dbg(sk, "RDMA_CM_EVENT_TIMEWAIT_EXIT\n"); SDP_WLOCK(ssk); rc = sdp_disconnected_handler(sk); SDP_WUNLOCK(ssk); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: sdp_dbg(sk, "RDMA_CM_EVENT_DEVICE_REMOVAL\n"); rc = -ENETRESET; break; default: printk(KERN_ERR "SDP: Unexpected CMA event: %d\n", event->event); rc = -ECONNABORTED; break; } sdp_dbg(sk, "event %d done. status %d\n", event->event, rc); if (rc) { SDP_WLOCK(ssk); if (ssk->id == id) { ssk->id = NULL; id->qp = NULL; id->context = NULL; if (sdp_notify(ssk, -rc)) SDP_WUNLOCK(ssk); } else SDP_WUNLOCK(ssk); } return rc; } Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c (revision 323642) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c (revision 323643) @@ -1,1972 +1,1961 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c */ /* * * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include "sdp.h" #include #include #include #include uma_zone_t sdp_zone; struct rwlock sdp_lock; LIST_HEAD(, sdp_sock) sdp_list; struct workqueue_struct *rx_comp_wq; RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock"); #define SDP_LIST_WLOCK() rw_wlock(&sdp_lock) #define SDP_LIST_RLOCK() rw_rlock(&sdp_lock) #define SDP_LIST_WUNLOCK() rw_wunlock(&sdp_lock) #define SDP_LIST_RUNLOCK() rw_runlock(&sdp_lock) #define SDP_LIST_WLOCK_ASSERT() rw_assert(&sdp_lock, RW_WLOCKED) #define SDP_LIST_RLOCK_ASSERT() rw_assert(&sdp_lock, RW_RLOCKED) #define SDP_LIST_LOCK_ASSERT() rw_assert(&sdp_lock, RW_LOCKED) MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol"); static void sdp_stop_keepalive_timer(struct socket *so); /* * SDP protocol interface to socket abstraction. */ /* * sdp_sendspace and sdp_recvspace are the default send and receive window * sizes, respectively. */ u_long sdp_sendspace = 1024*32; u_long sdp_recvspace = 1024*64; static int sdp_count; /* * Disable async. CMA events for sockets which are being torn down. */ static void sdp_destroy_cma(struct sdp_sock *ssk) { if (ssk->id == NULL) return; rdma_destroy_id(ssk->id); ssk->id = NULL; } static int sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred) { struct sockaddr_in *sin; struct sockaddr_in null; int error; SDP_WLOCK_ASSERT(ssk); if (ssk->lport != 0 || ssk->laddr != INADDR_ANY) return (EINVAL); /* rdma_bind_addr handles bind races. 
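 * (Editorial note: the hunk below tracks the Linux 4.9 RDMA CM API,
 * in which rdma_create_id() gained a network-namespace parameter as
 * its first argument; this port passes the global init_net.)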
*/ SDP_WUNLOCK(ssk); if (ssk->id == NULL) - ssk->id = rdma_create_id(sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC); + ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC); if (ssk->id == NULL) { SDP_WLOCK(ssk); return (ENOMEM); } if (nam == NULL) { null.sin_family = AF_INET; null.sin_len = sizeof(null); null.sin_addr.s_addr = INADDR_ANY; null.sin_port = 0; bzero(&null.sin_zero, sizeof(null.sin_zero)); nam = (struct sockaddr *)&null; } error = -rdma_bind_addr(ssk->id, nam); SDP_WLOCK(ssk); if (error == 0) { sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr; ssk->laddr = sin->sin_addr.s_addr; ssk->lport = sin->sin_port; } else sdp_destroy_cma(ssk); return (error); } static void sdp_pcbfree(struct sdp_sock *ssk) { KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk)); KASSERT((ssk->flags & SDP_DESTROY) == 0, ("ssk %p already destroyed", ssk)); sdp_dbg(ssk->socket, "Freeing pcb"); SDP_WLOCK_ASSERT(ssk); ssk->flags |= SDP_DESTROY; SDP_WUNLOCK(ssk); SDP_LIST_WLOCK(); sdp_count--; LIST_REMOVE(ssk, list); SDP_LIST_WUNLOCK(); crfree(ssk->cred); ssk->qp_active = 0; if (ssk->qp) { ib_destroy_qp(ssk->qp); ssk->qp = NULL; } sdp_tx_ring_destroy(ssk); sdp_rx_ring_destroy(ssk); sdp_destroy_cma(ssk); rw_destroy(&ssk->rx_ring.destroyed_lock); rw_destroy(&ssk->lock); uma_zfree(sdp_zone, ssk); } /* * Common routines to return a socket address. */ static struct sockaddr * sdp_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } static int sdp_getsockaddr(struct socket *so, struct sockaddr **nam) { struct sdp_sock *ssk; struct in_addr addr; in_port_t port; ssk = sdp_sk(so); SDP_RLOCK(ssk); port = ssk->lport; addr.s_addr = ssk->laddr; SDP_RUNLOCK(ssk); *nam = sdp_sockaddr(port, &addr); return 0; } static int sdp_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct sdp_sock *ssk; struct in_addr addr; in_port_t port; ssk = sdp_sk(so); SDP_RLOCK(ssk); port = ssk->fport; addr.s_addr = ssk->faddr; SDP_RUNLOCK(ssk); *nam = sdp_sockaddr(port, &addr); return 0; } static void sdp_pcbnotifyall(struct in_addr faddr, int errno, struct sdp_sock *(*notify)(struct sdp_sock *, int)) { struct sdp_sock *ssk, *ssk_temp; SDP_LIST_WLOCK(); LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) { SDP_WLOCK(ssk); if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) { SDP_WUNLOCK(ssk); continue; } if ((ssk->flags & SDP_DESTROY) == 0) if ((*notify)(ssk, errno)) SDP_WUNLOCK(ssk); } SDP_LIST_WUNLOCK(); } #if 0 static void sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg) { struct sdp_sock *ssk; SDP_LIST_RLOCK(); LIST_FOREACH(ssk, &sdp_list, list) { SDP_WLOCK(ssk); func(ssk, arg); SDP_WUNLOCK(ssk); } SDP_LIST_RUNLOCK(); } #endif static void sdp_output_reset(struct sdp_sock *ssk) { struct rdma_cm_id *id; SDP_WLOCK_ASSERT(ssk); if (ssk->id) { id = ssk->id; ssk->qp_active = 0; SDP_WUNLOCK(ssk); rdma_disconnect(id); SDP_WLOCK(ssk); } ssk->state = TCPS_CLOSED; } /* * Attempt to close a SDP socket, marking it as dropped, and freeing * the socket if we hold the only reference. 
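 * If close() left a self-reference behind (SDP_SOCKREF paired with
 * SS_PROTOREF), drop it here and let sofree() decide whether ours was
 * the last hold on the socket.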
*/ static struct sdp_sock * sdp_closed(struct sdp_sock *ssk) { struct socket *so; SDP_WLOCK_ASSERT(ssk); ssk->flags |= SDP_DROPPED; so = ssk->socket; soisdisconnected(so); if (ssk->flags & SDP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("sdp_closed: !SS_PROTOREF")); ssk->flags &= ~SDP_SOCKREF; SDP_WUNLOCK(ssk); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); return (NULL); } return (ssk); } /* * Perform timer based shutdowns which can not operate in * callout context. */ static void sdp_shutdown_task(void *data, int pending) { struct sdp_sock *ssk; ssk = data; SDP_WLOCK(ssk); /* * I don't think this can race with another call to pcbfree() * because SDP_TIMEWAIT protects it. SDP_DESTROY may be redundant. */ if (ssk->flags & SDP_DESTROY) panic("sdp_shutdown_task: Racing with pcbfree for ssk %p", ssk); if (ssk->flags & SDP_DISCON) sdp_output_reset(ssk); /* We have to clear this so sdp_detach() will call pcbfree(). */ ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT); if ((ssk->flags & SDP_DROPPED) == 0 && sdp_closed(ssk) == NULL) return; if (ssk->socket == NULL) { sdp_pcbfree(ssk); return; } SDP_WUNLOCK(ssk); } /* * 2msl has expired, schedule the shutdown task. */ static void sdp_2msl_timeout(void *data) { struct sdp_sock *ssk; ssk = data; /* Callout canceled. */ if (!callout_active(&ssk->keep2msl)) goto out; callout_deactivate(&ssk->keep2msl); /* Should be impossible, defensive programming. */ if ((ssk->flags & SDP_TIMEWAIT) == 0) goto out; taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task); out: SDP_WUNLOCK(ssk); return; } /* * Schedule the 2msl wait timer. */ static void sdp_2msl_wait(struct sdp_sock *ssk) { SDP_WLOCK_ASSERT(ssk); ssk->flags |= SDP_TIMEWAIT; ssk->state = TCPS_TIME_WAIT; soisdisconnected(ssk->socket); callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk); } /* * Timed out waiting for the final fin/ack from rdma_disconnect(). */ static void sdp_dreq_timeout(void *data) { struct sdp_sock *ssk; ssk = data; /* Callout canceled. */ if (!callout_active(&ssk->keep2msl)) goto out; /* Callout rescheduled, probably as a different timer. */ if (callout_pending(&ssk->keep2msl)) goto out; callout_deactivate(&ssk->keep2msl); if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK) goto out; if ((ssk->flags & SDP_DREQWAIT) == 0) goto out; ssk->flags &= ~SDP_DREQWAIT; ssk->flags |= SDP_DISCON; sdp_2msl_wait(ssk); ssk->qp_active = 0; out: SDP_WUNLOCK(ssk); } /* * Received the final fin/ack. Cancel the 2msl. */ void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) { sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n"); ssk->flags &= ~SDP_DREQWAIT; sdp_2msl_wait(ssk); } static int sdp_init_sock(struct socket *sk) { struct sdp_sock *ssk = sdp_sk(sk); sdp_dbg(sk, "%s\n", __func__); callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED); TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk); #ifdef SDP_ZCOPY INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout); ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */ ssk->tx_ring.rdma_inflight = NULL; #endif atomic_set(&ssk->mseq_ack, 0); sdp_rx_ring_init(ssk); ssk->tx_ring.buffer = NULL; return 0; } /* * Allocate an sdp_sock for the socket and reserve socket buffer space. 
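 * The reservation uses the sdp_sendspace/sdp_recvspace defaults above
 * and marks both buffers SB_AUTOSIZE so the windows can grow later.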
*/ static int sdp_attach(struct socket *so, int proto, struct thread *td) { struct sdp_sock *ssk; int error; ssk = sdp_sk(so); KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so)); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { error = soreserve(so, sdp_sendspace, sdp_recvspace); if (error) return (error); } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO); if (ssk == NULL) return (ENOBUFS); rw_init(&ssk->lock, "sdpsock"); ssk->socket = so; ssk->cred = crhold(so->so_cred); so->so_pcb = (caddr_t)ssk; sdp_init_sock(so); ssk->flags = 0; ssk->qp_active = 0; ssk->state = TCPS_CLOSED; mbufq_init(&ssk->rxctlq, INT_MAX); SDP_LIST_WLOCK(); LIST_INSERT_HEAD(&sdp_list, ssk, list); sdp_count++; SDP_LIST_WUNLOCK(); if ((so->so_options & SO_LINGER) && so->so_linger == 0) so->so_linger = TCP_LINGERTIME; return (0); } /* * Detach SDP from the socket, potentially leaving it around for the * timewait to expire. */ static void sdp_detach(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL")); ssk->socket->so_pcb = NULL; ssk->socket = NULL; if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT)) SDP_WUNLOCK(ssk); else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT) sdp_pcbfree(ssk); else panic("sdp_detach: Unexpected state, ssk %p.\n", ssk); } /* * Allocate a local address for the socket. */ static int sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct sdp_sock *ssk; struct sockaddr_in *sin; sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EINVAL); if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EAFNOSUPPORT); ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = EINVAL; goto out; } error = sdp_pcbbind(ssk, nam, td->td_ucred); out: SDP_WUNLOCK(ssk); return (error); } /* * Prepare to accept connections. */ static int sdp_listen(struct socket *so, int backlog, struct thread *td) { int error = 0; struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = EINVAL; goto out; } if (error == 0 && ssk->lport == 0) error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); SOCK_LOCK(so); if (error == 0) error = solisten_proto_check(so); if (error == 0) { solisten_proto(so, backlog); ssk->state = TCPS_LISTEN; } SOCK_UNLOCK(so); out: SDP_WUNLOCK(ssk); if (error == 0) error = -rdma_listen(ssk->id, backlog); return (error); } /* * Initiate a SDP connection to nam. */ static int sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) { struct sockaddr_in src; struct socket *so; int error; so = ssk->socket; SDP_WLOCK_ASSERT(ssk); if (ssk->lport == 0) { error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); if (error) return error; } src.sin_family = AF_INET; src.sin_len = sizeof(src); bzero(&src.sin_zero, sizeof(src.sin_zero)); src.sin_port = ssk->lport; src.sin_addr.s_addr = ssk->laddr; soisconnecting(so); SDP_WUNLOCK(ssk); error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, SDP_RESOLVE_TIMEOUT); SDP_WLOCK(ssk); if (error == 0) ssk->state = TCPS_SYN_SENT; return 0; } /* * Initiate SDP connection. 
*/ static int sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { int error = 0; struct sdp_sock *ssk; struct sockaddr_in *sin; sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EINVAL); if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EAFNOSUPPORT); if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) return (error); ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) error = EINVAL; else error = sdp_start_connect(ssk, nam, td); SDP_WUNLOCK(ssk); return (error); } /* * Drop a SDP socket, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ static struct sdp_sock * sdp_drop(struct sdp_sock *ssk, int errno) { struct socket *so; SDP_WLOCK_ASSERT(ssk); so = ssk->socket; if (TCPS_HAVERCVDSYN(ssk->state)) sdp_output_reset(ssk); if (errno == ETIMEDOUT && ssk->softerror) errno = ssk->softerror; so->so_error = errno; return (sdp_closed(ssk)); } /* * User issued close, and wish to trail through shutdown states: * if never received SYN, just forget it. If got a SYN from peer, * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. * If already got a FIN from peer, then almost done; go to LAST_ACK * state. In all other cases, have already sent FIN to peer (e.g. * after PRU_SHUTDOWN), and just have to play tedious game waiting * for peer to send FIN or not respond to keep-alives, etc. * We can let the user exit from the close as soon as the FIN is acked. */ static void sdp_usrclosed(struct sdp_sock *ssk) { SDP_WLOCK_ASSERT(ssk); switch (ssk->state) { case TCPS_LISTEN: ssk->state = TCPS_CLOSED; SDP_WUNLOCK(ssk); sdp_destroy_cma(ssk); SDP_WLOCK(ssk); /* FALLTHROUGH */ case TCPS_CLOSED: ssk = sdp_closed(ssk); /* * sdp_closed() should never return NULL here as the socket is * still open. */ KASSERT(ssk != NULL, ("sdp_usrclosed: sdp_closed() returned NULL")); break; case TCPS_SYN_SENT: /* FALLTHROUGH */ case TCPS_SYN_RECEIVED: ssk->flags |= SDP_NEEDFIN; break; case TCPS_ESTABLISHED: ssk->flags |= SDP_NEEDFIN; ssk->state = TCPS_FIN_WAIT_1; break; case TCPS_CLOSE_WAIT: ssk->state = TCPS_LAST_ACK; break; } if (ssk->state >= TCPS_FIN_WAIT_2) { /* Prevent the connection hanging in FIN_WAIT_2 forever. */ if (ssk->state == TCPS_FIN_WAIT_2) sdp_2msl_wait(ssk); else soisdisconnected(ssk->socket); } } static void sdp_output_disconnect(struct sdp_sock *ssk) { SDP_WLOCK_ASSERT(ssk); callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT, sdp_dreq_timeout, ssk); ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT; sdp_post_sends(ssk, M_NOWAIT); } /* * Initiate or continue a disconnect. * If embryonic state, just send reset (once). * If in ``let data drain'' option and linger null, just drop. * Otherwise (hard), mark socket disconnecting and drop * current input data; switch states based on user close, and * send segment to peer (with FIN). */ static void sdp_start_disconnect(struct sdp_sock *ssk) { struct socket *so; int unread; so = ssk->socket; SDP_WLOCK_ASSERT(ssk); sdp_stop_keepalive_timer(so); /* * Neither sdp_closed() nor sdp_drop() should return NULL, as the * socket is still open. 
*/ if (ssk->state < TCPS_ESTABLISHED) { ssk = sdp_closed(ssk); KASSERT(ssk != NULL, ("sdp_start_disconnect: sdp_close() returned NULL")); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { ssk = sdp_drop(ssk, 0); KASSERT(ssk != NULL, ("sdp_start_disconnect: sdp_drop() returned NULL")); } else { soisdisconnecting(so); unread = sbused(&so->so_rcv); sbflush(&so->so_rcv); sdp_usrclosed(ssk); if (!(ssk->flags & SDP_DROPPED)) { if (unread) sdp_output_reset(ssk); else sdp_output_disconnect(ssk); } } } /* * User initiated disconnect. */ static int sdp_disconnect(struct socket *so) { struct sdp_sock *ssk; int error = 0; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = ECONNRESET; goto out; } sdp_start_disconnect(ssk); out: SDP_WUNLOCK(ssk); return (error); } /* * Accept a connection. Essentially all the work is done at higher levels; * just return the address of the peer, storing through addr. * * * XXX This is broken XXX * * The rationale for acquiring the sdp lock here is somewhat complicated, * and is described in detail in the commit log entry for r175612. Acquiring * it delays an accept(2) racing with sonewconn(), which inserts the socket * before the address/port fields are initialized. A better fix would * prevent the socket from being placed in the listen queue until all fields * are fully initialized. */ static int sdp_accept(struct socket *so, struct sockaddr **nam) { struct sdp_sock *ssk = NULL; struct in_addr addr; in_port_t port; int error; if (so->so_state & SS_ISDISCONNECTED) return (ECONNABORTED); port = 0; addr.s_addr = 0; error = 0; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = ECONNABORTED; goto out; } port = ssk->fport; addr.s_addr = ssk->faddr; out: SDP_WUNLOCK(ssk); if (error == 0) *nam = sdp_sockaddr(port, &addr); return error; } /* * Mark the connection as being incapable of further output. */ static int sdp_shutdown(struct socket *so) { int error = 0; struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = ECONNRESET; goto out; } socantsendmore(so); sdp_usrclosed(ssk); if (!(ssk->flags & SDP_DROPPED)) sdp_output_disconnect(ssk); out: SDP_WUNLOCK(ssk); return (error); } static void sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt) { struct mbuf *n; int ncnt; SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); KASSERT(mb->m_flags & M_PKTHDR, ("sdp_append: %p Missing packet header.\n", mb)); n = sb->sb_lastrecord; /* * If the queue is empty just set all pointers and proceed. */ if (n == NULL) { sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb; for (; mb; mb = mb->m_next) { sb->sb_mbtail = mb; sballoc(sb, mb); } return; } /* * Count the number of mbufs in the current tail. */ for (ncnt = 0; n->m_next; n = n->m_next) ncnt++; n = sb->sb_lastrecord; /* * If the two chains can fit in a single sdp packet and * the last record has not been sent yet (WRITABLE) coalesce * them. The lastrecord remains the same but we must strip the * packet header and then let sbcompress do the hard part. */ if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES && n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE < ssk->xmit_size_goal) { m_adj(mb, SDP_HEAD_SIZE); n->m_pkthdr.len += mb->m_pkthdr.len; n->m_flags |= mb->m_flags & (M_PUSH | M_URG); m_demote(mb, 1, 0); sbcompress(sb, mb, sb->sb_mbtail); return; } /* * Not compressible, just append to the end and adjust counters. 
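 * (The coalescing path above requires the merged chain to fit in one
 * SDP packet: under SDP_MAX_SEND_SGES fragments and within
 * xmit_size_goal bytes.)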
*/ sb->sb_lastrecord->m_flags |= M_PUSH; sb->sb_lastrecord->m_nextpkt = mb; sb->sb_lastrecord = mb; if (sb->sb_sndptr == NULL) sb->sb_sndptr = mb; for (; mb; mb = mb->m_next) { sb->sb_mbtail = mb; sballoc(sb, mb); } } /* * Do a send by putting data in output queue and updating urgent * marker if URG set. Possibly send more data. Unlike the other * pru_*() routines, the mbuf chains are our responsibility. We * must either enqueue them or free them. The other pru_* routines * generally are caller-frees. * * This comes from sendfile, normal sends will come from sdp_sosend(). */ static int sdp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { struct sdp_sock *ssk; struct mbuf *n; int error; int cnt; error = 0; ssk = sdp_sk(so); KASSERT(m->m_flags & M_PKTHDR, ("sdp_send: %p no packet header", m)); M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK); mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; for (n = m, cnt = 0; n->m_next; n = n->m_next) cnt++; if (cnt > SDP_MAX_SEND_SGES) { n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES); if (n == NULL) { m_freem(m); return (EMSGSIZE); } m = n; for (cnt = 0; n->m_next; n = n->m_next) cnt++; } SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { if (control) m_freem(control); if (m) m_freem(m); error = ECONNRESET; goto out; } if (control) { /* SDP doesn't support control messages. */ if (control->m_len) { m_freem(control); if (m) m_freem(m); error = EINVAL; goto out; } m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { SOCKBUF_LOCK(&so->so_snd); sdp_append(ssk, &so->so_snd, m, cnt); SOCKBUF_UNLOCK(&so->so_snd); if (nam && ssk->state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected. */ error = sdp_start_connect(ssk, nam, td); if (error) goto out; } if (flags & PRUS_EOF) { /* * Close the send side of the connection after * the data is sent. */ socantsendmore(so); sdp_usrclosed(ssk); if (!(ssk->flags & SDP_DROPPED)) sdp_output_disconnect(ssk); } else if (!(ssk->flags & SDP_DROPPED) && !(flags & PRUS_MORETOCOME)) sdp_post_sends(ssk, M_NOWAIT); SDP_WUNLOCK(ssk); return (0); } else { SOCKBUF_LOCK(&so->so_snd); if (sbspace(&so->so_snd) < -512) { SOCKBUF_UNLOCK(&so->so_snd); m_freem(m); error = ENOBUFS; goto out; } /* * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data. We continue, however, * to consider it to indicate the first octet * of data past the urgent section. * Otherwise, snd_up should be one lower. */ m->m_flags |= M_URG | M_PUSH; sdp_append(ssk, &so->so_snd, m, cnt); SOCKBUF_UNLOCK(&so->so_snd); if (nam && ssk->state < TCPS_SYN_SENT) { /* * Do implied connect if not yet connected. */ error = sdp_start_connect(ssk, nam, td); if (error) goto out; } sdp_post_sends(ssk, M_NOWAIT); SDP_WUNLOCK(ssk); return (0); } out: SDP_WUNLOCK(ssk); return (error); } #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) /* * Send on a socket. If send must go all at once and message is larger than * send buffering, then hard error. Lock against other senders. If must go * all at once and not enough room now, then inform user that this would * block and do nothing. Otherwise, if nonblocking, send as much as * possible. The data to be sent is described by "uio" if nonzero, otherwise * by the mbuf chain "top" (which must be null if uio is not). Data provided * in mbuf chain must be small enough to send all at once. 
* * Returns nonzero on error, timeout or signal; callers must check for short * counts if EINTR/ERESTART are returned. Data and control buffers are freed * on return. */ static int sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { struct sdp_sock *ssk; long space, resid; int atomic; int error; int copy; if (uio != NULL) resid = uio->uio_resid; else resid = top->m_pkthdr.len; atomic = top != NULL; if (control != NULL) { if (control->m_len) { m_freem(control); if (top) m_freem(top); return (EINVAL); } m_freem(control); control = NULL; } /* * In theory resid should be unsigned. However, space must be * signed, as it might be less than 0 if we over-committed, and we * must use a signed comparison of space and resid. On the other * hand, a negative resid causes us to loop sending 0-length * segments to the protocol. * * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM * type sockets since that's an error. */ if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { error = EINVAL; goto out; } if (td != NULL) td->td_ru.ru_msgsnd++; ssk = sdp_sk(so); error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) goto out; restart: do { SOCKBUF_LOCK(&so->so_snd); if (so->so_snd.sb_state & SBS_CANTSENDMORE) { SOCKBUF_UNLOCK(&so->so_snd); error = EPIPE; goto release; } if (so->so_error) { error = so->so_error; so->so_error = 0; SOCKBUF_UNLOCK(&so->so_snd); goto release; } if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) { SOCKBUF_UNLOCK(&so->so_snd); error = ENOTCONN; goto release; } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) { SOCKBUF_UNLOCK(&so->so_snd); error = EMSGSIZE; goto release; } if (space < resid && (atomic || space < so->so_snd.sb_lowat)) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) { SOCKBUF_UNLOCK(&so->so_snd); error = EWOULDBLOCK; goto release; } error = sbwait(&so->so_snd); SOCKBUF_UNLOCK(&so->so_snd); if (error) goto release; goto restart; } SOCKBUF_UNLOCK(&so->so_snd); do { if (uio == NULL) { resid = 0; if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { /* * Copy the data from userland into a mbuf * chain. If no data is to be copied in, * a single empty mbuf is returned. */ copy = min(space, ssk->xmit_size_goal - SDP_HEAD_SIZE); top = m_uiotombuf(uio, M_WAITOK, copy, 0, M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)); if (top == NULL) { /* only possible error */ error = EFAULT; goto release; } space -= resid - uio->uio_resid; resid = uio->uio_resid; } /* * XXX all the SBS_CANTSENDMORE checks previously * done could be out of date after dropping the * socket lock. */ error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB : /* * Set EOF on the last send if the user specified * MSG_EOF. */ ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF : /* If there is more to send set PRUS_MORETOCOME. */ (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, top, addr, NULL, td); top = NULL; if (error) goto release; } while (resid && space > 0); } while (resid); release: sbunlock(&so->so_snd); out: if (top != NULL) m_freem(top); return (error); } /* * The part of soreceive() that implements reading non-inline out-of-band * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is * unable to return an mbuf chain to the caller. 
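 * (Editorial note: MSG_OOB reads bypass the fast path below; the data
 * is pulled through pru_rcvoob() into a temporary mbuf and copied out
 * with uiomove().)
 */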
*/ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags) { struct protosw *pr = so->so_proto; struct mbuf *m; int error; KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0")); m = m_get(M_WAITOK, MT_DATA); error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { error = uiomove(mtod(m, void *), (int) min(uio->uio_resid, m->m_len), uio); m = m_free(m); } while (uio->uio_resid && error == 0 && m); bad: if (m != NULL) m_freem(m); return (error); } /* * Optimized version of soreceive() for stream (TCP) sockets. */ static int sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int len = 0, error = 0, flags, oresid; struct sockbuf *sb; struct mbuf *m, *n = NULL; struct sdp_sock *ssk; /* We only do stream sockets. */ if (so->so_type != SOCK_STREAM) return (EINVAL); if (psa != NULL) *psa = NULL; if (controlp != NULL) return (EINVAL); if (flagsp != NULL) flags = *flagsp &~ MSG_EOR; else flags = 0; if (flags & MSG_OOB) return (soreceive_rcvoob(so, uio, flags)); if (mp0 != NULL) *mp0 = NULL; sb = &so->so_rcv; ssk = sdp_sk(so); /* Prevent other readers from entering the socket. */ error = sblock(sb, SBLOCKWAIT(flags)); if (error) goto out; SOCKBUF_LOCK(sb); /* Easy one, no space to copyout anything. */ if (uio->uio_resid == 0) { error = EINVAL; goto out; } oresid = uio->uio_resid; /* We will never ever get anything unless we are connected. */ if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) { /* When disconnecting there may be still some data left. */ if (sbavail(sb)) goto deliver; if (!(so->so_state & SS_ISDISCONNECTED)) error = ENOTCONN; goto out; } /* Socket buffer is empty and we shall not block. */ if (sbavail(sb) == 0 && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) { error = EAGAIN; goto out; } restart: SOCKBUF_LOCK_ASSERT(&so->so_rcv); /* Abort if socket has reported problems. */ if (so->so_error) { if (sbavail(sb)) goto deliver; if (oresid > uio->uio_resid) goto out; error = so->so_error; if (!(flags & MSG_PEEK)) so->so_error = 0; goto out; } /* Door is closed. Deliver what is left, if any. */ if (sb->sb_state & SBS_CANTRCVMORE) { if (sbavail(sb)) goto deliver; else goto out; } /* Socket buffer got some data that we shall deliver now. */ if (sbavail(sb) && !(flags & MSG_WAITALL) && ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)) || sbavail(sb) >= sb->sb_lowat || sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat) ) { goto deliver; } /* On MSG_WAITALL we must wait until all data or error arrives. */ if ((flags & MSG_WAITALL) && (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat)) goto deliver; /* * Wait and block until (more) data comes in. * NB: Drops the sockbuf lock during wait. */ error = sbwait(sb); if (error) goto out; goto restart; deliver: SOCKBUF_LOCK_ASSERT(&so->so_rcv); KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__)); KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__)); /* Statistics. */ if (uio->uio_td) uio->uio_td->td_ru.ru_msgrcv++; /* Fill uio until full or current end of socket buffer is reached. */ len = min(uio->uio_resid, sbavail(sb)); if (mp0 != NULL) { /* Dequeue as many mbufs as possible. 
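 *
 * Whole mbufs that fit in 'len' are unlinked from the socket buffer
 * and handed to the caller directly; a trailing partial mbuf is
 * duplicated with m_copym() and its bytes are trimmed out afterwards
 * by the sbdrop_locked() below.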
*/ if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) { for (*mp0 = m = sb->sb_mb; m != NULL && m->m_len <= len; m = m->m_next) { len -= m->m_len; uio->uio_resid -= m->m_len; sbfree(sb, m); n = m; } sb->sb_mb = m; if (sb->sb_mb == NULL) SB_EMPTY_FIXUP(sb); n->m_next = NULL; } /* Copy the remainder. */ if (len > 0) { KASSERT(sb->sb_mb != NULL, ("%s: len > 0 && sb->sb_mb empty", __func__)); m = m_copym(sb->sb_mb, 0, len, M_NOWAIT); if (m == NULL) len = 0; /* Don't flush data from sockbuf. */ else uio->uio_resid -= m->m_len; if (*mp0 != NULL) n->m_next = m; else *mp0 = m; if (*mp0 == NULL) { error = ENOBUFS; goto out; } } } else { /* NB: Must unlock socket buffer as uiomove may sleep. */ SOCKBUF_UNLOCK(sb); error = m_mbuftouio(uio, sb->sb_mb, len); SOCKBUF_LOCK(sb); if (error) goto out; } SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); /* * Remove the delivered data from the socket buffer unless we * were only peeking. */ if (!(flags & MSG_PEEK)) { if (len > 0) sbdrop_locked(sb, len); /* Notify protocol that we drained some data. */ SOCKBUF_UNLOCK(sb); SDP_WLOCK(ssk); sdp_do_posts(ssk); SDP_WUNLOCK(ssk); SOCKBUF_LOCK(sb); } /* * For MSG_WAITALL we may have to loop again and wait for * more data to come in. */ if ((flags & MSG_WAITALL) && uio->uio_resid > 0) goto restart; out: SOCKBUF_LOCK_ASSERT(sb); SBLASTRECORDCHK(sb); SBLASTMBUFCHK(sb); SOCKBUF_UNLOCK(sb); sbunlock(sb); return (error); } /* * Abort is used to teardown a connection typically while sitting in * the accept queue. */ void sdp_abort(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); /* * If we have not yet dropped, do it now. */ if (!(ssk->flags & SDP_TIMEWAIT) && !(ssk->flags & SDP_DROPPED)) sdp_drop(ssk, ECONNABORTED); KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", ssk, ssk->flags)); SDP_WUNLOCK(ssk); } /* * Close a SDP socket and initiate a friendly disconnect. */ static void sdp_close(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); /* * If we have not yet dropped, do it now. */ if (!(ssk->flags & SDP_TIMEWAIT) && !(ssk->flags & SDP_DROPPED)) sdp_start_disconnect(ssk); /* * If we've still not dropped let the socket layer know we're * holding on to the socket and pcb for a while. */ if (!(ssk->flags & SDP_DROPPED)) { SOCK_LOCK(so); so->so_state |= SS_PROTOREF; SOCK_UNLOCK(so); ssk->flags |= SDP_SOCKREF; } SDP_WUNLOCK(ssk); } /* * User requests out-of-band data. 
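 *
 * When SO_OOBINLINE is not set, sdp_urg() latches the single urgent
 * byte in ssk->iobc and sets SDP_HAVEOOB; this routine hands that byte
 * out and toggles the state to SDP_HADOOB, so a second read of the
 * same mark fails with EINVAL.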
*/ static int sdp_rcvoob(struct socket *so, struct mbuf *m, int flags) { int error = 0; struct sdp_sock *ssk; ssk = sdp_sk(so); SDP_WLOCK(ssk); if (!rx_ring_trylock(&ssk->rx_ring)) { SDP_WUNLOCK(ssk); return (ECONNRESET); } if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { error = ECONNRESET; goto out; } if ((so->so_oobmark == 0 && (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || so->so_options & SO_OOBINLINE || ssk->oobflags & SDP_HADOOB) { error = EINVAL; goto out; } if ((ssk->oobflags & SDP_HAVEOOB) == 0) { error = EWOULDBLOCK; goto out; } m->m_len = 1; *mtod(m, caddr_t) = ssk->iobc; if ((flags & MSG_PEEK) == 0) ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB); out: rx_ring_unlock(&ssk->rx_ring); SDP_WUNLOCK(ssk); return (error); } void sdp_urg(struct sdp_sock *ssk, struct mbuf *mb) { struct mbuf *m; struct socket *so; so = ssk->socket; if (so == NULL) return; so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1; sohasoutofband(so); ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB); if (!(so->so_options & SO_OOBINLINE)) { for (m = mb; m->m_next != NULL; m = m->m_next); ssk->iobc = *(mtod(m, char *) + m->m_len - 1); ssk->oobflags |= SDP_HAVEOOB; m->m_len--; mb->m_pkthdr.len--; } } /* * Notify a sdp socket of an asynchronous error. * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ struct sdp_sock * sdp_notify(struct sdp_sock *ssk, int error) { SDP_WLOCK_ASSERT(ssk); if ((ssk->flags & SDP_TIMEWAIT) || (ssk->flags & SDP_DROPPED)) return (ssk); /* * Ignore some errors if we are hooked up. */ if (ssk->state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) return (ssk); ssk->softerror = error; return sdp_drop(ssk, error); } static void sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) { struct in_addr faddr; faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); } static int sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { return (EOPNOTSUPP); } static void sdp_keepalive_timeout(void *data) { struct sdp_sock *ssk; ssk = data; /* Callout canceled. */ if (!callout_active(&ssk->keep2msl)) return; /* Callout rescheduled as a different kind of timer. */ if (callout_pending(&ssk->keep2msl)) goto out; callout_deactivate(&ssk->keep2msl); if (ssk->flags & SDP_DROPPED || (ssk->socket->so_options & SO_KEEPALIVE) == 0) goto out; sdp_post_keepalive(ssk); callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, sdp_keepalive_timeout, ssk); out: SDP_WUNLOCK(ssk); } void sdp_start_keepalive_timer(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); if (!callout_pending(&ssk->keep2msl)) callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME, sdp_keepalive_timeout, ssk); } static void sdp_stop_keepalive_timer(struct socket *so) { struct sdp_sock *ssk; ssk = sdp_sk(so); callout_stop(&ssk->keep2msl); } /* * sdp_ctloutput() must drop the inpcb lock before performing copyin on * socket option arguments. When it re-acquires the lock after the copy, it * has to revalidate that the connection is still valid for the socket * option. 
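 *
 * The TCP_NODELAY handler below follows exactly that pattern:
 *
 *	SDP_WUNLOCK(ssk);
 *	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 *	if (error)
 *		return (error);
 *	SDP_WLOCK_RECHECK(ssk);
 *
 * where SDP_WLOCK_RECHECK() re-takes the lock and returns ECONNRESET
 * if the connection entered TIMEWAIT or was dropped while unlocked.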
*/ #define SDP_WLOCK_RECHECK(inp) do { \ SDP_WLOCK(ssk); \ if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ SDP_WUNLOCK(ssk); \ return (ECONNRESET); \ } \ } while(0) static int sdp_ctloutput(struct socket *so, struct sockopt *sopt) { int error, opt, optval; struct sdp_sock *ssk; error = 0; ssk = sdp_sk(so); if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { SDP_WLOCK(ssk); if (so->so_options & SO_KEEPALIVE) sdp_start_keepalive_timer(so); else sdp_stop_keepalive_timer(so); SDP_WUNLOCK(ssk); } if (sopt->sopt_level != IPPROTO_TCP) return (error); SDP_WLOCK(ssk); if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { SDP_WUNLOCK(ssk); return (ECONNRESET); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case TCP_NODELAY: SDP_WUNLOCK(ssk); error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) return (error); SDP_WLOCK_RECHECK(ssk); opt = SDP_NODELAY; if (optval) ssk->flags |= opt; else ssk->flags &= ~opt; sdp_do_posts(ssk); SDP_WUNLOCK(ssk); break; default: SDP_WUNLOCK(ssk); error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case TCP_NODELAY: optval = ssk->flags & SDP_NODELAY; SDP_WUNLOCK(ssk); error = sooptcopyout(sopt, &optval, sizeof optval); break; default: SDP_WUNLOCK(ssk); error = ENOPROTOOPT; break; } break; } return (error); } #undef SDP_WLOCK_RECHECK int sdp_mod_count = 0; int sdp_mod_usec = 0; void sdp_set_default_moderation(struct sdp_sock *ssk) { - struct ib_cq_attr attr; if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) return; - memset(&attr, 0, sizeof(attr)); - attr.moderation.cq_count = sdp_mod_count; - attr.moderation.cq_period = sdp_mod_usec; - - ib_modify_cq(ssk->rx_ring.cq, &attr, IB_CQ_MODERATION); + ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec); } static void sdp_dev_add(struct ib_device *device) { struct ib_fmr_pool_param param; struct sdp_device *sdp_dev; sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO); - sdp_dev->pd = ib_alloc_pd(device); + sdp_dev->pd = ib_alloc_pd(device, 0); if (IS_ERR(sdp_dev->pd)) goto out_pd; - sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(sdp_dev->mr)) - goto out_mr; memset(¶m, 0, sizeof param); param.max_pages_per_fmr = SDP_FMR_SIZE; param.page_shift = PAGE_SHIFT; param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ); param.pool_size = SDP_FMR_POOL_SIZE; param.dirty_watermark = SDP_FMR_DIRTY_SIZE; param.cache = 1; sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, ¶m); if (IS_ERR(sdp_dev->fmr_pool)) goto out_fmr; ib_set_client_data(device, &sdp_client, sdp_dev); return; out_fmr: - ib_dereg_mr(sdp_dev->mr); -out_mr: ib_dealloc_pd(sdp_dev->pd); out_pd: free(sdp_dev, M_SDP); } static void -sdp_dev_rem(struct ib_device *device) +sdp_dev_rem(struct ib_device *device, void *client_data) { struct sdp_device *sdp_dev; struct sdp_sock *ssk; SDP_LIST_WLOCK(); LIST_FOREACH(ssk, &sdp_list, list) { if (ssk->ib_device != device) continue; SDP_WLOCK(ssk); if ((ssk->flags & SDP_DESTROY) == 0) ssk = sdp_notify(ssk, ECONNRESET); if (ssk) SDP_WUNLOCK(ssk); } SDP_LIST_WUNLOCK(); /* * XXX Do I need to wait between these two? 
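 *
 * ("These two" being the connection-reset pass above and the FMR pool
 * flush/teardown below.)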
*/ sdp_dev = ib_get_client_data(device, &sdp_client); if (!sdp_dev) return; ib_flush_fmr_pool(sdp_dev->fmr_pool); ib_destroy_fmr_pool(sdp_dev->fmr_pool); - ib_dereg_mr(sdp_dev->mr); ib_dealloc_pd(sdp_dev->pd); free(sdp_dev, M_SDP); } struct ib_client sdp_client = { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem }; static int sdp_pcblist(SYSCTL_HANDLER_ARGS) { int error, n, i; struct sdp_sock *ssk; struct xinpgen xig; /* * The process of preparing the TCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == NULL) { n = sdp_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if (req->newptr != NULL) return (EPERM); /* * OK, now we're committed to doing something. */ SDP_LIST_RLOCK(); n = sdp_count; SDP_LIST_RUNLOCK(); error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) + n * sizeof(struct xtcpcb)); if (error != 0) return (error); xig.xig_len = sizeof xig; xig.xig_count = n; xig.xig_gen = 0; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); SDP_LIST_RLOCK(); for (ssk = LIST_FIRST(&sdp_list), i = 0; ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) { struct xtcpcb xt; SDP_RLOCK(ssk); if (ssk->flags & SDP_TIMEWAIT) { if (ssk->cred != NULL) error = cr_cansee(req->td->td_ucred, ssk->cred); else error = EINVAL; /* Skip this inp. */ } else if (ssk->socket) error = cr_canseesocket(req->td->td_ucred, ssk->socket); else error = EINVAL; if (error) { error = 0; goto next; } bzero(&xt, sizeof(xt)); xt.xt_len = sizeof xt; xt.xt_inp.inp_gencnt = 0; xt.xt_inp.inp_vflag = INP_IPV4; memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr)); xt.xt_inp.inp_lport = ssk->lport; memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr)); xt.xt_inp.inp_fport = ssk->fport; xt.t_state = ssk->state; if (ssk->socket != NULL) sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket); xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; SDP_RUNLOCK(ssk); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) break; i++; continue; next: SDP_RUNLOCK(ssk); } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
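 *
 * A hypothetical consumer (names illustrative) would compare the
 * leading and trailing generation records:
 *
 *	size_t len = 0;
 *	char *buf;
 *	struct xinpgen *head, *tail;
 *
 *	sysctlbyname("net.inet.sdp.pcblist", NULL, &len, NULL, 0);
 *	buf = malloc(len);
 *	sysctlbyname("net.inet.sdp.pcblist", buf, &len, NULL, 0);
 *	head = (struct xinpgen *)buf;
 *	tail = (struct xinpgen *)(buf + len) - 1;
 *	if (head->xig_gen != tail->xig_gen)
 *		...	the list changed mid-walk; retry
 *
 * Note that this handler currently reports a constant generation of 0
 * in both records, so the check is vacuous for SDP.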
*/ xig.xig_gen = 0; xig.xig_sogen = so_gencnt; xig.xig_count = sdp_count; error = SYSCTL_OUT(req, &xig, sizeof xig); } SDP_LIST_RUNLOCK(); return (error); } static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP"); SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb", "List of active SDP connections"); static void sdp_zone_change(void *tag) { uma_zone_set_max(sdp_zone, maxsockets); } static void sdp_init(void) { LIST_INIT(&sdp_list); sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(sdp_zone, maxsockets); EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL, EVENTHANDLER_PRI_ANY); rx_comp_wq = create_singlethread_workqueue("rx_comp_wq"); ib_register_client(&sdp_client); } extern struct domain sdpdomain; struct pr_usrreqs sdp_usrreqs = { .pru_abort = sdp_abort, .pru_accept = sdp_accept, .pru_attach = sdp_attach, .pru_bind = sdp_bind, .pru_connect = sdp_connect, .pru_control = sdp_control, .pru_detach = sdp_detach, .pru_disconnect = sdp_disconnect, .pru_listen = sdp_listen, .pru_peeraddr = sdp_getpeeraddr, .pru_rcvoob = sdp_rcvoob, .pru_send = sdp_send, .pru_sosend = sdp_sosend, .pru_soreceive = sdp_sorecv, .pru_shutdown = sdp_shutdown, .pru_sockaddr = sdp_getsockaddr, .pru_close = sdp_close, }; struct protosw sdpsw[] = { { .pr_type = SOCK_STREAM, .pr_domain = &sdpdomain, .pr_protocol = IPPROTO_IP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, .pr_ctlinput = sdp_ctlinput, .pr_ctloutput = sdp_ctloutput, .pr_usrreqs = &sdp_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &sdpdomain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD, .pr_ctlinput = sdp_ctlinput, .pr_ctloutput = sdp_ctloutput, .pr_usrreqs = &sdp_usrreqs }, }; struct domain sdpdomain = { .dom_family = AF_INET_SDP, .dom_name = "SDP", .dom_init = sdp_init, .dom_protosw = sdpsw, .dom_protoswNPROTOSW = &sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])], }; DOMAIN_SET(sdp); int sdp_debug_level = 1; int sdp_data_debug_level = 0; Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c (revision 323642) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_proc.c (revision 323643) @@ -1,533 +1,532 @@ /* * Copyright (c) 2008 Mellanox Technologies Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include -#include #include "sdp.h" #ifdef CONFIG_PROC_FS #define PROC_SDP_STATS "sdpstats" #define PROC_SDP_PERF "sdpprf" /* just like TCP fs */ struct sdp_seq_afinfo { struct module *owner; char *name; sa_family_t family; int (*seq_show) (struct seq_file *m, void *v); struct file_operations *seq_fops; }; struct sdp_iter_state { sa_family_t family; int num; struct seq_operations seq_ops; }; static void *sdp_get_idx(struct seq_file *seq, loff_t pos) { int i = 0; struct sdp_sock *ssk; if (!list_empty(&sock_list)) list_for_each_entry(ssk, &sock_list, sock_list) { if (i == pos) return ssk; i++; } return NULL; } static void *sdp_seq_start(struct seq_file *seq, loff_t *pos) { void *start = NULL; struct sdp_iter_state *st = seq->private; st->num = 0; if (!*pos) return SEQ_START_TOKEN; spin_lock_irq(&sock_list_lock); start = sdp_get_idx(seq, *pos - 1); if (start) sock_hold((struct socket *)start, SOCK_REF_SEQ); spin_unlock_irq(&sock_list_lock); return start; } static void *sdp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sdp_iter_state *st = seq->private; void *next = NULL; spin_lock_irq(&sock_list_lock); if (v == SEQ_START_TOKEN) next = sdp_get_idx(seq, 0); else next = sdp_get_idx(seq, *pos); if (next) sock_hold((struct socket *)next, SOCK_REF_SEQ); spin_unlock_irq(&sock_list_lock); *pos += 1; st->num++; return next; } static void sdp_seq_stop(struct seq_file *seq, void *v) { } #define TMPSZ 150 static int sdp_seq_show(struct seq_file *seq, void *v) { struct sdp_iter_state *st; struct socket *sk = v; char tmpbuf[TMPSZ + 1]; unsigned int dest; unsigned int src; int uid; unsigned long inode; __u16 destp; __u16 srcp; __u32 rx_queue, tx_queue; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-*s\n", TMPSZ - 1, " sl local_address rem_address " "uid inode rx_queue tx_queue state"); goto out; } st = seq->private; dest = inet_sk(sk)->daddr; src = inet_sk(sk)->rcv_saddr; destp = ntohs(inet_sk(sk)->dport); srcp = ntohs(inet_sk(sk)->sport); uid = sock_i_uid(sk); inode = sock_i_ino(sk); rx_queue = rcv_nxt(sdp_sk(sk)) - sdp_sk(sk)->copied_seq; tx_queue = sdp_sk(sk)->write_seq - sdp_sk(sk)->tx_ring.una_seq; sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %5d %lu %08X:%08X %X", st->num, src, srcp, dest, destp, uid, inode, rx_queue, tx_queue, sk->sk_state); seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf); sock_put(sk, SOCK_REF_SEQ); out: return 0; } static int sdp_seq_open(struct inode *inode, struct file *file) { struct sdp_seq_afinfo *afinfo = PDE(inode)->data; struct seq_file *seq; struct sdp_iter_state *s; int rc; if (unlikely(afinfo == NULL)) return -EINVAL; /* Workaround bogus warning by memtrack */ #define _kzalloc(size,flags) kzalloc(size,flags) #undef kzalloc s = kzalloc(sizeof(*s), GFP_KERNEL); #define kzalloc(s,f) _kzalloc(s,f) if (!s) return -ENOMEM; s->family = afinfo->family; s->seq_ops.start = sdp_seq_start; s->seq_ops.next = sdp_seq_next; s->seq_ops.show = afinfo->seq_show; s->seq_ops.stop = sdp_seq_stop; rc = seq_open(file, &s->seq_ops); if (rc) goto out_kfree; seq = file->private_data; seq->private = s; out: return rc; out_kfree: kfree(s); goto out; } static struct file_operations sdp_seq_fops; static struct sdp_seq_afinfo sdp_seq_afinfo = { .owner = THIS_MODULE, .name = "sdp", .family = AF_INET_SDP, .seq_show = 
sdp_seq_show, .seq_fops = &sdp_seq_fops, }; #ifdef SDPSTATS_ON DEFINE_PER_CPU(struct sdpstats, sdpstats); static void sdpstats_seq_hist(struct seq_file *seq, char *str, u32 *h, int n, int is_log) { int i; u32 max = 0; seq_printf(seq, "%s:\n", str); for (i = 0; i < n; i++) { if (h[i] > max) max = h[i]; } if (max == 0) { seq_printf(seq, " - all values are 0\n"); return; } for (i = 0; i < n; i++) { char s[51]; int j = 50 * h[i] / max; int val = is_log ? (i == n-1 ? 0 : 1<<i) : i; memset(s, '*', j); s[j] = '\0'; seq_printf(seq, "%10d | %-50s - %d\n", val, s, h[i]); } } #endif #ifdef SDP_PROFILING struct sdpprf_log sdpprf_log[SDPPRF_LOG_SIZE]; int sdpprf_log_count; static unsigned long long start_t; static int sdpprf_show(struct seq_file *m, void *v) { struct sdpprf_log *l = v; unsigned long nsec_rem, t; if (!sdpprf_log_count) { seq_printf(m, "No performance logs\n"); goto out; } t = l->time - start_t; nsec_rem = do_div(t, 1000000000); seq_printf(m, "%-6d: [%5lu.%06lu] %-50s - [%d{%d} %d:%d] " "mb: %p %s:%d\n", l->idx, (unsigned long)t, nsec_rem/1000, l->msg, l->pid, l->cpu, l->sk_num, l->sk_dport, l->mb, l->func, l->line); out: return 0; } static void *sdpprf_start(struct seq_file *p, loff_t *pos) { int idx = *pos; if (!*pos) { if (!sdpprf_log_count) return SEQ_START_TOKEN; } if (*pos >= MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1)) return NULL; if (sdpprf_log_count >= SDPPRF_LOG_SIZE - 1) { int off = sdpprf_log_count & (SDPPRF_LOG_SIZE - 1); idx = (idx + off) & (SDPPRF_LOG_SIZE - 1); } if (!start_t) start_t = sdpprf_log[idx].time; return &sdpprf_log[idx]; } static void *sdpprf_next(struct seq_file *p, void *v, loff_t *pos) { struct sdpprf_log *l = v; if (++*pos >= MIN(sdpprf_log_count, SDPPRF_LOG_SIZE - 1)) return NULL; ++l; if (l - &sdpprf_log[0] >= SDPPRF_LOG_SIZE - 1) return &sdpprf_log[0]; return l; } static void sdpprf_stop(struct seq_file *p, void *v) { } static struct seq_operations sdpprf_ops = { .start = sdpprf_start, .stop = sdpprf_stop, .next = sdpprf_next, .show = sdpprf_show, }; static int sdpprf_open(struct inode *inode, struct file *file) { int res; res = seq_open(file, &sdpprf_ops); return res; } static ssize_t sdpprf_write(struct file *file, const char __user *buf, size_t count, loff_t *offs) { sdpprf_log_count = 0; printk(KERN_INFO "Cleared sdpprf statistics\n"); return count; } static struct file_operations sdpprf_fops = { .open = sdpprf_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, .write = sdpprf_write, }; #endif /* SDP_PROFILING */ int __init sdp_proc_init(void) { struct proc_dir_entry *p = NULL; #ifdef SDPSTATS_ON struct proc_dir_entry *stats = NULL; #endif #ifdef SDP_PROFILING struct proc_dir_entry *prof = NULL; #endif sdp_seq_afinfo.seq_fops->owner = sdp_seq_afinfo.owner; sdp_seq_afinfo.seq_fops->open = sdp_seq_open; sdp_seq_afinfo.seq_fops->read = seq_read; sdp_seq_afinfo.seq_fops->llseek = seq_lseek; sdp_seq_afinfo.seq_fops->release = seq_release_private; p = proc_net_fops_create(&init_net, sdp_seq_afinfo.name, S_IRUGO, sdp_seq_afinfo.seq_fops); if (p) p->data = &sdp_seq_afinfo; else goto no_mem; #ifdef SDPSTATS_ON stats = proc_net_fops_create(&init_net, PROC_SDP_STATS, S_IRUGO | S_IWUGO, &sdpstats_fops); if (!stats) goto no_mem_stats; #endif #ifdef SDP_PROFILING prof = proc_net_fops_create(&init_net, PROC_SDP_PERF, S_IRUGO | S_IWUGO, &sdpprf_fops); if (!prof) goto no_mem_prof; #endif return 0; #ifdef SDP_PROFILING no_mem_prof: #endif #ifdef SDPSTATS_ON proc_net_remove(&init_net, PROC_SDP_STATS); no_mem_stats: #endif proc_net_remove(&init_net, sdp_seq_afinfo.name); no_mem: return -ENOMEM; } void sdp_proc_unregister(void) { proc_net_remove(&init_net, sdp_seq_afinfo.name); memset(sdp_seq_afinfo.seq_fops, 0, sizeof(*sdp_seq_afinfo.seq_fops)); #ifdef SDPSTATS_ON proc_net_remove(&init_net, PROC_SDP_STATS); #endif #ifdef SDP_PROFILING proc_net_remove(&init_net, PROC_SDP_PERF); #endif } #else /* CONFIG_PROC_FS */ int __init sdp_proc_init(void) { return 0; } void 
sdp_proc_unregister(void) { } #endif /* CONFIG_PROC_FS */ Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c (revision 323642) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_rx.c (revision 323643) @@ -1,754 +1,759 @@ /* * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "sdp.h" SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024, "Receive buffer initial size in bytes."); SDP_MODPARAM_SINT(rcvbuf_scale, 0x8, "Receive buffer size scale factor."); /* Like tcp_fin - called when SDP_MID_DISCONNECT is received */ static void sdp_handle_disconn(struct sdp_sock *ssk) { sdp_dbg(ssk->socket, "%s\n", __func__); SDP_WLOCK_ASSERT(ssk); if (TCPS_HAVERCVDFIN(ssk->state) == 0) socantrcvmore(ssk->socket); switch (ssk->state) { case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: ssk->state = TCPS_CLOSE_WAIT; break; case TCPS_FIN_WAIT_1: /* Received a reply FIN - start Infiniband tear down */ sdp_dbg(ssk->socket, "%s: Starting Infiniband tear down sending DREQ\n", __func__); sdp_cancel_dreq_wait_timeout(ssk); ssk->qp_active = 0; if (ssk->id) { struct rdma_cm_id *id; id = ssk->id; SDP_WUNLOCK(ssk); rdma_disconnect(id); SDP_WLOCK(ssk); } else { sdp_warn(ssk->socket, "%s: ssk->id is NULL\n", __func__); return; } break; case TCPS_TIME_WAIT: /* This is a mutual close situation and we've got the DREQ from the peer before the SDP_MID_DISCONNECT */ break; case TCPS_CLOSED: /* FIN arrived after IB teardown started - do nothing */ sdp_dbg(ssk->socket, "%s: fin in state %s\n", __func__, sdp_state_str(ssk->state)); return; default: sdp_warn(ssk->socket, "%s: FIN in unexpected state. 
state=%d\n", __func__, ssk->state); break; } } static int sdp_post_recv(struct sdp_sock *ssk) { struct sdp_buf *rx_req; int i, rc; u64 addr; struct ib_device *dev; struct ib_recv_wr rx_wr = { NULL }; struct ib_sge ibsge[SDP_MAX_RECV_SGES]; struct ib_sge *sge = ibsge; struct ib_recv_wr *bad_wr; struct mbuf *mb, *m; struct sdp_bsdh *h; int id = ring_head(ssk->rx_ring); /* Now, allocate and repost recv */ sdp_prf(ssk->socket, mb, "Posting mb"); mb = m_getm2(NULL, ssk->recv_bytes, M_NOWAIT, MT_DATA, M_PKTHDR); if (mb == NULL) { /* Retry so we can't stall out with no memory. */ if (!rx_ring_posted(ssk)) queue_work(rx_comp_wq, &ssk->rx_comp_work); return -1; } for (m = mb; m != NULL; m = m->m_next) { m->m_len = M_SIZE(m); mb->m_pkthdr.len += m->m_len; } h = mtod(mb, struct sdp_bsdh *); rx_req = ssk->rx_ring.buffer + (id & (SDP_RX_SIZE - 1)); rx_req->mb = mb; dev = ssk->ib_device; for (i = 0; mb != NULL; i++, mb = mb->m_next, sge++) { addr = ib_dma_map_single(dev, mb->m_data, mb->m_len, DMA_TO_DEVICE); /* TODO: proper error handling */ BUG_ON(ib_dma_mapping_error(dev, addr)); BUG_ON(i >= SDP_MAX_RECV_SGES); rx_req->mapping[i] = addr; sge->addr = addr; sge->length = mb->m_len; - sge->lkey = ssk->sdp_dev->mr->lkey; + sge->lkey = ssk->sdp_dev->pd->local_dma_lkey; } rx_wr.next = NULL; rx_wr.wr_id = id | SDP_OP_RECV; rx_wr.sg_list = ibsge; rx_wr.num_sge = i; rc = ib_post_recv(ssk->qp, &rx_wr, &bad_wr); if (unlikely(rc)) { sdp_warn(ssk->socket, "ib_post_recv failed. status %d\n", rc); sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE); m_freem(mb); sdp_notify(ssk, ECONNRESET); return -1; } atomic_inc(&ssk->rx_ring.head); SDPSTATS_COUNTER_INC(post_recv); return 0; } static inline int sdp_post_recvs_needed(struct sdp_sock *ssk) { unsigned long bytes_in_process; unsigned long max_bytes; int buffer_size; int posted; if (!ssk->qp_active || !ssk->socket) return 0; posted = rx_ring_posted(ssk); if (posted >= SDP_RX_SIZE) return 0; if (posted < SDP_MIN_TX_CREDITS) return 1; buffer_size = ssk->recv_bytes; max_bytes = max(ssk->socket->so_rcv.sb_hiwat, (1 + SDP_MIN_TX_CREDITS) * buffer_size); max_bytes *= rcvbuf_scale; /* * Compute bytes in the receive queue and socket buffer. */ bytes_in_process = (posted - SDP_MIN_TX_CREDITS) * buffer_size; bytes_in_process += sbused(&ssk->socket->so_rcv); return bytes_in_process < max_bytes; } static inline void sdp_post_recvs(struct sdp_sock *ssk) { while (sdp_post_recvs_needed(ssk)) if (sdp_post_recv(ssk)) return; } static inline struct mbuf * sdp_sock_queue_rcv_mb(struct socket *sk, struct mbuf *mb) { struct sdp_sock *ssk = sdp_sk(sk); struct sdp_bsdh *h; h = mtod(mb, struct sdp_bsdh *); #ifdef SDP_ZCOPY SDP_SKB_CB(mb)->seq = rcv_nxt(ssk); if (h->mid == SDP_MID_SRCAVAIL) { struct sdp_srcah *srcah = (struct sdp_srcah *)(h+1); struct rx_srcavail_state *rx_sa; ssk->srcavail_cancel_mseq = 0; ssk->rx_sa = rx_sa = RX_SRCAVAIL_STATE(mb) = kzalloc( sizeof(struct rx_srcavail_state), M_NOWAIT); rx_sa->mseq = ntohl(h->mseq); rx_sa->used = 0; rx_sa->len = mb_len = ntohl(srcah->len); rx_sa->rkey = ntohl(srcah->rkey); rx_sa->vaddr = be64_to_cpu(srcah->vaddr); rx_sa->flags = 0; if (ssk->tx_sa) { sdp_dbg_data(ssk->socket, "got RX SrcAvail while waiting " "for TX SrcAvail. waking up TX SrcAvail" "to be aborted\n"); wake_up(sk->sk_sleep); } atomic_add(mb->len, &ssk->rcv_nxt); sdp_dbg_data(sk, "queueing SrcAvail. 
mb_len = %d vaddr = %lld\n", mb_len, rx_sa->vaddr); } else #endif { atomic_add(mb->m_pkthdr.len, &ssk->rcv_nxt); } m_adj(mb, SDP_HEAD_SIZE); SOCKBUF_LOCK(&sk->so_rcv); if (unlikely(h->flags & SDP_OOB_PRES)) sdp_urg(ssk, mb); sbappend_locked(&sk->so_rcv, mb, 0); sorwakeup_locked(sk); return mb; } static int sdp_get_recv_bytes(struct sdp_sock *ssk, u32 new_size) { return MIN(new_size, SDP_MAX_PACKET); } int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size) { ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size); sdp_post_recvs(ssk); return 0; } int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size) { u32 curr_size = ssk->recv_bytes; u32 max_size = SDP_MAX_PACKET; if (new_size > curr_size && new_size <= max_size) { ssk->recv_bytes = sdp_get_recv_bytes(ssk, new_size); return 0; } return -1; } static void sdp_handle_resize_request(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf) { if (sdp_resize_buffers(ssk, ntohl(buf->size)) == 0) ssk->recv_request_head = ring_head(ssk->rx_ring) + 1; else ssk->recv_request_head = ring_tail(ssk->rx_ring); ssk->recv_request = 1; } static void sdp_handle_resize_ack(struct sdp_sock *ssk, struct sdp_chrecvbuf *buf) { u32 new_size = ntohl(buf->size); if (new_size > ssk->xmit_size_goal) ssk->xmit_size_goal = new_size; } static struct mbuf * sdp_recv_completion(struct sdp_sock *ssk, int id) { struct sdp_buf *rx_req; struct ib_device *dev; struct mbuf *mb; if (unlikely(id != ring_tail(ssk->rx_ring))) { printk(KERN_WARNING "Bogus recv completion id %d tail %d\n", id, ring_tail(ssk->rx_ring)); return NULL; } dev = ssk->ib_device; rx_req = &ssk->rx_ring.buffer[id & (SDP_RX_SIZE - 1)]; mb = rx_req->mb; sdp_cleanup_sdp_buf(ssk, rx_req, DMA_FROM_DEVICE); atomic_inc(&ssk->rx_ring.tail); atomic_dec(&ssk->remote_credits); return mb; } static void sdp_process_rx_ctl_mb(struct sdp_sock *ssk, struct mbuf *mb) { struct sdp_bsdh *h; struct socket *sk; SDP_WLOCK_ASSERT(ssk); sk = ssk->socket; h = mtod(mb, struct sdp_bsdh *); switch (h->mid) { case SDP_MID_DATA: case SDP_MID_SRCAVAIL: sdp_dbg(sk, "DATA after socket rcv was shutdown\n"); /* got data in RCV_SHUTDOWN */ if (ssk->state == TCPS_FIN_WAIT_1) { sdp_dbg(sk, "RX data when state = FIN_WAIT1\n"); sdp_notify(ssk, ECONNRESET); } break; #ifdef SDP_ZCOPY case SDP_MID_RDMARDCOMPL: break; case SDP_MID_SENDSM: sdp_handle_sendsm(ssk, ntohl(h->mseq_ack)); break; case SDP_MID_SRCAVAIL_CANCEL: sdp_dbg_data(sk, "Handling SrcAvailCancel\n"); sdp_prf(sk, NULL, "Handling SrcAvailCancel"); if (ssk->rx_sa) { ssk->srcavail_cancel_mseq = ntohl(h->mseq); ssk->rx_sa->flags |= RX_SA_ABORTED; ssk->rx_sa = NULL; /* TODO: change it into SDP_MID_DATA and get the dirty logic from recvmsg */ } else { sdp_dbg(sk, "Got SrcAvailCancel - " "but no SrcAvail in process\n"); } break; case SDP_MID_SINKAVAIL: sdp_dbg_data(sk, "Got SinkAvail - not supported: ignored\n"); sdp_prf(sk, NULL, "Got SinkAvail - not supported: ignored"); /* FALLTHROUGH */ #endif case SDP_MID_ABORT: sdp_dbg_data(sk, "Handling ABORT\n"); sdp_prf(sk, NULL, "Handling ABORT"); sdp_notify(ssk, ECONNRESET); break; case SDP_MID_DISCONN: sdp_dbg_data(sk, "Handling DISCONN\n"); sdp_prf(sk, NULL, "Handling DISCONN"); sdp_handle_disconn(ssk); break; case SDP_MID_CHRCVBUF: sdp_dbg_data(sk, "Handling RX CHRCVBUF\n"); sdp_handle_resize_request(ssk, (struct sdp_chrecvbuf *)(h+1)); break; case SDP_MID_CHRCVBUF_ACK: sdp_dbg_data(sk, "Handling RX CHRCVBUF_ACK\n"); sdp_handle_resize_ack(ssk, (struct sdp_chrecvbuf *)(h+1)); break; default: /* TODO: Handle other messages */ sdp_warn(sk, "SDP: FIXME MID 
%d\n", h->mid); break; } m_freem(mb); } static int sdp_process_rx_mb(struct sdp_sock *ssk, struct mbuf *mb) { struct socket *sk; struct sdp_bsdh *h; unsigned long mseq_ack; int credits_before; h = mtod(mb, struct sdp_bsdh *); sk = ssk->socket; /* * If another thread is in so_pcbfree this may be partially torn * down but no further synchronization is required as the destroying * thread will wait for receive to shutdown before discarding the * socket. */ if (sk == NULL) { m_freem(mb); return 0; } SDPSTATS_HIST_LINEAR(credits_before_update, tx_credits(ssk)); mseq_ack = ntohl(h->mseq_ack); credits_before = tx_credits(ssk); atomic_set(&ssk->tx_ring.credits, mseq_ack - ring_head(ssk->tx_ring) + 1 + ntohs(h->bufs)); if (mseq_ack >= ssk->nagle_last_unacked) ssk->nagle_last_unacked = 0; sdp_prf1(ssk->socket, mb, "RX %s +%d c:%d->%d mseq:%d ack:%d\n", mid2str(h->mid), ntohs(h->bufs), credits_before, tx_credits(ssk), ntohl(h->mseq), ntohl(h->mseq_ack)); if (unlikely(h->mid == SDP_MID_DATA && mb->m_pkthdr.len == SDP_HEAD_SIZE)) { /* Credit update is valid even after RCV_SHUTDOWN */ m_freem(mb); return 0; } if ((h->mid != SDP_MID_DATA && h->mid != SDP_MID_SRCAVAIL) || TCPS_HAVERCVDFIN(ssk->state)) { sdp_prf(sk, NULL, "Control mb - queing to control queue"); #ifdef SDP_ZCOPY if (h->mid == SDP_MID_SRCAVAIL_CANCEL) { sdp_dbg_data(sk, "Got SrcAvailCancel. " "seq: 0x%d seq_ack: 0x%d\n", ntohl(h->mseq), ntohl(h->mseq_ack)); ssk->srcavail_cancel_mseq = ntohl(h->mseq); } if (h->mid == SDP_MID_RDMARDCOMPL) { struct sdp_rrch *rrch = (struct sdp_rrch *)(h+1); sdp_dbg_data(sk, "RdmaRdCompl message arrived\n"); sdp_handle_rdma_read_compl(ssk, ntohl(h->mseq_ack), ntohl(rrch->len)); } #endif if (mbufq_enqueue(&ssk->rxctlq, mb) != 0) m_freem(mb); return (0); } sdp_prf1(sk, NULL, "queueing %s mb\n", mid2str(h->mid)); mb = sdp_sock_queue_rcv_mb(sk, mb); return 0; } /* called only from irq */ static struct mbuf * sdp_process_rx_wc(struct sdp_sock *ssk, struct ib_wc *wc) { struct mbuf *mb; struct sdp_bsdh *h; struct socket *sk = ssk->socket; int mseq; mb = sdp_recv_completion(ssk, wc->wr_id); if (unlikely(!mb)) return NULL; if (unlikely(wc->status)) { if (ssk->qp_active && sk) { sdp_dbg(sk, "Recv completion with error. " "Status %d, vendor: %d\n", wc->status, wc->vendor_err); sdp_abort(sk); ssk->qp_active = 0; } m_freem(mb); return NULL; } sdp_dbg_data(sk, "Recv completion. ID %d Length %d\n", (int)wc->wr_id, wc->byte_len); if (unlikely(wc->byte_len < sizeof(struct sdp_bsdh))) { sdp_warn(sk, "SDP BUG! byte_len %d < %zd\n", wc->byte_len, sizeof(struct sdp_bsdh)); m_freem(mb); return NULL; } /* Use m_adj to trim the tail of data we didn't use. */ m_adj(mb, -(mb->m_pkthdr.len - wc->byte_len)); h = mtod(mb, struct sdp_bsdh *); SDP_DUMP_PACKET(ssk->socket, "RX", mb, h); ssk->rx_packets++; ssk->rx_bytes += mb->m_pkthdr.len; mseq = ntohl(h->mseq); atomic_set(&ssk->mseq_ack, mseq); if (mseq != (int)wc->wr_id) sdp_warn(sk, "SDP BUG! mseq %d != wrid %d\n", mseq, (int)wc->wr_id); return mb; } /* Wakeup writers if we now have credits. */ static void sdp_bzcopy_write_space(struct sdp_sock *ssk) { struct socket *sk = ssk->socket; if (tx_credits(ssk) >= ssk->min_bufs && sk) sowwakeup(sk); } /* only from interrupt. 
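 *
 * Completions are drained in batches of SDP_NUM_WC; the loop below
 * repeats for as long as ib_poll_cq() returns a full batch, so a short
 * (or empty) batch is what terminates it.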
*/ static int sdp_poll_rx_cq(struct sdp_sock *ssk) { struct ib_cq *cq = ssk->rx_ring.cq; struct ib_wc ibwc[SDP_NUM_WC]; int n, i; int wc_processed = 0; struct mbuf *mb; do { n = ib_poll_cq(cq, SDP_NUM_WC, ibwc); for (i = 0; i < n; ++i) { struct ib_wc *wc = &ibwc[i]; BUG_ON(!(wc->wr_id & SDP_OP_RECV)); mb = sdp_process_rx_wc(ssk, wc); if (!mb) continue; sdp_process_rx_mb(ssk, mb); wc_processed++; } } while (n == SDP_NUM_WC); if (wc_processed) sdp_bzcopy_write_space(ssk); return wc_processed; } static void sdp_rx_comp_work(struct work_struct *work) { struct sdp_sock *ssk = container_of(work, struct sdp_sock, rx_comp_work); sdp_prf(ssk->socket, NULL, "%s", __func__); SDP_WLOCK(ssk); if (unlikely(!ssk->qp)) { sdp_prf(ssk->socket, NULL, "qp was destroyed"); goto out; } if (unlikely(!ssk->rx_ring.cq)) { sdp_prf(ssk->socket, NULL, "rx_ring.cq is NULL"); goto out; } if (unlikely(!ssk->poll_cq)) { struct rdma_cm_id *id = ssk->id; if (id && id->qp) rdma_notify(id, IB_EVENT_COMM_EST); goto out; } sdp_do_posts(ssk); out: SDP_WUNLOCK(ssk); } void sdp_do_posts(struct sdp_sock *ssk) { struct socket *sk = ssk->socket; int xmit_poll_force; struct mbuf *mb; SDP_WLOCK_ASSERT(ssk); if (!ssk->qp_active) { sdp_dbg(sk, "QP is deactivated\n"); return; } while ((mb = mbufq_dequeue(&ssk->rxctlq)) != NULL) sdp_process_rx_ctl_mb(ssk, mb); if (ssk->state == TCPS_TIME_WAIT) return; if (!ssk->rx_ring.cq || !ssk->tx_ring.cq) return; sdp_post_recvs(ssk); if (tx_ring_posted(ssk)) sdp_xmit_poll(ssk, 1); sdp_post_sends(ssk, M_NOWAIT); xmit_poll_force = tx_credits(ssk) < SDP_MIN_TX_CREDITS; if (credit_update_needed(ssk) || xmit_poll_force) { /* if has pending tx because run out of tx_credits - xmit it */ sdp_prf(sk, NULL, "Processing to free pending sends"); sdp_xmit_poll(ssk, xmit_poll_force); sdp_prf(sk, NULL, "Sending credit update"); sdp_post_sends(ssk, M_NOWAIT); } } int sdp_process_rx(struct sdp_sock *ssk) { int wc_processed = 0; int credits_before; if (!rx_ring_trylock(&ssk->rx_ring)) { sdp_dbg(ssk->socket, "ring destroyed. 
not polling it\n"); return 0; } credits_before = tx_credits(ssk); wc_processed = sdp_poll_rx_cq(ssk); sdp_prf(ssk->socket, NULL, "processed %d", wc_processed); if (wc_processed) { sdp_prf(ssk->socket, NULL, "credits: %d -> %d", credits_before, tx_credits(ssk)); queue_work(rx_comp_wq, &ssk->rx_comp_work); } sdp_arm_rx_cq(ssk); rx_ring_unlock(&ssk->rx_ring); return (wc_processed); } static void sdp_rx_irq(struct ib_cq *cq, void *cq_context) { struct sdp_sock *ssk; ssk = cq_context; KASSERT(cq == ssk->rx_ring.cq, ("%s: mismatched cq on %p", __func__, ssk)); SDPSTATS_COUNTER_INC(rx_int_count); sdp_prf(sk, NULL, "rx irq"); sdp_process_rx(ssk); } static void sdp_rx_ring_purge(struct sdp_sock *ssk) { while (rx_ring_posted(ssk) > 0) { struct mbuf *mb; mb = sdp_recv_completion(ssk, ring_tail(ssk->rx_ring)); if (!mb) break; m_freem(mb); } } void sdp_rx_ring_init(struct sdp_sock *ssk) { ssk->rx_ring.buffer = NULL; ssk->rx_ring.destroyed = 0; rw_init(&ssk->rx_ring.destroyed_lock, "sdp rx lock"); } static void sdp_rx_cq_event_handler(struct ib_event *event, void *data) { } int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device) { + struct ib_cq_init_attr rx_cq_attr = { + .cqe = SDP_RX_SIZE, + .comp_vector = 0, + .flags = 0, + }; struct ib_cq *rx_cq; int rc = 0; sdp_dbg(ssk->socket, "rx ring created"); INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work); atomic_set(&ssk->rx_ring.head, 1); atomic_set(&ssk->rx_ring.tail, 1); ssk->rx_ring.buffer = malloc(sizeof(*ssk->rx_ring.buffer) * SDP_RX_SIZE, M_SDP, M_WAITOK); rx_cq = ib_create_cq(device, sdp_rx_irq, sdp_rx_cq_event_handler, - ssk, SDP_RX_SIZE, 0); + ssk, &rx_cq_attr); if (IS_ERR(rx_cq)) { rc = PTR_ERR(rx_cq); sdp_warn(ssk->socket, "Unable to allocate RX CQ: %d.\n", rc); goto err_cq; } sdp_sk(ssk->socket)->rx_ring.cq = rx_cq; sdp_arm_rx_cq(ssk); return 0; err_cq: free(ssk->rx_ring.buffer, M_SDP); ssk->rx_ring.buffer = NULL; return rc; } void sdp_rx_ring_destroy(struct sdp_sock *ssk) { cancel_work_sync(&ssk->rx_comp_work); rx_ring_destroy_lock(&ssk->rx_ring); if (ssk->rx_ring.buffer) { sdp_rx_ring_purge(ssk); free(ssk->rx_ring.buffer, M_SDP); ssk->rx_ring.buffer = NULL; } if (ssk->rx_ring.cq) { if (ib_destroy_cq(ssk->rx_ring.cq)) { sdp_warn(ssk->socket, "destroy cq(%p) failed\n", ssk->rx_ring.cq); } else { ssk->rx_ring.cq = NULL; } } WARN_ON(ring_head(ssk->rx_ring) != ring_tail(ssk->rx_ring)); } Index: projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c =================================================================== --- projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c (revision 323642) +++ projects/bsd_rdma_4_9/sys/ofed/drivers/infiniband/ulp/sdp/sdp_tx.c (revision 323643) @@ -1,480 +1,485 @@ /* * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. 
* * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "sdp.h" #define sdp_cnt(var) do { (var)++; } while (0) SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0, "Total number of keepalive probes sent."); static int sdp_process_tx_cq(struct sdp_sock *ssk); static void sdp_poll_tx_timeout(void *data); int sdp_xmit_poll(struct sdp_sock *ssk, int force) { int wc_processed = 0; SDP_WLOCK_ASSERT(ssk); sdp_prf(ssk->socket, NULL, "%s", __func__); /* If we don't have a pending timer, set one up to catch our recent post in case the interface becomes idle */ if (!callout_pending(&ssk->tx_ring.timer)) callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT, sdp_poll_tx_timeout, ssk); /* Poll the CQ every SDP_TX_POLL_MODER packets */ if (force || (++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) == 0) wc_processed = sdp_process_tx_cq(ssk); return wc_processed; } void sdp_post_send(struct sdp_sock *ssk, struct mbuf *mb) { struct sdp_buf *tx_req; struct sdp_bsdh *h; unsigned long mseq; struct ib_device *dev; struct ib_send_wr *bad_wr; struct ib_sge ibsge[SDP_MAX_SEND_SGES]; struct ib_sge *sge; struct ib_send_wr tx_wr = { NULL }; int i, rc; u64 addr; SDPSTATS_COUNTER_MID_INC(post_send, h->mid); SDPSTATS_HIST(send_size, mb->len); if (!ssk->qp_active) { m_freem(mb); return; } mseq = ring_head(ssk->tx_ring); h = mtod(mb, struct sdp_bsdh *); ssk->tx_packets++; ssk->tx_bytes += mb->m_pkthdr.len; #ifdef SDP_ZCOPY if (unlikely(h->mid == SDP_MID_SRCAVAIL)) { struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(mb); if (ssk->tx_sa != tx_sa) { sdp_dbg_data(ssk->socket, "SrcAvail cancelled " "before being sent!\n"); WARN_ON(1); m_freem(mb); return; } TX_SRCAVAIL_STATE(mb)->mseq = mseq; } #endif if (unlikely(mb->m_flags & M_URG)) h->flags = SDP_OOB_PRES | SDP_OOB_PEND; else h->flags = 0; mb->m_flags |= M_RDONLY; /* Don't allow compression once sent. 
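 *
 * The BSDH fields filled in next double as flow control: 'bufs'
 * advertises how many receive buffers are currently posted (the
 * credits the peer may consume) and 'mseq_ack' acknowledges the latest
 * message sequence received.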
*/ h->bufs = htons(rx_ring_posted(ssk)); h->len = htonl(mb->m_pkthdr.len); h->mseq = htonl(mseq); h->mseq_ack = htonl(mseq_ack(ssk)); sdp_prf1(ssk->socket, mb, "TX: %s bufs: %d mseq:%ld ack:%d", mid2str(h->mid), rx_ring_posted(ssk), mseq, ntohl(h->mseq_ack)); SDP_DUMP_PACKET(ssk->socket, "TX", mb, h); tx_req = &ssk->tx_ring.buffer[mseq & (SDP_TX_SIZE - 1)]; tx_req->mb = mb; dev = ssk->ib_device; sge = &ibsge[0]; for (i = 0; mb != NULL; i++, mb = mb->m_next, sge++) { addr = ib_dma_map_single(dev, mb->m_data, mb->m_len, DMA_TO_DEVICE); /* TODO: proper error handling */ BUG_ON(ib_dma_mapping_error(dev, addr)); BUG_ON(i >= SDP_MAX_SEND_SGES); tx_req->mapping[i] = addr; sge->addr = addr; sge->length = mb->m_len; - sge->lkey = ssk->sdp_dev->mr->lkey; + sge->lkey = ssk->sdp_dev->pd->local_dma_lkey; } tx_wr.next = NULL; tx_wr.wr_id = mseq | SDP_OP_SEND; tx_wr.sg_list = ibsge; tx_wr.num_sge = i; tx_wr.opcode = IB_WR_SEND; tx_wr.send_flags = IB_SEND_SIGNALED; if (unlikely(tx_req->mb->m_flags & M_URG)) tx_wr.send_flags |= IB_SEND_SOLICITED; rc = ib_post_send(ssk->qp, &tx_wr, &bad_wr); if (unlikely(rc)) { sdp_dbg(ssk->socket, "ib_post_send failed with status %d.\n", rc); sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE); sdp_notify(ssk, ECONNRESET); m_freem(tx_req->mb); return; } atomic_inc(&ssk->tx_ring.head); atomic_dec(&ssk->tx_ring.credits); atomic_set(&ssk->remote_credits, rx_ring_posted(ssk)); return; } static struct mbuf * sdp_send_completion(struct sdp_sock *ssk, int mseq) { struct ib_device *dev; struct sdp_buf *tx_req; struct mbuf *mb = NULL; struct sdp_tx_ring *tx_ring = &ssk->tx_ring; if (unlikely(mseq != ring_tail(*tx_ring))) { printk(KERN_WARNING "Bogus send completion id %d tail %d\n", mseq, ring_tail(*tx_ring)); goto out; } dev = ssk->ib_device; tx_req = &tx_ring->buffer[mseq & (SDP_TX_SIZE - 1)]; mb = tx_req->mb; sdp_cleanup_sdp_buf(ssk, tx_req, DMA_TO_DEVICE); #ifdef SDP_ZCOPY /* TODO: AIO and real zcopy code; add their context support here */ if (BZCOPY_STATE(mb)) BZCOPY_STATE(mb)->busy--; #endif atomic_inc(&tx_ring->tail); out: return mb; } static int sdp_handle_send_comp(struct sdp_sock *ssk, struct ib_wc *wc) { struct mbuf *mb = NULL; struct sdp_bsdh *h; if (unlikely(wc->status)) { if (wc->status != IB_WC_WR_FLUSH_ERR) { sdp_prf(ssk->socket, mb, "Send completion with error. " "Status %d", wc->status); sdp_dbg_data(ssk->socket, "Send completion with error. " "Status %d\n", wc->status); sdp_notify(ssk, ECONNRESET); } } mb = sdp_send_completion(ssk, wc->wr_id); if (unlikely(!mb)) return -1; h = mtod(mb, struct sdp_bsdh *); sdp_prf1(ssk->socket, mb, "tx completion. mseq:%d", ntohl(h->mseq)); sdp_dbg(ssk->socket, "tx completion. %p %d mseq:%d", mb, mb->m_pkthdr.len, ntohl(h->mseq)); m_freem(mb); return 0; } static inline void sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc) { if (likely(wc->wr_id & SDP_OP_SEND)) { sdp_handle_send_comp(ssk, wc); return; } #ifdef SDP_ZCOPY if (wc->wr_id & SDP_OP_RDMA) { /* TODO: handle failed RDMA read cqe */ sdp_dbg_data(ssk->socket, "TX comp: RDMA read. status: %d\n", wc->status); sdp_prf1(sk, NULL, "TX comp: RDMA read"); if (!ssk->tx_ring.rdma_inflight) { sdp_warn(ssk->socket, "ERROR: unexpected RDMA read\n"); return; } if (!ssk->tx_ring.rdma_inflight->busy) { sdp_warn(ssk->socket, "ERROR: too many RDMA read completions\n"); return; } /* Only last RDMA read WR is signalled. 
Order is guaranteed - * therefore if Last RDMA read WR is completed - all other * have, too */ ssk->tx_ring.rdma_inflight->busy = 0; sowwakeup(ssk->socket); sdp_dbg_data(ssk->socket, "woke up sleepers\n"); return; } #endif /* Keepalive probe sent cleanup */ sdp_cnt(sdp_keepalive_probes_sent); if (likely(!wc->status)) return; sdp_dbg(ssk->socket, " %s consumes KEEPALIVE status %d\n", __func__, wc->status); if (wc->status == IB_WC_WR_FLUSH_ERR) return; sdp_notify(ssk, ECONNRESET); } static int sdp_process_tx_cq(struct sdp_sock *ssk) { struct ib_wc ibwc[SDP_NUM_WC]; int n, i; int wc_processed = 0; SDP_WLOCK_ASSERT(ssk); if (!ssk->tx_ring.cq) { sdp_dbg(ssk->socket, "tx irq on destroyed tx_cq\n"); return 0; } do { n = ib_poll_cq(ssk->tx_ring.cq, SDP_NUM_WC, ibwc); for (i = 0; i < n; ++i) { sdp_process_tx_wc(ssk, ibwc + i); wc_processed++; } } while (n == SDP_NUM_WC); if (wc_processed) { sdp_post_sends(ssk, M_NOWAIT); sdp_prf1(sk, NULL, "Waking sendmsg. inflight=%d", (u32) tx_ring_posted(ssk)); sowwakeup(ssk->socket); } return wc_processed; } static void sdp_poll_tx(struct sdp_sock *ssk) { struct socket *sk = ssk->socket; u32 inflight, wc_processed; sdp_prf1(ssk->socket, NULL, "TX timeout: inflight=%d, head=%d tail=%d", (u32) tx_ring_posted(ssk), ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring)); if (unlikely(ssk->state == TCPS_CLOSED)) { sdp_warn(sk, "Socket is closed\n"); goto out; } wc_processed = sdp_process_tx_cq(ssk); if (!wc_processed) SDPSTATS_COUNTER_INC(tx_poll_miss); else SDPSTATS_COUNTER_INC(tx_poll_hit); inflight = (u32) tx_ring_posted(ssk); sdp_prf1(ssk->socket, NULL, "finished tx processing. inflight = %d", inflight); /* If there are still packets in flight and the timer has not already * been scheduled by the Tx routine then schedule it here to guarantee * completion processing of these packets */ if (inflight) callout_reset(&ssk->tx_ring.timer, SDP_TX_POLL_TIMEOUT, sdp_poll_tx_timeout, ssk); out: #ifdef SDP_ZCOPY if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) { sdp_prf1(sk, NULL, "RDMA is inflight - arming irq"); sdp_arm_tx_cq(ssk); } #endif return; } static void sdp_poll_tx_timeout(void *data) { struct sdp_sock *ssk = (struct sdp_sock *)data; if (!callout_active(&ssk->tx_ring.timer)) return; callout_deactivate(&ssk->tx_ring.timer); sdp_poll_tx(ssk); } static void sdp_tx_irq(struct ib_cq *cq, void *cq_context) { struct sdp_sock *ssk; ssk = cq_context; sdp_prf1(ssk->socket, NULL, "tx irq"); sdp_dbg_data(ssk->socket, "Got tx comp interrupt\n"); SDPSTATS_COUNTER_INC(tx_int_count); SDP_WLOCK(ssk); sdp_poll_tx(ssk); SDP_WUNLOCK(ssk); } static void sdp_tx_ring_purge(struct sdp_sock *ssk) { while (tx_ring_posted(ssk)) { struct mbuf *mb; mb = sdp_send_completion(ssk, ring_tail(ssk->tx_ring)); if (!mb) break; m_freem(mb); } } void sdp_post_keepalive(struct sdp_sock *ssk) { int rc; struct ib_send_wr wr, *bad_wr; sdp_dbg(ssk->socket, "%s\n", __func__); memset(&wr, 0, sizeof(wr)); wr.next = NULL; wr.wr_id = 0; wr.sg_list = NULL; wr.num_sge = 0; wr.opcode = IB_WR_RDMA_WRITE; rc = ib_post_send(ssk->qp, &wr, &bad_wr); if (rc) { sdp_dbg(ssk->socket, "ib_post_keepalive failed with status %d.\n", rc); sdp_notify(ssk, ECONNRESET); } sdp_cnt(sdp_keepalive_probes_sent); } static void sdp_tx_cq_event_handler(struct ib_event *event, void *data) { } int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device) { + struct ib_cq_init_attr tx_cq_attr = { + .cqe = SDP_TX_SIZE, + .comp_vector = 0, + .flags = 0, + }; struct ib_cq *tx_cq; int rc = 0; sdp_dbg(ssk->socket, "tx ring 
create\n"); callout_init_rw(&ssk->tx_ring.timer, &ssk->lock, 0); callout_init_rw(&ssk->nagle_timer, &ssk->lock, 0); atomic_set(&ssk->tx_ring.head, 1); atomic_set(&ssk->tx_ring.tail, 1); ssk->tx_ring.buffer = malloc(sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE, M_SDP, M_WAITOK); tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler, - ssk, SDP_TX_SIZE, 0); + ssk, &tx_cq_attr); if (IS_ERR(tx_cq)) { rc = PTR_ERR(tx_cq); sdp_warn(ssk->socket, "Unable to allocate TX CQ: %d.\n", rc); goto err_cq; } ssk->tx_ring.cq = tx_cq; ssk->tx_ring.poll_cnt = 0; sdp_arm_tx_cq(ssk); return 0; err_cq: free(ssk->tx_ring.buffer, M_SDP); ssk->tx_ring.buffer = NULL; return rc; } void sdp_tx_ring_destroy(struct sdp_sock *ssk) { sdp_dbg(ssk->socket, "tx ring destroy\n"); SDP_WLOCK(ssk); callout_stop(&ssk->tx_ring.timer); callout_stop(&ssk->nagle_timer); SDP_WUNLOCK(ssk); callout_drain(&ssk->tx_ring.timer); callout_drain(&ssk->nagle_timer); if (ssk->tx_ring.buffer) { sdp_tx_ring_purge(ssk); free(ssk->tx_ring.buffer, M_SDP); ssk->tx_ring.buffer = NULL; } if (ssk->tx_ring.cq) { if (ib_destroy_cq(ssk->tx_ring.cq)) { sdp_warn(ssk->socket, "destroy cq(%p) failed\n", ssk->tx_ring.cq); } else { ssk->tx_ring.cq = NULL; } } WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring)); }