Index: user/alc/PQ_LAUNDRY/sys/ddb/db_ps.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/ddb/db_ps.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/ddb/db_ps.c (revision 303654) @@ -1,488 +1,495 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include static void dumpthread(volatile struct proc *p, volatile struct thread *td, int all); /* * At least one non-optional show-command must be implemented using * DB_SHOW_ALL_COMMAND() so that db_show_all_cmd_set gets created. * Here is one. */ DB_SHOW_ALL_COMMAND(procs, db_procs_cmd) { db_ps(addr, have_addr, count, modif); } /* * Layout: * - column counts * - header * - single-threaded process * - multi-threaded process * - thread in a MT process * * 1 2 3 4 5 6 7 * 1234567890123456789012345678901234567890123456789012345678901234567890 * pid ppid pgrp uid state wmesg wchan cmd * < wmesg > < wchan > * (threaded) * < wmesg > < wchan > * * For machines with 64-bit pointers, we expand the wchan field 8 more * characters. */ void db_ps(db_expr_t addr, bool hasaddr, db_expr_t count, char *modif) { volatile struct proc *p, *pp; volatile struct thread *td; struct ucred *cred; struct pgrp *pgrp; char state[9]; int np, rflag, sflag, dflag, lflag, wflag; np = nprocs; if (!LIST_EMPTY(&allproc)) p = LIST_FIRST(&allproc); else p = &proc0; #ifdef __LP64__ db_printf(" pid ppid pgrp uid state wmesg wchan cmd\n"); #else db_printf(" pid ppid pgrp uid state wmesg wchan cmd\n"); #endif while (--np >= 0 && !db_pager_quit) { if (p == NULL) { db_printf("oops, ran out of processes early!\n"); break; } pp = p->p_pptr; if (pp == NULL) pp = p; cred = p->p_ucred; pgrp = p->p_pgrp; db_printf("%5d %5d %5d %5d ", p->p_pid, pp->p_pid, pgrp != NULL ? pgrp->pg_id : 0, cred != NULL ? cred->cr_ruid : 0); /* Determine our primary process state. */ switch (p->p_state) { case PRS_NORMAL: if (P_SHOULDSTOP(p)) state[0] = 'T'; else { /* * One of D, L, R, S, W. For a * multithreaded process we will use * the state of the thread with the * highest precedence. The * precedence order from high to low * is R, L, D, S, W. 
If no thread is * in a sane state we use '?' for our * primary state. */ rflag = sflag = dflag = lflag = wflag = 0; FOREACH_THREAD_IN_PROC(p, td) { if (td->td_state == TDS_RUNNING || td->td_state == TDS_RUNQ || td->td_state == TDS_CAN_RUN) rflag++; if (TD_ON_LOCK(td)) lflag++; if (TD_IS_SLEEPING(td)) { if (!(td->td_flags & TDF_SINTR)) dflag++; else sflag++; } if (TD_AWAITING_INTR(td)) wflag++; } if (rflag) state[0] = 'R'; else if (lflag) state[0] = 'L'; else if (dflag) state[0] = 'D'; else if (sflag) state[0] = 'S'; else if (wflag) state[0] = 'W'; else state[0] = '?'; } break; case PRS_NEW: state[0] = 'N'; break; case PRS_ZOMBIE: state[0] = 'Z'; break; default: state[0] = 'U'; break; } state[1] = '\0'; /* Additional process state flags. */ if (!(p->p_flag & P_INMEM)) strlcat(state, "W", sizeof(state)); if (p->p_flag & P_TRACED) strlcat(state, "X", sizeof(state)); if (p->p_flag & P_WEXIT && p->p_state != PRS_ZOMBIE) strlcat(state, "E", sizeof(state)); if (p->p_flag & P_PPWAIT) strlcat(state, "V", sizeof(state)); if (p->p_flag & P_SYSTEM || p->p_lock > 0) strlcat(state, "L", sizeof(state)); if (p->p_pgrp != NULL && p->p_session != NULL && SESS_LEADER(p)) strlcat(state, "s", sizeof(state)); /* Cheated here and didn't compare pgid's. 
*/ if (p->p_flag & P_CONTROLT) strlcat(state, "+", sizeof(state)); if (cred != NULL && jailed(cred)) strlcat(state, "J", sizeof(state)); db_printf(" %-6.6s ", state); if (p->p_flag & P_HADTHREADS) { #ifdef __LP64__ db_printf(" (threaded) "); #else db_printf(" (threaded) "); #endif if (p->p_flag & P_SYSTEM) db_printf("["); db_printf("%s", p->p_comm); if (p->p_flag & P_SYSTEM) db_printf("]"); db_printf("\n"); } FOREACH_THREAD_IN_PROC(p, td) { dumpthread(p, td, p->p_flag & P_HADTHREADS); if (db_pager_quit) break; } p = LIST_NEXT(p, p_list); if (p == NULL && np > 0) p = LIST_FIRST(&zombproc); } } static void dumpthread(volatile struct proc *p, volatile struct thread *td, int all) { char state[9], wprefix; const char *wmesg; void *wchan; if (all) { db_printf("%6d ", td->td_tid); switch (td->td_state) { case TDS_RUNNING: snprintf(state, sizeof(state), "Run"); break; case TDS_RUNQ: snprintf(state, sizeof(state), "RunQ"); break; case TDS_CAN_RUN: snprintf(state, sizeof(state), "CanRun"); break; case TDS_INACTIVE: snprintf(state, sizeof(state), "Inactv"); break; case TDS_INHIBITED: state[0] = '\0'; if (TD_ON_LOCK(td)) strlcat(state, "L", sizeof(state)); if (TD_IS_SLEEPING(td)) { if (td->td_flags & TDF_SINTR) strlcat(state, "S", sizeof(state)); else strlcat(state, "D", sizeof(state)); } if (TD_IS_SWAPPED(td)) strlcat(state, "W", sizeof(state)); if (TD_AWAITING_INTR(td)) strlcat(state, "I", sizeof(state)); if (TD_IS_SUSPENDED(td)) strlcat(state, "s", sizeof(state)); if (state[0] != '\0') break; default: snprintf(state, sizeof(state), "???"); } db_printf(" %-6.6s ", state); } wprefix = ' '; if (TD_ON_LOCK(td)) { wprefix = '*'; wmesg = td->td_lockname; wchan = td->td_blocked; } else if (TD_ON_SLEEPQ(td)) { wmesg = td->td_wmesg; wchan = td->td_wchan; } else if (TD_IS_RUNNING(td)) { snprintf(state, sizeof(state), "CPU %d", td->td_oncpu); wmesg = state; wchan = NULL; } else { wmesg = ""; wchan = NULL; } db_printf("%c%-8.8s ", wprefix, wmesg); if (wchan == NULL) #ifdef __LP64__ 
db_printf("%18s ", ""); #else db_printf("%10s ", ""); #endif else db_printf("%p ", wchan); if (p->p_flag & P_SYSTEM) db_printf("["); if (td->td_name[0] != '\0') db_printf("%s", td->td_name); else db_printf("%s", td->td_proc->p_comm); if (p->p_flag & P_SYSTEM) db_printf("]"); db_printf("\n"); } DB_SHOW_COMMAND(thread, db_show_thread) { struct thread *td; struct lock_object *lock; bool comma; int delta; /* Determine which thread to examine. */ if (have_addr) td = db_lookup_thread(addr, false); else td = kdb_thread; lock = (struct lock_object *)td->td_lock; db_printf("Thread %d at %p:\n", td->td_tid, td); db_printf(" proc (pid %d): %p\n", td->td_proc->p_pid, td->td_proc); if (td->td_name[0] != '\0') db_printf(" name: %s\n", td->td_name); db_printf(" stack: %p-%p\n", (void *)td->td_kstack, (void *)(td->td_kstack + td->td_kstack_pages * PAGE_SIZE - 1)); db_printf(" flags: %#x ", td->td_flags); db_printf(" pflags: %#x\n", td->td_pflags); db_printf(" state: "); switch (td->td_state) { case TDS_INACTIVE: db_printf("INACTIVE\n"); break; case TDS_CAN_RUN: db_printf("CAN RUN\n"); break; case TDS_RUNQ: db_printf("RUNQ\n"); break; case TDS_RUNNING: db_printf("RUNNING (CPU %d)\n", td->td_oncpu); break; case TDS_INHIBITED: db_printf("INHIBITED: {"); comma = false; if (TD_IS_SLEEPING(td)) { db_printf("SLEEPING"); comma = true; } if (TD_IS_SUSPENDED(td)) { if (comma) db_printf(", "); db_printf("SUSPENDED"); comma = true; } if (TD_IS_SWAPPED(td)) { if (comma) db_printf(", "); db_printf("SWAPPED"); comma = true; } if (TD_ON_LOCK(td)) { if (comma) db_printf(", "); db_printf("LOCK"); comma = true; } if (TD_AWAITING_INTR(td)) { if (comma) db_printf(", "); db_printf("IWAIT"); } db_printf("}\n"); break; default: db_printf("??? (%#x)\n", td->td_state); break; } if (TD_ON_LOCK(td)) db_printf(" lock: %s turnstile: %p\n", td->td_lockname, td->td_blocked); if (TD_ON_SLEEPQ(td)) db_printf( " wmesg: %s wchan: %p sleeptimo %lx. %jx (curr %lx. 
%jx)\n", td->td_wmesg, td->td_wchan, (long)sbttobt(td->td_sleeptimo).sec, (uintmax_t)sbttobt(td->td_sleeptimo).frac, (long)sbttobt(sbinuptime()).sec, (uintmax_t)sbttobt(sbinuptime()).frac); db_printf(" priority: %d\n", td->td_priority); db_printf(" container lock: %s (%p)\n", lock->lo_name, lock); if (td->td_swvoltick != 0) { delta = (u_int)ticks - (u_int)td->td_swvoltick; db_printf(" last voluntary switch: %d ms ago\n", 1000 * delta / hz); } if (td->td_swinvoltick != 0) { delta = (u_int)ticks - (u_int)td->td_swinvoltick; db_printf(" last involuntary switch: %d ms ago\n", 1000 * delta / hz); } } DB_SHOW_COMMAND(proc, db_show_proc) { struct thread *td; struct proc *p; int i; /* Determine which process to examine. */ if (have_addr) p = db_lookup_proc(addr); else p = kdb_thread->td_proc; db_printf("Process %d (%s) at %p:\n", p->p_pid, p->p_comm, p); db_printf(" state: "); switch (p->p_state) { case PRS_NEW: db_printf("NEW\n"); break; case PRS_NORMAL: db_printf("NORMAL\n"); break; case PRS_ZOMBIE: db_printf("ZOMBIE\n"); break; default: db_printf("??? 
(%#x)\n", p->p_state); } if (p->p_ucred != NULL) { db_printf(" uid: %d gids: ", p->p_ucred->cr_uid); for (i = 0; i < p->p_ucred->cr_ngroups; i++) { db_printf("%d", p->p_ucred->cr_groups[i]); if (i < (p->p_ucred->cr_ngroups - 1)) db_printf(", "); } db_printf("\n"); } if (p->p_pptr != NULL) db_printf(" parent: pid %d at %p\n", p->p_pptr->p_pid, p->p_pptr); if (p->p_leader != NULL && p->p_leader != p) db_printf(" leader: pid %d at %p\n", p->p_leader->p_pid, p->p_leader); if (p->p_sysent != NULL) db_printf(" ABI: %s\n", p->p_sysent->sv_name); - if (p->p_args != NULL) - db_printf(" arguments: %.*s\n", (int)p->p_args->ar_length, - p->p_args->ar_args); + if (p->p_args != NULL) { + db_printf(" arguments: "); + for (i = 0; i < (int)p->p_args->ar_length; i++) { + if (p->p_args->ar_args[i] == '\0') + db_printf(" "); + else + db_printf("%c", p->p_args->ar_args[i]); + } + db_printf("\n"); + } db_printf(" threads: %d\n", p->p_numthreads); FOREACH_THREAD_IN_PROC(p, td) { dumpthread(p, td, 1); if (db_pager_quit) break; } } void db_findstack_cmd(db_expr_t addr, bool have_addr, db_expr_t dummy3 __unused, char *dummy4 __unused) { struct proc *p; struct thread *td; struct kstack_cache_entry *ks_ce; vm_offset_t saddr; if (have_addr) saddr = addr; else { db_printf("Usage: findstack
\n"); return; } FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { if (td->td_kstack <= saddr && saddr < td->td_kstack + PAGE_SIZE * td->td_kstack_pages) { db_printf("Thread %p\n", td); return; } } } for (ks_ce = kstack_cache; ks_ce != NULL; ks_ce = ks_ce->next_ks_entry) { if ((vm_offset_t)ks_ce <= saddr && saddr < (vm_offset_t)ks_ce + PAGE_SIZE * kstack_pages) { db_printf("Cached stack %p\n", ks_ce); return; } } } Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/adapter.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/adapter.h (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/adapter.h (revision 303654) @@ -1,1176 +1,1178 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef __T4_ADAPTER_H__ #define __T4_ADAPTER_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "offload.h" #include "t4_ioctl.h" #include "common/t4_msg.h" #include "firmware/t4fw_interface.h" #define KTR_CXGBE KTR_SPARE3 MALLOC_DECLARE(M_CXGBE); #define CXGBE_UNIMPLEMENTED(s) \ panic("%s (%s, line %d) not implemented yet.", s, __FILE__, __LINE__) #if defined(__i386__) || defined(__amd64__) static __inline void prefetch(void *x) { __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); } #else #define prefetch(x) #endif #ifndef SYSCTL_ADD_UQUAD #define SYSCTL_ADD_UQUAD SYSCTL_ADD_QUAD #define sysctl_handle_64 sysctl_handle_quad #define CTLTYPE_U64 CTLTYPE_QUAD #endif #if (__FreeBSD_version >= 900030) || \ ((__FreeBSD_version >= 802507) && (__FreeBSD_version < 900000)) #define SBUF_DRAIN 1 #endif #ifdef __amd64__ /* XXX: need systemwide bus_space_read_8/bus_space_write_8 */ static __inline uint64_t t4_bus_space_read_8(bus_space_tag_t tag, bus_space_handle_t handle, bus_size_t offset) { KASSERT(tag == X86_BUS_SPACE_MEM, ("%s: can only handle mem space", __func__)); return (*(volatile uint64_t *)(handle + offset)); } static __inline void t4_bus_space_write_8(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, uint64_t value) { KASSERT(tag == X86_BUS_SPACE_MEM, ("%s: can only handle 
mem space", __func__)); *(volatile uint64_t *)(bsh + offset) = value; } #else static __inline uint64_t t4_bus_space_read_8(bus_space_tag_t tag, bus_space_handle_t handle, bus_size_t offset) { return (uint64_t)bus_space_read_4(tag, handle, offset) + ((uint64_t)bus_space_read_4(tag, handle, offset + 4) << 32); } static __inline void t4_bus_space_write_8(bus_space_tag_t tag, bus_space_handle_t bsh, bus_size_t offset, uint64_t value) { bus_space_write_4(tag, bsh, offset, value); bus_space_write_4(tag, bsh, offset + 4, value >> 32); } #endif struct adapter; typedef struct adapter adapter_t; enum { /* * All ingress queues use this entry size. Note that the firmware event * queue and any iq expecting CPL_RX_PKT in the descriptor needs this to * be at least 64. */ IQ_ESIZE = 64, /* Default queue sizes for all kinds of ingress queues */ FW_IQ_QSIZE = 256, RX_IQ_QSIZE = 1024, /* All egress queues use this entry size */ EQ_ESIZE = 64, /* Default queue sizes for all kinds of egress queues */ CTRL_EQ_QSIZE = 128, TX_EQ_QSIZE = 1024, #if MJUMPAGESIZE != MCLBYTES SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */ #else SW_ZONE_SIZES = 3, /* cluster, jumbo9k, jumbo16k */ #endif CL_METADATA_SIZE = CACHE_LINE_SIZE, SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ TX_SGL_SEGS = 39, TX_SGL_SEGS_TSO = 38, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 }; enum { /* adapter intr_type */ INTR_INTX = (1 << 0), INTR_MSI = (1 << 1), INTR_MSIX = (1 << 2) }; enum { XGMAC_MTU = (1 << 0), XGMAC_PROMISC = (1 << 1), XGMAC_ALLMULTI = (1 << 2), XGMAC_VLANEX = (1 << 3), XGMAC_UCADDR = (1 << 4), XGMAC_MCADDRS = (1 << 5), XGMAC_ALL = 0xffff }; enum { /* flags understood by begin_synchronized_op */ HOLD_LOCK = (1 << 0), SLEEP_OK = (1 << 1), INTR_OK = (1 << 2), /* flags understood by end_synchronized_op */ LOCK_HELD = HOLD_LOCK, }; enum { /* adapter flags */ FULL_INIT_DONE = (1 << 0), FW_OK = (1 << 1), /* INTR_DIRECT = (1 << 2), No longer used. 
*/ MASTER_PF = (1 << 3), ADAP_SYSCTL_CTX = (1 << 4), /* TOM_INIT_DONE= (1 << 5), No longer used */ BUF_PACKING_OK = (1 << 6), CXGBE_BUSY = (1 << 9), /* port flags */ HAS_TRACEQ = (1 << 3), /* VI flags */ DOOMED = (1 << 0), VI_INIT_DONE = (1 << 1), VI_SYSCTL_CTX = (1 << 2), INTR_RXQ = (1 << 4), /* All NIC rxq's take interrupts */ INTR_OFLD_RXQ = (1 << 5), /* All TOE rxq's take interrupts */ INTR_ALL = (INTR_RXQ | INTR_OFLD_RXQ), /* adapter debug_flags */ DF_DUMP_MBOX = (1 << 0), }; #define IS_DOOMED(vi) ((vi)->flags & DOOMED) #define SET_DOOMED(vi) do {(vi)->flags |= DOOMED;} while (0) #define IS_BUSY(sc) ((sc)->flags & CXGBE_BUSY) #define SET_BUSY(sc) do {(sc)->flags |= CXGBE_BUSY;} while (0) #define CLR_BUSY(sc) do {(sc)->flags &= ~CXGBE_BUSY;} while (0) struct vi_info { device_t dev; struct port_info *pi; struct ifnet *ifp; struct ifmedia media; unsigned long flags; int if_flags; uint16_t *rss, *nm_rss; uint16_t viid; int16_t xact_addr_filt;/* index of exact MAC address filter */ uint16_t rss_size; /* size of VI's RSS table slice */ uint16_t rss_base; /* start of VI's RSS table slice */ eventhandler_tag vlan_c; int nintr; int first_intr; /* These need to be int as they are used in sysctl */ int ntxq; /* # of tx queues */ int first_txq; /* index of first tx queue */ int rsrv_noflowq; /* Reserve queue 0 for non-flowid packets */ int nrxq; /* # of rx queues */ int first_rxq; /* index of first rx queue */ int nofldtxq; /* # of offload tx queues */ int first_ofld_txq; /* index of first offload tx queue */ int nofldrxq; /* # of offload rx queues */ int first_ofld_rxq; /* index of first offload rx queue */ int nnmtxq; int first_nm_txq; int nnmrxq; int first_nm_rxq; int tmr_idx; int pktc_idx; int qsize_rxq; int qsize_txq; struct timeval last_refreshed; struct fw_vi_stats_vf stats; struct callout tick; struct sysctl_ctx_list ctx; /* from ifconfig up to driver detach */ uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */ }; enum { /* tx_sched_class 
flags */ TX_SC_OK = (1 << 0), /* Set up in hardware, active. */ }; struct tx_sched_class { int refcount; int flags; struct t4_sched_class_params params; }; struct port_info { device_t dev; struct adapter *adapter; struct vi_info *vi; int nvi; int up_vis; int uld_vis; struct tx_sched_class *tc; /* traffic classes for this channel */ struct mtx pi_lock; char lockname[16]; unsigned long flags; uint8_t lport; /* associated offload logical port */ int8_t mdio_addr; uint8_t port_type; uint8_t mod_type; uint8_t port_id; uint8_t tx_chan; uint8_t rx_chan_map; /* rx MPS channel bitmap */ int linkdnrc; struct link_config link_cfg; struct timeval last_refreshed; struct port_stats stats; u_int tnl_cong_drops; u_int tx_parse_error; struct callout tick; }; #define IS_MAIN_VI(vi) ((vi) == &((vi)->pi->vi[0])) /* Where the cluster came from, how it has been carved up. */ struct cluster_layout { int8_t zidx; int8_t hwidx; uint16_t region1; /* mbufs laid out within this region */ /* region2 is the DMA region */ uint16_t region3; /* cluster_metadata within this region */ }; struct cluster_metadata { u_int refcount; struct fl_sdesc *sd; /* For debug only. 
Could easily be stale */ }; struct fl_sdesc { caddr_t cl; uint16_t nmbuf; /* # of driver originated mbufs with ref on cluster */ struct cluster_layout cll; }; struct tx_desc { __be64 flit[8]; }; struct tx_sdesc { struct mbuf *m; /* m_nextpkt linked chain of frames */ uint8_t desc_used; /* # of hardware descriptors used by the WR */ }; #define IQ_PAD (IQ_ESIZE - sizeof(struct rsp_ctrl) - sizeof(struct rss_header)) struct iq_desc { struct rss_header rss; uint8_t cpl[IQ_PAD]; struct rsp_ctrl rsp; }; #undef IQ_PAD CTASSERT(sizeof(struct iq_desc) == IQ_ESIZE); enum { /* iq flags */ IQ_ALLOCATED = (1 << 0), /* firmware resources allocated */ IQ_HAS_FL = (1 << 1), /* iq associated with a freelist */ IQ_INTR = (1 << 2), /* iq takes direct interrupt */ IQ_LRO_ENABLED = (1 << 3), /* iq is an eth rxq with LRO enabled */ /* iq state */ IQS_DISABLED = 0, IQS_BUSY = 1, IQS_IDLE = 2, /* netmap related flags */ NM_OFF = 0, NM_ON = 1, NM_BUSY = 2, }; struct sge_iq; struct rss_header; typedef int (*cpl_handler_t)(struct sge_iq *, const struct rss_header *, struct mbuf *); typedef int (*an_handler_t)(struct sge_iq *, const struct rsp_ctrl *); typedef int (*fw_msg_handler_t)(struct adapter *, const __be64 *); /* * Ingress Queue: T4 is producer, driver is consumer. 
*/ struct sge_iq { uint32_t flags; volatile int state; struct adapter *adapter; cpl_handler_t set_tcb_rpl; cpl_handler_t l2t_write_rpl; struct iq_desc *desc; /* KVA of descriptor ring */ int8_t intr_pktc_idx; /* packet count threshold index */ uint8_t gen; /* generation bit */ uint8_t intr_params; /* interrupt holdoff parameters */ uint8_t intr_next; /* XXX: holdoff for next interrupt */ uint16_t qsize; /* size (# of entries) of the queue */ uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer index */ uint16_t cntxt_id; /* SGE context id for the iq */ uint16_t abs_id; /* absolute SGE id for the iq */ STAILQ_ENTRY(sge_iq) link; bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; /* bus address of descriptor ring */ }; enum { EQ_CTRL = 1, EQ_ETH = 2, EQ_OFLD = 3, /* eq flags */ EQ_TYPEMASK = 0x3, /* 2 lsbits hold the type (see above) */ EQ_ALLOCATED = (1 << 2), /* firmware resources allocated */ EQ_ENABLED = (1 << 3), /* open for business */ }; /* Listed in order of preference. Update t4_sysctls too if you change these */ enum {DOORBELL_UDB, DOORBELL_WCWR, DOORBELL_UDBWC, DOORBELL_KDB}; /* * Egress Queue: driver is producer, T4 is consumer. * * Note: A free list is an egress queue (driver produces the buffers and T4 * consumes them) but it's special enough to have its own struct (see sge_fl). 
*/ struct sge_eq { unsigned int flags; /* MUST be first */ unsigned int cntxt_id; /* SGE context id for the eq */ struct mtx eq_lock; struct tx_desc *desc; /* KVA of descriptor ring */ uint16_t doorbells; volatile uint32_t *udb; /* KVA of doorbell (lies within BAR2) */ u_int udb_qid; /* relative qid within the doorbell page */ uint16_t sidx; /* index of the entry with the status page */ uint16_t cidx; /* consumer idx (desc idx) */ uint16_t pidx; /* producer idx (desc idx) */ uint16_t equeqidx; /* EQUEQ last requested at this pidx */ uint16_t dbidx; /* pidx of the most recent doorbell */ uint16_t iqid; /* iq that gets egr_update for the eq */ uint8_t tx_chan; /* tx channel used by the eq */ volatile u_int equiq; /* EQUIQ outstanding */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; /* bus address of descriptor ring */ char lockname[16]; }; struct sw_zone_info { uma_zone_t zone; /* zone that this cluster comes from */ int size; /* size of cluster: 2K, 4K, 9K, 16K, etc. */ int type; /* EXT_xxx type of the cluster */ int8_t head_hwidx; int8_t tail_hwidx; }; struct hw_buf_info { int8_t zidx; /* backpointer to zone; -ve means unused */ int8_t next; /* next hwidx for this zone; -1 means no more */ int size; }; enum { NUM_MEMWIN = 3, MEMWIN0_APERTURE = 2048, MEMWIN0_BASE = 0x1b800, MEMWIN1_APERTURE = 32768, MEMWIN1_BASE = 0x28000, MEMWIN2_APERTURE_T4 = 65536, MEMWIN2_BASE_T4 = 0x30000, MEMWIN2_APERTURE_T5 = 128 * 1024, MEMWIN2_BASE_T5 = 0x60000, }; struct memwin { struct rwlock mw_lock __aligned(CACHE_LINE_SIZE); uint32_t mw_base; /* constant after setup_memwin */ uint32_t mw_aperture; /* ditto */ uint32_t mw_curpos; /* protected by mw_lock */ }; enum { FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */ FL_DOOMED = (1 << 1), /* about to be destroyed */ FL_BUF_PACKING = (1 << 2), /* buffer packing enabled */ FL_BUF_RESUME = (1 << 3), /* resume from the middle of the frame */ }; #define FL_RUNNING_LOW(fl) \ (IDXDIFF(fl->dbidx * 8, fl->cidx, 
fl->sidx * 8) <= fl->lowat) #define FL_NOT_RUNNING_LOW(fl) \ (IDXDIFF(fl->dbidx * 8, fl->cidx, fl->sidx * 8) >= 2 * fl->lowat) struct sge_fl { struct mtx fl_lock; __be64 *desc; /* KVA of descriptor ring, ptr to addresses */ struct fl_sdesc *sdesc; /* KVA of software descriptor ring */ struct cluster_layout cll_def; /* default refill zone, layout */ uint16_t lowat; /* # of buffers <= this means fl needs help */ int flags; uint16_t buf_boundary; /* The 16b idx all deal with hw descriptors */ uint16_t dbidx; /* hw pidx after last doorbell */ uint16_t sidx; /* index of status page */ volatile uint16_t hw_cidx; /* The 32b idx are all buffer idx, not hardware descriptor idx */ uint32_t cidx; /* consumer index */ uint32_t pidx; /* producer index */ uint32_t dbval; u_int rx_offset; /* offset in fl buf (when buffer packing) */ volatile uint32_t *udb; uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */ uint64_t mbuf_inlined; /* # of mbuf created within clusters */ uint64_t cl_allocated; /* # of clusters allocated */ uint64_t cl_recycled; /* # of clusters recycled */ uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */ /* These 3 are valid when FL_BUF_RESUME is set, stale otherwise. 
*/ struct mbuf *m0; struct mbuf **pnext; u_int remaining; uint16_t qsize; /* # of hw descriptors (status page included) */ uint16_t cntxt_id; /* SGE context id for the freelist */ TAILQ_ENTRY(sge_fl) link; /* All starving freelists */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; char lockname[16]; bus_addr_t ba; /* bus address of descriptor ring */ struct cluster_layout cll_alt; /* alternate refill zone, layout */ }; struct mp_ring; /* txq: SGE egress queue + what's needed for Ethernet NIC */ struct sge_txq { struct sge_eq eq; /* MUST be first */ struct ifnet *ifp; /* the interface this txq belongs to */ struct mp_ring *r; /* tx software ring */ struct tx_sdesc *sdesc; /* KVA of software descriptor ring */ struct sglist *gl; __be32 cpl_ctrl0; /* for convenience */ int tc_idx; /* traffic class */ struct task tx_reclaim_task; /* stats for common events first */ uint64_t txcsum; /* # of times hardware assisted with checksum */ uint64_t tso_wrs; /* # of TSO work requests */ uint64_t vlan_insertion;/* # of times VLAN tag was inserted */ uint64_t imm_wrs; /* # of work requests with immediate data */ uint64_t sgl_wrs; /* # of work requests with direct SGL */ uint64_t txpkt_wrs; /* # of txpkt work requests (not coalesced) */ uint64_t txpkts0_wrs; /* # of type0 coalesced tx work requests */ uint64_t txpkts1_wrs; /* # of type1 coalesced tx work requests */ uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */ uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */ /* stats for not-that-common events */ } __aligned(CACHE_LINE_SIZE); /* rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ struct ifnet *ifp; /* the interface this rxq belongs to */ #if defined(INET) || defined(INET6) struct lro_ctrl lro; /* LRO state */ #endif /* stats for common events first */ uint64_t rxcsum; /* # of times hardware assisted with checksum */ uint64_t 
vlan_extraction;/* # of times VLAN tag was extracted */ /* stats for not-that-common events */ } __aligned(CACHE_LINE_SIZE); static inline struct sge_rxq * iq_to_rxq(struct sge_iq *iq) { return (__containerof(iq, struct sge_rxq, iq)); } /* ofld_rxq: SGE ingress queue + SGE free list + miscellaneous items */ struct sge_ofld_rxq { struct sge_iq iq; /* MUST be first */ struct sge_fl fl; /* MUST follow iq */ } __aligned(CACHE_LINE_SIZE); static inline struct sge_ofld_rxq * iq_to_ofld_rxq(struct sge_iq *iq) { return (__containerof(iq, struct sge_ofld_rxq, iq)); } struct wrqe { STAILQ_ENTRY(wrqe) link; struct sge_wrq *wrq; int wr_len; char wr[] __aligned(16); }; struct wrq_cookie { TAILQ_ENTRY(wrq_cookie) link; int ndesc; int pidx; }; /* * wrq: SGE egress queue that is given prebuilt work requests. Both the control * and offload tx queues are of this type. */ struct sge_wrq { struct sge_eq eq; /* MUST be first */ struct adapter *adapter; struct task wrq_tx_task; /* Tx desc reserved but WR not "committed" yet. */ TAILQ_HEAD(wrq_incomplete_wrs , wrq_cookie) incomplete_wrs; /* List of WRs ready to go out as soon as descriptors are available. */ STAILQ_HEAD(, wrqe) wr_list; u_int nwr_pending; u_int ndesc_needed; /* stats for common events first */ uint64_t tx_wrs_direct; /* # of WRs written directly to desc ring. */ uint64_t tx_wrs_ss; /* # of WRs copied from scratch space. */ uint64_t tx_wrs_copied; /* # of WRs queued and copied to desc ring. */ /* stats for not-that-common events */ /* * Scratch space for work requests that wrap around after reaching the * status page, and some information about the last WR that used it. 
*/ uint16_t ss_pidx; uint16_t ss_len; uint8_t ss[SGE_MAX_WR_LEN]; } __aligned(CACHE_LINE_SIZE); struct sge_nm_rxq { struct vi_info *vi; struct iq_desc *iq_desc; uint16_t iq_abs_id; uint16_t iq_cntxt_id; uint16_t iq_cidx; uint16_t iq_sidx; uint8_t iq_gen; __be64 *fl_desc; uint16_t fl_cntxt_id; uint32_t fl_cidx; uint32_t fl_pidx; uint32_t fl_sidx; uint32_t fl_db_val; u_int fl_hwidx:4; u_int nid; /* netmap ring # for this queue */ /* infrequently used items after this */ bus_dma_tag_t iq_desc_tag; bus_dmamap_t iq_desc_map; bus_addr_t iq_ba; int intr_idx; bus_dma_tag_t fl_desc_tag; bus_dmamap_t fl_desc_map; bus_addr_t fl_ba; } __aligned(CACHE_LINE_SIZE); struct sge_nm_txq { struct tx_desc *desc; uint16_t cidx; uint16_t pidx; uint16_t sidx; uint16_t equiqidx; /* EQUIQ last requested at this pidx */ uint16_t equeqidx; /* EQUEQ last requested at this pidx */ uint16_t dbidx; /* pidx of the most recent doorbell */ uint16_t doorbells; volatile uint32_t *udb; u_int udb_qid; u_int cntxt_id; __be32 cpl_ctrl0; /* for convenience */ u_int nid; /* netmap ring # for this queue */ /* infrequently used items after this */ bus_dma_tag_t desc_tag; bus_dmamap_t desc_map; bus_addr_t ba; int iqidx; } __aligned(CACHE_LINE_SIZE); struct sge { int nrxq; /* total # of Ethernet rx queues */ int ntxq; /* total # of Ethernet tx queues */ int nofldrxq; /* total # of TOE rx queues */ int nofldtxq; /* total # of TOE tx queues */ int nnmrxq; /* total # of netmap rx queues */ int nnmtxq; /* total # of netmap tx queues */ int niq; /* total # of ingress queues */ int neq; /* total # of egress queues */ struct sge_iq fwq; /* Firmware event queue */ struct sge_wrq mgmtq; /* Management queue (control queue) */ struct sge_wrq *ctrlq; /* Control queues */ struct sge_txq *txq; /* NIC tx queues */ struct sge_rxq *rxq; /* NIC rx queues */ struct sge_wrq *ofld_txq; /* TOE tx queues */ struct sge_ofld_rxq *ofld_rxq; /* TOE rx queues */ struct sge_nm_txq *nm_txq; /* netmap tx queues */ struct sge_nm_rxq 
*nm_rxq; /* netmap rx queues */ uint16_t iq_start; int eq_start; struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */ struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */ int8_t safe_hwidx1; /* may not have room for metadata */ int8_t safe_hwidx2; /* with room for metadata and maybe more */ struct sw_zone_info sw_zone_info[SW_ZONE_SIZES]; struct hw_buf_info hw_buf_info[SGE_FLBUF_SIZES]; }; struct adapter { SLIST_ENTRY(adapter) link; device_t dev; struct cdev *cdev; /* PCIe register resources */ int regs_rid; struct resource *regs_res; int msix_rid; struct resource *msix_res; bus_space_handle_t bh; bus_space_tag_t bt; bus_size_t mmio_len; int udbs_rid; struct resource *udbs_res; volatile uint8_t *udbs_base; unsigned int pf; unsigned int mbox; unsigned int vpd_busy; unsigned int vpd_flag; /* Interrupt information */ int intr_type; int intr_count; struct irq { struct resource *res; int rid; volatile int nm_state; /* NM_OFF, NM_ON, or NM_BUSY */ void *tag; struct sge_rxq *rxq; struct sge_nm_rxq *nm_rxq; } __aligned(CACHE_LINE_SIZE) *irq; + int sge_gts_reg; + int sge_kdoorbell_reg; bus_dma_tag_t dmat; /* Parent DMA tag */ struct sge sge; int lro_timeout; int sc_do_rxcopy; struct taskqueue *tq[MAX_NCHAN]; /* General purpose taskqueues */ struct port_info *port[MAX_NPORTS]; uint8_t chan_map[MAX_NCHAN]; void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; void *iwarp_softc; /* (struct c4iw_dev *) */ void *iscsi_ulp_softc; /* (struct cxgbei_data *) */ struct l2t_data *l2t; /* L2 table */ struct tid_info tids; uint16_t doorbells; int offload_map; /* ports with IFCAP_TOE enabled */ int active_ulds; /* ULDs activated on this adapter */ int flags; int debug_flags; char ifp_lockname[16]; struct mtx ifp_lock; struct ifnet *ifp; /* tracer ifp */ struct ifmedia media; int traceq; /* iq used by all tracers, -1 if none */ int tracer_valid; /* bitmap of valid tracers */ int tracer_enabled; /* bitmap of enabled tracers */ char fw_version[16]; char tp_version[16]; 
char exprom_version[16]; char cfg_file[32]; u_int cfcsum; struct adapter_params params; const struct chip_params *chip_params; struct t4_virt_res vres; uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t tlscaps; uint16_t iscsicaps; uint16_t fcoecaps; struct sysctl_ctx_list ctx; /* from adapter_full_init to full_uninit */ struct mtx sc_lock; char lockname[16]; /* Starving free lists */ struct mtx sfl_lock; /* same cache-line as sc_lock? but that's ok */ TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; struct mtx reg_lock; /* for indirect register access */ struct memwin memwin[NUM_MEMWIN]; /* memory windows */ const char *last_op; const void *last_op_thr; int last_op_flags; }; #define ADAPTER_LOCK(sc) mtx_lock(&(sc)->sc_lock) #define ADAPTER_UNLOCK(sc) mtx_unlock(&(sc)->sc_lock) #define ADAPTER_LOCK_ASSERT_OWNED(sc) mtx_assert(&(sc)->sc_lock, MA_OWNED) #define ADAPTER_LOCK_ASSERT_NOTOWNED(sc) mtx_assert(&(sc)->sc_lock, MA_NOTOWNED) #define ASSERT_SYNCHRONIZED_OP(sc) \ KASSERT(IS_BUSY(sc) && \ (mtx_owned(&(sc)->sc_lock) || sc->last_op_thr == curthread), \ ("%s: operation not synchronized.", __func__)) #define PORT_LOCK(pi) mtx_lock(&(pi)->pi_lock) #define PORT_UNLOCK(pi) mtx_unlock(&(pi)->pi_lock) #define PORT_LOCK_ASSERT_OWNED(pi) mtx_assert(&(pi)->pi_lock, MA_OWNED) #define PORT_LOCK_ASSERT_NOTOWNED(pi) mtx_assert(&(pi)->pi_lock, MA_NOTOWNED) #define FL_LOCK(fl) mtx_lock(&(fl)->fl_lock) #define FL_TRYLOCK(fl) mtx_trylock(&(fl)->fl_lock) #define FL_UNLOCK(fl) mtx_unlock(&(fl)->fl_lock) #define FL_LOCK_ASSERT_OWNED(fl) mtx_assert(&(fl)->fl_lock, MA_OWNED) #define FL_LOCK_ASSERT_NOTOWNED(fl) mtx_assert(&(fl)->fl_lock, MA_NOTOWNED) #define RXQ_FL_LOCK(rxq) FL_LOCK(&(rxq)->fl) #define RXQ_FL_UNLOCK(rxq) FL_UNLOCK(&(rxq)->fl) #define RXQ_FL_LOCK_ASSERT_OWNED(rxq) FL_LOCK_ASSERT_OWNED(&(rxq)->fl) #define RXQ_FL_LOCK_ASSERT_NOTOWNED(rxq) FL_LOCK_ASSERT_NOTOWNED(&(rxq)->fl) #define EQ_LOCK(eq) 
mtx_lock(&(eq)->eq_lock) #define EQ_TRYLOCK(eq) mtx_trylock(&(eq)->eq_lock) #define EQ_UNLOCK(eq) mtx_unlock(&(eq)->eq_lock) #define EQ_LOCK_ASSERT_OWNED(eq) mtx_assert(&(eq)->eq_lock, MA_OWNED) #define EQ_LOCK_ASSERT_NOTOWNED(eq) mtx_assert(&(eq)->eq_lock, MA_NOTOWNED) #define TXQ_LOCK(txq) EQ_LOCK(&(txq)->eq) #define TXQ_TRYLOCK(txq) EQ_TRYLOCK(&(txq)->eq) #define TXQ_UNLOCK(txq) EQ_UNLOCK(&(txq)->eq) #define TXQ_LOCK_ASSERT_OWNED(txq) EQ_LOCK_ASSERT_OWNED(&(txq)->eq) #define TXQ_LOCK_ASSERT_NOTOWNED(txq) EQ_LOCK_ASSERT_NOTOWNED(&(txq)->eq) #define CH_DUMP_MBOX(sc, mbox, data_reg) \ do { \ if (sc->debug_flags & DF_DUMP_MBOX) { \ log(LOG_NOTICE, \ "%s mbox %u: %016llx %016llx %016llx %016llx " \ "%016llx %016llx %016llx %016llx\n", \ device_get_nameunit(sc->dev), mbox, \ (unsigned long long)t4_read_reg64(sc, data_reg), \ (unsigned long long)t4_read_reg64(sc, data_reg + 8), \ (unsigned long long)t4_read_reg64(sc, data_reg + 16), \ (unsigned long long)t4_read_reg64(sc, data_reg + 24), \ (unsigned long long)t4_read_reg64(sc, data_reg + 32), \ (unsigned long long)t4_read_reg64(sc, data_reg + 40), \ (unsigned long long)t4_read_reg64(sc, data_reg + 48), \ (unsigned long long)t4_read_reg64(sc, data_reg + 56)); \ } \ } while (0) #define for_each_txq(vi, iter, q) \ for (q = &vi->pi->adapter->sge.txq[vi->first_txq], iter = 0; \ iter < vi->ntxq; ++iter, ++q) #define for_each_rxq(vi, iter, q) \ for (q = &vi->pi->adapter->sge.rxq[vi->first_rxq], iter = 0; \ iter < vi->nrxq; ++iter, ++q) #define for_each_ofld_txq(vi, iter, q) \ for (q = &vi->pi->adapter->sge.ofld_txq[vi->first_ofld_txq], iter = 0; \ iter < vi->nofldtxq; ++iter, ++q) #define for_each_ofld_rxq(vi, iter, q) \ for (q = &vi->pi->adapter->sge.ofld_rxq[vi->first_ofld_rxq], iter = 0; \ iter < vi->nofldrxq; ++iter, ++q) #define for_each_nm_txq(vi, iter, q) \ for (q = &vi->pi->adapter->sge.nm_txq[vi->first_nm_txq], iter = 0; \ iter < vi->nnmtxq; ++iter, ++q) #define for_each_nm_rxq(vi, iter, q) \ for (q = 
&vi->pi->adapter->sge.nm_rxq[vi->first_nm_rxq], iter = 0; \ iter < vi->nnmrxq; ++iter, ++q) #define for_each_vi(_pi, _iter, _vi) \ for ((_vi) = (_pi)->vi, (_iter) = 0; (_iter) < (_pi)->nvi; \ ++(_iter), ++(_vi)) #define IDXINCR(idx, incr, wrap) do { \ idx = wrap - idx > incr ? idx + incr : incr - (wrap - idx); \ } while (0) #define IDXDIFF(head, tail, wrap) \ ((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head)) /* One for errors, one for firmware events */ #define T4_EXTRA_INTR 2 static inline uint32_t t4_read_reg(struct adapter *sc, uint32_t reg) { return bus_space_read_4(sc->bt, sc->bh, reg); } static inline void t4_write_reg(struct adapter *sc, uint32_t reg, uint32_t val) { bus_space_write_4(sc->bt, sc->bh, reg, val); } static inline uint64_t t4_read_reg64(struct adapter *sc, uint32_t reg) { return t4_bus_space_read_8(sc->bt, sc->bh, reg); } static inline void t4_write_reg64(struct adapter *sc, uint32_t reg, uint64_t val) { t4_bus_space_write_8(sc->bt, sc->bh, reg, val); } static inline void t4_os_pci_read_cfg1(struct adapter *sc, int reg, uint8_t *val) { *val = pci_read_config(sc->dev, reg, 1); } static inline void t4_os_pci_write_cfg1(struct adapter *sc, int reg, uint8_t val) { pci_write_config(sc->dev, reg, val, 1); } static inline void t4_os_pci_read_cfg2(struct adapter *sc, int reg, uint16_t *val) { *val = pci_read_config(sc->dev, reg, 2); } static inline void t4_os_pci_write_cfg2(struct adapter *sc, int reg, uint16_t val) { pci_write_config(sc->dev, reg, val, 2); } static inline void t4_os_pci_read_cfg4(struct adapter *sc, int reg, uint32_t *val) { *val = pci_read_config(sc->dev, reg, 4); } static inline void t4_os_pci_write_cfg4(struct adapter *sc, int reg, uint32_t val) { pci_write_config(sc->dev, reg, val, 4); } static inline struct port_info * adap2pinfo(struct adapter *sc, int idx) { return (sc->port[idx]); } static inline void t4_os_set_hw_addr(struct adapter *sc, int idx, uint8_t hw_addr[]) { bcopy(hw_addr, sc->port[idx]->vi[0].hw_addr, 
ETHER_ADDR_LEN); } static inline bool is_10G_port(const struct port_info *pi) { return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) != 0); } static inline bool is_40G_port(const struct port_info *pi) { return ((pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G) != 0); } static inline int port_top_speed(const struct port_info *pi) { if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_100G) return (100); if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_40G) return (40); if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_10G) return (10); if (pi->link_cfg.supported & FW_PORT_CAP_SPEED_1G) return (1); return (0); } static inline int tx_resume_threshold(struct sge_eq *eq) { /* not quite the same as qsize / 4, but this will do. */ return (eq->sidx / 4); } static inline int t4_use_ldst(struct adapter *sc) { #ifdef notyet return (sc->flags & FW_OK || !sc->use_bd); #else return (0); #endif } /* t4_main.c */ int t4_os_find_pci_capability(struct adapter *, int); int t4_os_pci_save_state(struct adapter *); int t4_os_pci_restore_state(struct adapter *); void t4_os_portmod_changed(const struct adapter *, int); void t4_os_link_changed(struct adapter *, int, int, int); void t4_iterate(void (*)(struct adapter *, void *), void *); int t4_filter_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); int begin_synchronized_op(struct adapter *, struct vi_info *, int, char *); void doom_vi(struct adapter *, struct vi_info *); void end_synchronized_op(struct adapter *, int); int update_mac_settings(struct ifnet *, int); int adapter_full_init(struct adapter *); int adapter_full_uninit(struct adapter *); uint64_t cxgbe_get_counter(struct ifnet *, ift_counter); int vi_full_init(struct vi_info *); int vi_full_uninit(struct vi_info *); void vi_sysctls(struct vi_info *); void vi_tick(void *); #ifdef DEV_NETMAP /* t4_netmap.c */ void cxgbe_nm_attach(struct vi_info *); void cxgbe_nm_detach(struct vi_info *); void t4_nm_intr(void *); #endif /* t4_sge.c */ void t4_sge_modload(void); void 
t4_sge_modunload(void); uint64_t t4_sge_extfree_refs(void); void t4_tweak_chip_settings(struct adapter *); int t4_read_chip_settings(struct adapter *); int t4_create_dma_tag(struct adapter *); void t4_sge_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid_list *); int t4_destroy_dma_tag(struct adapter *); int t4_setup_adapter_queues(struct adapter *); int t4_teardown_adapter_queues(struct adapter *); int t4_setup_vi_queues(struct vi_info *); int t4_teardown_vi_queues(struct vi_info *); void t4_intr_all(void *); void t4_intr(void *); void t4_vi_intr(void *); void t4_intr_err(void *); void t4_intr_evt(void *); void t4_wrq_tx_locked(struct adapter *, struct sge_wrq *, struct wrqe *); void t4_update_fl_bufsize(struct ifnet *); int parse_pkt(struct mbuf **); void *start_wrq_wr(struct sge_wrq *, int, struct wrq_cookie *); void commit_wrq_wr(struct sge_wrq *, void *, struct wrq_cookie *); int tnl_cong(struct port_info *, int); int t4_register_an_handler(an_handler_t); int t4_register_fw_msg_handler(int, fw_msg_handler_t); int t4_register_cpl_handler(int, cpl_handler_t); /* t4_tracer.c */ struct t4_tracer; void t4_tracer_modload(void); void t4_tracer_modunload(void); void t4_tracer_port_detach(struct adapter *); int t4_get_tracer(struct adapter *, struct t4_tracer *); int t4_set_tracer(struct adapter *, struct t4_tracer *); int t4_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *); int t5_trace_pkt(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline struct wrqe * alloc_wrqe(int wr_len, struct sge_wrq *wrq) { int len = offsetof(struct wrqe, wr) + wr_len; struct wrqe *wr; wr = malloc(len, M_CXGBE, M_NOWAIT); if (__predict_false(wr == NULL)) return (NULL); wr->wr_len = wr_len; wr->wrq = wrq; return (wr); } static inline void * wrtod(struct wrqe *wr) { return (&wr->wr[0]); } static inline void free_wrqe(struct wrqe *wr) { free(wr, M_CXGBE); } static inline void t4_wrq_tx(struct adapter *sc, struct wrqe *wr) { struct 
sge_wrq *wrq = wr->wrq; TXQ_LOCK(wrq); t4_wrq_tx_locked(sc, wrq, wr); TXQ_UNLOCK(wrq); } #endif Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/iw_cxgbe/cq.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/iw_cxgbe/cq.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/iw_cxgbe/cq.c (revision 303654) @@ -1,929 +1,929 @@ /* * Copyright (c) 2009-2013 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include "iw_cxgbe.h" #include "user.h" static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, struct c4iw_dev_ucontext *uctx) { struct adapter *sc = rdev->adap; struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; struct c4iw_wr_wait wr_wait; struct wrqe *wr; wr_len = sizeof *res_wr + sizeof *res; wr = alloc_wrqe(wr_len, &sc->sge.mgmtq); if (wr == NULL) return (0); res_wr = wrtod(wr); memset(res_wr, 0, wr_len); res_wr->op_nres = cpu_to_be32( V_FW_WR_OP(FW_RI_RES_WR) | V_FW_RI_RES_WR_NRES(1) | F_FW_WR_COMPL); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); res_wr->cookie = (unsigned long) &wr_wait; res = res_wr->res; res->u.cq.restype = FW_RI_RES_TYPE_CQ; res->u.cq.op = FW_RI_RES_OP_RESET; res->u.cq.iqid = cpu_to_be32(cq->cqid); c4iw_init_wr_wait(&wr_wait); t4_wrq_tx(sc, wr); c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); kfree(cq->sw_queue); contigfree(cq->queue, cq->memsize, M_DEVBUF); c4iw_put_cqid(rdev, cq->cqid, uctx); return 0; } static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, struct c4iw_dev_ucontext *uctx) { struct adapter *sc = rdev->adap; struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; int user = (uctx != &rdev->uctx); struct c4iw_wr_wait wr_wait; int ret; struct wrqe *wr; cq->cqid = c4iw_get_cqid(rdev, uctx); if (!cq->cqid) { ret = -ENOMEM; goto err1; } if (!user) { cq->sw_queue = kzalloc(cq->memsize, GFP_KERNEL); if (!cq->sw_queue) { ret = -ENOMEM; goto err2; } } cq->queue = contigmalloc(cq->memsize, M_DEVBUF, M_NOWAIT, 0ul, ~0ul, PAGE_SIZE, 0); if (cq->queue) cq->dma_addr = vtophys(cq->queue); else { ret = -ENOMEM; goto err3; } pci_unmap_addr_set(cq, mapping, cq->dma_addr); memset(cq->queue, 0, cq->memsize); /* build fw_ri_res_wr */ wr_len = sizeof *res_wr + sizeof *res; wr = alloc_wrqe(wr_len, &sc->sge.mgmtq); if (wr == 
NULL) return (0); res_wr = wrtod(wr); memset(res_wr, 0, wr_len); res_wr->op_nres = cpu_to_be32( V_FW_WR_OP(FW_RI_RES_WR) | V_FW_RI_RES_WR_NRES(1) | F_FW_WR_COMPL); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); res_wr->cookie = (unsigned long) &wr_wait; res = res_wr->res; res->u.cq.restype = FW_RI_RES_TYPE_CQ; res->u.cq.op = FW_RI_RES_OP_WRITE; res->u.cq.iqid = cpu_to_be32(cq->cqid); //Fixme: Always use first queue id for IQANDSTINDEX. Linux does the same. res->u.cq.iqandst_to_iqandstindex = cpu_to_be32( V_FW_RI_RES_WR_IQANUS(0) | V_FW_RI_RES_WR_IQANUD(1) | F_FW_RI_RES_WR_IQANDST | V_FW_RI_RES_WR_IQANDSTINDEX(sc->sge.ofld_rxq[0].iq.abs_id)); res->u.cq.iqdroprss_to_iqesize = cpu_to_be16( F_FW_RI_RES_WR_IQDROPRSS | V_FW_RI_RES_WR_IQPCIECH(2) | V_FW_RI_RES_WR_IQINTCNTTHRESH(0) | F_FW_RI_RES_WR_IQO | V_FW_RI_RES_WR_IQESIZE(1)); res->u.cq.iqsize = cpu_to_be16(cq->size); res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr); c4iw_init_wr_wait(&wr_wait); t4_wrq_tx(sc, wr); CTR2(KTR_IW_CXGBE, "%s wait_event wr_wait %p", __func__, &wr_wait); ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); if (ret) goto err4; cq->gen = 1; cq->gts = (void *)((unsigned long)rman_get_virtual(sc->regs_res) + - MYPF_REG(SGE_PF_GTS)); + sc->sge_gts_reg); cq->rdev = rdev; if (user) { cq->ugts = (u64)((char*)rman_get_virtual(sc->udbs_res) + (cq->cqid << rdev->cqshift)); cq->ugts &= PAGE_MASK; CTR5(KTR_IW_CXGBE, "%s: UGTS %p cqid %x cqshift %d page_mask %x", __func__, cq->ugts, cq->cqid, rdev->cqshift, PAGE_MASK); } return 0; err4: contigfree(cq->queue, cq->memsize, M_DEVBUF); err3: kfree(cq->sw_queue); err2: c4iw_put_cqid(rdev, cq->cqid, uctx); err1: return ret; } static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq) { struct t4_cqe cqe; CTR5(KTR_IW_CXGBE, "%s wq %p cq %p sw_cidx %u sw_pidx %u", __func__, wq, cq, cq->sw_cidx, cq->sw_pidx); memset(&cqe, 0, sizeof(cqe)); cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) | V_CQE_OPCODE(FW_RI_SEND) | V_CQE_TYPE(0) | 
V_CQE_SWCQE(1) | V_CQE_QPID(wq->sq.qid)); cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen)); cq->sw_queue[cq->sw_pidx] = cqe; t4_swcq_produce(cq); } int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count) { int flushed = 0; int in_use = wq->rq.in_use - count; BUG_ON(in_use < 0); CTR5(KTR_IW_CXGBE, "%s wq %p cq %p rq.in_use %u skip count %u", __func__, wq, cq, wq->rq.in_use, count); while (in_use--) { insert_recv_cqe(wq, cq); flushed++; } return flushed; } static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq, struct t4_swsqe *swcqe) { struct t4_cqe cqe; CTR5(KTR_IW_CXGBE, "%s wq %p cq %p sw_cidx %u sw_pidx %u", __func__, wq, cq, cq->sw_cidx, cq->sw_pidx); memset(&cqe, 0, sizeof(cqe)); cqe.header = cpu_to_be32(V_CQE_STATUS(T4_ERR_SWFLUSH) | V_CQE_OPCODE(swcqe->opcode) | V_CQE_TYPE(1) | V_CQE_SWCQE(1) | V_CQE_QPID(wq->sq.qid)); CQE_WRID_SQ_IDX(&cqe) = swcqe->idx; cqe.bits_type_ts = cpu_to_be64(V_CQE_GENBIT((u64)cq->gen)); cq->sw_queue[cq->sw_pidx] = cqe; t4_swcq_produce(cq); } int c4iw_flush_sq(struct t4_wq *wq, struct t4_cq *cq, int count) { int flushed = 0; struct t4_swsqe *swsqe = &wq->sq.sw_sq[wq->sq.cidx + count]; int in_use = wq->sq.in_use - count; BUG_ON(in_use < 0); while (in_use--) { swsqe->signaled = 0; insert_sq_cqe(wq, cq, swsqe); swsqe++; if (swsqe == (wq->sq.sw_sq + wq->sq.size)) swsqe = wq->sq.sw_sq; flushed++; } return flushed; } /* * Move all CQEs from the HWCQ into the SWCQ. 
*/
/*
 * Each pass copies the entry handed back by t4_next_hw_cqe() into
 * cq->sw_queue[cq->sw_pidx], sets the SWCQE field in the copy's header and
 * then calls t4_swcq_produce() and t4_hwcq_consume().  The loop stops at the
 * first non-zero return from t4_next_hw_cqe().
 */
void c4iw_flush_hw_cq(struct t4_cq *cq)
{
	struct t4_cqe *cqe = NULL, *swcqe;
	int ret;

	CTR3(KTR_IW_CXGBE, "%s cq %p cqid 0x%x", __func__, cq, cq->cqid);
	ret = t4_next_hw_cqe(cq, &cqe);
	while (!ret) {
		CTR3(KTR_IW_CXGBE,
		    "%s flushing hwcq cidx 0x%x swcq pidx 0x%x",
		    __func__, cq->cidx, cq->sw_pidx);
		swcqe = &cq->sw_queue[cq->sw_pidx];
		*swcqe = *cqe;
		/* Set the SWCQE field in the copy, not in the HW entry. */
		swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1));
		t4_swcq_produce(cq);
		t4_hwcq_consume(cq);
		ret = t4_next_hw_cqe(cq, &cqe);
	}
}

/*
 * Return 1 if the CQE is to be counted as completing a work request and 0
 * otherwise.  Excluded are: TERMINATE opcodes, RDMA_WRITE entries of RQ
 * type, READ_RESP entries of SQ type, and send-opcode entries of RQ type
 * seen while t4_rq_empty(wq) is true.
 */
static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
{
	if (CQE_OPCODE(cqe) == FW_RI_TERMINATE)
		return 0;

	if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe))
		return 0;

	if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe))
		return 0;

	if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq))
		return 0;
	return 1;
}

/*
 * Walk the software CQ from sw_cidx up to (not including) sw_pidx, wrapping
 * at cq->size, and store in *count the number of entries whose QPID equals
 * wq->sq.qid and that are either SQ type or a READ_RESP seen while
 * wq->sq.oldest_read is non-NULL.
 */
void c4iw_count_scqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
{
	struct t4_cqe *cqe;
	u32 ptr;

	*count = 0;
	ptr = cq->sw_cidx;
	while (ptr != cq->sw_pidx) {
		cqe = &cq->sw_queue[ptr];
		if ((SQ_TYPE(cqe) ||
		    ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) &&
		    wq->sq.oldest_read)) &&
		    (CQE_QPID(cqe) == wq->sq.qid))
			(*count)++;
		if (++ptr == cq->size)
			ptr = 0;
	}
	CTR3(KTR_IW_CXGBE, "%s cq %p count %d", __func__, cq, *count);
}

/*
 * Walk the software CQ from sw_cidx up to (not including) sw_pidx, wrapping
 * at cq->size, and store in *count the number of RQ-type entries that are
 * not READ_RESP, whose QPID equals wq->sq.qid, and for which
 * cqe_completes_wr() returns non-zero.
 */
void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
{
	struct t4_cqe *cqe;
	u32 ptr;

	*count = 0;
	CTR2(KTR_IW_CXGBE, "%s count zero %d", __func__, *count);
	ptr = cq->sw_cidx;
	while (ptr != cq->sw_pidx) {
		cqe = &cq->sw_queue[ptr];
		if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) &&
		    (CQE_QPID(cqe) == wq->sq.qid) && cqe_completes_wr(cqe, wq))
			(*count)++;
		if (++ptr == cq->size)
			ptr = 0;
	}
	CTR3(KTR_IW_CXGBE, "%s cq %p count %d", __func__, cq, *count);
}

/*
 * Starting at wq->sq.cidx, examine at most wq->sq.in_use SW SQ entries:
 *   - unsignaled entries are stepped over and counted;
 *   - at the first signaled entry the walk stops.  If that entry is marked
 *     complete, its saved CQE is flagged SWCQE, copied into the software CQ
 *     (followed by t4_swcq_produce()), its signaled flag is cleared and
 *     sq.in_use is reduced by the number of unsignaled entries stepped over.
 */
static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq)
{
	struct t4_swsqe *swsqe;
	u16 ptr = wq->sq.cidx;
	int count = wq->sq.in_use;
	int unsignaled = 0;

	swsqe = &wq->sq.sw_sq[ptr];
	while (count--)
		if (!swsqe->signaled) {
			/* Step to the next SW SQ slot, wrapping at sq.size. */
			if (++ptr == wq->sq.size)
				ptr = 0;
			swsqe = &wq->sq.sw_sq[ptr];
			unsignaled++;
		} else if (swsqe->complete) {

			/*
			 * Insert this completed cqe into the swcq.
			 */
			CTR3(KTR_IW_CXGBE,
			    "%s moving cqe into swcq sq idx %u cq idx %u",
			    __func__, ptr, cq->sw_pidx);
			swsqe->cqe.header |= htonl(V_CQE_SWCQE(1));
			cq->sw_queue[cq->sw_pidx] = swsqe->cqe;
			t4_swcq_produce(cq);
			swsqe->signaled = 0;
			wq->sq.in_use -= unsignaled;
			break;
		} else
			break;
}

/*
 * Fill *read_cqe from wq->sq.oldest_read and *hw_cqe: the SQ index and
 * length come from the oldest read entry, the QPID and SWCQE fields and
 * bits_type_ts are taken from hw_cqe, and the opcode/type are forced to
 * FW_RI_READ_REQ / 1.  Dereferences wq->sq.oldest_read unconditionally.
 */
static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe,
    struct t4_cqe *read_cqe)
{
	read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx;
	read_cqe->len = cpu_to_be32(wq->sq.oldest_read->read_len);
	read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(hw_cqe)) |
	    V_CQE_SWCQE(SW_CQE(hw_cqe)) |
	    V_CQE_OPCODE(FW_RI_READ_REQ) |
	    V_CQE_TYPE(1));
	read_cqe->bits_type_ts = hw_cqe->bits_type_ts;
}

/*
 * Advance wq->sq.oldest_read to the next FW_RI_READ_REQ entry in the SWSQ
 * that follows the current one (wrapping at sq.size), or set it to NULL if
 * sq.pidx is reached first.  Nothing is returned; the result is left in
 * wq->sq.oldest_read.
 */
static void advance_oldest_read(struct t4_wq *wq)
{

	/* Index of the slot right after the current oldest read. */
	u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1;

	if (rptr == wq->sq.size)
		rptr = 0;
	while (rptr != wq->sq.pidx) {
		wq->sq.oldest_read = &wq->sq.sw_sq[rptr];

		if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ)
			return;
		if (++rptr == wq->sq.size)
			rptr = 0;
	}
	wq->sq.oldest_read = NULL;
}

/*
 * poll_cq
 *
 * Caller must:
 *     check the validity of the first CQE,
 *     supply the wq assicated with the qpid.
 *
 * credit: cq credit to return to sge.
 * cqe_flushed: 1 iff the CQE is flushed.
 * cqe: copy of the polled CQE.
 *
 * return value:
 *    0		    CQE returned ok.
 *    -EAGAIN       CQE skipped, try again.
 *    -EOVERFLOW    CQ overflow detected.
*/
/*
 * Implementation notes:
 *   - A non-zero return from t4_next_cqe() is passed straight back to the
 *     caller before anything is consumed.
 *   - *cookie is written only on the proc_cqe path, with the wr_id of the
 *     SW SQ or SW RQ entry being reaped.
 *   - *credit is only ever set to 0 in this function.
 *   - wq may be NULL; such CQEs are consumed and -EAGAIN is returned.
 *   - Every path past the initial t4_next_cqe() ends at skip_cqe, which
 *     consumes one entry from either the software or the hardware CQ.
 */
static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
    u8 *cqe_flushed, u64 *cookie, u32 *credit)
{
	int ret = 0;
	struct t4_cqe *hw_cqe, read_cqe;

	*cqe_flushed = 0;
	*credit = 0;
	ret = t4_next_cqe(cq, &hw_cqe);
	if (ret)
		return ret;

	CTR6(KTR_IW_CXGBE,
	    "%s CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x",
	    __func__, CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe),
	    CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe));
	CTR5(KTR_IW_CXGBE,
	    "%s opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x",
	    __func__, CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe),
	    CQE_WRID_LOW(hw_cqe));

	/*
	 * skip cqe's not affiliated with a QP.
	 */
	if (wq == NULL) {
		ret = -EAGAIN;
		goto skip_cqe;
	}

	/*
	 * Gotta tweak READ completions:
	 *	1) the cqe doesn't contain the sq_wptr from the wr.
	 *	2) opcode not reflected from the wr.
	 *	3) read_len not reflected from the wr.
	 *	4) cq_type is RQ_TYPE not SQ_TYPE.
	 */
	if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) {

		/*
		 * If this is an unsolicited read response, then the read
		 * was generated by the kernel driver as part of peer-2-peer
		 * connection setup.  So ignore the completion.
		 */
		if (!wq->sq.oldest_read) {
			if (CQE_STATUS(hw_cqe))
				t4_set_wq_in_error(wq);
			ret = -EAGAIN;
			goto skip_cqe;
		}

		/*
		 * Don't write to the HWCQ, so create a new read req CQE
		 * in local memory.
		 */
		create_read_req_cqe(wq, hw_cqe, &read_cqe);
		/* From here on hw_cqe refers to the local synthesized CQE. */
		hw_cqe = &read_cqe;
		advance_oldest_read(wq);
	}

	if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) {
		/* Sample the error state before forcing it on below. */
		*cqe_flushed = t4_wq_in_error(wq);
		t4_set_wq_in_error(wq);
		goto proc_cqe;
	}

	if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) {
		ret = -EAGAIN;
		goto skip_cqe;
	}

	/*
	 * RECV completion.
	 */
	if (RQ_TYPE(hw_cqe)) {

		/*
		 * HW only validates 4 bits of MSN.  So we must validate that
		 * the MSN in the SEND is the next expected MSN.  If it's not,
		 * then we complete this with T4_ERR_MSN and mark the wq in
		 * error.
		 */

		/* An RQ completion with nothing posted: flag error and skip. */
		if (t4_rq_empty(wq)) {
			t4_set_wq_in_error(wq);
			ret = -EAGAIN;
			goto skip_cqe;
		}
		if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
			t4_set_wq_in_error(wq);
			hw_cqe->header |= htonl(V_CQE_STATUS(T4_ERR_MSN));
			goto proc_cqe;
		}
		goto proc_cqe;
	}

	/*
	 * If we get here it's a send completion.
	 *
	 * Handle out of order completion. These get stuffed
	 * in the SW SQ. Then the SW SQ is walked to move any
	 * now in-order completions into the SW CQ.  This handles
	 * 2 cases:
	 *	1) reaping unsignaled WRs when the first subsequent
	 *	   signaled WR is completed.
	 *	2) out of order read completions.
	 */
	if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) {
		struct t4_swsqe *swsqe;

		CTR2(KTR_IW_CXGBE,
		    "%s out of order completion going in sw_sq at idx %u",
		    __func__, CQE_WRID_SQ_IDX(hw_cqe));
		/* Park the CQE in its SW SQ slot; the caller retries. */
		swsqe = &wq->sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
		swsqe->cqe = *hw_cqe;
		swsqe->complete = 1;
		ret = -EAGAIN;
		goto flush_wq;
	}

proc_cqe:
	*cqe = *hw_cqe;

	/*
	 * Reap the associated WR(s) that are freed up with this
	 * completion.
	 */
	if (SQ_TYPE(hw_cqe)) {
		wq->sq.cidx = CQE_WRID_SQ_IDX(hw_cqe);
		CTR2(KTR_IW_CXGBE, "%s completing sq idx %u",
		    __func__, wq->sq.cidx);
		*cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id;
		t4_sq_consume(wq);
	} else {
		CTR2(KTR_IW_CXGBE, "%s completing rq idx %u",
		    __func__, wq->rq.cidx);
		*cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
		BUG_ON(t4_rq_empty(wq));
		t4_rq_consume(wq);
	}

flush_wq:
	/*
	 * Flush any completed cqes that are now in-order.
	 */
	flush_completed_wrs(wq, cq);

skip_cqe:
	/*
	 * For a synthesized read CQE the SWCQE field was copied from the
	 * original entry by create_read_req_cqe(), so this test still selects
	 * the queue the entry came from.
	 */
	if (SW_CQE(hw_cqe)) {
		CTR4(KTR_IW_CXGBE, "%s cq %p cqid 0x%x skip sw cqe cidx %u",
		    __func__, cq, cq->cqid, cq->sw_cidx);
		t4_swcq_consume(cq);
	} else {
		CTR4(KTR_IW_CXGBE, "%s cq %p cqid 0x%x skip hw cqe cidx %u",
		    __func__, cq, cq->cqid, cq->cidx);
		t4_hwcq_consume(cq);
	}
	return ret;
}

/*
 * Get one cq entry from c4iw and map it to openib.
* * Returns: * 0 cqe returned * -ENODATA EMPTY; * -EAGAIN caller must try again * any other -errno fatal error */ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) { struct c4iw_qp *qhp = NULL; struct t4_cqe cqe = {0, 0}, *rd_cqe; struct t4_wq *wq; u32 credit = 0; u8 cqe_flushed; u64 cookie = 0; int ret; ret = t4_next_cqe(&chp->cq, &rd_cqe); if (ret) return ret; qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe)); if (!qhp) wq = NULL; else { spin_lock(&qhp->lock); wq = &(qhp->wq); } ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit); if (ret) goto out; wc->wr_id = cookie; wc->qp = &qhp->ibqp; wc->vendor_err = CQE_STATUS(&cqe); wc->wc_flags = 0; CTR5(KTR_IW_CXGBE, "%s qpid 0x%x type %d opcode %d status 0x%x", __func__, CQE_QPID(&cqe), CQE_TYPE(&cqe), CQE_OPCODE(&cqe), CQE_STATUS(&cqe)); CTR5(KTR_IW_CXGBE, "%s len %u wrid hi 0x%x lo 0x%x cookie 0x%llx", __func__, CQE_LEN(&cqe), CQE_WRID_HI(&cqe), CQE_WRID_LOW(&cqe), (unsigned long long)cookie); if (CQE_TYPE(&cqe) == 0) { if (!CQE_STATUS(&cqe)) wc->byte_len = CQE_LEN(&cqe); else wc->byte_len = 0; wc->opcode = IB_WC_RECV; if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); wc->wc_flags |= IB_WC_WITH_INVALIDATE; } } else { switch (CQE_OPCODE(&cqe)) { case FW_RI_RDMA_WRITE: wc->opcode = IB_WC_RDMA_WRITE; break; case FW_RI_READ_REQ: wc->opcode = IB_WC_RDMA_READ; wc->byte_len = CQE_LEN(&cqe); break; case FW_RI_SEND_WITH_INV: case FW_RI_SEND_WITH_SE_INV: wc->opcode = IB_WC_SEND; wc->wc_flags |= IB_WC_WITH_INVALIDATE; break; case FW_RI_SEND: case FW_RI_SEND_WITH_SE: wc->opcode = IB_WC_SEND; break; case FW_RI_BIND_MW: wc->opcode = IB_WC_BIND_MW; break; case FW_RI_LOCAL_INV: wc->opcode = IB_WC_LOCAL_INV; break; case FW_RI_FAST_REGISTER: wc->opcode = IB_WC_FAST_REG_MR; break; default: printf("Unexpected opcode %d " "in the CQE received for QPID = 0x%0x\n", CQE_OPCODE(&cqe), CQE_QPID(&cqe)); ret = -EINVAL; goto out; } } 
if (cqe_flushed) wc->status = IB_WC_WR_FLUSH_ERR; else { switch (CQE_STATUS(&cqe)) { case T4_ERR_SUCCESS: wc->status = IB_WC_SUCCESS; break; case T4_ERR_STAG: wc->status = IB_WC_LOC_ACCESS_ERR; break; case T4_ERR_PDID: wc->status = IB_WC_LOC_PROT_ERR; break; case T4_ERR_QPID: case T4_ERR_ACCESS: wc->status = IB_WC_LOC_ACCESS_ERR; break; case T4_ERR_WRAP: wc->status = IB_WC_GENERAL_ERR; break; case T4_ERR_BOUND: wc->status = IB_WC_LOC_LEN_ERR; break; case T4_ERR_INVALIDATE_SHARED_MR: case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: wc->status = IB_WC_MW_BIND_ERR; break; case T4_ERR_CRC: case T4_ERR_MARKER: case T4_ERR_PDU_LEN_ERR: case T4_ERR_OUT_OF_RQE: case T4_ERR_DDP_VERSION: case T4_ERR_RDMA_VERSION: case T4_ERR_DDP_QUEUE_NUM: case T4_ERR_MSN: case T4_ERR_TBIT: case T4_ERR_MO: case T4_ERR_MSN_RANGE: case T4_ERR_IRD_OVERFLOW: case T4_ERR_OPCODE: case T4_ERR_INTERNAL_ERR: wc->status = IB_WC_FATAL_ERR; break; case T4_ERR_SWFLUSH: wc->status = IB_WC_WR_FLUSH_ERR; break; default: printf("Unexpected cqe_status 0x%x for QPID = 0x%0x\n", CQE_STATUS(&cqe), CQE_QPID(&cqe)); wc->status = IB_WC_FATAL_ERR; } } out: if (wq) spin_unlock(&qhp->lock); return ret; } int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { struct c4iw_cq *chp; unsigned long flags; int npolled; int err = 0; chp = to_c4iw_cq(ibcq); spin_lock_irqsave(&chp->lock, flags); for (npolled = 0; npolled < num_entries; ++npolled) { do { err = c4iw_poll_cq_one(chp, wc + npolled); } while (err == -EAGAIN); if (err) break; } spin_unlock_irqrestore(&chp->lock, flags); return !err || err == -ENODATA ? npolled : err; } int c4iw_destroy_cq(struct ib_cq *ib_cq) { struct c4iw_cq *chp; struct c4iw_ucontext *ucontext; CTR2(KTR_IW_CXGBE, "%s ib_cq %p", __func__, ib_cq); chp = to_c4iw_cq(ib_cq); remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid); atomic_dec(&chp->refcnt); wait_event(chp->wait, !atomic_read(&chp->refcnt)); ucontext = ib_cq->uobject ? 
to_c4iw_ucontext(ib_cq->uobject->context) : NULL; destroy_cq(&chp->rhp->rdev, &chp->cq, ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx); kfree(chp); return 0; } struct ib_cq * c4iw_create_cq(struct ib_device *ibdev, struct ib_cq_init_attr *attr, struct ib_ucontext *ib_context, struct ib_udata *udata) { struct c4iw_dev *rhp; struct c4iw_cq *chp; struct c4iw_create_cq_resp uresp; struct c4iw_ucontext *ucontext = NULL; int ret; size_t memsize, hwentries; struct c4iw_mm_entry *mm, *mm2; int entries = attr->cqe; CTR3(KTR_IW_CXGBE, "%s ib_dev %p entries %d", __func__, ibdev, entries); rhp = to_c4iw_dev(ibdev); chp = kzalloc(sizeof(*chp), GFP_KERNEL); if (!chp) return ERR_PTR(-ENOMEM); if (ib_context) ucontext = to_c4iw_ucontext(ib_context); /* account for the status page. */ entries++; /* IQ needs one extra entry to differentiate full vs empty. */ entries++; /* * entries must be multiple of 16 for HW. */ entries = roundup(entries, 16); /* * Make actual HW queue 2x to avoid cidx_inc overflows. */ hwentries = entries * 2; /* * Make HW queue at least 64 entries so GTS updates aren't too * frequent. */ if (hwentries < 64) hwentries = 64; memsize = hwentries * sizeof *chp->cq.queue; /* * memsize must be a multiple of the page size if its a user cq. */ if (ucontext) { memsize = roundup(memsize, PAGE_SIZE); hwentries = memsize / sizeof *chp->cq.queue; while (hwentries > T4_MAX_IQ_SIZE) { memsize -= PAGE_SIZE; hwentries = memsize / sizeof *chp->cq.queue; } } chp->cq.size = hwentries; chp->cq.memsize = memsize; ret = create_cq(&rhp->rdev, &chp->cq, ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx); if (ret) goto err1; chp->rhp = rhp; chp->cq.size--; /* status page */ chp->ibcq.cqe = entries - 2; spin_lock_init(&chp->lock); spin_lock_init(&chp->comp_handler_lock); atomic_set(&chp->refcnt, 1); init_waitqueue_head(&chp->wait); ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid); if (ret) goto err2; if (ucontext) { mm = kmalloc(sizeof *mm, GFP_KERNEL); if (!mm) goto err3; mm2 = kmalloc(sizeof *mm2, GFP_KERNEL); if (!mm2) goto err4; memset(&uresp, 0, sizeof(uresp)); uresp.qid_mask = rhp->rdev.cqmask; uresp.cqid = chp->cq.cqid; uresp.size = chp->cq.size; uresp.memsize = chp->cq.memsize; spin_lock(&ucontext->mmap_lock); uresp.key = ucontext->key; ucontext->key += PAGE_SIZE; uresp.gts_key = ucontext->key; ucontext->key += PAGE_SIZE; spin_unlock(&ucontext->mmap_lock); ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp) - sizeof(uresp.reserved)); if (ret) goto err5; mm->key = uresp.key; mm->addr = vtophys(chp->cq.queue); mm->len = chp->cq.memsize; insert_mmap(ucontext, mm); mm2->key = uresp.gts_key; mm2->addr = chp->cq.ugts; mm2->len = PAGE_SIZE; insert_mmap(ucontext, mm2); } CTR6(KTR_IW_CXGBE, "%s cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx", __func__, chp->cq.cqid, chp, chp->cq.size, chp->cq.memsize, (unsigned long long) chp->cq.dma_addr); return &chp->ibcq; err5: kfree(mm2); err4: kfree(mm); err3: remove_handle(rhp, &rhp->cqidr, chp->cq.cqid); err2: destroy_cq(&chp->rhp->rdev, &chp->cq, ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx); err1: kfree(chp); return ERR_PTR(ret); } int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) { return -ENOSYS; } int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct c4iw_cq *chp; int ret; unsigned long flag; chp = to_c4iw_cq(ibcq); spin_lock_irqsave(&chp->lock, flag); ret = t4_arm_cq(&chp->cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED); spin_unlock_irqrestore(&chp->lock, flag); if (ret && !(flags & IB_CQ_REPORT_MISSED_EVENTS)) ret = 0; return ret; } #endif Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/iw_cxgbe/qp.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/iw_cxgbe/qp.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/iw_cxgbe/qp.c (revision 303654) @@ -1,1716 +1,1716 @@ /* * Copyright (c) 2009-2013 Chelsio, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #ifdef TCP_OFFLOAD #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct sge_iq; struct rss_header; struct cpl_set_tcb_rpl; #include #include "offload.h" #include "tom/t4_tom.h" #include "iw_cxgbe.h" #include "user.h" extern int db_delay_usecs; extern int db_fc_threshold; static void creds(struct toepcb *toep, size_t wrsize); static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state) { unsigned long flag; spin_lock_irqsave(&qhp->lock, flag); qhp->attr.state = state; spin_unlock_irqrestore(&qhp->lock, flag); } static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) { contigfree(sq->queue, sq->memsize, M_DEVBUF); } static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) { dealloc_host_sq(rdev, sq); } static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq) { sq->queue = contigmalloc(sq->memsize, M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); if (sq->queue) sq->dma_addr = vtophys(sq->queue); else return -ENOMEM; sq->phys_addr = vtophys(sq->queue); pci_unmap_addr_set(sq, mapping, sq->dma_addr); CTR4(KTR_IW_CXGBE, "%s sq %p dma_addr %p phys_addr %p", __func__, sq->queue, sq->dma_addr, sq->phys_addr); return 0; } static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, struct c4iw_dev_ucontext *uctx) { /* * uP clears EQ contexts when the connection exits rdma mode, * so no need to post a RESET WR for these EQs. 
*/ contigfree(wq->rq.queue, wq->rq.memsize, M_DEVBUF); dealloc_sq(rdev, &wq->sq); c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); kfree(wq->rq.sw_rq); kfree(wq->sq.sw_sq); c4iw_put_qpid(rdev, wq->rq.qid, uctx); c4iw_put_qpid(rdev, wq->sq.qid, uctx); return 0; } static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, struct t4_cq *rcq, struct t4_cq *scq, struct c4iw_dev_ucontext *uctx) { struct adapter *sc = rdev->adap; int user = (uctx != &rdev->uctx); struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; struct c4iw_wr_wait wr_wait; int ret; int eqsize; struct wrqe *wr; wq->sq.qid = c4iw_get_qpid(rdev, uctx); if (!wq->sq.qid) return -ENOMEM; wq->rq.qid = c4iw_get_qpid(rdev, uctx); if (!wq->rq.qid) goto err1; if (!user) { wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq, GFP_KERNEL); if (!wq->sq.sw_sq) goto err2; wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq, GFP_KERNEL); if (!wq->rq.sw_rq) goto err3; } /* RQT must be a power of 2. */ wq->rq.rqt_size = roundup_pow_of_two(wq->rq.size); wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size); if (!wq->rq.rqt_hwaddr) goto err4; if (alloc_host_sq(rdev, &wq->sq)) goto err5; memset(wq->sq.queue, 0, wq->sq.memsize); pci_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr); wq->rq.queue = contigmalloc(wq->rq.memsize, M_DEVBUF, M_NOWAIT, 0ul, ~0ul, 4096, 0); if (wq->rq.queue) wq->rq.dma_addr = vtophys(wq->rq.queue); else goto err6; CTR5(KTR_IW_CXGBE, "%s sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx", __func__, wq->sq.queue, (unsigned long long)vtophys(wq->sq.queue), wq->rq.queue, (unsigned long long)vtophys(wq->rq.queue)); memset(wq->rq.queue, 0, wq->rq.memsize); pci_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr); wq->db = (void *)((unsigned long)rman_get_virtual(sc->regs_res) + - MYPF_REG(SGE_PF_KDOORBELL)); + sc->sge_kdoorbell_reg); wq->gts = (void *)((unsigned long)rman_get_virtual(rdev->adap->regs_res) - + MYPF_REG(SGE_PF_GTS)); + + sc->sge_gts_reg); if 
(user) { wq->sq.udb = (u64)((char*)rman_get_virtual(rdev->adap->udbs_res) + (wq->sq.qid << rdev->qpshift)); wq->sq.udb &= PAGE_MASK; wq->rq.udb = (u64)((char*)rman_get_virtual(rdev->adap->udbs_res) + (wq->rq.qid << rdev->qpshift)); wq->rq.udb &= PAGE_MASK; } wq->rdev = rdev; wq->rq.msn = 1; /* build fw_ri_res_wr */ wr_len = sizeof *res_wr + 2 * sizeof *res; wr = alloc_wrqe(wr_len, &sc->sge.mgmtq); if (wr == NULL) return (0); res_wr = wrtod(wr); memset(res_wr, 0, wr_len); res_wr->op_nres = cpu_to_be32( V_FW_WR_OP(FW_RI_RES_WR) | V_FW_RI_RES_WR_NRES(2) | F_FW_WR_COMPL); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); res_wr->cookie = (unsigned long) &wr_wait; res = res_wr->res; res->u.sqrq.restype = FW_RI_RES_TYPE_SQ; res->u.sqrq.op = FW_RI_RES_OP_WRITE; /* eqsize is the number of 64B entries plus the status page size. */ eqsize = wq->sq.size * T4_SQ_NUM_SLOTS + (sc->params.sge.spg_len / EQ_ESIZE); res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( V_FW_RI_RES_WR_HOSTFCMODE(0) | /* no host cidx updates */ V_FW_RI_RES_WR_CPRIO(0) | /* don't keep in chip cache */ V_FW_RI_RES_WR_PCIECHN(0) | /* set by uP at ri_init time */ V_FW_RI_RES_WR_IQID(scq->cqid)); res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( V_FW_RI_RES_WR_DCAEN(0) | V_FW_RI_RES_WR_DCACPU(0) | V_FW_RI_RES_WR_FBMIN(2) | V_FW_RI_RES_WR_FBMAX(2) | V_FW_RI_RES_WR_CIDXFTHRESHO(0) | V_FW_RI_RES_WR_CIDXFTHRESH(0) | V_FW_RI_RES_WR_EQSIZE(eqsize)); res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid); res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr); res++; res->u.sqrq.restype = FW_RI_RES_TYPE_RQ; res->u.sqrq.op = FW_RI_RES_OP_WRITE; /* eqsize is the number of 64B entries plus the status page size. 
*/ eqsize = wq->rq.size * T4_RQ_NUM_SLOTS + (sc->params.sge.spg_len / EQ_ESIZE); res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( V_FW_RI_RES_WR_HOSTFCMODE(0) | /* no host cidx updates */ V_FW_RI_RES_WR_CPRIO(0) | /* don't keep in chip cache */ V_FW_RI_RES_WR_PCIECHN(0) | /* set by uP at ri_init time */ V_FW_RI_RES_WR_IQID(rcq->cqid)); res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( V_FW_RI_RES_WR_DCAEN(0) | V_FW_RI_RES_WR_DCACPU(0) | V_FW_RI_RES_WR_FBMIN(2) | V_FW_RI_RES_WR_FBMAX(2) | V_FW_RI_RES_WR_CIDXFTHRESHO(0) | V_FW_RI_RES_WR_CIDXFTHRESH(0) | V_FW_RI_RES_WR_EQSIZE(eqsize)); res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid); res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr); c4iw_init_wr_wait(&wr_wait); t4_wrq_tx(sc, wr); ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid, __func__); if (ret) goto err7; CTR6(KTR_IW_CXGBE, "%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%llx rqudb 0x%llx", __func__, wq->sq.qid, wq->rq.qid, wq->db, (unsigned long long)wq->sq.udb, (unsigned long long)wq->rq.udb); return 0; err7: contigfree(wq->rq.queue, wq->rq.memsize, M_DEVBUF); err6: dealloc_sq(rdev, &wq->sq); err5: c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); err4: kfree(wq->rq.sw_rq); err3: kfree(wq->sq.sw_sq); err2: c4iw_put_qpid(rdev, wq->rq.qid, uctx); err1: c4iw_put_qpid(rdev, wq->sq.qid, uctx); return -ENOMEM; } static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, struct ib_send_wr *wr, int max, u32 *plenp) { u8 *dstp, *srcp; u32 plen = 0; int i; int rem, len; dstp = (u8 *)immdp->data; for (i = 0; i < wr->num_sge; i++) { if ((plen + wr->sg_list[i].length) > max) return -EMSGSIZE; srcp = (u8 *)(unsigned long)wr->sg_list[i].addr; plen += wr->sg_list[i].length; rem = wr->sg_list[i].length; while (rem) { if (dstp == (u8 *)&sq->queue[sq->size]) dstp = (u8 *)sq->queue; if (rem <= (u8 *)&sq->queue[sq->size] - dstp) len = rem; else len = (u8 *)&sq->queue[sq->size] - dstp; memcpy(dstp, srcp, len); dstp += len; srcp += len; rem -= len; } } len = roundup(plen + sizeof 
*immdp, 16) - (plen + sizeof *immdp); if (len) memset(dstp, 0, len); immdp->op = FW_RI_DATA_IMMD; immdp->r1 = 0; immdp->r2 = 0; immdp->immdlen = cpu_to_be32(plen); *plenp = plen; return 0; } static int build_isgl(__be64 *queue_start, __be64 *queue_end, struct fw_ri_isgl *isglp, struct ib_sge *sg_list, int num_sge, u32 *plenp) { int i; u32 plen = 0; __be64 *flitp = (__be64 *)isglp->sge; for (i = 0; i < num_sge; i++) { if ((plen + sg_list[i].length) < plen) return -EMSGSIZE; plen += sg_list[i].length; *flitp = cpu_to_be64(((u64)sg_list[i].lkey << 32) | sg_list[i].length); if (++flitp == queue_end) flitp = queue_start; *flitp = cpu_to_be64(sg_list[i].addr); if (++flitp == queue_end) flitp = queue_start; } *flitp = (__force __be64)0; isglp->op = FW_RI_DATA_ISGL; isglp->r1 = 0; isglp->nsge = cpu_to_be16(num_sge); isglp->r2 = 0; if (plenp) *plenp = plen; return 0; } static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) { u32 plen; int size; int ret; if (wr->num_sge > T4_MAX_SEND_SGE) return -EINVAL; switch (wr->opcode) { case IB_WR_SEND: if (wr->send_flags & IB_SEND_SOLICITED) wqe->send.sendop_pkd = cpu_to_be32( V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE)); else wqe->send.sendop_pkd = cpu_to_be32( V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND)); wqe->send.stag_inv = 0; break; case IB_WR_SEND_WITH_INV: if (wr->send_flags & IB_SEND_SOLICITED) wqe->send.sendop_pkd = cpu_to_be32( V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_SE_INV)); else wqe->send.sendop_pkd = cpu_to_be32( V_FW_RI_SEND_WR_SENDOP(FW_RI_SEND_WITH_INV)); wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); break; default: return -EINVAL; } plen = 0; if (wr->num_sge) { if (wr->send_flags & IB_SEND_INLINE) { ret = build_immd(sq, wqe->send.u.immd_src, wr, T4_MAX_SEND_INLINE, &plen); if (ret) return ret; size = sizeof wqe->send + sizeof(struct fw_ri_immd) + plen; } else { ret = build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], wqe->send.u.isgl_src, wr->sg_list, 
wr->num_sge, &plen); if (ret) return ret; size = sizeof wqe->send + sizeof(struct fw_ri_isgl) + wr->num_sge * sizeof(struct fw_ri_sge); } } else { wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD; wqe->send.u.immd_src[0].r1 = 0; wqe->send.u.immd_src[0].r2 = 0; wqe->send.u.immd_src[0].immdlen = 0; size = sizeof wqe->send + sizeof(struct fw_ri_immd); plen = 0; } *len16 = DIV_ROUND_UP(size, 16); wqe->send.plen = cpu_to_be32(plen); return 0; } static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) { u32 plen; int size; int ret; if (wr->num_sge > T4_MAX_SEND_SGE) return -EINVAL; wqe->write.r2 = 0; wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey); wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr); if (wr->num_sge) { if (wr->send_flags & IB_SEND_INLINE) { ret = build_immd(sq, wqe->write.u.immd_src, wr, T4_MAX_WRITE_INLINE, &plen); if (ret) return ret; size = sizeof wqe->write + sizeof(struct fw_ri_immd) + plen; } else { ret = build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], wqe->write.u.isgl_src, wr->sg_list, wr->num_sge, &plen); if (ret) return ret; size = sizeof wqe->write + sizeof(struct fw_ri_isgl) + wr->num_sge * sizeof(struct fw_ri_sge); } } else { wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD; wqe->write.u.immd_src[0].r1 = 0; wqe->write.u.immd_src[0].r2 = 0; wqe->write.u.immd_src[0].immdlen = 0; size = sizeof wqe->write + sizeof(struct fw_ri_immd); plen = 0; } *len16 = DIV_ROUND_UP(size, 16); wqe->write.plen = cpu_to_be32(plen); return 0; } static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) { if (wr->num_sge > 1) return -EINVAL; if (wr->num_sge) { wqe->read.stag_src = cpu_to_be32(wr->wr.rdma.rkey); wqe->read.to_src_hi = cpu_to_be32((u32)(wr->wr.rdma.remote_addr >> 32)); wqe->read.to_src_lo = cpu_to_be32((u32)wr->wr.rdma.remote_addr); wqe->read.stag_sink = cpu_to_be32(wr->sg_list[0].lkey); wqe->read.plen = cpu_to_be32(wr->sg_list[0].length); wqe->read.to_sink_hi = 
cpu_to_be32((u32)(wr->sg_list[0].addr >> 32)); wqe->read.to_sink_lo = cpu_to_be32((u32)(wr->sg_list[0].addr)); } else { wqe->read.stag_src = cpu_to_be32(2); wqe->read.to_src_hi = 0; wqe->read.to_src_lo = 0; wqe->read.stag_sink = cpu_to_be32(2); wqe->read.plen = 0; wqe->read.to_sink_hi = 0; wqe->read.to_sink_lo = 0; } wqe->read.r2 = 0; wqe->read.r5 = 0; *len16 = DIV_ROUND_UP(sizeof wqe->read, 16); return 0; } static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, struct ib_recv_wr *wr, u8 *len16) { int ret; ret = build_isgl((__be64 *)qhp->wq.rq.queue, (__be64 *)&qhp->wq.rq.queue[qhp->wq.rq.size], &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); if (ret) return ret; *len16 = DIV_ROUND_UP(sizeof wqe->recv + wr->num_sge * sizeof(struct fw_ri_sge), 16); return 0; } static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) { struct fw_ri_immd *imdp; __be64 *p; int i; int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32); int rem; if (wr->wr.fast_reg.page_list_len > T4_MAX_FR_DEPTH) return -EINVAL; wqe->fr.qpbinde_to_dcacpu = 0; wqe->fr.pgsz_shift = wr->wr.fast_reg.page_shift - 12; wqe->fr.addr_type = FW_RI_VA_BASED_TO; wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->wr.fast_reg.access_flags); wqe->fr.len_hi = 0; wqe->fr.len_lo = cpu_to_be32(wr->wr.fast_reg.length); wqe->fr.stag = cpu_to_be32(wr->wr.fast_reg.rkey); wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32); wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff); WARN_ON(pbllen > T4_MAX_FR_IMMD); imdp = (struct fw_ri_immd *)(&wqe->fr + 1); imdp->op = FW_RI_DATA_IMMD; imdp->r1 = 0; imdp->r2 = 0; imdp->immdlen = cpu_to_be32(pbllen); p = (__be64 *)(imdp + 1); rem = pbllen; for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { *p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]); rem -= sizeof *p; if (++p == (__be64 *)&sq->queue[sq->size]) p = (__be64 *)sq->queue; } BUG_ON(rem < 0); while (rem) { *p = 0; rem -= 
sizeof *p; if (++p == (__be64 *)&sq->queue[sq->size]) p = (__be64 *)sq->queue; } *len16 = DIV_ROUND_UP(sizeof wqe->fr + sizeof *imdp + pbllen, 16); return 0; } static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) { wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); wqe->inv.r2 = 0; *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16); return 0; } void c4iw_qp_add_ref(struct ib_qp *qp) { CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, qp); atomic_inc(&(to_c4iw_qp(qp)->refcnt)); } void c4iw_qp_rem_ref(struct ib_qp *qp) { CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, qp); if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt))) wake_up(&(to_c4iw_qp(qp)->wait)); } int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { int err = 0; u8 len16 = 0; enum fw_wr_opcodes fw_opcode = 0; enum fw_ri_wr_flags fw_flags; struct c4iw_qp *qhp; union t4_wr *wqe; u32 num_wrs; struct t4_swsqe *swsqe; unsigned long flag; u16 idx = 0; qhp = to_c4iw_qp(ibqp); spin_lock_irqsave(&qhp->lock, flag); if (t4_wq_in_error(&qhp->wq)) { spin_unlock_irqrestore(&qhp->lock, flag); return -EINVAL; } num_wrs = t4_sq_avail(&qhp->wq); if (num_wrs == 0) { spin_unlock_irqrestore(&qhp->lock, flag); return -ENOMEM; } while (wr) { if (num_wrs == 0) { err = -ENOMEM; *bad_wr = wr; break; } wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue + qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE); fw_flags = 0; if (wr->send_flags & IB_SEND_SOLICITED) fw_flags |= FW_RI_SOLICITED_EVENT_FLAG; if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all) fw_flags |= FW_RI_COMPLETION_FLAG; swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; switch (wr->opcode) { case IB_WR_SEND_WITH_INV: case IB_WR_SEND: if (wr->send_flags & IB_SEND_FENCE) fw_flags |= FW_RI_READ_FENCE_FLAG; fw_opcode = FW_RI_SEND_WR; if (wr->opcode == IB_WR_SEND) swsqe->opcode = FW_RI_SEND; else swsqe->opcode = FW_RI_SEND_WITH_INV; err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); break; case IB_WR_RDMA_WRITE: fw_opcode = 
FW_RI_RDMA_WRITE_WR; swsqe->opcode = FW_RI_RDMA_WRITE; err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: fw_opcode = FW_RI_RDMA_READ_WR; swsqe->opcode = FW_RI_READ_REQ; if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) fw_flags = FW_RI_RDMA_READ_INVALIDATE; else fw_flags = 0; err = build_rdma_read(wqe, wr, &len16); if (err) break; swsqe->read_len = wr->sg_list[0].length; if (!qhp->wq.sq.oldest_read) qhp->wq.sq.oldest_read = swsqe; break; case IB_WR_FAST_REG_MR: fw_opcode = FW_RI_FR_NSMR_WR; swsqe->opcode = FW_RI_FAST_REGISTER; err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16); break; case IB_WR_LOCAL_INV: if (wr->send_flags & IB_SEND_FENCE) fw_flags |= FW_RI_LOCAL_FENCE_FLAG; fw_opcode = FW_RI_INV_LSTAG_WR; swsqe->opcode = FW_RI_LOCAL_INV; err = build_inv_stag(wqe, wr, &len16); break; default: CTR2(KTR_IW_CXGBE, "%s post of type =%d TBD!", __func__, wr->opcode); err = -EINVAL; } if (err) { *bad_wr = wr; break; } swsqe->idx = qhp->wq.sq.pidx; swsqe->complete = 0; swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) || qhp->sq_sig_all; swsqe->wr_id = wr->wr_id; init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); CTR5(KTR_IW_CXGBE, "%s cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u", __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx, swsqe->opcode, swsqe->read_len); wr = wr->next; num_wrs--; t4_sq_produce(&qhp->wq, len16); idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); } if (t4_wq_db_enabled(&qhp->wq)) t4_ring_sq_db(&qhp->wq, idx); spin_unlock_irqrestore(&qhp->lock, flag); return err; } int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr) { int err = 0; struct c4iw_qp *qhp; union t4_recv_wr *wqe; u32 num_wrs; u8 len16 = 0; unsigned long flag; u16 idx = 0; qhp = to_c4iw_qp(ibqp); spin_lock_irqsave(&qhp->lock, flag); if (t4_wq_in_error(&qhp->wq)) { spin_unlock_irqrestore(&qhp->lock, flag); return -EINVAL; } num_wrs = t4_rq_avail(&qhp->wq); if (num_wrs 
== 0) { spin_unlock_irqrestore(&qhp->lock, flag); return -ENOMEM; } while (wr) { if (wr->num_sge > T4_MAX_RECV_SGE) { err = -EINVAL; *bad_wr = wr; break; } wqe = (union t4_recv_wr *)((u8 *)qhp->wq.rq.queue + qhp->wq.rq.wq_pidx * T4_EQ_ENTRY_SIZE); if (num_wrs) err = build_rdma_recv(qhp, wqe, wr, &len16); else err = -ENOMEM; if (err) { *bad_wr = wr; break; } qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id; wqe->recv.opcode = FW_RI_RECV_WR; wqe->recv.r1 = 0; wqe->recv.wrid = qhp->wq.rq.pidx; wqe->recv.r2[0] = 0; wqe->recv.r2[1] = 0; wqe->recv.r2[2] = 0; wqe->recv.len16 = len16; CTR3(KTR_IW_CXGBE, "%s cookie 0x%llx pidx %u", __func__, (unsigned long long) wr->wr_id, qhp->wq.rq.pidx); t4_rq_produce(&qhp->wq, len16); idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); wr = wr->next; num_wrs--; } if (t4_wq_db_enabled(&qhp->wq)) t4_ring_rq_db(&qhp->wq, idx); spin_unlock_irqrestore(&qhp->lock, flag); return err; } int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind) { return -ENOSYS; } static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, u8 *ecode) { int status; int tagged; int opcode; int rqtype; int send_inv; if (!err_cqe) { *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA; *ecode = 0; return; } status = CQE_STATUS(err_cqe); opcode = CQE_OPCODE(err_cqe); rqtype = RQ_TYPE(err_cqe); send_inv = (opcode == FW_RI_SEND_WITH_INV) || (opcode == FW_RI_SEND_WITH_SE_INV); tagged = (opcode == FW_RI_RDMA_WRITE) || (rqtype && (opcode == FW_RI_READ_RESP)); switch (status) { case T4_ERR_STAG: if (send_inv) { *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; *ecode = RDMAP_CANT_INV_STAG; } else { *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_INV_STAG; } break; case T4_ERR_PDID: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; if ((opcode == FW_RI_SEND_WITH_INV) || (opcode == FW_RI_SEND_WITH_SE_INV)) *ecode = RDMAP_CANT_INV_STAG; else *ecode = RDMAP_STAG_NOT_ASSOC; break; case T4_ERR_QPID: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = 
RDMAP_STAG_NOT_ASSOC; break; case T4_ERR_ACCESS: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_ACC_VIOL; break; case T4_ERR_WRAP: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_TO_WRAP; break; case T4_ERR_BOUND: if (tagged) { *layer_type = LAYER_DDP|DDP_TAGGED_ERR; *ecode = DDPT_BASE_BOUNDS; } else { *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT; *ecode = RDMAP_BASE_BOUNDS; } break; case T4_ERR_INVALIDATE_SHARED_MR: case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; *ecode = RDMAP_CANT_INV_STAG; break; case T4_ERR_ECC: case T4_ERR_ECC_PSTAG: case T4_ERR_INTERNAL_ERR: *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA; *ecode = 0; break; case T4_ERR_OUT_OF_RQE: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_MSN_NOBUF; break; case T4_ERR_PBL_ADDR_BOUND: *layer_type = LAYER_DDP|DDP_TAGGED_ERR; *ecode = DDPT_BASE_BOUNDS; break; case T4_ERR_CRC: *layer_type = LAYER_MPA|DDP_LLP; *ecode = MPA_CRC_ERR; break; case T4_ERR_MARKER: *layer_type = LAYER_MPA|DDP_LLP; *ecode = MPA_MARKER_ERR; break; case T4_ERR_PDU_LEN_ERR: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_MSG_TOOBIG; break; case T4_ERR_DDP_VERSION: if (tagged) { *layer_type = LAYER_DDP|DDP_TAGGED_ERR; *ecode = DDPT_INV_VERS; } else { *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_VERS; } break; case T4_ERR_RDMA_VERSION: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; *ecode = RDMAP_INV_VERS; break; case T4_ERR_OPCODE: *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP; *ecode = RDMAP_INV_OPCODE; break; case T4_ERR_DDP_QUEUE_NUM: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_QN; break; case T4_ERR_MSN: case T4_ERR_MSN_GAP: case T4_ERR_MSN_RANGE: case T4_ERR_IRD_OVERFLOW: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_MSN_RANGE; break; case T4_ERR_TBIT: *layer_type = LAYER_DDP|DDP_LOCAL_CATA; *ecode = 0; break; case T4_ERR_MO: *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR; *ecode = DDPU_INV_MO; break; default: *layer_type = 
LAYER_RDMAP|DDP_LOCAL_CATA; *ecode = 0; break; } } static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, gfp_t gfp) { struct fw_ri_wr *wqe; struct terminate_message *term; struct wrqe *wr; struct socket *so = qhp->ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; CTR4(KTR_IW_CXGBE, "%s qhp %p qid 0x%x tid %u", __func__, qhp, qhp->wq.sq.qid, qhp->ep->hwtid); wr = alloc_wrqe(sizeof(*wqe), toep->ofld_txq); if (wr == NULL) return; wqe = wrtod(wr); memset(wqe, 0, sizeof *wqe); wqe->op_compl = cpu_to_be32(V_FW_WR_OP(FW_RI_WR)); wqe->flowid_len16 = cpu_to_be32( V_FW_WR_FLOWID(qhp->ep->hwtid) | V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); wqe->u.terminate.type = FW_RI_TYPE_TERMINATE; wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term); term = (struct terminate_message *)wqe->u.terminate.termmsg; if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) { term->layer_etype = qhp->attr.layer_etype; term->ecode = qhp->attr.ecode; } else build_term_codes(err_cqe, &term->layer_etype, &term->ecode); creds(toep, sizeof(*wqe)); t4_wrq_tx(qhp->rhp->rdev.adap, wr); } /* Assumes qhp lock is held. */ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, struct c4iw_cq *schp) { int count; int flushed; unsigned long flag; CTR4(KTR_IW_CXGBE, "%s qhp %p rchp %p schp %p", __func__, qhp, rchp, schp); /* locking hierarchy: cq lock first, then qp lock. */ spin_lock_irqsave(&rchp->lock, flag); spin_lock(&qhp->lock); c4iw_flush_hw_cq(&rchp->cq); c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, flag); if (flushed && rchp->ibcq.comp_handler) { spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); } /* locking hierarchy: cq lock first, then qp lock. 
*/ spin_lock_irqsave(&schp->lock, flag); spin_lock(&qhp->lock); c4iw_flush_hw_cq(&schp->cq); c4iw_count_scqes(&schp->cq, &qhp->wq, &count); flushed = c4iw_flush_sq(&qhp->wq, &schp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, flag); if (flushed && schp->ibcq.comp_handler) { spin_lock_irqsave(&schp->comp_handler_lock, flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); spin_unlock_irqrestore(&schp->comp_handler_lock, flag); } } static void flush_qp(struct c4iw_qp *qhp) { struct c4iw_cq *rchp, *schp; unsigned long flag; rchp = get_chp(qhp->rhp, qhp->attr.rcq); schp = get_chp(qhp->rhp, qhp->attr.scq); if (qhp->ibqp.uobject) { t4_set_wq_in_error(&qhp->wq); t4_set_cq_in_error(&rchp->cq); spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); if (schp != rchp) { t4_set_cq_in_error(&schp->cq); spin_lock_irqsave(&schp->comp_handler_lock, flag); (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); spin_unlock_irqrestore(&schp->comp_handler_lock, flag); } return; } __flush_qp(qhp, rchp, schp); } static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, struct c4iw_ep *ep) { struct c4iw_rdev *rdev = &rhp->rdev; struct adapter *sc = rdev->adap; struct fw_ri_wr *wqe; int ret; struct wrqe *wr; struct socket *so = ep->com.so; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; KASSERT(rhp == qhp->rhp && ep == qhp->ep, ("%s: EDOOFUS", __func__)); CTR4(KTR_IW_CXGBE, "%s qhp %p qid 0x%x tid %u", __func__, qhp, qhp->wq.sq.qid, ep->hwtid); wr = alloc_wrqe(sizeof(*wqe), toep->ofld_txq); if (wr == NULL) return (0); wqe = wrtod(wr); memset(wqe, 0, sizeof *wqe); wqe->op_compl = cpu_to_be32(V_FW_WR_OP(FW_RI_WR) | F_FW_WR_COMPL); wqe->flowid_len16 = cpu_to_be32(V_FW_WR_FLOWID(ep->hwtid) | V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); wqe->cookie = (unsigned long) 
&ep->com.wr_wait; wqe->u.fini.type = FW_RI_TYPE_FINI; c4iw_init_wr_wait(&ep->com.wr_wait); creds(toep, sizeof(*wqe)); t4_wrq_tx(sc, wr); ret = c4iw_wait_for_reply(rdev, &ep->com.wr_wait, ep->hwtid, qhp->wq.sq.qid, __func__); return ret; } static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init) { CTR2(KTR_IW_CXGBE, "%s p2p_type = %d", __func__, p2p_type); memset(&init->u, 0, sizeof init->u); switch (p2p_type) { case FW_RI_INIT_P2PTYPE_RDMA_WRITE: init->u.write.opcode = FW_RI_RDMA_WRITE_WR; init->u.write.stag_sink = cpu_to_be32(1); init->u.write.to_sink = cpu_to_be64(1); init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD; init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write + sizeof(struct fw_ri_immd), 16); break; case FW_RI_INIT_P2PTYPE_READ_REQ: init->u.write.opcode = FW_RI_RDMA_READ_WR; init->u.read.stag_src = cpu_to_be32(1); init->u.read.to_src_lo = cpu_to_be32(1); init->u.read.stag_sink = cpu_to_be32(1); init->u.read.to_sink_lo = cpu_to_be32(1); init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16); break; } } static void creds(struct toepcb *toep, size_t wrsize) { struct ofld_tx_sdesc *txsd; CTR3(KTR_IW_CXGBE, "%s:creB %p %u", __func__, toep , wrsize); INP_WLOCK(toep->inp); txsd = &toep->txsd[toep->txsd_pidx]; txsd->tx_credits = howmany(wrsize, 16); txsd->plen = 0; KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0, ("%s: not enough credits (%d)", __func__, toep->tx_credits)); toep->tx_credits -= txsd->tx_credits; if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) toep->txsd_pidx = 0; toep->txsd_avail--; INP_WUNLOCK(toep->inp); CTR5(KTR_IW_CXGBE, "%s:creE %p %u %u %u", __func__, toep , txsd->tx_credits, toep->tx_credits, toep->txsd_pidx); } static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) { struct fw_ri_wr *wqe; int ret; struct wrqe *wr; struct c4iw_ep *ep = qhp->ep; struct c4iw_rdev *rdev = &qhp->rhp->rdev; struct adapter *sc = rdev->adap; struct socket *so = ep->com.so; struct inpcb *inp = sotoinpcb(so); 
struct tcpcb *tp = intotcpcb(inp); struct toepcb *toep = tp->t_toe; CTR4(KTR_IW_CXGBE, "%s qhp %p qid 0x%x tid %u", __func__, qhp, qhp->wq.sq.qid, ep->hwtid); wr = alloc_wrqe(sizeof(*wqe), toep->ofld_txq); if (wr == NULL) return (0); wqe = wrtod(wr); memset(wqe, 0, sizeof *wqe); wqe->op_compl = cpu_to_be32( V_FW_WR_OP(FW_RI_WR) | F_FW_WR_COMPL); wqe->flowid_len16 = cpu_to_be32(V_FW_WR_FLOWID(ep->hwtid) | V_FW_WR_LEN16(DIV_ROUND_UP(sizeof *wqe, 16))); wqe->cookie = (unsigned long) &ep->com.wr_wait; wqe->u.init.type = FW_RI_TYPE_INIT; wqe->u.init.mpareqbit_p2ptype = V_FW_RI_WR_MPAREQBIT(qhp->attr.mpa_attr.initiator) | V_FW_RI_WR_P2PTYPE(qhp->attr.mpa_attr.p2p_type); wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE; if (qhp->attr.mpa_attr.recv_marker_enabled) wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE; if (qhp->attr.mpa_attr.xmit_marker_enabled) wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE; if (qhp->attr.mpa_attr.crc_enabled) wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE; wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE | FW_RI_QP_RDMA_WRITE_ENABLE | FW_RI_QP_BIND_ENABLE; if (!qhp->ibqp.uobject) wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE | FW_RI_QP_STAG0_ENABLE; wqe->u.init.nrqe = cpu_to_be16(t4_rqes_posted(&qhp->wq)); wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd); wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid); wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid); wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid); wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq); wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq); wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord); wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird); wqe->u.init.iss = cpu_to_be32(ep->snd_seq); wqe->u.init.irs = cpu_to_be32(ep->rcv_seq); wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size); wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr - sc->vres.rq.start); if (qhp->attr.mpa_attr.initiator) build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init); 
c4iw_init_wr_wait(&ep->com.wr_wait); creds(toep, sizeof(*wqe)); t4_wrq_tx(sc, wr); ret = c4iw_wait_for_reply(rdev, &ep->com.wr_wait, ep->hwtid, qhp->wq.sq.qid, __func__); toep->ulp_mode = ULP_MODE_RDMA; return ret; } int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, enum c4iw_qp_attr_mask mask, struct c4iw_qp_attributes *attrs, int internal) { int ret = 0; struct c4iw_qp_attributes newattr = qhp->attr; int disconnect = 0; int terminate = 0; int abort = 0; int free = 0; struct c4iw_ep *ep = NULL; CTR5(KTR_IW_CXGBE, "%s qhp %p sqid 0x%x rqid 0x%x ep %p", __func__, qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep); CTR3(KTR_IW_CXGBE, "%s state %d -> %d", __func__, qhp->attr.state, (mask & C4IW_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1); mutex_lock(&qhp->mutex); /* Process attr changes if in IDLE */ if (mask & C4IW_QP_ATTR_VALID_MODIFY) { if (qhp->attr.state != C4IW_QP_STATE_IDLE) { ret = -EIO; goto out; } if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ) newattr.enable_rdma_read = attrs->enable_rdma_read; if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE) newattr.enable_rdma_write = attrs->enable_rdma_write; if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND) newattr.enable_bind = attrs->enable_bind; if (mask & C4IW_QP_ATTR_MAX_ORD) { if (attrs->max_ord > c4iw_max_read_depth) { ret = -EINVAL; goto out; } newattr.max_ord = attrs->max_ord; } if (mask & C4IW_QP_ATTR_MAX_IRD) { if (attrs->max_ird > c4iw_max_read_depth) { ret = -EINVAL; goto out; } newattr.max_ird = attrs->max_ird; } qhp->attr = newattr; } if (!(mask & C4IW_QP_ATTR_NEXT_STATE)) goto out; if (qhp->attr.state == attrs->next_state) goto out; switch (qhp->attr.state) { case C4IW_QP_STATE_IDLE: switch (attrs->next_state) { case C4IW_QP_STATE_RTS: if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) { ret = -EINVAL; goto out; } if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) { ret = -EINVAL; goto out; } qhp->attr.mpa_attr = attrs->mpa_attr; qhp->attr.llp_stream_handle = attrs->llp_stream_handle; qhp->ep = qhp->attr.llp_stream_handle; 
set_state(qhp, C4IW_QP_STATE_RTS);

			/*
			 * Ref the endpoint here and deref when we
			 * disassociate the endpoint from the QP. This
			 * happens in CLOSING->IDLE transition or *->ERROR
			 * transition.
			 */
			c4iw_get_ep(&qhp->ep->com);
			ret = rdma_init(rhp, qhp);
			if (ret)
				goto err;
			break;
		case C4IW_QP_STATE_ERROR:
			set_state(qhp, C4IW_QP_STATE_ERROR);
			flush_qp(qhp);
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
		break;
	case C4IW_QP_STATE_RTS:
		switch (attrs->next_state) {
		case C4IW_QP_STATE_CLOSING:
			BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
			set_state(qhp, C4IW_QP_STATE_CLOSING);
			ep = qhp->ep;
			/*
			 * Externally requested close: take an extra EP
			 * reference for the disconnect issued after the mutex
			 * is dropped.
			 */
			if (!internal) {
				abort = 0;
				disconnect = 1;
				c4iw_get_ep(&qhp->ep->com);
			}
			if (qhp->ibqp.uobject)
				t4_set_wq_in_error(&qhp->wq);
			ret = rdma_fini(rhp, qhp, ep);
			if (ret)
				goto err;
			break;
		case C4IW_QP_STATE_TERMINATE:
			set_state(qhp, C4IW_QP_STATE_TERMINATE);
			qhp->attr.layer_etype = attrs->layer_etype;
			qhp->attr.ecode = attrs->ecode;
			if (qhp->ibqp.uobject)
				t4_set_wq_in_error(&qhp->wq);
			ep = qhp->ep;
			/*
			 * NOTE(review): the if below has no braces, so only
			 * "terminate = 1" depends on !internal; the disconnect
			 * and the extra EP reference are unconditional.
			 */
			if (!internal)
				terminate = 1;
			disconnect = 1;
			c4iw_get_ep(&qhp->ep->com);
			break;
		case C4IW_QP_STATE_ERROR:
			set_state(qhp, C4IW_QP_STATE_ERROR);
			if (qhp->ibqp.uobject)
				t4_set_wq_in_error(&qhp->wq);
			if (!internal) {
				abort = 1;
				disconnect = 1;
				ep = qhp->ep;
				c4iw_get_ep(&qhp->ep->com);
			}
			/* ret is still 0 here; err is used for its cleanup. */
			goto err;
			break;	/* not reached */
		default:
			ret = -EINVAL;
			goto out;
		}
		break;
	case C4IW_QP_STATE_CLOSING:
		/* Only driver-internal callers may move a CLOSING QP. */
		if (!internal) {
			ret = -EINVAL;
			goto out;
		}
		switch (attrs->next_state) {
		case C4IW_QP_STATE_IDLE:
			flush_qp(qhp);
			set_state(qhp, C4IW_QP_STATE_IDLE);
			qhp->attr.llp_stream_handle = NULL;
			c4iw_put_ep(&qhp->ep->com);
			qhp->ep = NULL;
			wake_up(&qhp->wait);
			break;
		case C4IW_QP_STATE_ERROR:
			goto err;
		default:
			ret = -EINVAL;
			goto err;
		}
		break;
	case C4IW_QP_STATE_ERROR:
		/* ERROR may only go back to IDLE, and only with empty queues. */
		if (attrs->next_state != C4IW_QP_STATE_IDLE) {
			ret = -EINVAL;
			goto out;
		}
		if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) {
			ret = -EINVAL;
			goto out;
		}
		set_state(qhp, C4IW_QP_STATE_IDLE);
		break;
	case C4IW_QP_STATE_TERMINATE:
		if (!internal) {
			ret = -EINVAL;
			goto out;
		}
		goto err;
		break;	/* not reached */
	default:
		printf("%s in a bad state %d\n", __func__, qhp->attr.state);
		ret = -EINVAL;
		goto err;
		break;	/* not reached */
	}
	goto out;
err:
	/*
	 * Common error/teardown exit: detach the endpoint from the QP, force
	 * the ERROR state, flush the queues and wake waiters.  The endpoint
	 * reference is dropped after the mutex is released (free = 1).
	 */
	CTR3(KTR_IW_CXGBE, "%s disassociating ep %p qpid 0x%x", __func__,
	    qhp->ep, qhp->wq.sq.qid);

	/* disassociate the LLP connection */
	qhp->attr.llp_stream_handle = NULL;
	if (!ep)
		ep = qhp->ep;
	qhp->ep = NULL;
	set_state(qhp, C4IW_QP_STATE_ERROR);
	free = 1;
	BUG_ON(!ep);
	flush_qp(qhp);
	wake_up(&qhp->wait);
out:
	mutex_unlock(&qhp->mutex);

	if (terminate)
		post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL);

	/*
	 * If disconnect is 1, then we need to initiate a disconnect
	 * on the EP. This can be a normal close (RTS->CLOSING) or
	 * an abnormal close (RTS/CLOSING->ERROR).
	 */
	if (disconnect) {
		c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC :
		    GFP_KERNEL);
		c4iw_put_ep(&ep->com);
	}

	/*
	 * If free is 1, then we've disassociated the EP from the QP
	 * and we need to dereference the EP.
	 */
	if (free)
		c4iw_put_ep(&ep->com);

	CTR2(KTR_IW_CXGBE, "%s exit state %d", __func__, qhp->attr.state);
	return ret;
}

/*
 * idr_for_each() callback (see c4iw_destroy_qp): calls t4_enable_wq_db()
 * on the work queue of the QP passed in p.  id and data are unused.
 */
static int enable_qp_db(int id, void *p, void *data)
{
	struct c4iw_qp *qp = p;

	t4_enable_wq_db(&qp->wq);
	return 0;
}

/*
 * Destroy a QP: move it to ERROR, wait until it has no endpoint, remove it
 * from the device's QP idr, then wait for the last reference and free it.
 * Always returns 0.
 */
int c4iw_destroy_qp(struct ib_qp *ib_qp)
{
	struct c4iw_dev *rhp;
	struct c4iw_qp *qhp;
	struct c4iw_qp_attributes attrs;
	struct c4iw_ucontext *ucontext;

	CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, ib_qp);
	qhp = to_c4iw_qp(ib_qp);
	rhp = qhp->rhp;

	/*
	 * Only next_state is initialized in attrs; the mask passed below
	 * requests nothing but the state change.  A QP in TERMINATE must be
	 * moved as an internal caller, since c4iw_modify_qp() rejects
	 * external requests in that state.
	 */
	attrs.next_state = C4IW_QP_STATE_ERROR;
	if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
	else
		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);
	wait_event(qhp->wait, !qhp->ep);

	spin_lock_irq(&rhp->lock);
	remove_handle_nolock(rhp, &rhp->qpidr, qhp->wq.sq.qid);
	rhp->qpcnt--;
	BUG_ON(rhp->qpcnt < 0);
	/*
	 * Once the QP count is back at or below db_fc_threshold, leave
	 * FLOW_CONTROL and run enable_qp_db() over the remaining QPs.
	 */
	if (rhp->qpcnt <= db_fc_threshold && rhp->db_state == FLOW_CONTROL) {
		rhp->rdev.stats.db_state_transitions++;
		rhp->db_state = NORMAL;
		idr_for_each(&rhp->qpidr, enable_qp_db, NULL);
	}
	spin_unlock_irq(&rhp->lock);
atomic_dec(&qhp->refcnt);
	/* Drop our own reference, then wait for every other holder to go. */
	wait_event(qhp->wait, !atomic_read(&qhp->refcnt));

	ucontext = ib_qp->uobject ?
	    to_c4iw_ucontext(ib_qp->uobject->context) : NULL;
	destroy_qp(&rhp->rdev, &qhp->wq,
	    ucontext ? &ucontext->uctx : &rhp->rdev.uctx);

	CTR3(KTR_IW_CXGBE, "%s ib_qp %p qpid 0x%0x", __func__, ib_qp,
	    qhp->wq.sq.qid);
	kfree(qhp);
	return 0;
}

/*
 * idr_for_each() callback (see c4iw_create_qp): calls t4_disable_wq_db()
 * on the work queue of the QP passed in p.  id and data are unused.
 */
static int disable_qp_db(int id, void *p, void *data)
{
	struct c4iw_qp *qp = p;

	t4_disable_wq_db(&qp->wq);
	return 0;
}

/*
 * Create an RC queue pair on protection domain pd.
 *
 * On success attrs->cap is updated with the sizes actually granted and a
 * pointer to the embedded ib_qp is returned.  When udata is supplied the
 * queue ids, sizes and four mmap keys are copied out to it.  On failure an
 * ERR_PTR() with a negative errno is returned.
 */
struct ib_qp *
c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
    struct ib_udata *udata)
{
	struct c4iw_dev *rhp;
	struct c4iw_qp *qhp;
	struct c4iw_pd *php;
	struct c4iw_cq *schp;
	struct c4iw_cq *rchp;
	struct c4iw_create_qp_resp uresp;
	int sqsize, rqsize;
	struct c4iw_ucontext *ucontext;
	int ret;
	struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4;

	CTR2(KTR_IW_CXGBE, "%s ib_pd %p", __func__, pd);

	if (attrs->qp_type != IB_QPT_RC)
		return ERR_PTR(-EINVAL);

	php = to_c4iw_pd(pd);
	rhp = php->rhp;
	schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid);
	rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid);
	if (!schp || !rchp)
		return ERR_PTR(-EINVAL);

	if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE)
		return ERR_PTR(-EINVAL);

	/* One extra entry is added, then each size is rounded up to 16. */
	rqsize = roundup(attrs->cap.max_recv_wr + 1, 16);
	if (rqsize > T4_MAX_RQ_SIZE)
		return ERR_PTR(-E2BIG);

	sqsize = roundup(attrs->cap.max_send_wr + 1, 16);
	if (sqsize > T4_MAX_SQ_SIZE)
		return ERR_PTR(-E2BIG);

	ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;

	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
	if (!qhp)
		return ERR_PTR(-ENOMEM);
	qhp->wq.sq.size = sqsize;
	qhp->wq.sq.memsize = (sqsize + 1) * sizeof *qhp->wq.sq.queue;
	qhp->wq.rq.size = rqsize;
	qhp->wq.rq.memsize = (rqsize + 1) * sizeof *qhp->wq.rq.queue;

	/* Queues of user QPs are sized in whole pages. */
	if (ucontext) {
		qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE);
		qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE);
	}

	CTR5(KTR_IW_CXGBE, "%s sqsize %u sqmemsize %zu rqsize %u rqmemsize %zu",
	    __func__, sqsize, qhp->wq.sq.memsize, rqsize, qhp->wq.rq.memsize);

	ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq,
	    ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
	if (ret)
		goto err1;

	/* Report the granted capacities back to the caller. */
	attrs->cap.max_recv_wr = rqsize - 1;
	attrs->cap.max_send_wr = sqsize - 1;
	attrs->cap.max_inline_data = T4_MAX_SEND_INLINE;

	qhp->rhp = rhp;
	qhp->attr.pd = php->pdid;
	qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid;
	qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid;
	qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
	qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
	qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
	qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
	qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
	qhp->attr.state = C4IW_QP_STATE_IDLE;
	qhp->attr.next_state = C4IW_QP_STATE_IDLE;
	qhp->attr.enable_rdma_read = 1;
	qhp->attr.enable_rdma_write = 1;
	qhp->attr.enable_bind = 1;
	qhp->attr.max_ord = 1;
	qhp->attr.max_ird = 1;
	qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR;
	spin_lock_init(&qhp->lock);
	mutex_init(&qhp->mutex);
	init_waitqueue_head(&qhp->wait);
	atomic_set(&qhp->refcnt, 1);

	spin_lock_irq(&rhp->lock);
	if (rhp->db_state != NORMAL)
		t4_disable_wq_db(&qhp->wq);
	/*
	 * Crossing db_fc_threshold switches the device to FLOW_CONTROL and
	 * runs disable_qp_db() over the QPs already in the idr.
	 *
	 * NOTE(review): qpcnt is incremented here but none of the error
	 * labels below decrement it again -- confirm whether a failed create
	 * should roll the counter back.
	 */
	if (++rhp->qpcnt > db_fc_threshold && rhp->db_state == NORMAL) {
		rhp->rdev.stats.db_state_transitions++;
		rhp->db_state = FLOW_CONTROL;
		idr_for_each(&rhp->qpidr, disable_qp_db, NULL);
	}
	ret = insert_handle_nolock(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
	spin_unlock_irq(&rhp->lock);
	if (ret)
		goto err2;

	if (udata) {
		/* One mmap entry each for the SQ, the RQ and their doorbells. */
		mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
		if (!mm1) {
			ret = -ENOMEM;
			goto err3;
		}
		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
		if (!mm2) {
			ret = -ENOMEM;
			goto err4;
		}
		mm3 = kmalloc(sizeof *mm3, GFP_KERNEL);
		if (!mm3) {
			ret = -ENOMEM;
			goto err5;
		}
		mm4 = kmalloc(sizeof *mm4, GFP_KERNEL);
		if (!mm4) {
			ret = -ENOMEM;
			goto err6;
		}
		uresp.flags = 0;
		uresp.qid_mask = rhp->rdev.qpmask;
		uresp.sqid = qhp->wq.sq.qid;
		uresp.sq_size = qhp->wq.sq.size;
		uresp.sq_memsize = qhp->wq.sq.memsize;
		uresp.rqid = qhp->wq.rq.qid;
		uresp.rq_size = qhp->wq.rq.size;
		uresp.rq_memsize = qhp->wq.rq.memsize;
		/* Reserve four consecutive page-sized keys under the lock. */
		spin_lock(&ucontext->mmap_lock);
		uresp.sq_key = ucontext->key;
		ucontext->key += PAGE_SIZE;
		uresp.rq_key = ucontext->key;
		ucontext->key += PAGE_SIZE;
		uresp.sq_db_gts_key = ucontext->key;
		ucontext->key += PAGE_SIZE;
		uresp.rq_db_gts_key = ucontext->key;
		ucontext->key += PAGE_SIZE;
		spin_unlock(&ucontext->mmap_lock);
		ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
		if (ret)
			goto err7;
		mm1->key = uresp.sq_key;
		mm1->addr = qhp->wq.sq.phys_addr;
		mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize);
		CTR4(KTR_IW_CXGBE, "%s mm1 %x, %x, %d", __func__, mm1->key,
		    mm1->addr, mm1->len);
		insert_mmap(ucontext, mm1);
		mm2->key = uresp.rq_key;
		mm2->addr = vtophys(qhp->wq.rq.queue);
		mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize);
		CTR4(KTR_IW_CXGBE, "%s mm2 %x, %x, %d", __func__, mm2->key,
		    mm2->addr, mm2->len);
		insert_mmap(ucontext, mm2);
		mm3->key = uresp.sq_db_gts_key;
		mm3->addr = qhp->wq.sq.udb;
		mm3->len = PAGE_SIZE;
		CTR4(KTR_IW_CXGBE, "%s mm3 %x, %x, %d", __func__, mm3->key,
		    mm3->addr, mm3->len);
		insert_mmap(ucontext, mm3);
		mm4->key = uresp.rq_db_gts_key;
		mm4->addr = qhp->wq.rq.udb;
		mm4->len = PAGE_SIZE;
		CTR4(KTR_IW_CXGBE, "%s mm4 %x, %x, %d", __func__, mm4->key,
		    mm4->addr, mm4->len);
		insert_mmap(ucontext, mm4);
	}
	qhp->ibqp.qp_num = qhp->wq.sq.qid;
	init_timer(&(qhp->timer));
	CTR5(KTR_IW_CXGBE,
	    "%s qhp %p sq_num_entries %d, rq_num_entries %d qpid 0x%0x",
	    __func__, qhp, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
	    qhp->wq.sq.qid);
	return &qhp->ibqp;

	/* Unwind in reverse order of acquisition. */
err7:
	kfree(mm4);
err6:
	kfree(mm3);
err5:
	kfree(mm2);
err4:
	kfree(mm1);
err3:
	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
err2:
	destroy_qp(&rhp->rdev, &qhp->wq,
	    ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
err1:
	kfree(qhp);
	return ERR_PTR(ret);
}

/*
 * Translate an ib_qp_attr / attr_mask pair into the driver's own attribute
 * structure and mask, then hand the request to c4iw_modify_qp() as an
 * external (internal == 0) caller.  udata is not used.
 */
int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
    int attr_mask, struct ib_udata *udata)
{
	struct c4iw_dev *rhp;
	struct c4iw_qp *qhp;
	enum c4iw_qp_attr_mask mask = 0;
	struct c4iw_qp_attributes attrs;

	CTR2(KTR_IW_CXGBE, "%s ib_qp %p", __func__, ibqp);

	/* iwarp does not support the RTR state */
	if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR))
		attr_mask &= ~IB_QP_STATE;

	/* Make sure we still have something left to do */
	if (!attr_mask)
		return 0;

	memset(&attrs, 0, sizeof attrs);
	qhp = to_c4iw_qp(ibqp);
	rhp = qhp->rhp;

	attrs.next_state = c4iw_convert_state(attr->qp_state);
	attrs.enable_rdma_read = (attr->qp_access_flags &
	    IB_ACCESS_REMOTE_READ) ? 1 : 0;
	attrs.enable_rdma_write = (attr->qp_access_flags &
	    IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
	attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0;

	mask |= (attr_mask & IB_QP_STATE) ? C4IW_QP_ATTR_NEXT_STATE : 0;
	mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ?
	    (C4IW_QP_ATTR_ENABLE_RDMA_READ |
	    C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
	    C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0;

	/*
	 * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for
	 * ringing the queue db when we're in DB_FULL mode.
	 */
	attrs.sq_db_inc = attr->sq_psn;
	attrs.rq_db_inc = attr->rq_psn;
	mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0;
	mask |= (attr_mask & IB_QP_RQ_PSN) ?
C4IW_QP_ATTR_RQ_DB : 0; return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0); } struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn) { CTR3(KTR_IW_CXGBE, "%s ib_dev %p qpn 0x%x", __func__, dev, qpn); return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn); } int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr) { struct c4iw_qp *qhp = to_c4iw_qp(ibqp); memset(attr, 0, sizeof *attr); memset(init_attr, 0, sizeof *init_attr); attr->qp_state = to_ib_qp_state(qhp->attr.state); init_attr->cap.max_send_wr = qhp->attr.sq_num_entries; init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries; init_attr->cap.max_send_sge = qhp->attr.sq_max_sges; init_attr->cap.max_recv_sge = qhp->attr.sq_max_sges; init_attr->cap.max_inline_data = T4_MAX_SEND_INLINE; init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0; return 0; } #endif Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_main.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_main.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_main.c (revision 303654) @@ -1,9530 +1,9532 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #if defined(__i386__) || defined(__amd64__) #include #include #endif #ifdef DDB #include #include #endif #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "t4_ioctl.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #include "t4_if.h" /* T4 bus driver interface */ static int t4_probe(device_t); static int t4_attach(device_t); static int t4_detach(device_t); static int t4_ready(device_t); static int t4_read_port_unit(device_t, int, int *); static device_method_t t4_methods[] = { DEVMETHOD(device_probe, t4_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_unit, t4_read_port_unit), DEVMETHOD_END }; static driver_t t4_driver = { "t4nex", t4_methods, sizeof(struct adapter) }; /* T4 port (cxgbe) 
interface */ static int cxgbe_probe(device_t); static int cxgbe_attach(device_t); static int cxgbe_detach(device_t); static device_method_t cxgbe_methods[] = { DEVMETHOD(device_probe, cxgbe_probe), DEVMETHOD(device_attach, cxgbe_attach), DEVMETHOD(device_detach, cxgbe_detach), { 0, 0 } }; static driver_t cxgbe_driver = { "cxgbe", cxgbe_methods, sizeof(struct port_info) }; /* T4 VI (vcxgbe) interface */ static int vcxgbe_probe(device_t); static int vcxgbe_attach(device_t); static int vcxgbe_detach(device_t); static device_method_t vcxgbe_methods[] = { DEVMETHOD(device_probe, vcxgbe_probe), DEVMETHOD(device_attach, vcxgbe_attach), DEVMETHOD(device_detach, vcxgbe_detach), { 0, 0 } }; static driver_t vcxgbe_driver = { "vcxgbe", vcxgbe_methods, sizeof(struct vi_info) }; static d_ioctl_t t4_ioctl; static struct cdevsw t4_cdevsw = { .d_version = D_VERSION, .d_ioctl = t4_ioctl, .d_name = "t4nex", }; /* T5 bus driver interface */ static int t5_probe(device_t); static device_method_t t5_methods[] = { DEVMETHOD(device_probe, t5_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_unit, t4_read_port_unit), DEVMETHOD_END }; static driver_t t5_driver = { "t5nex", t5_methods, sizeof(struct adapter) }; /* T5 port (cxl) interface */ static driver_t cxl_driver = { "cxl", cxgbe_methods, sizeof(struct port_info) }; /* T5 VI (vcxl) interface */ static driver_t vcxl_driver = { "vcxl", vcxgbe_methods, sizeof(struct vi_info) }; /* ifnet + media interface */ static void cxgbe_init(void *); static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t); static int cxgbe_transmit(struct ifnet *, struct mbuf *); static void cxgbe_qflush(struct ifnet *); static int cxgbe_media_change(struct ifnet *); static void cxgbe_media_status(struct ifnet *, struct ifmediareq *); MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services"); /* * Correct lock order when you need to acquire multiple 
locks is t4_list_lock, * then ADAPTER_LOCK, then t4_uld_list_lock. */ static struct sx t4_list_lock; SLIST_HEAD(, adapter) t4_list; #ifdef TCP_OFFLOAD static struct sx t4_uld_list_lock; SLIST_HEAD(, uld_info) t4_uld_list; #endif /* * Tunables. See tweak_tunables() too. * * Each tunable is set to a default value here if it's known at compile-time. * Otherwise it is set to -1 as an indication to tweak_tunables() that it should * provide a reasonable default when the driver is loaded. * * Tunables applicable to both T4 and T5 are under hw.cxgbe. Those specific to * T5 are under hw.cxl. */ /* * Number of queues for tx and rx, 10G and 1G, NIC and offload. */ #define NTXQ_10G 16 static int t4_ntxq10g = -1; TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq10g); #define NRXQ_10G 8 static int t4_nrxq10g = -1; TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq10g); #define NTXQ_1G 4 static int t4_ntxq1g = -1; TUNABLE_INT("hw.cxgbe.ntxq1g", &t4_ntxq1g); #define NRXQ_1G 2 static int t4_nrxq1g = -1; TUNABLE_INT("hw.cxgbe.nrxq1g", &t4_nrxq1g); #define NTXQ_VI 1 static int t4_ntxq_vi = -1; TUNABLE_INT("hw.cxgbe.ntxq_vi", &t4_ntxq_vi); #define NRXQ_VI 1 static int t4_nrxq_vi = -1; TUNABLE_INT("hw.cxgbe.nrxq_vi", &t4_nrxq_vi); static int t4_rsrv_noflowq = 0; TUNABLE_INT("hw.cxgbe.rsrv_noflowq", &t4_rsrv_noflowq); #ifdef TCP_OFFLOAD #define NOFLDTXQ_10G 8 static int t4_nofldtxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq10g", &t4_nofldtxq10g); #define NOFLDRXQ_10G 2 static int t4_nofldrxq10g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq10g", &t4_nofldrxq10g); #define NOFLDTXQ_1G 2 static int t4_nofldtxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldtxq1g", &t4_nofldtxq1g); #define NOFLDRXQ_1G 1 static int t4_nofldrxq1g = -1; TUNABLE_INT("hw.cxgbe.nofldrxq1g", &t4_nofldrxq1g); #define NOFLDTXQ_VI 1 static int t4_nofldtxq_vi = -1; TUNABLE_INT("hw.cxgbe.nofldtxq_vi", &t4_nofldtxq_vi); #define NOFLDRXQ_VI 1 static int t4_nofldrxq_vi = -1; TUNABLE_INT("hw.cxgbe.nofldrxq_vi", &t4_nofldrxq_vi); #endif #ifdef DEV_NETMAP #define 
NNMTXQ_VI 2 static int t4_nnmtxq_vi = -1; TUNABLE_INT("hw.cxgbe.nnmtxq_vi", &t4_nnmtxq_vi); #define NNMRXQ_VI 2 static int t4_nnmrxq_vi = -1; TUNABLE_INT("hw.cxgbe.nnmrxq_vi", &t4_nnmrxq_vi); #endif /* * Holdoff parameters for 10G and 1G ports. */ #define TMR_IDX_10G 1 static int t4_tmr_idx_10g = TMR_IDX_10G; TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx_10g); #define PKTC_IDX_10G (-1) static int t4_pktc_idx_10g = PKTC_IDX_10G; TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx_10g); #define TMR_IDX_1G 1 static int t4_tmr_idx_1g = TMR_IDX_1G; TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_1G", &t4_tmr_idx_1g); #define PKTC_IDX_1G (-1) static int t4_pktc_idx_1g = PKTC_IDX_1G; TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_1G", &t4_pktc_idx_1g); /* * Size (# of entries) of each tx and rx queue. */ static unsigned int t4_qsize_txq = TX_EQ_QSIZE; TUNABLE_INT("hw.cxgbe.qsize_txq", &t4_qsize_txq); static unsigned int t4_qsize_rxq = RX_IQ_QSIZE; TUNABLE_INT("hw.cxgbe.qsize_rxq", &t4_qsize_rxq); /* * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively). */ static int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX; TUNABLE_INT("hw.cxgbe.interrupt_types", &t4_intr_types); /* * Configuration file. */ #define DEFAULT_CF "default" #define FLASH_CF "flash" #define UWIRE_CF "uwire" #define FPGA_CF "fpga" static char t4_cfg_file[32] = DEFAULT_CF; TUNABLE_STR("hw.cxgbe.config_file", t4_cfg_file, sizeof(t4_cfg_file)); /* * PAUSE settings (bit 0, 1 = rx_pause, tx_pause respectively). * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water * mark or when signalled to do so, 0 to never emit PAUSE. */ static int t4_pause_settings = PAUSE_TX | PAUSE_RX; TUNABLE_INT("hw.cxgbe.pause_settings", &t4_pause_settings); /* * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed, * encouraged respectively). 
*/ static unsigned int t4_fw_install = 1; TUNABLE_INT("hw.cxgbe.fw_install", &t4_fw_install); /* * ASIC features that will be used. Disable the ones you don't want so that the * chip resources aren't wasted on features that will not be used. */ static int t4_nbmcaps_allowed = 0; TUNABLE_INT("hw.cxgbe.nbmcaps_allowed", &t4_nbmcaps_allowed); static int t4_linkcaps_allowed = 0; /* No DCBX, PPP, etc. by default */ TUNABLE_INT("hw.cxgbe.linkcaps_allowed", &t4_linkcaps_allowed); static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS | FW_CAPS_CONFIG_SWITCH_EGRESS; TUNABLE_INT("hw.cxgbe.switchcaps_allowed", &t4_switchcaps_allowed); static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC; TUNABLE_INT("hw.cxgbe.niccaps_allowed", &t4_niccaps_allowed); static int t4_toecaps_allowed = -1; TUNABLE_INT("hw.cxgbe.toecaps_allowed", &t4_toecaps_allowed); static int t4_rdmacaps_allowed = -1; TUNABLE_INT("hw.cxgbe.rdmacaps_allowed", &t4_rdmacaps_allowed); static int t4_tlscaps_allowed = 0; TUNABLE_INT("hw.cxgbe.tlscaps_allowed", &t4_tlscaps_allowed); static int t4_iscsicaps_allowed = -1; TUNABLE_INT("hw.cxgbe.iscsicaps_allowed", &t4_iscsicaps_allowed); static int t4_fcoecaps_allowed = 0; TUNABLE_INT("hw.cxgbe.fcoecaps_allowed", &t4_fcoecaps_allowed); static int t5_write_combine = 0; TUNABLE_INT("hw.cxl.write_combine", &t5_write_combine); static int t4_num_vis = 1; TUNABLE_INT("hw.cxgbe.num_vis", &t4_num_vis); /* Functions used by extra VIs to obtain unique MAC addresses for each VI. 
*/
static int vi_mac_funcs[] = {
	FW_VI_FUNC_OFLD,
	FW_VI_FUNC_IWARP,
	FW_VI_FUNC_OPENISCSI,
	FW_VI_FUNC_OPENFCOE,
	FW_VI_FUNC_FOISCSI,
	FW_VI_FUNC_FOFCOE,
};

/*
 * Interrupt type, vector count and per-port queue counts, grouped in one
 * place.  cfg_itype_and_nqueues() (declared below) takes a pointer to one
 * of these.
 */
struct intrs_and_queues {
	uint16_t intr_type;	/* INTx, MSI, or MSI-X */
	uint16_t nirq;		/* Total # of vectors */
	uint16_t intr_flags_10g;/* Interrupt flags for each 10G port */
	uint16_t intr_flags_1g;	/* Interrupt flags for each 1G port */
	uint16_t ntxq10g;	/* # of NIC txq's for each 10G port */
	uint16_t nrxq10g;	/* # of NIC rxq's for each 10G port */
	uint16_t ntxq1g;	/* # of NIC txq's for each 1G port */
	uint16_t nrxq1g;	/* # of NIC rxq's for each 1G port */
	uint16_t rsrv_noflowq;	/* Flag whether to reserve queue 0 */
	uint16_t nofldtxq10g;	/* # of TOE txq's for each 10G port */
	uint16_t nofldrxq10g;	/* # of TOE rxq's for each 10G port */
	uint16_t nofldtxq1g;	/* # of TOE txq's for each 1G port */
	uint16_t nofldrxq1g;	/* # of TOE rxq's for each 1G port */

	/* The vcxgbe/vcxl interfaces use these and not the ones above. */
	uint16_t ntxq_vi;	/* # of NIC txq's */
	uint16_t nrxq_vi;	/* # of NIC rxq's */
	uint16_t nofldtxq_vi;	/* # of TOE txq's */
	uint16_t nofldrxq_vi;	/* # of TOE rxq's */
	uint16_t nnmtxq_vi;	/* # of netmap txq's */
	uint16_t nnmrxq_vi;	/* # of netmap rxq's */
};

/*
 * Driver-side record for one filter: status bits, the table references the
 * filter uses, and the filter specification itself.
 */
struct filter_entry {
	uint32_t valid:1;	/* filter allocated and valid */
	uint32_t locked:1;	/* filter is administratively locked */
	uint32_t pending:1;	/* filter action is pending firmware reply */
	uint32_t smtidx:8;	/* Source MAC Table index for smac */
	struct l2t_entry *l2t;	/* Layer Two Table entry for dmac */

	struct t4_filter_specification fs;
};

static int map_bars_0_and_4(struct adapter *);
static int map_bar_2(struct adapter *);
static void setup_memwin(struct adapter *);
static void position_memwin(struct adapter *, int, uint32_t);
static int rw_via_memwin(struct adapter *, int, uint32_t, uint32_t *, int, int);
static inline int read_via_memwin(struct adapter *, int, uint32_t, uint32_t *,
    int);
static inline int write_via_memwin(struct
adapter *, int, uint32_t, const uint32_t *, int); static int validate_mem_range(struct adapter *, uint32_t, int); static int fwmtype_to_hwmtype(int); static int validate_mt_off_len(struct adapter *, int, uint32_t, int, uint32_t *); static int fixup_devlog_params(struct adapter *); static int cfg_itype_and_nqueues(struct adapter *, int, int, int, struct intrs_and_queues *); static int prep_firmware(struct adapter *); static int partition_resources(struct adapter *, const struct firmware *, const char *); static int get_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); static void build_medialist(struct port_info *, struct ifmedia *); static int cxgbe_init_synchronized(struct vi_info *); static int cxgbe_uninit_synchronized(struct vi_info *); static int setup_intr_handlers(struct adapter *); static void quiesce_txq(struct adapter *, struct sge_txq *); static void quiesce_wrq(struct adapter *, struct sge_wrq *); static void quiesce_iq(struct adapter *, struct sge_iq *); static void quiesce_fl(struct adapter *, struct sge_fl *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, driver_intr_t *, void *, char *); static int t4_free_irq(struct adapter *, struct irq *); static void get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void vi_refresh_stats(struct adapter *, struct vi_info *); static void cxgbe_refresh_stats(struct adapter *, struct port_info *); static void cxgbe_tick(void *); static void cxgbe_vlan_config(void *, struct ifnet *, uint16_t); static void t4_sysctls(struct adapter *); static void cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int 
sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS); static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS); static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS); static int sysctl_temperature(SYSCTL_HANDLER_ARGS); #ifdef SBUF_DRAIN static int sysctl_cctrl(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la_t6(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS); static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_devlog(SYSCTL_HANDLER_ARGS); static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS); static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS); static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS); static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS); static int sysctl_meminfo(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS); static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS); static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS); static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tids(SYSCTL_HANDLER_ARGS); static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la(SYSCTL_HANDLER_ARGS); static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tc_params(SYSCTL_HANDLER_ARGS); #endif #ifdef TCP_OFFLOAD static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS); static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS); static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS); #endif static uint32_t fconf_iconf_to_mode(uint32_t, uint32_t); 
static uint32_t mode_to_fconf(uint32_t); static uint32_t mode_to_iconf(uint32_t); static int check_fspec_against_fconf_iconf(struct adapter *, struct t4_filter_specification *); static int get_filter_mode(struct adapter *, uint32_t *); static int set_filter_mode(struct adapter *, uint32_t); static inline uint64_t get_filter_hits(struct adapter *, uint32_t); static int get_filter(struct adapter *, struct t4_filter *); static int set_filter(struct adapter *, struct t4_filter *); static int del_filter(struct adapter *, struct t4_filter *); static void clear_filter(struct filter_entry *); static int set_filter_wr(struct adapter *, int); static int del_filter_wr(struct adapter *, int); static int set_tcb_rpl(struct sge_iq *, const struct rss_header *, struct mbuf *); static int get_sge_context(struct adapter *, struct t4_sge_context *); static int load_fw(struct adapter *, struct t4_data *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); static int set_sched_class(struct adapter *, struct t4_sched_params *); static int set_sched_queue(struct adapter *, struct t4_sched_queue *); #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *, int); #endif static int mod_event(module_t, int, void *); static int notify_siblings(device_t, int); struct { uint16_t device; char *desc; } t4_pciids[] = { {0xa000, "Chelsio Terminator 4 FPGA"}, {0x4400, "Chelsio T440-dbg"}, {0x4401, "Chelsio T420-CR"}, {0x4402, "Chelsio T422-CR"}, {0x4403, "Chelsio T440-CR"}, {0x4404, "Chelsio T420-BCH"}, {0x4405, "Chelsio T440-BCH"}, {0x4406, "Chelsio T440-CH"}, {0x4407, "Chelsio T420-SO"}, {0x4408, "Chelsio T420-CX"}, {0x4409, "Chelsio T420-BT"}, {0x440a, "Chelsio T404-BT"}, {0x440e, "Chelsio T440-LP-CR"}, }, t5_pciids[] = { {0xb000, "Chelsio Terminator 5 FPGA"}, {0x5400, "Chelsio T580-dbg"}, {0x5401, "Chelsio T520-CR"}, /* 2 x 10G */ {0x5402, "Chelsio T522-CR"}, /* 2 x 10G, 2 X 1G */ {0x5403, "Chelsio T540-CR"}, 
/* 4 x 10G */ {0x5407, "Chelsio T520-SO"}, /* 2 x 10G, nomem */ {0x5409, "Chelsio T520-BT"}, /* 2 x 10GBaseT */ {0x540a, "Chelsio T504-BT"}, /* 4 x 1G */ {0x540d, "Chelsio T580-CR"}, /* 2 x 40G */ {0x540e, "Chelsio T540-LP-CR"}, /* 4 x 10G */ {0x5410, "Chelsio T580-LP-CR"}, /* 2 x 40G */ {0x5411, "Chelsio T520-LL-CR"}, /* 2 x 10G */ {0x5412, "Chelsio T560-CR"}, /* 1 x 40G, 2 x 10G */ {0x5414, "Chelsio T580-LP-SO-CR"}, /* 2 x 40G, nomem */ {0x5415, "Chelsio T502-BT"}, /* 2 x 1G */ #ifdef notyet {0x5404, "Chelsio T520-BCH"}, {0x5405, "Chelsio T540-BCH"}, {0x5406, "Chelsio T540-CH"}, {0x5408, "Chelsio T520-CX"}, {0x540b, "Chelsio B520-SR"}, {0x540c, "Chelsio B504-BT"}, {0x540f, "Chelsio Amsterdam"}, {0x5413, "Chelsio T580-CHR"}, #endif }; #ifdef TCP_OFFLOAD /* * service_iq() has an iq and needs the fl. Offset of fl from the iq should be * exactly the same for both rxq and ofld_rxq. */ CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); static int t4_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xa000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t4_pciids); i++) { if (d == t4_pciids[i].device) { device_set_desc(dev, t4_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t5_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xb000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t5_pciids); i++) { if (d == t5_pciids[i].device) { device_set_desc(dev, t5_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } 
static void t5_attribute_workaround(device_t dev) { device_t root_port; uint32_t v; /* * The T5 chips do not properly echo the No Snoop and Relaxed * Ordering attributes when replying to a TLP from a Root * Port. As a workaround, find the parent Root Port and * disable No Snoop and Relaxed Ordering. Note that this * affects all devices under this root port. */ root_port = pci_find_pcie_root_port(dev); if (root_port == NULL) { device_printf(dev, "Unable to find parent root port\n"); return; } v = pcie_adjust_config(root_port, PCIER_DEVICE_CTL, PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE, 0, 2); if ((v & (PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE)) != 0) device_printf(dev, "Disabled No Snoop/Relaxed Ordering on %s\n", device_get_nameunit(root_port)); } static int t4_attach(device_t dev) { struct adapter *sc; int rc = 0, i, j, n10g, n1g, rqidx, tqidx; struct make_dev_args mda; struct intrs_and_queues iaq; struct sge *s; uint8_t *buf; #ifdef TCP_OFFLOAD int ofld_rqidx, ofld_tqidx; #endif #ifdef DEV_NETMAP int nm_rqidx, nm_tqidx; #endif int num_vis; sc = device_get_softc(dev); sc->dev = dev; TUNABLE_INT_FETCH("hw.cxgbe.debug_flags", &sc->debug_flags); if ((pci_get_device(dev) & 0xff00) == 0x5400) t5_attribute_workaround(dev); pci_enable_busmaster(dev); if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) { uint32_t v; pci_set_max_read_req(dev, 4096); v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2); v |= PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5); } + sc->sge_gts_reg = MYPF_REG(A_SGE_PF_GTS); + sc->sge_kdoorbell_reg = MYPF_REG(A_SGE_PF_KDOORBELL); sc->traceq = -1; mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF); snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer", device_get_nameunit(dev)); snprintf(sc->lockname, sizeof(sc->lockname), "%s", device_get_nameunit(dev)); mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF); 
sx_xlock(&t4_list_lock); SLIST_INSERT_HEAD(&t4_list, sc, link); sx_xunlock(&t4_list_lock); mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF); TAILQ_INIT(&sc->sfl); callout_init_mtx(&sc->sfl_callout, &sc->sfl_lock, 0); mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF); rc = map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ /* * This is the real PF# to which we're attaching. Works from within PCI * passthrough environments too, where pci_get_function() could return a * different PF# depending on the passthrough configuration. We need to * use the real PF# in all our communication with the firmware. */ sc->pf = G_SOURCEPF(t4_read_reg(sc, A_PL_WHOAMI)); sc->mbox = sc->pf; memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); /* Prepare the adapter for operation. */ buf = malloc(PAGE_SIZE, M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_prep_adapter(sc, buf); free(buf, M_CXGBE); if (rc != 0) { device_printf(dev, "failed to prepare adapter: %d.\n", rc); goto done; } /* * Do this really early, with the memory windows set up even before the * character device. The userland tool's register i/o and mem read * will work even in "recovery mode". */ setup_memwin(sc); if (t4_init_devlog_params(sc, 0) == 0) fixup_devlog_params(sc); make_dev_args_init(&mda); mda.mda_devsw = &t4_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; rc = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); if (rc != 0) device_printf(dev, "failed to create nexus char device: %d.\n", rc); /* Go no further if recovery mode has been requested. 
*/ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { device_printf(dev, "recovery mode.\n"); goto done; } #if defined(__i386__) if ((cpu_feature & CPUID_CX8) == 0) { device_printf(dev, "64 bit atomics not available.\n"); rc = ENOTSUP; goto done; } #endif /* Prepare the firmware for operation */ rc = prep_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = map_bar_2(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_create_dma_tag(sc); if (rc != 0) goto done; /* error message displayed already */ /* * Number of VIs to create per-port. The first VI is the "main" regular * VI for the port. The rest are additional virtual interfaces on the * same physical port. Note that the main VI does not have native * netmap support but the extra VIs do. * * Limit the number of VIs per port to the number of available * MAC addresses per port. */ if (t4_num_vis >= 1) num_vis = t4_num_vis; else num_vis = 1; if (num_vis > nitems(vi_mac_funcs)) { num_vis = nitems(vi_mac_funcs); device_printf(dev, "Number of VIs limited to %d\n", num_vis); } /* * First pass over all the ports - allocate VIs and initialize some * basic parameters like mac address, port type, etc. We also figure * out whether a port is 10G or 1G and use that information when * calculating how many interrupts to attempt to allocate. */ n10g = n1g = 0; for_each_port(sc, i) { struct port_info *pi; pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK); sc->port[i] = pi; /* These must be set before t4_port_init */ pi->adapter = sc; pi->port_id = i; /* * XXX: vi[0] is special so we can't delay this allocation until * pi->nvi's final value is known. 
*/ pi->vi = malloc(sizeof(struct vi_info) * num_vis, M_CXGBE, M_ZERO | M_WAITOK); /* * Allocate the "main" VI and initialize parameters * like mac addr. */ rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i); if (rc != 0) { device_printf(dev, "unable to initialize port %d: %d\n", i, rc); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } pi->link_cfg.requested_fc &= ~(PAUSE_TX | PAUSE_RX); pi->link_cfg.requested_fc |= t4_pause_settings; pi->link_cfg.fc &= ~(PAUSE_TX | PAUSE_RX); pi->link_cfg.fc |= t4_pause_settings; rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, &pi->link_cfg); if (rc != 0) { device_printf(dev, "port %d l1cfg failed: %d\n", i, rc); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d", device_get_nameunit(dev), i); mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF); sc->chan_map[pi->tx_chan] = i; pi->tc = malloc(sizeof(struct tx_sched_class) * sc->chip_params->nsched_cls, M_CXGBE, M_ZERO | M_WAITOK); if (is_10G_port(pi) || is_40G_port(pi)) { n10g++; } else { n1g++; } pi->linkdnrc = -1; pi->dev = device_add_child(dev, is_t4(sc) ? "cxgbe" : "cxl", -1); if (pi->dev == NULL) { device_printf(dev, "failed to add device for port %d.\n", i); rc = ENXIO; goto done; } pi->vi[0].dev = pi->dev; device_set_softc(pi->dev, pi); } /* * Interrupt type, # of interrupts, # of rx/tx queues, etc. 
*/ rc = cfg_itype_and_nqueues(sc, n10g, n1g, num_vis, &iaq); if (rc != 0) goto done; /* error message displayed already */ if (iaq.nrxq_vi + iaq.nofldrxq_vi + iaq.nnmrxq_vi == 0) num_vis = 1; sc->intr_type = iaq.intr_type; sc->intr_count = iaq.nirq; s = &sc->sge; s->nrxq = n10g * iaq.nrxq10g + n1g * iaq.nrxq1g; s->ntxq = n10g * iaq.ntxq10g + n1g * iaq.ntxq1g; if (num_vis > 1) { s->nrxq += (n10g + n1g) * (num_vis - 1) * iaq.nrxq_vi; s->ntxq += (n10g + n1g) * (num_vis - 1) * iaq.ntxq_vi; } s->neq = s->ntxq + s->nrxq; /* the free list in an rxq is an eq */ s->neq += sc->params.nports + 1;/* ctrl queues: 1 per port + 1 mgmt */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ #ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = n10g * iaq.nofldrxq10g + n1g * iaq.nofldrxq1g; s->nofldtxq = n10g * iaq.nofldtxq10g + n1g * iaq.nofldtxq1g; if (num_vis > 1) { s->nofldrxq += (n10g + n1g) * (num_vis - 1) * iaq.nofldrxq_vi; s->nofldtxq += (n10g + n1g) * (num_vis - 1) * iaq.nofldtxq_vi; } s->neq += s->nofldtxq + s->nofldrxq; s->niq += s->nofldrxq; s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef DEV_NETMAP if (num_vis > 1) { s->nnmrxq = (n10g + n1g) * (num_vis - 1) * iaq.nnmrxq_vi; s->nnmtxq = (n10g + n1g) * (num_vis - 1) * iaq.nnmtxq_vi; } s->neq += s->nnmtxq + s->nnmrxq; s->niq += s->nnmrxq; s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq), M_CXGBE, M_ZERO | M_WAITOK); #endif s->ctrlq = malloc(sc->params.nports * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE, M_ZERO | M_WAITOK); s->iqmap = malloc(s->niq * sizeof(struct sge_iq *), M_CXGBE, M_ZERO | M_WAITOK); s->eqmap = malloc(s->neq * 
sizeof(struct sge_eq *), M_CXGBE, M_ZERO | M_WAITOK); sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE, M_ZERO | M_WAITOK); t4_init_l2t(sc, M_WAITOK); /* * Second pass over the ports. This time we know the number of rx and * tx queues that each port should get. */ rqidx = tqidx = 0; #ifdef TCP_OFFLOAD ofld_rqidx = ofld_tqidx = 0; #endif #ifdef DEV_NETMAP nm_rqidx = nm_tqidx = 0; #endif for_each_port(sc, i) { struct port_info *pi = sc->port[i]; struct vi_info *vi; if (pi == NULL) continue; pi->nvi = num_vis; for_each_vi(pi, j, vi) { vi->pi = pi; vi->qsize_rxq = t4_qsize_rxq; vi->qsize_txq = t4_qsize_txq; vi->first_rxq = rqidx; vi->first_txq = tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { vi->tmr_idx = t4_tmr_idx_10g; vi->pktc_idx = t4_pktc_idx_10g; vi->flags |= iaq.intr_flags_10g & INTR_RXQ; vi->nrxq = j == 0 ? iaq.nrxq10g : iaq.nrxq_vi; vi->ntxq = j == 0 ? iaq.ntxq10g : iaq.ntxq_vi; } else { vi->tmr_idx = t4_tmr_idx_1g; vi->pktc_idx = t4_pktc_idx_1g; vi->flags |= iaq.intr_flags_1g & INTR_RXQ; vi->nrxq = j == 0 ? iaq.nrxq1g : iaq.nrxq_vi; vi->ntxq = j == 0 ? iaq.ntxq1g : iaq.ntxq_vi; } rqidx += vi->nrxq; tqidx += vi->ntxq; if (j == 0 && vi->ntxq > 1) vi->rsrv_noflowq = iaq.rsrv_noflowq ? 1 : 0; else vi->rsrv_noflowq = 0; #ifdef TCP_OFFLOAD vi->first_ofld_rxq = ofld_rqidx; vi->first_ofld_txq = ofld_tqidx; if (is_10G_port(pi) || is_40G_port(pi)) { vi->flags |= iaq.intr_flags_10g & INTR_OFLD_RXQ; vi->nofldrxq = j == 0 ? iaq.nofldrxq10g : iaq.nofldrxq_vi; vi->nofldtxq = j == 0 ? iaq.nofldtxq10g : iaq.nofldtxq_vi; } else { vi->flags |= iaq.intr_flags_1g & INTR_OFLD_RXQ; vi->nofldrxq = j == 0 ? iaq.nofldrxq1g : iaq.nofldrxq_vi; vi->nofldtxq = j == 0 ? 
iaq.nofldtxq1g : iaq.nofldtxq_vi; } ofld_rqidx += vi->nofldrxq; ofld_tqidx += vi->nofldtxq; #endif #ifdef DEV_NETMAP if (j > 0) { vi->first_nm_rxq = nm_rqidx; vi->first_nm_txq = nm_tqidx; vi->nnmrxq = iaq.nnmrxq_vi; vi->nnmtxq = iaq.nnmtxq_vi; nm_rqidx += vi->nnmrxq; nm_tqidx += vi->nnmtxq; } #endif } } rc = setup_intr_handlers(sc); if (rc != 0) { device_printf(dev, "failed to setup interrupt handlers: %d\n", rc); goto done; } rc = bus_generic_attach(dev); if (rc != 0) { device_printf(dev, "failed to attach all child ports: %d\n", rc); goto done; } device_printf(dev, "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n", sc->params.pci.speed, sc->params.pci.width, sc->params.nports, sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" : (sc->intr_type == INTR_MSI ? "MSI" : "INTx"), sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq); t4_set_desc(sc); notify_siblings(dev, 0); done: if (rc != 0 && sc->cdev) { /* cdev was created and so cxgbetool works; recover that way. */ device_printf(dev, "error during attach, adapter is now in recovery mode.\n"); rc = 0; } if (rc != 0) t4_detach(dev); else t4_sysctls(sc); return (rc); } static int t4_ready(device_t dev) { struct adapter *sc; sc = device_get_softc(dev); if (sc->flags & FW_OK) return (0); return (ENXIO); } static int t4_read_port_unit(device_t dev, int port, int *unit) { struct adapter *sc; struct port_info *pi; sc = device_get_softc(dev); if (port < 0 || port >= MAX_NPORTS) return (EINVAL); pi = sc->port[port]; if (pi == NULL || pi->dev == NULL) return (ENXIO); *unit = device_get_unit(pi->dev); return (0); } static int notify_siblings(device_t dev, int detaching) { device_t sibling; int error, i; error = 0; for (i = 0; i < PCI_FUNCMAX; i++) { if (i == pci_get_function(dev)) continue; sibling = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), i); if (sibling == NULL || !device_is_attached(sibling)) continue; if (detaching) error = T4_DETACH_CHILD(sibling); else 
(void)T4_ATTACH_CHILD(sibling); if (error) break; } return (error); } /* * Idempotent */ static int t4_detach(device_t dev) { struct adapter *sc; struct port_info *pi; int i, rc; sc = device_get_softc(dev); rc = notify_siblings(dev, 1); if (rc) { device_printf(dev, "failed to detach sibling devices: %d\n", rc); return (rc); } if (sc->flags & FULL_INIT_DONE) t4_intr_disable(sc); if (sc->cdev) { destroy_dev(sc->cdev); sc->cdev = NULL; } rc = bus_generic_detach(dev); if (rc) { device_printf(dev, "failed to detach child devices: %d\n", rc); return (rc); } for (i = 0; i < sc->intr_count; i++) t4_free_irq(sc, &sc->irq[i]); for (i = 0; i < MAX_NPORTS; i++) { pi = sc->port[i]; if (pi) { t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->vi[0].viid); if (pi->dev) device_delete_child(dev, pi->dev); mtx_destroy(&pi->pi_lock); free(pi->vi, M_CXGBE); free(pi->tc, M_CXGBE); free(pi, M_CXGBE); } } if (sc->flags & FULL_INIT_DONE) adapter_full_uninit(sc); if (sc->flags & FW_OK) t4_fw_bye(sc, sc->mbox); if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX) pci_release_msi(dev); if (sc->regs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid, sc->regs_res); if (sc->udbs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); if (sc->msix_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid, sc->msix_res); if (sc->l2t) t4_free_l2t(sc->l2t); #ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); free(sc->sge.ofld_txq, M_CXGBE); #endif #ifdef DEV_NETMAP free(sc->sge.nm_rxq, M_CXGBE); free(sc->sge.nm_txq, M_CXGBE); #endif free(sc->irq, M_CXGBE); free(sc->sge.rxq, M_CXGBE); free(sc->sge.txq, M_CXGBE); free(sc->sge.ctrlq, M_CXGBE); free(sc->sge.iqmap, M_CXGBE); free(sc->sge.eqmap, M_CXGBE); free(sc->tids.ftid_tab, M_CXGBE); t4_destroy_dma_tag(sc); if (mtx_initialized(&sc->sc_lock)) { sx_xlock(&t4_list_lock); SLIST_REMOVE(&t4_list, sc, adapter, link); sx_xunlock(&t4_list_lock); mtx_destroy(&sc->sc_lock); } callout_drain(&sc->sfl_callout); if 
(mtx_initialized(&sc->tids.ftid_lock)) mtx_destroy(&sc->tids.ftid_lock); if (mtx_initialized(&sc->sfl_lock)) mtx_destroy(&sc->sfl_lock); if (mtx_initialized(&sc->ifp_lock)) mtx_destroy(&sc->ifp_lock); if (mtx_initialized(&sc->reg_lock)) mtx_destroy(&sc->reg_lock); for (i = 0; i < NUM_MEMWIN; i++) { struct memwin *mw = &sc->memwin[i]; if (rw_initialized(&mw->mw_lock)) rw_destroy(&mw->mw_lock); } bzero(sc, sizeof(*sc)); return (0); } static int cxgbe_probe(device_t dev) { char buf[128]; struct port_info *pi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d", pi->port_id); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS) #define T4_CAP_ENABLE (T4_CAP) static int cxgbe_vi_attach(device_t dev, struct vi_info *vi) { struct ifnet *ifp; struct sbuf *sb; vi->xact_addr_filt = -1; callout_init(&vi->tick, 1); /* Allocate an ifnet and set it up */ ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); } vi->ifp = ifp; ifp->if_softc = vi; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = cxgbe_init; ifp->if_ioctl = cxgbe_ioctl; ifp->if_transmit = cxgbe_transmit; ifp->if_qflush = cxgbe_qflush; ifp->if_get_counter = cxgbe_get_counter; ifp->if_capabilities = T4_CAP; #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) ifp->if_capabilities |= IFCAP_TOE; #endif ifp->if_capenable = T4_CAP_ENABLE; ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6; ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS; ifp->if_hw_tsomaxsegsize = 65536; /* Initialize ifmedia for this VI */ ifmedia_init(&vi->media, IFM_IMASK, cxgbe_media_change, 
cxgbe_media_status); build_medialist(vi->pi, &vi->media); vi->vlan_c = EVENTHANDLER_REGISTER(vlan_config, cxgbe_vlan_config, ifp, EVENTHANDLER_PRI_ANY); ether_ifattach(ifp, vi->hw_addr); #ifdef DEV_NETMAP if (vi->nnmrxq != 0) cxgbe_nm_attach(vi); #endif sb = sbuf_new_auto(); sbuf_printf(sb, "%d txq, %d rxq (NIC)", vi->ntxq, vi->nrxq); #ifdef TCP_OFFLOAD if (ifp->if_capabilities & IFCAP_TOE) sbuf_printf(sb, "; %d txq, %d rxq (TOE)", vi->nofldtxq, vi->nofldrxq); #endif #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) sbuf_printf(sb, "; %d txq, %d rxq (netmap)", vi->nnmtxq, vi->nnmrxq); #endif sbuf_finish(sb); device_printf(dev, "%s\n", sbuf_data(sb)); sbuf_delete(sb); vi_sysctls(vi); return (0); } static int cxgbe_attach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct vi_info *vi; int i, rc; callout_init_mtx(&pi->tick, &pi->pi_lock, 0); rc = cxgbe_vi_attach(dev, &pi->vi[0]); if (rc) return (rc); for_each_vi(pi, i, vi) { if (i == 0) continue; vi->dev = device_add_child(dev, is_t4(pi->adapter) ? "vcxgbe" : "vcxl", -1); if (vi->dev == NULL) { device_printf(dev, "failed to add VI %d\n", i); continue; } device_set_softc(vi->dev, vi); } cxgbe_sysctls(pi); bus_generic_attach(dev); return (0); } static void cxgbe_vi_detach(struct vi_info *vi) { struct ifnet *ifp = vi->ifp; ether_ifdetach(ifp); if (vi->vlan_c) EVENTHANDLER_DEREGISTER(vlan_config, vi->vlan_c); /* Let detach proceed even if these fail. */ #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) cxgbe_nm_detach(vi); #endif cxgbe_uninit_synchronized(vi); callout_drain(&vi->tick); vi_full_uninit(vi); ifmedia_removeall(&vi->media); if_free(vi->ifp); vi->ifp = NULL; } static int cxgbe_detach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; int rc; /* Detach the extra VIs first. 
*/ rc = bus_generic_detach(dev); if (rc) return (rc); device_delete_children(dev); doom_vi(sc, &pi->vi[0]); if (pi->flags & HAS_TRACEQ) { sc->traceq = -1; /* cloner should not create ifnet */ t4_tracer_port_detach(sc); } cxgbe_vi_detach(&pi->vi[0]); callout_drain(&pi->tick); end_synchronized_op(sc, 0); return (0); } static void cxgbe_init(void *arg) { struct vi_info *vi = arg; struct adapter *sc = vi->pi->adapter; if (begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4init") != 0) return; cxgbe_init_synchronized(vi); end_synchronized_op(sc, 0); } static int cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) { int rc = 0, mtu, flags, can_sleep; struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; struct ifreq *ifr = (struct ifreq *)data; uint32_t mask; switch (cmd) { case SIOCSIFMTU: mtu = ifr->ifr_mtu; if ((mtu < ETHERMIN) || (mtu > ETHERMTU_JUMBO)) return (EINVAL); rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4mtu"); if (rc) return (rc); ifp->if_mtu = mtu; if (vi->flags & VI_INIT_DONE) { t4_update_fl_bufsize(ifp); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MTU); } end_synchronized_op(sc, 0); break; case SIOCSIFFLAGS: can_sleep = 0; redo_sifflags: rc = begin_synchronized_op(sc, vi, can_sleep ? 
(SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4flg"); if (rc) return (rc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { flags = vi->if_flags; if ((ifp->if_flags ^ flags) & (IFF_PROMISC | IFF_ALLMULTI)) { if (can_sleep == 1) { end_synchronized_op(sc, 0); can_sleep = 0; goto redo_sifflags; } rc = update_mac_settings(ifp, XGMAC_PROMISC | XGMAC_ALLMULTI); } } else { if (can_sleep == 0) { end_synchronized_op(sc, LOCK_HELD); can_sleep = 1; goto redo_sifflags; } rc = cxgbe_init_synchronized(vi); } vi->if_flags = ifp->if_flags; } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (can_sleep == 0) { end_synchronized_op(sc, LOCK_HELD); can_sleep = 1; goto redo_sifflags; } rc = cxgbe_uninit_synchronized(vi); } end_synchronized_op(sc, can_sleep ? 0 : LOCK_HELD); break; case SIOCADDMULTI: case SIOCDELMULTI: /* these two are called with a mutex held :-( */ rc = begin_synchronized_op(sc, vi, HOLD_LOCK, "t4multi"); if (rc) return (rc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MCADDRS); end_synchronized_op(sc, LOCK_HELD); break; case SIOCSIFCAP: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4cap"); if (rc) return (rc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO4; if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & ifp->if_capenable && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO6; if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; /* * Note that we leave CSUM_TSO alone (it is always set). 
The * kernel takes both IFCAP_TSOx and CSUM_TSO into account before * sending a TSO request our way, so it's sufficient to toggle * IFCAP_TSOx only. */ if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & ifp->if_capenable) && !(IFCAP_TXCSUM & ifp->if_capenable)) { if_printf(ifp, "enable txcsum first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO4; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & ifp->if_capenable) && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { if_printf(ifp, "enable txcsum6 first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO6; } if (mask & IFCAP_LRO) { #if defined(INET) || defined(INET6) int i; struct sge_rxq *rxq; ifp->if_capenable ^= IFCAP_LRO; for_each_rxq(vi, i, rxq) { if (ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; else rxq->iq.flags &= ~IQ_LRO_ENABLED; } #endif } #ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; rc = toe_capability(vi, enable); if (rc != 0) goto fail; ifp->if_capenable ^= mask; } #endif if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_VLANEX); } if (mask & IFCAP_VLAN_MTU) { ifp->if_capenable ^= IFCAP_VLAN_MTU; /* Need to find out how to disable auto-mtu-inflation */ } if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_VLAN_HWCSUM) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); #endif fail: end_synchronized_op(sc, 0); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: ifmedia_ioctl(ifp, ifr, &vi->media, cmd); break; case SIOCGI2C: { struct ifi2creq i2c; rc = copyin(ifr->ifr_data, &i2c, sizeof(i2c)); if (rc != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { rc = EPERM; break; } if (i2c.len > sizeof(i2c.data)) { rc = EINVAL; break; } rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, 
vi->pi->port_id, i2c.dev_addr, i2c.offset, i2c.len, &i2c.data[0]); end_synchronized_op(sc, 0); if (rc == 0) rc = copyout(&i2c, ifr->ifr_data, sizeof(i2c)); break; } default: rc = ether_ioctl(ifp, cmd, data); } return (rc); } static int cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_txq *txq; void *items[1]; int rc; M_ASSERTPKTHDR(m); MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ if (__predict_false(pi->link_cfg.link_ok == 0)) { m_freem(m); return (ENETDOWN); } rc = parse_pkt(&m); if (__predict_false(rc != 0)) { MPASS(m == NULL); /* was freed already */ atomic_add_int(&pi->tx_parse_error, 1); /* rare, atomic is ok */ return (rc); } /* Select a txq. */ txq = &sc->sge.txq[vi->first_txq]; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) + vi->rsrv_noflowq); items[0] = m; rc = mp_ring_enqueue(txq->r, items, 1, 4096); if (__predict_false(rc != 0)) m_freem(m); return (rc); } static void cxgbe_qflush(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct sge_txq *txq; int i; /* queues do not exist if !VI_INIT_DONE. 
*/ if (vi->flags & VI_INIT_DONE) { for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 0); pause("qflush", 1); } } } if_qflush(ifp); } static uint64_t vi_get_counter(struct ifnet *ifp, ift_counter c) { struct vi_info *vi = ifp->if_softc; struct fw_vi_stats_vf *s = &vi->stats; vi_refresh_stats(vi->pi->adapter, vi); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_bcast_frames + s->rx_mcast_frames + s->rx_ucast_frames); case IFCOUNTER_IERRORS: return (s->rx_err_frames); case IFCOUNTER_OPACKETS: return (s->tx_bcast_frames + s->tx_mcast_frames + s->tx_ucast_frames + s->tx_offload_frames); case IFCOUNTER_OERRORS: return (s->tx_drop_frames); case IFCOUNTER_IBYTES: return (s->rx_bcast_bytes + s->rx_mcast_bytes + s->rx_ucast_bytes); case IFCOUNTER_OBYTES: return (s->tx_bcast_bytes + s->tx_mcast_bytes + s->tx_ucast_bytes + s->tx_offload_bytes); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = 0; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->drops); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } uint64_t cxgbe_get_counter(struct ifnet *ifp, ift_counter c) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct port_stats *s = &pi->stats; if (pi->nvi > 1) return (vi_get_counter(ifp, c)); cxgbe_refresh_stats(sc, pi); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_frames); case IFCOUNTER_IERRORS: return (s->rx_jabber + s->rx_runt + s->rx_too_long + s->rx_fcs_err + s->rx_len_err); case IFCOUNTER_OPACKETS: return (s->tx_frames); case IFCOUNTER_OERRORS: return (s->tx_error_frames); case IFCOUNTER_IBYTES: return (s->rx_octets); case IFCOUNTER_OBYTES: return (s->tx_octets); case IFCOUNTER_IMCASTS: return 
(s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_IQDROPS: return (s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 + s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 + s->rx_trunc3 + pi->tnl_cong_drops); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = s->tx_drop; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->drops); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } static int cxgbe_media_change(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; device_printf(vi->dev, "%s unimplemented.\n", __func__); return (EOPNOTSUPP); } static void cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct ifmedia_entry *cur; int speed = pi->link_cfg.speed; cur = vi->media.ifm_cur; ifmr->ifm_status = IFM_AVALID; if (!pi->link_cfg.link_ok) return; ifmr->ifm_status |= IFM_ACTIVE; /* active and current will differ iff current media is autoselect. 
*/ if (IFM_SUBTYPE(cur->ifm_media) != IFM_AUTO) return; ifmr->ifm_active = IFM_ETHER | IFM_FDX; if (speed == 10000) ifmr->ifm_active |= IFM_10G_T; else if (speed == 1000) ifmr->ifm_active |= IFM_1000_T; else if (speed == 100) ifmr->ifm_active |= IFM_100_TX; else if (speed == 10) ifmr->ifm_active |= IFM_10_T; else KASSERT(0, ("%s: link up but speed unknown (%u)", __func__, speed)); } static int vcxgbe_probe(device_t dev) { char buf[128]; struct vi_info *vi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d vi %td", vi->pi->port_id, vi - vi->pi->vi); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } static int vcxgbe_attach(device_t dev) { struct vi_info *vi; struct port_info *pi; struct adapter *sc; int func, index, rc; u32 param, val; vi = device_get_softc(dev); pi = vi->pi; sc = pi->adapter; index = vi - pi->vi; KASSERT(index < nitems(vi_mac_funcs), ("%s: VI %s doesn't have a MAC func", __func__, device_get_nameunit(dev))); func = vi_mac_funcs[index]; rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1, vi->hw_addr, &vi->rss_size, func, 0); if (rc < 0) { device_printf(dev, "Failed to allocate virtual interface " "for port %d: %d\n", pi->port_id, -rc); return (-rc); } vi->viid = rc; param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_RSSINFO) | V_FW_PARAMS_PARAM_YZ(vi->viid); rc = t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc) vi->rss_base = 0xffff; else { /* MPASS((val >> 16) == rss_size); */ vi->rss_base = val & 0xffff; } rc = cxgbe_vi_attach(dev, vi); if (rc) { t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); return (rc); } return (0); } static int vcxgbe_detach(device_t dev) { struct vi_info *vi; struct adapter *sc; vi = device_get_softc(dev); sc = vi->pi->adapter; doom_vi(sc, vi); cxgbe_vi_detach(vi); t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); end_synchronized_op(sc, 0); return (0); } void t4_fatal_err(struct adapter *sc) { t4_set_reg_field(sc, A_SGE_CONTROL, F_GLOBALENABLE, 
0); t4_intr_disable(sc); log(LOG_EMERG, "%s: encountered fatal error, adapter stopped.\n", device_get_nameunit(sc->dev)); } static int map_bars_0_and_4(struct adapter *sc) { sc->regs_rid = PCIR_BAR(0); sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE); if (sc->regs_res == NULL) { device_printf(sc->dev, "cannot map registers.\n"); return (ENXIO); } sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); setbit(&sc->doorbells, DOORBELL_KDB); sc->msix_rid = PCIR_BAR(4); sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->msix_rid, RF_ACTIVE); if (sc->msix_res == NULL) { device_printf(sc->dev, "cannot map MSI-X BAR.\n"); return (ENXIO); } return (0); } static int map_bar_2(struct adapter *sc) { /* * T4: only iWARP driver uses the userspace doorbells. There is no need * to map it if RDMA is disabled. */ if (is_t4(sc) && sc->rdmacaps == 0) return (0); sc->udbs_rid = PCIR_BAR(2); sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->udbs_rid, RF_ACTIVE); if (sc->udbs_res == NULL) { device_printf(sc->dev, "cannot map doorbell BAR.\n"); return (ENXIO); } sc->udbs_base = rman_get_virtual(sc->udbs_res); if (is_t5(sc)) { setbit(&sc->doorbells, DOORBELL_UDB); #if defined(__i386__) || defined(__amd64__) if (t5_write_combine) { int rc; /* * Enable write combining on BAR2. This is the * userspace doorbell BAR and is split into 128B * (UDBS_SEG_SIZE) doorbell regions, each associated * with an egress queue. The first 64B has the doorbell * and the second 64B can be used to submit a tx work * request with an implicit doorbell. 
*/ rc = pmap_change_attr((vm_offset_t)sc->udbs_base, rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING); if (rc == 0) { clrbit(&sc->doorbells, DOORBELL_UDB); setbit(&sc->doorbells, DOORBELL_WCWR); setbit(&sc->doorbells, DOORBELL_UDBWC); } else { device_printf(sc->dev, "couldn't enable write combining: %d\n", rc); } t4_write_reg(sc, A_SGE_STAT_CFG, V_STATSOURCE_T5(7) | V_STATMODE(0)); } #endif } return (0); } struct memwin_init { uint32_t base; uint32_t aperture; }; static const struct memwin_init t4_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 } }; static const struct memwin_init t5_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 }, }; static void setup_memwin(struct adapter *sc) { const struct memwin_init *mw_init; struct memwin *mw; int i; uint32_t bar0; if (is_t4(sc)) { /* * Read low 32b of bar0 indirectly via the hardware backdoor * mechanism. Works from within PCI passthrough environments * too, where rman_get_start() can return a different value. We * need to program the T4 memory window decoders with the actual * addresses that will be coming across the PCIe link. 
*/ bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0)); bar0 &= (uint32_t) PCIM_BAR_MEM_BASE; mw_init = &t4_memwin[0]; } else { /* T5+ use the relative offset inside the PCIe BAR */ bar0 = 0; mw_init = &t5_memwin[0]; } for (i = 0, mw = &sc->memwin[0]; i < NUM_MEMWIN; i++, mw_init++, mw++) { rw_init(&mw->mw_lock, "memory window access"); mw->mw_base = mw_init->base; mw->mw_aperture = mw_init->aperture; mw->mw_curpos = 0; t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i), (mw->mw_base + bar0) | V_BIR(0) | V_WINDOW(ilog2(mw->mw_aperture) - 10)); rw_wlock(&mw->mw_lock); position_memwin(sc, i, 0); rw_wunlock(&mw->mw_lock); } /* flush */ t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2)); } /* * Positions the memory window at the given address in the card's address space. * There are some alignment requirements and the actual position may be at an * address prior to the requested address. mw->mw_curpos always has the actual * position of the window. */ static void position_memwin(struct adapter *sc, int idx, uint32_t addr) { struct memwin *mw; uint32_t pf; uint32_t reg; MPASS(idx >= 0 && idx < NUM_MEMWIN); mw = &sc->memwin[idx]; rw_assert(&mw->mw_lock, RA_WLOCKED); if (is_t4(sc)) { pf = 0; mw->mw_curpos = addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); mw->mw_curpos = addr & ~0x7f; /* start must be 128B aligned */ } reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, idx); t4_write_reg(sc, reg, mw->mw_curpos | pf); t4_read_reg(sc, reg); /* flush */ } static int rw_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len, int rw) { struct memwin *mw; uint32_t mw_end, v; MPASS(idx >= 0 && idx < NUM_MEMWIN); /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len <= 0) return (EINVAL); mw = &sc->memwin[idx]; while (len > 0) { rw_rlock(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; if (addr >= mw_end || addr < mw->mw_curpos) { /* Will need to 
reposition the window */ if (!rw_try_upgrade(&mw->mw_lock)) { rw_runlock(&mw->mw_lock); rw_wlock(&mw->mw_lock); } rw_assert(&mw->mw_lock, RA_WLOCKED); position_memwin(sc, idx, addr); rw_downgrade(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; } rw_assert(&mw->mw_lock, RA_RLOCKED); while (addr < mw_end && len > 0) { if (rw == 0) { v = t4_read_reg(sc, mw->mw_base + addr - mw->mw_curpos); *val++ = le32toh(v); } else { v = *val++; t4_write_reg(sc, mw->mw_base + addr - mw->mw_curpos, htole32(v)); } addr += 4; len -= 4; } rw_runlock(&mw->mw_lock); } return (0); } static inline int read_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, val, len, 0)); } static inline int write_via_memwin(struct adapter *sc, int idx, uint32_t addr, const uint32_t *val, int len) { return (rw_via_memwin(sc, idx, addr, (void *)(uintptr_t)val, len, 1)); } static int t4_range_cmp(const void *a, const void *b) { return ((const struct t4_range *)a)->start - ((const struct t4_range *)b)->start; } /* * Verify that the memory range specified by the addr/len pair is valid within * the card's address space. 
*/ static int validate_mem_range(struct adapter *sc, uint32_t addr, int len) { struct t4_range mem_ranges[4], *r, *next; uint32_t em, addr_len; int i, n, remaining; /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len <= 0) return (EINVAL); /* Enabled memories */ em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); r = &mem_ranges[0]; n = 0; bzero(r, sizeof(mem_ranges)); if (em & F_EDRAM0_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); r->size = G_EDRAM0_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM0_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EDRAM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); r->size = G_EDRAM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EXT_MEM_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); r->size = G_EXT_MEM_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (is_t5(sc) && em & F_EXT_MEM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); r->size = G_EXT_MEM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } MPASS(n <= nitems(mem_ranges)); if (n > 1) { /* Sort and merge the ranges. */ qsort(mem_ranges, n, sizeof(struct t4_range), t4_range_cmp); /* Start from index 0 and examine the next n - 1 entries. */ r = &mem_ranges[0]; for (remaining = n - 1; remaining > 0; remaining--, r++) { MPASS(r->size > 0); /* r is a valid entry. */ next = r + 1; MPASS(next->size > 0); /* and so is the next one. */ while (r->start + r->size >= next->start) { /* Merge the next one into the current entry. 
*/ r->size = max(r->start + r->size, next->start + next->size) - r->start; n--; /* One fewer entry in total. */ if (--remaining == 0) goto done; /* short circuit */ next++; } if (next != r + 1) { /* * Some entries were merged into r and next * points to the first valid entry that couldn't * be merged. */ MPASS(next->size > 0); /* must be valid */ memcpy(r + 1, next, remaining * sizeof(*r)); #ifdef INVARIANTS /* * This so that the foo->size assertion in the * next iteration of the loop do the right * thing for entries that were pulled up and are * no longer valid. */ MPASS(n < nitems(mem_ranges)); bzero(&mem_ranges[n], (nitems(mem_ranges) - n) * sizeof(struct t4_range)); #endif } } done: /* Done merging the ranges. */ MPASS(n > 0); r = &mem_ranges[0]; for (i = 0; i < n; i++, r++) { if (addr >= r->start && addr + len <= r->start + r->size) return (0); } } return (EFAULT); } static int fwmtype_to_hwmtype(int mtype) { switch (mtype) { case FW_MEMTYPE_EDC0: return (MEM_EDC0); case FW_MEMTYPE_EDC1: return (MEM_EDC1); case FW_MEMTYPE_EXTMEM: return (MEM_MC0); case FW_MEMTYPE_EXTMEM1: return (MEM_MC1); default: panic("%s: cannot translate fw mtype %d.", __func__, mtype); } } /* * Verify that the memory range specified by the memtype/offset/len pair is * valid and lies entirely within the memtype specified. The global address of * the start of the range is returned in addr. 
*/ static int validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, int len, uint32_t *addr) { uint32_t em, addr_len, maddr; /* Memory can only be accessed in naturally aligned 4 byte units */ if (off & 3 || len & 3 || len == 0) return (EINVAL); em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); switch (fwmtype_to_hwmtype(mtype)) { case MEM_EDC0: if (!(em & F_EDRAM0_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; break; case MEM_EDC1: if (!(em & F_EDRAM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; break; case MEM_MC: if (!(em & F_EXT_MEM_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; break; case MEM_MC1: if (!is_t5(sc) || !(em & F_EXT_MEM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; break; default: return (EINVAL); } *addr = maddr + off; /* global address */ return (validate_mem_range(sc, *addr, len)); } static int fixup_devlog_params(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; int rc; rc = validate_mt_off_len(sc, dparams->memtype, dparams->start, dparams->size, &dparams->addr); return (rc); } static int cfg_itype_and_nqueues(struct adapter *sc, int n10g, int n1g, int num_vis, struct intrs_and_queues *iaq) { int rc, itype, navail, nrxq10g, nrxq1g, n; int nofldrxq10g = 0, nofldrxq1g = 0; bzero(iaq, sizeof(*iaq)); iaq->ntxq10g = t4_ntxq10g; iaq->ntxq1g = t4_ntxq1g; iaq->ntxq_vi = t4_ntxq_vi; iaq->nrxq10g = nrxq10g = t4_nrxq10g; iaq->nrxq1g = nrxq1g = t4_nrxq1g; iaq->nrxq_vi = t4_nrxq_vi; iaq->rsrv_noflowq = t4_rsrv_noflowq; #ifdef TCP_OFFLOAD if (is_offload(sc)) { iaq->nofldtxq10g = t4_nofldtxq10g; iaq->nofldtxq1g = t4_nofldtxq1g; iaq->nofldtxq_vi = t4_nofldtxq_vi; iaq->nofldrxq10g = nofldrxq10g = t4_nofldrxq10g; iaq->nofldrxq1g = nofldrxq1g = t4_nofldrxq1g; 
iaq->nofldrxq_vi = t4_nofldrxq_vi; } #endif #ifdef DEV_NETMAP iaq->nnmtxq_vi = t4_nnmtxq_vi; iaq->nnmrxq_vi = t4_nnmrxq_vi; #endif for (itype = INTR_MSIX; itype; itype >>= 1) { if ((itype & t4_intr_types) == 0) continue; /* not allowed */ if (itype == INTR_MSIX) navail = pci_msix_count(sc->dev); else if (itype == INTR_MSI) navail = pci_msi_count(sc->dev); else navail = 1; restart: if (navail == 0) continue; iaq->intr_type = itype; iaq->intr_flags_10g = 0; iaq->intr_flags_1g = 0; /* * Best option: an interrupt vector for errors, one for the * firmware event queue, and one for every rxq (NIC and TOE) of * every VI. The VIs that support netmap use the same * interrupts for the NIC rx queues and the netmap rx queues * because only one set of queues is active at a time. */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g * (nrxq10g + nofldrxq10g); iaq->nirq += n1g * (nrxq1g + nofldrxq1g); iaq->nirq += (n10g + n1g) * (num_vis - 1) * max(iaq->nrxq_vi, iaq->nnmrxq_vi); /* See comment above. */ iaq->nirq += (n10g + n1g) * (num_vis - 1) * iaq->nofldrxq_vi; if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { iaq->intr_flags_10g = INTR_ALL; iaq->intr_flags_1g = INTR_ALL; goto allocate; } /* Disable the VIs (and netmap) if there aren't enough intrs */ if (num_vis > 1) { device_printf(sc->dev, "virtual interfaces disabled " "because num_vis=%u with current settings " "(nrxq10g=%u, nrxq1g=%u, nofldrxq10g=%u, " "nofldrxq1g=%u, nrxq_vi=%u nofldrxq_vi=%u, " "nnmrxq_vi=%u) would need %u interrupts but " "only %u are available.\n", num_vis, nrxq10g, nrxq1g, nofldrxq10g, nofldrxq1g, iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi, iaq->nirq, navail); num_vis = 1; iaq->ntxq_vi = iaq->nrxq_vi = 0; iaq->nofldtxq_vi = iaq->nofldrxq_vi = 0; iaq->nnmtxq_vi = iaq->nnmrxq_vi = 0; goto restart; } /* * Second best option: a vector for errors, one for the firmware * event queue, and vectors for either all the NIC rx queues or * all the TOE rx queues. 
The queues that don't get vectors * will forward their interrupts to those that do. */ iaq->nirq = T4_EXTRA_INTR; if (nrxq10g >= nofldrxq10g) { iaq->intr_flags_10g = INTR_RXQ; iaq->nirq += n10g * nrxq10g; } else { iaq->intr_flags_10g = INTR_OFLD_RXQ; iaq->nirq += n10g * nofldrxq10g; } if (nrxq1g >= nofldrxq1g) { iaq->intr_flags_1g = INTR_RXQ; iaq->nirq += n1g * nrxq1g; } else { iaq->intr_flags_1g = INTR_OFLD_RXQ; iaq->nirq += n1g * nofldrxq1g; } if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) goto allocate; /* * Next best option: an interrupt vector for errors, one for the * firmware event queue, and at least one per main-VI. At this * point we know we'll have to downsize nrxq and/or nofldrxq to * fit what's available to us. */ iaq->nirq = T4_EXTRA_INTR; iaq->nirq += n10g + n1g; if (iaq->nirq <= navail) { int leftover = navail - iaq->nirq; if (n10g > 0) { int target = max(nrxq10g, nofldrxq10g); iaq->intr_flags_10g = nrxq10g >= nofldrxq10g ? INTR_RXQ : INTR_OFLD_RXQ; n = 1; while (n < target && leftover >= n10g) { leftover -= n10g; iaq->nirq += n10g; n++; } iaq->nrxq10g = min(n, nrxq10g); #ifdef TCP_OFFLOAD iaq->nofldrxq10g = min(n, nofldrxq10g); #endif } if (n1g > 0) { int target = max(nrxq1g, nofldrxq1g); iaq->intr_flags_1g = nrxq1g >= nofldrxq1g ? INTR_RXQ : INTR_OFLD_RXQ; n = 1; while (n < target && leftover >= n1g) { leftover -= n1g; iaq->nirq += n1g; n++; } iaq->nrxq1g = min(n, nrxq1g); #ifdef TCP_OFFLOAD iaq->nofldrxq1g = min(n, nofldrxq1g); #endif } if (itype != INTR_MSI || powerof2(iaq->nirq)) goto allocate; } /* * Least desirable option: one interrupt vector for everything. 
*/ iaq->nirq = iaq->nrxq10g = iaq->nrxq1g = 1; iaq->intr_flags_10g = iaq->intr_flags_1g = 0; #ifdef TCP_OFFLOAD if (is_offload(sc)) iaq->nofldrxq10g = iaq->nofldrxq1g = 1; #endif allocate: navail = iaq->nirq; rc = 0; if (itype == INTR_MSIX) rc = pci_alloc_msix(sc->dev, &navail); else if (itype == INTR_MSI) rc = pci_alloc_msi(sc->dev, &navail); if (rc == 0) { if (navail == iaq->nirq) return (0); /* * Didn't get the number requested. Use whatever number * the kernel is willing to allocate (it's in navail). */ device_printf(sc->dev, "fewer vectors than requested, " "type=%d, req=%d, rcvd=%d; will downshift req.\n", itype, iaq->nirq, navail); pci_release_msi(sc->dev); goto restart; } device_printf(sc->dev, "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n", itype, rc, iaq->nirq, navail); } device_printf(sc->dev, "failed to find a usable interrupt type. " "allowed=%d, msi-x=%d, msi=%d, intx=1", t4_intr_types, pci_msix_count(sc->dev), pci_msi_count(sc->dev)); return (ENXIO); } #define FW_VERSION(chip) ( \ V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \ V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \ V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \ V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD)) #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf) struct fw_info { uint8_t chip; char *kld_name; char *fw_mod_name; struct fw_hdr fw_hdr; /* XXX: waste of space, need a sparse struct */ } fw_info[] = { { .chip = CHELSIO_T4, .kld_name = "t4fw_cfg", .fw_mod_name = "t4fw", .fw_hdr = { .chip = FW_HDR_CHIP_T4, .fw_ver = htobe32_const(FW_VERSION(T4)), .intfver_nic = FW_INTFVER(T4, NIC), .intfver_vnic = FW_INTFVER(T4, VNIC), .intfver_ofld = FW_INTFVER(T4, OFLD), .intfver_ri = FW_INTFVER(T4, RI), .intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T4, ISCSI), .intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU), .intfver_fcoe = FW_INTFVER(T4, FCOE), }, }, { .chip = CHELSIO_T5, .kld_name = "t5fw_cfg", .fw_mod_name = "t5fw", .fw_hdr = { .chip = 
FW_HDR_CHIP_T5, .fw_ver = htobe32_const(FW_VERSION(T5)), .intfver_nic = FW_INTFVER(T5, NIC), .intfver_vnic = FW_INTFVER(T5, VNIC), .intfver_ofld = FW_INTFVER(T5, OFLD), .intfver_ri = FW_INTFVER(T5, RI), .intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T5, ISCSI), .intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU), .intfver_fcoe = FW_INTFVER(T5, FCOE), }, } }; static struct fw_info * find_fw_info(int chip) { int i; for (i = 0; i < nitems(fw_info); i++) { if (fw_info[i].chip == chip) return (&fw_info[i]); } return (NULL); } /* * Is the given firmware API compatible with the one the driver was compiled * with? */ static int fw_compatible(const struct fw_hdr *hdr1, const struct fw_hdr *hdr2) { /* short circuit if it's the exact same firmware version */ if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) return (1); /* * XXX: Is this too conservative? Perhaps I should limit this to the * features that are supported in the driver. */ #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) && SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe)) return (1); #undef SAME_INTF return (0); } /* * The firmware in the KLD is usable, but should it be installed? This routine * explains itself in detail if it indicates the KLD firmware should be * installed. 
*/ static int should_install_kld_fw(struct adapter *sc, int card_fw_usable, int k, int c) { const char *reason; if (!card_fw_usable) { reason = "incompatible or unusable"; goto install; } if (k > c) { reason = "older than the version bundled with this driver"; goto install; } if (t4_fw_install == 2 && k != c) { reason = "different than the version bundled with this driver"; goto install; } return (0); install: if (t4_fw_install == 0) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver is prohibited from installing a different " "firmware on the card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); return (0); } device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "installing firmware %u.%u.%u.%u on card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason, G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k)); return (1); } /* * Establish contact with the firmware and determine if we are the master driver * or not, and whether we are responsible for chip initialization. */ static int prep_firmware(struct adapter *sc) { const struct firmware *fw = NULL, *default_cfg; int rc, pf, card_fw_usable, kld_fw_usable, need_fw_reset = 1; enum dev_state state; struct fw_info *fw_info; struct fw_hdr *card_fw; /* fw on the card */ const struct fw_hdr *kld_fw; /* fw in the KLD */ const struct fw_hdr *drv_fw; /* fw header the driver was compiled against */ /* Contact firmware. */ rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state); if (rc < 0 || state == DEV_STATE_ERR) { rc = -rc; device_printf(sc->dev, "failed to connect to the firmware: %d, %d.\n", rc, state); return (rc); } pf = rc; if (pf == sc->mbox) sc->flags |= MASTER_PF; else if (state == DEV_STATE_UNINIT) { /* * We didn't get to be the master so we definitely won't be * configuring the chip. 
It's a bug if someone else hasn't * configured it already. */ device_printf(sc->dev, "couldn't be master(%d), " "device not already initialized either(%d).\n", rc, state); return (EDOOFUS); } /* This is the firmware whose headers the driver was compiled against */ fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } drv_fw = &fw_info->fw_hdr; /* * The firmware KLD contains many modules. The KLD name is also the * name of the module that contains the default config file. */ default_cfg = firmware_get(fw_info->kld_name); /* Read the header of the firmware on the card */ card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_read_flash(sc, FLASH_FW_START, sizeof (*card_fw) / sizeof (uint32_t), (uint32_t *)card_fw, 1); if (rc == 0) card_fw_usable = fw_compatible(drv_fw, (const void*)card_fw); else { device_printf(sc->dev, "Unable to read card's firmware header: %d\n", rc); card_fw_usable = 0; } /* This is the firmware in the KLD */ fw = firmware_get(fw_info->fw_mod_name); if (fw != NULL) { kld_fw = (const void *)fw->data; kld_fw_usable = fw_compatible(drv_fw, kld_fw); } else { kld_fw = NULL; kld_fw_usable = 0; } if (card_fw_usable && card_fw->fw_ver == drv_fw->fw_ver && (!kld_fw_usable || kld_fw->fw_ver == drv_fw->fw_ver)) { /* * Common case: the firmware on the card is an exact match and * the KLD is an exact match too, or the KLD is * absent/incompatible. Note that t4_fw_install = 2 is ignored * here -- use cxgbetool loadfw if you want to reinstall the * same firmware as the one on the card. 
*/ } else if (kld_fw_usable && state == DEV_STATE_UNINIT && should_install_kld_fw(sc, card_fw_usable, be32toh(kld_fw->fw_ver), be32toh(card_fw->fw_ver))) { rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0); if (rc != 0) { device_printf(sc->dev, "failed to install firmware: %d\n", rc); goto done; } /* Installed successfully, update the cached header too. */ memcpy(card_fw, kld_fw, sizeof(*card_fw)); card_fw_usable = 1; need_fw_reset = 0; /* already reset as part of load_fw */ } if (!card_fw_usable) { uint32_t d, c, k; d = ntohl(drv_fw->fw_ver); c = ntohl(card_fw->fw_ver); k = kld_fw ? ntohl(kld_fw->fw_ver) : 0; device_printf(sc->dev, "Cannot find a usable firmware: " "fw_install %d, chip state %d, " "driver compiled with %d.%d.%d.%d, " "card has %d.%d.%d.%d, KLD has %d.%d.%d.%d\n", t4_fw_install, state, G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d), G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k)); rc = EINVAL; goto done; } /* We're using whatever's on the card and it's known to be good. 
*/ sc->params.fw_vers = ntohl(card_fw->fw_ver); snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers), G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers)); t4_get_tp_version(sc, &sc->params.tp_vers); snprintf(sc->tp_version, sizeof(sc->tp_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers), G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers)); if (t4_get_exprom_version(sc, &sc->params.exprom_vers) != 0) sc->params.exprom_vers = 0; else { snprintf(sc->exprom_version, sizeof(sc->exprom_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.exprom_vers), G_FW_HDR_FW_VER_MINOR(sc->params.exprom_vers), G_FW_HDR_FW_VER_MICRO(sc->params.exprom_vers), G_FW_HDR_FW_VER_BUILD(sc->params.exprom_vers)); } /* Reset device */ if (need_fw_reset && (rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST)) != 0) { device_printf(sc->dev, "firmware reset failed: %d.\n", rc); if (rc != ETIMEDOUT && rc != EIO) t4_fw_bye(sc, sc->mbox); goto done; } sc->flags |= FW_OK; rc = get_params__pre_init(sc); if (rc != 0) goto done; /* error message displayed already */ /* Partition adapter resources as specified in the config file. 
*/ if (state == DEV_STATE_UNINIT) { KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); rc = partition_resources(sc, default_cfg, fw_info->kld_name); if (rc != 0) goto done; /* error message displayed already */ t4_tweak_chip_settings(sc); /* get basic stuff going */ rc = -t4_fw_initialize(sc, sc->mbox); if (rc != 0) { device_printf(sc->dev, "fw init failed: %d.\n", rc); goto done; } } else { snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", pf); sc->cfcsum = 0; } done: free(card_fw, M_CXGBE); if (fw != NULL) firmware_put(fw, FIRMWARE_UNLOAD); if (default_cfg != NULL) firmware_put(default_cfg, FIRMWARE_UNLOAD); return (rc); } #define FW_PARAM_DEV(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param)) #define FW_PARAM_PFVF(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)) /* * Partition chip resources for use between various PFs, VFs, etc. */ static int partition_resources(struct adapter *sc, const struct firmware *default_cfg, const char *name_prefix) { const struct firmware *cfg = NULL; int rc = 0; struct fw_caps_config_cmd caps; uint32_t mtype, moff, finicsum, cfcsum; /* * Figure out what configuration file to use. Pick the default config * file for the card if the user hasn't specified one explicitly. */ snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", t4_cfg_file); if (strncmp(t4_cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { /* Card specific overrides go here. */ if (pci_get_device(sc->dev) == 0x440a) snprintf(sc->cfg_file, sizeof(sc->cfg_file), UWIRE_CF); if (is_fpga(sc)) snprintf(sc->cfg_file, sizeof(sc->cfg_file), FPGA_CF); } /* * We need to load another module if the profile is anything except * "default" or "flash". 
*/ if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) != 0 && strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) { char s[32]; snprintf(s, sizeof(s), "%s_%s", name_prefix, sc->cfg_file); cfg = firmware_get(s); if (cfg == NULL) { if (default_cfg != NULL) { device_printf(sc->dev, "unable to load module \"%s\" for " "configuration profile \"%s\", will use " "the default config file instead.\n", s, sc->cfg_file); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", DEFAULT_CF); } else { device_printf(sc->dev, "unable to load module \"%s\" for " "configuration profile \"%s\", will use " "the config file on the card's flash " "instead.\n", s, sc->cfg_file); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF); } } } if (strncmp(sc->cfg_file, DEFAULT_CF, sizeof(sc->cfg_file)) == 0 && default_cfg == NULL) { device_printf(sc->dev, "default config file not available, will use the config " "file on the card's flash instead.\n"); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", FLASH_CF); } if (strncmp(sc->cfg_file, FLASH_CF, sizeof(sc->cfg_file)) != 0) { u_int cflen; const uint32_t *cfdata; uint32_t param, val, addr; KASSERT(cfg != NULL || default_cfg != NULL, ("%s: no config to upload", __func__)); /* * Ask the firmware where it wants us to upload the config file. */ param = FW_PARAM_DEV(CF); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* No support for config file? Shouldn't happen. */ device_printf(sc->dev, "failed to query config file location: %d.\n", rc); goto done; } mtype = G_FW_PARAMS_PARAM_Y(val); moff = G_FW_PARAMS_PARAM_Z(val) << 16; /* * XXX: sheer laziness. We deliberately added 4 bytes of * useless stuffing/comments at the end of the config file so * it's ok to simply throw away the last remaining bytes when * the config file is not an exact multiple of 4. This also * helps with the validate_mt_off_len check. 
*/ if (cfg != NULL) { cflen = cfg->datasize & ~3; cfdata = cfg->data; } else { cflen = default_cfg->datasize & ~3; cfdata = default_cfg->data; } if (cflen > FLASH_CFG_MAX_SIZE) { device_printf(sc->dev, "config file too long (%d, max allowed is %d). " "Will try to use the config on the card, if any.\n", cflen, FLASH_CFG_MAX_SIZE); goto use_config_on_flash; } rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr); if (rc != 0) { device_printf(sc->dev, "%s: addr (%d/0x%x) or len %d is not valid: %d. " "Will try to use the config on the card, if any.\n", __func__, mtype, moff, cflen, rc); goto use_config_on_flash; } write_via_memwin(sc, 2, addr, cfdata, cflen); } else { use_config_on_flash: mtype = FW_MEMTYPE_FLASH; moff = t4_flash_cfg_addr(sc); } bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to pre-process config file: %d " "(mtype %d, moff 0x%x).\n", rc, mtype, moff); goto done; } finicsum = be32toh(caps.finicsum); cfcsum = be32toh(caps.cfcsum); if (finicsum != cfcsum) { device_printf(sc->dev, "WARNING: config file checksum mismatch: %08x %08x\n", finicsum, cfcsum); } sc->cfcsum = cfcsum; #define LIMIT_CAPS(x) do { \ caps.x &= htobe16(t4_##x##_allowed); \ } while (0) /* * Let the firmware know what features will (not) be used so it can tune * things accordingly. 
*/ LIMIT_CAPS(nbmcaps); LIMIT_CAPS(linkcaps); LIMIT_CAPS(switchcaps); LIMIT_CAPS(niccaps); LIMIT_CAPS(toecaps); LIMIT_CAPS(rdmacaps); LIMIT_CAPS(tlscaps); LIMIT_CAPS(iscsicaps); LIMIT_CAPS(fcoecaps); #undef LIMIT_CAPS caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL); if (rc != 0) { device_printf(sc->dev, "failed to process config file: %d.\n", rc); } done: if (cfg != NULL) firmware_put(cfg, FIRMWARE_UNLOAD); return (rc); } /* * Retrieve parameters that are needed (or nice to have) very early. */ static int get_params__pre_init(struct adapter *sc) { int rc; uint32_t param[2], val[2]; param[0] = FW_PARAM_DEV(PORTVEC); param[1] = FW_PARAM_DEV(CCLK); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (pre_init): %d.\n", rc); return (rc); } sc->params.portvec = val[0]; sc->params.nports = bitcount32(val[0]); sc->params.vpd.cclk = val[1]; /* Read device log parameters. */ rc = -t4_init_devlog_params(sc, 1); if (rc == 0) fixup_devlog_params(sc); else { device_printf(sc->dev, "failed to get devlog parameters: %d.\n", rc); rc = 0; /* devlog isn't critical for device operation */ } return (rc); } /* * Retrieve various parameters that are of interest to the driver. The device * has been initialized by the firmware at this point. 
*/ static int get_params__post_init(struct adapter *sc) { int rc; uint32_t param[7], val[7]; struct fw_caps_config_cmd caps; param[0] = FW_PARAM_PFVF(IQFLINT_START); param[1] = FW_PARAM_PFVF(EQ_START); param[2] = FW_PARAM_PFVF(FILTER_START); param[3] = FW_PARAM_PFVF(FILTER_END); param[4] = FW_PARAM_PFVF(L2T_START); param[5] = FW_PARAM_PFVF(L2T_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init): %d.\n", rc); return (rc); } sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; sc->tids.ftid_base = val[2]; sc->tids.nftids = val[3] - val[2] + 1; sc->params.ftid_min = val[2]; sc->params.ftid_max = val[3]; sc->vres.l2t.start = val[4]; sc->vres.l2t.size = val[5] - val[4] + 1; KASSERT(sc->vres.l2t.size <= L2T_SIZE, ("%s: L2 table size (%u) larger than expected (%u)", __func__, sc->vres.l2t.size, L2T_SIZE)); /* get capabilites */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to get card capabilities: %d.\n", rc); return (rc); } #define READ_CAPS(x) do { \ sc->x = htobe16(caps.x); \ } while (0) READ_CAPS(nbmcaps); READ_CAPS(linkcaps); READ_CAPS(switchcaps); READ_CAPS(niccaps); READ_CAPS(toecaps); READ_CAPS(rdmacaps); READ_CAPS(tlscaps); READ_CAPS(iscsicaps); READ_CAPS(fcoecaps); if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) { param[0] = FW_PARAM_PFVF(ETHOFLD_START); param[1] = FW_PARAM_PFVF(ETHOFLD_END); param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query NIC parameters: %d.\n", rc); return (rc); } sc->tids.etid_base = val[0]; sc->params.etid_min = val[0]; sc->tids.netids = val[1] - val[0] + 1; sc->params.netids = sc->tids.netids; 
sc->params.eo_wr_cred = val[2]; sc->params.ethoffload = 1; } if (sc->toecaps) { /* query offload-related parameters */ param[0] = FW_PARAM_DEV(NTID); param[1] = FW_PARAM_PFVF(SERVER_START); param[2] = FW_PARAM_PFVF(SERVER_END); param[3] = FW_PARAM_PFVF(TDDP_START); param[4] = FW_PARAM_PFVF(TDDP_END); param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TOE parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; sc->vres.ddp.start = val[3]; sc->vres.ddp.size = val[4] - val[3] + 1; sc->params.ofldq_wr_cred = val[5]; sc->params.offload = 1; } if (sc->rdmacaps) { param[0] = FW_PARAM_PFVF(STAG_START); param[1] = FW_PARAM_PFVF(STAG_END); param[2] = FW_PARAM_PFVF(RQ_START); param[3] = FW_PARAM_PFVF(RQ_END); param[4] = FW_PARAM_PFVF(PBL_START); param[5] = FW_PARAM_PFVF(PBL_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(1): %d.\n", rc); return (rc); } sc->vres.stag.start = val[0]; sc->vres.stag.size = val[1] - val[0] + 1; sc->vres.rq.start = val[2]; sc->vres.rq.size = val[3] - val[2] + 1; sc->vres.pbl.start = val[4]; sc->vres.pbl.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SQRQ_START); param[1] = FW_PARAM_PFVF(SQRQ_END); param[2] = FW_PARAM_PFVF(CQ_START); param[3] = FW_PARAM_PFVF(CQ_END); param[4] = FW_PARAM_PFVF(OCQ_START); param[5] = FW_PARAM_PFVF(OCQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(2): %d.\n", rc); return (rc); } sc->vres.qp.start = val[0]; sc->vres.qp.size = val[1] - val[0] + 1; sc->vres.cq.start = val[2]; sc->vres.cq.size = val[3] - val[2] + 1; sc->vres.ocq.start = val[4]; sc->vres.ocq.size = val[5] - val[4] + 1; } if 
(sc->iscsicaps) { param[0] = FW_PARAM_PFVF(ISCSI_START); param[1] = FW_PARAM_PFVF(ISCSI_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query iSCSI parameters: %d.\n", rc); return (rc); } sc->vres.iscsi.start = val[0]; sc->vres.iscsi.size = val[1] - val[0] + 1; } /* * We've got the params we wanted to query via the firmware. Now grab * some others directly from the chip. */ rc = t4_read_chip_settings(sc); return (rc); } static int set_params__post_init(struct adapter *sc) { uint32_t param, val; /* ask for encapsulated CPLs */ param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); return (0); } #undef FW_PARAM_PFVF #undef FW_PARAM_DEV static void t4_set_desc(struct adapter *sc) { char buf[128]; struct adapter_params *p = &sc->params; snprintf(buf, sizeof(buf), "Chelsio %s %sNIC (rev %d), S/N:%s, " "P/N:%s, E/C:%s", p->vpd.id, is_offload(sc) ? "R" : "", chip_rev(sc), p->vpd.sn, p->vpd.pn, p->vpd.ec); device_set_desc_copy(sc->dev, buf); } static void build_medialist(struct port_info *pi, struct ifmedia *media) { int m; PORT_LOCK(pi); ifmedia_removeall(media); m = IFM_ETHER | IFM_FDX; switch(pi->port_type) { case FW_PORT_TYPE_BT_XFI: case FW_PORT_TYPE_BT_XAUI: ifmedia_add(media, m | IFM_10G_T, 0, NULL); /* fall through */ case FW_PORT_TYPE_BT_SGMII: ifmedia_add(media, m | IFM_1000_T, 0, NULL); ifmedia_add(media, m | IFM_100_TX, 0, NULL); ifmedia_add(media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(media, IFM_ETHER | IFM_AUTO); break; case FW_PORT_TYPE_CX4: ifmedia_add(media, m | IFM_10G_CX4, 0, NULL); ifmedia_set(media, m | IFM_10G_CX4); break; case FW_PORT_TYPE_QSFP_10G: case FW_PORT_TYPE_SFP: case FW_PORT_TYPE_FIBER_XFI: case FW_PORT_TYPE_FIBER_XAUI: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_10G_LR, 0, NULL); ifmedia_set(media, m | IFM_10G_LR); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | 
IFM_10G_SR, 0, NULL); ifmedia_set(media, m | IFM_10G_SR); break; case FW_PORT_MOD_TYPE_LRM: ifmedia_add(media, m | IFM_10G_LRM, 0, NULL); ifmedia_set(media, m | IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_10G_TWINAX, 0, NULL); ifmedia_set(media, m | IFM_10G_TWINAX); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; case FW_PORT_MOD_TYPE_NA: case FW_PORT_MOD_TYPE_ER: default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; case FW_PORT_TYPE_QSFP: switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: ifmedia_add(media, m | IFM_40G_LR4, 0, NULL); ifmedia_set(media, m | IFM_40G_LR4); break; case FW_PORT_MOD_TYPE_SR: ifmedia_add(media, m | IFM_40G_SR4, 0, NULL); ifmedia_set(media, m | IFM_40G_SR4); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: ifmedia_add(media, m | IFM_40G_CR4, 0, NULL); ifmedia_set(media, m | IFM_40G_CR4); break; case FW_PORT_MOD_TYPE_NONE: m &= ~IFM_FDX; ifmedia_add(media, m | IFM_NONE, 0, NULL); ifmedia_set(media, m | IFM_NONE); break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } break; default: device_printf(pi->dev, "unknown port_type (%d), mod_type (%d)\n", pi->port_type, pi->mod_type); ifmedia_add(media, m | IFM_UNKNOWN, 0, NULL); ifmedia_set(media, m | IFM_UNKNOWN); break; } PORT_UNLOCK(pi); } #define FW_MAC_EXACT_CHUNK 7 /* * Program the port's XGMAC based on parameters in ifnet. The caller also * indicates which parameters should be programmed (the rest are left alone). 
*/ int update_mac_settings(struct ifnet *ifp, int flags) { int rc = 0; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); if (flags & XGMAC_MTU) mtu = ifp->if_mtu; if (flags & XGMAC_PROMISC) promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0; if (flags & XGMAC_ALLMULTI) allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0; if (flags & XGMAC_VLANEX) vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0; if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) { rc = -t4_set_rxmode(sc, sc->mbox, vi->viid, mtu, promisc, allmulti, 1, vlanex, false); if (rc) { if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, rc); return (rc); } } if (flags & XGMAC_UCADDR) { uint8_t ucaddr[ETHER_ADDR_LEN]; bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr)); rc = t4_change_mac(sc, sc->mbox, vi->viid, vi->xact_addr_filt, ucaddr, true, true); if (rc < 0) { rc = -rc; if_printf(ifp, "change_mac failed: %d\n", rc); return (rc); } else { vi->xact_addr_filt = rc; rc = 0; } } if (flags & XGMAC_MCADDRS) { const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK]; int del = 1; uint64_t hash = 0; struct ifmultiaddr *ifma; int i = 0, j; if_maddr_rlock(ifp); TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mcaddr[i] = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); MPASS(ETHER_IS_MULTICAST(mcaddr[i])); i++; if (i == FW_MAC_EXACT_CHUNK) { rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } del = 0; i = 0; } } if (i > 0) { rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, del, i, mcaddr, NULL, &hash, 0); 
if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } } rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, hash, 0); if (rc != 0) if_printf(ifp, "failed to set mc address hash: %d", rc); mcfail: if_maddr_runlock(ifp); } return (rc); } /* * {begin|end}_synchronized_op must be called from the same thread. */ int begin_synchronized_op(struct adapter *sc, struct vi_info *vi, int flags, char *wmesg) { int rc, pri; #ifdef WITNESS /* the caller thinks it's ok to sleep, but is it really? */ if (flags & SLEEP_OK) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "begin_synchronized_op"); #endif if (INTR_OK) pri = PCATCH; else pri = 0; ADAPTER_LOCK(sc); for (;;) { if (vi && IS_DOOMED(vi)) { rc = ENXIO; goto done; } if (!IS_BUSY(sc)) { rc = 0; break; } if (!(flags & SLEEP_OK)) { rc = EBUSY; goto done; } if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) { rc = EINTR; goto done; } } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = wmesg; sc->last_op_thr = curthread; sc->last_op_flags = flags; #endif done: if (!(flags & HOLD_LOCK) || rc) ADAPTER_UNLOCK(sc); return (rc); } /* * Tell if_ioctl and if_init that the VI is going away. This is * special variant of begin_synchronized_op and must be paired with a * call to end_synchronized_op. */ void doom_vi(struct adapter *sc, struct vi_info *vi) { ADAPTER_LOCK(sc); SET_DOOMED(vi); wakeup(&sc->flags); while (IS_BUSY(sc)) mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = "t4detach"; sc->last_op_thr = curthread; sc->last_op_flags = 0; #endif ADAPTER_UNLOCK(sc); } /* * {begin|end}_synchronized_op must be called from the same thread. 
*/ void end_synchronized_op(struct adapter *sc, int flags) { if (flags & LOCK_HELD) ADAPTER_LOCK_ASSERT_OWNED(sc); else ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); } static int cxgbe_init_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; int rc = 0, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return (0); /* already running */ if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_full_init(sc)) != 0)) return (rc); /* error message displayed already */ if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_full_init(vi)) != 0)) return (rc); /* error message displayed already */ rc = update_mac_settings(ifp, XGMAC_ALL); if (rc) goto done; /* error message displayed already */ rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); goto done; } /* * Can't fail from this point onwards. Review cxgbe_uninit_synchronized * if this changes. */ for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_ENABLED; TXQ_UNLOCK(txq); } /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0 && IS_MAIN_VI(vi)) { sc->traceq = sc->sge.rxq[vi->first_rxq].iq.abs_id; t4_write_reg(sc, is_t4(sc) ? A_MPS_TRC_RSS_CONTROL : A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) | V_QUEUENUMBER(sc->traceq)); pi->flags |= HAS_TRACEQ; } /* all ok */ PORT_LOCK(pi); ifp->if_drv_flags |= IFF_DRV_RUNNING; pi->up_vis++; if (pi->nvi > 1) callout_reset(&vi->tick, hz, vi_tick, vi); else callout_reset(&pi->tick, hz, cxgbe_tick, pi); PORT_UNLOCK(pi); done: if (rc != 0) cxgbe_uninit_synchronized(vi); return (rc); } /* * Idempotent. 
*/ static int cxgbe_uninit_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; int rc, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (!(vi->flags & VI_INIT_DONE)) { KASSERT(!(ifp->if_drv_flags & IFF_DRV_RUNNING), ("uninited VI is running")); return (0); } /* * Disable the VI so that all its data in either direction is discarded * by the MPS. Leave everything else (the queues, interrupts, and 1Hz * tick) intact as the TP can deliver negative advice or data that it's * holding in its RAM (for an offloaded connection) even after the VI is * disabled. */ rc = -t4_enable_vi(sc, sc->mbox, vi->viid, false, false); if (rc) { if_printf(ifp, "disable_vi failed: %d\n", rc); return (rc); } for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); } PORT_LOCK(pi); if (pi->nvi == 1) callout_stop(&pi->tick); else callout_stop(&vi->tick); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { PORT_UNLOCK(pi); return (0); } ifp->if_drv_flags &= ~IFF_DRV_RUNNING; pi->up_vis--; if (pi->up_vis > 0) { PORT_UNLOCK(pi); return (0); } PORT_UNLOCK(pi); pi->link_cfg.link_ok = 0; pi->link_cfg.speed = 0; pi->linkdnrc = -1; t4_os_link_changed(sc, pi->port_id, 0, -1); return (0); } /* * It is ok for this function to fail midway and return right away. t4_detach * will walk the entire sc->irq list and clean up whatever is valid. */ static int setup_intr_handlers(struct adapter *sc) { int rc, rid, p, q, v; char s[8]; struct irq *irq; struct port_info *pi; struct vi_info *vi; struct sge *sge = &sc->sge; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; #endif #ifdef RSS int nbuckets = rss_getnumbuckets(); #endif /* * Setup interrupts. */ irq = &sc->irq[0]; rid = sc->intr_type == INTR_INTX ? 0 : 1; if (sc->intr_count == 1) return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all")); /* Multiple interrupts. 
*/ KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); /* The first one is always error intr */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); if (rc != 0) return (rc); irq++; rid++; /* The second one is always the firmware event queue */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sge->fwq, "evt"); if (rc != 0) return (rc); irq++; rid++; for_each_port(sc, p) { pi = sc->port[p]; for_each_vi(pi, v, vi) { vi->first_intr = rid - 1; if (vi->nnmrxq > 0) { int n = max(vi->nrxq, vi->nnmrxq); MPASS(vi->flags & INTR_RXQ); rxq = &sge->rxq[vi->first_rxq]; #ifdef DEV_NETMAP nm_rxq = &sge->nm_rxq[vi->first_nm_rxq]; #endif for (q = 0; q < n; q++) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); if (q < vi->nrxq) irq->rxq = rxq++; #ifdef DEV_NETMAP if (q < vi->nnmrxq) irq->nm_rxq = nm_rxq++; #endif rc = t4_alloc_irq(sc, irq, rid, t4_vi_intr, irq, s); if (rc != 0) return (rc); irq++; rid++; vi->nintr++; } } else if (vi->flags & INTR_RXQ) { for_each_rxq(vi, q, rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq, s); if (rc != 0) return (rc); #ifdef RSS bus_bind_intr(sc->dev, irq->res, rss_getcpu(q % nbuckets)); #endif irq++; rid++; vi->nintr++; } } #ifdef TCP_OFFLOAD if (vi->flags & INTR_OFLD_RXQ) { for_each_ofld_rxq(vi, q, ofld_rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, ofld_rxq, s); if (rc != 0) return (rc); irq++; rid++; vi->nintr++; } } #endif } } MPASS(irq == &sc->irq[sc->intr_count]); return (0); } int adapter_full_init(struct adapter *sc) { int rc, i; ASSERT_SYNCHRONIZED_OP(sc); ADAPTER_LOCK_ASSERT_NOTOWNED(sc); KASSERT((sc->flags & FULL_INIT_DONE) == 0, ("%s: FULL_INIT_DONE already", __func__)); /* * queues that belong to the adapter (not any particular port). 
*/ rc = t4_setup_adapter_queues(sc); if (rc != 0) goto done; for (i = 0; i < nitems(sc->tq); i++) { sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT, taskqueue_thread_enqueue, &sc->tq[i]); if (sc->tq[i] == NULL) { device_printf(sc->dev, "failed to allocate task queue %d\n", i); rc = ENOMEM; goto done; } taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d", device_get_nameunit(sc->dev), i); } t4_intr_enable(sc); sc->flags |= FULL_INIT_DONE; done: if (rc != 0) adapter_full_uninit(sc); return (rc); } int adapter_full_uninit(struct adapter *sc) { int i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); t4_teardown_adapter_queues(sc); for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) { taskqueue_free(sc->tq[i]); sc->tq[i] = NULL; } sc->flags &= ~FULL_INIT_DONE; return (0); } #ifdef RSS #define SUPPORTED_RSS_HASHTYPES (RSS_HASHTYPE_RSS_IPV4 | \ RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | \ RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV4 | \ RSS_HASHTYPE_RSS_UDP_IPV6) /* Translates kernel hash types to hardware. */ static int hashconfig_to_hashen(int hashconfig) { int hashen = 0; if (hashconfig & RSS_HASHTYPE_RSS_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV4) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV6) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; return (hashen); } /* Translates hardware hash types to kernel. */ static int hashen_to_hashconfig(int hashen) { int hashconfig = 0; if (hashen & F_FW_RSS_VI_CONFIG_CMD_UDPEN) { /* * If UDP hashing was enabled it must have been enabled for * either IPv4 or IPv6 (inclusive or). 
Enabling UDP without * enabling any 4-tuple hash is nonsense configuration. */ MPASS(hashen & (F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)); if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV6; } if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV6; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV6; return (hashconfig); } #endif int vi_full_init(struct vi_info *vi) { struct adapter *sc = vi->pi->adapter; struct ifnet *ifp = vi->ifp; uint16_t *rss; struct sge_rxq *rxq; int rc, i, j, hashen; #ifdef RSS int nbuckets = rss_getnumbuckets(); int hashconfig = rss_gethashconfig(); int extra; uint32_t raw_rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; uint32_t rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; #endif ASSERT_SYNCHRONIZED_OP(sc); KASSERT((vi->flags & VI_INIT_DONE) == 0, ("%s: VI_INIT_DONE already", __func__)); sysctl_ctx_init(&vi->ctx); vi->flags |= VI_SYSCTL_CTX; /* * Allocate tx/rx/fl queues for this VI. */ rc = t4_setup_vi_queues(vi); if (rc != 0) goto done; /* error message displayed already */ /* * Setup RSS for this VI. Save a copy of the RSS table for later use. 
*/ if (vi->nrxq > vi->rss_size) { if_printf(ifp, "nrxq (%d) > hw RSS table size (%d); " "some queues will never receive traffic.\n", vi->nrxq, vi->rss_size); } else if (vi->rss_size % vi->nrxq) { if_printf(ifp, "nrxq (%d), hw RSS table size (%d); " "expect uneven traffic distribution.\n", vi->nrxq, vi->rss_size); } #ifdef RSS MPASS(RSS_KEYSIZE == 40); if (vi->nrxq != nbuckets) { if_printf(ifp, "nrxq (%d) != kernel RSS buckets (%d);" "performance will be impacted.\n", vi->nrxq, nbuckets); } rss_getkey((void *)&raw_rss_key[0]); for (i = 0; i < nitems(rss_key); i++) { rss_key[i] = htobe32(raw_rss_key[nitems(rss_key) - 1 - i]); } t4_write_rss_key(sc, &rss_key[0], -1); #endif rss = malloc(vi->rss_size * sizeof (*rss), M_CXGBE, M_ZERO | M_WAITOK); for (i = 0; i < vi->rss_size;) { #ifdef RSS j = rss_get_indirection_to_bucket(i); j %= vi->nrxq; rxq = &sc->sge.rxq[vi->first_rxq + j]; rss[i++] = rxq->iq.abs_id; #else for_each_rxq(vi, j, rxq) { rss[i++] = rxq->iq.abs_id; if (i == vi->rss_size) break; } #endif } rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, rss, vi->rss_size); if (rc != 0) { if_printf(ifp, "rss_config failed: %d\n", rc); goto done; } #ifdef RSS hashen = hashconfig_to_hashen(hashconfig); /* * We may have had to enable some hashes even though the global config * wants them disabled. This is a potential problem that must be * reported to the user. */ extra = hashen_to_hashconfig(hashen) ^ hashconfig; /* * If we consider only the supported hash types, then the enabled hashes * are a superset of the requested hashes. In other words, there cannot * be any supported hash that was requested but not enabled, but there * can be hashes that were not requested but had to be enabled. 
*/ extra &= SUPPORTED_RSS_HASHTYPES; MPASS((extra & hashconfig) == 0); if (extra) { if_printf(ifp, "global RSS config (0x%x) cannot be accommodated.\n", hashconfig); } if (extra & RSS_HASHTYPE_RSS_IPV4) if_printf(ifp, "IPv4 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV4) if_printf(ifp, "TCP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_IPV6) if_printf(ifp, "IPv6 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV6) if_printf(ifp, "TCP/IPv6 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV4) if_printf(ifp, "UDP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV6) if_printf(ifp, "UDP/IPv6 4-tuple hashing forced on.\n"); #else hashen = F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_UDPEN; #endif rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, hashen, rss[0]); if (rc != 0) { if_printf(ifp, "rss hash/defaultq config failed: %d\n", rc); goto done; } vi->rss = rss; vi->flags |= VI_INIT_DONE; done: if (rc != 0) vi_full_uninit(vi); return (rc); } /* * Idempotent. */ int vi_full_uninit(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int i; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif if (vi->flags & VI_INIT_DONE) { /* Need to quiesce queues. */ /* XXX: Only for the first VI? 
*/ if (IS_MAIN_VI(vi)) quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(vi, i, txq) { quiesce_txq(sc, txq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(vi, i, ofld_txq) { quiesce_wrq(sc, ofld_txq); } #endif for_each_rxq(vi, i, rxq) { quiesce_iq(sc, &rxq->iq); quiesce_fl(sc, &rxq->fl); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { quiesce_iq(sc, &ofld_rxq->iq); quiesce_fl(sc, &ofld_rxq->fl); } #endif free(vi->rss, M_CXGBE); free(vi->nm_rss, M_CXGBE); } t4_teardown_vi_queues(vi); vi->flags &= ~VI_INIT_DONE; return (0); } static void quiesce_txq(struct adapter *sc, struct sge_txq *txq) { struct sge_eq *eq = &txq->eq; struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; (void) sc; /* unused */ #ifdef INVARIANTS TXQ_LOCK(txq); MPASS((eq->flags & EQ_ENABLED) == 0); TXQ_UNLOCK(txq); #endif /* Wait for the mp_ring to empty. */ while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 0); pause("rquiesce", 1); } /* Then wait for the hardware to finish. */ while (spg->cidx != htobe16(eq->pidx)) pause("equiesce", 1); /* Finally, wait for the driver to reclaim all descriptors. 
*/ while (eq->cidx != eq->pidx) pause("dquiesce", 1); } static void quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq) { /* XXXTX */ } static void quiesce_iq(struct adapter *sc, struct sge_iq *iq) { (void) sc; /* unused */ /* Synchronize with the interrupt handler */ while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED)) pause("iqfree", 1); } static void quiesce_fl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); fl->flags |= FL_DOOMED; FL_UNLOCK(fl); callout_stop(&sc->sfl_callout); mtx_unlock(&sc->sfl_lock); KASSERT((fl->flags & FL_STARVING) == 0, ("%s: still starving", __func__)); } static int t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid, driver_intr_t *handler, void *arg, char *name) { int rc; irq->rid = rid; irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid, RF_SHAREABLE | RF_ACTIVE); if (irq->res == NULL) { device_printf(sc->dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET, NULL, handler, arg, &irq->tag); if (rc != 0) { device_printf(sc->dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name, rc); } else if (name) bus_describe_intr(sc->dev, irq->res, irq->tag, name); return (rc); } static int t4_free_irq(struct adapter *sc, struct irq *irq) { if (irq->tag) bus_teardown_intr(sc->dev, irq->res, irq->tag); if (irq->res) bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res); bzero(irq, sizeof(*irq)); return (0); } static void get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf) { regs->version = chip_id(sc) | chip_rev(sc) << 10; t4_get_regs(sc, buf, regs->len); } #define A_PL_INDIR_CMD 0x1f8 #define S_PL_AUTOINC 31 #define M_PL_AUTOINC 0x1U #define V_PL_AUTOINC(x) ((x) << S_PL_AUTOINC) #define G_PL_AUTOINC(x) (((x) >> S_PL_AUTOINC) & M_PL_AUTOINC) #define S_PL_VFID 20 #define M_PL_VFID 0xffU #define V_PL_VFID(x) ((x) << S_PL_VFID) #define G_PL_VFID(x) (((x) 
>> S_PL_VFID) & M_PL_VFID) #define S_PL_ADDR 0 #define M_PL_ADDR 0xfffffU #define V_PL_ADDR(x) ((x) << S_PL_ADDR) #define G_PL_ADDR(x) (((x) >> S_PL_ADDR) & M_PL_ADDR) #define A_PL_INDIR_DATA 0x1fc static uint64_t read_vf_stat(struct adapter *sc, unsigned int viid, int reg) { u32 stats[2]; mtx_assert(&sc->reg_lock, MA_OWNED); t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(G_FW_VIID_VIN(viid)) | V_PL_ADDR(VF_MPS_REG(reg))); stats[0] = t4_read_reg(sc, A_PL_INDIR_DATA); stats[1] = t4_read_reg(sc, A_PL_INDIR_DATA); return (((uint64_t)stats[1]) << 32 | stats[0]); } static void t4_get_vi_stats(struct adapter *sc, unsigned int viid, struct fw_vi_stats_vf *stats) { #define GET_STAT(name) \ read_vf_stat(sc, viid, A_MPS_VF_STAT_##name##_L) stats->tx_bcast_bytes = GET_STAT(TX_VF_BCAST_BYTES); stats->tx_bcast_frames = GET_STAT(TX_VF_BCAST_FRAMES); stats->tx_mcast_bytes = GET_STAT(TX_VF_MCAST_BYTES); stats->tx_mcast_frames = GET_STAT(TX_VF_MCAST_FRAMES); stats->tx_ucast_bytes = GET_STAT(TX_VF_UCAST_BYTES); stats->tx_ucast_frames = GET_STAT(TX_VF_UCAST_FRAMES); stats->tx_drop_frames = GET_STAT(TX_VF_DROP_FRAMES); stats->tx_offload_bytes = GET_STAT(TX_VF_OFFLOAD_BYTES); stats->tx_offload_frames = GET_STAT(TX_VF_OFFLOAD_FRAMES); stats->rx_bcast_bytes = GET_STAT(RX_VF_BCAST_BYTES); stats->rx_bcast_frames = GET_STAT(RX_VF_BCAST_FRAMES); stats->rx_mcast_bytes = GET_STAT(RX_VF_MCAST_BYTES); stats->rx_mcast_frames = GET_STAT(RX_VF_MCAST_FRAMES); stats->rx_ucast_bytes = GET_STAT(RX_VF_UCAST_BYTES); stats->rx_ucast_frames = GET_STAT(RX_VF_UCAST_FRAMES); stats->rx_err_frames = GET_STAT(RX_VF_ERR_FRAMES); #undef GET_STAT } static void t4_clr_vi_stats(struct adapter *sc, unsigned int viid) { int reg; t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(G_FW_VIID_VIN(viid)) | V_PL_ADDR(VF_MPS_REG(A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L))); for (reg = A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L; reg <= A_MPS_VF_STAT_RX_VF_ERR_FRAMES_H; reg += 4) t4_write_reg(sc, A_PL_INDIR_DATA, 0); } 
static void vi_refresh_stats(struct adapter *sc, struct vi_info *vi) { struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ if (!(vi->flags & VI_INIT_DONE)) return; getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &vi->last_refreshed, <)) return; mtx_lock(&sc->reg_lock); t4_get_vi_stats(sc, vi->viid, &vi->stats); getmicrotime(&vi->last_refreshed); mtx_unlock(&sc->reg_lock); } static void cxgbe_refresh_stats(struct adapter *sc, struct port_info *pi) { int i; u_int v, tnl_cong_drops; struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &pi->last_refreshed, <)) return; tnl_cong_drops = 0; t4_get_port_stats(sc, pi->tx_chan, &pi->stats); for (i = 0; i < sc->chip_params->nchan; i++) { if (pi->rx_chan_map & (1 << i)) { mtx_lock(&sc->reg_lock); t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); mtx_unlock(&sc->reg_lock); tnl_cong_drops += v; } } pi->tnl_cong_drops = tnl_cong_drops; getmicrotime(&pi->last_refreshed); } static void cxgbe_tick(void *arg) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; PORT_LOCK_ASSERT_OWNED(pi); cxgbe_refresh_stats(sc, pi); callout_schedule(&pi->tick, hz); } void vi_tick(void *arg) { struct vi_info *vi = arg; struct adapter *sc = vi->pi->adapter; vi_refresh_stats(sc, vi); callout_schedule(&vi->tick, hz); } static void cxgbe_vlan_config(void *arg, struct ifnet *ifp, uint16_t vid) { struct ifnet *vlan; if (arg != ifp || ifp->if_type != IFT_ETHER) return; vlan = VLAN_DEVAT(ifp, vid); VLAN_SETCOOKIE(vlan, ifp); } /* * Should match fw_caps_config_ enums in t4fw_interface.h */ static char *caps_decoder[] = { "\20\001IPMI\002NCSI", /* 0: NBM */ "\20\001PPP\002QFC\003DCBX", /* 1: link */ "\20\001INGRESS\002EGRESS", /* 2: switch */ "\20\001NIC\002VM\003IDS\004UM\005UM_ISGL" /* 3: NIC */ "\006HASHFILTER\007ETHOFLD", "\20\001TOE", /* 4: TOE */ "\20\001RDDP\002RDMAC", /* 
5: RDMA */ "\20\001INITIATOR_PDU\002TARGET_PDU" /* 6: iSCSI */ "\003INITIATOR_CNXOFLD\004TARGET_CNXOFLD" "\005INITIATOR_SSNOFLD\006TARGET_SSNOFLD" "\007T10DIF" "\010INITIATOR_CMDOFLD\011TARGET_CMDOFLD", "\20\00KEYS", /* 7: TLS */ "\20\001INITIATOR\002TARGET\003CTRL_OFLD" /* 8: FCoE */ "\004PO_INITIATOR\005PO_TARGET", }; static void t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *c0; static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"}; ctx = device_get_sysctl_ctx(sc->dev); /* * dev.t4nex.X. */ oid = device_get_sysctl_tree(sc->dev); c0 = children = SYSCTL_CHILDREN(oid); sc->sc_do_rxcopy = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW, &sc->sc_do_rxcopy, 1, "Do RX copy of small frames"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL, sc->params.nports, "# of ports"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, NULL, chip_rev(sc), "chip hardware revision"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "tp_version", CTLFLAG_RD, sc->tp_version, 0, "TP microcode version"); if (sc->params.exprom_vers != 0) { SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "exprom_version", CTLFLAG_RD, sc->exprom_version, 0, "expansion ROM version"); } SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf", CTLFLAG_RD, sc->cfg_file, 0, "configuration file"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL, sc->cfcsum, "config file checksum"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells", CTLTYPE_STRING | CTLFLAG_RD, doorbells, sc->doorbells, sysctl_bitfield, "A", "available doorbells"); #define SYSCTL_CAP(name, n, text) \ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, #name, \ CTLTYPE_STRING | CTLFLAG_RD, caps_decoder[n], sc->name, \ sysctl_bitfield, "A", "available " text "capabilities") SYSCTL_CAP(nbmcaps, 0, "NBM"); 
SYSCTL_CAP(linkcaps, 1, "link");
	SYSCTL_CAP(switchcaps, 2, "switch");
	SYSCTL_CAP(niccaps, 3, "NIC");
	SYSCTL_CAP(toecaps, 4, "TCP offload");
	SYSCTL_CAP(rdmacaps, 5, "RDMA");
	SYSCTL_CAP(iscsicaps, 6, "iSCSI");
	SYSCTL_CAP(tlscaps, 7, "TLS");
	SYSCTL_CAP(fcoecaps, 8, "FCoE");
#undef SYSCTL_CAP

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD,
	    NULL, sc->params.vpd.cclk, "core clock frequency (in KHz)");

	/* arg2 is the array size in bytes; sysctl_int_array walks it as ints. */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers",
	    CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.timer_val,
	    sizeof(sc->params.sge.timer_val), sysctl_int_array, "A",
	    "interrupt holdoff timer values (us)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts",
	    CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.counter_val,
	    sizeof(sc->params.sge.counter_val), sysctl_int_array, "A",
	    "interrupt holdoff packet counter values");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD,
	    NULL, sc->tids.nftids, "number of filters");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT |
	    CTLFLAG_RD, sc, 0, sysctl_temperature, "I",
	    "chip temperature (in Celsius)");

	t4_sge_sysctls(sc, ctx, children);

	/* Default set here, before the knob is exposed. */
	sc->lro_timeout = 100;
	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW,
	    &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "debug_flags", CTLFLAG_RW,
	    &sc->debug_flags, 0, "flags to enable runtime debugging");

#ifdef SBUF_DRAIN
	/*
	 * dev.t4nex.X.misc.  Marked CTLFLAG_SKIP to avoid information overload.
	 */
	oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc",
	    CTLFLAG_RD | CTLFLAG_SKIP, NULL,
	    "logs and miscellaneous information");
	children = SYSCTL_CHILDREN(oid);

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_cctrl, "A", "congestion control");

	/* CIM inbound queues: arg2 is the IBQ index. */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 1,
	    sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 2,
	    sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 3,
	    sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 4,
	    sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 5,
	    sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)");

	/* The handler is chosen by chip generation. */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    chip_id(sc) <= CHELSIO_T5 ? sysctl_cim_la : sysctl_cim_la_t6,
	    "A", "CIM logic analyzer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_cim_ma_la, "A", "CIM MA logic analyzer");

	/* CIM outbound queues: arg2 is the OBQ index offset by CIM_NUM_IBQ. */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0 + CIM_NUM_IBQ,
	    sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 1 + CIM_NUM_IBQ,
	    sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 2 + CIM_NUM_IBQ,
	    sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 3 + CIM_NUM_IBQ,
	    sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 4 + CIM_NUM_IBQ,
	    sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 5 + CIM_NUM_IBQ,
	    sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)");

	/* Two more OBQs are registered on chips newer than T4. */
	if (chip_id(sc) > CHELSIO_T4) {
		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx",
		    CTLTYPE_STRING | CTLFLAG_RD, sc, 6 + CIM_NUM_IBQ,
		    sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx",
		    CTLTYPE_STRING | CTLFLAG_RD, sc, 7 + CIM_NUM_IBQ,
		    sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)");
	}

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_cim_pif_la, "A", "CIM PIF logic analyzer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_cim_qcfg, "A", "CIM queue configuration");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_cpl_stats, "A", "CPL statistics");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_ddp_stats, "A", "non-TCP DDP statistics");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_devlog, "A", "firmware's device log");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_fcoe_stats, "A", "FCoE statistics");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_hw_sched, "A", "hardware scheduler ");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_l2t, "A", "hardware L2 table");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_lb_stats, "A", "loopback statistics");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_meminfo, "A", "memory regions");

	/* The handler is chosen by chip generation. */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    chip_id(sc) <= CHELSIO_T5 ? sysctl_mps_tcam : sysctl_mps_tcam_t6,
	    "A", "MPS TCAM entries");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_path_mtus, "A", "path MTUs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_pm_stats, "A", "PM statistics");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_rdma_stats, "A", "RDMA statistics");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_tcp_stats, "A", "TCP statistics");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_tids, "A", "TID information");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_tp_err_stats, "A", "TP error statistics");

	/* The only writable entry registered under misc in this function. */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la_mask",
	    CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tp_la_mask, "I",
	    "TP logic analyzer event capture mask");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_tp_la, "A", "TP logic analyzer");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_tx_rate, "A", "Tx rate");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la",
	    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
	    sysctl_ulprx_la, "A", "ULPRX logic analyzer");

	if (is_t5(sc)) {
		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats",
		    CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
		    sysctl_wcwr_stats, "A", "write combined work requests");
	}
#endif

#ifdef TCP_OFFLOAD
	if (is_offload(sc)) {
		/*
		 * dev.t4nex.X.toe.
		 */
		oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD,
		    NULL, "TOE parameters");
		children = SYSCTL_CHILDREN(oid);

		/* Each tunable's default is assigned just before it is exposed. */
		sc->tt.sndbuf = 256 * 1024;
		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW,
		    &sc->tt.sndbuf, 0, "max hardware send buffer size");

		sc->tt.ddp = 0;
		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW,
		    &sc->tt.ddp, 0, "DDP allowed");

		sc->tt.rx_coalesce = 1;
		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce",
		    CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing");

		sc->tt.tx_align = 1;
		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align",
		    CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload");

		sc->tt.tx_zcopy = 0;
		SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy",
		    CTLFLAG_RW, &sc->tt.tx_zcopy, 0,
		    "Enable zero-copy aio_write(2)");

		/* sysctl_tp_tick: arg2 (0, 1 or 2) selects which tick is shown. */
		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick",
		    CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A",
		    "TP timer tick (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timestamp_tick",
		    CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_tp_tick, "A",
		    "TCP timestamp tick (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_tick",
		    CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_tp_tick, "A",
		    "DACK tick (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_timer",
		    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, sysctl_tp_dack_timer,
		    "IU", "DACK timer (us)");

		/* sysctl_tp_timer: arg2 is the TP timer register to report. */
		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_min",
		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MIN,
		    sysctl_tp_timer, "LU", "Retransmit min (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_max",
		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MAX,
		    sysctl_tp_timer, "LU", "Retransmit max (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_min",
		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MIN,
		    sysctl_tp_timer, "LU", "Persist timer min (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_max",
		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MAX,
		    sysctl_tp_timer, "LU", "Persist timer max (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_idle",
		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_IDLE,
		    sysctl_tp_timer, "LU", "Keepidle idle timer (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_intvl",
		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_INTVL,
		    sysctl_tp_timer, "LU", "Keepidle interval (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "initial_srtt",
		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_INIT_SRTT,
		    sysctl_tp_timer, "LU", "Initial SRTT (us)");

		SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "finwait2_timer",
		    CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_FINWAIT2_TIMER,
		    sysctl_tp_timer, "LU", "FINWAIT2 timer (us)");
	}
#endif
}

/*
 * Register the per-VI sysctls under the VI device's sysctl tree.
 */
void
vi_sysctls(struct vi_info *vi)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid *oid;
	struct sysctl_oid_list *children;

	ctx = device_get_sysctl_ctx(vi->dev);

	/*
	 * dev.v?(cxgbe|cxl).X.
*/ oid = device_get_sysctl_tree(vi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "viid", CTLFLAG_RD, NULL, vi->viid, "VI identifer"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD, &vi->nrxq, 0, "# of rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD, &vi->ntxq, 0, "# of tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD, &vi->first_rxq, 0, "index of first rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &vi->first_txq, 0, "index of first tx queue"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_size", CTLFLAG_RD, NULL, vi->rss_size, "size of RSS indirection table"); if (IS_MAIN_VI(vi)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_noflowq, "IU", "Reserve queue 0 for non-flowid packets"); } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &vi->nofldrxq, 0, "# of rx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD, &vi->nofldtxq, 0, "# of tx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq", CTLFLAG_RD, &vi->first_ofld_rxq, 0, "index of first TOE rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq", CTLFLAG_RD, &vi->first_ofld_txq, 0, "index of first TOE tx queue"); } #endif #ifdef DEV_NETMAP if (vi->nnmrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD, &vi->nnmrxq, 0, "# of netmap rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD, &vi->nnmtxq, 0, "# of netmap tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq", CTLFLAG_RD, &vi->first_nm_rxq, 0, "index of first netmap rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq", CTLFLAG_RD, &vi->first_nm_txq, 0, "index of first netmap tx queue"); } #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, 
"holdoff_tmr_idx", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_tmr_idx, "I", "holdoff timer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_pktc_idx, "I", "holdoff packet counter index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_rxq, "I", "rx queue size"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_txq, "I", "tx queue size"); } static void cxgbe_sysctls(struct port_info *pi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *children2; struct adapter *sc = pi->adapter; int i; char name[16]; ctx = device_get_sysctl_ctx(pi->dev); /* * dev.cxgbe.X. */ oid = device_get_sysctl_tree(pi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING | CTLFLAG_RD, pi, 0, sysctl_linkdnrc, "A", "reason why link is down"); if (pi->port_type == FW_PORT_TYPE_BT_XAUI) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, pi, 0, sysctl_btphy, "I", "PHY temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version", CTLTYPE_INT | CTLFLAG_RD, pi, 1, sysctl_btphy, "I", "PHY firmware version"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings", CTLTYPE_STRING | CTLFLAG_RW, pi, PAUSE_TX, sysctl_pause_settings, "A", "PAUSE settings (bit 0 = rx_pause, bit 1 = tx_pause)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "max_speed", CTLFLAG_RD, NULL, port_top_speed(pi), "max speed (in Gbps)"); /* * dev.(cxgbe|cxl).X.tc. 
*/
	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "tc", CTLFLAG_RD, NULL,
	    "Tx scheduler traffic classes");
	/* One numbered child node per scheduler class. */
	for (i = 0; i < sc->chip_params->nsched_cls; i++) {
		struct tx_sched_class *tc = &pi->tc[i];

		snprintf(name, sizeof(name), "%d", i);
		children2 = SYSCTL_CHILDREN(SYSCTL_ADD_NODE(ctx,
		    SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL,
		    "traffic class"));
		SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "flags", CTLFLAG_RD,
		    &tc->flags, 0, "flags");
		SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "refcount",
		    CTLFLAG_RD, &tc->refcount, 0, "references to this class");
#ifdef SBUF_DRAIN
		/* arg2 packs the port id (upper bits) with the class index. */
		SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "params",
		    CTLTYPE_STRING | CTLFLAG_RD, sc, (pi->port_id << 16) | i,
		    sysctl_tc_params, "A", "traffic class parameters");
#endif
	}

	/*
	 * dev.cxgbe.X.stats.
	 */
	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD,
	    NULL, "port statistics");
	children = SYSCTL_CHILDREN(oid);
	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD,
	    &pi->tx_parse_error, 0,
	    "# of tx packets with invalid length or # of segments");

	/*
	 * Each of these reads a 64-bit hardware register at access time via
	 * sysctl_handle_t4_reg64; 'reg' is passed to the handler as arg2.
	 */
#define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \
	SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \
	    CTLTYPE_U64 | CTLFLAG_RD, sc, reg, \
	    sysctl_handle_t4_reg64, "QU", desc)

	SYSCTL_ADD_T4_REG64(pi, "tx_octets", "# of octets in good frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BYTES_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_frames", "total # of good frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_FRAMES_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_bcast_frames", "# of broadcast frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BCAST_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_mcast_frames", "# of multicast frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_MCAST_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ucast_frames", "# of unicast frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_UCAST_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_error_frames", "# of error frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_ERROR_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_frames_64",
	    "# of tx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_64B_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_frames_65_127",
	    "# of tx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_65B_127B_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_frames_128_255",
	    "# of tx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_128B_255B_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_frames_256_511",
	    "# of tx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_256B_511B_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_frames_512_1023",
	    "# of tx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_512B_1023B_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_frames_1024_1518",
	    "# of tx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1024B_1518B_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_frames_1519_max",
	    "# of tx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1519B_MAX_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_drop", "# of dropped tx frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_DROP_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_pause", "# of pause frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PAUSE_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ppp0", "# of PPP prio 0 frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP0_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ppp1", "# of PPP prio 1 frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP1_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ppp2", "# of PPP prio 2 frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP2_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ppp3", "# of PPP prio 3 frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP3_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ppp4", "# of PPP prio 4 frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP4_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ppp5", "# of PPP prio 5 frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP5_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ppp6", "# of PPP prio 6 frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP6_L));
	SYSCTL_ADD_T4_REG64(pi, "tx_ppp7", "# of PPP prio 7 frames transmitted",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP7_L));

	/* NOTE(review): the rx registers are also addressed via pi->tx_chan. */
	SYSCTL_ADD_T4_REG64(pi, "rx_octets", "# of octets in good frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BYTES_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_frames", "total # of good frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_FRAMES_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_bcast_frames", "# of broadcast frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BCAST_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_mcast_frames", "# of multicast frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MCAST_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ucast_frames", "# of unicast frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_UCAST_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_too_long", "# of frames exceeding MTU",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_ERROR_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_jabber", "# of jabber frames",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_fcs_err",
	    "# of frames received with bad FCS",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_len_err",
	    "# of frames received with length error",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LEN_ERROR_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_symbol_err", "symbol errors",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_SYM_ERROR_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_runt", "# of short frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LESS_64B_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_frames_64",
	    "# of rx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_64B_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_frames_65_127",
	    "# of rx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_65B_127B_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_frames_128_255",
	    "# of rx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_128B_255B_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_frames_256_511",
	    "# of rx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_256B_511B_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_frames_512_1023",
	    "# of rx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_512B_1023B_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_frames_1024_1518",
	    "# of rx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1024B_1518B_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_frames_1519_max",
	    "# of rx frames in this range",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1519B_MAX_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_pause", "# of pause frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PAUSE_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ppp0", "# of PPP prio 0 frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP0_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ppp1", "# of PPP prio 1 frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP1_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ppp2", "# of PPP prio 2 frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP2_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ppp3", "# of PPP prio 3 frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP3_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ppp4", "# of PPP prio 4 frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP4_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ppp5", "# of PPP prio 5 frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP5_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ppp6", "# of PPP prio 6 frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP6_L));
	SYSCTL_ADD_T4_REG64(pi, "rx_ppp7", "# of PPP prio 7 frames received",
	    PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP7_L));

#undef SYSCTL_ADD_T4_REG64

	/* These export fields of the cached pi->stats, not live registers. */
#define SYSCTL_ADD_T4_PORTSTAT(name, desc) \
	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \
	    &pi->stats.name, desc)

	/* We get these from port_stats and they may be stale by up to 1s */
	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow0,
	    "# drops due to buffer-group 0 overflows");
	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow1,
	    "# drops due to buffer-group 1 overflows");
	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow2,
	    "# drops due to buffer-group 2 overflows");
	SYSCTL_ADD_T4_PORTSTAT(rx_ovflow3,
	    "# drops due to buffer-group 3 overflows");
	SYSCTL_ADD_T4_PORTSTAT(rx_trunc0,
	    "# of buffer-group 0 truncated packets");
	SYSCTL_ADD_T4_PORTSTAT(rx_trunc1,
	    "# of buffer-group 1 truncated packets");
	SYSCTL_ADD_T4_PORTSTAT(rx_trunc2,
	    "# of buffer-group 2 truncated packets");
	SYSCTL_ADD_T4_PORTSTAT(rx_trunc3,
	    "# of buffer-group 3 truncated packets");

#undef SYSCTL_ADD_T4_PORTSTAT
}

/*
 * Sysctl handler that prints an int array as space-separated decimals.
 * arg1 points at the first int and arg2 is the array size in bytes.
 */
static int
sysctl_int_array(SYSCTL_HANDLER_ARGS)
{
	int rc, *i, space = 0;
	struct sbuf sb;

	sbuf_new_for_sysctl(&sb, NULL, 64, req);
	/* arg2 counts down by sizeof(int); it must be a multiple of it. */
	for (i = arg1; arg2; arg2 -= sizeof(int), i++) {
		if (space)
			sbuf_printf(&sb, " ");
		sbuf_printf(&sb, "%d", *i);
		space = 1;
	}
	rc = sbuf_finish(&sb);
	sbuf_delete(&sb);
	return (rc);
}

/*
 * Sysctl handler that prints arg2 with the "%b" format, using arg1 as the
 * bit-name decoder string.
 */
static int
sysctl_bitfield(SYSCTL_HANDLER_ARGS)
{
	int rc;
	struct sbuf *sb;

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return(rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
	if (sb == NULL)
		return (ENOMEM);

	sbuf_printf(sb, "%b", (int)arg2, (char *)arg1);
	rc = sbuf_finish(sb);
	sbuf_delete(sb);

	return (rc);
}

/*
 * Sysctl handler that reads a PHY register over MDIO inside a synchronized
 * op.  arg1 is the port; arg2 selects which register is read.
 */
static int
sysctl_btphy(SYSCTL_HANDLER_ARGS)
{
	struct port_info *pi = arg1;
	int op = arg2;
	struct adapter *sc = pi->adapter;
	u_int v;
	int rc;

	rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4btt");
	if (rc)
		return (rc);
	/* XXX: magic numbers */
	rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ?
0x20 : 0xc820, &v); end_synchronized_op(sc, 0); if (rc) return (rc); if (op == 0) v /= 256; rc = sysctl_handle_int(oidp, &v, 0, req); return (rc); } static int sysctl_noflowq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; int rc, val; val = vi->rsrv_noflowq; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if ((val >= 1) && (vi->ntxq > 1)) vi->rsrv_noflowq = 1; else vi->rsrv_noflowq = 0; return (rc); } static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int idx, rc, i; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif uint8_t v; idx = vi->tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->pktc_idx != -1); for_each_rxq(vi, i, rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&rxq->iq.intr_params, v); #else rxq->iq.intr_params = v; #endif } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&ofld_rxq->iq.intr_params, v); #else ofld_rxq->iq.intr_params = v; #endif } #endif vi->tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int idx, rc; idx = vi->pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4pktc"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int 
sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int qsize, rc; qsize = vi->qsize_rxq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || (qsize & 7)) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4rxqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_rxq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int qsize, rc; qsize = vi->qsize_txq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || qsize > 65536) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_txq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1PAUSE_RX\2PAUSE_TX"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", lc->fc & (PAUSE_TX | PAUSE_RX), bits); rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[2]; int n; s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX)); s[1] = 0; rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); if (s[1] != 0) return (EINVAL); if (s[0] < '0' || s[0] > '9') return (EINVAL); /* not a number */ n = s[0] - '0'; if (n & ~(PAUSE_TX | PAUSE_RX)) return (EINVAL); /* some other bit is set too */ 
rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4PAUSE"); if (rc) return (rc); if ((lc->requested_fc & (PAUSE_TX | PAUSE_RX)) != n) { int link_ok = lc->link_ok; lc->requested_fc &= ~(PAUSE_TX | PAUSE_RX); lc->requested_fc |= n; rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); lc->link_ok = link_ok; /* restore */ } end_synchronized_op(sc, 0); } return (rc); } static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int reg = arg2; uint64_t val; val = t4_read_reg64(sc, reg); return (sysctl_handle_64(oidp, &val, 0, req)); } static int sysctl_temperature(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, t; uint32_t param, val; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp"); if (rc) return (rc); param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); end_synchronized_op(sc, 0); if (rc) return (rc); /* unknown is returned as 0 but we display -1 in that case */ t = val == 0 ? 
-1 : val; rc = sysctl_handle_int(oidp, &t, 0, req); return (rc); } #ifdef SBUF_DRAIN static int sysctl_cctrl(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t incr[NMTUS][NCCTRL_WIN]; static const char *dec_fac[] = { "0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875", "0.9375" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); t4_read_cong_tbl(sc, incr); for (i = 0; i < NCCTRL_WIN; ++i) { sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i, incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i], incr[5][i], incr[6][i], incr[7][i]); sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n", incr[8][i], incr[9][i], incr[10][i], incr[11][i], incr[12][i], incr[13][i], incr[14][i], incr[15][i], sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = { "TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI", /* ibq's */ "ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI", /* obq's */ "SGE0-RX", "SGE1-RX" /* additional obq's (T5 onwards) */ }; static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n, qid = arg2; uint32_t *buf, *p; char *qtype; u_int cim_num_obq = sc->chip_params->cim_num_obq; KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq, ("%s: bad qid %d\n", __func__, qid)); if (qid < CIM_NUM_IBQ) { /* inbound queue */ qtype = "IBQ"; n = 4 * CIM_IBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_ibq(sc, qid, buf, n); } else { /* outbound queue */ qtype = "OBQ"; qid -= CIM_NUM_IBQ; n = 4 * cim_num_obq * CIM_OBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_obq(sc, qid, buf, n); } if (rc < 0) { rc = -rc; goto done; } n = rc * sizeof(uint32_t); /* rc has # of words actually read */ rc = 
sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]); for (i = 0, p = buf; i < n; i += 16, p += 4) sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1], p[2], p[3]); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int cfg; struct sbuf *sb; uint32_t *buf, *p; int rc; MPASS(chip_id(sc) <= CHELSIO_T5); rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_cim_read_la(sc, buf, NULL); if (rc != 0) goto done; sbuf_printf(sb, "Status Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? 
"" : " LS0Stat LS0Addr LS0Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 8]; p += 8) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x", p[5] & 0xff, p[6], p[7]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x", (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8, p[4] & 0xff, p[5] >> 8); sbuf_printf(sb, "\n %02x %x%07x %x%07x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4); } else { sbuf_printf(sb, "\n %02x %x%07x %x%07x %08x %08x " "%08x%08x%08x%08x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5], p[6], p[7]); } } rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la_t6(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int cfg; struct sbuf *sb; uint32_t *buf, *p; int rc; MPASS(chip_id(sc) > CHELSIO_T5); rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_cim_read_la(sc, buf, NULL); if (rc != 0) goto done; sbuf_printf(sb, "Status Inst Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? 
"" : " LS0Stat LS0Addr LS0Data LS1Stat LS1Addr LS1Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 10]; p += 10) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x %08x", p[3] & 0xff, p[2], p[1], p[0]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x %02x%06x", (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8, p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8); sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16); } else { sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x " "%08x %08x %08x %08x %08x %08x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16, p[2], p[1], p[0], p[5], p[4], p[3]); } } rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE); p = buf; for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCnt ID Tag UE Data RDY VLD"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%3u %2u %x %u %08x%08x %u %u", (p[2] >> 10) & 0xff, (p[2] >> 7) & 7, (p[2] >> 3) & 0xf, (p[2] >> 2) & 1, (p[1] >> 2) | ((p[2] & 3) << 30), (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1, p[0] & 1); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = 
sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL); p = buf; sbuf_printf(sb, "Cntl ID DataBE Addr Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %04x %08x %08x%08x%08x%08x", (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff, p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCntl ID Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %08x%08x%08x%08x", (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t thres[CIM_NUM_IBQ]; uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr; uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat; u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq; cim_num_obq = sc->chip_params->cim_num_obq; if (is_t4(sc)) { ibq_rdaddr = A_UP_IBQ_0_RDADDR; obq_rdaddr = A_UP_OBQ_0_REALADDR; } else { ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR; obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR; } nq = CIM_NUM_IBQ + cim_num_obq; rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat); if (rc == 0) rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr); if (rc != 0) return (rc); t4_read_cimq_cfg(sc, base, size, thres); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Queue Base Size Thres RdPtr WrPtr SOP EOP Avail"); for (i = 0; i < CIM_NUM_IBQ; i++, p += 4) sbuf_printf(sb, "\n%7s %5x %5u %5u %6x %4x %4u %4u %5u", qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]), G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), 
G_QUEREMFLITS(p[2]) * 16); for ( ; i < nq; i++, p += 4, wr += 2) sbuf_printf(sb, "\n%7s %5x %5u %12x %4x %4u %4u %5u", qname[i], base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff, wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_cpl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_cpl_stats(sc, &stats); mtx_unlock(&sc->reg_lock); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\nCPL requests: %10u %10u %10u %10u", stats.req[0], stats.req[1], stats.req[2], stats.req[3]); sbuf_printf(sb, "\nCPL responses: %10u %10u %10u %10u", stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\nCPL requests: %10u %10u", stats.req[0], stats.req[1]); sbuf_printf(sb, "\nCPL responses: %10u %10u", stats.rsp[0], stats.rsp[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_usm_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_usm_stats(sc, &stats); sbuf_printf(sb, "Frames: %u\n", stats.frames); sbuf_printf(sb, "Octets: %ju\n", stats.octets); sbuf_printf(sb, "Drops: %u", stats.drops); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static const char * const devlog_level_strings[] = { [FW_DEVLOG_LEVEL_EMERG] = "EMERG", [FW_DEVLOG_LEVEL_CRIT] = "CRIT", [FW_DEVLOG_LEVEL_ERR] = "ERR", [FW_DEVLOG_LEVEL_NOTICE] = "NOTICE", [FW_DEVLOG_LEVEL_INFO] = "INFO", [FW_DEVLOG_LEVEL_DEBUG] = "DEBUG" 
}; static const char * const devlog_facility_strings[] = { [FW_DEVLOG_FACILITY_CORE] = "CORE", [FW_DEVLOG_FACILITY_CF] = "CF", [FW_DEVLOG_FACILITY_SCHED] = "SCHED", [FW_DEVLOG_FACILITY_TIMER] = "TIMER", [FW_DEVLOG_FACILITY_RES] = "RES", [FW_DEVLOG_FACILITY_HW] = "HW", [FW_DEVLOG_FACILITY_FLR] = "FLR", [FW_DEVLOG_FACILITY_DMAQ] = "DMAQ", [FW_DEVLOG_FACILITY_PHY] = "PHY", [FW_DEVLOG_FACILITY_MAC] = "MAC", [FW_DEVLOG_FACILITY_PORT] = "PORT", [FW_DEVLOG_FACILITY_VI] = "VI", [FW_DEVLOG_FACILITY_FILTER] = "FILTER", [FW_DEVLOG_FACILITY_ACL] = "ACL", [FW_DEVLOG_FACILITY_TM] = "TM", [FW_DEVLOG_FACILITY_QFC] = "QFC", [FW_DEVLOG_FACILITY_DCB] = "DCB", [FW_DEVLOG_FACILITY_ETH] = "ETH", [FW_DEVLOG_FACILITY_OFLD] = "OFLD", [FW_DEVLOG_FACILITY_RI] = "RI", [FW_DEVLOG_FACILITY_ISCSI] = "ISCSI", [FW_DEVLOG_FACILITY_FCOE] = "FCOE", [FW_DEVLOG_FACILITY_FOISCSI] = "FOISCSI", [FW_DEVLOG_FACILITY_FOFCOE] = "FOFCOE", [FW_DEVLOG_FACILITY_CHNET] = "CHNET", }; static int sysctl_devlog(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e *buf, *e; int i, j, rc, nentries, first = 0; struct sbuf *sb; uint64_t ftstamp = UINT64_MAX; if (dparams->addr == 0) return (ENXIO); buf = malloc(dparams->size, M_CXGBE, M_NOWAIT); if (buf == NULL) return (ENOMEM); rc = read_via_memwin(sc, 1, dparams->addr, (void *)buf, dparams->size); if (rc != 0) goto done; nentries = dparams->size / sizeof(struct fw_devlog_e); for (i = 0; i < nentries; i++) { e = &buf[i]; if (e->timestamp == 0) break; /* end */ e->timestamp = be64toh(e->timestamp); e->seqno = be32toh(e->seqno); for (j = 0; j < 8; j++) e->params[j] = be32toh(e->params[j]); if (e->timestamp < ftstamp) { ftstamp = e->timestamp; first = i; } } if (buf[first].timestamp == 0) goto done; /* nothing in the log */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%10s %15s %8s 
%8s %s\n", "Seq#", "Tstamp", "Level", "Facility", "Message"); i = first; do { e = &buf[i]; if (e->timestamp == 0) break; /* end */ sbuf_printf(sb, "%10d %15ju %8s %8s ", e->seqno, e->timestamp, (e->level < nitems(devlog_level_strings) ? devlog_level_strings[e->level] : "UNKNOWN"), (e->facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e->facility] : "UNKNOWN")); sbuf_printf(sb, e->fmt, e->params[0], e->params[1], e->params[2], e->params[3], e->params[4], e->params[5], e->params[6], e->params[7]); if (++i == nentries) i = 0; } while (i != first); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_fcoe_stats stats[MAX_NCHAN]; int i, nchan = sc->chip_params->nchan; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nchan; i++) t4_get_fcoe_stats(sc, i, &stats[i]); if (nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp, stats[2].octets_ddp, stats[3].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp, stats[2].frames_ddp, stats[3].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u %16u %16u", stats[0].frames_drop, stats[1].frames_drop, stats[2].frames_drop, stats[3].frames_drop); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u", stats[0].frames_drop, stats[1].frames_drop); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf 
*sb; int rc, i; unsigned int map, kbps, ipg, mode; unsigned int pace_tab[NTX_SCHED]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP); mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG)); t4_read_pace_tbl(sc, pace_tab); sbuf_printf(sb, "Scheduler Mode Channel Rate (Kbps) " "Class IPG (0.1 ns) Flow IPG (us)"); for (i = 0; i < NTX_SCHED; ++i, map >>= 2) { t4_get_tx_sched(sc, i, &kbps, &ipg); sbuf_printf(sb, "\n %u %-5s %u ", i, (mode & (1 << i)) ? "flow" : "class", map & 3); if (kbps) sbuf_printf(sb, "%9u ", kbps); else sbuf_printf(sb, " disabled "); if (ipg) sbuf_printf(sb, "%13u ", ipg); else sbuf_printf(sb, " disabled "); if (pace_tab[i]) sbuf_printf(sb, "%10u", pace_tab[i]); else sbuf_printf(sb, " disabled"); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, j; uint64_t *p0, *p1; struct lb_port_stats s[2]; static const char *stat_name[] = { "OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:", "UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:", "Frames128To255:", "Frames256To511:", "Frames512To1023:", "Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:", "BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:", "BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:", "BG2FramesTrunc:", "BG3FramesTrunc:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); memset(s, 0, sizeof(s)); for (i = 0; i < sc->chip_params->nchan; i += 2) { t4_get_lb_stats(sc, i, &s[0]); t4_get_lb_stats(sc, i + 1, &s[1]); p0 = &s[0].octets; p1 = &s[1].octets; sbuf_printf(sb, "%s Loopback %u" " Loopback %u", i == 0 ? 
"" : "\n", i, i + 1); for (j = 0; j < nitems(stat_name); j++) sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j], *p0++, *p1++); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS) { int rc = 0; struct port_info *pi = arg1; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 64, req); if (sb == NULL) return (ENOMEM); if (pi->linkdnrc < 0) sbuf_printf(sb, "n/a"); else sbuf_printf(sb, "%s", t4_link_down_rc_str(pi->linkdnrc)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct mem_desc { unsigned int base; unsigned int limit; unsigned int idx; }; static int mem_desc_cmp(const void *a, const void *b) { return ((const struct mem_desc *)a)->base - ((const struct mem_desc *)b)->base; } static void mem_region_show(struct sbuf *sb, const char *name, unsigned int from, unsigned int to) { unsigned int size; if (from == to) return; size = to - from + 1; if (size == 0) return; /* XXX: need humanize_number(3) in libkern for a more readable 'size' */ sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size); } static int sysctl_meminfo(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; uint32_t lo, hi, used, alloc; static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"}; static const char *region[] = { "DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:", "Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:", "Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:", "TDDP region:", "TPT region:", "STAG region:", "RQ region:", "RQUDP region:", "PBL region:", "TXPBL region:", "DBVFIFO region:", "ULPRX state:", "ULPTX state:", "On-chip queues:" }; struct mem_desc avail[4]; struct mem_desc mem[nitems(region) + 3]; /* up to 3 holes */ struct mem_desc *md = mem; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return 
(ENOMEM); for (i = 0; i < nitems(mem); i++) { mem[i].limit = 0; mem[i].idx = i; } /* Find and sort the populated memory ranges */ i = 0; lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (lo & F_EDRAM0_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM0_BAR); avail[i].base = G_EDRAM0_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20); avail[i].idx = 0; i++; } if (lo & F_EDRAM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM1_BAR); avail[i].base = G_EDRAM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20); avail[i].idx = 1; i++; } if (lo & F_EXT_MEM_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); avail[i].base = G_EXT_MEM_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM_SIZE(hi) << 20); avail[i].idx = is_t5(sc) ? 3 : 2; /* Call it MC0 for T5 */ i++; } if (is_t5(sc) && lo & F_EXT_MEM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); avail[i].base = G_EXT_MEM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20); avail[i].idx = 4; i++; } if (!i) /* no memory available */ return 0; qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp); (md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR); (md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE); /* the next few have explicit upper bounds */ md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) * G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE)); md++; md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) * G_PMRXMAXPAGE(t4_read_reg(sc, 
A_TP_PMM_RX_MAX_PAGE)); md++; if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { if (chip_id(sc) <= CHELSIO_T5) md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE); else md->base = t4_read_reg(sc, A_LE_DB_HASH_TBL_BASE_ADDR); md->limit = 0; } else { md->base = 0; md->idx = nitems(region); /* hide it */ } md++; #define ulp_region(reg) \ md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\ (md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT) ulp_region(RX_ISCSI); ulp_region(RX_TDDP); ulp_region(TX_TPT); ulp_region(RX_STAG); ulp_region(RX_RQ); ulp_region(RX_RQUDP); ulp_region(RX_PBL); ulp_region(TX_PBL); #undef ulp_region md->base = 0; md->idx = nitems(region); if (!is_t4(sc)) { uint32_t size = 0; uint32_t sge_ctrl = t4_read_reg(sc, A_SGE_CONTROL2); uint32_t fifo_size = t4_read_reg(sc, A_SGE_DBVFIFO_SIZE); if (is_t5(sc)) { if (sge_ctrl & F_VFIFO_ENABLE) size = G_DBVFIFO_SIZE(fifo_size); } else size = G_T6_DBVFIFO_SIZE(fifo_size); if (size) { md->base = G_BASEADDR(t4_read_reg(sc, A_SGE_DBVFIFO_BADDR)); md->limit = md->base + (size << 2) - 1; } } md++; md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE); md->limit = 0; md++; md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE); md->limit = 0; md++; md->base = sc->vres.ocq.start; if (sc->vres.ocq.size) md->limit = md->base + sc->vres.ocq.size - 1; else md->idx = nitems(region); /* hide it */ md++; /* add any address-space holes, there can be up to 3 */ for (n = 0; n < i - 1; n++) if (avail[n].limit < avail[n + 1].base) (md++)->base = avail[n].limit; if (avail[n].limit) (md++)->base = avail[n].limit; n = md - mem; qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp); for (lo = 0; lo < i; lo++) mem_region_show(sb, memory[avail[lo].idx], avail[lo].base, avail[lo].limit - 1); sbuf_printf(sb, "\n"); for (i = 0; i < n; i++) { if (mem[i].idx >= nitems(region)) continue; /* skip holes */ if (!mem[i].limit) mem[i].limit = i < n - 1 ? 
mem[i + 1].base - 1 : ~0; mem_region_show(sb, region[mem[i].idx], mem[i].base, mem[i].limit); } sbuf_printf(sb, "\n"); lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP RAM:", lo, hi); lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP Extmem2:", lo, hi); lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE); sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n", G_PMRXMAXPAGE(lo), t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10, (lo & F_PMRXNUMCHN) ? 2 : 1); lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE); hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n", G_PMTXMAXPAGE(lo), hi >= (1 << 20) ? (hi >> 20) : (hi >> 10), hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo)); sbuf_printf(sb, "%u p-structs\n", t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT)); for (i = 0; i < 4; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_MAC_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated", i, used, alloc); } for (i = 0; i < sc->chip_params->nchan; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_LPBK_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nLoopback %d using %u pages out of %u allocated", i, used, alloc); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static inline void tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask) { *mask = x | y; y = htobe64(y); memcpy(addr, (char *)&y + 2, 
ETHER_ADDR_LEN); } static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) <= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask Vld Ports PF" " VF Replication P0 P1 P2 P3 ML"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint64_t tcamx, tcamy, mask; uint32_t cls_lo, cls_hi; uint8_t addr[ETHER_ADDR_LEN]; tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i)); tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i)); if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx" " %c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, (cls_lo & F_SRAM_VLD) ? 'Y' : 'N', G_PORTMAP(cls_hi), G_PF(cls_lo), (cls_lo & F_VF_VALID) ? 
G_VF(cls_lo) : -1); if (cls_lo & F_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mps"); if (rc) break; rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) { sbuf_printf(sb, "%36d", rc); rc = 0; } else { sbuf_printf(sb, " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%36s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo), G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo), G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) > CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask VNI Mask" " IVLAN Vld DIP_Hit Lookup Port Vld Ports PF VF" " Replication" " P0 P1 P2 P3 ML\n"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint8_t dip_hit, vlan_vld, lookup_type, port_num; uint16_t ivlan; uint64_t tcamx, tcamy, val, mask; uint32_t cls_lo, cls_hi, ctl, data2, vnix, vniy; uint8_t addr[ETHER_ADDR_LEN]; ctl = V_CTLREQID(1) | V_CTLCMDTYPE(0) | V_CTLXYBITSEL(0); if (i < 256) ctl |= V_CTLTCAMINDEX(i) | V_CTLTCAMSEL(0); else ctl |= V_CTLTCAMINDEX(i - 256) | V_CTLTCAMSEL(1); t4_write_reg(sc, 
A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamy = G_DMACH(val) << 32; tcamy |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); lookup_type = G_DATALKPTYPE(data2); port_num = G_DATAPORTNUM(data2); if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI */ vniy = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); dip_hit = data2 & F_DATADIPHIT; vlan_vld = 0; } else { vniy = 0; dip_hit = 0; vlan_vld = data2 & F_DATAVIDH2; ivlan = G_VIDL(val); } ctl |= V_CTLXYBITSEL(1); t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamx = G_DMACH(val) << 32; tcamx |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI mask */ vnix = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); } else vnix = 0; if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); if (lookup_type && lookup_type != M_DATALKPTYPE) { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx %06x %06x - - %3c" " 'I' %4x %3c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, vniy, vnix, dip_hit ? 'Y' : 'N', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } else { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx - - ", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask); if (vlan_vld) sbuf_printf(sb, "%4u Y ", ivlan); else sbuf_printf(sb, " - N "); sbuf_printf(sb, "- %3c %4x %3c %#x%4u%4d", lookup_type ? 'I' : 'O', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? 
G_T6_VF(cls_lo) : -1); } if (cls_lo & F_T6_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t6mps"); if (rc) break; rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) { sbuf_printf(sb, "%72d", rc); rc = 0; } else { sbuf_printf(sb, " %08x %08x %08x %08x" " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc255_224), be32toh(ldst_cmd.u.mps.rplc.rplc223_192), be32toh(ldst_cmd.u.mps.rplc.rplc191_160), be32toh(ldst_cmd.u.mps.rplc.rplc159_128), be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%72s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#x", G_T6_SRAM_PRIO0(cls_lo), G_T6_SRAM_PRIO1(cls_lo), G_T6_SRAM_PRIO2(cls_lo), G_T6_SRAM_PRIO3(cls_lo), (cls_lo >> S_T6_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint16_t mtus[NMTUS]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_read_mtu_tbl(sc, mtus, NULL); sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u", mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6], mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13], mtus[14], mtus[15]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS) { struct 
adapter *sc = arg1; struct sbuf *sb; int rc, i; uint32_t tx_cnt[MAX_PM_NSTATS], rx_cnt[MAX_PM_NSTATS]; uint64_t tx_cyc[MAX_PM_NSTATS], rx_cyc[MAX_PM_NSTATS]; static const char *tx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Bypass + mem:", "Tx FIFO wait", NULL, "Tx latency" }; static const char *rx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Flush:", " Rx FIFO wait", NULL, "Rx latency" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_pmtx_get_stats(sc, tx_cnt, tx_cyc); t4_pmrx_get_stats(sc, rx_cnt, rx_cyc); sbuf_printf(sb, " Tx pcmds Tx bytes"); for (i = 0; i < 4; i++) { sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); } sbuf_printf(sb, "\n Rx pcmds Rx bytes"); for (i = 0; i < 4; i++) { sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); } if (chip_id(sc) > CHELSIO_T5) { sbuf_printf(sb, "\n Total wait Total occupancy"); sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); i += 2; MPASS(i < nitems(tx_stats)); sbuf_printf(sb, "\n Reads Total wait"); sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_rdma_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_rdma_stats(sc, &stats); mtx_unlock(&sc->reg_lock); sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod); sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static 
int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tcp_stats v4, v6; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_tcp_stats(sc, &v4, &v6); mtx_unlock(&sc->reg_lock); sbuf_printf(sb, " IP IPv6\n"); sbuf_printf(sb, "OutRsts: %20u %20u\n", v4.tcp_out_rsts, v6.tcp_out_rsts); sbuf_printf(sb, "InSegs: %20ju %20ju\n", v4.tcp_in_segs, v6.tcp_in_segs); sbuf_printf(sb, "OutSegs: %20ju %20ju\n", v4.tcp_out_segs, v6.tcp_out_segs); sbuf_printf(sb, "RetransSegs: %20ju %20ju", v4.tcp_retrans_segs, v6.tcp_retrans_segs); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tids(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tid_info *t = &sc->tids; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (t->natids) { sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1, t->atids_in_use); } if (t->ntids) { if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { uint32_t b = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4; if (b) { sbuf_printf(sb, "TID range: 0-%u, %u-%u", b - 1, t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4, t->ntids - 1); } else { sbuf_printf(sb, "TID range: %u-%u", t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4, t->ntids - 1); } } else sbuf_printf(sb, "TID range: 0-%u", t->ntids - 1); sbuf_printf(sb, ", in use: %u\n", atomic_load_acq_int(&t->tids_in_use)); } if (t->nstids) { sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base, t->stid_base + t->nstids - 1, t->stids_in_use); } if (t->nftids) { sbuf_printf(sb, "FTID range: %u-%u\n", t->ftid_base, t->ftid_base + t->nftids - 1); } if (t->netids) { sbuf_printf(sb, "ETID range: %u-%u\n", t->etid_base, t->etid_base + t->netids - 1); } sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 
users", t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4), t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_err_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_err_stats(sc, &stats); mtx_unlock(&sc->reg_lock); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "macInErrs: %10u %10u %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1], stats.mac_in_errs[2], stats.mac_in_errs[3]); sbuf_printf(sb, "hdrInErrs: %10u %10u %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1], stats.hdr_in_errs[2], stats.hdr_in_errs[3]); sbuf_printf(sb, "tcpInErrs: %10u %10u %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1], stats.tcp_in_errs[2], stats.tcp_in_errs[3]); sbuf_printf(sb, "tcp6InErrs: %10u %10u %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1], stats.tcp6_in_errs[2], stats.tcp6_in_errs[3]); sbuf_printf(sb, "tnlCongDrops: %10u %10u %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1], stats.tnl_cong_drops[2], stats.tnl_cong_drops[3]); sbuf_printf(sb, "tnlTxDrops: %10u %10u %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1], stats.tnl_tx_drops[2], stats.tnl_tx_drops[3]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1], stats.ofld_vlan_drops[2], stats.ofld_vlan_drops[3]); sbuf_printf(sb, "ofldChanDrops: %10u %10u %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1], stats.ofld_chan_drops[2], stats.ofld_chan_drops[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "macInErrs: %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1]); sbuf_printf(sb, "hdrInErrs: %10u %10u\n", 
stats.hdr_in_errs[0], stats.hdr_in_errs[1]); sbuf_printf(sb, "tcpInErrs: %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1]); sbuf_printf(sb, "tcp6InErrs: %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1]); sbuf_printf(sb, "tnlCongDrops: %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1]); sbuf_printf(sb, "tnlTxDrops: %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1]); sbuf_printf(sb, "ofldChanDrops: %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1]); } sbuf_printf(sb, "ofldNoNeigh: %u\nofldCongDefer: %u", stats.ofld_no_neigh, stats.ofld_cong_defer); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tp_params *tpp = &sc->params.tp; u_int mask; int rc; mask = tpp->la_mask >> 16; rc = sysctl_handle_int(oidp, &mask, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (mask > 0xffff) return (EINVAL); tpp->la_mask = mask << 16; t4_set_reg_field(sc, A_TP_DBG_LA_CONFIG, 0xffff0000U, tpp->la_mask); return (0); } struct field_desc { const char *name; u_int start; u_int width; }; static void field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f) { char buf[32]; int line_size = 0; while (f->name) { uint64_t mask = (1ULL << f->width) - 1; int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name, ((uintmax_t)v >> f->start) & mask); if (line_size + len >= 79) { line_size = 8; sbuf_printf(sb, "\n "); } sbuf_printf(sb, "%s ", buf); line_size += len + 1; f++; } sbuf_printf(sb, "\n"); } static const struct field_desc tp_la0[] = { { "RcfOpCodeOut", 60, 4 }, { "State", 56, 4 }, { "WcfState", 52, 4 }, { "RcfOpcSrcOut", 50, 2 }, { "CRxError", 49, 1 }, { "ERxError", 48, 1 }, { "SanityFailed", 47, 1 }, { "SpuriousMsg", 46, 1 }, { "FlushInputMsg", 45, 1 }, { "FlushInputCpl", 44, 1 }, { "RssUpBit", 43, 1 }, { "RssFilterHit", 
42, 1 }, { "Tid", 32, 10 }, { "InitTcb", 31, 1 }, { "LineNumber", 24, 7 }, { "Emsg", 23, 1 }, { "EdataOut", 22, 1 }, { "Cmsg", 21, 1 }, { "CdataOut", 20, 1 }, { "EreadPdu", 19, 1 }, { "CreadPdu", 18, 1 }, { "TunnelPkt", 17, 1 }, { "RcfPeerFin", 16, 1 }, { "RcfReasonOut", 12, 4 }, { "TxCchannel", 10, 2 }, { "RcfTxChannel", 8, 2 }, { "RxEchannel", 6, 2 }, { "RcfRxChannel", 5, 1 }, { "RcfDataOutSrdy", 4, 1 }, { "RxDvld", 3, 1 }, { "RxOoDvld", 2, 1 }, { "RxCongestion", 1, 1 }, { "TxCongestion", 0, 1 }, { NULL } }; static const struct field_desc tp_la1[] = { { "CplCmdIn", 56, 8 }, { "CplCmdOut", 48, 8 }, { "ESynOut", 47, 1 }, { "EAckOut", 46, 1 }, { "EFinOut", 45, 1 }, { "ERstOut", 44, 1 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static const struct field_desc tp_la2[] = { { "CplCmdIn", 56, 8 }, { "MpsVfVld", 55, 1 }, { "MpsPf", 52, 3 }, { "MpsVf", 44, 8 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { 
"TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static void tp_la_show(struct sbuf *sb, uint64_t *p, int idx) { field_desc_show(sb, *p, tp_la0); } static void tp_la_show2(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], tp_la0); } static void tp_la_show3(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? 
tp_la2 : tp_la1); } static int sysctl_tp_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint64_t *buf, *p; int rc; u_int i, inc; void (*show_func)(struct sbuf *, uint64_t *, int); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK); t4_tp_read_la(sc, buf, NULL); p = buf; switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) { case 2: inc = 2; show_func = tp_la_show2; break; case 3: inc = 2; show_func = tp_la_show3; break; default: inc = 1; show_func = tp_la_show; } for (i = 0; i < TPLA_SIZE / inc; i++, p += inc) (*show_func)(sb, p, i); rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; u64 nrate[MAX_NCHAN], orate[MAX_NCHAN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_chan_txrate(sc, nrate, orate); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju %10ju %10ju\n", nrate[0], nrate[1], nrate[2], nrate[3]); sbuf_printf(sb, "Offload B/s: %10ju %10ju %10ju %10ju", orate[0], orate[1], orate[2], orate[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju\n", nrate[0], nrate[1]); sbuf_printf(sb, "Offload B/s: %10ju %10ju", orate[0], orate[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint32_t *buf, *p; int rc, i; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE, M_ZERO | 
M_WAITOK); t4_ulprx_read_la(sc, buf); p = buf; sbuf_printf(sb, " Pcmd Type Message" " Data"); for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) { sbuf_printf(sb, "\n%08x%08x %4x %08x %08x%08x%08x%08x", p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, v; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); v = t4_read_reg(sc, A_SGE_STAT_CFG); if (G_STATSOURCE_T5(v) == 7) { if (G_STATMODE(v) == 0) { sbuf_printf(sb, "total %d, incomplete %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } else if (G_STATMODE(v) == 1) { sbuf_printf(sb, "total %d, data overflow %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tc_params(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tx_sched_class *tc; struct t4_sched_class_params p; struct sbuf *sb; int i, rc, port_id, flags, mbps, gbps; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); port_id = arg2 >> 16; MPASS(port_id < sc->params.nports); MPASS(sc->port[port_id] != NULL); i = arg2 & 0xffff; MPASS(i < sc->chip_params->nsched_cls); tc = &sc->port[port_id]->tc[i]; rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tc_p"); if (rc) goto done; flags = tc->flags; p = tc->params; end_synchronized_op(sc, LOCK_HELD); if ((flags & TX_SC_OK) == 0) { sbuf_printf(sb, "none"); goto done; } if (p.level == SCHED_CLASS_LEVEL_CL_WRR) { sbuf_printf(sb, "cl-wrr weight %u", p.weight); goto done; } else if (p.level == SCHED_CLASS_LEVEL_CL_RL) sbuf_printf(sb, "cl-rl"); else if (p.level == SCHED_CLASS_LEVEL_CH_RL) sbuf_printf(sb, "ch-rl"); else 
{ rc = ENXIO; goto done; } if (p.ratemode == SCHED_CLASS_RATEMODE_REL) { /* XXX: top speed or actual link speed? */ gbps = port_top_speed(sc->port[port_id]); sbuf_printf(sb, " %u%% of %uGbps", p.maxrate, gbps); } else if (p.ratemode == SCHED_CLASS_RATEMODE_ABS) { switch (p.rateunit) { case SCHED_CLASS_RATEUNIT_BITS: mbps = p.maxrate / 1000; gbps = p.maxrate / 1000000; if (p.maxrate == gbps * 1000000) sbuf_printf(sb, " %uGbps", gbps); else if (p.maxrate == mbps * 1000) sbuf_printf(sb, " %uMbps", mbps); else sbuf_printf(sb, " %uKbps", p.maxrate); break; case SCHED_CLASS_RATEUNIT_PKTS: sbuf_printf(sb, " %upps", p.maxrate); break; default: rc = ENXIO; goto done; } } switch (p.mode) { case SCHED_CLASS_MODE_CLASS: sbuf_printf(sb, " aggregate"); break; case SCHED_CLASS_MODE_FLOW: sbuf_printf(sb, " per-flow"); break; default: rc = ENXIO; goto done; } done: if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } #endif #ifdef TCP_OFFLOAD static void unit_conv(char *buf, size_t len, u_int val, u_int factor) { u_int rem = val % factor; if (rem == 0) snprintf(buf, len, "%u", val / factor); else { while (rem % 10 == 0) rem /= 10; snprintf(buf, len, "%u.%u", val / factor, rem); } } static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; char buf[16]; u_int res, re; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION); switch (arg2) { case 0: /* timer_tick */ re = G_TIMERRESOLUTION(res); break; case 1: /* TCP timestamp tick */ re = G_TIMESTAMPRESOLUTION(res); break; case 2: /* DACK tick */ re = G_DELAYEDACKRESOLUTION(res); break; default: return (EDOOFUS); } unit_conv(buf, sizeof(buf), (cclk_ps << re), 1000000); return (sysctl_handle_string(oidp, buf, sizeof(buf), req)); } static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int res, dack_re, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION); dack_re = G_DELAYEDACKRESOLUTION(res); 
v = ((cclk_ps << dack_re) / 1000000) * t4_read_reg(sc, A_TP_DACK_TIMER); return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int reg = arg2; u_int tre; u_long tp_tick_us, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; MPASS(reg == A_TP_RXT_MIN || reg == A_TP_RXT_MAX || reg == A_TP_PERS_MIN || reg == A_TP_PERS_MAX || reg == A_TP_KEEP_IDLE || A_TP_KEEP_INTVL || reg == A_TP_INIT_SRTT || reg == A_TP_FINWAIT2_TIMER); tre = G_TIMERRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION)); tp_tick_us = (cclk_ps << tre) / 1000000; if (reg == A_TP_INIT_SRTT) v = tp_tick_us * G_INITSRTT(t4_read_reg(sc, reg)); else v = tp_tick_us * t4_read_reg(sc, reg); return (sysctl_handle_long(oidp, &v, 0, req)); } #endif static uint32_t fconf_iconf_to_mode(uint32_t fconf, uint32_t iconf) { uint32_t mode; mode = T4_FILTER_IPv4 | T4_FILTER_IPv6 | T4_FILTER_IP_SADDR | T4_FILTER_IP_DADDR | T4_FILTER_IP_SPORT | T4_FILTER_IP_DPORT; if (fconf & F_FRAGMENTATION) mode |= T4_FILTER_IP_FRAGMENT; if (fconf & F_MPSHITTYPE) mode |= T4_FILTER_MPS_HIT_TYPE; if (fconf & F_MACMATCH) mode |= T4_FILTER_MAC_IDX; if (fconf & F_ETHERTYPE) mode |= T4_FILTER_ETH_TYPE; if (fconf & F_PROTOCOL) mode |= T4_FILTER_IP_PROTO; if (fconf & F_TOS) mode |= T4_FILTER_IP_TOS; if (fconf & F_VLAN) mode |= T4_FILTER_VLAN; if (fconf & F_VNIC_ID) { mode |= T4_FILTER_VNIC; if (iconf & F_VNIC) mode |= T4_FILTER_IC_VNIC; } if (fconf & F_PORT) mode |= T4_FILTER_PORT; if (fconf & F_FCOE) mode |= T4_FILTER_FCoE; return (mode); } static uint32_t mode_to_fconf(uint32_t mode) { uint32_t fconf = 0; if (mode & T4_FILTER_IP_FRAGMENT) fconf |= F_FRAGMENTATION; if (mode & T4_FILTER_MPS_HIT_TYPE) fconf |= F_MPSHITTYPE; if (mode & T4_FILTER_MAC_IDX) fconf |= F_MACMATCH; if (mode & T4_FILTER_ETH_TYPE) fconf |= F_ETHERTYPE; if (mode & T4_FILTER_IP_PROTO) fconf |= F_PROTOCOL; if (mode & T4_FILTER_IP_TOS) fconf |= F_TOS; if (mode & T4_FILTER_VLAN) fconf |= F_VLAN; if (mode & 
T4_FILTER_VNIC) fconf |= F_VNIC_ID; if (mode & T4_FILTER_PORT) fconf |= F_PORT; if (mode & T4_FILTER_FCoE) fconf |= F_FCOE; return (fconf); } static uint32_t mode_to_iconf(uint32_t mode) { if (mode & T4_FILTER_IC_VNIC) return (F_VNIC); return (0); } static int check_fspec_against_fconf_iconf(struct adapter *sc, struct t4_filter_specification *fs) { struct tp_params *tpp = &sc->params.tp; uint32_t fconf = 0; if (fs->val.frag || fs->mask.frag) fconf |= F_FRAGMENTATION; if (fs->val.matchtype || fs->mask.matchtype) fconf |= F_MPSHITTYPE; if (fs->val.macidx || fs->mask.macidx) fconf |= F_MACMATCH; if (fs->val.ethtype || fs->mask.ethtype) fconf |= F_ETHERTYPE; if (fs->val.proto || fs->mask.proto) fconf |= F_PROTOCOL; if (fs->val.tos || fs->mask.tos) fconf |= F_TOS; if (fs->val.vlan_vld || fs->mask.vlan_vld) fconf |= F_VLAN; if (fs->val.ovlan_vld || fs->mask.ovlan_vld) { fconf |= F_VNIC_ID; if (tpp->ingress_config & F_VNIC) return (EINVAL); } if (fs->val.pfvf_vld || fs->mask.pfvf_vld) { fconf |= F_VNIC_ID; if ((tpp->ingress_config & F_VNIC) == 0) return (EINVAL); } if (fs->val.iport || fs->mask.iport) fconf |= F_PORT; if (fs->val.fcoe || fs->mask.fcoe) fconf |= F_FCOE; if ((tpp->vlan_pri_map | fconf) != tpp->vlan_pri_map) return (E2BIG); return (0); } static int get_filter_mode(struct adapter *sc, uint32_t *mode) { struct tp_params *tpp = &sc->params.tp; /* * We trust the cached values of the relevant TP registers. This means * things work reliably only if writes to those registers are always via * t4_set_filter_mode. */ *mode = fconf_iconf_to_mode(tpp->vlan_pri_map, tpp->ingress_config); return (0); } static int set_filter_mode(struct adapter *sc, uint32_t mode) { struct tp_params *tpp = &sc->params.tp; uint32_t fconf, iconf; int rc; iconf = mode_to_iconf(mode); if ((iconf ^ tpp->ingress_config) & F_VNIC) { /* * For now we just complain if A_TP_INGRESS_CONFIG is not * already set to the correct value for the requested filter * mode. 
It's not clear if it's safe to write to this register * on the fly. (And we trust the cached value of the register). */ return (EBUSY); } fconf = mode_to_fconf(mode); rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4setfm"); if (rc) return (rc); if (sc->tids.ftids_in_use > 0) { rc = EBUSY; goto done; } #ifdef TCP_OFFLOAD if (uld_active(sc, ULD_TOM)) { rc = EBUSY; goto done; } #endif rc = -t4_set_filter_mode(sc, fconf); done: end_synchronized_op(sc, LOCK_HELD); return (rc); } static inline uint64_t get_filter_hits(struct adapter *sc, uint32_t fid) { uint32_t tcb_addr; tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + (fid + sc->tids.ftid_base) * TCB_SIZE; if (is_t4(sc)) { uint64_t hits; read_via_memwin(sc, 0, tcb_addr + 16, (uint32_t *)&hits, 8); return (be64toh(hits)); } else { uint32_t hits; read_via_memwin(sc, 0, tcb_addr + 24, &hits, 4); return (be32toh(hits)); } } static int get_filter(struct adapter *sc, struct t4_filter *t) { int i, rc, nfilters = sc->tids.nftids; struct filter_entry *f; rc = begin_synchronized_op(sc, NULL, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4getf"); if (rc) return (rc); if (sc->tids.ftids_in_use == 0 || sc->tids.ftid_tab == NULL || t->idx >= nfilters) { t->idx = 0xffffffff; goto done; } f = &sc->tids.ftid_tab[t->idx]; for (i = t->idx; i < nfilters; i++, f++) { if (f->valid) { t->idx = i; t->l2tidx = f->l2t ? 
f->l2t->idx : 0; t->smtidx = f->smtidx; if (f->fs.hitcnts) t->hits = get_filter_hits(sc, t->idx); else t->hits = UINT64_MAX; t->fs = f->fs; goto done; } } t->idx = 0xffffffff; done: end_synchronized_op(sc, LOCK_HELD); return (0); } static int set_filter(struct adapter *sc, struct t4_filter *t) { unsigned int nfilters, nports; struct filter_entry *f; int i, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setf"); if (rc) return (rc); nfilters = sc->tids.nftids; nports = sc->params.nports; if (nfilters == 0) { rc = ENOTSUP; goto done; } if (t->idx >= nfilters) { rc = EINVAL; goto done; } /* Validate against the global filter mode and ingress config */ rc = check_fspec_against_fconf_iconf(sc, &t->fs); if (rc != 0) goto done; if (t->fs.action == FILTER_SWITCH && t->fs.eport >= nports) { rc = EINVAL; goto done; } if (t->fs.val.iport >= nports) { rc = EINVAL; goto done; } /* Can't specify an iq if not steering to it */ if (!t->fs.dirsteer && t->fs.iq) { rc = EINVAL; goto done; } /* IPv6 filter idx must be 4 aligned */ if (t->fs.type == 1 && ((t->idx & 0x3) || t->idx + 4 >= nfilters)) { rc = EINVAL; goto done; } if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_full_init(sc)) != 0)) goto done; if (sc->tids.ftid_tab == NULL) { KASSERT(sc->tids.ftids_in_use == 0, ("%s: no memory allocated but filters_in_use > 0", __func__)); sc->tids.ftid_tab = malloc(sizeof (struct filter_entry) * nfilters, M_CXGBE, M_NOWAIT | M_ZERO); if (sc->tids.ftid_tab == NULL) { rc = ENOMEM; goto done; } mtx_init(&sc->tids.ftid_lock, "T4 filters", 0, MTX_DEF); } for (i = 0; i < 4; i++) { f = &sc->tids.ftid_tab[t->idx + i]; if (f->pending || f->valid) { rc = EBUSY; goto done; } if (f->locked) { rc = EPERM; goto done; } if (t->fs.type == 0) break; } f = &sc->tids.ftid_tab[t->idx]; f->fs = t->fs; rc = set_filter_wr(sc, t->idx); done: end_synchronized_op(sc, 0); if (rc == 0) { mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? 
0 : EIO; break; } if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock, PCATCH, "t4setfw", 0)) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); } return (rc); } static int del_filter(struct adapter *sc, struct t4_filter *t) { unsigned int nfilters; struct filter_entry *f; int rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4delf"); if (rc) return (rc); nfilters = sc->tids.nftids; if (nfilters == 0) { rc = ENOTSUP; goto done; } if (sc->tids.ftid_tab == NULL || sc->tids.ftids_in_use == 0 || t->idx >= nfilters) { rc = EINVAL; goto done; } if (!(sc->flags & FULL_INIT_DONE)) { rc = EAGAIN; goto done; } f = &sc->tids.ftid_tab[t->idx]; if (f->pending) { rc = EBUSY; goto done; } if (f->locked) { rc = EPERM; goto done; } if (f->valid) { t->fs = f->fs; /* extra info for the caller */ rc = del_filter_wr(sc, t->idx); } done: end_synchronized_op(sc, 0); if (rc == 0) { mtx_lock(&sc->tids.ftid_lock); for (;;) { if (f->pending == 0) { rc = f->valid ? EIO : 0; break; } if (mtx_sleep(&sc->tids.ftid_tab, &sc->tids.ftid_lock, PCATCH, "t4delfw", 0)) { rc = EINPROGRESS; break; } } mtx_unlock(&sc->tids.ftid_lock); } return (rc); } static void clear_filter(struct filter_entry *f) { if (f->l2t) t4_l2t_release(f->l2t); bzero(f, sizeof (*f)); } static int set_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; struct fw_filter_wr *fwr; unsigned int ftid, vnic_vld, vnic_vld_mask; struct wrq_cookie cookie; ASSERT_SYNCHRONIZED_OP(sc); if (f->fs.newdmac || f->fs.newvlan) { /* This filter needs an L2T entry; allocate one. 
*/ f->l2t = t4_l2t_alloc_switching(sc->l2t); if (f->l2t == NULL) return (EAGAIN); if (t4_l2t_set_switching(sc, f->l2t, f->fs.vlan, f->fs.eport, f->fs.dmac)) { t4_l2t_release(f->l2t); f->l2t = NULL; return (ENOMEM); } } /* Already validated against fconf, iconf */ MPASS((f->fs.val.pfvf_vld & f->fs.val.ovlan_vld) == 0); MPASS((f->fs.mask.pfvf_vld & f->fs.mask.ovlan_vld) == 0); if (f->fs.val.pfvf_vld || f->fs.val.ovlan_vld) vnic_vld = 1; else vnic_vld = 0; if (f->fs.mask.pfvf_vld || f->fs.mask.ovlan_vld) vnic_vld_mask = 1; else vnic_vld_mask = 0; ftid = sc->tids.ftid_base + fidx; fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie); if (fwr == NULL) return (ENOMEM); bzero(fwr, sizeof(*fwr)); fwr->op_pkd = htobe32(V_FW_WR_OP(FW_FILTER_WR)); fwr->len16_pkd = htobe32(FW_LEN16(*fwr)); fwr->tid_to_iq = htobe32(V_FW_FILTER_WR_TID(ftid) | V_FW_FILTER_WR_RQTYPE(f->fs.type) | V_FW_FILTER_WR_NOREPLY(0) | V_FW_FILTER_WR_IQ(f->fs.iq)); fwr->del_filter_to_l2tix = htobe32(V_FW_FILTER_WR_RPTTID(f->fs.rpttid) | V_FW_FILTER_WR_DROP(f->fs.action == FILTER_DROP) | V_FW_FILTER_WR_DIRSTEER(f->fs.dirsteer) | V_FW_FILTER_WR_MASKHASH(f->fs.maskhash) | V_FW_FILTER_WR_DIRSTEERHASH(f->fs.dirsteerhash) | V_FW_FILTER_WR_LPBK(f->fs.action == FILTER_SWITCH) | V_FW_FILTER_WR_DMAC(f->fs.newdmac) | V_FW_FILTER_WR_SMAC(f->fs.newsmac) | V_FW_FILTER_WR_INSVLAN(f->fs.newvlan == VLAN_INSERT || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_RMVLAN(f->fs.newvlan == VLAN_REMOVE || f->fs.newvlan == VLAN_REWRITE) | V_FW_FILTER_WR_HITCNTS(f->fs.hitcnts) | V_FW_FILTER_WR_TXCHAN(f->fs.eport) | V_FW_FILTER_WR_PRIO(f->fs.prio) | V_FW_FILTER_WR_L2TIX(f->l2t ? 
f->l2t->idx : 0)); fwr->ethtype = htobe16(f->fs.val.ethtype); fwr->ethtypem = htobe16(f->fs.mask.ethtype); fwr->frag_to_ovlan_vldm = (V_FW_FILTER_WR_FRAG(f->fs.val.frag) | V_FW_FILTER_WR_FRAGM(f->fs.mask.frag) | V_FW_FILTER_WR_IVLAN_VLD(f->fs.val.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLD(vnic_vld) | V_FW_FILTER_WR_IVLAN_VLDM(f->fs.mask.vlan_vld) | V_FW_FILTER_WR_OVLAN_VLDM(vnic_vld_mask)); fwr->smac_sel = 0; fwr->rx_chan_rx_rpl_iq = htobe16(V_FW_FILTER_WR_RX_CHAN(0) | V_FW_FILTER_WR_RX_RPL_IQ(sc->sge.fwq.abs_id)); fwr->maci_to_matchtypem = htobe32(V_FW_FILTER_WR_MACI(f->fs.val.macidx) | V_FW_FILTER_WR_MACIM(f->fs.mask.macidx) | V_FW_FILTER_WR_FCOE(f->fs.val.fcoe) | V_FW_FILTER_WR_FCOEM(f->fs.mask.fcoe) | V_FW_FILTER_WR_PORT(f->fs.val.iport) | V_FW_FILTER_WR_PORTM(f->fs.mask.iport) | V_FW_FILTER_WR_MATCHTYPE(f->fs.val.matchtype) | V_FW_FILTER_WR_MATCHTYPEM(f->fs.mask.matchtype)); fwr->ptcl = f->fs.val.proto; fwr->ptclm = f->fs.mask.proto; fwr->ttyp = f->fs.val.tos; fwr->ttypm = f->fs.mask.tos; fwr->ivlan = htobe16(f->fs.val.vlan); fwr->ivlanm = htobe16(f->fs.mask.vlan); fwr->ovlan = htobe16(f->fs.val.vnic); fwr->ovlanm = htobe16(f->fs.mask.vnic); bcopy(f->fs.val.dip, fwr->lip, sizeof (fwr->lip)); bcopy(f->fs.mask.dip, fwr->lipm, sizeof (fwr->lipm)); bcopy(f->fs.val.sip, fwr->fip, sizeof (fwr->fip)); bcopy(f->fs.mask.sip, fwr->fipm, sizeof (fwr->fipm)); fwr->lp = htobe16(f->fs.val.dport); fwr->lpm = htobe16(f->fs.mask.dport); fwr->fp = htobe16(f->fs.val.sport); fwr->fpm = htobe16(f->fs.mask.sport); if (f->fs.newsmac) bcopy(f->fs.smac, fwr->sma, sizeof (fwr->sma)); f->pending = 1; sc->tids.ftids_in_use++; commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie); return (0); } static int del_filter_wr(struct adapter *sc, int fidx) { struct filter_entry *f = &sc->tids.ftid_tab[fidx]; struct fw_filter_wr *fwr; unsigned int ftid; struct wrq_cookie cookie; ftid = sc->tids.ftid_base + fidx; fwr = start_wrq_wr(&sc->sge.mgmtq, howmany(sizeof(*fwr), 16), &cookie); if (fwr == NULL) return 
(ENOMEM); bzero(fwr, sizeof (*fwr)); t4_mk_filtdelwr(ftid, fwr, sc->sge.fwq.abs_id); f->pending = 1; commit_wrq_wr(&sc->sge.mgmtq, fwr, &cookie); return (0); } int t4_filter_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_set_tcb_rpl *rpl = (const void *)(rss + 1); unsigned int idx = GET_TID(rpl); unsigned int rc; struct filter_entry *f; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); MPASS(iq == &sc->sge.fwq); MPASS(is_ftid(sc, idx)); idx -= sc->tids.ftid_base; f = &sc->tids.ftid_tab[idx]; rc = G_COOKIE(rpl->cookie); mtx_lock(&sc->tids.ftid_lock); if (rc == FW_FILTER_WR_FLT_ADDED) { KASSERT(f->pending, ("%s: filter[%u] isn't pending.", __func__, idx)); f->smtidx = (be64toh(rpl->oldval) >> 24) & 0xff; f->pending = 0; /* asynchronous setup completed */ f->valid = 1; } else { if (rc != FW_FILTER_WR_FLT_DELETED) { /* Add or delete failed, display an error */ log(LOG_ERR, "filter %u setup failed with error %u\n", idx, rc); } clear_filter(f); sc->tids.ftids_in_use--; } wakeup(&sc->tids.ftid_tab); mtx_unlock(&sc->tids.ftid_lock); return (0); } static int set_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { MPASS(iq->set_tcb_rpl != NULL); return (iq->set_tcb_rpl(iq, rss, m)); } static int l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { MPASS(iq->l2t_write_rpl != NULL); return (iq->l2t_write_rpl(iq, rss, m)); } static int get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt) { int rc; if (cntxt->cid > M_CTXTQID) return (EINVAL); if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS && cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt"); if (rc) return (rc); if (sc->flags & FW_OK) { rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); if (rc == 0) goto done; } /* * Read via 
firmware failed or wasn't even attempted. Read directly via * the backdoor. */ rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); done: end_synchronized_op(sc, 0); return (rc); } static int load_fw(struct adapter *sc, struct t4_data *fw) { int rc; uint8_t *fw_data; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw"); if (rc) return (rc); if (sc->flags & FULL_INIT_DONE) { rc = EBUSY; goto done; } fw_data = malloc(fw->len, M_CXGBE, M_WAITOK); if (fw_data == NULL) { rc = ENOMEM; goto done; } rc = copyin(fw->data, fw_data, fw->len); if (rc == 0) rc = -t4_load_fw(sc, fw_data, fw->len); free(fw_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } #define MAX_READ_BUF_SIZE (128 * 1024) static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) { uint32_t addr, remaining, n; uint32_t *buf; int rc; uint8_t *dst; rc = validate_mem_range(sc, mr->addr, mr->len); if (rc != 0) return (rc); buf = malloc(min(mr->len, MAX_READ_BUF_SIZE), M_CXGBE, M_WAITOK); addr = mr->addr; remaining = mr->len; dst = (void *)mr->data; while (remaining) { n = min(remaining, MAX_READ_BUF_SIZE); read_via_memwin(sc, 2, addr, buf, n); rc = copyout(buf, dst, n); if (rc != 0) break; dst += n; remaining -= n; addr += n; } free(buf, M_CXGBE); return (rc); } #undef MAX_READ_BUF_SIZE static int read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd) { int rc; if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports) return (EINVAL); if (i2cd->len > sizeof(i2cd->data)) return (EFBIG); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr, i2cd->offset, i2cd->len, &i2cd->data[0]); end_synchronized_op(sc, 0); return (rc); } static int in_range(int val, int lo, int hi) { return (val < 0 || (val <= hi && val >= lo)); } static int set_sched_class_config(struct adapter *sc, int minmax) { int rc; if (minmax < 0) return (EINVAL); rc = 
begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc"); if (rc) return (rc); rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1); end_synchronized_op(sc, 0); return (rc); } static int set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p, int sleep_ok) { int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode; struct port_info *pi; struct tx_sched_class *tc; if (p->level == SCHED_CLASS_LEVEL_CL_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL; else if (p->level == SCHED_CLASS_LEVEL_CL_WRR) fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR; else if (p->level == SCHED_CLASS_LEVEL_CH_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL; else return (EINVAL); if (p->mode == SCHED_CLASS_MODE_CLASS) fw_mode = FW_SCHED_PARAMS_MODE_CLASS; else if (p->mode == SCHED_CLASS_MODE_FLOW) fw_mode = FW_SCHED_PARAMS_MODE_FLOW; else return (EINVAL); if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE; else return (EINVAL); if (p->ratemode == SCHED_CLASS_RATEMODE_REL) fw_ratemode = FW_SCHED_PARAMS_RATE_REL; else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) fw_ratemode = FW_SCHED_PARAMS_RATE_ABS; else return (EINVAL); /* Vet our parameters ... */ if (!in_range(p->channel, 0, sc->chip_params->nchan - 1)) return (ERANGE); pi = sc->port[sc->chan_map[p->channel]]; if (pi == NULL) return (ENXIO); MPASS(pi->tx_chan == p->channel); top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */ if (!in_range(p->cl, 0, sc->chip_params->nsched_cls) || !in_range(p->minrate, 0, top_speed) || !in_range(p->maxrate, 0, top_speed) || !in_range(p->weight, 0, 100)) return (ERANGE); /* * Translate any unset parameters into the firmware's * nomenclature and/or fail the call if the parameters * are required ... 
*/ if (p->rateunit < 0 || p->ratemode < 0 || p->channel < 0 || p->cl < 0) return (EINVAL); if (p->minrate < 0) p->minrate = 0; if (p->maxrate < 0) { if (p->level == SCHED_CLASS_LEVEL_CL_RL || p->level == SCHED_CLASS_LEVEL_CH_RL) return (EINVAL); else p->maxrate = 0; } if (p->weight < 0) { if (p->level == SCHED_CLASS_LEVEL_CL_WRR) return (EINVAL); else p->weight = 0; } if (p->pktsize < 0) { if (p->level == SCHED_CLASS_LEVEL_CL_RL || p->level == SCHED_CLASS_LEVEL_CH_RL) return (EINVAL); else p->pktsize = 0; } rc = begin_synchronized_op(sc, NULL, sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp"); if (rc) return (rc); tc = &pi->tc[p->cl]; tc->params = *p; rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level, fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl, p->minrate, p->maxrate, p->weight, p->pktsize, sleep_ok); if (rc == 0) tc->flags |= TX_SC_OK; else { /* * Unknown state at this point, see tc->params for what was * attempted. */ tc->flags &= ~TX_SC_OK; } end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD); return (rc); } static int set_sched_class(struct adapter *sc, struct t4_sched_params *p) { if (p->type != SCHED_CLASS_TYPE_PACKET) return (EINVAL); if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG) return (set_sched_class_config(sc, p->u.config.minmax)); if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS) return (set_sched_class_params(sc, &p->u.params, 1)); return (EINVAL); } static int set_sched_queue(struct adapter *sc, struct t4_sched_queue *p) { struct port_info *pi = NULL; struct vi_info *vi; struct sge_txq *txq; uint32_t fw_mnem, fw_queue, fw_class; int i, rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4setsq"); if (rc) return (rc); if (p->port >= sc->params.nports) { rc = EINVAL; goto done; } /* XXX: Only supported for the main VI. 
*/ pi = sc->port[p->port]; vi = &pi->vi[0]; if (!(vi->flags & VI_INIT_DONE)) { /* tx queues not set up yet */ rc = EAGAIN; goto done; } if (!in_range(p->queue, 0, vi->ntxq - 1) || !in_range(p->cl, 0, sc->chip_params->nsched_cls - 1)) { rc = EINVAL; goto done; } /* * Create a template for the FW_PARAMS_CMD mnemonic and value (TX * Scheduling Class in this case). */ fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH)); fw_class = p->cl < 0 ? 0xffffffff : p->cl; /* * If op.queue is non-negative, then we're only changing the scheduling * on a single specified TX queue. */ if (p->queue >= 0) { txq = &sc->sge.txq[vi->first_txq + p->queue]; fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); goto done; } /* * Change the scheduling on all the TX queues for the * interface. */ for_each_txq(vi, i, txq) { fw_queue = (fw_mnem | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); if (rc) goto done; } rc = 0; done: end_synchronized_op(sc, 0); return (rc); } int t4_os_find_pci_capability(struct adapter *sc, int cap) { int i; return (pci_find_cap(sc->dev, cap, &i) == 0 ? 
i : 0); } int t4_os_pci_save_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); return (0); } int t4_os_pci_restore_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); return (0); } void t4_os_portmod_changed(const struct adapter *sc, int idx) { struct port_info *pi = sc->port[idx]; struct vi_info *vi; struct ifnet *ifp; int v; static const char *mod_str[] = { NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; for_each_vi(pi, v, vi) { build_medialist(pi, &vi->media); } ifp = pi->vi[0].ifp; if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) if_printf(ifp, "transceiver unplugged.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN) if_printf(ifp, "unknown transceiver inserted.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED) if_printf(ifp, "unsupported transceiver inserted.\n"); else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) { if_printf(ifp, "%s transceiver inserted.\n", mod_str[pi->mod_type]); } else { if_printf(ifp, "transceiver (type %d) inserted.\n", pi->mod_type); } } void t4_os_link_changed(struct adapter *sc, int idx, int link_stat, int reason) { struct port_info *pi = sc->port[idx]; struct vi_info *vi; struct ifnet *ifp; int v; if (link_stat) pi->linkdnrc = -1; else { if (reason >= 0) pi->linkdnrc = reason; } for_each_vi(pi, v, vi) { ifp = vi->ifp; if (ifp == NULL) continue; if (link_stat) { ifp->if_baudrate = IF_Mbps(pi->link_cfg.speed); if_link_state_change(ifp, LINK_STATE_UP); } else { if_link_state_change(ifp, LINK_STATE_DOWN); } } } void t4_iterate(void (*func)(struct adapter *, void *), void *arg) { struct adapter *sc; sx_slock(&t4_list_lock); SLIST_FOREACH(sc, &t4_list, link) { /* * func should not make any assumptions about what state sc is * in - the only guarantee is that sc->sc_lock is a valid lock. 
*/ func(sc, arg); } sx_sunlock(&t4_list_lock); } static int t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { int rc; struct adapter *sc = dev->si_drv1; rc = priv_check(td, PRIV_DRIVER); if (rc != 0) return (rc); switch (cmd) { case CHELSIO_T4_GETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) edata->val = t4_read_reg(sc, edata->addr); else if (edata->size == 8) edata->val = t4_read_reg64(sc, edata->addr); else return (EINVAL); break; } case CHELSIO_T4_SETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) { if (edata->val & 0xffffffff00000000) return (EINVAL); t4_write_reg(sc, edata->addr, (uint32_t) edata->val); } else if (edata->size == 8) t4_write_reg64(sc, edata->addr, edata->val); else return (EINVAL); break; } case CHELSIO_T4_REGDUMP: { struct t4_regdump *regs = (struct t4_regdump *)data; int reglen = is_t4(sc) ? 
T4_REGDUMP_SIZE : T5_REGDUMP_SIZE; uint8_t *buf; if (regs->len < reglen) { regs->len = reglen; /* hint to the caller */ return (ENOBUFS); } regs->len = reglen; buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO); get_regs(sc, regs, buf); rc = copyout(buf, regs->data, reglen); free(buf, M_CXGBE); break; } case CHELSIO_T4_GET_FILTER_MODE: rc = get_filter_mode(sc, (uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MODE: rc = set_filter_mode(sc, *(uint32_t *)data); break; case CHELSIO_T4_GET_FILTER: rc = get_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_SET_FILTER: rc = set_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_DEL_FILTER: rc = del_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_GET_SGE_CONTEXT: rc = get_sge_context(sc, (struct t4_sge_context *)data); break; case CHELSIO_T4_LOAD_FW: rc = load_fw(sc, (struct t4_data *)data); break; case CHELSIO_T4_GET_MEM: rc = read_card_mem(sc, 2, (struct t4_mem_range *)data); break; case CHELSIO_T4_GET_I2C: rc = read_i2c(sc, (struct t4_i2c_data *)data); break; case CHELSIO_T4_CLEAR_STATS: { int i, v; u_int port_id = *(uint32_t *)data; struct port_info *pi; struct vi_info *vi; if (port_id >= sc->params.nports) return (EINVAL); pi = sc->port[port_id]; /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); pi->tx_parse_error = 0; mtx_lock(&sc->reg_lock); for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) t4_clr_vi_stats(sc, vi->viid); } mtx_unlock(&sc->reg_lock); /* * Since this command accepts a port, clear stats for * all VIs on this port. 
*/ for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) { struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; for_each_rxq(vi, i, rxq) { #if defined(INET) || defined(INET6) rxq->lro.lro_queued = 0; rxq->lro.lro_flushed = 0; #endif rxq->rxcsum = 0; rxq->vlan_extraction = 0; } for_each_txq(vi, i, txq) { txq->txcsum = 0; txq->tso_wrs = 0; txq->vlan_insertion = 0; txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; txq->txpkts0_wrs = 0; txq->txpkts1_wrs = 0; txq->txpkts0_pkts = 0; txq->txpkts1_pkts = 0; mp_ring_reset_stats(txq->r); } #ifdef TCP_OFFLOAD /* nothing to clear for each ofld_rxq */ for_each_ofld_txq(vi, i, wrq) { wrq->tx_wrs_direct = 0; wrq->tx_wrs_copied = 0; } #endif if (IS_MAIN_VI(vi)) { wrq = &sc->sge.ctrlq[pi->port_id]; wrq->tx_wrs_direct = 0; wrq->tx_wrs_copied = 0; } } } break; } case CHELSIO_T4_SCHED_CLASS: rc = set_sched_class(sc, (struct t4_sched_params *)data); break; case CHELSIO_T4_SCHED_QUEUE: rc = set_sched_queue(sc, (struct t4_sched_queue *)data); break; case CHELSIO_T4_GET_TRACER: rc = t4_get_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_SET_TRACER: rc = t4_set_tracer(sc, (struct t4_tracer *)data); break; default: rc = ENOTTY; } return (rc); } void t4_db_full(struct adapter *sc) { CXGBE_UNIMPLEMENTED(__func__); } void t4_db_dropped(struct adapter *sc) { CXGBE_UNIMPLEMENTED(__func__); } #ifdef TCP_OFFLOAD void t4_iscsi_init(struct adapter *sc, u_int tag_mask, const u_int *pgsz_order) { t4_write_reg(sc, A_ULP_RX_ISCSI_TAGMASK, tag_mask); t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, V_HPZ0(pgsz_order[0]) | V_HPZ1(pgsz_order[1]) | V_HPZ2(pgsz_order[2]) | V_HPZ3(pgsz_order[3])); } static int toe_capability(struct vi_info *vi, int enable) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; ASSERT_SYNCHRONIZED_OP(sc); if (!is_offload(sc)) return (ENODEV); if (enable) { if ((vi->ifp->if_capenable & IFCAP_TOE) != 0) { /* TOE is already enabled. 
*/ return (0); } /* * We need the port's queues around so that we're able to send * and receive CPLs to/from the TOE even if the ifnet for this * port has never been UP'd administratively. */ if (!(vi->flags & VI_INIT_DONE)) { rc = vi_full_init(vi); if (rc) return (rc); } if (!(pi->vi[0].flags & VI_INIT_DONE)) { rc = vi_full_init(&pi->vi[0]); if (rc) return (rc); } if (isset(&sc->offload_map, pi->port_id)) { /* TOE is enabled on another VI of this port. */ pi->uld_vis++; return (0); } if (!uld_active(sc, ULD_TOM)) { rc = t4_activate_uld(sc, ULD_TOM); if (rc == EAGAIN) { log(LOG_WARNING, "You must kldload t4_tom.ko before trying " "to enable TOE on a cxgbe interface.\n"); } if (rc != 0) return (rc); KASSERT(sc->tom_softc != NULL, ("%s: TOM activated but softc NULL", __func__)); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM activated but flag not set", __func__)); } /* Activate iWARP and iSCSI too, if the modules are loaded. */ if (!uld_active(sc, ULD_IWARP)) (void) t4_activate_uld(sc, ULD_IWARP); if (!uld_active(sc, ULD_ISCSI)) (void) t4_activate_uld(sc, ULD_ISCSI); pi->uld_vis++; setbit(&sc->offload_map, pi->port_id); } else { pi->uld_vis--; if (!isset(&sc->offload_map, pi->port_id) || pi->uld_vis > 0) return (0); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); } return (0); } /* * Add an upper layer driver to the global list. 
*/ int t4_register_uld(struct uld_info *ui) { int rc = 0; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u->uld_id == ui->uld_id) { rc = EEXIST; goto done; } } SLIST_INSERT_HEAD(&t4_uld_list, ui, link); ui->refcount = 0; done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_unregister_uld(struct uld_info *ui) { int rc = EINVAL; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u == ui) { if (ui->refcount > 0) { rc = EBUSY; goto done; } SLIST_REMOVE(&t4_uld_list, ui, uld_info, link); rc = 0; goto done; } } done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_activate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = EAGAIN; /* kldoad the module with this ULD and try again. */ sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { if (!(sc->flags & FULL_INIT_DONE)) { rc = adapter_full_init(sc); if (rc != 0) break; } rc = ui->activate(sc); if (rc == 0) { setbit(&sc->active_ulds, id); ui->refcount++; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } int t4_deactivate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = ENXIO; sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { rc = ui->deactivate(sc); if (rc == 0) { clrbit(&sc->active_ulds, id); ui->refcount--; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } int uld_active(struct adapter *sc, int uld_id) { MPASS(uld_id >= 0 && uld_id <= ULD_MAX); return (isset(&sc->active_ulds, uld_id)); } #endif /* * Come up with reasonable defaults for some of the tunables, provided they're * not set by the user (in which case we'll use the values as is). 
*/ static void tweak_tunables(void) { int nc = mp_ncpus; /* our snapshot of the number of CPUs */ if (t4_ntxq10g < 1) { #ifdef RSS t4_ntxq10g = rss_getnumbuckets(); #else t4_ntxq10g = min(nc, NTXQ_10G); #endif } if (t4_ntxq1g < 1) { #ifdef RSS /* XXX: way too many for 1GbE? */ t4_ntxq1g = rss_getnumbuckets(); #else t4_ntxq1g = min(nc, NTXQ_1G); #endif } if (t4_ntxq_vi < 1) t4_ntxq_vi = min(nc, NTXQ_VI); if (t4_nrxq10g < 1) { #ifdef RSS t4_nrxq10g = rss_getnumbuckets(); #else t4_nrxq10g = min(nc, NRXQ_10G); #endif } if (t4_nrxq1g < 1) { #ifdef RSS /* XXX: way too many for 1GbE? */ t4_nrxq1g = rss_getnumbuckets(); #else t4_nrxq1g = min(nc, NRXQ_1G); #endif } if (t4_nrxq_vi < 1) t4_nrxq_vi = min(nc, NRXQ_VI); #ifdef TCP_OFFLOAD if (t4_nofldtxq10g < 1) t4_nofldtxq10g = min(nc, NOFLDTXQ_10G); if (t4_nofldtxq1g < 1) t4_nofldtxq1g = min(nc, NOFLDTXQ_1G); if (t4_nofldtxq_vi < 1) t4_nofldtxq_vi = min(nc, NOFLDTXQ_VI); if (t4_nofldrxq10g < 1) t4_nofldrxq10g = min(nc, NOFLDRXQ_10G); if (t4_nofldrxq1g < 1) t4_nofldrxq1g = min(nc, NOFLDRXQ_1G); if (t4_nofldrxq_vi < 1) t4_nofldrxq_vi = min(nc, NOFLDRXQ_VI); if (t4_toecaps_allowed == -1) t4_toecaps_allowed = FW_CAPS_CONFIG_TOE; if (t4_rdmacaps_allowed == -1) { t4_rdmacaps_allowed = FW_CAPS_CONFIG_RDMA_RDDP | FW_CAPS_CONFIG_RDMA_RDMAC; } if (t4_iscsicaps_allowed == -1) { t4_iscsicaps_allowed = FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU | FW_CAPS_CONFIG_ISCSI_TARGET_PDU | FW_CAPS_CONFIG_ISCSI_T10DIF; } #else if (t4_toecaps_allowed == -1) t4_toecaps_allowed = 0; if (t4_rdmacaps_allowed == -1) t4_rdmacaps_allowed = 0; if (t4_iscsicaps_allowed == -1) t4_iscsicaps_allowed = 0; #endif #ifdef DEV_NETMAP if (t4_nnmtxq_vi < 1) t4_nnmtxq_vi = min(nc, NNMTXQ_VI); if (t4_nnmrxq_vi < 1) t4_nnmrxq_vi = min(nc, NNMRXQ_VI); #endif if (t4_tmr_idx_10g < 0 || t4_tmr_idx_10g >= SGE_NTIMERS) t4_tmr_idx_10g = TMR_IDX_10G; if (t4_pktc_idx_10g < -1 || t4_pktc_idx_10g >= SGE_NCOUNTERS) t4_pktc_idx_10g = PKTC_IDX_10G; if (t4_tmr_idx_1g < 0 || t4_tmr_idx_1g >= 
SGE_NTIMERS) t4_tmr_idx_1g = TMR_IDX_1G; if (t4_pktc_idx_1g < -1 || t4_pktc_idx_1g >= SGE_NCOUNTERS) t4_pktc_idx_1g = PKTC_IDX_1G; if (t4_qsize_txq < 128) t4_qsize_txq = 128; if (t4_qsize_rxq < 128) t4_qsize_rxq = 128; while (t4_qsize_rxq & 7) t4_qsize_rxq++; t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX; } #ifdef DDB static void t4_dump_tcb(struct adapter *sc, int tid) { uint32_t base, i, j, off, pf, reg, save, tcb_addr, win_pos; reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2); save = t4_read_reg(sc, reg); base = sc->memwin[2].mw_base; /* Dump TCB for the tid */ tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE); tcb_addr += tid * TCB_SIZE; if (is_t4(sc)) { pf = 0; win_pos = tcb_addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); win_pos = tcb_addr & ~0x7f; /* start must be 128B aligned */ } t4_write_reg(sc, reg, win_pos | pf); t4_read_reg(sc, reg); off = tcb_addr - win_pos; for (i = 0; i < 4; i++) { uint32_t buf[8]; for (j = 0; j < 8; j++, off += 4) buf[j] = htonl(t4_read_reg(sc, base + off)); db_printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); } t4_write_reg(sc, reg, save); t4_read_reg(sc, reg); } static void t4_dump_devlog(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e e; int i, first, j, m, nentries, rc; uint64_t ftstamp = UINT64_MAX; if (dparams->start == 0) { db_printf("devlog params not valid\n"); return; } nentries = dparams->size / sizeof(struct fw_devlog_e); m = fwmtype_to_hwmtype(dparams->memtype); /* Find the first entry. 
*/ first = -1; for (i = 0; i < nentries && !db_pager_quit; i++) { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) break; if (e.timestamp == 0) break; e.timestamp = be64toh(e.timestamp); if (e.timestamp < ftstamp) { ftstamp = e.timestamp; first = i; } } if (first == -1) return; i = first; do { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) return; if (e.timestamp == 0) return; e.timestamp = be64toh(e.timestamp); e.seqno = be32toh(e.seqno); for (j = 0; j < 8; j++) e.params[j] = be32toh(e.params[j]); db_printf("%10d %15ju %8s %8s ", e.seqno, e.timestamp, (e.level < nitems(devlog_level_strings) ? devlog_level_strings[e.level] : "UNKNOWN"), (e.facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e.facility] : "UNKNOWN")); db_printf(e.fmt, e.params[0], e.params[1], e.params[2], e.params[3], e.params[4], e.params[5], e.params[6], e.params[7]); if (++i == nentries) i = 0; } while (i != first && !db_pager_quit); } static struct command_table db_t4_table = LIST_HEAD_INITIALIZER(db_t4_table); _DB_SET(_show, t4, NULL, db_show_table, 0, &db_t4_table); DB_FUNC(devlog, db_show_devlog, db_t4_table, CS_OWN, NULL) { device_t dev; int t; bool valid; valid = false; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); valid = true; } db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 devlog \n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } t4_dump_devlog(device_get_softc(dev)); } DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL) { device_t dev; int radix, tid, t; bool valid; valid = false; radix = db_radix; db_radix = 10; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); t = db_read_token(); if (t == tNUMBER) { tid = db_tok_number; valid = true; } } db_radix = radix; db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 tcb \n"); return; } if (dev == NULL) { 
db_printf("device not found\n"); return; } if (tid < 0) { db_printf("invalid tid\n"); return; } t4_dump_tcb(device_get_softc(dev), tid); } #endif static struct sx mlu; /* mod load unload */ SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload"); static int mod_event(module_t mod, int cmd, void *arg) { int rc = 0; static int loaded = 0; switch (cmd) { case MOD_LOAD: sx_xlock(&mlu); if (loaded++ == 0) { t4_sge_modload(); t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl); t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl); t4_register_cpl_handler(CPL_TRACE_PKT, t4_trace_pkt); t4_register_cpl_handler(CPL_T5_TRACE_PKT, t5_trace_pkt); sx_init(&t4_list_lock, "T4/T5 adapters"); SLIST_INIT(&t4_list); #ifdef TCP_OFFLOAD sx_init(&t4_uld_list_lock, "T4/T5 ULDs"); SLIST_INIT(&t4_uld_list); #endif t4_tracer_modload(); tweak_tunables(); } sx_xunlock(&mlu); break; case MOD_UNLOAD: sx_xlock(&mlu); if (--loaded == 0) { int tries; sx_slock(&t4_list_lock); if (!SLIST_EMPTY(&t4_list)) { rc = EBUSY; sx_sunlock(&t4_list_lock); goto done_unload; } #ifdef TCP_OFFLOAD sx_slock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; sx_sunlock(&t4_uld_list_lock); sx_sunlock(&t4_list_lock); goto done_unload; } #endif tries = 0; while (tries++ < 5 && t4_sge_extfree_refs() != 0) { uprintf("%ju clusters with custom free routine " "still is use.\n", t4_sge_extfree_refs()); pause("t4unload", 2 * hz); } #ifdef TCP_OFFLOAD sx_sunlock(&t4_uld_list_lock); #endif sx_sunlock(&t4_list_lock); if (t4_sge_extfree_refs() == 0) { t4_tracer_modunload(); #ifdef TCP_OFFLOAD sx_destroy(&t4_uld_list_lock); #endif sx_destroy(&t4_list_lock); t4_sge_modunload(); loaded = 0; } else { rc = EBUSY; loaded++; /* undo earlier decrement */ } } done_unload: sx_xunlock(&mlu); break; } return (rc); } static devclass_t t4_devclass, t5_devclass; static devclass_t cxgbe_devclass, cxl_devclass; static devclass_t vcxgbe_devclass, vcxl_devclass; DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0); 
MODULE_VERSION(t4nex, 1); MODULE_DEPEND(t4nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t4nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0); MODULE_VERSION(t5nex, 1); MODULE_DEPEND(t5nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t5nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0); MODULE_VERSION(cxgbe, 1); DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0); MODULE_VERSION(cxl, 1); DRIVER_MODULE(vcxgbe, cxgbe, vcxgbe_driver, vcxgbe_devclass, 0, 0); MODULE_VERSION(vcxgbe, 1); DRIVER_MODULE(vcxl, cxl, vcxl_driver, vcxl_devclass, 0, 0); MODULE_VERSION(vcxl, 1); Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_netmap.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_netmap.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_netmap.c (revision 303654) @@ -1,1011 +1,1011 @@ /*- * Copyright (c) 2014 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #ifdef DEV_NETMAP #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" extern int fl_pad; /* XXXNM */ SYSCTL_NODE(_hw, OID_AUTO, cxgbe, CTLFLAG_RD, 0, "cxgbe netmap parameters"); /* * 0 = normal netmap rx * 1 = black hole * 2 = supermassive black hole (buffer packing enabled) */ int black_hole = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_black_hole, CTLFLAG_RDTUN, &black_hole, 0, "Sink incoming packets."); int rx_ndesc = 256; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_rx_ndesc, CTLFLAG_RWTUN, &rx_ndesc, 0, "# of rx descriptors after which the hw cidx is updated."); int holdoff_tmr_idx = 2; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_holdoff_tmr_idx, CTLFLAG_RWTUN, &holdoff_tmr_idx, 0, "Holdoff timer index for netmap rx queues."); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. 
*/ static int nm_cong_drop = 1; TUNABLE_INT("hw.cxgbe.nm_cong_drop", &nm_cong_drop); static int alloc_nm_rxq_hwq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int cong) { int rc, cntxt_id, i; __be32 v; struct adapter *sc = vi->pi->adapter; struct sge_params *sp = &sc->params.sge; struct netmap_adapter *na = NA(vi->ifp); struct fw_iq_cmd c; MPASS(na != NULL); MPASS(nm_rxq->iq_desc != NULL); MPASS(nm_rxq->fl_desc != NULL); bzero(nm_rxq->iq_desc, vi->qsize_rxq * IQ_ESIZE); bzero(nm_rxq->fl_desc, na->num_rx_desc * EQ_ESIZE + sp->spg_len); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | V_FW_IQ_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | FW_LEN16(c)); if (vi->flags & INTR_RXQ) { KASSERT(nm_rxq->intr_idx < sc->intr_count, ("%s: invalid direct intr_idx %d", __func__, nm_rxq->intr_idx)); v = V_FW_IQ_CMD_IQANDSTINDEX(nm_rxq->intr_idx); } else { CXGBE_UNIMPLEMENTED(__func__); /* XXXNM: needs review */ v = V_FW_IQ_CMD_IQANDSTINDEX(nm_rxq->intr_idx) | F_FW_IQ_CMD_IQANDST; } c.type_to_iqandstindex = htobe32(v | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(vi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(vi->pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(0) | V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(vi->qsize_rxq); c.iqaddr = htobe64(nm_rxq->iq_ba); if (cong >= 0) { c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN | V_FW_IQ_CMD_FL0CNGCHMAP(cong) | F_FW_IQ_CMD_FL0CONGCIF | F_FW_IQ_CMD_FL0CONGEN); } c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | (black_hole == 2 ? 
F_FW_IQ_CMD_FL0PACKEN : 0)); c.fl0dcaen_to_fl0cidxfthresh = htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_128B) | V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B)); c.fl0size = htobe16(na->num_rx_desc / 8 + sp->spg_len / EQ_ESIZE); c.fl0addr = htobe64(nm_rxq->fl_ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create netmap ingress queue: %d\n", rc); return (rc); } nm_rxq->iq_cidx = 0; MPASS(nm_rxq->iq_sidx == vi->qsize_rxq - sp->spg_len / IQ_ESIZE); nm_rxq->iq_gen = F_RSPD_GEN; nm_rxq->iq_cntxt_id = be16toh(c.iqid); nm_rxq->iq_abs_id = be16toh(c.physiqid); cntxt_id = nm_rxq->iq_cntxt_id - sc->sge.iq_start; if (cntxt_id >= sc->sge.niq) { panic ("%s: nm_rxq->iq_cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.niq - 1); } sc->sge.iqmap[cntxt_id] = (void *)nm_rxq; nm_rxq->fl_cntxt_id = be16toh(c.fl0id); nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; MPASS(nm_rxq->fl_sidx == na->num_rx_desc); cntxt_id = nm_rxq->fl_cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) { panic("%s: nm_rxq->fl_cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); } sc->sge.eqmap[cntxt_id] = (void *)nm_rxq; nm_rxq->fl_db_val = V_QID(nm_rxq->fl_cntxt_id) | sc->chip_params->sge_fl_db; if (is_t5(sc) && cong >= 0) { uint32_t param, val; param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(nm_rxq->iq_cntxt_id); param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(nm_rxq->iq_cntxt_id); if (cong == 0) val = 1 << 19; else { val = 2 << 19; for (i = 0; i < 4; i++) { if (cong & (1 << i)) val |= 1 << (i << 2); } } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* report error but carry on */ device_printf(sc->dev, "failed to set congestion manager context for " "ingress queue %d: %d\n", nm_rxq->iq_cntxt_id, rc); } } - t4_write_reg(sc, 
MYPF_REG(A_SGE_PF_GTS), + t4_write_reg(sc, sc->sge_gts_reg, V_INGRESSQID(nm_rxq->iq_cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(holdoff_tmr_idx))); return (rc); } static int free_nm_rxq_hwq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq) { struct adapter *sc = vi->pi->adapter; int rc; rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, 0xffff); if (rc != 0) device_printf(sc->dev, "%s: failed for iq %d, fl %d: %d\n", __func__, nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, rc); return (rc); } static int alloc_nm_txq_hwq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { int rc, cntxt_id; size_t len; struct adapter *sc = vi->pi->adapter; struct netmap_adapter *na = NA(vi->ifp); struct fw_eq_eth_cmd c; MPASS(na != NULL); MPASS(nm_txq->desc != NULL); len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len; bzero(nm_txq->desc, len); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid)); c.fetchszm_to_iqid = htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(vi->pi->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(sc->sge.nm_rxq[nm_txq->iqidx].iq_cntxt_id)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_EQSIZE(len / EQ_ESIZE)); c.eqaddr = htobe64(nm_txq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create netmap egress queue: %d\n", rc); return (rc); } nm_txq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = nm_txq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: 
nm_txq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = (void *)nm_txq; nm_txq->pidx = nm_txq->cidx = 0; MPASS(nm_txq->sidx == na->num_tx_desc); nm_txq->equiqidx = nm_txq->equeqidx = nm_txq->dbidx = 0; nm_txq->doorbells = sc->doorbells; if (isset(&nm_txq->doorbells, DOORBELL_UDB) || isset(&nm_txq->doorbells, DOORBELL_UDBWC) || isset(&nm_txq->doorbells, DOORBELL_WCWR)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (nm_txq->cntxt_id >> s_qpp) << PAGE_SHIFT; nm_txq->udb_qid = nm_txq->cntxt_id & mask; if (nm_txq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&nm_txq->doorbells, DOORBELL_WCWR); else { udb += nm_txq->udb_qid << UDBS_SEG_SHIFT; nm_txq->udb_qid = 0; } nm_txq->udb = (volatile void *)udb; } return (rc); } static int free_nm_txq_hwq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { struct adapter *sc = vi->pi->adapter; int rc; rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, nm_txq->cntxt_id); if (rc != 0) device_printf(sc->dev, "%s: failed for eq %d: %d\n", __func__, nm_txq->cntxt_id, rc); return (rc); } static int cxgbe_netmap_on(struct adapter *sc, struct vi_info *vi, struct ifnet *ifp, struct netmap_adapter *na) { struct netmap_slot *slot; struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; int rc, i, j, hwidx; struct hw_buf_info *hwb; ASSERT_SYNCHRONIZED_OP(sc); if ((vi->flags & VI_INIT_DONE) == 0 || (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return (EAGAIN); hwb = &sc->sge.hw_buf_info[0]; for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { if (hwb->size == NETMAP_BUF_SIZE(na)) break; } if (i >= SGE_FLBUF_SIZES) { if_printf(ifp, "no hwidx for netmap buffer size %d.\n", NETMAP_BUF_SIZE(na)); return (ENXIO); } hwidx = i; /* Must set caps before calling netmap_reset */ nm_set_native_flags(na); for_each_nm_rxq(vi, i, nm_rxq) { struct irq *irq = &sc->irq[vi->first_intr + i]; alloc_nm_rxq_hwq(vi, nm_rxq, 
tnl_cong(vi->pi, nm_cong_drop)); nm_rxq->fl_hwidx = hwidx; slot = netmap_reset(na, NR_RX, i, 0); MPASS(slot != NULL); /* XXXNM: error check, not assert */ /* We deal with 8 bufs at a time */ MPASS((na->num_rx_desc & 7) == 0); MPASS(na->num_rx_desc == nm_rxq->fl_sidx); for (j = 0; j < nm_rxq->fl_sidx; j++) { uint64_t ba; PNMB(na, &slot[j], &ba); MPASS(ba != 0); nm_rxq->fl_desc[j] = htobe64(ba | hwidx); } j = nm_rxq->fl_pidx = nm_rxq->fl_sidx - 8; MPASS((j & 7) == 0); j /= 8; /* driver pidx to hardware pidx */ wmb(); - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(j)); atomic_cmpset_int(&irq->nm_state, NM_OFF, NM_ON); } for_each_nm_txq(vi, i, nm_txq) { alloc_nm_txq_hwq(vi, nm_txq); slot = netmap_reset(na, NR_TX, i, 0); MPASS(slot != NULL); /* XXXNM: error check, not assert */ } if (vi->nm_rss == NULL) { vi->nm_rss = malloc(vi->rss_size * sizeof(uint16_t), M_CXGBE, M_ZERO | M_WAITOK); } for (i = 0; i < vi->rss_size;) { for_each_nm_rxq(vi, j, nm_rxq) { vi->nm_rss[i++] = nm_rxq->iq_abs_id; if (i == vi->rss_size) break; } } rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, vi->nm_rss, vi->rss_size); if (rc != 0) if_printf(ifp, "netmap rss_config failed: %d\n", rc); return (rc); } static int cxgbe_netmap_off(struct adapter *sc, struct vi_info *vi, struct ifnet *ifp, struct netmap_adapter *na) { int rc, i; struct sge_nm_txq *nm_txq; struct sge_nm_rxq *nm_rxq; ASSERT_SYNCHRONIZED_OP(sc); if ((vi->flags & VI_INIT_DONE) == 0) return (0); rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, vi->rss, vi->rss_size); if (rc != 0) if_printf(ifp, "failed to restore RSS config: %d\n", rc); nm_clear_native_flags(na); for_each_nm_txq(vi, i, nm_txq) { struct sge_qstat *spg = (void *)&nm_txq->desc[nm_txq->sidx]; /* Wait for hw pidx to catch up ... */ while (be16toh(nm_txq->pidx) != spg->pidx) pause("nmpidx", 1); /* ... and then for the cidx. 
*/ while (spg->pidx != spg->cidx) pause("nmcidx", 1); free_nm_txq_hwq(vi, nm_txq); } for_each_nm_rxq(vi, i, nm_rxq) { struct irq *irq = &sc->irq[vi->first_intr + i]; while (!atomic_cmpset_int(&irq->nm_state, NM_ON, NM_OFF)) pause("nmst", 1); free_nm_rxq_hwq(vi, nm_rxq); } return (rc); } /* * nm_register callback (installed by cxgbe_nm_attach). Runs inside a * synchronized op: calls cxgbe_netmap_on() when 'on' is nonzero and * cxgbe_netmap_off() otherwise, and returns that routine's result. Returns * early with the error if begin_synchronized_op() fails. */ static int cxgbe_netmap_reg(struct netmap_adapter *na, int on) { struct ifnet *ifp = na->ifp; struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; int rc; rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4nmreg"); if (rc != 0) return (rc); if (on) rc = cxgbe_netmap_on(sc, vi, ifp, na); else rc = cxgbe_netmap_off(sc, vi, ifp, na); end_synchronized_op(sc, 0); return (rc); } /* How many packets can a single type1 WR carry in n descriptors */ static inline int ndesc_to_npkt(const int n) { MPASS(n > 0 && n <= SGE_MAX_WR_NDESC); return (n * 2 - 1); } #define MAX_NPKT_IN_TYPE1_WR (ndesc_to_npkt(SGE_MAX_WR_NDESC)) /* Space (in descriptors) needed for a type1 WR that carries n packets */ static inline int npkt_to_ndesc(const int n) { MPASS(n > 0 && n <= MAX_NPKT_IN_TYPE1_WR); return ((n + 2) / 2); } /* Space (in 16B units) needed for a type1 WR that carries n packets */ static inline int npkt_to_len16(const int n) { MPASS(n > 0 && n <= MAX_NPKT_IN_TYPE1_WR); return (n * 2 + 1); } #define NMIDXDIFF(q, idx) IDXDIFF((q)->pidx, (q)->idx, (q)->sidx) /* * Ring the doorbell for the n = NMIDXDIFF(nm_txq, dbidx) descriptors between * dbidx and pidx (the two must differ). The doorbell type is the first one * that ffs() finds in nm_txq->doorbells, with DOORBELL_WCWR masked out when * n > 1. dbidx is set to pidx before returning. */ static void ring_nm_txq_db(struct adapter *sc, struct sge_nm_txq *nm_txq) { int n; u_int db = nm_txq->doorbells; MPASS(nm_txq->pidx != nm_txq->dbidx); n = NMIDXDIFF(nm_txq, dbidx); if (n > 1) clrbit(&db, DOORBELL_WCWR); wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: *nm_txq->udb = htole32(V_QID(nm_txq->udb_qid) | V_PIDX(n)); break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; /* * Queues whose 128B doorbell segment fits in the page do not * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. 
*/ KASSERT(nm_txq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for nm_txq %p", __func__, nm_txq->doorbells, n, nm_txq->pidx, nm_txq)); dst = (volatile void *)((uintptr_t)nm_txq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); src = (void *)&nm_txq->desc[nm_txq->dbidx]; while (src != (void *)&nm_txq->desc[nm_txq->dbidx + 1]) *dst++ = *src++; wmb(); break; } case DOORBELL_UDBWC: *nm_txq->udb = htole32(V_QID(nm_txq->udb_qid) | V_PIDX(n)); wmb(); break; case DOORBELL_KDB: - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + t4_write_reg(sc, sc->sge_kdoorbell_reg, V_QID(nm_txq->cntxt_id) | V_PIDX(n)); break; } nm_txq->dbidx = nm_txq->pidx; } /* * When this is 0, cxgbe_nm_tx() sets F_FW_WR_EQUEQ | F_FW_WR_EQUIQ on the * last work request it writes once no packets remain (npkt == 0 and * npkt_remaining == 0). */ int lazy_tx_credit_flush = 1; /* * Write work requests to send 'npkt' frames and ring the doorbell to send them * on their way. No need to check for wraparound. */ static void cxgbe_nm_tx(struct adapter *sc, struct sge_nm_txq *nm_txq, struct netmap_kring *kring, int npkt, int npkt_remaining, int txcsum) { struct netmap_ring *ring = kring->ring; struct netmap_slot *slot; const u_int lim = kring->nkr_num_slots - 1; struct fw_eth_tx_pkts_wr *wr = (void *)&nm_txq->desc[nm_txq->pidx]; uint16_t len; uint64_t ba; struct cpl_tx_pkt_core *cpl; struct ulptx_sgl *usgl; int i, n; while (npkt) { n = min(npkt, MAX_NPKT_IN_TYPE1_WR); len = 0; wr = (void *)&nm_txq->desc[nm_txq->pidx]; wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(npkt_to_len16(n))); wr->npkt = n; wr->r3 = 0; wr->type = 1; cpl = (void *)(wr + 1); for (i = 0; i < n; i++) { slot = &ring->slot[kring->nr_hwcur]; PNMB(kring->na, slot, &ba); MPASS(ba != 0); cpl->ctrl0 = nm_txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(slot->len); /* * netmap(4) says "netmap does not use features such as * checksum offloading, TCP segmentation offloading, * encryption, VLAN encapsulation/decapsulation, etc." * * So the ncxl interfaces have tx hardware checksumming * disabled by default. 
But you can override netmap by * enabling IFCAP_TXCSUM on the interface manually. */ cpl->ctrl1 = txcsum ? 0 : htobe64(F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS); usgl = (void *)(cpl + 1); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(1)); usgl->len0 = htobe32(slot->len); usgl->addr0 = htobe64(ba); slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); cpl = (void *)(usgl + 1); MPASS(slot->len + len <= UINT16_MAX); len += slot->len; kring->nr_hwcur = nm_next(kring->nr_hwcur, lim); } wr->plen = htobe16(len); npkt -= n; nm_txq->pidx += npkt_to_ndesc(n); MPASS(nm_txq->pidx <= nm_txq->sidx); if (__predict_false(nm_txq->pidx == nm_txq->sidx)) { /* * This routine doesn't know how to write WRs that wrap * around. Make sure it wasn't asked to. */ MPASS(npkt == 0); nm_txq->pidx = 0; } if (npkt == 0 && npkt_remaining == 0) { /* All done. */ if (lazy_tx_credit_flush == 0) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); nm_txq->equeqidx = nm_txq->pidx; nm_txq->equiqidx = nm_txq->pidx; } ring_nm_txq_db(sc, nm_txq); return; } if (NMIDXDIFF(nm_txq, equiqidx) >= nm_txq->sidx / 2) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ | F_FW_WR_EQUIQ); nm_txq->equeqidx = nm_txq->pidx; nm_txq->equiqidx = nm_txq->pidx; } else if (NMIDXDIFF(nm_txq, equeqidx) >= 64) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); nm_txq->equeqidx = nm_txq->pidx; } if (NMIDXDIFF(nm_txq, dbidx) >= 2 * SGE_MAX_WR_NDESC) ring_nm_txq_db(sc, nm_txq); } /* Will get called again. 
*/ MPASS(npkt_remaining); } /* How many contiguous free descriptors starting at pidx */ static inline int contiguous_ndesc_available(struct sge_nm_txq *nm_txq) { if (nm_txq->cidx > nm_txq->pidx) return (nm_txq->cidx - nm_txq->pidx - 1); else if (nm_txq->cidx > 0) return (nm_txq->sidx - nm_txq->pidx); else return (nm_txq->sidx - nm_txq->pidx - 1); } /* * Walk the work requests between nm_txq->cidx and a snapshot of the cidx read * from the sge_qstat located at desc[sidx], advancing nm_txq->cidx past each * one. Returns the sum of npkt over the work requests walked. */ static int reclaim_nm_tx_desc(struct sge_nm_txq *nm_txq) { struct sge_qstat *spg = (void *)&nm_txq->desc[nm_txq->sidx]; uint16_t hw_cidx = spg->cidx; /* snapshot */ struct fw_eth_tx_pkts_wr *wr; int n = 0; hw_cidx = be16toh(hw_cidx); while (nm_txq->cidx != hw_cidx) { wr = (void *)&nm_txq->desc[nm_txq->cidx]; MPASS(wr->op_pkd == htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR))); MPASS(wr->type == 1); MPASS(wr->npkt > 0 && wr->npkt <= MAX_NPKT_IN_TYPE1_WR); n += wr->npkt; nm_txq->cidx += npkt_to_ndesc(wr->npkt); /* * We never sent a WR that wrapped around so the credits coming * back, WR by WR, should never cause the cidx to wrap around * either. */ MPASS(nm_txq->cidx <= nm_txq->sidx); if (__predict_false(nm_txq->cidx == nm_txq->sidx)) nm_txq->cidx = 0; } return (n); } /* * nm_txsync callback (installed by cxgbe_nm_attach). Sends the frames between * kring->nr_hwcur and kring->rhead via cxgbe_nm_tx(). Then, if anything was * reclaimed, NAF_FORCE_RECLAIM is set, or nm_kr_txempty() is true, it advances * kring->nr_hwtail by the number of packets reclaimed. Always returns 0. */ static int cxgbe_netmap_txsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct ifnet *ifp = na->ifp; struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; struct sge_nm_txq *nm_txq = &sc->sge.nm_txq[vi->first_nm_txq + kring->ring_id]; const u_int head = kring->rhead; u_int reclaimed = 0; int n, d, npkt_remaining, ndesc_remaining, txcsum; /* * Tx was at kring->nr_hwcur last time around and now we need to advance * to kring->rhead. Note that the driver's pidx moves independent of * netmap's kring->nr_hwcur (pidx counts descriptors and the relation * between descriptors and frames isn't 1:1). */ npkt_remaining = head >= kring->nr_hwcur ? 
head - kring->nr_hwcur : kring->nkr_num_slots - kring->nr_hwcur + head; txcsum = ifp->if_capenable & (IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6); while (npkt_remaining) { reclaimed += reclaim_nm_tx_desc(nm_txq); ndesc_remaining = contiguous_ndesc_available(nm_txq); /* Can't run out of descriptors with packets still remaining */ MPASS(ndesc_remaining > 0); /* # of desc needed to tx all remaining packets */ d = (npkt_remaining / MAX_NPKT_IN_TYPE1_WR) * SGE_MAX_WR_NDESC; if (npkt_remaining % MAX_NPKT_IN_TYPE1_WR) d += npkt_to_ndesc(npkt_remaining % MAX_NPKT_IN_TYPE1_WR); if (d <= ndesc_remaining) n = npkt_remaining; else { /* Can't send all, calculate how many can be sent */ n = (ndesc_remaining / SGE_MAX_WR_NDESC) * MAX_NPKT_IN_TYPE1_WR; if (ndesc_remaining % SGE_MAX_WR_NDESC) n += ndesc_to_npkt(ndesc_remaining % SGE_MAX_WR_NDESC); } /* Send n packets and update nm_txq->pidx and kring->nr_hwcur */ npkt_remaining -= n; cxgbe_nm_tx(sc, nm_txq, kring, n, npkt_remaining, txcsum); } MPASS(npkt_remaining == 0); MPASS(kring->nr_hwcur == head); MPASS(nm_txq->dbidx == nm_txq->pidx); /* * Second part: reclaim buffers for completed transmissions. */ if (reclaimed || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { reclaimed += reclaim_nm_tx_desc(nm_txq); kring->nr_hwtail += reclaimed; if (kring->nr_hwtail >= kring->nkr_num_slots) kring->nr_hwtail -= kring->nkr_num_slots; } return (0); } static int cxgbe_netmap_rxsync(struct netmap_kring *kring, int flags) { struct netmap_adapter *na = kring->na; struct netmap_ring *ring = kring->ring; struct ifnet *ifp = na->ifp; struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; struct sge_nm_rxq *nm_rxq = &sc->sge.nm_rxq[vi->first_nm_rxq + kring->ring_id]; u_int const head = kring->rhead; u_int n; int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; if (black_hole) return (0); /* No updates ever. 
*/ if (netmap_no_pendintr || force_update) { kring->nr_hwtail = atomic_load_acq_32(&nm_rxq->fl_cidx); kring->nr_kflags &= ~NKR_PENDINTR; } /* Userspace done with buffers from kring->nr_hwcur to head */ n = head >= kring->nr_hwcur ? head - kring->nr_hwcur : kring->nkr_num_slots - kring->nr_hwcur + head; n &= ~7U; if (n > 0) { u_int fl_pidx = nm_rxq->fl_pidx; struct netmap_slot *slot = &ring->slot[fl_pidx]; uint64_t ba; int i, dbinc = 0, hwidx = nm_rxq->fl_hwidx; /* * We always deal with 8 buffers at a time. We must have * stopped at an 8B boundary (fl_pidx) last time around and we * must have a multiple of 8B buffers to give to the freelist. */ MPASS((fl_pidx & 7) == 0); MPASS((n & 7) == 0); IDXINCR(kring->nr_hwcur, n, kring->nkr_num_slots); IDXINCR(nm_rxq->fl_pidx, n, nm_rxq->fl_sidx); while (n > 0) { for (i = 0; i < 8; i++, fl_pidx++, slot++) { PNMB(na, slot, &ba); MPASS(ba != 0); nm_rxq->fl_desc[fl_pidx] = htobe64(ba | hwidx); slot->flags &= ~NS_BUF_CHANGED; MPASS(fl_pidx <= nm_rxq->fl_sidx); } n -= 8; if (fl_pidx == nm_rxq->fl_sidx) { fl_pidx = 0; slot = &ring->slot[0]; } if (++dbinc == 8 && n >= 32) { wmb(); - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(dbinc)); dbinc = 0; } } MPASS(nm_rxq->fl_pidx == fl_pidx); if (dbinc > 0) { wmb(); - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(dbinc)); } } return (0); } void cxgbe_nm_attach(struct vi_info *vi) { struct port_info *pi; struct adapter *sc; struct netmap_adapter na; MPASS(vi->nnmrxq > 0); MPASS(vi->ifp != NULL); pi = vi->pi; sc = pi->adapter; bzero(&na, sizeof(na)); na.ifp = vi->ifp; na.na_flags = NAF_BDG_MAYSLEEP; /* Netmap doesn't know about the space reserved for the status page. */ na.num_tx_desc = vi->qsize_txq - sc->params.sge.spg_len / EQ_ESIZE; /* * The freelist's cidx/pidx drives netmap's rx cidx/pidx. 
So * num_rx_desc is based on the number of buffers that can be held in the * freelist, and not the number of entries in the iq. (These two are * not exactly the same due to the space taken up by the status page). */ na.num_rx_desc = rounddown(vi->qsize_rxq, 8); na.nm_txsync = cxgbe_netmap_txsync; na.nm_rxsync = cxgbe_netmap_rxsync; na.nm_register = cxgbe_netmap_reg; na.num_tx_rings = vi->nnmtxq; na.num_rx_rings = vi->nnmrxq; netmap_attach(&na); /* This adds IFCAP_NETMAP to if_capabilities */ } /* Counterpart of cxgbe_nm_attach: calls netmap_detach() on the VI's ifnet. */ void cxgbe_nm_detach(struct vi_info *vi) { MPASS(vi->nnmrxq > 0); MPASS(vi->ifp != NULL); netmap_detach(vi->ifp); } /* * Handle a CPL_FW4_MSG/CPL_FW6_MSG delivered on a netmap rx queue (called from * t4_nm_intr). Panics unless cpl->type is FW_TYPE_RSSCPL or FW6_TYPE_RSSCPL. * The message after the RSS header is expected to be a CPL_SGE_EGR_UPDATE; its * egress qid is looked up in sc->sge.eqmap and netmap_tx_irq() is called for * that nm_txq's ring (nm_txq->nid). */ static void handle_nm_fw6_msg(struct adapter *sc, struct ifnet *ifp, const struct cpl_fw6_msg *cpl) { const struct cpl_sge_egr_update *egr; uint32_t oq; struct sge_nm_txq *nm_txq; if (cpl->type != FW_TYPE_RSSCPL && cpl->type != FW6_TYPE_RSSCPL) panic("%s: FW_TYPE 0x%x on nm_rxq.", __func__, cpl->type); /* data[0] is RSS header */ egr = (const void *)&cpl->data[1]; oq = be32toh(egr->opcode_qid); MPASS(G_CPL_OPCODE(oq) == CPL_SGE_EGR_UPDATE); nm_txq = (void *)sc->sge.eqmap[G_EGR_QID(oq) - sc->sge.eq_start]; netmap_tx_irq(ifp, nm_txq->nid); } void t4_nm_intr(void *arg) { struct sge_nm_rxq *nm_rxq = arg; struct vi_info *vi = nm_rxq->vi; struct adapter *sc = vi->pi->adapter; struct ifnet *ifp = vi->ifp; struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring = &na->rx_rings[nm_rxq->nid]; struct netmap_ring *ring = kring->ring; struct iq_desc *d = &nm_rxq->iq_desc[nm_rxq->iq_cidx]; uint32_t lq; u_int n = 0, work = 0; uint8_t opcode; uint32_t fl_cidx = atomic_load_acq_32(&nm_rxq->fl_cidx); u_int fl_credits = fl_cidx & 7; while ((d->rsp.u.type_gen & F_RSPD_GEN) == nm_rxq->iq_gen) { rmb(); lq = be32toh(d->rsp.pldbuflen_qid); opcode = d->rss.opcode; switch (G_RSPD_TYPE(d->rsp.u.type_gen)) { case X_RSPD_TYPE_FLBUF: if (black_hole != 2) { /* No buffer packing so new buf every time */ MPASS(lq & F_RSPD_NEWBUF); } /* fall through */ case X_RSPD_TYPE_CPL: MPASS(opcode 
< NUM_CPL_CMDS); switch (opcode) { case CPL_FW4_MSG: case CPL_FW6_MSG: handle_nm_fw6_msg(sc, ifp, (const void *)&d->cpl[0]); break; case CPL_RX_PKT: ring->slot[fl_cidx].len = G_RSPD_LEN(lq) - sc->params.sge.fl_pktshift; ring->slot[fl_cidx].flags = kring->nkr_slot_flags; fl_cidx += (lq & F_RSPD_NEWBUF) ? 1 : 0; fl_credits += (lq & F_RSPD_NEWBUF) ? 1 : 0; if (__predict_false(fl_cidx == nm_rxq->fl_sidx)) fl_cidx = 0; break; default: panic("%s: unexpected opcode 0x%x on nm_rxq %p", __func__, opcode, nm_rxq); } break; case X_RSPD_TYPE_INTR: /* Not equipped to handle forwarded interrupts. */ panic("%s: netmap queue received interrupt for iq %u\n", __func__, lq); default: panic("%s: illegal response type %d on nm_rxq %p", __func__, G_RSPD_TYPE(d->rsp.u.type_gen), nm_rxq); } d++; if (__predict_false(++nm_rxq->iq_cidx == nm_rxq->iq_sidx)) { nm_rxq->iq_cidx = 0; d = &nm_rxq->iq_desc[0]; nm_rxq->iq_gen ^= F_RSPD_GEN; } if (__predict_false(++n == rx_ndesc)) { atomic_store_rel_32(&nm_rxq->fl_cidx, fl_cidx); if (black_hole && fl_credits >= 8) { fl_credits /= 8; IDXINCR(nm_rxq->fl_pidx, fl_credits * 8, nm_rxq->fl_sidx); - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(fl_credits)); fl_credits = fl_cidx & 7; } else if (!black_hole) { netmap_rx_irq(ifp, nm_rxq->nid, &work); MPASS(work != 0); } - t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), + t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(n) | V_INGRESSQID(nm_rxq->iq_cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); n = 0; } } atomic_store_rel_32(&nm_rxq->fl_cidx, fl_cidx); if (black_hole) { fl_credits /= 8; IDXINCR(nm_rxq->fl_pidx, fl_credits * 8, nm_rxq->fl_sidx); - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + t4_write_reg(sc, sc->sge_kdoorbell_reg, nm_rxq->fl_db_val | V_PIDX(fl_credits)); } else netmap_rx_irq(ifp, nm_rxq->nid, &work); - t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(n) | + t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(n) | 
V_INGRESSQID((u32)nm_rxq->iq_cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(holdoff_tmr_idx))); } #endif Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c (revision 303654) @@ -1,4960 +1,4960 @@ /*- * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_NETMAP #include #include #include #include #include #endif #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #ifdef T4_PKT_TIMESTAMP #define RX_COPY_THRESHOLD (MINCLSIZE - 8) #else #define RX_COPY_THRESHOLD MINCLSIZE #endif /* * Ethernet frames are DMA'd at this byte offset into the freelist buffer. * 0-7 are valid values. */ static int fl_pktshift = 2; TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift); /* * Pad ethernet payload up to this boundary. * -1: driver should figure out a good value. * 0: disable padding. * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value. */ int fl_pad = -1; TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad); /* * Status page length. * -1: driver should figure out a good value. * 64 or 128 are the only other valid values. */ static int spg_len = -1; TUNABLE_INT("hw.cxgbe.spg_len", &spg_len); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. */ static int cong_drop = 0; TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop); /* * Deliver multiple frames in the same free list buffer if they fit. * -1: let the driver decide whether to enable buffer packing or not. * 0: disable buffer packing. * 1: enable buffer packing. */ static int buffer_packing = -1; TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing); /* * Start next frame in a packed buffer at this boundary. * -1: driver should figure out a good value. 
* T4: driver will ignore this and use the same value as fl_pad above. * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. */ static int fl_pack = -1; TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack); /* * Allow the driver to create mbuf(s) in a cluster allocated for rx. * 0: never; always allocate mbufs from the zone_mbuf UMA zone. * 1: ok to create mbuf(s) within a cluster if there is room. */ static int allow_mbufs_in_cluster = 1; TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster); /* * Largest rx cluster size that the driver is allowed to allocate. */ static int largest_rx_cluster = MJUM16BYTES; TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster); /* * Size of cluster allocation that's most likely to succeed. The driver will * fall back to this size if it fails to allocate clusters larger than this. */ static int safest_rx_cluster = PAGE_SIZE; TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster); struct txpkts { u_int wr_type; /* type 0 or type 1 */ u_int npkt; /* # of packets in this work request */ u_int plen; /* total payload (sum of all packets) */ u_int len16; /* # of 16B pieces used by this work request */ }; /* A packet's SGL. 
This + m_pkthdr has all info needed for tx */ struct sgl { struct sglist sg; struct sglist_seg seg[TX_SGL_SEGS]; }; static int service_iq(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int); static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *); static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t, uint16_t, char *); static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, bus_addr_t *, void **); static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, void *); static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *, int, int); static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *); static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_fl *); static int alloc_fwq(struct adapter *); static int free_fwq(struct adapter *); static int alloc_mgmtq(struct adapter *); static int free_mgmtq(struct adapter *); static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, struct sysctl_oid *); static int free_rxq(struct vi_info *, struct sge_rxq *); #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *); #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int, struct sysctl_oid *); static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *); static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int, struct sysctl_oid *); static int free_nm_txq(struct vi_info *, struct sge_nm_txq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct vi_info *, struct 
sge_eq *); #ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *); static int free_eq(struct adapter *, struct sge_eq *); static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *, struct sysctl_oid *); static int free_wrq(struct adapter *, struct sge_wrq *); static int alloc_txq(struct vi_info *, struct sge_txq *, int, struct sysctl_oid *); static int free_txq(struct vi_info *, struct sge_txq *); static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); static inline void ring_fl_db(struct adapter *, struct sge_fl *); static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int alloc_fl_sdesc(struct sge_fl *); static void free_fl_sdesc(struct adapter *, struct sge_fl *); static void find_best_refill_source(struct adapter *, struct sge_fl *, int); static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static inline void get_pkt_gl(struct mbuf *, struct sglist *); static inline u_int txpkt_len16(u_int, u_int); static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int); static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int); static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int); static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int); static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); static inline uint16_t read_hw_cidx(struct sge_eq *); static inline u_int reclaimable_tx_desc(struct sge_eq *); static inline u_int 
total_available_tx_desc(struct sge_eq *); static u_int reclaim_tx_descs(struct sge_txq *, u_int); static void tx_reclaim(void *, int); static __be64 get_flit(struct sglist_seg *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *); static void wrq_tx_drain(void *, int); static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); static int sysctl_tc(SYSCTL_HANDLER_ARGS); static counter_u64_t extfree_refs; static counter_u64_t extfree_rels; an_handler_t t4_an_handler; fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES]; cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS]; static int an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl) { #ifdef INVARIANTS panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl); #else log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n", __func__, iq, ctrl); #endif return (EDOOFUS); } int t4_register_an_handler(an_handler_t h) { uintptr_t *loc, new; new = h ? (uintptr_t)h : (uintptr_t)an_not_handled; loc = (uintptr_t *) &t4_an_handler; atomic_store_rel_ptr(loc, new); return (0); } static int fw_msg_not_handled(struct adapter *sc, const __be64 *rpl) { const struct cpl_fw6_msg *cpl = __containerof(rpl, struct cpl_fw6_msg, data[0]); #ifdef INVARIANTS panic("%s: fw_msg type %d", __func__, cpl->type); #else log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type); #endif return (EDOOFUS); } int t4_register_fw_msg_handler(int type, fw_msg_handler_t h) { uintptr_t *loc, new; if (type >= nitems(t4_fw_msg_handler)) return (EINVAL); /* * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL * handler dispatch table. Reject any attempt to install a handler for * this subtype. 
*/ if (type == FW_TYPE_RSSCPL || type == FW6_TYPE_RSSCPL) return (EINVAL); new = h ? (uintptr_t)h : (uintptr_t)fw_msg_not_handled; loc = (uintptr_t *) &t4_fw_msg_handler[type]; atomic_store_rel_ptr(loc, new); return (0); } static int cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { #ifdef INVARIANTS panic("%s: opcode 0x%02x on iq %p with payload %p", __func__, rss->opcode, iq, m); #else log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n", __func__, rss->opcode, iq, m); m_freem(m); #endif return (EDOOFUS); } int t4_register_cpl_handler(int opcode, cpl_handler_t h) { uintptr_t *loc, new; if (opcode >= nitems(t4_cpl_handler)) return (EINVAL); new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled; loc = (uintptr_t *) &t4_cpl_handler[opcode]; atomic_store_rel_ptr(loc, new); return (0); } /* * Called on MOD_LOAD. Validates and calculates the SGE tunables. */ void t4_sge_modload(void) { int i; if (fl_pktshift < 0 || fl_pktshift > 7) { printf("Invalid hw.cxgbe.fl_pktshift value (%d)," " using 2 instead.\n", fl_pktshift); fl_pktshift = 2; } if (spg_len != 64 && spg_len != 128) { int len; #if defined(__i386__) || defined(__amd64__) len = cpu_clflush_line_size > 64 ? 
128 : 64; #else len = 64; #endif if (spg_len != -1) { printf("Invalid hw.cxgbe.spg_len value (%d)," " using %d instead.\n", spg_len, len); } spg_len = len; } if (cong_drop < -1 || cong_drop > 1) { printf("Invalid hw.cxgbe.cong_drop value (%d)," " using 0 instead.\n", cong_drop); cong_drop = 0; } extfree_refs = counter_u64_alloc(M_WAITOK); extfree_rels = counter_u64_alloc(M_WAITOK); counter_u64_zero(extfree_refs); counter_u64_zero(extfree_rels); t4_an_handler = an_not_handled; for (i = 0; i < nitems(t4_fw_msg_handler); i++) t4_fw_msg_handler[i] = fw_msg_not_handled; for (i = 0; i < nitems(t4_cpl_handler); i++) t4_cpl_handler[i] = cpl_not_handled; t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg); t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg); t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update); t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx); t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl); t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl); } /* Frees the extfree_refs and extfree_rels counters allocated by t4_sge_modload. */ void t4_sge_modunload(void) { counter_u64_free(extfree_refs); counter_u64_free(extfree_rels); } /* * Returns extfree_refs minus extfree_rels as fetched from the two counters * (rels is fetched first). */ uint64_t t4_sge_extfree_refs(void) { uint64_t refs, rels; rels = counter_u64_fetch(extfree_rels); refs = counter_u64_fetch(extfree_refs); return (refs - rels); } /* * Write the ingress pad boundary to A_SGE_CONTROL and, on chips other than T4, * the pack boundary to A_SGE_CONTROL2. Values come from the fl_pad and fl_pack * tunables; an out-of-range tunable is replaced by a computed default. */ static inline void setup_pad_and_pack_boundaries(struct adapter *sc) { uint32_t v, m; int pad, pack; pad = fl_pad; if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) { /* * If there is any chance that we might use buffer packing and * the chip is a T4, then pick 64 as the pad/pack boundary. Set * it to 32 in all other cases. */ pad = is_t4(sc) && buffer_packing ? 64 : 32; /* * For fl_pad = 0 we'll still write a reasonable value to the * register but all the freelists will opt out of padding. * We'll complain here only if the user tried to set it to a * value greater than 0 that was invalid. 
*/ if (fl_pad > 0) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value" " (%d), using %d instead.\n", fl_pad, pad); } } m = V_INGPADBOUNDARY(M_INGPADBOUNDARY); v = V_INGPADBOUNDARY(ilog2(pad) - 5); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); if (is_t4(sc)) { if (fl_pack != -1 && fl_pack != pad) { /* Complain but carry on. */ device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored," " using %d instead.\n", fl_pack, pad); } return; } pack = fl_pack; if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || !powerof2(fl_pack)) { pack = max(sc->params.pci.mps, CACHE_LINE_SIZE); MPASS(powerof2(pack)); if (pack < 16) pack = 16; if (pack == 32) pack = 64; if (pack > 4096) pack = 4096; if (fl_pack != -1) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value" " (%d), using %d instead.\n", fl_pack, pack); } } m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); if (pack == 16) v = V_INGPACKBOUNDARY(0); else v = V_INGPACKBOUNDARY(ilog2(pack) - 5); MPASS(!is_t4(sc)); /* T4 doesn't have SGE_CONTROL2 */ t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); } /* * adap->params.vpd.cclk must be set up before this is called. 
*/ void t4_tweak_chip_settings(struct adapter *sc) { int i; uint32_t v, m; int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200}; int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sge_flbuf_sizes[] = { MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, MJUMPAGESIZE - CL_METADATA_SIZE, MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE, #endif MJUM9BYTES, MJUM16BYTES, MCLBYTES - MSIZE - CL_METADATA_SIZE, MJUM9BYTES - CL_METADATA_SIZE, MJUM16BYTES - CL_METADATA_SIZE, }; KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); setup_pad_and_pack_boundaries(sc); v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES, ("%s: hw buffer size table too big", __func__)); for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) { t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i), sge_flbuf_sizes[i]); } v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]); t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v); KASSERT(intr_timer[0] <= timer_max, ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0], timer_max)); for (i = 1; i < nitems(intr_timer); i++) { KASSERT(intr_timer[i] >= intr_timer[i - 1], ("%s: timers not listed in increasing order (%d)", __func__, i)); while 
(intr_timer[i] > timer_max) { if (i == nitems(intr_timer) - 1) { intr_timer[i] = timer_max; break; } intr_timer[i] += intr_timer[i - 1]; intr_timer[i] /= 2; } } v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) | V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1])); t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v); v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) | V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3])); t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v); v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) | V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5])); t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v); /* 4K, 16K, 64K, 256K DDP "page sizes" */ v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v); m = v = F_TDDPTAGTCB; t4_set_reg_field(sc, A_ULP_RX_CTL, m, v); m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; t4_set_reg_field(sc, A_TP_PARA_REG5, m, v); } /* * SGE wants the buffer to be at least 64B and then a multiple of 16. If * padding is is use the buffer's start and end need to be aligned to the pad * boundary as well. We'll just make sure that the size is a multiple of the * boundary here, it is up to the buffer allocation code to make sure the start * of the buffer is aligned as well. */ static inline int hwsz_ok(struct adapter *sc, int hwsz) { int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1; return (hwsz >= 64 && (hwsz & mask) == 0); } /* * XXX: driver really should be able to deal with unexpected settings. 
*/ int t4_read_chip_settings(struct adapter *sc) { struct sge *s = &sc->sge; struct sge_params *sp = &sc->params.sge; int i, j, n, rc = 0; uint32_t m, v, r; uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sw_buf_sizes[] = { /* Sorted by size */ MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, #endif MJUM9BYTES, MJUM16BYTES }; struct sw_zone_info *swz, *safe_swz; struct hw_buf_info *hwb; t4_init_sge_params(sc); m = F_RXPKTCPLMODE; v = F_RXPKTCPLMODE; r = t4_read_reg(sc, A_SGE_CONTROL); if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); rc = EINVAL; } /* * If this changes then every single use of PAGE_SHIFT in the driver * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift. */ if (sp->page_shift != PAGE_SHIFT) { device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r); rc = EINVAL; } /* Filter out unusable hw buffer sizes entirely (mark with -2). */ hwb = &s->hw_buf_info[0]; for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) { r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i)); hwb->size = r; hwb->zidx = hwsz_ok(sc, r) ? -1 : -2; hwb->next = -1; } /* * Create a sorted list in decreasing order of hw buffer sizes (and so * increasing order of spare area) for each software zone. * * If padding is enabled then the start and end of the buffer must align * to the pad boundary; if packing is enabled then they must align with * the pack boundary as well. Allocations from the cluster zones are * aligned to min(size, 4K), so the buffer starts at that alignment and * ends at hwb->size alignment. If mbuf inlining is allowed the * starting alignment will be reduced to MSIZE and the driver will * exercise appropriate caution when deciding on the best buffer layout * to use. 
*/ n = 0; /* no usable buffer size to begin with */ swz = &s->sw_zone_info[0]; safe_swz = NULL; for (i = 0; i < SW_ZONE_SIZES; i++, swz++) { int8_t head = -1, tail = -1; swz->size = sw_buf_sizes[i]; swz->zone = m_getzone(swz->size); swz->type = m_gettype(swz->size); if (swz->size < PAGE_SIZE) { MPASS(powerof2(swz->size)); if (fl_pad && (swz->size % sp->pad_boundary != 0)) continue; } if (swz->size == safest_rx_cluster) safe_swz = swz; hwb = &s->hw_buf_info[0]; for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) { if (hwb->zidx != -1 || hwb->size > swz->size) continue; #ifdef INVARIANTS if (fl_pad) MPASS(hwb->size % sp->pad_boundary == 0); #endif hwb->zidx = i; if (head == -1) head = tail = j; else if (hwb->size < s->hw_buf_info[tail].size) { s->hw_buf_info[tail].next = j; tail = j; } else { int8_t *cur; struct hw_buf_info *t; for (cur = &head; *cur != -1; cur = &t->next) { t = &s->hw_buf_info[*cur]; if (hwb->size == t->size) { hwb->zidx = -2; break; } if (hwb->size > t->size) { hwb->next = *cur; *cur = j; break; } } } } swz->head_hwidx = head; swz->tail_hwidx = tail; if (tail != -1) { n++; if (swz->size - s->hw_buf_info[tail].size >= CL_METADATA_SIZE) sc->flags |= BUF_PACKING_OK; } } if (n == 0) { device_printf(sc->dev, "no usable SGE FL buffer size.\n"); rc = EINVAL; } s->safe_hwidx1 = -1; s->safe_hwidx2 = -1; if (safe_swz != NULL) { s->safe_hwidx1 = safe_swz->head_hwidx; for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) { int spare; hwb = &s->hw_buf_info[i]; #ifdef INVARIANTS if (fl_pad) MPASS(hwb->size % sp->pad_boundary == 0); #endif spare = safe_swz->size - hwb->size; if (spare >= CL_METADATA_SIZE) { s->safe_hwidx2 = i; break; } } } v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ); if (r != v) { device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r); rc = EINVAL; } m = v = F_TDDPTAGTCB; r = t4_read_reg(sc, A_ULP_RX_CTL); if ((r & m) != v) { device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r); rc = EINVAL; } m = 
V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; r = t4_read_reg(sc, A_TP_PARA_REG5); if ((r & m) != v) { device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r); rc = EINVAL; } t4_init_tp_params(sc); t4_read_mtu_tbl(sc, sc->params.mtus, NULL); t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd); return (rc); } int t4_create_dma_tag(struct adapter *sc) { int rc; rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmat); if (rc != 0) { device_printf(sc->dev, "failed to create main DMA tag: %d\n", rc); } return (rc); } void t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { struct sge_params *sp = &sc->params.sge; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A", "freelist buffer sizes"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, NULL, sp->pad_boundary, "payload pad boundary (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, NULL, sp->spg_len, "status page size (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, NULL, cong_drop, "congestion drop setting"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, NULL, sp->pack_boundary, "payload pack boundary (bytes)"); } int t4_destroy_dma_tag(struct adapter *sc) { if (sc->dmat) bus_dma_tag_destroy(sc->dmat); return (0); } /* * Allocate and initialize the firmware event queue and the management queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. 
Caller is responsible for cleanup in case this function fails. */ int t4_setup_adapter_queues(struct adapter *sc) { int rc; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); sysctl_ctx_init(&sc->ctx); sc->flags |= ADAP_SYSCTL_CTX; /* * Firmware event queue */ rc = alloc_fwq(sc); if (rc != 0) return (rc); /* * Management queue. This is just a control queue that uses the fwq as * its associated iq. */ rc = alloc_mgmtq(sc); return (rc); } /* * Idempotent */ int t4_teardown_adapter_queues(struct adapter *sc) { ADAPTER_LOCK_ASSERT_NOTOWNED(sc); /* Do this before freeing the queue */ if (sc->flags & ADAP_SYSCTL_CTX) { sysctl_ctx_free(&sc->ctx); sc->flags &= ~ADAP_SYSCTL_CTX; } free_mgmtq(sc); free_fwq(sc); return (0); } static inline int first_vector(struct vi_info *vi) { struct adapter *sc = vi->pi->adapter; if (sc->intr_count == 1) return (0); return (vi->first_intr); } /* * Given an arbitrary "index," come up with an iq that can be used by other * queues (of this VI) for interrupt forwarding, SGE egress updates, etc. * The iq returned is guaranteed to be something that takes direct interrupts. 
*/ static struct sge_iq * vi_intr_iq(struct vi_info *vi, int idx) { struct adapter *sc = vi->pi->adapter; struct sge *s = &sc->sge; struct sge_iq *iq = NULL; int nintr, i; if (sc->intr_count == 1) return (&sc->sge.fwq); nintr = vi->nintr; KASSERT(nintr != 0, ("%s: vi %p has no exclusive interrupts, total interrupts = %d", __func__, vi, sc->intr_count)); i = idx % nintr; if (vi->flags & INTR_RXQ) { if (i < vi->nrxq) { iq = &s->rxq[vi->first_rxq + i].iq; goto done; } i -= vi->nrxq; } #ifdef TCP_OFFLOAD if (vi->flags & INTR_OFLD_RXQ) { if (i < vi->nofldrxq) { iq = &s->ofld_rxq[vi->first_ofld_rxq + i].iq; goto done; } i -= vi->nofldrxq; } #endif panic("%s: vi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__, vi, vi->flags & INTR_ALL, idx, nintr); done: MPASS(iq != NULL); KASSERT(iq->flags & IQ_INTR, ("%s: iq %p (vi %p, intr_flags 0x%lx, idx %d)", __func__, iq, vi, vi->flags & INTR_ALL, idx)); return (iq); } /* Maximum payload that can be delivered with a single iq descriptor */ static inline int mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) { int payload; #ifdef TCP_OFFLOAD if (toe) { payload = sc->tt.rx_coalesce ? 
G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu; } else { #endif /* large enough even when hw VLAN extraction is disabled */ payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu; #ifdef TCP_OFFLOAD } #endif return (payload); } int t4_setup_vi_queues(struct vi_info *vi) { int rc = 0, i, j, intr_idx, iqid; struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *ctrlq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP int saved_idx; struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif char name[16]; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); int maxp, mtu = ifp->if_mtu; /* Interrupt vector to start from (when using multiple vectors) */ intr_idx = first_vector(vi); #ifdef DEV_NETMAP saved_idx = intr_idx; if (ifp->if_capabilities & IFCAP_NETMAP) { /* netmap is supported with direct interrupts only. */ MPASS(vi->flags & INTR_RXQ); /* * We don't have buffers to back the netmap rx queues * right now so we create the queues in a way that * doesn't set off any congestion signal in the chip. */ oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_nm_rxq(vi, i, nm_rxq) { rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq", CTLFLAG_RD, NULL, "tx queues"); for_each_nm_txq(vi, i, nm_txq) { iqid = vi->first_nm_rxq + (i % vi->nnmrxq); rc = alloc_nm_txq(vi, nm_txq, iqid, i, oid); if (rc != 0) goto done; } } /* Normal rx queues and netmap rx queues share the same interrupts. */ intr_idx = saved_idx; #endif /* * First pass over all NIC and TOE rx queues: * a) initialize iq and fl * b) allocate queue iff it will take direct interrupts. 
*/ maxp = mtu_to_max_payload(sc, mtu, 0); if (vi->flags & INTR_RXQ) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); } for_each_rxq(vi, i, rxq) { init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq); snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(vi->dev), i); init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name); if (vi->flags & INTR_RXQ) { rxq->iq.flags |= IQ_INTR; rc = alloc_rxq(vi, rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq); #endif #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); if (vi->flags & INTR_OFLD_RXQ) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); } for_each_ofld_rxq(vi, i, ofld_rxq) { init_iq(&ofld_rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq); snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(vi->dev), i); init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name); if (vi->flags & INTR_OFLD_RXQ) { ofld_rxq->iq.flags |= IQ_INTR; rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } } #endif /* * Second pass over all NIC and TOE rx queues. The queues forwarding * their interrupts are allocated now. 
*/ j = 0; if (!(vi->flags & INTR_RXQ)) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_rxq(vi, i, rxq) { MPASS(!(rxq->iq.flags & IQ_INTR)); intr_idx = vi_intr_iq(vi, j)->abs_id; rc = alloc_rxq(vi, rxq, intr_idx, i, oid); if (rc != 0) goto done; j++; } } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0 && !(vi->flags & INTR_OFLD_RXQ)) { oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); for_each_ofld_rxq(vi, i, ofld_rxq) { MPASS(!(ofld_rxq->iq.flags & IQ_INTR)); intr_idx = vi_intr_iq(vi, j)->abs_id; rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid); if (rc != 0) goto done; j++; } } #endif /* * Now the tx queues. Only one pass needed. */ oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD, NULL, "tx queues"); j = 0; for_each_txq(vi, i, txq) { iqid = vi_intr_iq(vi, j)->cntxt_id; snprintf(name, sizeof(name), "%s txq%d", device_get_nameunit(vi->dev), i); init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, iqid, name); rc = alloc_txq(vi, txq, i, oid); if (rc != 0) goto done; j++; } #ifdef TCP_OFFLOAD oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections"); for_each_ofld_txq(vi, i, ofld_txq) { struct sysctl_oid *oid2; iqid = vi_intr_iq(vi, j)->cntxt_id; snprintf(name, sizeof(name), "%s ofld_txq%d", device_get_nameunit(vi->dev), i); init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, iqid, name); snprintf(name, sizeof(name), "%d", i); oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL, "offload tx queue"); rc = alloc_wrq(sc, vi, ofld_txq, oid2); if (rc != 0) goto done; j++; } #endif /* * Finally, the control queue. 
*/ if (!IS_MAIN_VI(vi)) goto done; oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD, NULL, "ctrl queue"); ctrlq = &sc->sge.ctrlq[pi->port_id]; iqid = vi_intr_iq(vi, 0)->cntxt_id; snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev)); init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid, name); rc = alloc_wrq(sc, vi, ctrlq, oid); done: if (rc) t4_teardown_vi_queues(vi); return (rc); } /* * Idempotent */ int t4_teardown_vi_queues(struct vi_info *vi) { int i; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif /* Do this before freeing the queues */ if (vi->flags & VI_SYSCTL_CTX) { sysctl_ctx_free(&vi->ctx); vi->flags &= ~VI_SYSCTL_CTX; } #ifdef DEV_NETMAP if (vi->ifp->if_capabilities & IFCAP_NETMAP) { for_each_nm_txq(vi, i, nm_txq) { free_nm_txq(vi, nm_txq); } for_each_nm_rxq(vi, i, nm_rxq) { free_nm_rxq(vi, nm_rxq); } } #endif /* * Take down all the tx queues first, as they reference the rx queues * (for egress updates, etc.). */ if (IS_MAIN_VI(vi)) free_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(vi, i, txq) { free_txq(vi, txq); } #ifdef TCP_OFFLOAD for_each_ofld_txq(vi, i, ofld_txq) { free_wrq(sc, ofld_txq); } #endif /* * Then take down the rx queues that forward their interrupts, as they * reference other rx queues. */ for_each_rxq(vi, i, rxq) { if ((rxq->iq.flags & IQ_INTR) == 0) free_rxq(vi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { if ((ofld_rxq->iq.flags & IQ_INTR) == 0) free_ofld_rxq(vi, ofld_rxq); } #endif /* * Then take down the rx queues that take direct interrupts. 
*/ for_each_rxq(vi, i, rxq) { if (rxq->iq.flags & IQ_INTR) free_rxq(vi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { if (ofld_rxq->iq.flags & IQ_INTR) free_ofld_rxq(vi, ofld_rxq); } #endif return (0); } /* * Deals with errors and the firmware event queue. All data rx queues forward * their interrupt to the firmware event queue. */ void t4_intr_all(void *arg) { struct adapter *sc = arg; struct sge_iq *fwq = &sc->sge.fwq; t4_intr_err(arg); if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) { service_iq(fwq, 0); atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE); } } /* Deals with error interrupts */ void t4_intr_err(void *arg) { struct adapter *sc = arg; t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); t4_slow_intr_handler(sc); } void t4_intr_evt(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } void t4_intr(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } void t4_vi_intr(void *arg) { struct irq *irq = arg; #ifdef DEV_NETMAP if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) { t4_nm_intr(irq->nm_rxq); atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON); } #endif if (irq->rxq != NULL) t4_intr(irq->rxq); } /* * Deals with anything and everything on the given ingress queue. 
*/ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; struct sge_rxq *rxq = iq_to_rxq(iq); /* Use iff iq is part of rxq */ struct sge_fl *fl; /* Use iff IQ_HAS_FL */ struct adapter *sc = iq->adapter; struct iq_desc *d = &iq->desc[iq->cidx]; int ndescs = 0, limit; int rsp_type, refill; uint32_t lq; uint16_t fl_hw_cidx; struct mbuf *m0; STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); #if defined(INET) || defined(INET6) const struct timeval lro_timeout = {0, sc->lro_timeout}; #endif KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); limit = budget ? budget : iq->qsize / 16; if (iq->flags & IQ_HAS_FL) { fl = &rxq->fl; fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ } else { fl = NULL; fl_hw_cidx = 0; /* to silence gcc warning */ } /* * We always come back and check the descriptor ring for new indirect * interrupts and other responses after running a single handler. */ for (;;) { while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); refill = 0; m0 = NULL; rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: KASSERT(iq->flags & IQ_HAS_FL, ("%s: data for an iq (%p) with no freelist", __func__, iq)); m0 = get_fl_payload(sc, fl, lq); if (__predict_false(m0 == NULL)) goto process_iql; refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2; #ifdef T4_PKT_TIMESTAMP /* * 60 bit timestamp for the payload is * *(uint64_t *)m0->m_pktdat. Note that it is * in the leading free-space in the mbuf. The * kernel can clobber it during a pullup, * m_copymdata, etc. You need to make sure that * the mbuf reaches you unmolested if you care * about the timestamp. 
*/ *(uint64_t *)m0->m_pktdat = be64toh(ctrl->u.last_flit) & 0xfffffffffffffff; #endif /* fall through */ case X_RSPD_TYPE_CPL: KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, d->rss.opcode)); t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0); break; case X_RSPD_TYPE_INTR: /* * Interrupts should be forwarded only to queues * that are not forwarding their interrupts. * This means service_iq can recurse but only 1 * level deep. */ KASSERT(budget == 0, ("%s: budget %u, rsp_type %u", __func__, budget, rsp_type)); /* * There are 1K interrupt-capable queues (qids 0 * through 1023). A response type indicating a * forwarded interrupt with a qid >= 1K is an * iWARP async notification. */ if (lq >= 1024) { t4_an_handler(iq, &d->rsp); break; } q = sc->sge.iqmap[lq - sc->sge.iq_start]; if (atomic_cmpset_int(&q->state, IQS_IDLE, IQS_BUSY)) { if (service_iq(q, q->qsize / 16) == 0) { atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); } else { STAILQ_INSERT_TAIL(&iql, q, link); } } break; default: KASSERT(0, ("%s: illegal response type %d on iq %p", __func__, rsp_type, iq)); log(LOG_ERR, "%s: illegal response type %d on iq %p", device_get_nameunit(sc->dev), rsp_type, iq); break; } d++; if (__predict_false(++iq->cidx == iq->sidx)) { iq->cidx = 0; iq->gen ^= F_RSPD_GEN; d = &iq->desc[0]; } if (__predict_false(++ndescs == limit)) { - t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), + t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndescs = 0; #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED && sc->lro_timeout != 0) { tcp_lro_flush_inactive(&rxq->lro, &lro_timeout); } #endif if (budget) { if (iq->flags & IQ_HAS_FL) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); } return (EINPROGRESS); } } if (refill) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); fl_hw_cidx = fl->hw_cidx; } } process_iql: if (STAILQ_EMPTY(&iql)) break; /* * Process the head only, and 
send it to the back of the list if * it's still not done. */ q = STAILQ_FIRST(&iql); STAILQ_REMOVE_HEAD(&iql, link); if (service_iq(q, q->qsize / 8) == 0) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); else STAILQ_INSERT_TAIL(&iql, q, link); } #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED) { struct lro_ctrl *lro = &rxq->lro; tcp_lro_flush_all(lro); } #endif - t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_CIDXINC(ndescs) | + t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); if (iq->flags & IQ_HAS_FL) { int starved; FL_LOCK(fl); starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) add_fl_to_sfl(sc, fl); } return (0); } static inline int cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll) { int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0; if (rc) MPASS(cll->region3 >= CL_METADATA_SIZE); return (rc); } static inline struct cluster_metadata * cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll, caddr_t cl) { if (cl_has_metadata(fl, cll)) { struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; return ((struct cluster_metadata *)(cl + swz->size) - 1); } return (NULL); } static void rxb_free(struct mbuf *m, void *arg1, void *arg2) { uma_zone_t zone = arg1; caddr_t cl = arg2; uma_zfree(zone, cl); counter_u64_add(extfree_rels, 1); } /* * The mbuf returned by this function could be allocated from zone_mbuf or * constructed in spare room in the cluster. 
* * The mbuf carries the payload in one of these ways * a) frame inside the mbuf (mbuf from zone_mbuf) * b) m_cljset (for clusters without metadata) zone_mbuf * c) m_extaddref (cluster with metadata) inline mbuf * d) m_extaddref (cluster with metadata) zone_mbuf */ static struct mbuf * get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset, int remaining) { struct mbuf *m; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; struct cluster_layout *cll = &sd->cll; struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx]; struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl); int len, blen; caddr_t payload; blen = hwb->size - fl->rx_offset; /* max possible in this buf */ len = min(remaining, blen); payload = sd->cl + cll->region1 + fl->rx_offset; if (fl->flags & FL_BUF_PACKING) { const u_int l = fr_offset + len; const u_int pad = roundup2(l, fl->buf_boundary) - l; if (fl->rx_offset + len + pad < hwb->size) blen = len + pad; MPASS(fl->rx_offset + blen <= hwb->size); } else { MPASS(fl->rx_offset == 0); /* not packing */ } if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { /* * Copy payload into a freshly allocated mbuf. */ m = fr_offset == 0 ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; #ifdef T4_PKT_TIMESTAMP /* Leave room for a timestamp */ m->m_data += 8; #endif /* copy data to mbuf */ bcopy(payload, mtod(m, caddr_t), len); } else if (sd->nmbuf * MSIZE < cll->region1) { /* * There's spare room in the cluster for an mbuf. Create one * and associate it with the payload that's in the cluster. */ MPASS(clm != NULL); m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE); /* No bzero required */ if (m_init(m, M_NOWAIT, MT_DATA, fr_offset == 0 ? 
M_PKTHDR | M_NOFREE : M_NOFREE)) return (NULL); fl->mbuf_inlined++; m_extaddref(m, payload, blen, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { /* * Grab an mbuf from zone_mbuf and associate it with the * payload in the cluster. */ m = fr_offset == 0 ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; if (clm != NULL) { m_extaddref(m, payload, blen, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { m_cljset(m, sd->cl, swz->type); sd->cl = NULL; /* consumed, not a recycle candidate */ } } if (fr_offset == 0) m->m_pkthdr.len = remaining; m->m_len = len; if (fl->flags & FL_BUF_PACKING) { fl->rx_offset += blen; MPASS(fl->rx_offset <= hwb->size); if (fl->rx_offset < hwb->size) return (m); /* without advancing the cidx */ } if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } fl->rx_offset = 0; return (m); } static struct mbuf * get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf) { struct mbuf *m0, *m, **pnext; u_int remaining; const u_int total = G_RSPD_LEN(len_newbuf); if (__predict_false(fl->flags & FL_BUF_RESUME)) { M_ASSERTPKTHDR(fl->m0); MPASS(fl->m0->m_pkthdr.len == total); MPASS(fl->remaining < total); m0 = fl->m0; pnext = fl->pnext; remaining = fl->remaining; fl->flags &= ~FL_BUF_RESUME; goto get_segment; } if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) { fl->rx_offset = 0; if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } } /* * Payload starts at rx_offset in the current hw buffer. Its length is * 'len' and it may span multiple hw buffers. 
*/ m0 = get_scatter_segment(sc, fl, 0, total); if (m0 == NULL) return (NULL); remaining = total - m0->m_len; pnext = &m0->m_next; while (remaining > 0) { get_segment: MPASS(fl->rx_offset == 0); m = get_scatter_segment(sc, fl, total - remaining, remaining); if (__predict_false(m == NULL)) { fl->m0 = m0; fl->pnext = pnext; fl->remaining = remaining; fl->flags |= FL_BUF_RESUME; return (NULL); } *pnext = m; pnext = &m->m_next; remaining -= m->m_len; } *pnext = NULL; M_ASSERTPKTHDR(m0); return (m0); } static int t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) { struct sge_rxq *rxq = iq_to_rxq(iq); struct ifnet *ifp = rxq->ifp; struct adapter *sc = iq->adapter; const struct cpl_rx_pkt *cpl = (const void *)(rss + 1); #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxq->lro; #endif static const int sw_hashtype[4][2] = { {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6}, {M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6}, {M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6}, }; KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__, rss->opcode)); m0->m_pkthdr.len -= sc->params.sge.fl_pktshift; m0->m_len -= sc->params.sge.fl_pktshift; m0->m_data += sc->params.sge.fl_pktshift; m0->m_pkthdr.rcvif = ifp; M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]); m0->m_pkthdr.flowid = be32toh(rss->hash_val); if (cpl->csum_calc && !cpl->err_vec) { if (ifp->if_capenable & IFCAP_RXCSUM && cpl->l2info & htobe32(F_RXF_IP)) { m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); rxq->rxcsum++; } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && cpl->l2info & htobe32(F_RXF_IP6)) { m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); rxq->rxcsum++; } if (__predict_false(cpl->ip_frag)) m0->m_pkthdr.csum_data = be16toh(cpl->csum); else m0->m_pkthdr.csum_data = 0xffff; } if (cpl->vlan_ex) { m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan); m0->m_flags |= 
M_VLANTAG; rxq->vlan_extraction++; } #if defined(INET) || defined(INET6) if (cpl->l2info & htobe32(F_RXF_LRO) && iq->flags & IQ_LRO_ENABLED && tcp_lro_rx(lro, m0, 0) == 0) { /* queued for LRO */ } else #endif ifp->if_input(ifp, m0); return (0); } /* * Must drain the wrq or make sure that someone else will. */ static void wrq_tx_drain(void *arg, int n) { struct sge_wrq *wrq = arg; struct sge_eq *eq = &wrq->eq; EQ_LOCK(eq); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(wrq->adapter, wrq); EQ_UNLOCK(eq); } static void drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq) { struct sge_eq *eq = &wrq->eq; u_int available, dbdiff; /* # of hardware descriptors */ u_int n; struct wrqe *wr; struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ EQ_LOCK_ASSERT_OWNED(eq); MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); wr = STAILQ_FIRST(&wrq->wr_list); MPASS(wr != NULL); /* Must be called with something useful to do */ MPASS(eq->pidx == eq->dbidx); dbdiff = 0; do { eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; MPASS(wr->wrq == wrq); n = howmany(wr->wr_len, EQ_ESIZE); if (available < n) break; dst = (void *)&eq->desc[eq->pidx]; if (__predict_true(eq->sidx - eq->pidx > n)) { /* Won't wrap, won't end exactly at the status page. 
*/ bcopy(&wr->wr[0], dst, wr->wr_len); eq->pidx += n; } else { int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE; bcopy(&wr->wr[0], dst, first_portion); if (wr->wr_len > first_portion) { bcopy(&wr->wr[first_portion], &eq->desc[0], wr->wr_len - first_portion); } eq->pidx = n - (eq->sidx - eq->pidx); } if (available < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } dbdiff += n; if (dbdiff >= 16) { ring_eq_db(sc, eq, dbdiff); dbdiff = 0; } STAILQ_REMOVE_HEAD(&wrq->wr_list, link); free_wrqe(wr); MPASS(wrq->nwr_pending > 0); wrq->nwr_pending--; MPASS(wrq->ndesc_needed >= n); wrq->ndesc_needed -= n; } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL); if (dbdiff) ring_eq_db(sc, eq, dbdiff); } /* * Doesn't fail. Holds on to work requests it can't send right away. */ void t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { #ifdef INVARIANTS struct sge_eq *eq = &wrq->eq; #endif EQ_LOCK_ASSERT_OWNED(eq); MPASS(wr != NULL); MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN); MPASS((wr->wr_len & 0x7) == 0); STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); wrq->nwr_pending++; wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE); if (!TAILQ_EMPTY(&wrq->incomplete_wrs)) return; /* commit_wrq_wr will drain wr_list as well. */ drain_wrq_wr_list(sc, wrq); /* Doorbell must have caught up to the pidx. 
*/ MPASS(eq->pidx == eq->dbidx); } void t4_update_fl_bufsize(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif struct sge_fl *fl; int i, maxp, mtu = ifp->if_mtu; maxp = mtu_to_max_payload(sc, mtu, 0); for_each_rxq(vi, i, rxq) { fl = &rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(vi, i, ofld_rxq) { fl = &ofld_rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #endif } static inline int mbuf_nsegs(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.l5hlen > 0, ("%s: mbuf %p missing information on # of segments.", __func__, m)); return (m->m_pkthdr.l5hlen); } static inline void set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) { M_ASSERTPKTHDR(m); m->m_pkthdr.l5hlen = nsegs; } static inline int mbuf_len16(struct mbuf *m) { int n; M_ASSERTPKTHDR(m); n = m->m_pkthdr.PH_loc.eight[0]; MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); return (n); } static inline void set_mbuf_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[0] = len16; } static inline int needs_tso(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & CSUM_TSO) { KASSERT(m->m_pkthdr.tso_segsz > 0, ("%s: TSO requested in mbuf %p but MSS not provided", __func__, m)); return (1); } return (0); } static inline int needs_l3_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) return (1); return (0); } static inline int needs_l4_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) return (1); return (0); } static inline int needs_vlan_insertion(struct mbuf *m) { M_ASSERTPKTHDR(m); if (m->m_flags & M_VLANTAG) { KASSERT(m->m_pkthdr.ether_vtag != 0, ("%s: HWVLAN requested in mbuf %p but tag not 
provided", __func__, m)); return (1); } return (0); } static void * m_advance(struct mbuf **pm, int *poffset, int len) { struct mbuf *m = *pm; int offset = *poffset; uintptr_t p = 0; MPASS(len > 0); while (len) { if (offset + len < m->m_len) { offset += len; p = mtod(m, uintptr_t) + offset; break; } len -= m->m_len - offset; m = m->m_next; offset = 0; MPASS(m != NULL); } *poffset = offset; *pm = m; return ((void *)p); } static inline int same_paddr(char *a, char *b) { if (a == b) return (1); else if (a != NULL && b != NULL) { vm_offset_t x = (vm_offset_t)a; vm_offset_t y = (vm_offset_t)b; if ((x & PAGE_MASK) == (y & PAGE_MASK) && pmap_kextract(x) == pmap_kextract(y)) return (1); } return (0); } /* * Can deal with empty mbufs in the chain that have m_len = 0, but the chain * must have at least one mbuf that's not empty. */ static inline int count_mbuf_nsegs(struct mbuf *m) { char *prev_end, *start; int len, nsegs; MPASS(m != NULL); nsegs = 0; prev_end = NULL; for (; m; m = m->m_next) { len = m->m_len; if (__predict_false(len == 0)) continue; start = mtod(m, char *); nsegs += sglist_count(start, len); if (same_paddr(prev_end, start)) nsegs--; prev_end = start + len; } MPASS(nsegs > 0); return (nsegs); } /* * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: * a) caller can assume it's been freed if this function returns with an error. * b) it may get defragged up if the gather list is too long for the hardware. */ int parse_pkt(struct mbuf **mp) { struct mbuf *m0 = *mp, *m; int rc, nsegs, defragged = 0, offset; struct ether_header *eh; void *l3hdr; #if defined(INET) || defined(INET6) struct tcphdr *tcp; #endif uint16_t eh_type; M_ASSERTPKTHDR(m0); if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { rc = EINVAL; fail: m_freem(m0); *mp = NULL; return (rc); } restart: /* * First count the number of gather list segments in the payload. * Defrag the mbuf if nsegs exceeds the hardware limit. 
*/ M_ASSERTPKTHDR(m0); MPASS(m0->m_pkthdr.len > 0); nsegs = count_mbuf_nsegs(m0); if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) { if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) { rc = EFBIG; goto fail; } *mp = m0 = m; /* update caller's copy after defrag */ goto restart; } if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) { m0 = m_pullup(m0, m0->m_pkthdr.len); if (m0 == NULL) { /* Should have left well enough alone. */ rc = EFBIG; goto fail; } *mp = m0; /* update caller's copy after pullup */ goto restart; } set_mbuf_nsegs(m0, nsegs); set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0))); if (!needs_tso(m0)) return (0); m = m0; eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; eh_type = ntohs(evh->evl_proto); m0->m_pkthdr.l2hlen = sizeof(*evh); } else m0->m_pkthdr.l2hlen = sizeof(*eh); offset = 0; l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = l3hdr; MPASS(ip6->ip6_nxt == IPPROTO_TCP); m0->m_pkthdr.l3hlen = sizeof(*ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; m0->m_pkthdr.l3hlen = ip->ip_hl * 4; break; } #endif default: panic("%s: ethertype 0x%04x unknown. 
if_cxgbe must be compiled" " with the same INET/INET6 options as the kernel.", __func__, eh_type); } #if defined(INET) || defined(INET6) tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); m0->m_pkthdr.l4hlen = tcp->th_off * 4; #endif MPASS(m0 == *mp); return (0); } void * start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, available; struct wrqe *wr; void *w; MPASS(len16 > 0); ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); EQ_LOCK(eq); if (!STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); if (!STAILQ_EMPTY(&wrq->wr_list)) { slowpath: EQ_UNLOCK(eq); wr = alloc_wrqe(len16 * 16, wrq); if (__predict_false(wr == NULL)) return (NULL); cookie->pidx = -1; cookie->ndesc = ndesc; return (&wr->wr); } eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < ndesc) goto slowpath; cookie->pidx = eq->pidx; cookie->ndesc = ndesc; TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); w = &eq->desc[eq->pidx]; IDXINCR(eq->pidx, ndesc, eq->sidx); if (__predict_false(eq->pidx < ndesc - 1)) { w = &wrq->ss[0]; wrq->ss_pidx = cookie->pidx; wrq->ss_len = len16 * 16; } EQ_UNLOCK(eq); return (w); } void commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, pidx; struct wrq_cookie *prev, *next; if (cookie->pidx == -1) { struct wrqe *wr = __containerof(w, struct wrqe, wr); t4_wrq_tx(sc, wr); return; } ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ pidx = cookie->pidx; MPASS(pidx >= 0 && pidx < eq->sidx); if (__predict_false(w == &wrq->ss[0])) { int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; MPASS(wrq->ss_len > n); /* WR had better wrap around. 
*/ bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); wrq->tx_wrs_ss++; } else wrq->tx_wrs_direct++; EQ_LOCK(eq); prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); next = TAILQ_NEXT(cookie, link); if (prev == NULL) { MPASS(pidx == eq->dbidx); if (next == NULL || ndesc >= 16) ring_eq_db(wrq->adapter, eq, ndesc); else { MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); next->pidx = pidx; next->ndesc += ndesc; } } else { MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); prev->ndesc += ndesc; } TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); #ifdef INVARIANTS if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { /* Doorbell must have caught up to the pidx. */ MPASS(wrq->eq.pidx == wrq->eq.dbidx); } #endif EQ_UNLOCK(eq); } static u_int can_resume_eth_tx(struct mp_ring *r) { struct sge_eq *eq = r->cookie; return (total_available_tx_desc(eq) > eq->sidx / 8); } static inline int cannot_use_txpkts(struct mbuf *m) { /* maybe put a GL limit too, to avoid silliness? */ return (needs_tso(m)); } /* * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to * be consumed. Return the actual number consumed. 0 indicates a stall. */ static u_int eth_tx(struct mp_ring *r, u_int cidx, u_int pidx) { struct sge_txq *txq = r->cookie; struct sge_eq *eq = &txq->eq; struct ifnet *ifp = txq->ifp; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; u_int total, remaining; /* # of packets */ u_int available, dbdiff; /* # of hardware descriptors */ u_int n, next_cidx; struct mbuf *m0, *tail; struct txpkts txp; struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */ remaining = IDXDIFF(pidx, cidx, r->size); MPASS(remaining > 0); /* Must not be called without work to do. 
*/ total = 0; TXQ_LOCK(txq); if (__predict_false((eq->flags & EQ_ENABLED) == 0)) { while (cidx != pidx) { m0 = r->items[cidx]; m_freem(m0); if (++cidx == r->size) cidx = 0; } reclaim_tx_descs(txq, 2048); total = remaining; goto done; } /* How many hardware descriptors do we have readily available. */ if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); while (remaining > 0) { m0 = r->items[cidx]; M_ASSERTPKTHDR(m0); MPASS(m0->m_nextpkt == NULL); if (available < SGE_MAX_WR_NDESC) { available += reclaim_tx_descs(txq, 64); if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16)) break; /* out of descriptors */ } next_cidx = cidx + 1; if (__predict_false(next_cidx == r->size)) next_cidx = 0; wr = (void *)&eq->desc[eq->pidx]; if (remaining > 1 && try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) { /* pkts at cidx, next_cidx should both be in txp. */ MPASS(txp.npkt == 2); tail = r->items[next_cidx]; MPASS(tail->m_nextpkt == NULL); ETHER_BPF_MTAP(ifp, m0); ETHER_BPF_MTAP(ifp, tail); m0->m_nextpkt = tail; if (__predict_false(++next_cidx == r->size)) next_cidx = 0; while (next_cidx != pidx) { if (add_to_txpkts(r->items[next_cidx], &txp, available) != 0) break; tail->m_nextpkt = r->items[next_cidx]; tail = tail->m_nextpkt; ETHER_BPF_MTAP(ifp, tail); if (__predict_false(++next_cidx == r->size)) next_cidx = 0; } n = write_txpkts_wr(txq, wr, m0, &txp, available); total += txp.npkt; remaining -= txp.npkt; } else { total++; remaining--; ETHER_BPF_MTAP(ifp, m0); n = write_txpkt_wr(txq, (void *)wr, m0, available); } MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC); available -= n; dbdiff += n; IDXINCR(eq->pidx, n, eq->sidx); if (total_available_tx_desc(eq) < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { 
wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } if (dbdiff >= 16 && remaining >= 4) { ring_eq_db(sc, eq, dbdiff); available += reclaim_tx_descs(txq, 4 * dbdiff); dbdiff = 0; } cidx = next_cidx; } if (dbdiff != 0) { ring_eq_db(sc, eq, dbdiff); reclaim_tx_descs(txq, 32); } done: TXQ_UNLOCK(txq); return (total); } static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, int qsize) { KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS, ("%s: bad tmr_idx %d", __func__, tmr_idx)); KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */ ("%s: bad pktc_idx %d", __func__, pktc_idx)); iq->flags = 0; iq->adapter = sc; iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx); iq->intr_pktc_idx = SGE_NCOUNTERS - 1; if (pktc_idx >= 0) { iq->intr_params |= F_QINTR_CNT_EN; iq->intr_pktc_idx = pktc_idx; } iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */ iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE; } static inline void init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name) { fl->qsize = qsize; fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; strlcpy(fl->lockname, name, sizeof(fl->lockname)); if (sc->flags & BUF_PACKING_OK && ((!is_t4(sc) && buffer_packing) || /* T5+: enabled unless 0 */ (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */ fl->flags |= FL_BUF_PACKING; find_best_refill_source(sc, fl, maxp); find_safe_refill_source(sc, fl); } static inline void init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan, uint16_t iqid, char *name) { KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype)); eq->flags = eqtype & EQ_TYPEMASK; eq->tx_chan = tx_chan; eq->iqid = iqid; eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; strlcpy(eq->lockname, name, sizeof(eq->lockname)); } static int alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_addr_t *pa, void **va) { int rc; rc = 
bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc); goto done; } rc = bus_dmamem_alloc(*tag, va, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc); goto done; } rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0); if (rc != 0) { device_printf(sc->dev, "cannot load DMA map: %d\n", rc); goto done; } done: if (rc) free_ring(sc, *tag, *map, *pa, *va); return (rc); } static int free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map, bus_addr_t pa, void *va) { if (pa) bus_dmamap_unload(tag, map); if (va) bus_dmamem_free(tag, va, map); if (tag) bus_dma_tag_destroy(tag); return (0); } /* * Allocates the ring for an ingress queue and an optional freelist. If the * freelist is specified it will be allocated and then associated with the * ingress queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. * * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then * the intr_idx specifies the vector, starting from 0. Otherwise it specifies * the abs_id of the ingress queue to which its interrupts should be forwarded. 
*/ static int alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl, int intr_idx, int cong) { int rc, i, cntxt_id; size_t len; struct fw_iq_cmd c; struct port_info *pi = vi->pi; struct adapter *sc = iq->adapter; struct sge_params *sp = &sc->params.sge; __be32 v = 0; len = iq->qsize * IQ_ESIZE; rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba, (void **)&iq->desc); if (rc != 0) return (rc); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | V_FW_IQ_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | FW_LEN16(c)); /* Special handling for firmware event queue */ if (iq == &sc->sge.fwq) v |= F_FW_IQ_CMD_IQASYNCH; if (iq->flags & IQ_INTR) { KASSERT(intr_idx < sc->intr_count, ("%s: invalid direct intr_idx %d", __func__, intr_idx)); } else v |= F_FW_IQ_CMD_IQANDST; v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx); c.type_to_iqandstindex = htobe32(v | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(vi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) | V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(iq->qsize); c.iqaddr = htobe64(iq->ba); if (cong >= 0) c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN); if (fl) { mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); len = fl->qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); if (rc) return (rc); /* Allocate space for one software descriptor per buffer. 
*/ rc = alloc_fl_sdesc(fl); if (rc != 0) { device_printf(sc->dev, "failed to setup fl software descriptors: %d\n", rc); return (rc); } if (fl->flags & FL_BUF_PACKING) { fl->lowat = roundup2(sp->fl_starve_threshold2, 8); fl->buf_boundary = sp->pack_boundary; } else { fl->lowat = roundup2(sp->fl_starve_threshold, 8); fl->buf_boundary = 16; } if (fl_pad && fl->buf_boundary < sp->pad_boundary) fl->buf_boundary = sp->pad_boundary; c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN : 0)); if (cong >= 0) { c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) | F_FW_IQ_CMD_FL0CONGCIF | F_FW_IQ_CMD_FL0CONGEN); } c.fl0dcaen_to_fl0cidxfthresh = htobe16(V_FW_IQ_CMD_FL0FBMIN(X_FETCHBURSTMIN_128B) | V_FW_IQ_CMD_FL0FBMAX(X_FETCHBURSTMAX_512B)); c.fl0size = htobe16(fl->qsize); c.fl0addr = htobe64(fl->ba); } rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create ingress queue: %d\n", rc); return (rc); } iq->cidx = 0; iq->gen = F_RSPD_GEN; iq->intr_next = iq->intr_params; iq->cntxt_id = be16toh(c.iqid); iq->abs_id = be16toh(c.physiqid); iq->flags |= IQ_ALLOCATED; cntxt_id = iq->cntxt_id - sc->sge.iq_start; if (cntxt_id >= sc->sge.niq) { panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.niq - 1); } sc->sge.iqmap[cntxt_id] = iq; if (fl) { u_int qid; iq->flags |= IQ_HAS_FL; fl->cntxt_id = be16toh(c.fl0id); fl->pidx = fl->cidx = 0; cntxt_id = fl->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) { panic("%s: fl->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); } sc->sge.eqmap[cntxt_id] = (void *)fl; qid = fl->cntxt_id; if (isset(&sc->doorbells, DOORBELL_UDB)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; 
udb += (qid >> s_qpp) << PAGE_SHIFT; qid &= mask; if (qid < PAGE_SIZE / UDBS_SEG_SIZE) { udb += qid << UDBS_SEG_SHIFT; qid = 0; } fl->udb = (volatile void *)udb; } fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db; FL_LOCK(fl); /* Enough to make sure the SGE doesn't think it's starved */ refill_fl(sc, fl, fl->lowat); FL_UNLOCK(fl); } if (is_t5(sc) && cong >= 0) { uint32_t param, val; param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(iq->cntxt_id); if (cong == 0) val = 1 << 19; else { val = 2 << 19; for (i = 0; i < 4; i++) { if (cong & (1 << i)) val |= 1 << (i << 2); } } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* report error but carry on */ device_printf(sc->dev, "failed to set congestion manager context for " "ingress queue %d: %d\n", iq->cntxt_id, rc); } } /* Enable IQ interrupts */ atomic_store_rel_int(&iq->state, IQS_IDLE); - t4_write_reg(sc, MYPF_REG(A_SGE_PF_GTS), V_SEINTARM(iq->intr_params) | + t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) | V_INGRESSQID(iq->cntxt_id)); return (0); } static int free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl) { int rc; struct adapter *sc = iq->adapter; device_t dev; if (sc == NULL) return (0); /* nothing to do */ dev = vi ? vi->dev : sc->dev; if (iq->flags & IQ_ALLOCATED) { rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id, fl ? 
fl->cntxt_id : 0xffff, 0xffff); if (rc != 0) { device_printf(dev, "failed to free queue %p: %d\n", iq, rc); return (rc); } iq->flags &= ~IQ_ALLOCATED; } free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc); bzero(iq, sizeof(*iq)); if (fl) { free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc); if (fl->sdesc) free_fl_sdesc(sc, fl); if (mtx_initialized(&fl->fl_lock)) mtx_destroy(&fl->fl_lock); bzero(fl, sizeof(*fl)); } return (0); } static void add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_fl *fl) { struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL, fl_pad ? 1 : 0, "padding enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL, fl->flags & FL_BUF_PACKING ? 
1 : 0, "packing enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, 0, "consumer index"); if (fl->flags & FL_BUF_PACKING) { SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); } SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, 0, "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated", CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined", CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); } static int alloc_fwq(struct adapter *sc) { int rc, intr_idx; struct sge_iq *fwq = &sc->sge.fwq; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE); fwq->flags |= IQ_INTR; /* always */ intr_idx = sc->intr_count > 1 ? 
1 : 0; fwq->set_tcb_rpl = t4_filter_rpl; fwq->l2t_write_rpl = do_l2t_write_rpl; rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1); if (rc != 0) { device_printf(sc->dev, "failed to create firmware event queue: %d\n", rc); return (rc); } oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD, NULL, "firmware event queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I", "consumer index"); return (0); } static int free_fwq(struct adapter *sc) { return free_iq_fl(NULL, &sc->sge.fwq, NULL); } static int alloc_mgmtq(struct adapter *sc) { int rc; struct sge_wrq *mgmtq = &sc->sge.mgmtq; char name[16]; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD, NULL, "management queue"); snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev)); init_eq(sc, &mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan, sc->sge.fwq.cntxt_id, name); rc = alloc_wrq(sc, NULL, mgmtq, oid); if (rc != 0) { device_printf(sc->dev, "failed to create management queue: %d\n", rc); return (rc); } return (0); } static int free_mgmtq(struct adapter *sc) { return free_wrq(sc, &sc->sge.mgmtq); } int tnl_cong(struct port_info *pi, int drop) { if (drop == -1) return (-1); else if (drop == 1) return (0); else return (pi->rx_chan_map); } static int alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(vi, &rxq->iq, 
&rxq->fl, intr_idx, tnl_cong(vi->pi, cong_drop)); if (rc != 0) return (rc); /* * The freelist is just barely above the starvation threshold right now, * fill it up a bit more. */ FL_LOCK(&rxq->fl); refill_fl(vi->pi->adapter, &rxq->fl, 128); FL_UNLOCK(&rxq->fl); #if defined(INET) || defined(INET6) rc = tcp_lro_init(&rxq->lro); if (rc != 0) return (rc); rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */ if (vi->ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; #endif rxq->ifp = vi->ifp; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); #if defined(INET) || defined(INET6) SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, &rxq->lro.lro_queued, 0, NULL); SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &rxq->lro.lro_flushed, 0, NULL); #endif SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD, &rxq->rxcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); add_fl_sysctls(&vi->ctx, oid, &rxq->fl); return (rc); } static int free_rxq(struct vi_info *vi, struct sge_rxq *rxq) { int rc; #if defined(INET) || defined(INET6) if (rxq->lro.ifp) { tcp_lro_free(&rxq->lro); rxq->lro.ifp = NULL; } #endif rc = free_iq_fl(vi, &rxq->iq, &rxq->fl); if (rc == 0) bzero(rxq, sizeof(*rxq)); 
return (rc); } #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, vi->pi->rx_chan_map); if (rc != 0) return (rc); children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I", "consumer index"); add_fl_sysctls(&vi->ctx, oid, &ofld_rxq->fl); return (rc); } static int free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq) { int rc; rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl); if (rc == 0) bzero(ofld_rxq, sizeof(*ofld_rxq)); return (rc); } #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; struct sysctl_ctx_list *ctx; char name[16]; size_t len; struct adapter *sc = vi->pi->adapter; struct netmap_adapter *na = NA(vi->ifp); MPASS(na != NULL); len = vi->qsize_rxq * IQ_ESIZE; rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map, &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc); if (rc != 0) return (rc); len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len; rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map, &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc); if (rc != 0) return (rc); nm_rxq->vi = vi; nm_rxq->nid = idx; 
nm_rxq->iq_cidx = 0; nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE; nm_rxq->iq_gen = F_RSPD_GEN; nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; nm_rxq->fl_sidx = na->num_rx_desc; nm_rxq->intr_idx = intr_idx; ctx = &vi->ctx; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I", "consumer index"); children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &nm_rxq->fl_cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &nm_rxq->fl_pidx, 0, "producer index"); return (rc); } static int free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq) { struct adapter *sc = vi->pi->adapter; free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba, nm_rxq->iq_desc); free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba, nm_rxq->fl_desc); return (0); } static int alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx, struct sysctl_oid *oid) { int rc; size_t len; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct netmap_adapter *na = NA(vi->ifp); char name[16]; struct sysctl_oid_list *children = 
SYSCTL_CHILDREN(oid); len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len; rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map, &nm_txq->ba, (void **)&nm_txq->desc); if (rc) return (rc); nm_txq->pidx = nm_txq->cidx = 0; nm_txq->sidx = na->num_tx_desc; nm_txq->nid = idx; nm_txq->iqidx = iqidx; nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_VF_VLD(1) | V_TXPKT_VF(vi->viid)); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "netmap tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &nm_txq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I", "producer index"); return (rc); } static int free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { struct adapter *sc = vi->pi->adapter; free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba, nm_txq->desc); return (0); } #endif static int ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ctrl_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) | V_FW_EQ_CTRL_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); c.physeqid_pkd = htobe32(0); c.fetchszm_to_iqid = htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = 
htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_CTRL_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create control queue %d: %d\n", eq->tx_chan, rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } static int eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_eth_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid)); c.fetchszm_to_iqid = htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create Ethernet egress queue: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; 
return (rc); } #ifdef TCP_OFFLOAD static int ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ofld_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) | V_FW_EQ_OFLD_CMD_VFN(0)); c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); c.fetchszm_to_iqid = htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_OFLD_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create egress queue for TCP offload: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #endif static int alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, qsize; size_t len; mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; len = qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, (void **)&eq->desc); if (rc) return (rc); eq->pidx = eq->cidx = 0; eq->equeqidx = eq->dbidx = 0; eq->doorbells = sc->doorbells; switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = ctrl_eq_alloc(sc, eq); break; case EQ_ETH: rc = eth_eq_alloc(sc, vi, eq); break; #ifdef TCP_OFFLOAD case EQ_OFLD: rc = ofld_eq_alloc(sc, vi, eq); break; #endif default: panic("%s: invalid eq type 
%d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to allocate egress queue(%d): %d\n", eq->flags & EQ_TYPEMASK, rc); } if (isset(&eq->doorbells, DOORBELL_UDB) || isset(&eq->doorbells, DOORBELL_UDBWC) || isset(&eq->doorbells, DOORBELL_WCWR)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */ eq->udb_qid = eq->cntxt_id & mask; /* id in page */ if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&eq->doorbells, DOORBELL_WCWR); else { udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */ eq->udb_qid = 0; } eq->udb = (volatile void *)udb; } return (rc); } static int free_eq(struct adapter *sc, struct sge_eq *eq) { int rc; if (eq->flags & EQ_ALLOCATED) { switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; case EQ_ETH: rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #ifdef TCP_OFFLOAD case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to free egress queue (%d): %d\n", eq->flags & EQ_TYPEMASK, rc); return (rc); } eq->flags &= ~EQ_ALLOCATED; } free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc); if (mtx_initialized(&eq->eq_lock)) mtx_destroy(&eq->eq_lock); bzero(eq, sizeof(*eq)); return (0); } static int alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq, struct sysctl_oid *oid) { int rc; struct sysctl_ctx_list *ctx = vi ? 
&vi->ctx : &sc->ctx; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = alloc_eq(sc, vi, &wrq->eq); if (rc) return (rc); wrq->adapter = sc; TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq); TAILQ_INIT(&wrq->incomplete_wrs); STAILQ_INIT(&wrq->wr_list); wrq->nwr_pending = 0; wrq->ndesc_needed = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD, &wrq->tx_wrs_direct, "# of work requests (direct)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD, &wrq->tx_wrs_copied, "# of work requests (copied)"); return (rc); } static int free_wrq(struct adapter *sc, struct sge_wrq *wrq) { int rc; rc = free_eq(sc, &wrq->eq); if (rc) return (rc); bzero(wrq, sizeof(*wrq)); return (0); } static int alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx, struct sysctl_oid *oid) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, M_CXGBE, M_WAITOK); if (rc != 0) { device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc); return (rc); } rc = alloc_eq(sc, vi, eq); if (rc != 0) { mp_ring_free(txq->r); txq->r = NULL; return (rc); } /* Can't fail after this point. 
*/ TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); txq->ifp = vi->ifp; txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_VF_VLD(1) | V_TXPKT_VF(vi->viid)); txq->tc_idx = -1; txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, M_ZERO | M_WAITOK); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &eq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc", CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I", "traffic class (-1 means none)"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD, &txq->txcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD, &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD, &txq->tso_wrs, "# of TSO work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD, &txq->imm_wrs, "# of work requests with immediate data"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD, &txq->sgl_wrs, "# of work requests with direct SGL"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD, &txq->txpkts0_wrs, "# of txpkts (type 0) work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs", 
CTLFLAG_RD, &txq->txpkts1_wrs, "# of txpkts (type 1) work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD, &txq->txpkts0_pkts, "# of frames tx'd using type0 txpkts work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD, &txq->txpkts1_pkts, "# of frames tx'd using type1 txpkts work requests"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues", CTLFLAG_RD, &txq->r->enqueues, "# of enqueues to the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops", CTLFLAG_RD, &txq->r->drops, "# of drops in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts", CTLFLAG_RD, &txq->r->starts, "# of normal consumer starts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls", CTLFLAG_RD, &txq->r->stalls, "# of consumer stalls in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts", CTLFLAG_RD, &txq->r->restarts, "# of consumer restarts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications", CTLFLAG_RD, &txq->r->abdications, "# of consumer abdications in the mp_ring for this queue"); return (0); } static int free_txq(struct vi_info *vi, struct sge_txq *txq) { int rc; struct adapter *sc = vi->pi->adapter; struct sge_eq *eq = &txq->eq; rc = free_eq(sc, eq); if (rc) return (rc); sglist_free(txq->gl); free(txq->sdesc, M_CXGBE); mp_ring_free(txq->r); bzero(txq, sizeof(*txq)); return (0); } static void oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *ba = arg; KASSERT(nseg == 1, ("%s meant for single segment mappings only.", __func__)); *ba = error ? 
0 : segs->ds_addr; } static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl) { uint32_t n, v; n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx); MPASS(n > 0); wmb(); v = fl->dbval | V_PIDX(n); if (fl->udb) *fl->udb = htole32(v); else - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), v); + t4_write_reg(sc, sc->sge_kdoorbell_reg, v); IDXINCR(fl->dbidx, n, fl->sidx); } /* * Fills up the freelist by allocating up to 'n' buffers. Buffers that are * recycled do not count towards this allocation budget. * * Returns non-zero to indicate that this freelist should be added to the list * of starving freelists. */ static int refill_fl(struct adapter *sc, struct sge_fl *fl, int n) { __be64 *d; struct fl_sdesc *sd; uintptr_t pa; caddr_t cl; struct cluster_layout *cll; struct sw_zone_info *swz; struct cluster_metadata *clm; uint16_t max_pidx; uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */ FL_LOCK_ASSERT_OWNED(fl); /* * We always stop at the beginning of the hardware descriptor that's just * before the one with the hw cidx. This is to avoid hw pidx = hw cidx, * which would mean an empty freelist to the chip. */ max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1; if (fl->pidx == max_pidx * 8) return (0); d = &fl->desc[fl->pidx]; sd = &fl->sdesc[fl->pidx]; cll = &fl->cll_def; /* default layout */ swz = &sc->sge.sw_zone_info[cll->zidx]; while (n > 0) { if (sd->cl != NULL) { if (sd->nmbuf == 0) { /* * Fast recycle without involving any atomics on * the cluster's metadata (if the cluster has * metadata). This happens when all frames * received in the cluster were small enough to * fit within a single mbuf each. */ fl->cl_fast_recycled++; #ifdef INVARIANTS clm = cl_metadata(sc, fl, &sd->cll, sd->cl); if (clm != NULL) MPASS(clm->refcount == 1); #endif goto recycled_fast; } /* * Cluster is guaranteed to have metadata. Clusters * without metadata always take the fast recycle path * when they're recycled. 
*/ clm = cl_metadata(sc, fl, &sd->cll, sd->cl); MPASS(clm != NULL); if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { fl->cl_recycled++; counter_u64_add(extfree_rels, 1); goto recycled; } sd->cl = NULL; /* gave up my reference */ } MPASS(sd->cl == NULL); alloc: cl = uma_zalloc(swz->zone, M_NOWAIT); if (__predict_false(cl == NULL)) { if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 || fl->cll_def.zidx == fl->cll_alt.zidx) break; /* fall back to the safe zone */ cll = &fl->cll_alt; swz = &sc->sge.sw_zone_info[cll->zidx]; goto alloc; } fl->cl_allocated++; n--; pa = pmap_kextract((vm_offset_t)cl); pa += cll->region1; sd->cl = cl; sd->cll = *cll; *d = htobe64(pa | cll->hwidx); clm = cl_metadata(sc, fl, cll, cl); if (clm != NULL) { recycled: #ifdef INVARIANTS clm->sd = sd; #endif clm->refcount = 1; } sd->nmbuf = 0; recycled_fast: d++; sd++; if (__predict_false(++fl->pidx % 8 == 0)) { uint16_t pidx = fl->pidx / 8; if (__predict_false(pidx == fl->sidx)) { fl->pidx = 0; pidx = 0; sd = fl->sdesc; d = fl->desc; } if (pidx == max_pidx) break; if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4) ring_fl_db(sc, fl); } } if (fl->pidx / 8 != fl->dbidx) ring_fl_db(sc, fl); return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); } /* * Attempt to refill all starving freelists. 
*/ static void refill_sfl(void *arg) { struct adapter *sc = arg; struct sge_fl *fl, *fl_temp; mtx_assert(&sc->sfl_lock, MA_OWNED); TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) { FL_LOCK(fl); refill_fl(sc, fl, 64); if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) { TAILQ_REMOVE(&sc->sfl, fl, link); fl->flags &= ~FL_STARVING; } FL_UNLOCK(fl); } if (!TAILQ_EMPTY(&sc->sfl)) callout_schedule(&sc->sfl_callout, hz / 5); } static int alloc_fl_sdesc(struct sge_fl *fl) { fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); return (0); } static void free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; struct cluster_metadata *clm; struct cluster_layout *cll; int i; sd = fl->sdesc; for (i = 0; i < fl->sidx * 8; i++, sd++) { if (sd->cl == NULL) continue; cll = &sd->cll; clm = cl_metadata(sc, fl, cll, sd->cl); if (sd->nmbuf == 0) uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) { uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); counter_u64_add(extfree_rels, 1); } sd->cl = NULL; } free(fl->sdesc, M_CXGBE); fl->sdesc = NULL; } static inline void get_pkt_gl(struct mbuf *m, struct sglist *gl) { int rc; M_ASSERTPKTHDR(m); sglist_reset(gl); rc = sglist_append_mbuf(gl, m); if (__predict_false(rc != 0)) { panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " "with %d.", __func__, m, mbuf_nsegs(m), rc); } KASSERT(gl->sg_nseg == mbuf_nsegs(m), ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, mbuf_nsegs(m), gl->sg_nseg)); KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS), ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)); } /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. 
*/ static inline u_int txpkt_len16(u_int nsegs, u_int tso) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); if (tso) n += sizeof(struct cpl_tx_pkt_lso_core); return (howmany(n, 16)); } /* * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts0_len16(u_int nsegs) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } /* * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts1_len16(void) { u_int n; n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl); return (howmany(n, 16)); } static inline u_int imm_payload(u_int ndesc) { u_int n; n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) - sizeof(struct cpl_tx_pkt_core); return (n); } /* * Write a txpkt WR for this packet to the hardware descriptors, update the * software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. 
*/ static u_int write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); else if (pktlen <= imm_payload(2) && available >= 2) { /* Immediate data. Recalculate len16 and set nsegs to 0. */ ctrl += pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + pktlen, 16); nsegs = 0; } ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc <= available); /* Firmware work request header */ MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: mbuf %p needs TSO but missing header lengths", __func__, m0)); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); cpl = (void *)(lso + 1); txq->tso_wrs++; } else cpl = (void *)(wr + 1); /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m0) == 0) ctrl1 |= 
F_TXPKT_IPCSUM_DIS; if (needs_l4_csum(m0) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* SGL */ dst = (void *)(cpl + 1); if (nsegs > 0) { write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; } else { struct mbuf *m; for (m = m0; m != NULL; m = m->m_next) { copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); #ifdef INVARIANTS pktlen -= m->m_len; #endif } #ifdef INVARIANTS KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); #endif txq->imm_wrs++; } txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } static int try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available) { u_int needed, nsegs1, nsegs2, l1, l2; if (cannot_use_txpkts(m) || cannot_use_txpkts(n)) return (1); nsegs1 = mbuf_nsegs(m); nsegs2 = mbuf_nsegs(n); if (nsegs1 + nsegs2 == 2) { txp->wr_type = 1; l1 = l2 = txpkts1_len16(); } else { txp->wr_type = 0; l1 = txpkts0_len16(nsegs1); l2 = txpkts0_len16(nsegs2); } txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2; needed = howmany(txp->len16, EQ_ESIZE / 16); if (needed > SGE_MAX_WR_NDESC || needed > available) return (1); txp->plen = m->m_pkthdr.len + n->m_pkthdr.len; if (txp->plen > 65535) return (1); txp->npkt = 2; set_mbuf_len16(m, l1); set_mbuf_len16(n, l2); return (0); } static int add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available) { u_int plen, len16, needed, nsegs; MPASS(txp->wr_type == 0 || txp->wr_type == 1); nsegs = mbuf_nsegs(m); if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1)) return (1); 
plen = txp->plen + m->m_pkthdr.len; if (plen > 65535) return (1); if (txp->wr_type == 0) len16 = txpkts0_len16(nsegs); else len16 = txpkts1_len16(); needed = howmany(txp->len16 + len16, EQ_ESIZE / 16); if (needed > SGE_MAX_WR_NDESC || needed > available) return (1); txp->npkt++; txp->plen = plen; txp->len16 += len16; set_mbuf_len16(m, len16); return (0); } /* * Write a txpkts WR for the packets in txp to the hardware descriptors, update * the software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int ndesc, checkwrap; struct mbuf *m; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); MPASS(txp->plen < 65536); MPASS(m0 != NULL); MPASS(m0->m_nextpkt != NULL); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); MPASS(available > 0 && available < eq->sidx); ndesc = howmany(txp->len16, EQ_ESIZE / 16); MPASS(ndesc <= available); MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); ctrl = V_FW_WR_LEN16(txp->len16); wr->equiq_to_len16 = htobe32(ctrl); wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r3 = 0; wr->type = txp->wr_type; flitp = wr + 1; /* * At this point we are 16B into a hardware descriptor. If checkwrap is * set then we know the WR is going to wrap around somewhere. We'll * check for that at appropriate points. 
*/ checkwrap = eq->sidx - ndesc < eq->pidx; for (m = m0; m != NULL; m = m->m_nextpkt) { if (txp->wr_type == 0) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; /* ULP master command */ ulpmc = flitp; ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); ulpmc->len = htobe32(mbuf_len16(m)); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | F_ULP_TX_SC_MORE); ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); cpl = (void *)(ulpsc + 1); if (checkwrap && (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) cpl = (void *)&eq->desc[0]; txq->txpkts0_pkts += txp->npkt; txq->txpkts0_wrs++; } else { cpl = flitp; txq->txpkts1_pkts += txp->npkt; txq->txpkts1_wrs++; } /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (needs_l4_csum(m) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(m->m_pkthdr.len); cpl->ctrl1 = htobe64(ctrl1); flitp = cpl + 1; if (checkwrap && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) flitp = (void *)&eq->desc[0]; write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); } txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. 
*/ static void write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap) { struct sge_eq *eq = &txq->eq; struct sglist *gl = txq->gl; struct sglist_seg *seg; __be64 *flitp, *wrap; struct ulptx_sgl *usgl; int i, nflits, nsegs; KASSERT(((uintptr_t)(*to) & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); get_pkt_gl(m, gl); nsegs = gl->sg_nseg; MPASS(nsegs > 0); nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)(*to); wrap = (__be64 *)(&eq->desc[eq->sidx]); seg = &gl->sg_segs[0]; usgl = (void *)flitp; /* * We start at a 16 byte boundary somewhere inside the tx descriptor * ring, so we're at least 16 bytes away from the status page. There is * no chance of a wrap around in the middle of usgl (which is 16 bytes). */ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); usgl->len0 = htobe32(seg->ss_len); usgl->addr0 = htobe64(seg->ss_paddr); seg++; if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) { /* Won't wrap around at all */ for (i = 0; i < nsegs - 1; i++, seg++) { usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); flitp += nflits; } else { /* Will wrap somewhere in the rest of the SGL */ /* 2 flits already written, write the rest flit by flit */ flitp = (void *)(usgl + 1); for (i = 0; i < nflits - 2; i++) { if (flitp == wrap) flitp = (void *)eq->desc; *flitp++ = get_flit(seg, nsegs - 1, i); } } if (nflits & 1) { MPASS(((uintptr_t)flitp) & 0xf); *flitp++ = 0; } MPASS((((uintptr_t)flitp) & 0xf) == 0); if (__predict_false(flitp == wrap)) *to = (void *)eq->desc; else *to = (void *)flitp; } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); 
MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; } else { int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ bcopy(from, (void *)eq->desc, portion); (*to) = (caddr_t)eq->desc + portion; } } static inline void ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n) { u_int db; MPASS(n > 0); db = eq->doorbells; if (n > 1) clrbit(&db, DOORBELL_WCWR); wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; int i; /* * Queues whose 128B doorbell segment fits in the page do not * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ KASSERT(eq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", __func__, eq->doorbells, n, eq->dbidx, eq)); dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); i = eq->dbidx; src = (void *)&eq->desc[i]; while (src != (void *)&eq->desc[i + 1]) *dst++ = *src++; wmb(); break; } case DOORBELL_UDBWC: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); wmb(); break; case DOORBELL_KDB: - t4_write_reg(sc, MYPF_REG(A_SGE_PF_KDOORBELL), + t4_write_reg(sc, sc->sge_kdoorbell_reg, V_QID(eq->cntxt_id) | V_PIDX(n)); break; } IDXINCR(eq->dbidx, n, eq->sidx); } static inline u_int reclaimable_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx; hw_cidx = read_hw_cidx(eq); return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx)); } static inline u_int total_available_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx, pidx; hw_cidx = read_hw_cidx(eq); pidx = eq->pidx; if (pidx == hw_cidx) return (eq->sidx - 1); else return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1); } static inline uint16_t read_hw_cidx(struct sge_eq *eq) { struct sge_qstat *spg = (void 
*)&eq->desc[eq->sidx]; uint16_t cidx = spg->cidx; /* stable snapshot */ return (be16toh(cidx)); } /* * Reclaim 'n' descriptors approximately. */ static u_int reclaim_tx_descs(struct sge_txq *txq, u_int n) { struct tx_sdesc *txsd; struct sge_eq *eq = &txq->eq; u_int can_reclaim, reclaimed; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(n > 0); reclaimed = 0; can_reclaim = reclaimable_tx_desc(eq); while (can_reclaim && reclaimed < n) { int ndesc; struct mbuf *m, *nextpkt; txsd = &txq->sdesc[eq->cidx]; ndesc = txsd->desc_used; /* Firmware doesn't return "partial" credits. */ KASSERT(can_reclaim >= ndesc, ("%s: unexpected number of credits: %d, %d", __func__, can_reclaim, ndesc)); for (m = txsd->m; m != NULL; m = nextpkt) { nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } reclaimed += ndesc; can_reclaim -= ndesc; IDXINCR(eq->cidx, ndesc, eq->sidx); } return (reclaimed); } static void tx_reclaim(void *arg, int n) { struct sge_txq *txq = arg; struct sge_eq *eq = &txq->eq; do { if (TXQ_TRYLOCK(txq) == 0) break; n = reclaim_tx_descs(txq, 32); if (eq->cidx == eq->pidx) eq->equeqidx = eq->pidx; TXQ_UNLOCK(txq); } while (n > 0); } static __be64 get_flit(struct sglist_seg *segs, int nsegs, int idx) { int i = (idx / 3) * 2; switch (idx % 3) { case 0: { __be64 rc; rc = htobe32(segs[i].ss_len); if (i + 1 < nsegs) rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32; return (rc); } case 1: return (htobe64(segs[i].ss_paddr)); case 2: return (htobe64(segs[i + 1].ss_paddr)); } return (0); } static void find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp) { int8_t zidx, hwidx, idx; uint16_t region1, region3; int spare, spare_needed, n; struct sw_zone_info *swz; struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0]; /* * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize * large enough for the max payload and cluster metadata. Otherwise * settle for the largest bufsize that leaves enough room in the cluster * for metadata. 
* * Without buffer packing: Look for the smallest zone which has a * bufsize large enough for the max payload. Settle for the largest * bufsize available if there's nothing big enough for max payload. */ spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0; swz = &sc->sge.sw_zone_info[0]; hwidx = -1; for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) { if (swz->size > largest_rx_cluster) { if (__predict_true(hwidx != -1)) break; /* * This is a misconfiguration. largest_rx_cluster is * preventing us from finding a refill source. See * dev.t5nex..buffer_sizes to figure out why. */ device_printf(sc->dev, "largest_rx_cluster=%u leaves no" " refill source for fl %p (dma %u). Ignored.\n", largest_rx_cluster, fl, maxp); } for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (spare < spare_needed) continue; hwidx = idx; /* best option so far */ if (hwb->size >= maxp) { if ((fl->flags & FL_BUF_PACKING) == 0) goto done; /* stop looking (not packing) */ if (swz->size >= safest_rx_cluster) goto done; /* stop looking (packing) */ } break; /* keep looking, next zone */ } } done: /* A usable hwidx has been located. */ MPASS(hwidx != -1); hwb = &hwb_list[hwidx]; zidx = hwb->zidx; swz = &sc->sge.sw_zone_info[zidx]; region1 = 0; region3 = swz->size - hwb->size; /* * Stay within this zone and see if there is a better match when mbuf * inlining is allowed. Remember that the hwidx's are sorted in * decreasing order of size (so in increasing order of spare area). */ for (idx = hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (allow_mbufs_in_cluster == 0 || hwb->size < maxp) break; /* * Do not inline mbufs if doing so would violate the pad/pack * boundary alignment requirement. 
*/ if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0) continue; if (fl->flags & FL_BUF_PACKING && (MSIZE % sc->params.sge.pack_boundary) != 0) continue; if (spare < CL_METADATA_SIZE + MSIZE) continue; n = (spare - CL_METADATA_SIZE) / MSIZE; if (n > howmany(hwb->size, maxp)) break; hwidx = idx; if (fl->flags & FL_BUF_PACKING) { region1 = n * MSIZE; region3 = spare - region1; } else { region1 = MSIZE; region3 = spare - region1; break; } } KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES, ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp)); KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES, ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp)); KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 == sc->sge.sw_zone_info[zidx].size, ("%s: bad buffer layout for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); if (fl->flags & FL_BUF_PACKING || region1 > 0) { KASSERT(region3 >= CL_METADATA_SIZE, ("%s: no room for metadata. fl %p, maxp %d; " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); KASSERT(region1 % MSIZE == 0, ("%s: bad mbuf region for fl %p, maxp %d. 
" "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); } fl->cll_def.zidx = zidx; fl->cll_def.hwidx = hwidx; fl->cll_def.region1 = region1; fl->cll_def.region3 = region3; } static void find_safe_refill_source(struct adapter *sc, struct sge_fl *fl) { struct sge *s = &sc->sge; struct hw_buf_info *hwb; struct sw_zone_info *swz; int spare; int8_t hwidx; if (fl->flags & FL_BUF_PACKING) hwidx = s->safe_hwidx2; /* with room for metadata */ else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) { hwidx = s->safe_hwidx2; hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; /* no good if there isn't room for an mbuf as well */ if (spare < CL_METADATA_SIZE + MSIZE) hwidx = s->safe_hwidx1; } else hwidx = s->safe_hwidx1; if (hwidx == -1) { /* No fallback source */ fl->cll_alt.hwidx = -1; fl->cll_alt.zidx = -1; return; } hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; fl->cll_alt.hwidx = hwidx; fl->cll_alt.zidx = hwb->zidx; if (allow_mbufs_in_cluster && (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0)) fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE; else fl->cll_alt.region1 = 0; fl->cll_alt.region3 = spare - fl->cll_alt.region1; } static void add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); if ((fl->flags & FL_DOOMED) == 0) { fl->flags |= FL_STARVING; TAILQ_INSERT_TAIL(&sc->sfl, fl, link); callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc); } FL_UNLOCK(fl); mtx_unlock(&sc->sfl_lock); } static void handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_wrq *wrq = (void *)eq; atomic_readandclear_int(&eq->equiq); taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task); } static void handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_txq *txq = (void *)eq; MPASS((eq->flags & 
EQ_TYPEMASK) == EQ_ETH); atomic_readandclear_int(&eq->equiq); mp_ring_check_drainage(txq->r, 0); taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); } static int handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1); unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid)); struct adapter *sc = iq->adapter; struct sge *s = &sc->sge; struct sge_eq *eq; static void (*h[])(struct adapter *, struct sge_eq *) = {NULL, &handle_wrq_egr_update, &handle_eth_egr_update, &handle_wrq_egr_update}; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); eq = s->eqmap[qid - s->eq_start]; (*h[eq->flags & EQ_TYPEMASK])(sc, eq); return (0); } /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */ CTASSERT(offsetof(struct cpl_fw4_msg, data) == \ offsetof(struct cpl_fw6_msg, data)); static int handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw6_msg *cpl = (const void *)(rss + 1); KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) { const struct rss_header *rss2; rss2 = (const struct rss_header *)&cpl->data[0]; return (t4_cpl_handler[rss2->opcode](iq, rss2, m)); } return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0])); } /** * t4_handle_wrerr_rpl - process a FW work request error message * @adap: the adapter * @rpl: start of the FW message */ static int t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl) { u8 opcode = *(const u8 *)rpl; const struct fw_error_cmd *e = (const void *)rpl; unsigned int i; if (opcode != FW_ERROR_CMD) { log(LOG_ERR, "%s: Received WRERR_RPL message with opcode %#x\n", device_get_nameunit(adap->dev), opcode); return (EINVAL); } log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev), 
G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" : "non-fatal"); switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) { case FW_ERROR_TYPE_EXCEPTION: log(LOG_ERR, "exception info:\n"); for (i = 0; i < nitems(e->u.exception.info); i++) log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ", be32toh(e->u.exception.info[i])); log(LOG_ERR, "\n"); break; case FW_ERROR_TYPE_HWMODULE: log(LOG_ERR, "HW module regaddr %08x regval %08x\n", be32toh(e->u.hwmodule.regaddr), be32toh(e->u.hwmodule.regval)); break; case FW_ERROR_TYPE_WR: log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n", be16toh(e->u.wr.cidx), G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)), G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)), be32toh(e->u.wr.eqid)); for (i = 0; i < nitems(e->u.wr.wrhdr); i++) log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ", e->u.wr.wrhdr[i]); log(LOG_ERR, "\n"); break; case FW_ERROR_TYPE_ACL: log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s", be16toh(e->u.acl.cidx), G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)), G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)), be32toh(e->u.acl.eqid), G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? 
"vlanid" : "MAC"); for (i = 0; i < nitems(e->u.acl.val); i++) log(LOG_ERR, " %02x", e->u.acl.val[i]); log(LOG_ERR, "\n"); break; default: log(LOG_ERR, "type %#x\n", G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))); return (EINVAL); } return (0); } static int sysctl_uint16(SYSCTL_HANDLER_ARGS) { uint16_t *id = arg1; int i = *id; return sysctl_handle_int(oidp, &i, 0, req); } static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS) { struct sge *s = arg1; struct hw_buf_info *hwb = &s->hw_buf_info[0]; struct sw_zone_info *swz = &s->sw_zone_info[0]; int i, rc; struct sbuf sb; char c; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster) c = '*'; else c = '\0'; sbuf_printf(&sb, "%u%c ", hwb->size, c); } sbuf_trim(&sb); sbuf_finish(&sb); rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (rc); } static int sysctl_tc(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct port_info *pi; struct adapter *sc; struct sge_txq *txq; struct tx_sched_class *tc; int qidx = arg2, rc, tc_idx; uint32_t fw_queue, fw_class; MPASS(qidx >= 0 && qidx < vi->ntxq); pi = vi->pi; sc = pi->adapter; txq = &sc->sge.txq[vi->first_txq + qidx]; tc_idx = txq->tc_idx; rc = sysctl_handle_int(oidp, &tc_idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); /* Note that -1 is legitimate input (it means unbind). */ if (tc_idx < -1 || tc_idx >= sc->chip_params->nsched_cls) return (EINVAL); rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4stc"); if (rc) return (rc); if (tc_idx == txq->tc_idx) { rc = 0; /* No change, nothing to do. */ goto done; } fw_queue = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id); if (tc_idx == -1) fw_class = 0xffffffff; /* Unbind. */ else { /* * Bind to a different class. Ethernet txq's are only allowed * to bind to cl-rl mode-class for now. 
XXX: too restrictive. */ tc = &pi->tc[tc_idx]; if (tc->flags & TX_SC_OK && tc->params.level == SCHED_CLASS_LEVEL_CL_RL && tc->params.mode == SCHED_CLASS_MODE_CLASS) { /* Ok to proceed. */ fw_class = tc_idx; } else { rc = tc->flags & TX_SC_OK ? EBUSY : ENXIO; goto done; } } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class); if (rc == 0) { if (txq->tc_idx != -1) { tc = &pi->tc[txq->tc_idx]; MPASS(tc->refcount > 0); tc->refcount--; } if (tc_idx != -1) { tc = &pi->tc[tc_idx]; tc->refcount++; } txq->tc_idx = tc_idx; } done: end_synchronized_op(sc, 0); return (rc); } Index: user/alc/PQ_LAUNDRY/sys/dev/pci/pci_pci.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/dev/pci/pci_pci.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/dev/pci/pci_pci.c (revision 303654) @@ -1,2809 +1,2816 @@ /*- * Copyright (c) 1994,1995 Stefan Esser, Wolfgang StanglMeier * Copyright (c) 2000 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * PCI:PCI bridge support. */ #include "opt_pci.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pcib_if.h" static int pcib_probe(device_t dev); static int pcib_suspend(device_t dev); static int pcib_resume(device_t dev); static int pcib_power_for_sleep(device_t pcib, device_t dev, int *pstate); static int pcib_ari_get_id(device_t pcib, device_t dev, enum pci_id_type type, uintptr_t *id); static uint32_t pcib_read_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, int width); static void pcib_write_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, uint32_t val, int width); static int pcib_ari_maxslots(device_t dev); static int pcib_ari_maxfuncs(device_t dev); static int pcib_try_enable_ari(device_t pcib, device_t dev); static int pcib_ari_enabled(device_t pcib); static void pcib_ari_decode_rid(device_t pcib, uint16_t rid, int *bus, int *slot, int *func); #ifdef PCI_HP static void pcib_pcie_ab_timeout(void *arg); static void pcib_pcie_cc_timeout(void *arg); static void pcib_pcie_dll_timeout(void *arg); #endif static device_method_t pcib_methods[] = { /* Device interface */ DEVMETHOD(device_probe, pcib_probe), DEVMETHOD(device_attach, pcib_attach), DEVMETHOD(device_detach, pcib_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, pcib_suspend), DEVMETHOD(device_resume, pcib_resume), /* Bus 
interface */ DEVMETHOD(bus_child_present, pcib_child_present), DEVMETHOD(bus_read_ivar, pcib_read_ivar), DEVMETHOD(bus_write_ivar, pcib_write_ivar), DEVMETHOD(bus_alloc_resource, pcib_alloc_resource), #ifdef NEW_PCIB DEVMETHOD(bus_adjust_resource, pcib_adjust_resource), DEVMETHOD(bus_release_resource, pcib_release_resource), #else DEVMETHOD(bus_adjust_resource, bus_generic_adjust_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), #endif DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), /* pcib interface */ DEVMETHOD(pcib_maxslots, pcib_ari_maxslots), DEVMETHOD(pcib_maxfuncs, pcib_ari_maxfuncs), DEVMETHOD(pcib_read_config, pcib_read_config), DEVMETHOD(pcib_write_config, pcib_write_config), DEVMETHOD(pcib_route_interrupt, pcib_route_interrupt), DEVMETHOD(pcib_alloc_msi, pcib_alloc_msi), DEVMETHOD(pcib_release_msi, pcib_release_msi), DEVMETHOD(pcib_alloc_msix, pcib_alloc_msix), DEVMETHOD(pcib_release_msix, pcib_release_msix), DEVMETHOD(pcib_map_msi, pcib_map_msi), DEVMETHOD(pcib_power_for_sleep, pcib_power_for_sleep), DEVMETHOD(pcib_get_id, pcib_ari_get_id), DEVMETHOD(pcib_try_enable_ari, pcib_try_enable_ari), DEVMETHOD(pcib_ari_enabled, pcib_ari_enabled), DEVMETHOD(pcib_decode_rid, pcib_ari_decode_rid), DEVMETHOD_END }; static devclass_t pcib_devclass; DEFINE_CLASS_0(pcib, pcib_driver, pcib_methods, sizeof(struct pcib_softc)); DRIVER_MODULE(pcib, pci, pcib_driver, pcib_devclass, NULL, NULL); #if defined(NEW_PCIB) || defined(PCI_HP) SYSCTL_DECL(_hw_pci); #endif #ifdef NEW_PCIB static int pci_clear_pcib; SYSCTL_INT(_hw_pci, OID_AUTO, clear_pcib, CTLFLAG_RDTUN, &pci_clear_pcib, 0, "Clear firmware-assigned resources for PCI-PCI bridge I/O windows."); /* * Is a resource from a child device sub-allocated from one of our * resource managers? 
*/
/*
 * 'type' selects which of the bridge's window rmans is consulted for 'r'.
 * Returns the rman_is_region_manager() result for that rman (or 1 early
 * for a prefetchable resource found in the prefetchable rman), and 0 when
 * 'type' is not one of the handled resource types.
 */
static int
pcib_is_resource_managed(struct pcib_softc *sc, int type, struct resource *r)
{

	switch (type) {
#ifdef PCI_RES_BUS
	case PCI_RES_BUS:
		return (rman_is_region_manager(r, &sc->bus.rman));
#endif
	case SYS_RES_IOPORT:
		return (rman_is_region_manager(r, &sc->io.rman));
	case SYS_RES_MEMORY:
		/* Prefetchable resources may live in either memory rman. */
		if (rman_get_flags(r) & RF_PREFETCHABLE &&
		    rman_is_region_manager(r, &sc->pmem.rman))
			return (1);
		return (rman_is_region_manager(r, &sc->mem.rman));
	}
	/* No window rman is consulted for any other resource type. */
	return (0);
}

/*
 * A window counts as open when it is flagged valid and its base is
 * strictly below its limit.
 */
static int
pcib_is_window_open(struct pcib_window *pw)
{

	return (pw->valid && pw->base < pw->limit);
}

/*
 * XXX: If RF_ACTIVE did not also imply allocating a bus space tag and
 * handle for the resource, we could pass RF_ACTIVE up to the PCI bus
 * when allocating the resource windows and rely on the PCI bus driver
 * to do this for us.
 */
static void
pcib_activate_window(struct pcib_softc *sc, int type)
{

	/* Hand the request for 'type' to the parent via PCI_ENABLE_IO(). */
	PCI_ENABLE_IO(device_get_parent(sc->dev), sc->dev, type);
}

/*
 * Write the cached base and limit of every window selected in 'mask'
 * (WIN_IO, WIN_MEM, WIN_PMEM) to the bridge's configuration registers.
 * The I/O and prefetchable windows are written only if flagged valid.
 */
static void
pcib_write_windows(struct pcib_softc *sc, int mask)
{
	device_t dev;
	uint32_t val;

	dev = sc->dev;
	if (sc->io.valid && mask & WIN_IO) {
		val = pci_read_config(dev, PCIR_IOBASEL_1, 1);
		/*
		 * The high base/limit registers are written only when the
		 * low register's type field matches PCIM_BRIO_32.
		 */
		if ((val & PCIM_BRIO_MASK) == PCIM_BRIO_32) {
			pci_write_config(dev, PCIR_IOBASEH_1,
			    sc->io.base >> 16, 2);
			pci_write_config(dev, PCIR_IOLIMITH_1,
			    sc->io.limit >> 16, 2);
		}
		pci_write_config(dev, PCIR_IOBASEL_1, sc->io.base >> 8, 1);
		pci_write_config(dev, PCIR_IOLIMITL_1, sc->io.limit >> 8, 1);
	}

	if (mask & WIN_MEM) {
		pci_write_config(dev, PCIR_MEMBASE_1, sc->mem.base >> 16, 2);
		pci_write_config(dev, PCIR_MEMLIMIT_1, sc->mem.limit >> 16, 2);
	}

	if (sc->pmem.valid && mask & WIN_PMEM) {
		val = pci_read_config(dev, PCIR_PMBASEL_1, 2);
		/*
		 * Likewise, the upper 32 bits are written only when the
		 * type field matches PCIM_BRPM_64.
		 */
		if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
			pci_write_config(dev, PCIR_PMBASEH_1,
			    sc->pmem.base >> 32, 4);
			pci_write_config(dev, PCIR_PMLIMITH_1,
			    sc->pmem.limit >> 32, 4);
		}
		pci_write_config(dev, PCIR_PMBASEL_1, sc->pmem.base >> 16, 2);
		pci_write_config(dev, PCIR_PMLIMITL_1, sc->pmem.limit >> 16, 2);
	}
}

/*
 * This is used to reject I/O port allocations that conflict with an
 * ISA alias range.
 *
 * Returns 1 if [start, end] overlaps an ISA alias and 0 otherwise.  0 is
 * also returned when PCIB_BCR_ISA_ENABLE is not set in sc->bridgectl or
 * when the request is not a fixed range (start + count - 1 != end).
 */
static int
pcib_is_isa_range(struct pcib_softc *sc, rman_res_t start, rman_res_t end,
    rman_res_t count)
{
	rman_res_t next_alias;

	if (!(sc->bridgectl & PCIB_BCR_ISA_ENABLE))
		return (0);

	/* Only check fixed ranges for overlap. */
	if (start + count - 1 != end)
		return (0);

	/* ISA aliases are only in the lower 64KB of I/O space. */
	if (start >= 65536)
		return (0);

	/* Check for overlap with 0x000 - 0x0ff as a special case. */
	if (start < 0x100)
		goto alias;

	/*
	 * If the start address is an alias, the range is an alias.
	 * Otherwise, compute the start of the next alias range and
	 * check if it is before the end of the candidate range.
	 */
	if ((start & 0x300) != 0)
		goto alias;
	next_alias = (start & ~0x3fful) | 0x100;
	if (next_alias <= end)
		goto alias;
	return (0);

alias:
	if (bootverbose)
		device_printf(sc->dev,
		    "I/O range %#jx-%#jx overlaps with an ISA alias\n", start,
		    end);
	return (1);
}

/*
 * Append the 'count' resources in 'res' to the window's resource array
 * and add each one's range to the window's rman.  Panics if the rman
 * rejects a range.
 */
static void
pcib_add_window_resources(struct pcib_window *w, struct resource **res,
    int count)
{
	struct resource **newarray;
	int error, i;

	/* Grow the array by allocating a larger one and copying over. */
	newarray = malloc(sizeof(struct resource *) * (w->count + count),
	    M_DEVBUF, M_WAITOK);
	if (w->res != NULL)
		bcopy(w->res, newarray, sizeof(struct resource *) * w->count);
	bcopy(res, newarray + w->count, sizeof(struct resource *) * count);
	free(w->res, M_DEVBUF);
	w->res = newarray;
	w->count += count;

	for (i = 0; i < count; i++) {
		error = rman_manage_region(&w->rman, rman_get_start(res[i]),
		    rman_get_end(res[i]));
		if (error)
			panic("Failed to add resource to rman");
	}
}

/* Callback invoked by pcib_walk_nonisa_ranges() for each range it visits. */
typedef void (nonisa_callback)(rman_res_t start, rman_res_t end, void *arg);

/*
 * Invoke 'cb' (with 'arg') on each sub-range of [start, end] that the
 * walk below does not skip as an ISA alias.
 */
static void
pcib_walk_nonisa_ranges(rman_res_t start, rman_res_t end, nonisa_callback *cb,
    void *arg)
{
	rman_res_t next_end;

	/*
	 * If start is within an ISA alias range, move up to the start
	 * of the next non-alias range. As a special case, addresses
	 * in the range 0x000 - 0x0ff should also be skipped since
	 * those are used for various system I/O devices in ISA
	 * systems.
	 */
	if (start <= 65535) {
		if (start < 0x100 || (start & 0x300) != 0) {
			start &= ~0x3ff;
			start += 0x400;
		}
	}

	/* ISA aliases are only in the lower 64KB of I/O space. */
	while (start <= MIN(end, 65535)) {
		next_end = MIN(start | 0xff, end);
		cb(start, next_end, arg);
		start += 0x400;
	}

	/* Whatever remains lies above the 64KB boundary: one final range. */
	if (start <= end)
		cb(start, end, arg);
}

/*
 * Walk callback that only counts the ranges visited; 'arg' points to the
 * int counter.
 */
static void
count_ranges(rman_res_t start, rman_res_t end, void *arg)
{
	int *countp;

	countp = arg;
	(*countp)++;
}

/* State shared between pcib_alloc_nonisa_ranges() and alloc_ranges(). */
struct alloc_state {
	struct resource **res;	/* resources allocated so far */
	struct pcib_softc *sc;	/* bridge the ranges are allocated for */
	int count, error;	/* entries used in 'res'; first failure */
};

/*
 * Walk callback: allocate [start, end] as an I/O port resource for the
 * bridge and record it in the alloc_state passed via 'arg'.  Does nothing
 * once an error has been recorded.
 */
static void
alloc_ranges(rman_res_t start, rman_res_t end, void *arg)
{
	struct alloc_state *as;
	struct pcib_window *w;
	int rid;

	as = arg;
	if (as->error != 0)
		return;

	w = &as->sc->io;
	rid = w->reg;
	if (bootverbose)
		device_printf(as->sc->dev,
		    "allocating non-ISA range %#jx-%#jx\n", start, end);
	as->res[as->count] = bus_alloc_resource(as->sc->dev, SYS_RES_IOPORT,
	    &rid, start, end, end - start + 1, 0);
	if (as->res[as->count] == NULL)
		as->error = ENXIO;
	else
		as->count++;
}

/*
 * Allocate every non-ISA sub-range of [start, end] and add them to the
 * I/O window.  Returns 0 on success; on failure the ranges allocated so
 * far are released and the recorded error is returned.
 */
static int
pcib_alloc_nonisa_ranges(struct pcib_softc *sc, rman_res_t start,
    rman_res_t end)
{
	struct alloc_state as;
	int i, new_count;

	/* First, see how many ranges we need. */
	new_count = 0;
	pcib_walk_nonisa_ranges(start, end, count_ranges, &new_count);

	/* Second, allocate the ranges. */
	as.res = malloc(sizeof(struct resource *) * new_count, M_DEVBUF,
	    M_WAITOK);
	as.sc = sc;
	as.count = 0;
	as.error = 0;
	pcib_walk_nonisa_ranges(start, end, alloc_ranges, &as);
	if (as.error != 0) {
		for (i = 0; i < as.count; i++)
			bus_release_resource(sc->dev, SYS_RES_IOPORT,
			    sc->io.reg, as.res[i]);
		free(as.res, M_DEVBUF);
		return (as.error);
	}
	KASSERT(as.count == new_count, ("%s: count mismatch", __func__));

	/* Third, add the ranges to the window. */
	pcib_add_window_resources(&sc->io, as.res, as.count);
	free(as.res, M_DEVBUF);
	return (0);
}

/*
 * Initialize the rman for window 'w' and, if the window is open, allocate
 * the range it currently decodes.  If nothing could be allocated, the
 * window is closed (base = max_address, limit = 0) and written back to
 * the bridge; otherwise the window is activated.
 */
static void
pcib_alloc_window(struct pcib_softc *sc, struct pcib_window *w, int type,
    int flags, pci_addr_t max_address)
{
	struct resource *res;
	char buf[64];
	int error, rid;

	/* Fall back to all-ones if max_address does not fit in rman_res_t. */
	if (max_address != (rman_res_t)max_address)
		max_address = ~0;
	w->rman.rm_start = 0;
	w->rman.rm_end = max_address;
	w->rman.rm_type = RMAN_ARRAY;
	snprintf(buf, sizeof(buf), "%s %s window",
	    device_get_nameunit(sc->dev), w->name);
	w->rman.rm_descr = strdup(buf, M_DEVBUF);
	error = rman_init(&w->rman);
	if (error)
		panic("Failed to initialize %s %s rman",
		    device_get_nameunit(sc->dev), w->name);

	if (!pcib_is_window_open(w))
		return;

	if (w->base > max_address || w->limit > max_address) {
		device_printf(sc->dev,
		    "initial %s window has too many bits, ignoring\n", w->name);
		return;
	}
	/* With ISA enable set, the I/O window is allocated piecewise. */
	if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE)
		(void)pcib_alloc_nonisa_ranges(sc, w->base, w->limit);
	else {
		rid = w->reg;
		res = bus_alloc_resource(sc->dev, type, &rid, w->base, w->limit,
		    w->limit - w->base + 1, flags);
		if (res != NULL)
			pcib_add_window_resources(w, &res, 1);
	}
	if (w->res == NULL) {
		device_printf(sc->dev,
		    "failed to allocate initial %s window: %#jx-%#jx\n",
		    w->name, (uintmax_t)w->base, (uintmax_t)w->limit);
		w->base = max_address;
		w->limit = 0;
		pcib_write_windows(sc, w->mask);
		return;
	}
	pcib_activate_window(sc, type);
}

/*
 * Initialize I/O windows.
 */
static void
pcib_probe_windows(struct pcib_softc *sc)
{
	pci_addr_t max;
	device_t dev;
	uint32_t val;

	dev = sc->dev;

	if (pci_clear_pcib) {
		pcib_bridge_init(dev);
	}

	/* Determine if the I/O port window is implemented. */
	val = pci_read_config(dev, PCIR_IOBASEL_1, 1);
	if (val == 0) {
		/*
		 * If 'val' is zero, then only 16-bits of I/O space
		 * are supported.
		 */
		/* Probe: write a non-zero value and see whether it sticks. */
		pci_write_config(dev, PCIR_IOBASEL_1, 0xff, 1);
		if (pci_read_config(dev, PCIR_IOBASEL_1, 1) != 0) {
			sc->io.valid = 1;
			pci_write_config(dev, PCIR_IOBASEL_1, 0, 1);
		}
	} else
		sc->io.valid = 1;

	/* Read the existing I/O port window. */
	if (sc->io.valid) {
		sc->io.reg = PCIR_IOBASEL_1;
		sc->io.step = 12;
		sc->io.mask = WIN_IO;
		sc->io.name = "I/O port";
		if ((val & PCIM_BRIO_MASK) == PCIM_BRIO_32) {
			sc->io.base = PCI_PPBIOBASE(
			    pci_read_config(dev, PCIR_IOBASEH_1, 2), val);
			sc->io.limit = PCI_PPBIOLIMIT(
			    pci_read_config(dev, PCIR_IOLIMITH_1, 2),
			    pci_read_config(dev, PCIR_IOLIMITL_1, 1));
			max = 0xffffffff;
		} else {
			sc->io.base = PCI_PPBIOBASE(0, val);
			sc->io.limit = PCI_PPBIOLIMIT(0,
			    pci_read_config(dev, PCIR_IOLIMITL_1, 1));
			max = 0xffff;
		}
		pcib_alloc_window(sc, &sc->io, SYS_RES_IOPORT, 0, max);
	}

	/* Read the existing memory window. */
	sc->mem.valid = 1;
	sc->mem.reg = PCIR_MEMBASE_1;
	sc->mem.step = 20;
	sc->mem.mask = WIN_MEM;
	sc->mem.name = "memory";
	sc->mem.base = PCI_PPBMEMBASE(0,
	    pci_read_config(dev, PCIR_MEMBASE_1, 2));
	sc->mem.limit = PCI_PPBMEMLIMIT(0,
	    pci_read_config(dev, PCIR_MEMLIMIT_1, 2));
	pcib_alloc_window(sc, &sc->mem, SYS_RES_MEMORY, 0, 0xffffffff);

	/* Determine if the prefetchable memory window is implemented. */
	val = pci_read_config(dev, PCIR_PMBASEL_1, 2);
	if (val == 0) {
		/*
		 * If 'val' is zero, then only 32-bits of memory space
		 * are supported.
		 */
		/* Same write-and-read-back probe as for the I/O window. */
		pci_write_config(dev, PCIR_PMBASEL_1, 0xffff, 2);
		if (pci_read_config(dev, PCIR_PMBASEL_1, 2) != 0) {
			sc->pmem.valid = 1;
			pci_write_config(dev, PCIR_PMBASEL_1, 0, 2);
		}
	} else
		sc->pmem.valid = 1;

	/* Read the existing prefetchable memory window. */
	if (sc->pmem.valid) {
		sc->pmem.reg = PCIR_PMBASEL_1;
		sc->pmem.step = 20;
		sc->pmem.mask = WIN_PMEM;
		sc->pmem.name = "prefetch";
		if ((val & PCIM_BRPM_MASK) == PCIM_BRPM_64) {
			sc->pmem.base = PCI_PPBMEMBASE(
			    pci_read_config(dev, PCIR_PMBASEH_1, 4), val);
			sc->pmem.limit = PCI_PPBMEMLIMIT(
			    pci_read_config(dev, PCIR_PMLIMITH_1, 4),
			    pci_read_config(dev, PCIR_PMLIMITL_1, 2));
			max = 0xffffffffffffffff;
		} else {
			sc->pmem.base = PCI_PPBMEMBASE(0, val);
			sc->pmem.limit = PCI_PPBMEMLIMIT(0,
			    pci_read_config(dev, PCIR_PMLIMITL_1, 2));
			max = 0xffffffff;
		}
		pcib_alloc_window(sc, &sc->pmem, SYS_RES_MEMORY,
		    RF_PREFETCHABLE, max);
	}
}

/*
 * Tear down window 'w': finish its rman, free the rman description string
 * and release every backing resource.  Does nothing for a window that is
 * not flagged valid; if rman_fini() fails, nothing further is released.
 */
static void
pcib_release_window(struct pcib_softc *sc, struct pcib_window *w, int type)
{
	device_t dev;
	int error, i;

	if (!w->valid)
		return;

	dev = sc->dev;
	error = rman_fini(&w->rman);
	if (error) {
		device_printf(dev, "failed to release %s rman\n", w->name);
		return;
	}
	free(__DECONST(char *, w->rman.rm_descr), M_DEVBUF);

	/* A failure to release one resource does not stop the loop. */
	for (i = 0; i < w->count; i++) {
		error = bus_free_resource(dev, type, w->res[i]);
		if (error)
			device_printf(dev,
			    "failed to release %s resource: %d\n", w->name,
			    error);
	}
	free(w->res, M_DEVBUF);
}

/* Release the prefetchable, memory and I/O windows, in that order. */
static void
pcib_free_windows(struct pcib_softc *sc)
{

	pcib_release_window(sc, &sc->pmem, SYS_RES_MEMORY);
	pcib_release_window(sc, &sc->mem, SYS_RES_MEMORY);
	pcib_release_window(sc, &sc->io, SYS_RES_IOPORT);
}

#ifdef PCI_RES_BUS
/*
 * Allocate a suitable secondary bus for this bridge if needed and
 * initialize the resource manager for the secondary bus range. Note
 * that the minimum count is a desired value and this may allocate a
 * smaller range.
*/
void
pcib_setup_secbus(device_t dev, struct pcib_secbus *bus, int min_count)
{
	char buf[64];
	int error, rid, sec_reg;

	/* The bus-number register offsets depend on the header type. */
	switch (pci_read_config(dev, PCIR_HDRTYPE, 1) & PCIM_HDRTYPE) {
	case PCIM_HDRTYPE_BRIDGE:
		sec_reg = PCIR_SECBUS_1;
		bus->sub_reg = PCIR_SUBBUS_1;
		break;
	case PCIM_HDRTYPE_CARDBUS:
		sec_reg = PCIR_SECBUS_2;
		bus->sub_reg = PCIR_SUBBUS_2;
		break;
	default:
		panic("not a PCI bridge");
	}
	/* Start from the bus numbers currently held in the registers. */
	bus->sec = pci_read_config(dev, sec_reg, 1);
	bus->sub = pci_read_config(dev, bus->sub_reg, 1);
	bus->dev = dev;
	bus->rman.rm_start = 0;
	bus->rman.rm_end = PCI_BUSMAX;
	bus->rman.rm_type = RMAN_ARRAY;
	snprintf(buf, sizeof(buf), "%s bus numbers", device_get_nameunit(dev));
	bus->rman.rm_descr = strdup(buf, M_DEVBUF);
	error = rman_init(&bus->rman);
	if (error)
		panic("Failed to initialize %s bus number rman",
		    device_get_nameunit(dev));

	/*
	 * Allocate a bus range.  This will return an existing bus range
	 * if one exists, or a new bus range if one does not.
	 */
	rid = 0;
	bus->res = bus_alloc_resource_anywhere(dev, PCI_RES_BUS, &rid,
	    min_count, 0);
	if (bus->res == NULL) {
		/*
		 * Fall back to just allocating a range of a single bus
		 * number.
		 */
		bus->res = bus_alloc_resource_anywhere(dev, PCI_RES_BUS, &rid,
		    1, 0);
	} else if (rman_get_size(bus->res) < min_count)
		/*
		 * Attempt to grow the existing range to satisfy the
		 * minimum desired count.
		 */
		/* The result is deliberately ignored: growth is best-effort. */
		(void)bus_adjust_resource(dev, PCI_RES_BUS, bus->res,
		    rman_get_start(bus->res), rman_get_start(bus->res) +
		    min_count - 1);

	/*
	 * Add the initial resource to the rman.
	 */
	if (bus->res != NULL) {
		error = rman_manage_region(&bus->rman,
		    rman_get_start(bus->res), rman_get_end(bus->res));
		if (error)
			panic("Failed to add resource to rman");
		/* Record the range actually obtained. */
		bus->sec = rman_get_start(bus->res);
		bus->sub = rman_get_end(bus->res);
	}
}

/*
 * Counterpart of pcib_setup_secbus(): finish the bus-number rman, free its
 * description string and release the bus range resource.  If rman_fini()
 * fails, a message is printed and nothing else is released.
 */
void
pcib_free_secbus(device_t dev, struct pcib_secbus *bus)
{
	int error;

	error = rman_fini(&bus->rman);
	if (error) {
		device_printf(dev, "failed to release bus number rman\n");
		return;
	}
	free(__DECONST(char *, bus->rman.rm_descr), M_DEVBUF);

	error = bus_free_resource(dev, PCI_RES_BUS, bus->res);
	if (error)
		device_printf(dev,
		    "failed to release bus numbers resource: %d\n", error);
}

/*
 * Reserve a bus range for 'child' from the bridge's bus-number rman.
 * Returns the reserved resource with its rid set to *rid, or NULL if the
 * rman could not satisfy the request.
 */
static struct resource *
pcib_suballoc_bus(struct pcib_secbus *bus, device_t child, int *rid,
    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
	struct resource *res;

	res = rman_reserve_resource(&bus->rman, start, end, count, flags,
	    child);
	if (res == NULL)
		return (NULL);

	if (bootverbose)
		device_printf(bus->dev,
		    "allocated bus range (%ju-%ju) for rid %d of %s\n",
		    rman_get_start(res), rman_get_end(res), *rid,
		    pcib_child_name(child));
	rman_set_rid(res, *rid);
	return (res);
}

/*
 * Attempt to grow the secondary bus range. This is much simpler than
 * for I/O windows as the range can only be grown by increasing
 * subbus.
*/ static int pcib_grow_subbus(struct pcib_secbus *bus, rman_res_t new_end) { rman_res_t old_end; int error; old_end = rman_get_end(bus->res); KASSERT(new_end > old_end, ("attempt to shrink subbus")); error = bus_adjust_resource(bus->dev, PCI_RES_BUS, bus->res, rman_get_start(bus->res), new_end); if (error) return (error); if (bootverbose) device_printf(bus->dev, "grew bus range to %ju-%ju\n", rman_get_start(bus->res), rman_get_end(bus->res)); error = rman_manage_region(&bus->rman, old_end + 1, rman_get_end(bus->res)); if (error) panic("Failed to add resource to rman"); bus->sub = rman_get_end(bus->res); pci_write_config(bus->dev, bus->sub_reg, bus->sub, 1); return (0); } struct resource * pcib_alloc_subbus(struct pcib_secbus *bus, device_t child, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource *res; rman_res_t start_free, end_free, new_end; /* * First, see if the request can be satisified by the existing * bus range. */ res = pcib_suballoc_bus(bus, child, rid, start, end, count, flags); if (res != NULL) return (res); /* * Figure out a range to grow the bus range. First, find the * first bus number after the last allocated bus in the rman and * enforce that as a minimum starting point for the range. */ if (rman_last_free_region(&bus->rman, &start_free, &end_free) != 0 || end_free != bus->sub) start_free = bus->sub + 1; if (start_free < start) start_free = start; new_end = start_free + count - 1; /* * See if this new range would satisfy the request if it * succeeds. */ if (new_end > end) return (NULL); /* Finally, attempt to grow the existing resource. 
*/ if (bootverbose) { device_printf(bus->dev, "attempting to grow bus range for %ju buses\n", count); printf("\tback candidate range: %ju-%ju\n", start_free, new_end); } if (pcib_grow_subbus(bus, new_end) == 0) return (pcib_suballoc_bus(bus, child, rid, start, end, count, flags)); return (NULL); } #endif #else /* * Is the prefetch window open (eg, can we allocate memory in it?) */ static int pcib_is_prefetch_open(struct pcib_softc *sc) { return (sc->pmembase > 0 && sc->pmembase < sc->pmemlimit); } /* * Is the nonprefetch window open (eg, can we allocate memory in it?) */ static int pcib_is_nonprefetch_open(struct pcib_softc *sc) { return (sc->membase > 0 && sc->membase < sc->memlimit); } /* * Is the io window open (eg, can we allocate ports in it?) */ static int pcib_is_io_open(struct pcib_softc *sc) { return (sc->iobase > 0 && sc->iobase < sc->iolimit); } /* * Get current I/O decode. */ static void pcib_get_io_decode(struct pcib_softc *sc) { device_t dev; uint32_t iolow; dev = sc->dev; iolow = pci_read_config(dev, PCIR_IOBASEL_1, 1); if ((iolow & PCIM_BRIO_MASK) == PCIM_BRIO_32) sc->iobase = PCI_PPBIOBASE( pci_read_config(dev, PCIR_IOBASEH_1, 2), iolow); else sc->iobase = PCI_PPBIOBASE(0, iolow); iolow = pci_read_config(dev, PCIR_IOLIMITL_1, 1); if ((iolow & PCIM_BRIO_MASK) == PCIM_BRIO_32) sc->iolimit = PCI_PPBIOLIMIT( pci_read_config(dev, PCIR_IOLIMITH_1, 2), iolow); else sc->iolimit = PCI_PPBIOLIMIT(0, iolow); } /* * Get current memory decode. 
*/ static void pcib_get_mem_decode(struct pcib_softc *sc) { device_t dev; pci_addr_t pmemlow; dev = sc->dev; sc->membase = PCI_PPBMEMBASE(0, pci_read_config(dev, PCIR_MEMBASE_1, 2)); sc->memlimit = PCI_PPBMEMLIMIT(0, pci_read_config(dev, PCIR_MEMLIMIT_1, 2)); pmemlow = pci_read_config(dev, PCIR_PMBASEL_1, 2); if ((pmemlow & PCIM_BRPM_MASK) == PCIM_BRPM_64) sc->pmembase = PCI_PPBMEMBASE( pci_read_config(dev, PCIR_PMBASEH_1, 4), pmemlow); else sc->pmembase = PCI_PPBMEMBASE(0, pmemlow); pmemlow = pci_read_config(dev, PCIR_PMLIMITL_1, 2); if ((pmemlow & PCIM_BRPM_MASK) == PCIM_BRPM_64) sc->pmemlimit = PCI_PPBMEMLIMIT( pci_read_config(dev, PCIR_PMLIMITH_1, 4), pmemlow); else sc->pmemlimit = PCI_PPBMEMLIMIT(0, pmemlow); } /* * Restore previous I/O decode. */ static void pcib_set_io_decode(struct pcib_softc *sc) { device_t dev; uint32_t iohi; dev = sc->dev; iohi = sc->iobase >> 16; if (iohi > 0) pci_write_config(dev, PCIR_IOBASEH_1, iohi, 2); pci_write_config(dev, PCIR_IOBASEL_1, sc->iobase >> 8, 1); iohi = sc->iolimit >> 16; if (iohi > 0) pci_write_config(dev, PCIR_IOLIMITH_1, iohi, 2); pci_write_config(dev, PCIR_IOLIMITL_1, sc->iolimit >> 8, 1); } /* * Restore previous memory decode. */ static void pcib_set_mem_decode(struct pcib_softc *sc) { device_t dev; pci_addr_t pmemhi; dev = sc->dev; pci_write_config(dev, PCIR_MEMBASE_1, sc->membase >> 16, 2); pci_write_config(dev, PCIR_MEMLIMIT_1, sc->memlimit >> 16, 2); pmemhi = sc->pmembase >> 32; if (pmemhi > 0) pci_write_config(dev, PCIR_PMBASEH_1, pmemhi, 4); pci_write_config(dev, PCIR_PMBASEL_1, sc->pmembase >> 16, 2); pmemhi = sc->pmemlimit >> 32; if (pmemhi > 0) pci_write_config(dev, PCIR_PMLIMITH_1, pmemhi, 4); pci_write_config(dev, PCIR_PMLIMITL_1, sc->pmemlimit >> 16, 2); } #endif #ifdef PCI_HP /* * PCI-express HotPlug support. 
*/ static int pci_enable_pcie_hp = 1; SYSCTL_INT(_hw_pci, OID_AUTO, enable_pcie_hp, CTLFLAG_RDTUN, &pci_enable_pcie_hp, 0, "Enable support for native PCI-express HotPlug."); static void pcib_probe_hotplug(struct pcib_softc *sc) { device_t dev; if (!pci_enable_pcie_hp) return; dev = sc->dev; if (pci_find_cap(dev, PCIY_EXPRESS, NULL) != 0) return; if (!(pcie_read_config(dev, PCIER_FLAGS, 2) & PCIEM_FLAGS_SLOT)) return; sc->pcie_link_cap = pcie_read_config(dev, PCIER_LINK_CAP, 4); sc->pcie_slot_cap = pcie_read_config(dev, PCIER_SLOT_CAP, 4); + /* + * XXX: Handling of slots with a power controller needs to be + * reexamined. Ignore hotplug on such slots for now. + */ + if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP) + return; + if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_HPC) sc->flags |= PCIB_HOTPLUG; } /* * Send a HotPlug command to the slot control register. If this slot * uses command completion interrupts and a previous command is still * in progress, then the command is dropped. Once the previous * command completes or times out, pcib_pcie_hotplug_update() will be * invoked to post a new command based on the slot's state at that * time. 
*/ static void pcib_pcie_hotplug_command(struct pcib_softc *sc, uint16_t val, uint16_t mask) { device_t dev; uint16_t ctl, new; dev = sc->dev; if (sc->flags & PCIB_HOTPLUG_CMD_PENDING) return; ctl = pcie_read_config(dev, PCIER_SLOT_CTL, 2); new = (ctl & ~mask) | val; if (new == ctl) return; pcie_write_config(dev, PCIER_SLOT_CTL, new, 2); if (!(sc->pcie_slot_cap & PCIEM_SLOT_CAP_NCCS) && (ctl & new) & PCIEM_SLOT_CTL_CCIE) { sc->flags |= PCIB_HOTPLUG_CMD_PENDING; if (!cold) callout_reset(&sc->pcie_cc_timer, hz, pcib_pcie_cc_timeout, sc); } } static void pcib_pcie_hotplug_command_completed(struct pcib_softc *sc) { device_t dev; dev = sc->dev; if (bootverbose) device_printf(dev, "Command Completed\n"); if (!(sc->flags & PCIB_HOTPLUG_CMD_PENDING)) return; callout_stop(&sc->pcie_cc_timer); sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING; wakeup(sc); } /* * Returns true if a card is fully inserted from the user's * perspective. It may not yet be ready for access, but the driver * can now start enabling access if necessary. */ static bool pcib_hotplug_inserted(struct pcib_softc *sc) { /* Pretend the card isn't present if a detach is forced. */ if (sc->flags & PCIB_DETACHING) return (false); /* Card must be present in the slot. */ if ((sc->pcie_slot_sta & PCIEM_SLOT_STA_PDS) == 0) return (false); /* A power fault implicitly turns off power to the slot. */ if (sc->pcie_slot_sta & PCIEM_SLOT_STA_PFD) return (false); /* If the MRL is disengaged, the slot is powered off. */ if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP && (sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSS) != 0) return (false); return (true); } /* * Returns -1 if the card is fully inserted, powered, and ready for * access. Otherwise, returns 0. */ static int pcib_hotplug_present(struct pcib_softc *sc) { device_t dev; dev = sc->dev; /* Card must be inserted. */ if (!pcib_hotplug_inserted(sc)) return (0); /* * Require the Electromechanical Interlock to be engaged if * present. 
*/ if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_EIP && (sc->pcie_slot_sta & PCIEM_SLOT_STA_EIS) == 0) return (0); /* Require the Data Link Layer to be active. */ if (sc->pcie_link_cap & PCIEM_LINK_CAP_DL_ACTIVE) { if (!(sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE)) return (0); } return (-1); } static void pcib_pcie_hotplug_update(struct pcib_softc *sc, uint16_t val, uint16_t mask, bool schedule_task) { bool card_inserted; /* Clear DETACHING if Present Detect has cleared. */ if ((sc->pcie_slot_sta & (PCIEM_SLOT_STA_PDC | PCIEM_SLOT_STA_PDS)) == PCIEM_SLOT_STA_PDC) sc->flags &= ~PCIB_DETACHING; card_inserted = pcib_hotplug_inserted(sc); /* Turn the power indicator on if a card is inserted. */ if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PIP) { mask |= PCIEM_SLOT_CTL_PIC; if (card_inserted) val |= PCIEM_SLOT_CTL_PI_ON; else if (sc->flags & PCIB_DETACH_PENDING) val |= PCIEM_SLOT_CTL_PI_BLINK; else val |= PCIEM_SLOT_CTL_PI_OFF; } /* Turn the power on via the Power Controller if a card is inserted. */ if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP) { mask |= PCIEM_SLOT_CTL_PCC; if (card_inserted) val |= PCIEM_SLOT_CTL_PC_ON; else val |= PCIEM_SLOT_CTL_PC_OFF; } /* * If a card is inserted, enable the Electromechanical * Interlock. If a card is not inserted (or we are in the * process of detaching), disable the Electromechanical * Interlock. */ if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_EIP) { mask |= PCIEM_SLOT_CTL_EIC; if (card_inserted != !(sc->pcie_slot_sta & PCIEM_SLOT_STA_EIS)) val |= PCIEM_SLOT_CTL_EIC; } /* * Start a timer to see if the Data Link Layer times out. * Note that we only start the timer if Presence Detect * changed on this interrupt. Stop any scheduled timer if * the Data Link Layer is active. 
*/ if (sc->pcie_link_cap & PCIEM_LINK_CAP_DL_ACTIVE) { if (card_inserted && !(sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE) && sc->pcie_slot_sta & PCIEM_SLOT_STA_PDC) { if (cold) device_printf(sc->dev, "Data Link Layer inactive\n"); else callout_reset(&sc->pcie_dll_timer, hz, pcib_pcie_dll_timeout, sc); } else if (sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE) callout_stop(&sc->pcie_dll_timer); } pcib_pcie_hotplug_command(sc, val, mask); /* * During attach the child "pci" device is added sychronously; * otherwise, the task is scheduled to manage the child * device. */ if (schedule_task && (pcib_hotplug_present(sc) != 0) != (sc->child != NULL)) taskqueue_enqueue(taskqueue_thread, &sc->pcie_hp_task); } static void pcib_pcie_intr(void *arg) { struct pcib_softc *sc; device_t dev; sc = arg; dev = sc->dev; sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2); /* Clear the events just reported. */ pcie_write_config(dev, PCIER_SLOT_STA, sc->pcie_slot_sta, 2); if (sc->pcie_slot_sta & PCIEM_SLOT_STA_ABP) { if (sc->flags & PCIB_DETACH_PENDING) { device_printf(dev, "Attention Button Pressed: Detach Cancelled\n"); sc->flags &= ~PCIB_DETACH_PENDING; callout_stop(&sc->pcie_ab_timer); } else { device_printf(dev, "Attention Button Pressed: Detaching in 5 seconds\n"); sc->flags |= PCIB_DETACH_PENDING; callout_reset(&sc->pcie_ab_timer, 5 * hz, pcib_pcie_ab_timeout, sc); } } if (sc->pcie_slot_sta & PCIEM_SLOT_STA_PFD) device_printf(dev, "Power Fault Detected\n"); if (sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSC) device_printf(dev, "MRL Sensor Changed to %s\n", sc->pcie_slot_sta & PCIEM_SLOT_STA_MRLSS ? "open" : "closed"); if (bootverbose && sc->pcie_slot_sta & PCIEM_SLOT_STA_PDC) device_printf(dev, "Present Detect Changed to %s\n", sc->pcie_slot_sta & PCIEM_SLOT_STA_PDS ? 
"card present" : "empty"); if (sc->pcie_slot_sta & PCIEM_SLOT_STA_CC) pcib_pcie_hotplug_command_completed(sc); if (sc->pcie_slot_sta & PCIEM_SLOT_STA_DLLSC) { sc->pcie_link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2); if (bootverbose) device_printf(dev, "Data Link Layer State Changed to %s\n", sc->pcie_link_sta & PCIEM_LINK_STA_DL_ACTIVE ? "active" : "inactive"); } pcib_pcie_hotplug_update(sc, 0, 0, true); } static void pcib_pcie_hotplug_task(void *context, int pending) { struct pcib_softc *sc; device_t dev; sc = context; mtx_lock(&Giant); dev = sc->dev; if (pcib_hotplug_present(sc) != 0) { if (sc->child == NULL) { sc->child = device_add_child(dev, "pci", -1); bus_generic_attach(dev); } } else { if (sc->child != NULL) { if (device_delete_child(dev, sc->child) == 0) sc->child = NULL; } } mtx_unlock(&Giant); } static void pcib_pcie_ab_timeout(void *arg) { struct pcib_softc *sc; device_t dev; sc = arg; dev = sc->dev; mtx_assert(&Giant, MA_OWNED); if (sc->flags & PCIB_DETACH_PENDING) { sc->flags |= PCIB_DETACHING; sc->flags &= ~PCIB_DETACH_PENDING; pcib_pcie_hotplug_update(sc, 0, 0, true); } } static void pcib_pcie_cc_timeout(void *arg) { struct pcib_softc *sc; device_t dev; uint16_t sta; sc = arg; dev = sc->dev; mtx_assert(&Giant, MA_OWNED); sta = pcie_read_config(dev, PCIER_SLOT_STA, 2); if (!(sta & PCIEM_SLOT_STA_CC)) { device_printf(dev, "Hotplug Command Timed Out - forcing detach\n"); sc->flags &= ~(PCIB_HOTPLUG_CMD_PENDING | PCIB_DETACH_PENDING); sc->flags |= PCIB_DETACHING; pcib_pcie_hotplug_update(sc, 0, 0, true); } else { device_printf(dev, "Missed HotPlug interrupt waiting for Command Completion\n"); pcib_pcie_intr(sc); } } static void pcib_pcie_dll_timeout(void *arg) { struct pcib_softc *sc; device_t dev; uint16_t sta; sc = arg; dev = sc->dev; mtx_assert(&Giant, MA_OWNED); sta = pcie_read_config(dev, PCIER_LINK_STA, 2); if (!(sta & PCIEM_LINK_STA_DL_ACTIVE)) { device_printf(dev, "Timed out waiting for Data Link Layer Active\n"); sc->flags |= 
PCIB_DETACHING; pcib_pcie_hotplug_update(sc, 0, 0, true); } else if (sta != sc->pcie_link_sta) { device_printf(dev, "Missed HotPlug interrupt waiting for DLL Active\n"); pcib_pcie_intr(sc); } } static int pcib_alloc_pcie_irq(struct pcib_softc *sc) { device_t dev; int count, error, rid; rid = -1; dev = sc->dev; /* * For simplicity, only use MSI-X if there is a single message. * To support a device with multiple messages we would have to * use remap intr if the MSI number is not 0. */ count = pci_msix_count(dev); if (count == 1) { error = pci_alloc_msix(dev, &count); if (error == 0) rid = 1; } if (rid < 0 && pci_msi_count(dev) > 0) { count = 1; error = pci_alloc_msi(dev, &count); if (error == 0) rid = 1; } if (rid < 0) rid = 0; sc->pcie_irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (sc->pcie_irq == NULL) { device_printf(dev, "Failed to allocate interrupt for PCI-e events\n"); if (rid > 0) pci_release_msi(dev); return (ENXIO); } error = bus_setup_intr(dev, sc->pcie_irq, INTR_TYPE_MISC, NULL, pcib_pcie_intr, sc, &sc->pcie_ihand); if (error) { device_printf(dev, "Failed to setup PCI-e interrupt handler\n"); bus_release_resource(dev, SYS_RES_IRQ, rid, sc->pcie_irq); if (rid > 0) pci_release_msi(dev); return (error); } return (0); } static int pcib_release_pcie_irq(struct pcib_softc *sc) { device_t dev; int error; dev = sc->dev; error = bus_teardown_intr(dev, sc->pcie_irq, sc->pcie_ihand); if (error) return (error); error = bus_free_resource(dev, SYS_RES_IRQ, sc->pcie_irq); if (error) return (error); return (pci_release_msi(dev)); } static void pcib_setup_hotplug(struct pcib_softc *sc) { device_t dev; uint16_t mask, val; dev = sc->dev; callout_init(&sc->pcie_ab_timer, 0); callout_init(&sc->pcie_cc_timer, 0); callout_init(&sc->pcie_dll_timer, 0); TASK_INIT(&sc->pcie_hp_task, 0, pcib_pcie_hotplug_task, sc); /* Allocate IRQ. 
*/ if (pcib_alloc_pcie_irq(sc) != 0) return; sc->pcie_link_sta = pcie_read_config(dev, PCIER_LINK_STA, 2); sc->pcie_slot_sta = pcie_read_config(dev, PCIER_SLOT_STA, 2); /* Clear any events previously pending. */ pcie_write_config(dev, PCIER_SLOT_STA, sc->pcie_slot_sta, 2); /* Enable HotPlug events. */ mask = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE | PCIEM_SLOT_CTL_CCIE | PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_MRLSCE | PCIEM_SLOT_CTL_PFDE | PCIEM_SLOT_CTL_ABPE; val = PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_HPIE; if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_APB) val |= PCIEM_SLOT_CTL_ABPE; if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_PCP) val |= PCIEM_SLOT_CTL_PFDE; if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_MRLSP) val |= PCIEM_SLOT_CTL_MRLSCE; if (!(sc->pcie_slot_cap & PCIEM_SLOT_CAP_NCCS)) val |= PCIEM_SLOT_CTL_CCIE; if (sc->pcie_link_cap & PCIEM_LINK_CAP_DL_ACTIVE) val |= PCIEM_SLOT_CTL_DLLSCE; /* Turn the attention indicator off. */ if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_AIP) { mask |= PCIEM_SLOT_CTL_AIC; val |= PCIEM_SLOT_CTL_AI_OFF; } pcib_pcie_hotplug_update(sc, val, mask, false); } static int pcib_detach_hotplug(struct pcib_softc *sc) { uint16_t mask, val; int error; /* Disable the card in the slot and force it to detach. */ if (sc->flags & PCIB_DETACH_PENDING) { sc->flags &= ~PCIB_DETACH_PENDING; callout_stop(&sc->pcie_ab_timer); } sc->flags |= PCIB_DETACHING; if (sc->flags & PCIB_HOTPLUG_CMD_PENDING) { callout_stop(&sc->pcie_cc_timer); tsleep(sc, 0, "hpcmd", hz); sc->flags &= ~PCIB_HOTPLUG_CMD_PENDING; } /* Disable HotPlug events. */ mask = PCIEM_SLOT_CTL_DLLSCE | PCIEM_SLOT_CTL_HPIE | PCIEM_SLOT_CTL_CCIE | PCIEM_SLOT_CTL_PDCE | PCIEM_SLOT_CTL_MRLSCE | PCIEM_SLOT_CTL_PFDE | PCIEM_SLOT_CTL_ABPE; val = 0; /* Turn the attention indicator off. 
*/ if (sc->pcie_slot_cap & PCIEM_SLOT_CAP_AIP) { mask |= PCIEM_SLOT_CTL_AIC; val |= PCIEM_SLOT_CTL_AI_OFF; } pcib_pcie_hotplug_update(sc, val, mask, false); error = pcib_release_pcie_irq(sc); if (error) return (error); taskqueue_drain(taskqueue_thread, &sc->pcie_hp_task); callout_drain(&sc->pcie_ab_timer); callout_drain(&sc->pcie_cc_timer); callout_drain(&sc->pcie_dll_timer); return (0); } #endif /* * Get current bridge configuration. */ static void pcib_cfg_save(struct pcib_softc *sc) { #ifndef NEW_PCIB device_t dev; uint16_t command; dev = sc->dev; command = pci_read_config(dev, PCIR_COMMAND, 2); if (command & PCIM_CMD_PORTEN) pcib_get_io_decode(sc); if (command & PCIM_CMD_MEMEN) pcib_get_mem_decode(sc); #endif } /* * Restore previous bridge configuration. */ static void pcib_cfg_restore(struct pcib_softc *sc) { device_t dev; #ifndef NEW_PCIB uint16_t command; #endif dev = sc->dev; #ifdef NEW_PCIB pcib_write_windows(sc, WIN_IO | WIN_MEM | WIN_PMEM); #else command = pci_read_config(dev, PCIR_COMMAND, 2); if (command & PCIM_CMD_PORTEN) pcib_set_io_decode(sc); if (command & PCIM_CMD_MEMEN) pcib_set_mem_decode(sc); #endif } /* * Generic device interface */ static int pcib_probe(device_t dev) { if ((pci_get_class(dev) == PCIC_BRIDGE) && (pci_get_subclass(dev) == PCIS_BRIDGE_PCI)) { device_set_desc(dev, "PCI-PCI bridge"); return(-10000); } return(ENXIO); } void pcib_attach_common(device_t dev) { struct pcib_softc *sc; struct sysctl_ctx_list *sctx; struct sysctl_oid *soid; int comma; sc = device_get_softc(dev); sc->dev = dev; /* * Get current bridge configuration. */ sc->domain = pci_get_domain(dev); #if !(defined(NEW_PCIB) && defined(PCI_RES_BUS)) sc->bus.sec = pci_read_config(dev, PCIR_SECBUS_1, 1); sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1); #endif sc->bridgectl = pci_read_config(dev, PCIR_BRIDGECTL_1, 2); pcib_cfg_save(sc); /* * The primary bus register should always be the bus of the * parent. 
*/ sc->pribus = pci_get_bus(dev); pci_write_config(dev, PCIR_PRIBUS_1, sc->pribus, 1); /* * Setup sysctl reporting nodes */ sctx = device_get_sysctl_ctx(dev); soid = device_get_sysctl_tree(dev); SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "domain", CTLFLAG_RD, &sc->domain, 0, "Domain number"); SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "pribus", CTLFLAG_RD, &sc->pribus, 0, "Primary bus number"); SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "secbus", CTLFLAG_RD, &sc->bus.sec, 0, "Secondary bus number"); SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "subbus", CTLFLAG_RD, &sc->bus.sub, 0, "Subordinate bus number"); /* * Quirk handling. */ switch (pci_get_devid(dev)) { #if !(defined(NEW_PCIB) && defined(PCI_RES_BUS)) case 0x12258086: /* Intel 82454KX/GX (Orion) */ { uint8_t supbus; supbus = pci_read_config(dev, 0x41, 1); if (supbus != 0xff) { sc->bus.sec = supbus + 1; sc->bus.sub = supbus + 1; } break; } #endif /* * The i82380FB mobile docking controller is a PCI-PCI bridge, * and it is a subtractive bridge. However, the ProgIf is wrong * so the normal setting of PCIB_SUBTRACTIVE bit doesn't * happen. There are also Toshiba and Cavium ThunderX bridges * that behave this way. */ case 0xa002177d: /* Cavium ThunderX */ case 0x124b8086: /* Intel 82380FB Mobile */ case 0x060513d7: /* Toshiba ???? */ sc->flags |= PCIB_SUBTRACTIVE; break; #if !(defined(NEW_PCIB) && defined(PCI_RES_BUS)) /* Compaq R3000 BIOS sets wrong subordinate bus number. 
*/ case 0x00dd10de: { char *cp; if ((cp = kern_getenv("smbios.planar.maker")) == NULL) break; if (strncmp(cp, "Compal", 6) != 0) { freeenv(cp); break; } freeenv(cp); if ((cp = kern_getenv("smbios.planar.product")) == NULL) break; if (strncmp(cp, "08A0", 4) != 0) { freeenv(cp); break; } freeenv(cp); if (sc->bus.sub < 0xa) { pci_write_config(dev, PCIR_SUBBUS_1, 0xa, 1); sc->bus.sub = pci_read_config(dev, PCIR_SUBBUS_1, 1); } break; } #endif } if (pci_msi_device_blacklisted(dev)) sc->flags |= PCIB_DISABLE_MSI; if (pci_msix_device_blacklisted(dev)) sc->flags |= PCIB_DISABLE_MSIX; /* * Intel 815, 845 and other chipsets say they are PCI-PCI bridges, * but have a ProgIF of 0x80. The 82801 family (AA, AB, BAM/CAM, * BA/CA/DB and E) PCI bridges are HUB-PCI bridges, in Intelese. * This means they act as if they were subtractively decoding * bridges and pass all transactions. Mark them and real ProgIf 1 * parts as subtractive. */ if ((pci_get_devid(dev) & 0xff00ffff) == 0x24008086 || pci_read_config(dev, PCIR_PROGIF, 1) == PCIP_BRIDGE_PCI_SUBTRACTIVE) sc->flags |= PCIB_SUBTRACTIVE; #ifdef PCI_HP pcib_probe_hotplug(sc); #endif #ifdef NEW_PCIB #ifdef PCI_RES_BUS pcib_setup_secbus(dev, &sc->bus, 1); #endif pcib_probe_windows(sc); #endif #ifdef PCI_HP if (sc->flags & PCIB_HOTPLUG) pcib_setup_hotplug(sc); #endif if (bootverbose) { device_printf(dev, " domain %d\n", sc->domain); device_printf(dev, " secondary bus %d\n", sc->bus.sec); device_printf(dev, " subordinate bus %d\n", sc->bus.sub); #ifdef NEW_PCIB if (pcib_is_window_open(&sc->io)) device_printf(dev, " I/O decode 0x%jx-0x%jx\n", (uintmax_t)sc->io.base, (uintmax_t)sc->io.limit); if (pcib_is_window_open(&sc->mem)) device_printf(dev, " memory decode 0x%jx-0x%jx\n", (uintmax_t)sc->mem.base, (uintmax_t)sc->mem.limit); if (pcib_is_window_open(&sc->pmem)) device_printf(dev, " prefetched decode 0x%jx-0x%jx\n", (uintmax_t)sc->pmem.base, (uintmax_t)sc->pmem.limit); #else if (pcib_is_io_open(sc)) device_printf(dev, " I/O decode 
0x%x-0x%x\n", sc->iobase, sc->iolimit); if (pcib_is_nonprefetch_open(sc)) device_printf(dev, " memory decode 0x%jx-0x%jx\n", (uintmax_t)sc->membase, (uintmax_t)sc->memlimit); if (pcib_is_prefetch_open(sc)) device_printf(dev, " prefetched decode 0x%jx-0x%jx\n", (uintmax_t)sc->pmembase, (uintmax_t)sc->pmemlimit); #endif if (sc->bridgectl & (PCIB_BCR_ISA_ENABLE | PCIB_BCR_VGA_ENABLE) || sc->flags & PCIB_SUBTRACTIVE) { device_printf(dev, " special decode "); comma = 0; if (sc->bridgectl & PCIB_BCR_ISA_ENABLE) { printf("ISA"); comma = 1; } if (sc->bridgectl & PCIB_BCR_VGA_ENABLE) { printf("%sVGA", comma ? ", " : ""); comma = 1; } if (sc->flags & PCIB_SUBTRACTIVE) printf("%ssubtractive", comma ? ", " : ""); printf("\n"); } } /* * Always enable busmastering on bridges so that transactions * initiated on the secondary bus are passed through to the * primary bus. */ pci_enable_busmaster(dev); } #ifdef PCI_HP static int pcib_present(struct pcib_softc *sc) { if (sc->flags & PCIB_HOTPLUG) return (pcib_hotplug_present(sc) != 0); return (1); } #endif int pcib_attach_child(device_t dev) { struct pcib_softc *sc; sc = device_get_softc(dev); if (sc->bus.sec == 0) { /* no secondary bus; we should have fixed this */ return(0); } #ifdef PCI_HP if (!pcib_present(sc)) { /* An empty HotPlug slot, so don't add a PCI bus yet. 
*/ return (0); } #endif sc->child = device_add_child(dev, "pci", -1); return (bus_generic_attach(dev)); } int pcib_attach(device_t dev) { pcib_attach_common(dev); return (pcib_attach_child(dev)); } int pcib_detach(device_t dev) { #if defined(PCI_HP) || defined(NEW_PCIB) struct pcib_softc *sc; #endif int error; #if defined(PCI_HP) || defined(NEW_PCIB) sc = device_get_softc(dev); #endif error = bus_generic_detach(dev); if (error) return (error); #ifdef PCI_HP if (sc->flags & PCIB_HOTPLUG) { error = pcib_detach_hotplug(sc); if (error) return (error); } #endif error = device_delete_children(dev); if (error) return (error); #ifdef NEW_PCIB pcib_free_windows(sc); #ifdef PCI_RES_BUS pcib_free_secbus(dev, &sc->bus); #endif #endif return (0); } int pcib_suspend(device_t dev) { pcib_cfg_save(device_get_softc(dev)); return (bus_generic_suspend(dev)); } int pcib_resume(device_t dev) { pcib_cfg_restore(device_get_softc(dev)); return (bus_generic_resume(dev)); } void pcib_bridge_init(device_t dev) { pci_write_config(dev, PCIR_IOBASEL_1, 0xff, 1); pci_write_config(dev, PCIR_IOBASEH_1, 0xffff, 2); pci_write_config(dev, PCIR_IOLIMITL_1, 0, 1); pci_write_config(dev, PCIR_IOLIMITH_1, 0, 2); pci_write_config(dev, PCIR_MEMBASE_1, 0xffff, 2); pci_write_config(dev, PCIR_MEMLIMIT_1, 0, 2); pci_write_config(dev, PCIR_PMBASEL_1, 0xffff, 2); pci_write_config(dev, PCIR_PMBASEH_1, 0xffffffff, 4); pci_write_config(dev, PCIR_PMLIMITL_1, 0, 2); pci_write_config(dev, PCIR_PMLIMITH_1, 0, 4); } int pcib_child_present(device_t dev, device_t child) { #ifdef PCI_HP struct pcib_softc *sc = device_get_softc(dev); int retval; retval = bus_child_present(dev); if (retval != 0 && sc->flags & PCIB_HOTPLUG) retval = pcib_hotplug_present(sc); return (retval); #else return (bus_child_present(dev)); #endif } int pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) { struct pcib_softc *sc = device_get_softc(dev); switch (which) { case PCIB_IVAR_DOMAIN: *result = sc->domain; return(0); case 
PCIB_IVAR_BUS: *result = sc->bus.sec; return(0); } return(ENOENT); } int pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t value) { switch (which) { case PCIB_IVAR_DOMAIN: return(EINVAL); case PCIB_IVAR_BUS: return(EINVAL); } return(ENOENT); } #ifdef NEW_PCIB /* * Attempt to allocate a resource from the existing resources assigned * to a window. */ static struct resource * pcib_suballoc_resource(struct pcib_softc *sc, struct pcib_window *w, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource *res; if (!pcib_is_window_open(w)) return (NULL); res = rman_reserve_resource(&w->rman, start, end, count, flags & ~RF_ACTIVE, child); if (res == NULL) return (NULL); if (bootverbose) device_printf(sc->dev, "allocated %s range (%#jx-%#jx) for rid %x of %s\n", w->name, rman_get_start(res), rman_get_end(res), *rid, pcib_child_name(child)); rman_set_rid(res, *rid); /* * If the resource should be active, pass that request up the * tree. This assumes the parent drivers can handle * activating sub-allocated resources. */ if (flags & RF_ACTIVE) { if (bus_activate_resource(child, type, *rid, res) != 0) { rman_release_resource(res); return (NULL); } } return (res); } /* Allocate a fresh resource range for an unconfigured window. */ static int pcib_alloc_new_window(struct pcib_softc *sc, struct pcib_window *w, int type, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct resource *res; rman_res_t base, limit, wmask; int rid; /* * If this is an I/O window on a bridge with ISA enable set * and the start address is below 64k, then try to allocate an * initial window of 0x1000 bytes long starting at address * 0xf000 and walking down. Note that if the original request * was larger than the non-aliased range size of 0x100 our * caller would have raised the start address up to 64k * already. 
*/ if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE && start < 65536) { for (base = 0xf000; (long)base >= 0; base -= 0x1000) { limit = base + 0xfff; /* * Skip ranges that wouldn't work for the * original request. Note that the actual * window that overlaps are the non-alias * ranges within [base, limit], so this isn't * quite a simple comparison. */ if (start + count > limit - 0x400) continue; if (base == 0) { /* * The first open region for the window at * 0 is 0x400-0x4ff. */ if (end - count + 1 < 0x400) continue; } else { if (end - count + 1 < base) continue; } if (pcib_alloc_nonisa_ranges(sc, base, limit) == 0) { w->base = base; w->limit = limit; return (0); } } return (ENOSPC); } wmask = ((rman_res_t)1 << w->step) - 1; if (RF_ALIGNMENT(flags) < w->step) { flags &= ~RF_ALIGNMENT_MASK; flags |= RF_ALIGNMENT_LOG2(w->step); } start &= ~wmask; end |= wmask; count = roundup2(count, (rman_res_t)1 << w->step); rid = w->reg; res = bus_alloc_resource(sc->dev, type, &rid, start, end, count, flags & ~RF_ACTIVE); if (res == NULL) return (ENOSPC); pcib_add_window_resources(w, &res, 1); pcib_activate_window(sc, type); w->base = rman_get_start(res); w->limit = rman_get_end(res); return (0); } /* Try to expand an existing window to the requested base and limit. */ static int pcib_expand_window(struct pcib_softc *sc, struct pcib_window *w, int type, rman_res_t base, rman_res_t limit) { struct resource *res; int error, i, force_64k_base; KASSERT(base <= w->base && limit >= w->limit, ("attempting to shrink window")); /* * XXX: pcib_grow_window() doesn't try to do this anyway and * the error handling for all the edge cases would be tedious. */ KASSERT(limit == w->limit || base == w->base, ("attempting to grow both ends of a window")); /* * Yet more special handling for requests to expand an I/O * window behind an ISA-enabled bridge. 
Since I/O windows * have to grow in 0x1000 increments and the end of the 0xffff * range is an alias, growing a window below 64k will always * result in allocating new resources and never adjusting an * existing resource. */ if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE && (limit <= 65535 || (base <= 65535 && base != w->base))) { KASSERT(limit == w->limit || limit <= 65535, ("attempting to grow both ends across 64k ISA alias")); if (base != w->base) error = pcib_alloc_nonisa_ranges(sc, base, w->base - 1); else error = pcib_alloc_nonisa_ranges(sc, w->limit + 1, limit); if (error == 0) { w->base = base; w->limit = limit; } return (error); } /* * Find the existing resource to adjust. Usually there is only one, * but for an ISA-enabled bridge we might be growing the I/O window * above 64k and need to find the existing resource that maps all * of the area above 64k. */ for (i = 0; i < w->count; i++) { if (rman_get_end(w->res[i]) == w->limit) break; } KASSERT(i != w->count, ("did not find existing resource")); res = w->res[i]; /* * Usually the resource we found should match the window's * existing range. The one exception is the ISA-enabled case * mentioned above in which case the resource should start at * 64k. */ if (type == SYS_RES_IOPORT && sc->bridgectl & PCIB_BCR_ISA_ENABLE && w->base <= 65535) { KASSERT(rman_get_start(res) == 65536, ("existing resource mismatch")); force_64k_base = 1; } else { KASSERT(w->base == rman_get_start(res), ("existing resource mismatch")); force_64k_base = 0; } error = bus_adjust_resource(sc->dev, type, res, force_64k_base ? rman_get_start(res) : base, limit); if (error) return (error); /* Add the newly allocated region to the resource manager. 
*/ if (w->base != base) { error = rman_manage_region(&w->rman, base, w->base - 1); w->base = base; } else { error = rman_manage_region(&w->rman, w->limit + 1, limit); w->limit = limit; } if (error) { if (bootverbose) device_printf(sc->dev, "failed to expand %s resource manager\n", w->name); (void)bus_adjust_resource(sc->dev, type, res, force_64k_base ? rman_get_start(res) : w->base, w->limit); } return (error); } /* * Attempt to grow a window to make room for a given resource request. */ static int pcib_grow_window(struct pcib_softc *sc, struct pcib_window *w, int type, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { rman_res_t align, start_free, end_free, front, back, wmask; int error; /* * Clamp the desired resource range to the maximum address * this window supports. Reject impossible requests. * * For I/O port requests behind a bridge with the ISA enable * bit set, force large allocations to start above 64k. */ if (!w->valid) return (EINVAL); if (sc->bridgectl & PCIB_BCR_ISA_ENABLE && count > 0x100 && start < 65536) start = 65536; if (end > w->rman.rm_end) end = w->rman.rm_end; if (start + count - 1 > end || start + count < start) return (EINVAL); wmask = ((rman_res_t)1 << w->step) - 1; /* * If there is no resource at all, just try to allocate enough * aligned space for this resource. */ if (w->res == NULL) { error = pcib_alloc_new_window(sc, w, type, start, end, count, flags); if (error) { if (bootverbose) device_printf(sc->dev, "failed to allocate initial %s window (%#jx-%#jx,%#jx)\n", w->name, start, end, count); return (error); } if (bootverbose) device_printf(sc->dev, "allocated initial %s window of %#jx-%#jx\n", w->name, (uintmax_t)w->base, (uintmax_t)w->limit); goto updatewin; } /* * See if growing the window would help. Compute the minimum * amount of address space needed on both the front and back * ends of the existing window to satisfy the allocation. 
* * For each end, build a candidate region adjusting for the * required alignment, etc. If there is a free region at the * edge of the window, grow from the inner edge of the free * region. Otherwise grow from the window boundary. * * Growing an I/O window below 64k for a bridge with the ISA * enable bit doesn't require any special magic as the step * size of an I/O window (1k) always includes multiple * non-alias ranges when it is grown in either direction. * * XXX: Special case: if w->res is completely empty and the * request size is larger than w->res, we should find the * optimal aligned buffer containing w->res and allocate that. */ if (bootverbose) device_printf(sc->dev, "attempting to grow %s window for (%#jx-%#jx,%#jx)\n", w->name, start, end, count); align = (rman_res_t)1 << RF_ALIGNMENT(flags); if (start < w->base) { if (rman_first_free_region(&w->rman, &start_free, &end_free) != 0 || start_free != w->base) end_free = w->base; if (end_free > end) end_free = end + 1; /* Move end_free down until it is properly aligned. */ end_free &= ~(align - 1); end_free--; front = end_free - (count - 1); /* * The resource would now be allocated at (front, * end_free). Ensure that fits in the (start, end) * bounds. end_free is checked above. If 'front' is * ok, ensure it is properly aligned for this window. * Also check for underflow. */ if (front >= start && front <= end_free) { if (bootverbose) printf("\tfront candidate range: %#jx-%#jx\n", front, end_free); front &= ~wmask; front = w->base - front; } else front = 0; } else front = 0; if (end > w->limit) { if (rman_last_free_region(&w->rman, &start_free, &end_free) != 0 || end_free != w->limit) start_free = w->limit + 1; if (start_free < start) start_free = start; /* Move start_free up until it is properly aligned. */ start_free = roundup2(start_free, align); back = start_free + count - 1; /* * The resource would now be allocated at (start_free, * back). Ensure that fits in the (start, end) * bounds. 
start_free is checked above. If 'back' is * ok, ensure it is properly aligned for this window. * Also check for overflow. */ if (back <= end && start_free <= back) { if (bootverbose) printf("\tback candidate range: %#jx-%#jx\n", start_free, back); back |= wmask; back -= w->limit; } else back = 0; } else back = 0; /* * Try to allocate the smallest needed region first. * If that fails, fall back to the other region. */ error = ENOSPC; while (front != 0 || back != 0) { if (front != 0 && (front <= back || back == 0)) { error = pcib_expand_window(sc, w, type, w->base - front, w->limit); if (error == 0) break; front = 0; } else { error = pcib_expand_window(sc, w, type, w->base, w->limit + back); if (error == 0) break; back = 0; } } if (error) return (error); if (bootverbose) device_printf(sc->dev, "grew %s window to %#jx-%#jx\n", w->name, (uintmax_t)w->base, (uintmax_t)w->limit); updatewin: /* Write the new window. */ KASSERT((w->base & wmask) == 0, ("start address is not aligned")); KASSERT((w->limit & wmask) == wmask, ("end address is not aligned")); pcib_write_windows(sc, w->mask); return (0); } /* * We have to trap resource allocation requests and ensure that the bridge * is set up to, or capable of handling them. */ struct resource * pcib_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct pcib_softc *sc; struct resource *r; sc = device_get_softc(dev); /* * VGA resources are decoded iff the VGA enable bit is set in * the bridge control register. VGA resources do not fall into * the resource windows and are passed up to the parent. 
*/ if ((type == SYS_RES_IOPORT && pci_is_vga_ioport_range(start, end)) || (type == SYS_RES_MEMORY && pci_is_vga_memory_range(start, end))) { if (sc->bridgectl & PCIB_BCR_VGA_ENABLE) return (bus_generic_alloc_resource(dev, child, type, rid, start, end, count, flags)); else return (NULL); } switch (type) { #ifdef PCI_RES_BUS case PCI_RES_BUS: return (pcib_alloc_subbus(&sc->bus, child, rid, start, end, count, flags)); #endif case SYS_RES_IOPORT: if (pcib_is_isa_range(sc, start, end, count)) return (NULL); r = pcib_suballoc_resource(sc, &sc->io, child, type, rid, start, end, count, flags); if (r != NULL || (sc->flags & PCIB_SUBTRACTIVE) != 0) break; if (pcib_grow_window(sc, &sc->io, type, start, end, count, flags) == 0) r = pcib_suballoc_resource(sc, &sc->io, child, type, rid, start, end, count, flags); break; case SYS_RES_MEMORY: /* * For prefetchable resources, prefer the prefetchable * memory window, but fall back to the regular memory * window if that fails. Try both windows before * attempting to grow a window in case the firmware * has used a range in the regular memory window to * map a prefetchable BAR. 
*/ if (flags & RF_PREFETCHABLE) { r = pcib_suballoc_resource(sc, &sc->pmem, child, type, rid, start, end, count, flags); if (r != NULL) break; } r = pcib_suballoc_resource(sc, &sc->mem, child, type, rid, start, end, count, flags); if (r != NULL || (sc->flags & PCIB_SUBTRACTIVE) != 0) break; if (flags & RF_PREFETCHABLE) { if (pcib_grow_window(sc, &sc->pmem, type, start, end, count, flags) == 0) { r = pcib_suballoc_resource(sc, &sc->pmem, child, type, rid, start, end, count, flags); if (r != NULL) break; } } if (pcib_grow_window(sc, &sc->mem, type, start, end, count, flags & ~RF_PREFETCHABLE) == 0) r = pcib_suballoc_resource(sc, &sc->mem, child, type, rid, start, end, count, flags); break; default: return (bus_generic_alloc_resource(dev, child, type, rid, start, end, count, flags)); } /* * If attempts to suballocate from the window fail but this is a * subtractive bridge, pass the request up the tree. */ if (sc->flags & PCIB_SUBTRACTIVE && r == NULL) return (bus_generic_alloc_resource(dev, child, type, rid, start, end, count, flags)); return (r); } int pcib_adjust_resource(device_t bus, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { struct pcib_softc *sc; sc = device_get_softc(bus); if (pcib_is_resource_managed(sc, type, r)) return (rman_adjust_resource(r, start, end)); return (bus_generic_adjust_resource(bus, child, type, r, start, end)); } int pcib_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct pcib_softc *sc; int error; sc = device_get_softc(dev); if (pcib_is_resource_managed(sc, type, r)) { if (rman_get_flags(r) & RF_ACTIVE) { error = bus_deactivate_resource(child, type, rid, r); if (error) return (error); } return (rman_release_resource(r)); } return (bus_generic_release_resource(dev, child, type, rid, r)); } #else /* * We have to trap resource allocation requests and ensure that the bridge * is set up to, or capable of handling them. 
 *
 * This is the variant built when NEW_PCIB is not defined.  For I/O
 * port and memory requests it checks the requested range against the
 * ranges the bridge decodes (plus VGA ranges when the VGA enable bit
 * is set), clamping the range on non-subtractive bridges, and returns
 * NULL for ranges it rejects.  Accepted requests, and all other
 * resource types, are passed to bus_generic_alloc_resource().
*/
struct resource *
pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
	struct pcib_softc *sc = device_get_softc(dev);
	const char *name, *suffix;
	int ok;

	/*
	 * Fail the allocation for this range if it's not supported.
	 */
	/* 'name' and 'suffix' only prefix the diagnostic messages below. */
	name = device_get_nameunit(child);
	if (name == NULL) {
		name = "";
		suffix = "";
	} else
		suffix = " ";
	switch (type) {
	case SYS_RES_IOPORT:
		ok = 0;
		/* With the I/O window closed the request is passed up unchecked. */
		if (!pcib_is_io_open(sc))
			break;
		ok = (start >= sc->iobase && end <= sc->iolimit);

		/*
		 * Make sure we allow access to VGA I/O addresses when the
		 * bridge has the "VGA Enable" bit set.
		 */
		if (!ok && pci_is_vga_ioport_range(start, end))
			ok = (sc->bridgectl & PCIB_BCR_VGA_ENABLE) ? 1 : 0;

		if ((sc->flags & PCIB_SUBTRACTIVE) == 0) {
			/* Clamp the request to the decoded I/O range. */
			if (!ok) {
				if (start < sc->iobase)
					start = sc->iobase;
				if (end > sc->iolimit)
					end = sc->iolimit;
				if (start < end)
					ok = 1;
			}
		} else {
			/* Subtractive bridge: accept any range. */
			ok = 1;
#if 0
			/*
			 * If we overlap with the subtractive range, then
			 * pick the upper range to use.
			 */
			if (start < sc->iolimit && end > sc->iobase)
				start = sc->iolimit + 1;
#endif
		}
		if (end < start) {
			device_printf(dev, "ioport: end (%jx) < start (%jx)\n",
			    end, start);
			start = 0;
			end = 0;
			ok = 0;
		}
		if (!ok) {
			device_printf(dev, "%s%srequested unsupported I/O "
			    "range 0x%jx-0x%jx (decoding 0x%x-0x%x)\n",
			    name, suffix, start, end, sc->iobase, sc->iolimit);
			return (NULL);
		}
		if (bootverbose)
			device_printf(dev,
			    "%s%srequested I/O range 0x%jx-0x%jx: in range\n",
			    name, suffix, start, end);
		break;

	case SYS_RES_MEMORY:
		/* Accept a range that lies entirely inside either open window. */
		ok = 0;
		if (pcib_is_nonprefetch_open(sc))
			ok = ok || (start >= sc->membase && end <= sc->memlimit);
		if (pcib_is_prefetch_open(sc))
			ok = ok || (start >= sc->pmembase && end <= sc->pmemlimit);

		/*
		 * Make sure we allow access to VGA memory addresses when the
		 * bridge has the "VGA Enable" bit set.
		 */
		if (!ok && pci_is_vga_memory_range(start, end))
			ok = (sc->bridgectl & PCIB_BCR_VGA_ENABLE) ? 1 : 0;

		if ((sc->flags & PCIB_SUBTRACTIVE) == 0) {
			/*
			 * Clamp the request to the window matching
			 * RF_PREFETCHABLE; reject it if that window is
			 * closed.
			 */
			if (!ok) {
				ok = 1;
				if (flags & RF_PREFETCHABLE) {
					if (pcib_is_prefetch_open(sc)) {
						if (start < sc->pmembase)
							start = sc->pmembase;
						if (end > sc->pmemlimit)
							end = sc->pmemlimit;
					} else {
						ok = 0;
					}
				} else {	/* non-prefetchable */
					if (pcib_is_nonprefetch_open(sc)) {
						if (start < sc->membase)
							start = sc->membase;
						if (end > sc->memlimit)
							end = sc->memlimit;
					} else {
						ok = 0;
					}
				}
			}
		} else if (!ok) {
			ok = 1;	/* subtractive bridge: always ok */
#if 0
			if (pcib_is_nonprefetch_open(sc)) {
				if (start < sc->memlimit && end > sc->membase)
					start = sc->memlimit + 1;
			}
			if (pcib_is_prefetch_open(sc)) {
				if (start < sc->pmemlimit && end > sc->pmembase)
					start = sc->pmemlimit + 1;
			}
#endif
		}
		if (end < start) {
			device_printf(dev, "memory: end (%jx) < start (%jx)\n",
			    end, start);
			start = 0;
			end = 0;
			ok = 0;
		}
		/* Unlike the I/O case, the rejection is only logged when verbose. */
		if (!ok && bootverbose)
			device_printf(dev,
			    "%s%srequested unsupported memory range %#jx-%#jx "
			    "(decoding %#jx-%#jx, %#jx-%#jx)\n",
			    name, suffix, start, end,
			    (uintmax_t)sc->membase, (uintmax_t)sc->memlimit,
			    (uintmax_t)sc->pmembase, (uintmax_t)sc->pmemlimit);
		if (!ok)
			return (NULL);
		if (bootverbose)
			device_printf(dev,"%s%srequested memory range "
			    "0x%jx-0x%jx: good\n",
			    name, suffix, start, end);
		break;

	default:
		break;
	}
	/*
	 * Bridge is OK decoding this resource, so pass it up.
	 */
	return (bus_generic_alloc_resource(dev, child, type, rid, start, end,
	    count, flags));
}
#endif

/*
 * If ARI is enabled on this downstream port, translate the function number
 * to the non-ARI slot/function.  The downstream port will convert it back in
 * hardware.  If ARI is not enabled slot and func are not modified.
*/ static __inline void pcib_xlate_ari(device_t pcib, int bus, int *slot, int *func) { struct pcib_softc *sc; int ari_func; sc = device_get_softc(pcib); ari_func = *func; if (sc->flags & PCIB_ENABLE_ARI) { KASSERT(*slot == 0, ("Non-zero slot number with ARI enabled!")); *slot = PCIE_ARI_SLOT(ari_func); *func = PCIE_ARI_FUNC(ari_func); } } static void pcib_enable_ari(struct pcib_softc *sc, uint32_t pcie_pos) { uint32_t ctl2; ctl2 = pci_read_config(sc->dev, pcie_pos + PCIER_DEVICE_CTL2, 4); ctl2 |= PCIEM_CTL2_ARI; pci_write_config(sc->dev, pcie_pos + PCIER_DEVICE_CTL2, ctl2, 4); sc->flags |= PCIB_ENABLE_ARI; } /* * PCIB interface. */ int pcib_maxslots(device_t dev) { return (PCI_SLOTMAX); } static int pcib_ari_maxslots(device_t dev) { struct pcib_softc *sc; sc = device_get_softc(dev); if (sc->flags & PCIB_ENABLE_ARI) return (PCIE_ARI_SLOTMAX); else return (PCI_SLOTMAX); } static int pcib_ari_maxfuncs(device_t dev) { struct pcib_softc *sc; sc = device_get_softc(dev); if (sc->flags & PCIB_ENABLE_ARI) return (PCIE_ARI_FUNCMAX); else return (PCI_FUNCMAX); } static void pcib_ari_decode_rid(device_t pcib, uint16_t rid, int *bus, int *slot, int *func) { struct pcib_softc *sc; sc = device_get_softc(pcib); *bus = PCI_RID2BUS(rid); if (sc->flags & PCIB_ENABLE_ARI) { *slot = PCIE_ARI_RID2SLOT(rid); *func = PCIE_ARI_RID2FUNC(rid); } else { *slot = PCI_RID2SLOT(rid); *func = PCI_RID2FUNC(rid); } } /* * Since we are a child of a PCI bus, its parent must support the pcib interface. 
*/ static uint32_t pcib_read_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, int width) { #ifdef PCI_HP struct pcib_softc *sc; sc = device_get_softc(dev); if (!pcib_present(sc)) { switch (width) { case 2: return (0xffff); case 1: return (0xff); default: return (0xffffffff); } } #endif pcib_xlate_ari(dev, b, &s, &f); return(PCIB_READ_CONFIG(device_get_parent(device_get_parent(dev)), b, s, f, reg, width)); } static void pcib_write_config(device_t dev, u_int b, u_int s, u_int f, u_int reg, uint32_t val, int width) { #ifdef PCI_HP struct pcib_softc *sc; sc = device_get_softc(dev); if (!pcib_present(sc)) return; #endif pcib_xlate_ari(dev, b, &s, &f); PCIB_WRITE_CONFIG(device_get_parent(device_get_parent(dev)), b, s, f, reg, val, width); } /* * Route an interrupt across a PCI bridge. */ int pcib_route_interrupt(device_t pcib, device_t dev, int pin) { device_t bus; int parent_intpin; int intnum; /* * * The PCI standard defines a swizzle of the child-side device/intpin to * the parent-side intpin as follows. * * device = device on child bus * child_intpin = intpin on child bus slot (0-3) * parent_intpin = intpin on parent bus slot (0-3) * * parent_intpin = (device + child_intpin) % 4 */ parent_intpin = (pci_get_slot(dev) + (pin - 1)) % 4; /* * Our parent is a PCI bus. Its parent must export the pcib interface * which includes the ability to route interrupts. */ bus = device_get_parent(pcib); intnum = PCIB_ROUTE_INTERRUPT(device_get_parent(bus), pcib, parent_intpin + 1); if (PCI_INTERRUPT_VALID(intnum) && bootverbose) { device_printf(pcib, "slot %d INT%c is routed to irq %d\n", pci_get_slot(dev), 'A' + pin - 1, intnum); } return(intnum); } /* Pass request to alloc MSI/MSI-X messages up to the parent bridge. 
*/ int pcib_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs) { struct pcib_softc *sc = device_get_softc(pcib); device_t bus; if (sc->flags & PCIB_DISABLE_MSI) return (ENXIO); bus = device_get_parent(pcib); return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount, irqs)); } /* Pass request to release MSI/MSI-X messages up to the parent bridge. */ int pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs) { device_t bus; bus = device_get_parent(pcib); return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs)); } /* Pass request to alloc an MSI-X message up to the parent bridge. */ int pcib_alloc_msix(device_t pcib, device_t dev, int *irq) { struct pcib_softc *sc = device_get_softc(pcib); device_t bus; if (sc->flags & PCIB_DISABLE_MSIX) return (ENXIO); bus = device_get_parent(pcib); return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq)); } /* Pass request to release an MSI-X message up to the parent bridge. */ int pcib_release_msix(device_t pcib, device_t dev, int irq) { device_t bus; bus = device_get_parent(pcib); return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq)); } /* Pass request to map MSI/MSI-X message up to parent bridge. */ int pcib_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data) { device_t bus; int error; bus = device_get_parent(pcib); error = PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data); if (error) return (error); pci_ht_map_msi(pcib, *addr); return (0); } /* Pass request for device power state up to parent bridge. 
*/ int pcib_power_for_sleep(device_t pcib, device_t dev, int *pstate) { device_t bus; bus = device_get_parent(pcib); return (PCIB_POWER_FOR_SLEEP(bus, dev, pstate)); } static int pcib_ari_enabled(device_t pcib) { struct pcib_softc *sc; sc = device_get_softc(pcib); return ((sc->flags & PCIB_ENABLE_ARI) != 0); } static int pcib_ari_get_id(device_t pcib, device_t dev, enum pci_id_type type, uintptr_t *id) { struct pcib_softc *sc; device_t bus_dev; uint8_t bus, slot, func; if (type != PCI_ID_RID) { bus_dev = device_get_parent(pcib); return (PCIB_GET_ID(device_get_parent(bus_dev), dev, type, id)); } sc = device_get_softc(pcib); if (sc->flags & PCIB_ENABLE_ARI) { bus = pci_get_bus(dev); func = pci_get_function(dev); *id = (PCI_ARI_RID(bus, func)); } else { bus = pci_get_bus(dev); slot = pci_get_slot(dev); func = pci_get_function(dev); *id = (PCI_RID(bus, slot, func)); } return (0); } /* * Check that the downstream port (pcib) and the endpoint device (dev) both * support ARI. If so, enable it and return 0, otherwise return an error. */ static int pcib_try_enable_ari(device_t pcib, device_t dev) { struct pcib_softc *sc; int error; uint32_t cap2; int ari_cap_off; uint32_t ari_ver; uint32_t pcie_pos; sc = device_get_softc(pcib); /* * ARI is controlled in a register in the PCIe capability structure. * If the downstream port does not have the PCIe capability structure * then it does not support ARI. */ error = pci_find_cap(pcib, PCIY_EXPRESS, &pcie_pos); if (error != 0) return (ENODEV); /* Check that the PCIe port advertises ARI support. */ cap2 = pci_read_config(pcib, pcie_pos + PCIER_DEVICE_CAP2, 4); if (!(cap2 & PCIEM_CAP2_ARI)) return (ENODEV); /* * Check that the endpoint device advertises ARI support via the ARI * extended capability structure. */ error = pci_find_extcap(dev, PCIZ_ARI, &ari_cap_off); if (error != 0) return (ENODEV); /* * Finally, check that the endpoint device supports the same version * of ARI that we do. 
*/ ari_ver = pci_read_config(dev, ari_cap_off, 4); if (PCI_EXTCAP_VER(ari_ver) != PCIB_SUPPORTED_ARI_VER) { if (bootverbose) device_printf(pcib, "Unsupported version of ARI (%d) detected\n", PCI_EXTCAP_VER(ari_ver)); return (ENXIO); } pcib_enable_ari(sc, pcie_pos); return (0); } Index: user/alc/PQ_LAUNDRY/sys/kern/kern_mutex.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/kern_mutex.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/kern/kern_mutex.c (revision 303654) @@ -1,1047 +1,1080 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Machine independent bits of mutex implementation. */ #include __FBSDID("$FreeBSD$"); #include "opt_adaptive_mutexes.h" #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES) #define ADAPTIVE_MUTEXES #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , lock, failed); #endif /* * Return the mutex address when the lock cookie address is provided. * This functionality assumes that struct mtx* have a member named mtx_lock. */ #define mtxlock2mtx(c) (__containerof(c, struct mtx, mtx_lock)) /* * Internal utility macros. 
*/
/* True when the lock word is exactly MTX_UNOWNED. */
#define	mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)

/* True when the lock word is exactly MTX_DESTROYED. */
#define	mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED)

/* Owner thread pointer: the lock word with the MTX_FLAGMASK bits cleared. */
#define	mtx_owner(m)	((struct thread *)((m)->mtx_lock & ~MTX_FLAGMASK))

/* Callbacks installed in the two lock_class tables below. */
static void	assert_mtx(const struct lock_object *lock, int what);
#ifdef DDB
static void	db_show_mtx(const struct lock_object *lock);
#endif
static void	lock_mtx(struct lock_object *lock, uintptr_t how);
static void	lock_spin(struct lock_object *lock, uintptr_t how);
#ifdef KDTRACE_HOOKS
static int	owner_mtx(const struct lock_object *lock,
		    struct thread **owner);
#endif
static uintptr_t unlock_mtx(struct lock_object *lock);
static uintptr_t unlock_spin(struct lock_object *lock);

/*
 * Lock classes for sleep and spin mutexes.
 */
struct lock_class lock_class_mtx_sleep = {
	.lc_name = "sleep mutex",
	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
	.lc_assert = assert_mtx,
#ifdef DDB
	.lc_ddb_show = db_show_mtx,
#endif
	.lc_lock = lock_mtx,
	.lc_unlock = unlock_mtx,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_mtx,
#endif
};
struct lock_class lock_class_mtx_spin = {
	.lc_name = "spin mutex",
	.lc_flags = LC_SPINLOCK | LC_RECURSABLE,
	.lc_assert = assert_mtx,
#ifdef DDB
	.lc_ddb_show = db_show_mtx,
#endif
	.lc_lock = lock_spin,
	.lc_unlock = unlock_spin,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_mtx,
#endif
};

+#ifdef ADAPTIVE_MUTEXES
+static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging");
+	/* Passed to lock_delay_arg_init() in __mtx_lock_sleep(). */
+static struct lock_delay_config mtx_delay = {	/* all four fields are reassigned by mtx_delay_sysinit() */
+	.initial	= 1000,
+	.step		= 500,
+	.min		= 100,
+	.max		= 5000,
+};
+	/* Each mtx_delay field is exported read-write under the debug "mtx" node. */
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW, &mtx_delay.initial,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min,
+    0, "");
+SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max,
+    0, "");
+
+static void
+mtx_delay_sysinit(void *dummy)	/* registered through LOCK_DELAY_SYSINIT() below */
+{
+	/* Derive every delay parameter from the CPU count (mp_ncpus). */
+	mtx_delay.initial = mp_ncpus * 25;
+	mtx_delay.step = (mp_ncpus * 25) / 2;
+	mtx_delay.min = mp_ncpus * 5;
+	mtx_delay.max = mp_ncpus * 25 * 10;
+}
+LOCK_DELAY_SYSINIT(mtx_delay_sysinit);
+#endif
+
/*
 * System-wide mutexes
 */
struct mtx blocked_lock;
struct mtx Giant;

/* lc_assert callback: forwards to mtx_assert() on the containing mutex. */
void
assert_mtx(const struct lock_object *lock, int what)
{

	mtx_assert((const struct mtx *)lock, what);
}

/* lc_lock callback for sleep mutexes: plain mtx_lock(); 'how' is unused. */
void
lock_mtx(struct lock_object *lock, uintptr_t how)
{

	mtx_lock((struct mtx *)lock);
}

/* lc_lock callback for spin mutexes: not supported, always panics. */
void
lock_spin(struct lock_object *lock, uintptr_t how)
{

	panic("spin locks can only use msleep_spin");
}

/*
 * lc_unlock callback for sleep mutexes: asserts the mutex is owned and not
 * recursed, unlocks it, and always returns 0.
 */
uintptr_t
unlock_mtx(struct lock_object *lock)
{
	struct mtx *m;

	m = (struct mtx *)lock;
	mtx_assert(m, MA_OWNED | MA_NOTRECURSED);
	mtx_unlock(m);
	return (0);
}

/* lc_unlock callback for spin mutexes: not supported, always panics. */
uintptr_t
unlock_spin(struct lock_object *lock)
{

	panic("spin locks can only use msleep_spin");
}

#ifdef KDTRACE_HOOKS
/*
 * lc_owner callback: stores the owner derived from the lock word in *owner
 * and returns non-zero when the mutex is not MTX_UNOWNED.
 */
int
owner_mtx(const struct lock_object *lock, struct thread **owner)
{
	const struct mtx *m = (const struct mtx *)lock;

	*owner = mtx_owner(m);
	return (mtx_unowned(m) == 0);
}
#endif

/*
 * Function versions of the inlined __mtx_* macros.  These are used by
 * modules and can also be called from assembly language if needed.
*/
/*
 * Acquire a sleep mutex given the address of its lock word.  Does nothing
 * when SCHEDULER_STOPPED().  Asserts that the caller is not the idle thread
 * (unless kdb_active), that the mutex is not destroyed, and that it belongs
 * to the sleep-mutex class; MTX_RECURSE is masked out of the options passed
 * to WITNESS.
 */
void
__mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
{
	struct mtx *m;

	if (SCHEDULER_STOPPED())
		return;

	m = mtxlock2mtx(c);

	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
	    ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d",
	    curthread, m->lock_object.lo_name, file, line));
	KASSERT(m->mtx_lock != MTX_DESTROYED,
	    ("mtx_lock() of destroyed mutex @ %s:%d", file, line));
	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
	    ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
	    file, line));
	WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) |
	    LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);

	__mtx_lock(m, curthread, opts, file, line);
	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
	    line);
	WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE,
	    file, line);
	TD_LOCKS_INC(curthread);
}

/*
 * Release a sleep mutex given the address of its lock word.  Does nothing
 * when SCHEDULER_STOPPED().  WITNESS and the lock log are updated before
 * ownership is asserted and the mutex is actually released.
 */
void
__mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
{
	struct mtx *m;

	if (SCHEDULER_STOPPED())
		return;

	m = mtxlock2mtx(c);

	KASSERT(m->mtx_lock != MTX_DESTROYED,
	    ("mtx_unlock() of destroyed mutex @ %s:%d", file, line));
	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
	    ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
	    file, line));
	WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
	    line);
	mtx_assert(m, MA_OWNED);

	__mtx_unlock(m, curthread, opts, file, line);
	TD_LOCKS_DEC(curthread);
}

/*
 * Acquire a spin mutex given the address of its lock word.  Does nothing
 * when SCHEDULER_STOPPED().  If the mutex is already owned, asserts that
 * recursion is allowed by the lock (LO_RECURSABLE) or by the caller
 * (MTX_RECURSE); MTX_RECURSE is then cleared from opts before acquiring.
 */
void
__mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
    int line)
{
	struct mtx *m;

	if (SCHEDULER_STOPPED())
		return;

	m = mtxlock2mtx(c);

	KASSERT(m->mtx_lock != MTX_DESTROYED,
	    ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line));
	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
	    ("mtx_lock_spin() of sleep mutex %s @ %s:%d",
	    m->lock_object.lo_name, file, line));
	if (mtx_owned(m))
		KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
		    (opts & MTX_RECURSE) != 0,
	    ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n",
		    m->lock_object.lo_name, file, line));
	opts &= ~MTX_RECURSE;
	WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
	    file, line, NULL);
	__mtx_lock_spin(m, curthread, opts, file, line);
	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
	    line);
	WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
}

/*
 * Try to acquire a spin mutex without spinning.  Returns 1 on success (and
 * also, without touching the lock, when SCHEDULER_STOPPED()) and 0 on
 * failure.  MTX_RECURSE is not accepted in opts.
 */
int
__mtx_trylock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
    int line)
{
	struct mtx *m;

	if (SCHEDULER_STOPPED())
		return (1);

	m = mtxlock2mtx(c);

	KASSERT(m->mtx_lock != MTX_DESTROYED,
	    ("mtx_trylock_spin() of destroyed mutex @ %s:%d", file, line));
	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
	    ("mtx_trylock_spin() of sleep mutex %s @ %s:%d",
	    m->lock_object.lo_name, file, line));
	KASSERT((opts & MTX_RECURSE) == 0,
	    ("mtx_trylock_spin: unsupp. opt MTX_RECURSE on mutex %s @ %s:%d\n",
	    m->lock_object.lo_name, file, line));
	if (__mtx_trylock_spin(m, curthread, opts, file, line)) {
		LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 1, file, line);
		WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
		return (1);
	}
	LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 0, file, line);
	return (0);
}

/*
 * Release a spin mutex given the address of its lock word.  Does nothing
 * when SCHEDULER_STOPPED().
 */
void
__mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
    int line)
{
	struct mtx *m;

	if (SCHEDULER_STOPPED())
		return;

	m = mtxlock2mtx(c);

	KASSERT(m->mtx_lock != MTX_DESTROYED,
	    ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line));
	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
	    ("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
	    m->lock_object.lo_name, file, line));
	WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
	    line);
	mtx_assert(m, MA_OWNED);

	__mtx_unlock_spin(m);
}

/*
 * The important part of mtx_trylock{,_flags}()
 * Tries to acquire lock `m.'
If this function is called on a mutex that
 * is already owned, it will recursively acquire the lock -- but only when
 * recursion is permitted, i.e. the lock has LO_RECURSABLE set or the caller
 * passed MTX_RECURSE; otherwise a normal acquisition attempt is made.
 *
 * Returns non-zero when the lock was obtained (and 1, without touching the
 * lock, when SCHEDULER_STOPPED()), zero otherwise.
 */
int
_mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line)
{
	struct mtx *m;
#ifdef LOCK_PROFILING
	uint64_t waittime = 0;
	int contested = 0;
#endif
	int rval;

	if (SCHEDULER_STOPPED())
		return (1);

	m = mtxlock2mtx(c);

	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
	    ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d",
	    curthread, m->lock_object.lo_name, file, line));
	KASSERT(m->mtx_lock != MTX_DESTROYED,
	    ("mtx_trylock() of destroyed mutex @ %s:%d", file, line));
	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
	    ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
	    file, line));

	if (mtx_owned(m) && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
	    (opts & MTX_RECURSE) != 0)) {
		/* Recursive acquisition: bump the count and flag the lock word. */
		m->mtx_recurse++;
		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
		rval = 1;
	} else
		rval = _mtx_obtain_lock(m, (uintptr_t)curthread);
	opts &= ~MTX_RECURSE;

	LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line);
	if (rval) {
		WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
		    file, line);
		TD_LOCKS_INC(curthread);
		/* Only the first (non-recursive) acquisition is profiled. */
		if (m->mtx_recurse == 0)
			LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire,
			    m, contested, waittime, file, line);
	}

	return (rval);
}

/*
 * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
 *
 * We call this if the lock is either contested (i.e. we need to go to
 * sleep waiting for it), or if we need to recurse on it.
*/
void
__mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
    const char *file, int line)
{
	struct mtx *m;
	struct turnstile *ts;
	uintptr_t v;
#ifdef ADAPTIVE_MUTEXES
	volatile struct thread *owner;
#endif
#ifdef KTR
	int cont_logged = 0;
#endif
#ifdef LOCK_PROFILING
	int contested = 0;
	uint64_t waittime = 0;
#endif
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
+	struct lock_delay_arg lda;
+#endif
#ifdef KDTRACE_HOOKS
-	u_int spin_cnt = 0;
	u_int sleep_cnt = 0;
	int64_t sleep_time = 0;
	int64_t all_time = 0;
#endif

	if (SCHEDULER_STOPPED())
		return;

	/*
	 * NOTE(review): mtx_delay is referenced below whenever either
	 * ADAPTIVE_MUTEXES or KDTRACE_HOOKS is defined, but its definition
	 * earlier in this file sits under #ifdef ADAPTIVE_MUTEXES only --
	 * confirm that a KDTRACE_HOOKS-only configuration still builds.
	 */
+#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS)
+	lock_delay_arg_init(&lda, &mtx_delay);
+#endif
	m = mtxlock2mtx(c);

	/* Recursion: bump the count, flag the lock word, and we are done. */
	if (mtx_owned(m)) {
		KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
		    (opts & MTX_RECURSE) != 0,
	    ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n",
		    m->lock_object.lo_name, file, line));
		opts &= ~MTX_RECURSE;
		m->mtx_recurse++;
		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
		if (LOCK_LOG_TEST(&m->lock_object, opts))
			CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
		return;
	}
	opts &= ~MTX_RECURSE;

#ifdef HWPMC_HOOKS
	PMC_SOFT_CALL( , , lock, failed);
#endif
	lock_profile_obtain_lock_failed(&m->lock_object,
		    &contested, &waittime);
	if (LOCK_LOG_TEST(&m->lock_object, opts))
		CTR4(KTR_LOCK,
		    "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
		    m->lock_object.lo_name, (void *)m->mtx_lock, file, line);
#ifdef KDTRACE_HOOKS
	all_time -= lockstat_nsecs(&m->lock_object);
#endif

	/* Loop until the lock word is unowned and we manage to claim it. */
	for (;;) {
		if (m->mtx_lock == MTX_UNOWNED && _mtx_obtain_lock(m, tid))
			break;
#ifdef KDTRACE_HOOKS
-		spin_cnt++;
+		lda.spin_cnt++;
#endif
#ifdef ADAPTIVE_MUTEXES
		/*
		 * If the owner is running on another CPU, spin until the
		 * owner stops running or the state of the lock changes.
		 */
		v = m->mtx_lock;
		if (v != MTX_UNOWNED) {
			owner = (struct thread *)(v & ~MTX_FLAGMASK);
			if (TD_IS_RUNNING(owner)) {
				if (LOCK_LOG_TEST(&m->lock_object, 0))
					CTR3(KTR_LOCK,
					    "%s: spinning on %p held by %p",
					    __func__, m, owner);
				KTR_STATE1(KTR_SCHED, "thread",
				    sched_tdname((struct thread *)tid),
				    "spinning", "lockname:\"%s\"",
				    m->lock_object.lo_name);
				while (mtx_owner(m) == owner &&
-				    TD_IS_RUNNING(owner)) {
-					cpu_spinwait();
-#ifdef KDTRACE_HOOKS
-					spin_cnt++;
-#endif
-				}
+				    TD_IS_RUNNING(owner))
+					lock_delay(&lda);
				KTR_STATE0(KTR_SCHED, "thread",
				    sched_tdname((struct thread *)tid),
				    "running");
				continue;
			}
		}
#endif

		ts = turnstile_trywait(&m->lock_object);
		v = m->mtx_lock;

		/*
		 * Check if the lock has been released while spinning for
		 * the turnstile chain lock.
		 */
		if (v == MTX_UNOWNED) {
			turnstile_cancel(ts);
			continue;
		}

#ifdef ADAPTIVE_MUTEXES
		/*
		 * The current lock owner might have started executing
		 * on another CPU (or the lock could have changed
		 * owners) while we were waiting on the turnstile
		 * chain lock.  If so, drop the turnstile lock and try
		 * again.
		 */
		owner = (struct thread *)(v & ~MTX_FLAGMASK);
		if (TD_IS_RUNNING(owner)) {
			turnstile_cancel(ts);
			continue;
		}
#endif

		/*
		 * If the mutex isn't already contested and a failure occurs
		 * setting the contested bit, the mutex was either released
		 * or the state of the MTX_RECURSED bit changed.
		 */
		if ((v & MTX_CONTESTED) == 0 &&
		    !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) {
			turnstile_cancel(ts);
			continue;
		}

		/*
		 * We definitely must sleep for this lock.
		 */
		mtx_assert(m, MA_NOTOWNED);

#ifdef KTR
		if (!cont_logged) {
			CTR6(KTR_CONTENTION,
			    "contention: %p at %s:%d wants %s, taken by %s:%d",
			    (void *)tid, file, line, m->lock_object.lo_name,
			    WITNESS_FILE(&m->lock_object),
			    WITNESS_LINE(&m->lock_object));
			cont_logged = 1;
		}
#endif

		/*
		 * Block on the turnstile.
		 */
#ifdef KDTRACE_HOOKS
		sleep_time -= lockstat_nsecs(&m->lock_object);
#endif
		turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE);
#ifdef KDTRACE_HOOKS
		sleep_time += lockstat_nsecs(&m->lock_object);
		sleep_cnt++;
#endif
	}
#ifdef KDTRACE_HOOKS
	all_time += lockstat_nsecs(&m->lock_object);
#endif
#ifdef KTR
	if (cont_logged) {
		CTR4(KTR_CONTENTION,
		    "contention end: %s acquired by %p at %s:%d",
		    m->lock_object.lo_name, (void *)tid, file, line);
	}
#endif
	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested,
	    waittime, file, line);
#ifdef KDTRACE_HOOKS
	if (sleep_time)
		LOCKSTAT_RECORD1(adaptive__block, m, sleep_time);

	/*
	 * Only record the loops spinning and not sleeping.
	 */
-	if (spin_cnt > sleep_cnt)
+	if (lda.spin_cnt > sleep_cnt)
		LOCKSTAT_RECORD1(adaptive__spin, m, all_time - sleep_time);
#endif
}

/*
 * Called from the spin-lock slow paths once the spin budget is exhausted.
 * Returns quietly if the mutex no longer has an owner; otherwise prints the
 * lock and its owner and panics.
 */
static void
_mtx_lock_spin_failed(struct mtx *m)
{
	struct thread *td;

	td = mtx_owner(m);

	/* If the mutex is unlocked, try again. */
	if (td == NULL)
		return;

	printf( "spin lock %p (%s) held by %p (tid %d) too long\n",
	    m, m->lock_object.lo_name, td, td->td_tid);
#ifdef WITNESS
	witness_display_spinlock(&m->lock_object, td, printf);
#endif
	panic("spin lock held too long");
}

#ifdef SMP
/*
 * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock.
 *
 * This is only called if we need to actually spin for the lock. Recursion
 * is handled inline.
*/ void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts, const char *file, int line) { struct mtx *m; int i = 0; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif if (SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); #ifdef KDTRACE_HOOKS spin_time -= lockstat_nsecs(&m->lock_object); #endif for (;;) { if (m->mtx_lock == MTX_UNOWNED && _mtx_obtain_lock(m, tid)) break; /* Give interrupts a chance while we spin. */ spinlock_exit(); while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 10000000) { cpu_spinwait(); continue; } if (i < 60000000 || kdb_active || panicstr != NULL) DELAY(1); else _mtx_lock_spin_failed(m); cpu_spinwait(); } spinlock_enter(); } #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); #ifdef KDTRACE_HOOKS LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); if (spin_time != 0) LOCKSTAT_RECORD1(spin__spin, m, spin_time); #endif } #endif /* SMP */ void thread_lock_flags_(struct thread *td, int opts, const char *file, int line) { struct mtx *m; uintptr_t tid; int i; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif i = 0; tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) { /* * Ensure that spinlock sections are balanced even when the * scheduler is stopped, since we may otherwise inadvertently * re-enable interrupts while 
dumping core. */ spinlock_enter(); return; } #ifdef KDTRACE_HOOKS spin_time -= lockstat_nsecs(&td->td_lock->lock_object); #endif for (;;) { retry: spinlock_enter(); m = td->td_lock; KASSERT(m->mtx_lock != MTX_DESTROYED, ("thread_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("thread_lock() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m)) KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0, ("thread_lock: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); for (;;) { if (m->mtx_lock == MTX_UNOWNED && _mtx_obtain_lock(m, tid)) break; if (m->mtx_lock == tid) { m->mtx_recurse++; break; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); /* Give interrupts a chance while we spin. */ spinlock_exit(); while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 10000000) cpu_spinwait(); else if (i < 60000000 || kdb_active || panicstr != NULL) DELAY(1); else _mtx_lock_spin_failed(m); cpu_spinwait(); if (m != td->td_lock) goto retry; } spinlock_enter(); } if (m == td->td_lock) break; __mtx_unlock_spin(m); /* does spinlock_exit() */ } #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif if (m->mtx_recurse == 0) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); #ifdef KDTRACE_HOOKS if (spin_time != 0) LOCKSTAT_RECORD1(thread__spin, m, spin_time); #endif } struct mtx * thread_lock_block(struct thread *td) { struct mtx *lock; THREAD_LOCK_ASSERT(td, MA_OWNED); lock = td->td_lock; td->td_lock = &blocked_lock; mtx_unlock_spin(lock); return (lock); } void thread_lock_unblock(struct thread *td, 
struct mtx *new) { mtx_assert(new, MA_OWNED); MPASS(td->td_lock == &blocked_lock); atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new); } void thread_lock_set(struct thread *td, struct mtx *new) { struct mtx *lock; mtx_assert(new, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); lock = td->td_lock; td->td_lock = new; mtx_unlock_spin(lock); } /* * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed or contested (i.e. we * need to wake up a blocked thread). */ void __mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; struct turnstile *ts; if (SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); if (mtx_recursed(m)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } /* * We have to lock the chain before the turnstile so this turnstile * can be removed from the hash list if it is empty. */ turnstile_chain_lock(&m->lock_object); ts = turnstile_lookup(&m->lock_object); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); MPASS(ts != NULL); turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE); _mtx_release_lock_quick(m); /* * This turnstile is now no longer associated with the mutex. We can * unlock the chain lock so a new turnstile may take its place. */ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&m->lock_object); } /* * All the unlocking of MTX_SPIN locks is done inline. * See the __mtx_unlock_spin() macro for the details. 
*/ /* * The backing function for the INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANT_SUPPORT void __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct mtx *m; if (panicstr != NULL || dumping) return; m = mtxlock2mtx(c); switch (what) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned(m)) panic("mutex %s not owned at %s:%d", m->lock_object.lo_name, file, line); if (mtx_recursed(m)) { if ((what & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", m->lock_object.lo_name, file, line); } else if ((what & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", m->lock_object.lo_name, file, line); } break; case MA_NOTOWNED: if (mtx_owned(m)) panic("mutex %s owned at %s:%d", m->lock_object.lo_name, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * General init routine used by the MTX_SYSINIT() macro. */ void mtx_sysinit(void *arg) { struct mtx_args *margs = arg; mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); } /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and name `name.' The optional * lock type `type' is used as a general lock category name for use with * witness. */ void _mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts) { struct mtx *m; struct lock_class *class; int flags; m = mtxlock2mtx(c); MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE | MTX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock, ("%s: mtx_lock not aligned for %s: %p", __func__, name, &m->mtx_lock)); /* Determine lock class and lock flags. 
*/ if (opts & MTX_SPIN) class = &lock_class_mtx_spin; else class = &lock_class_mtx_sleep; flags = 0; if (opts & MTX_QUIET) flags |= LO_QUIET; if (opts & MTX_RECURSE) flags |= LO_RECURSABLE; if ((opts & MTX_NOWITNESS) == 0) flags |= LO_WITNESS; if (opts & MTX_DUPOK) flags |= LO_DUPOK; if (opts & MTX_NOPROFILE) flags |= LO_NOPROFILE; if (opts & MTX_NEW) flags |= LO_NEW; /* Initialize mutex. */ lock_init(&m->lock_object, class, name, type, flags); m->mtx_lock = MTX_UNOWNED; m->mtx_recurse = 0; } /* * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be * passed in as a flag here because if the corresponding mtx_init() was * called with MTX_QUIET set, then it will already be set in the mutex's * flags. */ void _mtx_destroy(volatile uintptr_t *c) { struct mtx *m; m = mtxlock2mtx(c); if (!mtx_owned(m)) MPASS(mtx_unowned(m)); else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); /* Perform the non-mtx related part of mtx_unlock_spin(). */ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin) spinlock_exit(); else TD_LOCKS_DEC(curthread); lock_profile_release_lock(&m->lock_object); /* Tell witness this isn't locked to make it happy. */ WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__, __LINE__); } m->mtx_lock = MTX_DESTROYED; lock_destroy(&m->lock_object); } /* * Initialize the mutex code and system mutexes. This is called from the MD * startup code prior to mi_startup(). The per-CPU data space needs to be * set up before this is called. */ void mutex_init(void) { /* Setup turnstiles so that sleep mutexes work. */ init_turnstiles(); /* * Initialize mutexes. */ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. 
*/ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN); mtx_init(&proc0.p_statmtx, "pstatl", NULL, MTX_SPIN); mtx_init(&proc0.p_itimmtx, "pitiml", NULL, MTX_SPIN); mtx_init(&proc0.p_profmtx, "pprofl", NULL, MTX_SPIN); mtx_init(&devmtx, "cdev", NULL, MTX_DEF); mtx_lock(&Giant); } #ifdef DDB void db_show_mtx(const struct lock_object *lock) { struct thread *td; const struct mtx *m; m = (const struct mtx *)lock; db_printf(" flags: {"); if (LOCK_CLASS(lock) == &lock_class_mtx_spin) db_printf("SPIN"); else db_printf("DEF"); if (m->lock_object.lo_flags & LO_RECURSABLE) db_printf(", RECURSE"); if (m->lock_object.lo_flags & LO_DUPOK) db_printf(", DUPOK"); db_printf("}\n"); db_printf(" state: {"); if (mtx_unowned(m)) db_printf("UNOWNED"); else if (mtx_destroyed(m)) db_printf("DESTROYED"); else { db_printf("OWNED"); if (m->mtx_lock & MTX_CONTESTED) db_printf(", CONTESTED"); if (m->mtx_lock & MTX_RECURSED) db_printf(", RECURSED"); } db_printf("}\n"); if (!mtx_unowned(m) && !mtx_destroyed(m)) { td = mtx_owner(m); db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (mtx_recursed(m)) db_printf(" recursed: %d\n", m->mtx_recurse); } } #endif Index: user/alc/PQ_LAUNDRY/sys/kern/kern_rwlock.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/kern_rwlock.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/kern/kern_rwlock.c (revision 303654) @@ -1,1277 +1,1307 @@ /*- * Copyright (c) 2006 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Machine independent bits of reader/writer lock implementation. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_rwlocks.h" #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS) #define ADAPTIVE_RWLOCKS #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* * Return the rwlock address when the lock cookie address is provided. * This functionality assumes that struct rwlock* have a member named rw_lock. 
*/ #define rwlock2rw(c) (__containerof(c, struct rwlock, rw_lock)) -#ifdef ADAPTIVE_RWLOCKS -static int rowner_retries = 10; -static int rowner_loops = 10000; -static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL, - "rwlock debugging"); -SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, ""); -SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, ""); -#endif - #ifdef DDB #include static void db_show_rwlock(const struct lock_object *lock); #endif static void assert_rw(const struct lock_object *lock, int what); static void lock_rw(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_rw(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_rw(struct lock_object *lock); struct lock_class lock_class_rw = { .lc_name = "rw", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_rw, #ifdef DDB .lc_ddb_show = db_show_rwlock, #endif .lc_lock = lock_rw, .lc_unlock = unlock_rw, #ifdef KDTRACE_HOOKS .lc_owner = owner_rw, #endif }; +#ifdef ADAPTIVE_RWLOCKS +static int rowner_retries = 10; +static int rowner_loops = 10000; +static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL, + "rwlock debugging"); +SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, ""); +SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, ""); + +static struct lock_delay_config rw_delay = { + .initial = 1000, + .step = 500, + .min = 100, + .max = 5000, +}; + +SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_initial, CTLFLAG_RW, &rw_delay.initial, + 0, ""); +SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_step, CTLFLAG_RW, &rw_delay.step, + 0, ""); +SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_min, CTLFLAG_RW, &rw_delay.min, + 0, ""); +SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max, + 0, ""); + +static void +rw_delay_sysinit(void *dummy) +{ + + rw_delay.initial = mp_ncpus * 25; + rw_delay.step = (mp_ncpus * 25) / 
2; + rw_delay.min = mp_ncpus * 5; + rw_delay.max = mp_ncpus * 25 * 10; +} +LOCK_DELAY_SYSINIT(rw_delay_sysinit); +#endif + /* * Return a pointer to the owning thread if the lock is write-locked or * NULL if the lock is unlocked or read-locked. */ #define rw_wowner(rw) \ ((rw)->rw_lock & RW_LOCK_READ ? NULL : \ (struct thread *)RW_OWNER((rw)->rw_lock)) /* * Returns true if the write owner has recursed. Write ownership is not * assured here and should be checked beforehand. */ #define rw_recursed(rw) ((rw)->rw_recurse != 0) /* * Return true if curthread holds the write lock. */ #define rw_wlocked(rw) (rw_wowner((rw)) == curthread) /* * Return a pointer to the owning thread for this lock who should receive * any priority lent by threads that block on this lock. Currently this * is identical to rw_wowner(). */ #define rw_owner(rw) rw_wowner(rw) #ifndef INVARIANTS #define __rw_assert(c, what, file, line) #endif void assert_rw(const struct lock_object *lock, int what) { rw_assert((const struct rwlock *)lock, what); } void lock_rw(struct lock_object *lock, uintptr_t how) { struct rwlock *rw; rw = (struct rwlock *)lock; if (how) rw_rlock(rw); else rw_wlock(rw); } uintptr_t unlock_rw(struct lock_object *lock) { struct rwlock *rw; rw = (struct rwlock *)lock; rw_assert(rw, RA_LOCKED | LA_NOTRECURSED); if (rw->rw_lock & RW_LOCK_READ) { rw_runlock(rw); return (1); } else { rw_wunlock(rw); return (0); } } #ifdef KDTRACE_HOOKS int owner_rw(const struct lock_object *lock, struct thread **owner) { const struct rwlock *rw = (const struct rwlock *)lock; uintptr_t x = rw->rw_lock; *owner = rw_wowner(rw); return ((x & RW_LOCK_READ) != 0 ? 
(RW_READERS(x) != 0) : (*owner != NULL)); } #endif void _rw_init_flags(volatile uintptr_t *c, const char *name, int opts) { struct rwlock *rw; int flags; rw = rwlock2rw(c); MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET | RW_RECURSE | RW_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock, ("%s: rw_lock not aligned for %s: %p", __func__, name, &rw->rw_lock)); flags = LO_UPGRADABLE; if (opts & RW_DUPOK) flags |= LO_DUPOK; if (opts & RW_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & RW_NOWITNESS)) flags |= LO_WITNESS; if (opts & RW_RECURSE) flags |= LO_RECURSABLE; if (opts & RW_QUIET) flags |= LO_QUIET; if (opts & RW_NEW) flags |= LO_NEW; lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags); rw->rw_lock = RW_UNLOCKED; rw->rw_recurse = 0; } void _rw_destroy(volatile uintptr_t *c) { struct rwlock *rw; rw = rwlock2rw(c); KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw)); KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw)); rw->rw_lock = RW_DESTROYED; lock_destroy(&rw->lock_object); } void rw_sysinit(void *arg) { struct rw_args *args = arg; rw_init((struct rwlock *)args->ra_rw, args->ra_desc); } void rw_sysinit_flags(void *arg) { struct rw_args_flags *args = arg; rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc, args->ra_flags); } int _rw_wowned(const volatile uintptr_t *c) { return (rw_wowner(rwlock2rw(c)) == curthread); } void _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_wlock() of destroyed rwlock @ %s:%d", file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); __rw_wlock(rw, curthread, file, line); LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, 
line); WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } int __rw_try_wlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; int rval; if (SCHEDULER_STOPPED()) return (1); rw = rwlock2rw(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line)); if (rw_wlocked(rw) && (rw->lock_object.lo_flags & LO_RECURSABLE) != 0) { rw->rw_recurse++; rval = 1; } else rval = atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED, (uintptr_t)curthread); LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!rw_recursed(rw)) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } void _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_WLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); __rw_wunlock(rw, curthread, file, line); TD_LOCKS_DEC(curthread); } /* * Determines whether a new reader can acquire a lock. Succeeds if the * reader already owns a read lock and the lock is locked for read to * prevent deadlock from reader recursion. Also succeeds if the lock * is unlocked and has no writer waiters or spinners. Failing otherwise * prioritizes writers before readers. 
*/ #define RW_CAN_READ(_rw) \ ((curthread->td_rw_rlocks && (_rw) & RW_LOCK_READ) || ((_rw) & \ (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) == \ RW_LOCK_READ) void __rw_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; #ifdef ADAPTIVE_RWLOCKS volatile struct thread *owner; int spintries = 0; int i; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif uintptr_t v; +#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) + struct lock_delay_arg lda; +#endif #ifdef KDTRACE_HOOKS uintptr_t state; - u_int spin_cnt = 0; u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return; +#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, &rw_delay); +#endif rw = rwlock2rw(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_rlock() of destroyed rwlock @ %s:%d", file, line)); KASSERT(rw_wowner(rw) != curthread, ("rw_rlock: wlock already held for %s @ %s:%d", rw->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL); #ifdef KDTRACE_HOOKS all_time -= lockstat_nsecs(&rw->lock_object); state = rw->rw_lock; #endif for (;;) { /* * Handle the easy case. If no other thread has a write * lock, then try to bump up the count of read locks. Note * that we have to preserve the current state of the * RW_LOCK_WRITE_WAITERS flag. If we fail to acquire a * read lock, then rw_lock must have changed, so restart * the loop. Note that this handles the case of a * completely unlocked rwlock since such a lock is encoded * as a read lock with no waiters. */ v = rw->rw_lock; if (RW_CAN_READ(v)) { /* * The RW_LOCK_READ_WAITERS flag should only be set * if the lock has been unlocked and write waiters * were present. 
*/ if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, v + RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, rw, (void *)v, (void *)(v + RW_ONE_READER)); break; } continue; } #ifdef KDTRACE_HOOKS - spin_cnt++; + lda.spin_cnt++; #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #ifdef ADAPTIVE_RWLOCKS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if ((v & RW_LOCK_READ) == 0) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); while ((struct thread*)RW_OWNER(rw->rw_lock) == - owner && TD_IS_RUNNING(owner)) { - cpu_spinwait(); -#ifdef KDTRACE_HOOKS - spin_cnt++; -#endif - } + owner && TD_IS_RUNNING(owner)) + lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else if (spintries < rowner_retries) { spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); for (i = 0; i < rowner_loops; i++) { v = rw->rw_lock; if ((v & RW_LOCK_READ) == 0 || RW_CAN_READ(v)) break; cpu_spinwait(); } #ifdef KDTRACE_HOOKS - spin_cnt += rowner_loops - i; + lda.spin_cnt += rowner_loops - i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i != rowner_loops) continue; } #endif /* * Okay, now it's the hard case. Some other thread already * has a write lock or there are write waiters present, * acquire the turnstile lock so we can begin the process * of blocking. 
*/ ts = turnstile_trywait(&rw->lock_object); /* * The lock might have been released while we spun, so * recheck its state and restart the loop if needed. */ v = rw->rw_lock; if (RW_CAN_READ(v)) { turnstile_cancel(ts); continue; } #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if ((v & RW_LOCK_READ) == 0) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * The lock is held in write mode or it already has waiters. */ MPASS(!RW_CAN_READ(v)); /* * If the RW_LOCK_READ_WAITERS flag is already set, then * we can go ahead and block. If it is not set then try * to set it. If we fail to set it drop the turnstile * lock and restart the loop. */ if (!(v & RW_LOCK_READ_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_READ_WAITERS)) { turnstile_cancel(ts); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set read waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the read waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. 
*/ - if (spin_cnt > sleep_cnt) + if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); #endif /* * TODO: acquire "owner of record" here. Here be turnstile dragons * however. turnstiles don't like owners changing between calls to * turnstile_wait() currently. */ LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_READER); LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line); WITNESS_LOCK(&rw->lock_object, 0, file, line); TD_LOCKS_INC(curthread); curthread->td_rw_rlocks++; } int __rw_try_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; uintptr_t x; if (SCHEDULER_STOPPED()) return (1); rw = rwlock2rw(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); for (;;) { x = rw->rw_lock; KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line)); if (!(x & RW_LOCK_READ)) break; if (atomic_cmpset_acq_ptr(&rw->rw_lock, x, x + RW_ONE_READER)) { LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file, line); WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); curthread->td_rw_rlocks++; return (1); } } LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line); return (0); } void _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; uintptr_t x, v, queue; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_runlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_RLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, 0, file, line); LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line); /* 
TODO: drop "owner of record" here. */ for (;;) { /* * See if there is more than one read lock held. If so, * just drop one and return. */ x = rw->rw_lock; if (RW_READERS(x) > 1) { if (atomic_cmpset_rel_ptr(&rw->rw_lock, x, x - RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, rw, (void *)x, (void *)(x - RW_ONE_READER)); break; } continue; } /* * If there aren't any waiters for a write lock, then try * to drop it quickly. */ if (!(x & RW_LOCK_WAITERS)) { MPASS((x & ~RW_LOCK_WRITE_SPINNER) == RW_READERS_LOCK(1)); if (atomic_cmpset_rel_ptr(&rw->rw_lock, x, RW_UNLOCKED)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, rw); break; } continue; } /* * Ok, we know we have waiters and we think we are the * last reader, so grab the turnstile lock. */ turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); MPASS(v & RW_LOCK_WAITERS); /* * Try to drop our lock leaving the lock in an unlocked * state. * * If you wanted to do explicit lock handoff you'd have to * do it here. You'd also want to use turnstile_signal() * and you'd have to handle the race where a higher * priority thread blocks on the write lock before the * thread you wake up actually runs and have the new thread * "steal" the lock. For now it's a lot simpler to just * wake up all of the waiters. * * As above, if we fail, then another thread might have * acquired a read lock, so drop the turnstile lock and * restart. */ x = RW_UNLOCKED; if (v & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; x |= (v & RW_LOCK_READ_WAITERS); } else queue = TS_SHARED_QUEUE; if (!atomic_cmpset_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v, x)) { turnstile_chain_unlock(&rw->lock_object); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded with waiters", __func__, rw); /* * Ok. The lock is released and all that's left is to * wake up the waiters. 
Note that the lock might not be * free anymore, but in that case the writers will just * block again if they run before the new lock holder(s) * release the lock. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); turnstile_broadcast(ts, queue); turnstile_unpend(ts, TS_SHARED_LOCK); turnstile_chain_unlock(&rw->lock_object); break; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_READER); TD_LOCKS_DEC(curthread); curthread->td_rw_rlocks--; } /* * This function is called when we are unable to obtain a write lock on the * first try. This means that at least one other thread holds either a * read or write lock. */ void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; #ifdef ADAPTIVE_RWLOCKS volatile struct thread *owner; int spintries = 0; int i; #endif uintptr_t v, x; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif +#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) + struct lock_delay_arg lda; +#endif #ifdef KDTRACE_HOOKS uintptr_t state; - u_int spin_cnt = 0; u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return; +#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, &rw_delay); +#endif rw = rwlock2rw(c); if (rw_wlocked(rw)) { KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE, ("%s: recursing but non-recursive rw %s @ %s:%d\n", __func__, rw->lock_object.lo_name, file, line)); rw->rw_recurse++; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw); return; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, rw->lock_object.lo_name, (void *)rw->rw_lock, file, line); #ifdef KDTRACE_HOOKS all_time -= lockstat_nsecs(&rw->lock_object); state = rw->rw_lock; #endif for (;;) { if (rw->rw_lock == RW_UNLOCKED && _rw_write_lock(rw, tid)) break; #ifdef KDTRACE_HOOKS - spin_cnt++; + 
lda.spin_cnt++; #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #ifdef ADAPTIVE_RWLOCKS /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. */ v = rw->rw_lock; owner = (struct thread *)RW_OWNER(v); if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); while ((struct thread*)RW_OWNER(rw->rw_lock) == owner && - TD_IS_RUNNING(owner)) { - cpu_spinwait(); -#ifdef KDTRACE_HOOKS - spin_cnt++; -#endif - } + TD_IS_RUNNING(owner)) + lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } if ((v & RW_LOCK_READ) && RW_READERS(v) && spintries < rowner_retries) { if (!(v & RW_LOCK_WRITE_SPINNER)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_WRITE_SPINNER)) { continue; } } spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); for (i = 0; i < rowner_loops; i++) { if ((rw->rw_lock & RW_LOCK_WRITE_SPINNER) == 0) break; cpu_spinwait(); } KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); #ifdef KDTRACE_HOOKS - spin_cnt += rowner_loops - i; + lda.spin_cnt += rowner_loops - i; #endif if (i != rowner_loops) continue; } #endif ts = turnstile_trywait(&rw->lock_object); v = rw->rw_lock; #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. 
*/ if (!(v & RW_LOCK_READ)) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * Check the waiter flags of this rwlock. * If the lock was released without leaving any pending * waiters queue, simply try to acquire it. * If a pending waiters queue is present, claim the lock * ownership and maintain the pending queue. */ x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); if ((v & ~x) == RW_UNLOCKED) { x &= ~RW_LOCK_WRITE_SPINNER; if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid | x)) { if (x) turnstile_claim(ts); else turnstile_cancel(ts); break; } turnstile_cancel(ts); continue; } /* * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to * set it. If we fail to set it, then loop back and try * again. */ if (!(v & RW_LOCK_WRITE_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_WRITE_WAITERS)) { turnstile_cancel(ts); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set write waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the write waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); #ifdef ADAPTIVE_RWLOCKS spintries = 0; #endif } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. 
*/ - if (spin_cnt > sleep_cnt) + if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); #endif LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_WRITER); } /* * This function is called if the first try at releasing a write lock failed. * This means that one of the 2 waiter bits must be set indicating that at * least one thread is waiting on this lock. */ void __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; uintptr_t v; int queue; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); if (rw_wlocked(rw) && rw_recursed(rw)) { rw->rw_recurse--; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw); return; } KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS), ("%s: neither of the waiter flags are set", __func__)); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, rw); turnstile_chain_lock(&rw->lock_object); ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); /* * Use the same algo as sx locks for now. Prefer waking up shared * waiters if we have any over writers. This is probably not ideal. * * 'v' is the value we are going to write back to rw_lock. If we * have waiters on both queues, we need to preserve the state of * the waiter flag for the queue we don't wake up. For now this is * hardcoded for the algorithm mentioned above. * * In the case of both readers and writers waiting we wakeup the * readers but leave the RW_LOCK_WRITE_WAITERS flag set. If a * new writer comes in before a reader it will claim the lock up * above. There is probably a potential priority inversion in * there that could be worked around either by waking both queues * of waiters or doing some complicated lock handoff gymnastics. 
*/ v = RW_UNLOCKED; if (rw->rw_lock & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; v |= (rw->rw_lock & RW_LOCK_READ_WAITERS); } else queue = TS_SHARED_QUEUE; /* Wake up all waiters for the specific queue. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw, queue == TS_SHARED_QUEUE ? "read" : "write"); turnstile_broadcast(ts, queue); atomic_store_rel_ptr(&rw->rw_lock, v); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&rw->lock_object); } /* * Attempt to do a non-blocking upgrade from a read lock to a write * lock. This will only succeed if this thread holds a single read * lock. Returns true if the upgrade succeeded and false otherwise. */ int __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; uintptr_t v, x, tid; struct turnstile *ts; int success; if (SCHEDULER_STOPPED()) return (1); rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_RLOCKED, file, line); /* * Attempt to switch from one reader to a writer. If there * are any write waiters, then we will have to lock the * turnstile first to prevent races with another writer * calling turnstile_wait() before we have claimed this * turnstile. So, do the simple case of no waiters first. */ tid = (uintptr_t)curthread; success = 0; for (;;) { v = rw->rw_lock; if (RW_READERS(v) > 1) break; if (!(v & RW_LOCK_WAITERS)) { success = atomic_cmpset_ptr(&rw->rw_lock, v, tid); if (!success) continue; break; } /* * Ok, we think we have waiters, so lock the turnstile. */ ts = turnstile_trywait(&rw->lock_object); v = rw->rw_lock; if (RW_READERS(v) > 1) { turnstile_cancel(ts); break; } /* * Try to switch from one reader to a writer again. This time * we honor the current state of the waiters flags. * If we obtain the lock with the flags set, then claim * ownership of the turnstile. 
*/ x = rw->rw_lock & RW_LOCK_WAITERS; success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x); if (success) { if (x) turnstile_claim(ts); else turnstile_cancel(ts); break; } turnstile_cancel(ts); } LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line); if (success) { curthread->td_rw_rlocks--; WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(rw__upgrade, rw); } return (success); } /* * Downgrade a write lock into a single read lock. */ void __rw_downgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; uintptr_t tid, v; int rwait, wwait; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_WLOCKED | RA_NOTRECURSED, file, line); #ifndef INVARIANTS if (rw_recursed(rw)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line); /* * Convert from a writer to a single reader. First we handle * the easy case with no waiters. If there are any waiters, we * lock the turnstile and "disown" the lock. */ tid = (uintptr_t)curthread; if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1))) goto out; /* * Ok, we think we have waiters, so lock the turnstile so we can * read the waiter flags without any races. */ turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock & RW_LOCK_WAITERS; rwait = v & RW_LOCK_READ_WAITERS; wwait = v & RW_LOCK_WRITE_WAITERS; MPASS(rwait | wwait); /* * Downgrade from a write lock while preserving waiters flag * and give up ownership of the turnstile. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); if (!wwait) v &= ~RW_LOCK_READ_WAITERS; atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v); /* * Wake other readers if there are no writers pending. Otherwise they * won't be able to acquire the lock anyway. 
*/ if (rwait && !wwait) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); } else turnstile_disown(ts); turnstile_chain_unlock(&rw->lock_object); out: curthread->td_rw_rlocks++; LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(rw__downgrade, rw); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef __rw_assert #endif /* * In the non-WITNESS case, rw_assert() can only detect that at least * *some* thread owns an rlock, but it cannot guarantee that *this* * thread owns an rlock. */ void __rw_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct rwlock *rw; if (panicstr != NULL) return; rw = rwlock2rw(c); switch (what) { case RA_LOCKED: case RA_LOCKED | RA_RECURSED: case RA_LOCKED | RA_NOTRECURSED: case RA_RLOCKED: case RA_RLOCKED | RA_RECURSED: case RA_RLOCKED | RA_NOTRECURSED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If some other thread has a write lock or we have one * and are asserting a read lock, fail. Also, if no one * has a lock at all, fail. */ if (rw->rw_lock == RW_UNLOCKED || (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED || rw_wowner(rw) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", rw->lock_object.lo_name, (what & RA_RLOCKED) ? 
"read " : "", file, line); if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) { if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } #endif break; case RA_WLOCKED: case RA_WLOCKED | RA_RECURSED: case RA_WLOCKED | RA_NOTRECURSED: if (rw_wowner(rw) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); break; case RA_UNLOCKED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If we hold a write lock fail. We can't reliably check * to see if we hold a read lock or not. */ if (rw_wowner(rw) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); #endif break; default: panic("Unknown rw lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB void db_show_rwlock(const struct lock_object *lock) { const struct rwlock *rw; struct thread *td; rw = (const struct rwlock *)lock; db_printf(" state: "); if (rw->rw_lock == RW_UNLOCKED) db_printf("UNLOCKED\n"); else if (rw->rw_lock == RW_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (rw->rw_lock & RW_LOCK_READ) db_printf("RLOCK: %ju locks\n", (uintmax_t)(RW_READERS(rw->rw_lock))); else { td = rw_wowner(rw); db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (rw_recursed(rw)) db_printf(" recursed: %u\n", rw->rw_recurse); } db_printf(" waiters: "); switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) { case RW_LOCK_READ_WAITERS: db_printf("readers\n"); break; case RW_LOCK_WRITE_WAITERS: 
db_printf("writers\n"); break; case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS: db_printf("readers and writers\n"); break; default: db_printf("none\n"); break; } } #endif Index: user/alc/PQ_LAUNDRY/sys/kern/kern_sx.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/kern_sx.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/kern/kern_sx.c (revision 303654) @@ -1,1258 +1,1289 @@ /*- * Copyright (c) 2007 Attilio Rao * Copyright (c) 2001 Jason Evans * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Shared/exclusive locks. 
This implementation attempts to ensure * deterministic lock granting behavior, so that slocks and xlocks are * interleaved. * * Priority propagation will not generally raise the priority of lock holders, * so should not be relied upon in combination with sx locks. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_sx.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include +#include #include #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #include #endif #ifdef DDB #include #endif #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #define ADAPTIVE_SX #endif CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE); #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* Handy macros for sleep queues. */ #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 /* * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We * drop Giant anytime we have to sleep or if we adaptively spin. */ #define GIANT_DECLARE \ int _giantcnt = 0; \ WITNESS_SAVE_DECL(Giant) \ #define GIANT_SAVE() do { \ if (mtx_owned(&Giant)) { \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _giantcnt++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define GIANT_RESTORE() do { \ if (_giantcnt > 0) { \ mtx_assert(&Giant, MA_NOTOWNED); \ while (_giantcnt--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) /* * Returns true if an exclusive lock is recursed. It assumes * curthread currently has an exclusive lock. 
*/ #define sx_recursed(sx) ((sx)->sx_recurse != 0) static void assert_sx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_sx(const struct lock_object *lock); #endif static void lock_sx(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_sx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_sx(struct lock_object *lock); struct lock_class lock_class_sx = { .lc_name = "sx", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_sx, #ifdef DDB .lc_ddb_show = db_show_sx, #endif .lc_lock = lock_sx, .lc_unlock = unlock_sx, #ifdef KDTRACE_HOOKS .lc_owner = owner_sx, #endif }; #ifndef INVARIANTS #define _sx_assert(sx, what, file, line) #endif #ifdef ADAPTIVE_SX static u_int asx_retries = 10; static u_int asx_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging"); SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, ""); SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, ""); + +static struct lock_delay_config sx_delay = { + .initial = 1000, + .step = 500, + .min = 100, + .max = 5000, +}; + +SYSCTL_INT(_debug_sx, OID_AUTO, delay_initial, CTLFLAG_RW, &sx_delay.initial, + 0, ""); +SYSCTL_INT(_debug_sx, OID_AUTO, delay_step, CTLFLAG_RW, &sx_delay.step, + 0, ""); +SYSCTL_INT(_debug_sx, OID_AUTO, delay_min, CTLFLAG_RW, &sx_delay.min, + 0, ""); +SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max, + 0, ""); + +static void +sx_delay_sysinit(void *dummy) +{ + + sx_delay.initial = mp_ncpus * 25; + sx_delay.step = (mp_ncpus * 25) / 2; + sx_delay.min = mp_ncpus * 5; + sx_delay.max = mp_ncpus * 25 * 10; +} +LOCK_DELAY_SYSINIT(sx_delay_sysinit); #endif void assert_sx(const struct lock_object *lock, int what) { sx_assert((const struct sx *)lock, what); } void lock_sx(struct lock_object *lock, uintptr_t how) { struct sx *sx; sx = (struct sx *)lock; if (how) sx_slock(sx); else 
sx_xlock(sx); } uintptr_t unlock_sx(struct lock_object *lock) { struct sx *sx; sx = (struct sx *)lock; sx_assert(sx, SA_LOCKED | SA_NOTRECURSED); if (sx_xlocked(sx)) { sx_xunlock(sx); return (0); } else { sx_sunlock(sx); return (1); } } #ifdef KDTRACE_HOOKS int owner_sx(const struct lock_object *lock, struct thread **owner) { const struct sx *sx = (const struct sx *)lock; uintptr_t x = sx->sx_lock; *owner = (struct thread *)SX_OWNER(x); return ((x & SX_LOCK_SHARED) != 0 ? (SX_SHARERS(x) != 0) : (*owner != NULL)); } #endif void sx_sysinit(void *arg) { struct sx_args *sargs = arg; sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } void sx_init_flags(struct sx *sx, const char *description, int opts) { int flags; MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK | SX_NOPROFILE | SX_NOADAPTIVE | SX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock, ("%s: sx_lock not aligned for %s: %p", __func__, description, &sx->sx_lock)); flags = LO_SLEEPABLE | LO_UPGRADABLE; if (opts & SX_DUPOK) flags |= LO_DUPOK; if (opts & SX_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & SX_NOWITNESS)) flags |= LO_WITNESS; if (opts & SX_RECURSE) flags |= LO_RECURSABLE; if (opts & SX_QUIET) flags |= LO_QUIET; if (opts & SX_NEW) flags |= LO_NEW; flags |= opts & SX_NOADAPTIVE; lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags); sx->sx_lock = SX_LOCK_UNLOCKED; sx->sx_recurse = 0; } void sx_destroy(struct sx *sx) { KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held")); KASSERT(sx->sx_recurse == 0, ("sx lock still recursed")); sx->sx_lock = SX_LOCK_DESTROYED; lock_destroy(&sx->lock_object); } int _sx_slock(struct sx *sx, int opts, const char *file, int line) { int error = 0; if (SCHEDULER_STOPPED()) return (0); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_slock() of destroyed sx @ %s:%d", 
file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL); error = __sx_slock(sx, opts, file, line); if (!error) { LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line); WITNESS_LOCK(&sx->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } return (error); } int sx_try_slock_(struct sx *sx, const char *file, int line) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); for (;;) { x = sx->sx_lock; KASSERT(x != SX_LOCK_DESTROYED, ("sx_try_slock() of destroyed sx @ %s:%d", file, line)); if (!(x & SX_LOCK_SHARED)) break; if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) { LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line); WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); return (1); } } LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line); return (0); } int _sx_xlock(struct sx *sx, int opts, const char *file, int line) { int error = 0; if (SCHEDULER_STOPPED()) return (0); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xlock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); error = __sx_xlock(sx, curthread, opts, file, line); if (!error) { LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } return (error); } int sx_try_xlock_(struct sx *sx, const char *file, int line) { int rval; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_xlock() by idle 
thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_xlock() of destroyed sx @ %s:%d", file, line)); if (sx_xlocked(sx) && (sx->lock_object.lo_flags & LO_RECURSABLE) != 0) { sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); rval = 1; } else rval = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, (uintptr_t)curthread); LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!sx_recursed(sx)) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } void _sx_sunlock(struct sx *sx, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_sunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line); __sx_sunlock(sx, file, line); TD_LOCKS_DEC(curthread); } void _sx_xunlock(struct sx *sx, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); __sx_xunlock(sx, curthread, file, line); TD_LOCKS_DEC(curthread); } /* * Try to do a non-blocking upgrade from a shared lock to an exclusive lock. * This will only succeed if this thread holds a single shared lock. * Return 1 if if the upgrade succeed, 0 otherwise. 
*/ int sx_try_upgrade_(struct sx *sx, const char *file, int line) { uintptr_t x; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); /* * Try to switch from one shared lock to an exclusive lock. We need * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that * we will wake up the exclusive waiters when we drop the lock. */ x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS; success = atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x, (uintptr_t)curthread | x); LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line); if (success) { WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(sx__upgrade, sx); } return (success); } /* * Downgrade an unrecursed exclusive lock into a single shared lock. */ void sx_downgrade_(struct sx *sx, const char *file, int line) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_downgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line); #ifndef INVARIANTS if (sx_recursed(sx)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line); /* * Try to switch from an exclusive lock with no shared waiters * to one sharer with no shared waiters. If there are * exclusive waiters, we don't need to lock the sleep queue so * long as we preserve the flag. We do one quick try and if * that fails we grab the sleepq lock to keep the flags from * changing and do it the slow way. * * We have to lock the sleep queue if there are shared waiters * so we can wake them up. 
*/ x = sx->sx_lock; if (!(x & SX_LOCK_SHARED_WAITERS) && atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS))) { LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); return; } /* * Lock the sleep queue so we can read the waiters bits * without any races and wakeup any shared waiters. */ sleepq_lock(&sx->lock_object); /* * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single * shared lock. If there are any shared waiters, wake them up. */ wakeup_swapper = 0; x = sx->sx_lock; atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS)); if (x & SX_LOCK_SHARED_WAITERS) wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_SHARED_QUEUE); sleepq_release(&sx->lock_object); LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(sx__downgrade, sx); if (wakeup_swapper) kick_proc0(); } /* * This function represents the so-called 'hard case' for sx_xlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ int _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file, int line) { GIANT_DECLARE; #ifdef ADAPTIVE_SX volatile struct thread *owner; u_int i, spintries = 0; #endif uintptr_t x; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; +#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) + struct lock_delay_arg lda; +#endif #ifdef KDTRACE_HOOKS uintptr_t state; - u_int spin_cnt = 0; u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return (0); +#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, &sx_delay); +#endif + /* If we already hold an exclusive lock, then recurse. 
*/ if (sx_xlocked(sx)) { KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0, ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n", sx->lock_object.lo_name, file, line)); sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx); return (0); } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, sx->lock_object.lo_name, (void *)sx->sx_lock, file, line); #ifdef KDTRACE_HOOKS all_time -= lockstat_nsecs(&sx->lock_object); state = sx->sx_lock; #endif for (;;) { if (sx->sx_lock == SX_LOCK_UNLOCKED && atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) break; #ifdef KDTRACE_HOOKS - spin_cnt++; + lda.spin_cnt++; #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef ADAPTIVE_SX /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. 
*/ x = sx->sx_lock; if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { if ((x & SX_LOCK_SHARED) == 0) { x = SX_OWNER(x); owner = (struct thread *)x; if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(); while (SX_OWNER(sx->sx_lock) == x && - TD_IS_RUNNING(owner)) { - cpu_spinwait(); -#ifdef KDTRACE_HOOKS - spin_cnt++; -#endif - } + TD_IS_RUNNING(owner)) + lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else if (SX_SHARERS(x) && spintries < asx_retries) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(); spintries++; for (i = 0; i < asx_loops; i++) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, sx, spintries, i); x = sx->sx_lock; if ((x & SX_LOCK_SHARED) == 0 || SX_SHARERS(x) == 0) break; cpu_spinwait(); #ifdef KDTRACE_HOOKS - spin_cnt++; + lda.spin_cnt++; #endif } KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i != asx_loops) continue; } } #endif sleepq_lock(&sx->lock_object); x = sx->sx_lock; /* * If the lock was released while spinning on the * sleep queue chain lock, try again. */ if (x == SX_LOCK_UNLOCKED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the sleep queue * chain lock. If so, drop the sleep queue lock and try * again. 
*/ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); continue; } } #endif /* * If an exclusive lock was released with both shared * and exclusive waiters and a shared waiter hasn't * woken up and acquired the lock yet, sx_lock will be * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS. * If we see that value, try to acquire it once. Note * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS * as there are other exclusive waiters still. If we * fail, restart the loop. */ if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) { if (atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS, tid | SX_LOCK_EXCLUSIVE_WAITERS)) { sleepq_release(&sx->lock_object); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, sx); break; } sleepq_release(&sx->lock_object); continue; } /* * Try to set the SX_LOCK_EXCLUSIVE_WAITERS. If we fail, * than loop back and retry. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { if (!atomic_cmpset_ptr(&sx->sx_lock, x, x | SX_LOCK_EXCLUSIVE_WAITERS)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set excl waiters flag", __func__, sx); } /* * Since we have been unable to acquire the exclusive * lock and the exclusive waiters flag is set, we have * to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif GIANT_SAVE(); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? 
SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); - if (spin_cnt > sleep_cnt) + if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); #endif if (!error) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_WRITER); GIANT_RESTORE(); return (error); } /* * This function represents the so-called 'hard case' for sx_xunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line) { uintptr_t x; int queue, wakeup_swapper; if (SCHEDULER_STOPPED()) return; MPASS(!(sx->sx_lock & SX_LOCK_SHARED)); /* If the lock is recursed, then unrecurse one level. 
*/ if (sx_xlocked(sx) && sx_recursed(sx)) { if ((--sx->sx_recurse) == 0) atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx); return; } MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, sx); sleepq_lock(&sx->lock_object); x = SX_LOCK_UNLOCKED; /* * The wake up algorithm here is quite simple and probably not * ideal. It gives precedence to shared waiters if they are * present. For this condition, we have to preserve the * state of the exclusive waiters flag. * If interruptible sleeps left the shared queue empty avoid a * starvation for the threads sleeping on the exclusive queue by giving * them precedence and cleaning up the shared waiters bit anyway. */ if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 && sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) { queue = SQ_SHARED_QUEUE; x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS); } else queue = SQ_EXCLUSIVE_QUEUE; /* Wake up all the waiters for the specific queue. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue", __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); atomic_store_rel_ptr(&sx->sx_lock, x); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, queue); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); } /* * This function represents the so-called 'hard case' for sx_slock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. 
*/ int _sx_slock_hard(struct sx *sx, int opts, const char *file, int line) { GIANT_DECLARE; #ifdef ADAPTIVE_SX volatile struct thread *owner; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif uintptr_t x; int error = 0; +#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) + struct lock_delay_arg lda; +#endif #ifdef KDTRACE_HOOKS uintptr_t state; - u_int spin_cnt = 0; u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return (0); +#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, &sx_delay); +#endif #ifdef KDTRACE_HOOKS state = sx->sx_lock; all_time -= lockstat_nsecs(&sx->lock_object); #endif /* * As with rwlocks, we don't make any attempt to try to block * shared locks once there is an exclusive waiter. */ for (;;) { #ifdef KDTRACE_HOOKS - spin_cnt++; + lda.spin_cnt++; #endif x = sx->sx_lock; /* * If no other thread has an exclusive lock then try to bump up * the count of sharers. Since we have to preserve the state * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the * shared lock loop back and retry. */ if (x & SX_LOCK_SHARED) { MPASS(!(x & SX_LOCK_SHARED_WAITERS)); if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, sx, (void *)x, (void *)(x + SX_ONE_SHARER)); break; } continue; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. 
*/ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { x = SX_OWNER(x); owner = (struct thread *)x; if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(); while (SX_OWNER(sx->sx_lock) == x && - TD_IS_RUNNING(owner)) { - cpu_spinwait(); -#ifdef KDTRACE_HOOKS - spin_cnt++; -#endif - } + TD_IS_RUNNING(owner)) + lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } #endif /* * Some other thread already has an exclusive lock, so * start the process of blocking. */ sleepq_lock(&sx->lock_object); x = sx->sx_lock; /* * The lock could have been released while we spun. * In this case loop back and retry. */ if (x & SX_LOCK_SHARED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); continue; } } #endif /* * Try to set the SX_LOCK_SHARED_WAITERS flag. If we * fail to set it drop the sleep queue lock and loop * back. */ if (!(x & SX_LOCK_SHARED_WAITERS)) { if (!atomic_cmpset_ptr(&sx->sx_lock, x, x | SX_LOCK_SHARED_WAITERS)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set shared waiters flag", __func__, sx); } /* * Since we have been unable to acquire the shared lock, * we have to sleep. 
*/ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif GIANT_SAVE(); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); - if (spin_cnt > sleep_cnt) + if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); #endif if (error == 0) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_READER); GIANT_RESTORE(); return (error); } /* * This function represents the so-called 'hard case' for sx_sunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_sunlock_hard(struct sx *sx, const char *file, int line) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; for (;;) { x = sx->sx_lock; /* * We should never have sharers while at least one thread * holds a shared lock. 
*/ KASSERT(!(x & SX_LOCK_SHARED_WAITERS), ("%s: waiting sharers", __func__)); /* * See if there is more than one shared lock held. If * so, just drop one and return. */ if (SX_SHARERS(x) > 1) { if (atomic_cmpset_rel_ptr(&sx->sx_lock, x, x - SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, sx, (void *)x, (void *)(x - SX_ONE_SHARER)); break; } continue; } /* * If there aren't any waiters for an exclusive lock, * then try to drop it quickly. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { MPASS(x == SX_SHARERS_LOCK(1)); if (atomic_cmpset_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1), SX_LOCK_UNLOCKED)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, sx); break; } continue; } /* * At this point, there should just be one sharer with * exclusive waiters. */ MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS)); sleepq_lock(&sx->lock_object); /* * Wake up semantic here is quite simple: * Just wake up all the exclusive waiters. * Note that the state of the lock could have changed, * so if it fails loop back and retry. */ if (!atomic_cmpset_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS, SX_LOCK_UNLOCKED)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p waking up all thread on" "exclusive queue", __func__, sx); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_EXCLUSIVE_QUEUE); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); break; } } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _sx_assert #endif /* * In the non-WITNESS case, sx_assert() can only detect that at least * *some* thread owns an slock, but it cannot guarantee that *this* * thread owns an slock. 
*/ void _sx_assert(const struct sx *sx, int what, const char *file, int line) { #ifndef WITNESS int slocked = 0; #endif if (panicstr != NULL) return; switch (what) { case SA_SLOCKED: case SA_SLOCKED | SA_NOTRECURSED: case SA_SLOCKED | SA_RECURSED: #ifndef WITNESS slocked = 1; /* FALLTHROUGH */ #endif case SA_LOCKED: case SA_LOCKED | SA_NOTRECURSED: case SA_LOCKED | SA_RECURSED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If some other thread has an exclusive lock or we * have one and are asserting a shared lock, fail. * Also, if no one has a lock at all, fail. */ if (sx->sx_lock == SX_LOCK_UNLOCKED || (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked || sx_xholder(sx) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", sx->lock_object.lo_name, slocked ? "share " : "", file, line); if (!(sx->sx_lock & SX_LOCK_SHARED)) { if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } #endif break; case SA_XLOCKED: case SA_XLOCKED | SA_NOTRECURSED: case SA_XLOCKED | SA_RECURSED: if (sx_xholder(sx) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); break; case SA_UNLOCKED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If we hold an exclusve lock fail. We can't * reliably check to see if we hold a shared lock or * not. 
*/ if (sx_xholder(sx) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); #endif break; default: panic("Unknown sx lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB static void db_show_sx(const struct lock_object *lock) { struct thread *td; const struct sx *sx; sx = (const struct sx *)lock; db_printf(" state: "); if (sx->sx_lock == SX_LOCK_UNLOCKED) db_printf("UNLOCKED\n"); else if (sx->sx_lock == SX_LOCK_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else { td = sx_xholder(sx); db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (sx_recursed(sx)) db_printf(" recursed: %d\n", sx->sx_recurse); } db_printf(" waiters: "); switch(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) { case SX_LOCK_SHARED_WAITERS: db_printf("shared\n"); break; case SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive and shared\n"); break; default: db_printf("none\n"); } } /* * Check to see if a thread that is blocked on a sleep queue is actually * blocked on an sx lock. If so, output some details and return true. * If the lock has an exclusive owner, return that in *ownerp. */ int sx_chain(struct thread *td, struct thread **ownerp) { struct sx *sx; /* * Check to see if this thread is blocked on an sx lock. * First, we check the lock class. If that is ok, then we * compare the lock name against the wait message. */ sx = td->td_wchan; if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx || sx->lock_object.lo_name != td->td_wmesg) return (0); /* We think we have an sx lock, so output some details. 
*/ db_printf("blocked on sx \"%s\" ", td->td_wmesg); *ownerp = sx_xholder(sx); if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK (count %ju)\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else db_printf("XLOCK\n"); return (1); } #endif Index: user/alc/PQ_LAUNDRY/sys/kern/subr_lock.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/subr_lock.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/kern/subr_lock.c (revision 303654) @@ -1,646 +1,674 @@ /*- * Copyright (c) 2006 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and functions used to maintain * lock_object structures. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mprof.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include CTASSERT(LOCK_CLASS_MAX == 15); struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = { &lock_class_mtx_spin, &lock_class_mtx_sleep, &lock_class_sx, &lock_class_rm, &lock_class_rm_sleepable, &lock_class_rw, &lock_class_lockmgr, }; void lock_init(struct lock_object *lock, struct lock_class *class, const char *name, const char *type, int flags) { int i; /* Check for double-init and zero object. */ KASSERT(flags & LO_NEW || !lock_initialized(lock), ("lock \"%s\" %p already initialized", name, lock)); /* Look up lock class to find its index. */ for (i = 0; i < LOCK_CLASS_MAX; i++) if (lock_classes[i] == class) { lock->lo_flags = i << LO_CLASSSHIFT; break; } KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class)); /* Initialize the lock object. */ lock->lo_name = name; lock->lo_flags |= flags | LO_INITIALIZED; LOCK_LOG_INIT(lock, 0); WITNESS_INIT(lock, (type != NULL) ? 
type : name); } void lock_destroy(struct lock_object *lock) { KASSERT(lock_initialized(lock), ("lock %p is not initialized", lock)); WITNESS_DESTROY(lock); LOCK_LOG_DESTROY(lock, 0); lock->lo_flags &= ~LO_INITIALIZED; } +void +lock_delay(struct lock_delay_arg *la) +{ + u_int i, delay, backoff, min, max; + struct lock_delay_config *lc = la->config; + + delay = la->delay; + + if (delay == 0) + delay = lc->initial; + else { + delay += lc->step; + max = lc->max; + if (delay > max) + delay = max; + } + + backoff = cpu_ticks() % delay; + min = lc->min; + if (backoff < min) + backoff = min; + for (i = 0; i < backoff; i++) + cpu_spinwait(); + + la->delay = delay; + la->spin_cnt += backoff; +} + #ifdef DDB DB_SHOW_COMMAND(lock, db_show_lock) { struct lock_object *lock; struct lock_class *class; if (!have_addr) return; lock = (struct lock_object *)addr; if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) { db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock)); return; } class = LOCK_CLASS(lock); db_printf(" class: %s\n", class->lc_name); db_printf(" name: %s\n", lock->lo_name); class->lc_ddb_show(lock); } #endif #ifdef LOCK_PROFILING /* * One object per-thread for each lock the thread owns. Tracks individual * lock instances. */ struct lock_profile_object { LIST_ENTRY(lock_profile_object) lpo_link; struct lock_object *lpo_obj; const char *lpo_file; int lpo_line; uint16_t lpo_ref; uint16_t lpo_cnt; uint64_t lpo_acqtime; uint64_t lpo_waittime; u_int lpo_contest_locking; }; /* * One lock_prof for each (file, line, lock object) triple. 
*/ struct lock_prof { SLIST_ENTRY(lock_prof) link; struct lock_class *class; const char *file; const char *name; int line; int ticks; uintmax_t cnt_wait_max; uintmax_t cnt_max; uintmax_t cnt_tot; uintmax_t cnt_wait; uintmax_t cnt_cur; uintmax_t cnt_contest_locking; }; SLIST_HEAD(lphead, lock_prof); #define LPROF_HASH_SIZE 4096 #define LPROF_HASH_MASK (LPROF_HASH_SIZE - 1) #define LPROF_CACHE_SIZE 4096 /* * Array of objects and profs for each type of object for each cpu. Spinlocks * are handled separately because a thread may be preempted and acquire a * spinlock while in the lock profiling code of a non-spinlock. In this way * we only need a critical section to protect the per-cpu lists. */ struct lock_prof_type { struct lphead lpt_lpalloc; struct lpohead lpt_lpoalloc; struct lphead lpt_hash[LPROF_HASH_SIZE]; struct lock_prof lpt_prof[LPROF_CACHE_SIZE]; struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE]; }; struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. 
*/ }; struct lock_prof_cpu *lp_cpu[MAXCPU]; volatile int lock_prof_enable = 0; static volatile int lock_prof_resetting; #define LPROF_SBUF_SIZE 256 static int lock_prof_rejected; static int lock_prof_skipspin; static int lock_prof_skipcount; #ifndef USE_CPU_NANOSECONDS uint64_t nanoseconds(void) { struct bintime bt; uint64_t ns; binuptime(&bt); /* From bintime2timespec */ ns = bt.sec * (uint64_t)1000000000; ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32; return (ns); } #endif static void lock_prof_init_type(struct lock_prof_type *type) { int i; SLIST_INIT(&type->lpt_lpalloc); LIST_INIT(&type->lpt_lpoalloc); for (i = 0; i < LPROF_CACHE_SIZE; i++) { SLIST_INSERT_HEAD(&type->lpt_lpalloc, &type->lpt_prof[i], link); LIST_INSERT_HEAD(&type->lpt_lpoalloc, &type->lpt_objs[i], lpo_link); } } static void lock_prof_init(void *arg) { int cpu; for (cpu = 0; cpu <= mp_maxid; cpu++) { lp_cpu[cpu] = malloc(sizeof(*lp_cpu[cpu]), M_DEVBUF, M_WAITOK | M_ZERO); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[0]); lock_prof_init_type(&lp_cpu[cpu]->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); static void lock_prof_reset_wait(void) { /* * Spin relinquishing our cpu so that quiesce_all_cpus may * complete. */ while (lock_prof_resetting) sched_relinquish(curthread); } static void lock_prof_reset(void) { struct lock_prof_cpu *lpc; int enabled, i, cpu; /* * We not only race with acquiring and releasing locks but also * thread exit. To be certain that threads exit without valid head * pointers they must see resetting set before enabled is cleared. * Otherwise a lock may not be removed from a per-thread list due * to disabled being set but not wait for reset() to remove it below. */ atomic_store_rel_int(&lock_prof_resetting, 1); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profreset", 0); /* * Some objects may have migrated between CPUs. Clear all links * before we zero the structures. 
Some items may still be linked * into per-thread lists as well. */ for (cpu = 0; cpu <= mp_maxid; cpu++) { lpc = lp_cpu[cpu]; for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } for (cpu = 0; cpu <= mp_maxid; cpu++) { lpc = lp_cpu[cpu]; bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); } atomic_store_rel_int(&lock_prof_resetting, 0); lock_prof_enable = enabled; } static void lock_prof_output(struct lock_prof *lp, struct sbuf *sb) { const char *p; for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3); sbuf_printf(sb, "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n", lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000, lp->cnt_wait / 1000, lp->cnt_cur, lp->cnt_cur == 0 ? (uintmax_t)0 : lp->cnt_tot / (lp->cnt_cur * 1000), lp->cnt_cur == 0 ? (uintmax_t)0 : lp->cnt_wait / (lp->cnt_cur * 1000), (uintmax_t)0, lp->cnt_contest_locking, p, lp->line, lp->class->lc_name, lp->name); } static void lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, int spin, int t) { struct lock_prof_type *type; struct lock_prof *l; int cpu; dst->file = match->file; dst->line = match->line; dst->class = match->class; dst->name = match->name; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (lp_cpu[cpu] == NULL) continue; type = &lp_cpu[cpu]->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; if (l->file != match->file || l->line != match->line || l->name != match->name) continue; l->ticks = t; if (l->cnt_max > dst->cnt_max) dst->cnt_max = l->cnt_max; if (l->cnt_wait_max > dst->cnt_wait_max) dst->cnt_wait_max = l->cnt_wait_max; dst->cnt_tot += l->cnt_tot; dst->cnt_wait += l->cnt_wait; dst->cnt_cur += l->cnt_cur; dst->cnt_contest_locking += l->cnt_contest_locking; } } } static void lock_prof_type_stats(struct lock_prof_type *type, struct sbuf 
*sb, int spin, int t) { struct lock_prof *l; int i; for (i = 0; i < LPROF_HASH_SIZE; ++i) { SLIST_FOREACH(l, &type->lpt_hash[i], link) { struct lock_prof lp = {}; if (l->ticks == t) continue; lock_prof_sum(l, &lp, i, spin, t); lock_prof_output(&lp, sb); } } } static int dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error, cpu, t; int enabled; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req); sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n", "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name"); enabled = lock_prof_enable; lock_prof_enable = 0; quiesce_all_cpus("profstat", 0); t = ticks; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (lp_cpu[cpu] == NULL) continue; lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[0], sb, 0, t); lock_prof_type_stats(&lp_cpu[cpu]->lpc_types[1], sb, 1, t); } lock_prof_enable = enabled; error = sbuf_finish(sb); /* Output a trailing NUL. 
*/ if (error == 0) error = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (error); } static int enable_lock_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = lock_prof_enable; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == lock_prof_enable) return (0); if (v == 1) lock_prof_reset(); lock_prof_enable = !!v; return (0); } static int reset_lock_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); lock_prof_reset(); return (0); } static struct lock_prof * lock_profile_lookup(struct lock_object *lo, int spin, const char *file, int line) { const char *unknown = "(unknown)"; struct lock_prof_type *type; struct lock_prof *lp; struct lphead *head; const char *p; u_int hash; p = file; if (p == NULL || *p == '\0') p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && lp->name == lo->lo_name) return (lp); } lp = SLIST_FIRST(&type->lpt_lpalloc); if (lp == NULL) { lock_prof_rejected++; return (lp); } SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link); lp->file = p; lp->line = line; lp->class = LOCK_CLASS(lo); lp->name = lo->lo_name; SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link); return (lp); } static struct lock_profile_object * lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, int line) { struct lock_profile_object *l; struct lock_prof_type *type; struct lpohead *head; head = &curthread->td_lprof[spin]; LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) { lock_prof_rejected++; return 
(NULL); } LIST_REMOVE(l, lpo_link); l->lpo_obj = lo; l->lpo_file = file; l->lpo_line = line; l->lpo_cnt = 0; LIST_INSERT_HEAD(head, l, lpo_link); return (l); } void lock_profile_obtain_lock_success(struct lock_object *lo, int contested, uint64_t waittime, const char *file, int line) { static int lock_prof_count; struct lock_profile_object *l; int spin; if (SCHEDULER_STOPPED()) return; /* don't reset the timer when/if recursing */ if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE)) return; if (lock_prof_skipcount && (++lock_prof_count % lock_prof_skipcount) != 0) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; if (spin && lock_prof_skipspin == 1) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0) goto out; l = lock_profile_object_lookup(lo, spin, file, line); if (l == NULL) goto out; l->lpo_cnt++; if (++l->lpo_ref > 1) goto out; l->lpo_contest_locking = contested; l->lpo_acqtime = nanoseconds(); if (waittime && (l->lpo_acqtime > waittime)) l->lpo_waittime = l->lpo_acqtime - waittime; else l->lpo_waittime = 0; out: critical_exit(); } void lock_profile_thread_exit(struct thread *td) { #ifdef INVARIANTS struct lock_profile_object *l; MPASS(curthread->td_critnest == 0); #endif /* * If lock profiling was disabled we have to wait for reset to * clear our pointers before we can exit safely. 
*/ lock_prof_reset_wait(); #ifdef INVARIANTS LIST_FOREACH(l, &td->td_lprof[0], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); LIST_FOREACH(l, &td->td_lprof[1], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); #endif MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL); MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL); } void lock_profile_release_lock(struct lock_object *lo) { struct lock_profile_object *l; struct lock_prof_type *type; struct lock_prof *lp; uint64_t curtime, holdtime; struct lpohead *head; int spin; if (SCHEDULER_STOPPED()) return; if (lo->lo_flags & LO_NOPROFILE) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; head = &curthread->td_lprof[spin]; if (LIST_FIRST(head) == NULL) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0 && lock_prof_resetting == 1) goto out; /* * If lock profiling is not enabled we still want to remove the * lpo from our queue. */ LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo) break; if (l == NULL) goto out; if (--l->lpo_ref > 0) goto out; lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line); if (lp == NULL) goto release; curtime = nanoseconds(); if (curtime < l->lpo_acqtime) goto release; holdtime = curtime - l->lpo_acqtime; /* * Record if the lock has been held longer now than ever * before. 
*/ if (holdtime > lp->cnt_max) lp->cnt_max = holdtime; if (l->lpo_waittime > lp->cnt_wait_max) lp->cnt_wait_max = l->lpo_waittime; lp->cnt_tot += holdtime; lp->cnt_wait += l->lpo_waittime; lp->cnt_contest_locking += l->lpo_contest_locking; lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); type = &lp_cpu[PCPU_GET(cpuid)]->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: critical_exit(); } static SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging"); static SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL, "lock profiling"); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW, &lock_prof_skipspin, 0, "Skip profiling on spinlocks."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW, &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD, &lock_prof_rejected, 0, "Number of rejected profiling records"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW, NULL, 0, enable_lock_prof, "I", "Enable lock profiling"); #endif Index: user/alc/PQ_LAUNDRY/sys/net/route.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/net/route.h (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/net/route.h (revision 303654) @@ -1,498 +1,498 @@ /*- * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)route.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _NET_ROUTE_H_ #define _NET_ROUTE_H_ #include #include /* * Kernel resident routing tables. * * The routing tables are initialized when interface addresses * are set by making entries for all directly connected interfaces. */ /* * Struct route consiste of a destination address, * a route entry pointer, link-layer prepend data pointer along * with its length. */ struct route { struct rtentry *ro_rt; struct llentry *ro_lle; /* * ro_prepend and ro_plen are only used for bpf to pass in a * preformed header. They are not cacheable. 
*/ char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; /* saved ro_rt mtu */ uint16_t spare; struct sockaddr ro_dst; }; #define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ #define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ #define RT_HAS_HEADER_BIT 4 /* mbuf already have its header prepended */ #define RT_CACHING_CONTEXT 0x1 /* XXX: not used anywhere */ #define RT_NORTREF 0x2 /* doesn't hold reference on ro_rt */ #define RT_L2_ME (1 << RT_L2_ME_BIT) /* 0x0004 */ #define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) /* 0x0008 */ #define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) /* 0x0010 */ #define RT_REJECT 0x0020 /* Destination is reject */ #define RT_BLACKHOLE 0x0040 /* Destination is blackhole */ #define RT_HAS_GW 0x0080 /* Destination has GW */ #define RT_LLE_CACHE 0x0100 /* Cache link layer */ struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_hopcount; /* max hops expected */ u_long rmx_expire; /* lifetime for route, e.g. redirect */ u_long rmx_recvpipe; /* inbound delay-bandwidth product */ u_long rmx_sendpipe; /* outbound delay-bandwidth product */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ u_long rmx_rtt; /* estimated round trip time */ u_long rmx_rttvar; /* estimated rtt variance */ u_long rmx_pksent; /* packets sent using this route */ u_long rmx_weight; /* route weight */ u_long rmx_filler[3]; /* will be used for T/TCP later */ }; /* * rmx_rtt and rmx_rttvar are stored as microseconds; * RTTTOPRHZ(rtt) converts to a value suitable for use * by a protocol slowtimo counter. */ #define RTM_RTTUNIT 1000000 /* units for rtt, rttvar, as units per sec */ #define RTTTOPRHZ(r) ((r) / (RTM_RTTUNIT / PR_SLOWHZ)) /* lle state is exported in rmx_state rt_metrics field */ #define rmx_state rmx_weight /* * Keep a generation count of routing table, incremented on route addition, * so we can invalidate caches. 
This is accessed without a lock, as precision * is not required. */ typedef volatile u_int rt_gen_t; /* tree generation (for adds) */ #define RT_GEN(fibnum, af) rt_tables_get_gen(fibnum, af) #define RT_DEFAULT_FIB 0 /* Explicitly mark fib=0 restricted cases */ #define RT_ALL_FIBS -1 /* Announce event for every fib */ #ifdef _KERNEL extern u_int rt_numfibs; /* number of usable routing tables */ VNET_DECLARE(u_int, rt_add_addr_allfibs); /* Announce interfaces to all fibs */ #define V_rt_add_addr_allfibs VNET(rt_add_addr_allfibs) #endif /* * We distinguish between routes to hosts and routes to networks, * preferring the former if available. For each route we infer * the interface to use from the gateway address supplied when * the route was entered. Routes that forward packets through * gateways are marked so that the output routines know to address the * gateway rather than the ultimate destination. */ #ifndef RNF_NORMAL #include #ifdef RADIX_MPATH #include #endif #endif #if defined(_KERNEL) || defined(_WANT_RTENTRY) struct rtentry { struct radix_node rt_nodes[2]; /* tree glue, and other values */ /* * XXX struct rtentry must begin with a struct radix_node (or two!) * because the code does some casts of a 'struct radix_node *' * to a 'struct rtentry *' */ #define rt_key(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_key))) #define rt_mask(r) (*((struct sockaddr **)(&(r)->rt_nodes->rn_mask))) struct sockaddr *rt_gateway; /* value */ struct ifnet *rt_ifp; /* the answer: interface to use */ struct ifaddr *rt_ifa; /* the answer: interface address to use */ int rt_flags; /* up/down?, host/net */ int rt_refcnt; /* # held references */ u_int rt_fibnum; /* which FIB */ u_long rt_mtu; /* MTU for this path */ u_long rt_weight; /* absolute weight */ u_long rt_expire; /* lifetime for route, e.g. 
redirect */ #define rt_endzero rt_pksent counter_u64_t rt_pksent; /* packets sent using this route */ struct mtx rt_mtx; /* mutex for routing entry */ struct rtentry *rt_chain; /* pointer to next rtentry to delete */ }; #endif /* _KERNEL || _WANT_RTENTRY */ #define RTF_UP 0x1 /* route usable */ #define RTF_GATEWAY 0x2 /* destination is a gateway */ #define RTF_HOST 0x4 /* host entry (net otherwise) */ #define RTF_REJECT 0x8 /* host or net unreachable */ #define RTF_DYNAMIC 0x10 /* created dynamically (by redirect) */ #define RTF_MODIFIED 0x20 /* modified dynamically (by redirect) */ #define RTF_DONE 0x40 /* message confirmed */ /* 0x80 unused, was RTF_DELCLONE */ /* 0x100 unused, was RTF_CLONING */ #define RTF_XRESOLVE 0x200 /* external daemon resolves name */ #define RTF_LLINFO 0x400 /* DEPRECATED - exists ONLY for backward compatibility */ #define RTF_LLDATA 0x400 /* used by apps to add/del L2 entries */ #define RTF_STATIC 0x800 /* manually added */ #define RTF_BLACKHOLE 0x1000 /* just discard pkts (during updates) */ #define RTF_PROTO2 0x4000 /* protocol specific routing flag */ #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ /* 0x10000 unused, was RTF_PRCLONING */ /* 0x20000 unused, was RTF_WASCLONED */ #define RTF_PROTO3 0x40000 /* protocol specific routing flag */ #define RTF_FIXEDMTU 0x80000 /* MTU was explicitly specified */ #define RTF_PINNED 0x100000 /* route is immutable */ #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ /* 0x8000000 and up unassigned */ #define RTF_STICKY 0x10000000 /* always route dst->src */ #define RTF_RNH_LOCKED 0x40000000 /* unused */ #define RTF_GWFLAG_COMPAT 0x80000000 /* a compatibility bit for interacting with existing routing apps */ /* Mask of RTF flags that are allowed to be modified by RTM_CHANGE. 
*/ #define RTF_FMASK \ (RTF_PROTO1 | RTF_PROTO2 | RTF_PROTO3 | RTF_BLACKHOLE | \ RTF_REJECT | RTF_STATIC | RTF_STICKY) /* * fib_ nexthop API flags. */ /* Consumer-visible nexthop info flags */ #define NHF_REJECT 0x0010 /* RTF_REJECT */ #define NHF_BLACKHOLE 0x0020 /* RTF_BLACKHOLE */ #define NHF_REDIRECT 0x0040 /* RTF_DYNAMIC|RTF_MODIFIED */ #define NHF_DEFAULT 0x0080 /* Default route */ #define NHF_BROADCAST 0x0100 /* RTF_BROADCAST */ #define NHF_GATEWAY 0x0200 /* RTF_GATEWAY */ /* Nexthop request flags */ #define NHR_IFAIF 0x01 /* Return ifa_ifp interface */ #define NHR_REF 0x02 /* For future use */ /* Control plane route request flags */ #define NHR_COPY 0x100 /* Copy rte data */ #ifdef _KERNEL /* rte<>ro_flags translation */ static inline void rt_update_ro_flags(struct route *ro) { int rt_flags = ro->ro_rt->rt_flags; ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW); ro->ro_flags |= (rt_flags & RTF_REJECT) ? RT_REJECT : 0; ro->ro_flags |= (rt_flags & RTF_BLACKHOLE) ? RT_BLACKHOLE : 0; ro->ro_flags |= (rt_flags & RTF_GATEWAY) ? RT_HAS_GW : 0; } #endif /* * Routing statistics. */ struct rtstat { short rts_badredirect; /* bogus redirect calls */ short rts_dynamic; /* routes created by redirects */ short rts_newgateway; /* routes modified by redirects */ short rts_unreach; /* lookups which failed */ short rts_wildcard; /* lookups satisfied by a wildcard */ }; /* * Structures for routing messages. */ struct rt_msghdr { u_short rtm_msglen; /* to skip over non-understood messages */ u_char rtm_version; /* future binary compatibility */ u_char rtm_type; /* message type */ u_short rtm_index; /* index for associated ifp */ int rtm_flags; /* flags, incl. kern & message, e.g. 
DONE */ int rtm_addrs; /* bitmask identifying sockaddrs in msg */ pid_t rtm_pid; /* identify sender */ int rtm_seq; /* for sender to identify action */ int rtm_errno; /* why failed */ int rtm_fmask; /* bitmask used in RTM_CHANGE message */ u_long rtm_inits; /* which metrics we are initializing */ struct rt_metrics rtm_rmx; /* metrics themselves */ }; #define RTM_VERSION 5 /* Up the ante and ignore older versions */ /* * Message types. */ #define RTM_ADD 0x1 /* Add Route */ #define RTM_DELETE 0x2 /* Delete Route */ #define RTM_CHANGE 0x3 /* Change Metrics or flags */ #define RTM_GET 0x4 /* Report Metrics */ #define RTM_LOSING 0x5 /* Kernel Suspects Partitioning */ #define RTM_REDIRECT 0x6 /* Told to use different route */ #define RTM_MISS 0x7 /* Lookup failed on this address */ #define RTM_LOCK 0x8 /* fix specified metrics */ /* 0x9 */ /* 0xa */ #define RTM_RESOLVE 0xb /* req to resolve dst to LL addr */ #define RTM_NEWADDR 0xc /* address being added to iface */ #define RTM_DELADDR 0xd /* address being removed from iface */ #define RTM_IFINFO 0xe /* iface going up/down etc. */ #define RTM_NEWMADDR 0xf /* mcast group membership being added to if */ #define RTM_DELMADDR 0x10 /* mcast group membership being deleted */ #define RTM_IFANNOUNCE 0x11 /* iface arrival/departure */ #define RTM_IEEE80211 0x12 /* IEEE80211 wireless event */ /* * Bitmask values for rtm_inits and rmx_locks. */ #define RTV_MTU 0x1 /* init or lock _mtu */ #define RTV_HOPCOUNT 0x2 /* init or lock _hopcount */ #define RTV_EXPIRE 0x4 /* init or lock _expire */ #define RTV_RPIPE 0x8 /* init or lock _recvpipe */ #define RTV_SPIPE 0x10 /* init or lock _sendpipe */ #define RTV_SSTHRESH 0x20 /* init or lock _ssthresh */ #define RTV_RTT 0x40 /* init or lock _rtt */ #define RTV_RTTVAR 0x80 /* init or lock _rttvar */ #define RTV_WEIGHT 0x100 /* init or lock _weight */ /* * Bitmask values for rtm_addrs. 
 */
/* Each RTA_x bit below has the value (1 << RTAX_x) of the matching RTAX_ index. */
#define RTA_DST		0x1	/* destination sockaddr present */
#define RTA_GATEWAY	0x2	/* gateway sockaddr present */
#define RTA_NETMASK	0x4	/* netmask sockaddr present */
#define RTA_GENMASK	0x8	/* cloning mask sockaddr present */
#define RTA_IFP		0x10	/* interface name sockaddr present */
#define RTA_IFA		0x20	/* interface addr sockaddr present */
#define RTA_AUTHOR	0x40	/* sockaddr for author of redirect */
#define RTA_BRD		0x80	/* for NEWADDR, broadcast or p-p dest addr */

/*
 * Index offsets for sockaddr array for alternate internal encoding.
 * RTAX_MAX is the number of slots, and sizes rt_addrinfo.rti_info[].
 */
#define RTAX_DST	0	/* destination sockaddr present */
#define RTAX_GATEWAY	1	/* gateway sockaddr present */
#define RTAX_NETMASK	2	/* netmask sockaddr present */
#define RTAX_GENMASK	3	/* cloning mask sockaddr present */
#define RTAX_IFP	4	/* interface name sockaddr present */
#define RTAX_IFA	5	/* interface addr sockaddr present */
#define RTAX_AUTHOR	6	/* sockaddr for author of redirect */
#define RTAX_BRD	7	/* for NEWADDR, broadcast or p-p dest addr */
#define RTAX_MAX	8	/* size of array to allocate */

/*
 * Filter callback type: receives a const rtentry and an opaque pointer and
 * returns an int.  A pointer to such a function is carried in
 * rt_addrinfo.rti_filter, with rti_filterdata next to it.
 * NOTE(review): presumably rti_filterdata is the opaque argument and the
 * return value selects matching entries -- confirm against the callers.
 */
typedef int rt_filter_f_t(const struct rtentry *, void *);

/*
 * Bundle of addresses, flags and metrics describing one route request.
 */
struct rt_addrinfo {
	/*
	 * NOTE(review): the original comment here read "Route RTF_ flags",
	 * identical to rti_flags below.  Presumably this is the RTA_ bitmask
	 * saying which rti_info[] slots are filled in -- confirm.
	 */
	int	rti_addrs;
	int	rti_flags;			/* Route RTF_ flags */
	struct	sockaddr *rti_info[RTAX_MAX];	/* Sockaddr data, indexed by RTAX_ */
	struct	ifaddr *rti_ifa;		/* value of rt_ifa addr */
	struct	ifnet *rti_ifp;			/* route interface */
	rt_filter_f_t	*rti_filter;		/* filter function */
	void	*rti_filterdata;		/* filter parameters */
	u_long	rti_mflags;			/* metrics RTV_ flags */
	u_long	rti_spare;			/* Will be used for fib */
	struct	rt_metrics *rti_rmx;		/* Pointer to route metrics */
};

/*
 * This macro returns the size of a struct sockaddr when passed
 * through a routing socket. Basically we round up sa_len to
 * a multiple of sizeof(long), with a minimum of sizeof(long).
 * The check for a NULL pointer is just a convenience, probably never used.
 * The case sa_len == 0 should only apply to empty structures.
*/ #define SA_SIZE(sa) \ ( (!(sa) || ((struct sockaddr *)(sa))->sa_len == 0) ? \ sizeof(long) : \ 1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(long) - 1) ) ) #define sa_equal(a, b) ( \ (((const struct sockaddr *)(a))->sa_len == ((const struct sockaddr *)(b))->sa_len) && \ (bcmp((a), (b), ((const struct sockaddr *)(b))->sa_len) == 0)) #ifdef _KERNEL #define RT_LINK_IS_UP(ifp) (!((ifp)->if_capabilities & IFCAP_LINKSTATE) \ || (ifp)->if_link_state == LINK_STATE_UP) #define RT_LOCK_INIT(_rt) \ - mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK) + mtx_init(&(_rt)->rt_mtx, "rtentry", NULL, MTX_DEF | MTX_DUPOK | MTX_NEW) #define RT_LOCK(_rt) mtx_lock(&(_rt)->rt_mtx) #define RT_UNLOCK(_rt) mtx_unlock(&(_rt)->rt_mtx) #define RT_LOCK_DESTROY(_rt) mtx_destroy(&(_rt)->rt_mtx) #define RT_LOCK_ASSERT(_rt) mtx_assert(&(_rt)->rt_mtx, MA_OWNED) #define RT_UNLOCK_COND(_rt) do { \ if (mtx_owned(&(_rt)->rt_mtx)) \ mtx_unlock(&(_rt)->rt_mtx); \ } while (0) #define RT_ADDREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ KASSERT((_rt)->rt_refcnt >= 0, \ ("negative refcnt %d", (_rt)->rt_refcnt)); \ (_rt)->rt_refcnt++; \ } while (0) #define RT_REMREF(_rt) do { \ RT_LOCK_ASSERT(_rt); \ KASSERT((_rt)->rt_refcnt > 0, \ ("bogus refcnt %d", (_rt)->rt_refcnt)); \ (_rt)->rt_refcnt--; \ } while (0) #define RTFREE_LOCKED(_rt) do { \ if ((_rt)->rt_refcnt <= 1) \ rtfree(_rt); \ else { \ RT_REMREF(_rt); \ RT_UNLOCK(_rt); \ } \ /* guard against invalid refs */ \ _rt = 0; \ } while (0) #define RTFREE(_rt) do { \ RT_LOCK(_rt); \ RTFREE_LOCKED(_rt); \ } while (0) #define RO_RTFREE(_ro) do { \ if ((_ro)->ro_rt) { \ if ((_ro)->ro_flags & RT_NORTREF) { \ (_ro)->ro_flags &= ~RT_NORTREF; \ (_ro)->ro_rt = NULL; \ (_ro)->ro_lle = NULL; \ } else { \ RT_LOCK((_ro)->ro_rt); \ RTFREE_LOCKED((_ro)->ro_rt); \ } \ } \ } while (0) /* * Validate a cached route based on a supplied cookie. If there is an * out-of-date cache, simply free it. 
Update the generation number * for the new allocation */ #define RT_VALIDATE(ro, cookiep, fibnum) do { \ rt_gen_t cookie = RT_GEN(fibnum, (ro)->ro_dst.sa_family); \ if (*(cookiep) != cookie) { \ if ((ro)->ro_rt != NULL) { \ RTFREE((ro)->ro_rt); \ (ro)->ro_rt = NULL; \ } \ *(cookiep) = cookie; \ } \ } while (0) struct ifmultiaddr; struct rib_head; void rt_ieee80211msg(struct ifnet *, int, void *, size_t); void rt_ifannouncemsg(struct ifnet *, int); void rt_ifmsg(struct ifnet *); void rt_missmsg(int, struct rt_addrinfo *, int, int); void rt_missmsg_fib(int, struct rt_addrinfo *, int, int, int); void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *); void rt_newaddrmsg_fib(int, struct ifaddr *, int, struct rtentry *, int); int rt_addrmsg(int, struct ifaddr *, int); int rt_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); void rt_newmaddrmsg(int, struct ifmultiaddr *); int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); void rt_maskedcopy(struct sockaddr *, struct sockaddr *, struct sockaddr *); struct rib_head *rt_table_init(int); void rt_table_destroy(struct rib_head *); u_int rt_tables_get_gen(int table, int fam); int rtsock_addrmsg(int, struct ifaddr *, int); int rtsock_routemsg(int, struct ifnet *ifp, int, struct rtentry *, int); /* * Note the following locking behavior: * * rtalloc1() returns a locked rtentry * * rtfree() and RTFREE_LOCKED() require a locked rtentry * * RTFREE() uses an unlocked entry. 
 */
void	 rtfree(struct rtentry *);
void	 rt_updatemtu(struct ifnet *);

/* Callback types taken by rt_foreach_fib_walk() below. */
typedef int rt_walktree_f_t(struct rtentry *, void *);
typedef void rt_setwarg_t(struct rib_head *, uint32_t, int, void *);
void	 rt_foreach_fib_walk(int af, rt_setwarg_t *, rt_walktree_f_t *, void *);
void	 rt_foreach_fib_walk_del(int af, rt_filter_f_t *filter_f, void *arg);
void	 rt_flushifroutes_af(struct ifnet *, int);
void	 rt_flushifroutes(struct ifnet *ifp);

/* XXX MRT COMPAT VERSIONS THAT SET UNIVERSE to 0 */
/* These are used by old code not yet converted to use multiple FIBS */
struct rtentry *rtalloc1(struct sockaddr *, int, u_long);
int	 rtinit(struct ifaddr *, int, int);

/* XXX MRT NEW VERSIONS THAT USE FIBs
 * For now the protocol independent versions are the same as the AF_INET ones
 * but this will change.
 * Unlike the compat versions above, each of these takes the FIB number as
 * an explicit trailing u_int argument.
 */
int	 rt_getifa_fib(struct rt_addrinfo *, u_int fibnum);
void	 rtalloc_ign_fib(struct route *ro, u_long ignflags, u_int fibnum);
struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int);
int	 rtioctl_fib(u_long, caddr_t, u_int);
void	 rtredirect_fib(struct sockaddr *, struct sockaddr *,
	    struct sockaddr *, int, struct sockaddr *, u_int);
int	 rtrequest_fib(int, struct sockaddr *,
	    struct sockaddr *, struct sockaddr *, int, struct rtentry **, u_int);
int	 rtrequest1_fib(int, struct rt_addrinfo *, struct rtentry **, u_int);

/* NOTE(review): presumably rib_free_info() releases what rib_lookup_info() filled in -- confirm. */
int	rib_lookup_info(uint32_t, const struct sockaddr *, uint32_t, uint32_t,
	    struct rt_addrinfo *);
void	rib_free_info(struct rt_addrinfo *info);

#endif

#endif
Index: user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 303653)
+++ user/alc/PQ_LAUNDRY/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 303654)
@@ -1,1545 +1,1551 @@
/*
 * Copyright (c) 2004 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "ipoib.h" static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #include #include #include #include #include /* For ARPHRD_xxx */ #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size = IPOIB_TX_RING_SIZE; int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct ipoib_dev_priv *priv; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device); static void ipoib_start(struct ifnet *dev); static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void ipoib_input(struct ifnet *ifp, struct mbuf *m); #define IPOIB_MTAP(_ifp, _m) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_mb((_ifp), (_m)); \ } \ } while (0) /* * This is for clients that have an ipoib_header in the mbuf. 
*/ static void ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) { struct ipoib_header *ih; struct ether_header eh; ih = mtod(mb, struct ipoib_header *); eh.ether_type = ih->proto; bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); bzero(&eh.ether_shost, ETHER_ADDR_LEN); mb->m_data += sizeof(struct ipoib_header); mb->m_len -= sizeof(struct ipoib_header); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(struct ipoib_header); mb->m_len += sizeof(struct ipoib_header); } void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) { struct ether_header eh; eh.ether_type = proto; bzero(&eh.ether_shost, ETHER_ADDR_LEN); bzero(&eh.ether_dhost, ETHER_ADDR_LEN); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); } static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one }; int ipoib_open(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(priv)) return 0; if (ipoib_ib_dev_open(priv)) goto err_disable; if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } dev->if_drv_flags |= IFF_DRV_RUNNING; dev->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; err_stop: ipoib_ib_dev_stop(priv, 1); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static void ipoib_init(void *arg) { struct ifnet *dev; struct ipoib_dev_priv *priv; priv = arg; dev = priv->dev; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(priv); queue_work(ipoib_workqueue, &priv->flush_light); } static int ipoib_stop(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; 
ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu) { struct ifnet *dev = priv->dev; /* dev->if_mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(priv)) { if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); dev->if_mtu = new_mtu; return 0; } if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ipoib_dev_priv *priv = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) error = -ipoib_open(priv); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_stop(priv); break; case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) queue_work(ipoib_workqueue, &priv->restart_task); break; case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: 
ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, INFINIBAND_ALEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ error = -ipoib_change_mtu(priv, ifr->ifr_mtu); break; default: error = EINVAL; break; } return (error); } static struct ipoib_path * __path_find(struct ipoib_dev_priv *priv, void *gid) { struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { _IF_DRAIN(&path->queue); if (path->ah) ipoib_put_ah(path->ah); if (ipoib_cm_get(path)) ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter * ipoib_path_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct 
rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } void ipoib_flush_paths(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct ipoib_dev_priv *priv = path->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ifqueue mbqueue; struct mbuf *mb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else ipoib_dbg(priv, "PathRec status %d for GID %16D\n", status, 
path->pathrec.dgid.raw, ":"); bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (ah) { path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); for (;;) { _IF_DEQUEUE(&path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } #ifdef CONFIG_INFINIBAND_IPOIB_CM if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); #endif path->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (old_ah) ipoib_put_ah(old_ah); for (;;) { _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } } static struct ipoib_path * path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; path->priv = priv; bzero(&path->queue, sizeof(path->queue)); #ifdef CONFIG_INFINIBAND_IPOIB_CM memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); #endif memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } static int path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ifnet *dev = priv->dev; ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: 
p_rec.mtu = IB_MTU_256; break; case 1024: p_rec.mtu = IB_MTU_512; break; case 2048: p_rec.mtu = IB_MTU_1024; break; case 4096: p_rec.mtu = IB_MTU_2048; break; default: /* Wildcard everything */ comp_mask = 0; p_rec.mtu = 0; p_rec.mtu_selector = 0; } ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", p_rec.dgid.raw, ":", comp_mask ? ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { struct ipoib_path *path; path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { - _IF_ENQUEUE(&path->queue, mb); + if (_IF_QLEN(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) + _IF_ENQUEUE(&path->queue, mb); + else { + if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); + m_freem(mb); + } + if (!path->query && path_rec_start(priv, path)) { spin_unlock_irqrestore(&priv->lock, flags); if (new_path) ipoib_path_free(priv, path); return; } else __path_add(priv, path); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } return; } if (ipoib_cm_get(path) && ipoib_cm_up(path)) { ipoib_cm_send(priv, mb, ipoib_cm_get(path)); } else if (path->ah) { ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); } else if ((path->query || !path_rec_start(priv, path)) && path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { _IF_ENQUEUE(&path->queue, mb); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); 
m_freem(mb); } } static int ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { struct ipoib_header *eh; eh = mtod(mb, struct ipoib_header *); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { /* Add in the P_Key for multicast*/ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; eh->hwaddr[9] = priv->pkey & 0xff; ipoib_mcast_send(priv, eh->hwaddr + 4, mb); } else ipoib_unicast_send(mb, priv, eh); return 0; } static void _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { struct mbuf *mb; if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; spin_lock(&priv->lock); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; IPOIB_MTAP(dev, mb); ipoib_send_one(priv, mb); } spin_unlock(&priv->lock); } static void ipoib_start(struct ifnet *dev) { _ipoib_start(dev, dev->if_softc); } static void ipoib_vlan_start(struct ifnet *dev) { struct ipoib_dev_priv *priv; struct mbuf *mb; priv = VLAN_COOKIE(dev); if (priv != NULL) return _ipoib_start(dev, priv); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; m_freem(mb); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); } } int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); goto out; } priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(priv, ca, port)) goto 
out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: kfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); out: return -ENOMEM; } static void ipoib_detach(struct ipoib_dev_priv *priv) { struct ifnet *dev; dev = priv->dev; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { priv->gone = 1; bpfdetach(dev); if_detach(dev); if_free(dev); } else VLAN_SETCOOKIE(priv->dev, NULL); free(priv, M_TEMP); } void ipoib_dev_cleanup(struct ipoib_dev_priv *priv) { struct ipoib_dev_priv *cpriv, *tcpriv; /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { ipoib_dev_cleanup(cpriv); ipoib_detach(cpriv); } ipoib_ib_dev_cleanup(priv); kfree(priv->rx_ring); kfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } static volatile int ipoib_unit; static struct ipoib_dev_priv * ipoib_priv_alloc(void) { struct ipoib_dev_priv *priv; priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK); spin_lock_init(&priv->lock); spin_lock_init(&priv->drain_lock); mutex_init(&priv->vlan_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN); return (priv); } struct ipoib_dev_priv * ipoib_intf_alloc(const char *name) { struct ipoib_dev_priv *priv; struct sockaddr_dl *sdl; struct ifnet *dev; priv = ipoib_priv_alloc(); dev = priv->dev = if_alloc(IFT_INFINIBAND); if (!dev) { free(priv, M_TEMP); return 
NULL; } dev->if_softc = priv; if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1)); dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; dev->if_addrlen = INFINIBAND_ALEN; dev->if_hdrlen = IPOIB_HEADER_LEN; if_attach(dev); dev->if_init = ipoib_init; dev->if_ioctl = ipoib_ioctl; dev->if_start = ipoib_start; dev->if_output = ipoib_output; dev->if_input = ipoib_input; dev->if_resolvemulti = ipoib_resolvemulti; dev->if_baudrate = IF_Gbps(10); dev->if_broadcastaddr = priv->broadcastaddr; dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr; sdl->sdl_type = IFT_INFINIBAND; sdl->sdl_alen = dev->if_addrlen; priv->dev = dev; if_link_state_change(dev, LINK_STATE_DOWN); bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN); return dev->if_softc; }

/*
 * Query the HCA and derive the interface's offload/capability flags.
 *
 * The device capability flags are stored in priv->hca_caps, and
 * if_hwassist/if_capabilities of priv->dev are rebuilt from scratch.
 * Unless the driver is built with CONFIG_INFINIBAND_IPOIB_CM, checksum
 * offload is advertised when the HCA reports IB_DEVICE_UD_IP_CSUM.  The
 * VLAN tagging, VLAN MTU and link-state capabilities are always added,
 * and every advertised capability is enabled.
 *
 * Returns 0 on success, -ENOMEM when the attribute buffer cannot be
 * allocated, or the non-zero result of ib_query_device().
 */
int
ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
	struct ib_device_attr *device_attr;
	int result = -ENOMEM;

	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
	if (!device_attr) {
		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
		    hca->name, sizeof *device_attr);
		return result;
	}

	result = ib_query_device(hca, device_attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
		    hca->name, result);
		kfree(device_attr);
		return result;
	}
	priv->hca_caps = device_attr->device_cap_flags;

	kfree(device_attr);

	priv->dev->if_hwassist = 0;
	priv->dev->if_capabilities = 0;

#ifndef CONFIG_INFINIBAND_IPOIB_CM
	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
		priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP;
		priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
	}

#if 0
	if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) {
		priv->dev->if_capabilities |= IFCAP_TSO4;
		priv->dev->if_hwassist |= CSUM_TSO;
	}
#endif
#endif
	priv->dev->if_capabilities |=
	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
	priv->dev->if_capenable = priv->dev->if_capabilities;

	return 0;
}

/*
 * Allocate and initialize the IPoIB interface for one port of an HCA.
 *
 * 'format' is handed to ipoib_intf_alloc(); 'hca' and 'port' identify the
 * port whose MTU, P_Key and GID are queried.
 *
 * Returns the new interface on success or an ERR_PTR()-encoded error.
 * The only visible caller, ipoib_add_one(), tests the return value with
 * IS_ERR() and dereferences it otherwise, so every failure path must
 * reach ERR_PTR() with a non-zero 'result'.
 */
static struct ifnet *
ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	/* Keep the real error code instead of the stale -ENOMEM default. */
	result = ib_query_port(hca, port, &attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		    hca->name, port);
		goto device_init_failed;
	}
	priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);

	/* MTU will be reset when mcast join happens */
	priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu;

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		    hca->name, port, result);
		goto device_init_failed;
	}

	/*
	 * Capture the error: 'result' is 0 here after the successful P_Key
	 * query, and bailing out without updating it would return
	 * ERR_PTR(0), which the caller's IS_ERR() check does not reject.
	 */
	result = ipoib_set_dev_features(priv, hca);
	if (result)
		goto device_init_failed;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->broadcastaddr[8] = priv->pkey >> 8;
	priv->broadcastaddr[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		    hca->name, port, result);
		goto device_init_failed;
	}
	memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw,
	    sizeof (union ib_gid));

	result = ipoib_dev_init(priv, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		    hca->name, port, result);
		goto device_init_failed;
	}
	if (ipoib_cm_admin_enabled(priv))
		priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv));

	INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		    "port %d (ret = %d)\n", hca->name, port, result);
		goto event_failed;
	}
	if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);

	return priv->dev;

event_failed:
	ipoib_dev_cleanup(priv);

device_init_failed:
	ipoib_detach(priv);

alloc_mem_failed:
	return ERR_PTR(result);
}

/*
 * Create an IPoIB interface for each InfiniBand link-layer port of an
 * IB-transport device and record them as this client's per-device data.
 *
 * Switches are probed on port 0 only; other node types on ports
 * 1..phys_port_cnt.  Ports whose interface cannot be created are skipped.
 * If the list head cannot be allocated, the function returns without
 * calling ib_set_client_data().
 */
static void
ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct ifnet *dev;
	struct ipoib_dev_priv *priv;
	int s, e, p;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

	if (device->node_type == RDMA_NODE_IB_SWITCH) {
		s = 0;
		e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	for (p = s; p <= e; ++p) {
		if (rdma_port_get_link_layer(device, p) !=
		    IB_LINK_LAYER_INFINIBAND)
			continue;
		dev = ipoib_add_port("ib", device, p);
		if (!IS_ERR(dev)) {
			priv = dev->if_softc;
			list_add_tail(&priv->list, dev_list);
		}
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
}

/*
 * Tear down every interface that ipoib_add_one() created for 'device'
 * and release the list head that tracked them.
 */
static void
ipoib_remove_one(struct ib_device *device)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	/*
	 * ipoib_add_one() returns before ib_set_client_data() when its
	 * allocation fails, so there may be no list to walk (presumably
	 * ib_get_client_data() yields NULL then -- confirm).
	 */
	dev_list = ib_get_client_data(device, &ipoib_client);
	if (dev_list == NULL)
		return;

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		if (rdma_port_get_link_layer(device, priv->port) !=
		    IB_LINK_LAYER_INFINIBAND)
			continue;

		ipoib_stop(priv);

		ib_unregister_event_handler(&priv->event_handler);

		/* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */

		flush_workqueue(ipoib_workqueue);

		ipoib_dev_cleanup(priv);
		ipoib_detach(priv);
	}

	kfree(dev_list);
}

/*
 * vlan_config event handler (registered in ipoib_init_module()).
 *
 * Treats the VLAN tag as a 15-bit P_Key and creates a child IPoIB
 * context for the vlan device stacked on 'ifp'.  Does nothing when 'ifp'
 * is not an InfiniBand interface, no vlan device exists for the tag, the
 * tag has bit 15 set, or the resulting P_Key equals the parent's.  A
 * duplicate P_Key or an initialization failure is reported through
 * ipoib_warn().
 *
 * NOTE(review): when ipoib_dev_init() fails, dev->if_broadcastaddr has
 * already been pointed into 'priv', which is freed at 'out' -- confirm
 * the vlan device cannot use that pointer afterwards.
 */
static void
ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct ipoib_dev_priv *parent;
	struct ipoib_dev_priv *priv;
	struct ifnet *dev;
	uint16_t pkey;
	int error;

	if (ifp->if_type != IFT_INFINIBAND)
		return;
	dev = VLAN_DEVAT(ifp, vtag);
	if (dev == NULL)
		return;
	priv = NULL;
	error = 0;
	parent = ifp->if_softc;
	/* We only support 15 bits of pkey. */
	if (vtag & 0x8000)
		return;
	pkey = vtag | 0x8000;	/* Set full membership bit. */
	if (pkey == parent->pkey)
		return;
	/* Check for dups */
	mutex_lock(&parent->vlan_mutex);
	list_for_each_entry(priv, &parent->child_intfs, list) {
		if (priv->pkey == pkey) {
			/* Not ours: make sure 'out' does not free it. */
			priv = NULL;
			error = EBUSY;
			goto out;
		}
	}
	priv = ipoib_priv_alloc();
	priv->dev = dev;
	priv->max_ib_mtu = parent->max_ib_mtu;
	priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu;
	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
	error = ipoib_set_dev_features(priv, parent->ca);
	if (error)
		goto out;
	priv->pkey = pkey;
	priv->broadcastaddr[8] = pkey >> 8;
	priv->broadcastaddr[9] = pkey & 0xff;
	dev->if_broadcastaddr = priv->broadcastaddr;
	error = ipoib_dev_init(priv, parent->ca, parent->port);
	if (error)
		goto out;
	priv->parent = parent->dev;
	list_add_tail(&priv->list, &parent->child_intfs);
	VLAN_SETCOOKIE(dev, priv);
	dev->if_start = ipoib_vlan_start;
	dev->if_drv_flags &= ~IFF_DRV_RUNNING;
	dev->if_hdrlen = IPOIB_HEADER_LEN;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
		ipoib_open(priv);
	mutex_unlock(&parent->vlan_mutex);
	return;
out:
	mutex_unlock(&parent->vlan_mutex);
	if (priv)
		free(priv, M_TEMP);
	if (error)
		ipoib_warn(parent,
		    "failed to initialize subinterface: device %s, port %d vtag 0x%X",
		    parent->ca->name, parent->port, vtag);
	return;
}

/*
 * vlan_unconfig event handler (registered in ipoib_init_module()).
 *
 * Clears the vlan device's cookie, then cleans up and unlinks the child
 * context whose P_Key matches the tag.
 *
 * NOTE(review): the child 'priv' is unlinked but not freed here --
 * confirm whether ownership is released elsewhere.
 */
static void
ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct ipoib_dev_priv *parent;
	struct ipoib_dev_priv *priv;
	struct ifnet *dev;
	uint16_t pkey;

	if (ifp->if_type != IFT_INFINIBAND)
		return;

	dev = VLAN_DEVAT(ifp, vtag);
	if (dev)
		VLAN_SETCOOKIE(dev, NULL);
	pkey = vtag | 0x8000;
	parent = ifp->if_softc;
	mutex_lock(&parent->vlan_mutex);
	list_for_each_entry(priv, &parent->child_intfs, list) {
		if (priv->pkey == pkey) {
			ipoib_dev_cleanup(priv);
			list_del(&priv->list);
			break;
		}
	}
	mutex_unlock(&parent->vlan_mutex);
}

/* Tags of the vlan_config/vlan_unconfig handlers, kept for deregistration. */
eventhandler_tag ipoib_vlan_attach;
eventhandler_tag ipoib_vlan_detach;

/*
 * Module initialization: clamp the queue-size tunables, hook the VLAN
 * events, create the driver workqueue and register with the IB core.
 *
 * Returns 0 on success or a non-zero error; on failure everything that
 * was set up here, including the VLAN event handlers, is undone.
 */
static int __init
ipoib_init_module(void)
{
	int ret;

	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
	    IPOIB_MIN_QUEUE_SIZE));
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
	    ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);
	ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
	    ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);

	/*
	 * We create our own workqueue mainly because we want to be
	 * able to flush it when devices are being removed.  We can't
	 * use schedule_work()/flush_scheduled_work() because both
	 * unregister_netdev() and linkwatch_event take the rtnl lock,
	 * so flush_scheduled_work() can deadlock during device
	 * removal.
	 */
	ipoib_workqueue = create_singlethread_workqueue("ipoib");
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	return 0;

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	/* Do not leave the VLAN hooks registered when initialization fails. */
	EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
	return ret;
}

/*
 * Module teardown: unhook the VLAN events, unregister from the IB core
 * and destroy the driver workqueue.
 */
static void __exit
ipoib_cleanup_module(void)
{

	EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);
}

/*
 * Infiniband output routine.
*/ static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { u_char edst[INFINIBAND_ALEN]; #if defined(INET) || defined(INET6) struct llentry *lle = NULL; #endif struct ipoib_header *eh; int error = 0, is_gw = 0; short type; if (ro != NULL) is_gw = (ro->ro_flags & RT_HAS_GW) != 0; #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) goto bad; #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) { error = ENETDOWN; goto bad; } if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) { error = ENETDOWN; goto bad; } switch (dst->sa_family) { #ifdef INET case AF_INET: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); else error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return (error == EWOULDBLOCK ? 0 : error); type = htons(ETHERTYPE_IP); break; case AF_ARP: { struct arphdr *ah; ah = mtod(m, struct arphdr *); ah->ar_hrd = htons(ARPHRD_INFINIBAND); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: type = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: type = htons(ETHERTYPE_ARP); break; } if (m->m_flags & M_BCAST) bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); else bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); } break; #endif #ifdef INET6 case AF_INET6: if (lle != NULL && (lle->la_flags & LLE_VALID)) memcpy(edst, lle->ll_addr, sizeof(edst)); else if (m->m_flags & M_MCAST) ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); else error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); if (error) return error; type = htons(ETHERTYPE_IPV6); break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); error = EAFNOSUPPORT; goto bad; } /* * Add local net header. If no space in first mbuf, * allocate another. 
*/ M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } eh = mtod(m, struct ipoib_header *); (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); bad: if (m != NULL) m_freem(m); return (error); } /* * Upper layer processing for a received Infiniband packet. */ void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto) { int isr; #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { if_printf(ifp, "discard frame at IFF_MONITOR\n"); m_freem(m); return; } /* * Dispatch frame to upper layer. */ switch (proto) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: m_freem(m); } /* * Process a received Infiniband packet. */ static void ipoib_input(struct ifnet *ifp, struct mbuf *m) { struct ipoib_header *eh; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } CURVNET_SET_QUIET(ifp->if_vnet); /* Let BPF have it before we strip the header. */ IPOIB_MTAP(ifp, m); eh = mtod(m, struct ipoib_header *); /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Infiniband header. 
*/ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, IPOIB_HEADER_LEN); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { if (memcmp(eh->hwaddr, ifp->if_broadcastaddr, ifp->if_addrlen) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } ipoib_demux(ifp, m, ntohs(eh->proto)); CURVNET_RESTORE(); } static int ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!IPOIB_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; /* * An IP6 address of 0 means listen to all * of the multicast address used for IP6. * This has no meaning in ipoib. 
*/ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) return EADDRNOTAVAIL; if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ALEN; e_addr = LLADDR(sdl); ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: return EAFNOSUPPORT; } } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module); static int ipoib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ipoib_mod = { .name = "ipoib", .evhand = ipoib_evhand, }; DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1); Index: user/alc/PQ_LAUNDRY/sys/opencrypto/xform_aes_icm.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/opencrypto/xform_aes_icm.c (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/opencrypto/xform_aes_icm.c (revision 303654) @@ -1,152 +1,152 @@ /* $OpenBSD: xform.c,v 1.16 2001/08/28 12:20:43 ben Exp $ */ /*- * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. Keromytis (kermit@csd.uch.gr), * Niels Provos (provos@physnet.uni-hamburg.de) and * Damien Miller (djm@mindrot.org). * * This code was written by John Ioannidis for BSD/OS in Athens, Greece, * in November 1995. * * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996, * by Angelos D. Keromytis. * * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis * and Niels Provos. * * Additional features in 1999 by Angelos D. Keromytis. * * AES XTS implementation in 2008 by Damien Miller * * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis, * Angelos D. Keromytis and Niels Provos. * * Copyright (C) 2001, Angelos D. Keromytis. * * Copyright (C) 2008, Damien Miller * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. 
* * Portions of this software were developed by John-Mark Gurney * under sponsorship of the FreeBSD Foundation and * Rubicon Communications, LLC (Netgate). * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all copies of any software which is or includes a copy or * modification of this software. * You may use this code under the GNU public license if you so wish. Please * contribute changes back to the authors under this freer than GPL license * so that we may further the use of strong encryption without limitations to * all. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #include __FBSDID("$FreeBSD$"); #include static int aes_icm_setkey(u_int8_t **, u_int8_t *, int); static void aes_icm_crypt(caddr_t, u_int8_t *); static void aes_icm_zerokey(u_int8_t **); static void aes_icm_reinit(caddr_t, u_int8_t *); static void aes_gcm_reinit(caddr_t, u_int8_t *); /* Encryption instances */ struct enc_xform enc_xform_aes_icm = { CRYPTO_AES_ICM, "AES-ICM", AES_BLOCK_LEN, AES_BLOCK_LEN, AES_MIN_KEY, AES_MAX_KEY, aes_icm_crypt, aes_icm_crypt, aes_icm_setkey, - rijndael128_zerokey, + aes_icm_zerokey, aes_icm_reinit, }; struct enc_xform enc_xform_aes_nist_gcm = { CRYPTO_AES_NIST_GCM_16, "AES-GCM", AES_ICM_BLOCK_LEN, AES_GCM_IV_LEN, AES_MIN_KEY, AES_MAX_KEY, aes_icm_crypt, aes_icm_crypt, aes_icm_setkey, aes_icm_zerokey, aes_gcm_reinit, }; /* * Encryption wrapper routines. 
*/ static void aes_icm_reinit(caddr_t key, u_int8_t *iv) { struct aes_icm_ctx *ctx; ctx = (struct aes_icm_ctx *)key; bcopy(iv, ctx->ac_block, AESICM_BLOCKSIZE); } static void aes_gcm_reinit(caddr_t key, u_int8_t *iv) { struct aes_icm_ctx *ctx; aes_icm_reinit(key, iv); ctx = (struct aes_icm_ctx *)key; /* GCM starts with 2 as counter 1 is used for final xor of tag. */ bzero(&ctx->ac_block[AESICM_BLOCKSIZE - 4], 4); ctx->ac_block[AESICM_BLOCKSIZE - 1] = 2; } static void aes_icm_crypt(caddr_t key, u_int8_t *data) { struct aes_icm_ctx *ctx; u_int8_t keystream[AESICM_BLOCKSIZE]; int i; ctx = (struct aes_icm_ctx *)key; rijndaelEncrypt(ctx->ac_ek, ctx->ac_nr, ctx->ac_block, keystream); for (i = 0; i < AESICM_BLOCKSIZE; i++) data[i] ^= keystream[i]; explicit_bzero(keystream, sizeof(keystream)); /* increment counter */ for (i = AESICM_BLOCKSIZE - 1; i >= 0; i--) if (++ctx->ac_block[i]) /* continue on overflow */ break; } static int aes_icm_setkey(u_int8_t **sched, u_int8_t *key, int len) { struct aes_icm_ctx *ctx; if (len != 16 && len != 24 && len != 32) return EINVAL; *sched = KMALLOC(sizeof(struct aes_icm_ctx), M_CRYPTO_DATA, M_NOWAIT | M_ZERO); if (*sched == NULL) return ENOMEM; ctx = (struct aes_icm_ctx *)*sched; ctx->ac_nr = rijndaelKeySetupEnc(ctx->ac_ek, (u_char *)key, len * 8); return 0; } static void aes_icm_zerokey(u_int8_t **sched) { bzero(*sched, sizeof(struct aes_icm_ctx)); KFREE(*sched, M_CRYPTO_DATA); *sched = NULL; } Index: user/alc/PQ_LAUNDRY/sys/sys/lock.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/sys/lock.h (revision 303653) +++ user/alc/PQ_LAUNDRY/sys/sys/lock.h (revision 303654) @@ -1,313 +1,337 @@ /*- * Copyright (c) 1997 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI Id: mutex.h,v 2.7.2.35 2000/04/27 03:10:26 cp * $FreeBSD$ */ #ifndef _SYS_LOCK_H_ #define _SYS_LOCK_H_ #include #include #include struct lock_list_entry; struct thread; /* * Lock classes. Each lock has a class which describes characteristics * common to all types of locks of a given class. * * Spin locks in general must always protect against preemption, as it is * an error to perform any type of context switch while holding a spin lock. * Also, for an individual lock to be recursable, its class must allow * recursion and the lock itself must explicitly allow recursion. * * The 'lc_ddb_show' function pointer is used to dump class-specific * data for the 'show lock' DDB command. 
The 'lc_lock' and * 'lc_unlock' function pointers are used in sleep(9) and cv_wait(9) * to lock and unlock locks while blocking on a sleep queue. The * return value of 'lc_unlock' will be passed to 'lc_lock' on resume * to allow communication of state between the two routines. */ struct lock_class { const char *lc_name; u_int lc_flags; void (*lc_assert)(const struct lock_object *lock, int what); void (*lc_ddb_show)(const struct lock_object *lock); void (*lc_lock)(struct lock_object *lock, uintptr_t how); int (*lc_owner)(const struct lock_object *lock, struct thread **owner); uintptr_t (*lc_unlock)(struct lock_object *lock); }; #define LC_SLEEPLOCK 0x00000001 /* Sleep lock. */ #define LC_SPINLOCK 0x00000002 /* Spin lock. */ #define LC_SLEEPABLE 0x00000004 /* Sleeping allowed with this lock. */ #define LC_RECURSABLE 0x00000008 /* Locks of this type may recurse. */ #define LC_UPGRADABLE 0x00000010 /* Upgrades and downgrades permitted. */ #define LO_CLASSFLAGS 0x0000ffff /* Class specific flags. */ #define LO_INITIALIZED 0x00010000 /* Lock has been initialized. */ #define LO_WITNESS 0x00020000 /* Should witness monitor this lock. */ #define LO_QUIET 0x00040000 /* Don't log locking operations. */ #define LO_RECURSABLE 0x00080000 /* Lock may recurse. */ #define LO_SLEEPABLE 0x00100000 /* Lock may be held while sleeping. */ #define LO_UPGRADABLE 0x00200000 /* Lock may be upgraded/downgraded. */ #define LO_DUPOK 0x00400000 /* Don't check for duplicate acquires */ #define LO_IS_VNODE 0x00800000 /* Tell WITNESS about a VNODE lock */ #define LO_CLASSMASK 0x0f000000 /* Class index bitmask. */ #define LO_NOPROFILE 0x10000000 /* Don't profile this lock */ #define LO_NEW 0x20000000 /* Don't check for double-init */ /* * Lock classes are statically assigned an index into the global lock_classes * array. Debugging code looks up the lock class for a given lock object * by indexing the array.
*/ #define LO_CLASSSHIFT 24 #define LO_CLASSINDEX(lock) ((((lock)->lo_flags) & LO_CLASSMASK) >> LO_CLASSSHIFT) #define LOCK_CLASS(lock) (lock_classes[LO_CLASSINDEX((lock))]) #define LOCK_CLASS_MAX (LO_CLASSMASK >> LO_CLASSSHIFT) /* * Option flags passed to lock operations that witness also needs to know * about or that are generic across all locks. */ #define LOP_NEWORDER 0x00000001 /* Define a new lock order. */ #define LOP_QUIET 0x00000002 /* Don't log locking operations. */ #define LOP_TRYLOCK 0x00000004 /* Don't check lock order. */ #define LOP_EXCLUSIVE 0x00000008 /* Exclusive lock. */ #define LOP_DUPOK 0x00000010 /* Don't check for duplicate acquires */ /* Flags passed to witness_assert. */ #define LA_MASKASSERT 0x000000ff /* Mask for witness defined asserts. */ #define LA_UNLOCKED 0x00000000 /* Lock is unlocked. */ #define LA_LOCKED 0x00000001 /* Lock is at least share locked. */ #define LA_SLOCKED 0x00000002 /* Lock is exactly share locked. */ #define LA_XLOCKED 0x00000004 /* Lock is exclusively locked. */ #define LA_RECURSED 0x00000008 /* Lock is recursed. */ #define LA_NOTRECURSED 0x00000010 /* Lock is not recursed. */ #ifdef _KERNEL /* * If any of WITNESS, INVARIANTS, or KTR_LOCK KTR tracing has been enabled, * then turn on LOCK_DEBUG. When this option is on, extra debugging * facilities such as tracking the file and line number of lock operations * are enabled. Also, mutex locking operations are not inlined to avoid * bloat from all the extra debugging code. We also have to turn on all the * calling conventions for this debugging code in modules so that modules can * work with both debug and non-debug kernels. */ #if defined(KLD_MODULE) || defined(WITNESS) || defined(INVARIANTS) || defined(INVARIANT_SUPPORT) || defined(LOCK_PROFILING) || (defined(KTR) && (KTR_COMPILE & KTR_LOCK)) #define LOCK_DEBUG 1 #else #define LOCK_DEBUG 0 #endif /* * In the LOCK_DEBUG case, use the filename and line numbers for debugging * operations. 
Otherwise, use default values to avoid the unneeded bloat. */ #if LOCK_DEBUG > 0 #define LOCK_FILE __FILE__ #define LOCK_LINE __LINE__ #else #define LOCK_FILE NULL #define LOCK_LINE 0 #endif /* * Macros for KTR_LOCK tracing. * * opname - name of this operation (LOCK/UNLOCK/SLOCK, etc.) * lo - struct lock_object * for this lock * flags - flags passed to the lock operation * recurse - this lock's recursion level (or 0 if class is not recursable) * result - result of a try lock operation * file - file name * line - line number */ #define LOCK_LOG_TEST(lo, flags) \ (((flags) & LOP_QUIET) == 0 && ((lo)->lo_flags & LO_QUIET) == 0) #define LOCK_LOG_LOCK(opname, lo, flags, recurse, file, line) do { \ if (LOCK_LOG_TEST((lo), (flags))) \ CTR6(KTR_LOCK, opname " (%s) %s %p r = %d at %s:%d", \ LOCK_CLASS(lo)->lc_name, (lo)->lo_name, \ (lo), (u_int)(recurse), (file), (line)); \ } while (0) #define LOCK_LOG_TRY(opname, lo, flags, result, file, line) do { \ if (LOCK_LOG_TEST((lo), (flags))) \ CTR6(KTR_LOCK, "TRY_" opname " (%s) %s %p result=%d at %s:%d",\ LOCK_CLASS(lo)->lc_name, (lo)->lo_name, \ (lo), (u_int)(result), (file), (line)); \ } while (0) #define LOCK_LOG_INIT(lo, flags) do { \ if (LOCK_LOG_TEST((lo), (flags))) \ CTR4(KTR_LOCK, "%s: %p (%s) %s", __func__, (lo), \ LOCK_CLASS(lo)->lc_name, (lo)->lo_name); \ } while (0) #define LOCK_LOG_DESTROY(lo, flags) LOCK_LOG_INIT(lo, flags) #define lock_initialized(lo) ((lo)->lo_flags & LO_INITIALIZED) /* * Helpful macros for quickly coming up with assertions with informative * panic messages.
*/ #define MPASS(ex) MPASS4(ex, #ex, __FILE__, __LINE__) #define MPASS2(ex, what) MPASS4(ex, what, __FILE__, __LINE__) #define MPASS3(ex, file, line) MPASS4(ex, #ex, file, line) #define MPASS4(ex, what, file, line) \ KASSERT((ex), ("Assertion %s failed at %s:%d", what, file, line)) extern struct lock_class lock_class_mtx_sleep; extern struct lock_class lock_class_mtx_spin; extern struct lock_class lock_class_sx; extern struct lock_class lock_class_rw; extern struct lock_class lock_class_rm; extern struct lock_class lock_class_rm_sleepable; extern struct lock_class lock_class_lockmgr; extern struct lock_class *lock_classes[]; +struct lock_delay_config { + u_int initial; + u_int step; + u_int min; + u_int max; +}; + +struct lock_delay_arg { + struct lock_delay_config *config; + u_int delay; + u_int spin_cnt; +}; + +static inline void +lock_delay_arg_init(struct lock_delay_arg *la, struct lock_delay_config *lc) { + la->config = lc; + la->delay = 0; + la->spin_cnt = 0; +} + +#define LOCK_DELAY_SYSINIT(func) \ + SYSINIT(func##_ld, SI_SUB_LOCK, SI_ORDER_ANY, func, NULL) + void lock_init(struct lock_object *, struct lock_class *, const char *, const char *, int); void lock_destroy(struct lock_object *); +void lock_delay(struct lock_delay_arg *); void spinlock_enter(void); void spinlock_exit(void); void witness_init(struct lock_object *, const char *); void witness_destroy(struct lock_object *); int witness_defineorder(struct lock_object *, struct lock_object *); void witness_checkorder(struct lock_object *, int, const char *, int, struct lock_object *); void witness_lock(struct lock_object *, int, const char *, int); void witness_upgrade(struct lock_object *, int, const char *, int); void witness_downgrade(struct lock_object *, int, const char *, int); void witness_unlock(struct lock_object *, int, const char *, int); void witness_save(struct lock_object *, const char **, int *); void witness_restore(struct lock_object *, const char *, int); int witness_list_locks(struct 
lock_list_entry **, int (*)(const char *, ...)); int witness_warn(int, struct lock_object *, const char *, ...); void witness_assert(const struct lock_object *, int, const char *, int); void witness_display_spinlock(struct lock_object *, struct thread *, int (*)(const char *, ...)); int witness_line(struct lock_object *); void witness_norelease(struct lock_object *); void witness_releaseok(struct lock_object *); const char *witness_file(struct lock_object *); void witness_thread_exit(struct thread *); #ifdef WITNESS /* Flags for witness_warn(). */ #define WARN_GIANTOK 0x01 /* Giant is exempt from this check. */ #define WARN_PANIC 0x02 /* Panic if check fails. */ #define WARN_SLEEPOK 0x04 /* Sleepable locks are exempt from check. */ #define WITNESS_INIT(lock, type) \ witness_init((lock), (type)) #define WITNESS_DESTROY(lock) \ witness_destroy(lock) #define WITNESS_CHECKORDER(lock, flags, file, line, interlock) \ witness_checkorder((lock), (flags), (file), (line), (interlock)) #define WITNESS_DEFINEORDER(lock1, lock2) \ witness_defineorder((struct lock_object *)(lock1), \ (struct lock_object *)(lock2)) #define WITNESS_LOCK(lock, flags, file, line) \ witness_lock((lock), (flags), (file), (line)) #define WITNESS_UPGRADE(lock, flags, file, line) \ witness_upgrade((lock), (flags), (file), (line)) #define WITNESS_DOWNGRADE(lock, flags, file, line) \ witness_downgrade((lock), (flags), (file), (line)) #define WITNESS_UNLOCK(lock, flags, file, line) \ witness_unlock((lock), (flags), (file), (line)) #define WITNESS_CHECK(flags, lock, fmt, ...) \ witness_warn((flags), (lock), (fmt), ## __VA_ARGS__) #define WITNESS_WARN(flags, lock, fmt, ...) 
\ witness_warn((flags), (lock), (fmt), ## __VA_ARGS__) #define WITNESS_SAVE_DECL(n) \ const char * __CONCAT(n, __wf); \ int __CONCAT(n, __wl) #define WITNESS_SAVE(lock, n) \ witness_save((lock), &__CONCAT(n, __wf), &__CONCAT(n, __wl)) #define WITNESS_RESTORE(lock, n) \ witness_restore((lock), __CONCAT(n, __wf), __CONCAT(n, __wl)) #define WITNESS_NORELEASE(lock) \ witness_norelease(&(lock)->lock_object) #define WITNESS_RELEASEOK(lock) \ witness_releaseok(&(lock)->lock_object) #define WITNESS_FILE(lock) \ witness_file(lock) #define WITNESS_LINE(lock) \ witness_line(lock) #else /* WITNESS */ #define WITNESS_INIT(lock, type) (void)0 #define WITNESS_DESTROY(lock) (void)0 #define WITNESS_DEFINEORDER(lock1, lock2) 0 #define WITNESS_CHECKORDER(lock, flags, file, line, interlock) (void)0 #define WITNESS_LOCK(lock, flags, file, line) (void)0 #define WITNESS_UPGRADE(lock, flags, file, line) (void)0 #define WITNESS_DOWNGRADE(lock, flags, file, line) (void)0 #define WITNESS_UNLOCK(lock, flags, file, line) (void)0 #define WITNESS_CHECK(flags, lock, fmt, ...) 0 #define WITNESS_WARN(flags, lock, fmt, ...) (void)0 #define WITNESS_SAVE_DECL(n) (void)0 #define WITNESS_SAVE(lock, n) (void)0 #define WITNESS_RESTORE(lock, n) (void)0 #define WITNESS_NORELEASE(lock) (void)0 #define WITNESS_RELEASEOK(lock) (void)0 #define WITNESS_FILE(lock) ("?") #define WITNESS_LINE(lock) (0) #endif /* WITNESS */ #endif /* _KERNEL */ #endif /* _SYS_LOCK_H_ */ Index: user/alc/PQ_LAUNDRY =================================================================== --- user/alc/PQ_LAUNDRY (revision 303653) +++ user/alc/PQ_LAUNDRY (revision 303654) Property changes on: user/alc/PQ_LAUNDRY ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r303642-303653