Index: head/sys/dev/cxgbe/t4_main.c =================================================================== --- head/sys/dev/cxgbe/t4_main.c (revision 348253) +++ head/sys/dev/cxgbe/t4_main.c (revision 348254) @@ -1,10850 +1,10845 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #if defined(__i386__) || defined(__amd64__) #include #include #include #include #endif #include #ifdef DDB #include #include #endif #include "common/common.h" #include "common/t4_msg.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "cudbg/cudbg.h" #include "t4_clip.h" #include "t4_ioctl.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #include "t4_if.h" #include "t4_smt.h" /* T4 bus driver interface */ static int t4_probe(device_t); static int t4_attach(device_t); static int t4_detach(device_t); static int t4_child_location_str(device_t, device_t, char *, size_t); static int t4_ready(device_t); static int t4_read_port_device(device_t, int, device_t *); static device_method_t t4_methods[] = { DEVMETHOD(device_probe, t4_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(bus_child_location_str, t4_child_location_str), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t4_driver = { "t4nex", t4_methods, sizeof(struct adapter) }; /* T4 port (cxgbe) interface */ static int cxgbe_probe(device_t); static int cxgbe_attach(device_t); static int cxgbe_detach(device_t); device_method_t cxgbe_methods[] = { DEVMETHOD(device_probe, cxgbe_probe), DEVMETHOD(device_attach, cxgbe_attach), DEVMETHOD(device_detach, cxgbe_detach), { 0, 0 } }; static driver_t cxgbe_driver = { "cxgbe", cxgbe_methods, sizeof(struct port_info) }; /* T4 VI (vcxgbe) interface */ static int vcxgbe_probe(device_t); static int vcxgbe_attach(device_t); static int vcxgbe_detach(device_t); static device_method_t vcxgbe_methods[] = { DEVMETHOD(device_probe, vcxgbe_probe), DEVMETHOD(device_attach, vcxgbe_attach), DEVMETHOD(device_detach, vcxgbe_detach), { 0, 0 } }; static driver_t vcxgbe_driver = { "vcxgbe", vcxgbe_methods, sizeof(struct vi_info) }; static d_ioctl_t t4_ioctl; static struct cdevsw t4_cdevsw = { .d_version = D_VERSION, .d_ioctl = t4_ioctl, .d_name = "t4nex", }; /* T5 bus driver interface */ static int t5_probe(device_t); static device_method_t t5_methods[] = { DEVMETHOD(device_probe, t5_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(bus_child_location_str, t4_child_location_str), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t5_driver = { "t5nex", t5_methods, sizeof(struct adapter) }; /* T5 port (cxl) interface */ static driver_t cxl_driver = { "cxl", cxgbe_methods, sizeof(struct port_info) }; /* T5 VI (vcxl) interface */ static driver_t vcxl_driver = { "vcxl", vcxgbe_methods, sizeof(struct vi_info) }; /* T6 bus driver interface */ static int t6_probe(device_t); static device_method_t t6_methods[] = { DEVMETHOD(device_probe, t6_probe), DEVMETHOD(device_attach, t4_attach), DEVMETHOD(device_detach, t4_detach), DEVMETHOD(bus_child_location_str, t4_child_location_str), DEVMETHOD(t4_is_main_ready, t4_ready), DEVMETHOD(t4_read_port_device, t4_read_port_device), DEVMETHOD_END }; static driver_t t6_driver = { "t6nex", t6_methods, sizeof(struct adapter) }; /* T6 port (cc) interface */ static driver_t cc_driver = { "cc", cxgbe_methods, sizeof(struct port_info) }; /* T6 VI (vcc) interface */ static driver_t vcc_driver = { "vcc", vcxgbe_methods, sizeof(struct vi_info) }; /* ifnet interface */ static void cxgbe_init(void *); static int cxgbe_ioctl(struct ifnet *, unsigned long, caddr_t); static int cxgbe_transmit(struct ifnet *, struct mbuf *); static void cxgbe_qflush(struct ifnet *); MALLOC_DEFINE(M_CXGBE, "cxgbe", "Chelsio T4/T5 Ethernet driver and services"); /* * Correct lock order when you need to acquire multiple locks is t4_list_lock, * then ADAPTER_LOCK, then t4_uld_list_lock. */ static struct sx t4_list_lock; SLIST_HEAD(, adapter) t4_list; #ifdef TCP_OFFLOAD static struct sx t4_uld_list_lock; SLIST_HEAD(, uld_info) t4_uld_list; #endif /* * Tunables. See tweak_tunables() too. * * Each tunable is set to a default value here if it's known at compile-time. * Otherwise it is set to -n as an indication to tweak_tunables() that it should * provide a reasonable default (upto n) when the driver is loaded. * * Tunables applicable to both T4 and T5 are under hw.cxgbe. Those specific to * T5 are under hw.cxl. */ SYSCTL_NODE(_hw, OID_AUTO, cxgbe, CTLFLAG_RD, 0, "cxgbe(4) parameters"); SYSCTL_NODE(_hw, OID_AUTO, cxl, CTLFLAG_RD, 0, "cxgbe(4) T5+ parameters"); SYSCTL_NODE(_hw_cxgbe, OID_AUTO, toe, CTLFLAG_RD, 0, "cxgbe(4) TOE parameters"); /* * Number of queues for tx and rx, NIC and offload. */ #define NTXQ 16 int t4_ntxq = -NTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq, CTLFLAG_RDTUN, &t4_ntxq, 0, "Number of TX queues per port"); TUNABLE_INT("hw.cxgbe.ntxq10g", &t4_ntxq); /* Old name, undocumented */ #define NRXQ 8 int t4_nrxq = -NRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq, CTLFLAG_RDTUN, &t4_nrxq, 0, "Number of RX queues per port"); TUNABLE_INT("hw.cxgbe.nrxq10g", &t4_nrxq); /* Old name, undocumented */ #define NTXQ_VI 1 static int t4_ntxq_vi = -NTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, ntxq_vi, CTLFLAG_RDTUN, &t4_ntxq_vi, 0, "Number of TX queues per VI"); #define NRXQ_VI 1 static int t4_nrxq_vi = -NRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nrxq_vi, CTLFLAG_RDTUN, &t4_nrxq_vi, 0, "Number of RX queues per VI"); static int t4_rsrv_noflowq = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, rsrv_noflowq, CTLFLAG_RDTUN, &t4_rsrv_noflowq, 0, "Reserve TX queue 0 of each VI for non-flowid packets"); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) #define NOFLDTXQ 8 static int t4_nofldtxq = -NOFLDTXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0, "Number of offload TX queues per port"); #define NOFLDRXQ 2 static int t4_nofldrxq = -NOFLDRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0, "Number of offload RX queues per port"); #define NOFLDTXQ_VI 1 static int t4_nofldtxq_vi = -NOFLDTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq_vi, CTLFLAG_RDTUN, &t4_nofldtxq_vi, 0, "Number of offload TX queues per VI"); #define NOFLDRXQ_VI 1 static int t4_nofldrxq_vi = -NOFLDRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq_vi, CTLFLAG_RDTUN, &t4_nofldrxq_vi, 0, "Number of offload RX queues per VI"); #define TMR_IDX_OFLD 1 int t4_tmr_idx_ofld = TMR_IDX_OFLD; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx_ofld, CTLFLAG_RDTUN, &t4_tmr_idx_ofld, 0, "Holdoff timer index for offload queues"); #define PKTC_IDX_OFLD (-1) int t4_pktc_idx_ofld = PKTC_IDX_OFLD; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx_ofld, CTLFLAG_RDTUN, &t4_pktc_idx_ofld, 0, "holdoff packet counter index for offload queues"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_keepalive_idle = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_idle, CTLFLAG_RDTUN, &t4_toe_keepalive_idle, 0, "TOE keepalive idle timer (us)"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_keepalive_interval = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, keepalive_interval, CTLFLAG_RDTUN, &t4_toe_keepalive_interval, 0, "TOE keepalive interval timer (us)"); /* 0 means chip/fw default, non-zero number is # of keepalives before abort */ static int t4_toe_keepalive_count = 0; SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, keepalive_count, CTLFLAG_RDTUN, &t4_toe_keepalive_count, 0, "Number of TOE keepalive probes before abort"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_rexmt_min = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_min, CTLFLAG_RDTUN, &t4_toe_rexmt_min, 0, "Minimum TOE retransmit interval (us)"); /* 0 means chip/fw default, non-zero number is value in microseconds */ static u_long t4_toe_rexmt_max = 0; SYSCTL_ULONG(_hw_cxgbe_toe, OID_AUTO, rexmt_max, CTLFLAG_RDTUN, &t4_toe_rexmt_max, 0, "Maximum TOE retransmit interval (us)"); /* 0 means chip/fw default, non-zero number is # of rexmt before abort */ static int t4_toe_rexmt_count = 0; SYSCTL_INT(_hw_cxgbe_toe, OID_AUTO, rexmt_count, CTLFLAG_RDTUN, &t4_toe_rexmt_count, 0, "Number of TOE retransmissions before abort"); /* -1 means chip/fw default, other values are raw backoff values to use */ static int t4_toe_rexmt_backoff[16] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; SYSCTL_NODE(_hw_cxgbe_toe, OID_AUTO, rexmt_backoff, CTLFLAG_RD, 0, "cxgbe(4) TOE retransmit backoff values"); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 0, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[0], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 1, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[1], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 2, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[2], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 3, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[3], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 4, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[4], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 5, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[5], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 6, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[6], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 7, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[7], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 8, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[8], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 9, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[9], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 10, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[10], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 11, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[11], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 12, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[12], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 13, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[13], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 14, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[14], 0, ""); SYSCTL_INT(_hw_cxgbe_toe_rexmt_backoff, OID_AUTO, 15, CTLFLAG_RDTUN, &t4_toe_rexmt_backoff[15], 0, ""); #endif #ifdef DEV_NETMAP #define NNMTXQ_VI 2 static int t4_nnmtxq_vi = -NNMTXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmtxq_vi, CTLFLAG_RDTUN, &t4_nnmtxq_vi, 0, "Number of netmap TX queues per VI"); #define NNMRXQ_VI 2 static int t4_nnmrxq_vi = -NNMRXQ_VI; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nnmrxq_vi, CTLFLAG_RDTUN, &t4_nnmrxq_vi, 0, "Number of netmap RX queues per VI"); #endif /* * Holdoff parameters for ports. */ #define TMR_IDX 1 int t4_tmr_idx = TMR_IDX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_timer_idx, CTLFLAG_RDTUN, &t4_tmr_idx, 0, "Holdoff timer index"); TUNABLE_INT("hw.cxgbe.holdoff_timer_idx_10G", &t4_tmr_idx); /* Old name */ #define PKTC_IDX (-1) int t4_pktc_idx = PKTC_IDX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, holdoff_pktc_idx, CTLFLAG_RDTUN, &t4_pktc_idx, 0, "Holdoff packet counter index"); TUNABLE_INT("hw.cxgbe.holdoff_pktc_idx_10G", &t4_pktc_idx); /* Old name */ /* * Size (# of entries) of each tx and rx queue. */ unsigned int t4_qsize_txq = TX_EQ_QSIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_txq, CTLFLAG_RDTUN, &t4_qsize_txq, 0, "Number of descriptors in each TX queue"); unsigned int t4_qsize_rxq = RX_IQ_QSIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, qsize_rxq, CTLFLAG_RDTUN, &t4_qsize_rxq, 0, "Number of descriptors in each RX queue"); /* * Interrupt types allowed (bits 0, 1, 2 = INTx, MSI, MSI-X respectively). */ int t4_intr_types = INTR_MSIX | INTR_MSI | INTR_INTX; SYSCTL_INT(_hw_cxgbe, OID_AUTO, interrupt_types, CTLFLAG_RDTUN, &t4_intr_types, 0, "Interrupt types allowed (bit 0 = INTx, 1 = MSI, 2 = MSI-X)"); /* * Configuration file. All the _CF names here are special. */ #define DEFAULT_CF "default" #define BUILTIN_CF "built-in" #define FLASH_CF "flash" #define UWIRE_CF "uwire" #define FPGA_CF "fpga" static char t4_cfg_file[32] = DEFAULT_CF; SYSCTL_STRING(_hw_cxgbe, OID_AUTO, config_file, CTLFLAG_RDTUN, t4_cfg_file, sizeof(t4_cfg_file), "Firmware configuration file"); /* * PAUSE settings (bit 0, 1, 2 = rx_pause, tx_pause, pause_autoneg respectively). * rx_pause = 1 to heed incoming PAUSE frames, 0 to ignore them. * tx_pause = 1 to emit PAUSE frames when the rx FIFO reaches its high water * mark or when signalled to do so, 0 to never emit PAUSE. * pause_autoneg = 1 means PAUSE will be negotiated if possible and the * negotiated settings will override rx_pause/tx_pause. * Otherwise rx_pause/tx_pause are applied forcibly. */ static int t4_pause_settings = PAUSE_RX | PAUSE_TX | PAUSE_AUTONEG; SYSCTL_INT(_hw_cxgbe, OID_AUTO, pause_settings, CTLFLAG_RDTUN, &t4_pause_settings, 0, "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)"); /* * Forward Error Correction settings (bit 0, 1 = RS, BASER respectively). * -1 to run with the firmware default. Same as FEC_AUTO (bit 5) * 0 to disable FEC. */ static int t4_fec = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fec, CTLFLAG_RDTUN, &t4_fec, 0, "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)"); /* * Link autonegotiation. * -1 to run with the firmware default. * 0 to disable. * 1 to enable. */ static int t4_autoneg = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, autoneg, CTLFLAG_RDTUN, &t4_autoneg, 0, "Link autonegotiation"); /* * Firmware auto-install by driver during attach (0, 1, 2 = prohibited, allowed, * encouraged respectively). '-n' is the same as 'n' except the firmware * version used in the checks is read from the firmware bundled with the driver. */ static int t4_fw_install = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fw_install, CTLFLAG_RDTUN, &t4_fw_install, 0, "Firmware auto-install (0 = prohibited, 1 = allowed, 2 = encouraged)"); /* * ASIC features that will be used. Disable the ones you don't want so that the * chip resources aren't wasted on features that will not be used. */ static int t4_nbmcaps_allowed = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nbmcaps_allowed, CTLFLAG_RDTUN, &t4_nbmcaps_allowed, 0, "Default NBM capabilities"); static int t4_linkcaps_allowed = 0; /* No DCBX, PPP, etc. by default */ SYSCTL_INT(_hw_cxgbe, OID_AUTO, linkcaps_allowed, CTLFLAG_RDTUN, &t4_linkcaps_allowed, 0, "Default link capabilities"); static int t4_switchcaps_allowed = FW_CAPS_CONFIG_SWITCH_INGRESS | FW_CAPS_CONFIG_SWITCH_EGRESS; SYSCTL_INT(_hw_cxgbe, OID_AUTO, switchcaps_allowed, CTLFLAG_RDTUN, &t4_switchcaps_allowed, 0, "Default switch capabilities"); #ifdef RATELIMIT static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | FW_CAPS_CONFIG_NIC_HASHFILTER | FW_CAPS_CONFIG_NIC_ETHOFLD; #else static int t4_niccaps_allowed = FW_CAPS_CONFIG_NIC | FW_CAPS_CONFIG_NIC_HASHFILTER; #endif SYSCTL_INT(_hw_cxgbe, OID_AUTO, niccaps_allowed, CTLFLAG_RDTUN, &t4_niccaps_allowed, 0, "Default NIC capabilities"); static int t4_toecaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, toecaps_allowed, CTLFLAG_RDTUN, &t4_toecaps_allowed, 0, "Default TCP offload capabilities"); static int t4_rdmacaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, rdmacaps_allowed, CTLFLAG_RDTUN, &t4_rdmacaps_allowed, 0, "Default RDMA capabilities"); static int t4_cryptocaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cryptocaps_allowed, CTLFLAG_RDTUN, &t4_cryptocaps_allowed, 0, "Default crypto capabilities"); static int t4_iscsicaps_allowed = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, iscsicaps_allowed, CTLFLAG_RDTUN, &t4_iscsicaps_allowed, 0, "Default iSCSI capabilities"); static int t4_fcoecaps_allowed = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fcoecaps_allowed, CTLFLAG_RDTUN, &t4_fcoecaps_allowed, 0, "Default FCoE capabilities"); static int t5_write_combine = 0; SYSCTL_INT(_hw_cxl, OID_AUTO, write_combine, CTLFLAG_RDTUN, &t5_write_combine, 0, "Use WC instead of UC for BAR2"); static int t4_num_vis = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, num_vis, CTLFLAG_RDTUN, &t4_num_vis, 0, "Number of VIs per port"); /* * PCIe Relaxed Ordering. * -1: driver should figure out a good value. * 0: disable RO. * 1: enable RO. * 2: leave RO alone. */ static int pcie_relaxed_ordering = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, pcie_relaxed_ordering, CTLFLAG_RDTUN, &pcie_relaxed_ordering, 0, "PCIe Relaxed Ordering: 0 = disable, 1 = enable, 2 = leave alone"); static int t4_panic_on_fatal_err = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, panic_on_fatal_err, CTLFLAG_RDTUN, &t4_panic_on_fatal_err, 0, "panic on fatal errors"); #ifdef TCP_OFFLOAD /* * TOE tunables. */ static int t4_cop_managed_offloading = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cop_managed_offloading, CTLFLAG_RDTUN, &t4_cop_managed_offloading, 0, "COP (Connection Offload Policy) controls all TOE offload"); #endif /* Functions used by VIs to obtain unique MAC addresses for each VI. */ static int vi_mac_funcs[] = { FW_VI_FUNC_ETH, FW_VI_FUNC_OFLD, FW_VI_FUNC_IWARP, FW_VI_FUNC_OPENISCSI, FW_VI_FUNC_OPENFCOE, FW_VI_FUNC_FOISCSI, FW_VI_FUNC_FOFCOE, }; struct intrs_and_queues { uint16_t intr_type; /* INTx, MSI, or MSI-X */ uint16_t num_vis; /* number of VIs for each port */ uint16_t nirq; /* Total # of vectors */ uint16_t ntxq; /* # of NIC txq's for each port */ uint16_t nrxq; /* # of NIC rxq's for each port */ uint16_t nofldtxq; /* # of TOE/ETHOFLD txq's for each port */ uint16_t nofldrxq; /* # of TOE rxq's for each port */ /* The vcxgbe/vcxl interfaces use these and not the ones above. */ uint16_t ntxq_vi; /* # of NIC txq's */ uint16_t nrxq_vi; /* # of NIC rxq's */ uint16_t nofldtxq_vi; /* # of TOE txq's */ uint16_t nofldrxq_vi; /* # of TOE rxq's */ uint16_t nnmtxq_vi; /* # of netmap txq's */ uint16_t nnmrxq_vi; /* # of netmap rxq's */ }; static void setup_memwin(struct adapter *); static void position_memwin(struct adapter *, int, uint32_t); static int validate_mem_range(struct adapter *, uint32_t, uint32_t); static int fwmtype_to_hwmtype(int); static int validate_mt_off_len(struct adapter *, int, uint32_t, uint32_t, uint32_t *); static int fixup_devlog_params(struct adapter *); static int cfg_itype_and_nqueues(struct adapter *, struct intrs_and_queues *); static int contact_firmware(struct adapter *); static int partition_resources(struct adapter *); static int get_params__pre_init(struct adapter *); static int set_params__pre_init(struct adapter *); static int get_params__post_init(struct adapter *); static int set_params__post_init(struct adapter *); static void t4_set_desc(struct adapter *); static bool fixed_ifmedia(struct port_info *); static void build_medialist(struct port_info *); static void init_link_config(struct port_info *); static int fixup_link_config(struct port_info *); static int apply_link_config(struct port_info *); static int cxgbe_init_synchronized(struct vi_info *); static int cxgbe_uninit_synchronized(struct vi_info *); static void quiesce_txq(struct adapter *, struct sge_txq *); static void quiesce_wrq(struct adapter *, struct sge_wrq *); static void quiesce_iq(struct adapter *, struct sge_iq *); static void quiesce_fl(struct adapter *, struct sge_fl *); static int t4_alloc_irq(struct adapter *, struct irq *, int rid, driver_intr_t *, void *, char *); static int t4_free_irq(struct adapter *, struct irq *); static void get_regs(struct adapter *, struct t4_regdump *, uint8_t *); static void vi_refresh_stats(struct adapter *, struct vi_info *); static void cxgbe_refresh_stats(struct adapter *, struct port_info *); static void cxgbe_tick(void *); static void cxgbe_sysctls(struct port_info *); static int sysctl_int_array(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS); static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS); static int sysctl_btphy(SYSCTL_HANDLER_ARGS); static int sysctl_noflowq(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS); static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS); static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS); static int sysctl_fec(SYSCTL_HANDLER_ARGS); static int sysctl_autoneg(SYSCTL_HANDLER_ARGS); static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS); static int sysctl_temperature(SYSCTL_HANDLER_ARGS); static int sysctl_loadavg(SYSCTL_HANDLER_ARGS); static int sysctl_cctrl(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS); static int sysctl_cim_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS); static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS); static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS); static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_devlog(SYSCTL_HANDLER_ARGS); static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS); static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS); static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS); static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS); static int sysctl_meminfo(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS); static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS); static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS); static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS); static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tids(SYSCTL_HANDLER_ARGS); static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS); static int sysctl_tp_la(SYSCTL_HANDLER_ARGS); static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS); static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS); static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS); static int sysctl_cpus(SYSCTL_HANDLER_ARGS); #ifdef TCP_OFFLOAD static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS); static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS); static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS); static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS); static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS); static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS); static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS); #endif static int get_sge_context(struct adapter *, struct t4_sge_context *); static int load_fw(struct adapter *, struct t4_data *); static int load_cfg(struct adapter *, struct t4_data *); static int load_boot(struct adapter *, struct t4_bootrom *); static int load_bootcfg(struct adapter *, struct t4_data *); static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *); static void free_offload_policy(struct t4_offload_policy *); static int set_offload_policy(struct adapter *, struct t4_offload_policy *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *, int); #endif static int mod_event(module_t, int, void *); static int notify_siblings(device_t, int); struct { uint16_t device; char *desc; } t4_pciids[] = { {0xa000, "Chelsio Terminator 4 FPGA"}, {0x4400, "Chelsio T440-dbg"}, {0x4401, "Chelsio T420-CR"}, {0x4402, "Chelsio T422-CR"}, {0x4403, "Chelsio T440-CR"}, {0x4404, "Chelsio T420-BCH"}, {0x4405, "Chelsio T440-BCH"}, {0x4406, "Chelsio T440-CH"}, {0x4407, "Chelsio T420-SO"}, {0x4408, "Chelsio T420-CX"}, {0x4409, "Chelsio T420-BT"}, {0x440a, "Chelsio T404-BT"}, {0x440e, "Chelsio T440-LP-CR"}, }, t5_pciids[] = { {0xb000, "Chelsio Terminator 5 FPGA"}, {0x5400, "Chelsio T580-dbg"}, {0x5401, "Chelsio T520-CR"}, /* 2 x 10G */ {0x5402, "Chelsio T522-CR"}, /* 2 x 10G, 2 X 1G */ {0x5403, "Chelsio T540-CR"}, /* 4 x 10G */ {0x5407, "Chelsio T520-SO"}, /* 2 x 10G, nomem */ {0x5409, "Chelsio T520-BT"}, /* 2 x 10GBaseT */ {0x540a, "Chelsio T504-BT"}, /* 4 x 1G */ {0x540d, "Chelsio T580-CR"}, /* 2 x 40G */ {0x540e, "Chelsio T540-LP-CR"}, /* 4 x 10G */ {0x5410, "Chelsio T580-LP-CR"}, /* 2 x 40G */ {0x5411, "Chelsio T520-LL-CR"}, /* 2 x 10G */ {0x5412, "Chelsio T560-CR"}, /* 1 x 40G, 2 x 10G */ {0x5414, "Chelsio T580-LP-SO-CR"}, /* 2 x 40G, nomem */ {0x5415, "Chelsio T502-BT"}, /* 2 x 1G */ {0x5418, "Chelsio T540-BT"}, /* 4 x 10GBaseT */ {0x5419, "Chelsio T540-LP-BT"}, /* 4 x 10GBaseT */ {0x541a, "Chelsio T540-SO-BT"}, /* 4 x 10GBaseT, nomem */ {0x541b, "Chelsio T540-SO-CR"}, /* 4 x 10G, nomem */ /* Custom */ {0x5483, "Custom T540-CR"}, {0x5484, "Custom T540-BT"}, }, t6_pciids[] = { {0xc006, "Chelsio Terminator 6 FPGA"}, /* T6 PE10K6 FPGA (PF0) */ {0x6400, "Chelsio T6-DBG-25"}, /* 2 x 10/25G, debug */ {0x6401, "Chelsio T6225-CR"}, /* 2 x 10/25G */ {0x6402, "Chelsio T6225-SO-CR"}, /* 2 x 10/25G, nomem */ {0x6403, "Chelsio T6425-CR"}, /* 4 x 10/25G */ {0x6404, "Chelsio T6425-SO-CR"}, /* 4 x 10/25G, nomem */ {0x6405, "Chelsio T6225-OCP-SO"}, /* 2 x 10/25G, nomem */ {0x6406, "Chelsio T62100-OCP-SO"}, /* 2 x 40/50/100G, nomem */ {0x6407, "Chelsio T62100-LP-CR"}, /* 2 x 40/50/100G */ {0x6408, "Chelsio T62100-SO-CR"}, /* 2 x 40/50/100G, nomem */ {0x6409, "Chelsio T6210-BT"}, /* 2 x 10GBASE-T */ {0x640d, "Chelsio T62100-CR"}, /* 2 x 40/50/100G */ {0x6410, "Chelsio T6-DBG-100"}, /* 2 x 40/50/100G, debug */ {0x6411, "Chelsio T6225-LL-CR"}, /* 2 x 10/25G */ {0x6414, "Chelsio T61100-OCP-SO"}, /* 1 x 40/50/100G, nomem */ {0x6415, "Chelsio T6201-BT"}, /* 2 x 1000BASE-T */ /* Custom */ {0x6480, "Custom T6225-CR"}, {0x6481, "Custom T62100-CR"}, {0x6482, "Custom T6225-CR"}, {0x6483, "Custom T62100-CR"}, {0x6484, "Custom T64100-CR"}, {0x6485, "Custom T6240-SO"}, {0x6486, "Custom T6225-SO-CR"}, {0x6487, "Custom T6225-CR"}, }; #ifdef TCP_OFFLOAD /* * service_iq_fl() has an iq and needs the fl. Offset of fl from the iq should * be exactly the same for both rxq and ofld_rxq. */ CTASSERT(offsetof(struct sge_ofld_rxq, iq) == offsetof(struct sge_rxq, iq)); CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl)); #endif CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE); static int t4_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xa000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t4_pciids); i++) { if (d == t4_pciids[i].device) { device_set_desc(dev, t4_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t5_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); uint8_t f = pci_get_function(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); /* Attach only to PF0 of the FPGA */ if (d == 0xb000 && f != 0) return (ENXIO); for (i = 0; i < nitems(t5_pciids); i++) { if (d == t5_pciids[i].device) { device_set_desc(dev, t5_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static int t6_probe(device_t dev) { int i; uint16_t v = pci_get_vendor(dev); uint16_t d = pci_get_device(dev); if (v != PCI_VENDOR_ID_CHELSIO) return (ENXIO); for (i = 0; i < nitems(t6_pciids); i++) { if (d == t6_pciids[i].device) { device_set_desc(dev, t6_pciids[i].desc); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void t5_attribute_workaround(device_t dev) { device_t root_port; uint32_t v; /* * The T5 chips do not properly echo the No Snoop and Relaxed * Ordering attributes when replying to a TLP from a Root * Port. As a workaround, find the parent Root Port and * disable No Snoop and Relaxed Ordering. Note that this * affects all devices under this root port. */ root_port = pci_find_pcie_root_port(dev); if (root_port == NULL) { device_printf(dev, "Unable to find parent root port\n"); return; } v = pcie_adjust_config(root_port, PCIER_DEVICE_CTL, PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE, 0, 2); if ((v & (PCIEM_CTL_RELAXED_ORD_ENABLE | PCIEM_CTL_NOSNOOP_ENABLE)) != 0) device_printf(dev, "Disabled No Snoop/Relaxed Ordering on %s\n", device_get_nameunit(root_port)); } static const struct devnames devnames[] = { { .nexus_name = "t4nex", .ifnet_name = "cxgbe", .vi_ifnet_name = "vcxgbe", .pf03_drv_name = "t4iov", .vf_nexus_name = "t4vf", .vf_ifnet_name = "cxgbev" }, { .nexus_name = "t5nex", .ifnet_name = "cxl", .vi_ifnet_name = "vcxl", .pf03_drv_name = "t5iov", .vf_nexus_name = "t5vf", .vf_ifnet_name = "cxlv" }, { .nexus_name = "t6nex", .ifnet_name = "cc", .vi_ifnet_name = "vcc", .pf03_drv_name = "t6iov", .vf_nexus_name = "t6vf", .vf_ifnet_name = "ccv" } }; void t4_init_devnames(struct adapter *sc) { int id; id = chip_id(sc); if (id >= CHELSIO_T4 && id - CHELSIO_T4 < nitems(devnames)) sc->names = &devnames[id - CHELSIO_T4]; else { device_printf(sc->dev, "chip id %d is not supported.\n", id); sc->names = NULL; } } static int t4_ifnet_unit(struct adapter *sc, struct port_info *pi) { const char *parent, *name; long value; int line, unit; line = 0; parent = device_get_nameunit(sc->dev); name = sc->names->ifnet_name; while (resource_find_dev(&line, name, &unit, "at", parent) == 0) { if (resource_long_value(name, unit, "port", &value) == 0 && value == pi->port_id) return (unit); } return (-1); } static int t4_attach(device_t dev) { struct adapter *sc; int rc = 0, i, j, rqidx, tqidx, nports; struct make_dev_args mda; struct intrs_and_queues iaq; struct sge *s; uint32_t *buf; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) int ofld_tqidx; #endif #ifdef TCP_OFFLOAD int ofld_rqidx; #endif #ifdef DEV_NETMAP int nm_rqidx, nm_tqidx; #endif int num_vis; sc = device_get_softc(dev); sc->dev = dev; TUNABLE_INT_FETCH("hw.cxgbe.dflags", &sc->debug_flags); if ((pci_get_device(dev) & 0xff00) == 0x5400) t5_attribute_workaround(dev); pci_enable_busmaster(dev); if (pci_find_cap(dev, PCIY_EXPRESS, &i) == 0) { uint32_t v; pci_set_max_read_req(dev, 4096); v = pci_read_config(dev, i + PCIER_DEVICE_CTL, 2); sc->params.pci.mps = 128 << ((v & PCIEM_CTL_MAX_PAYLOAD) >> 5); if (pcie_relaxed_ordering == 0 && (v & PCIEM_CTL_RELAXED_ORD_ENABLE) != 0) { v &= ~PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); } else if (pcie_relaxed_ordering == 1 && (v & PCIEM_CTL_RELAXED_ORD_ENABLE) == 0) { v |= PCIEM_CTL_RELAXED_ORD_ENABLE; pci_write_config(dev, i + PCIER_DEVICE_CTL, v, 2); } } sc->sge_gts_reg = MYPF_REG(A_SGE_PF_GTS); sc->sge_kdoorbell_reg = MYPF_REG(A_SGE_PF_KDOORBELL); sc->traceq = -1; mtx_init(&sc->ifp_lock, sc->ifp_lockname, 0, MTX_DEF); snprintf(sc->ifp_lockname, sizeof(sc->ifp_lockname), "%s tracer", device_get_nameunit(dev)); snprintf(sc->lockname, sizeof(sc->lockname), "%s", device_get_nameunit(dev)); mtx_init(&sc->sc_lock, sc->lockname, 0, MTX_DEF); t4_add_adapter(sc); mtx_init(&sc->sfl_lock, "starving freelists", 0, MTX_DEF); TAILQ_INIT(&sc->sfl); callout_init_mtx(&sc->sfl_callout, &sc->sfl_lock, 0); mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF); sc->policy = NULL; rw_init(&sc->policy_lock, "connection offload policy"); rc = t4_map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ memset(sc->chan_map, 0xff, sizeof(sc->chan_map)); /* Prepare the adapter for operation. */ buf = malloc(PAGE_SIZE, M_CXGBE, M_ZERO | M_WAITOK); rc = -t4_prep_adapter(sc, buf); free(buf, M_CXGBE); if (rc != 0) { device_printf(dev, "failed to prepare adapter: %d.\n", rc); goto done; } /* * This is the real PF# to which we're attaching. Works from within PCI * passthrough environments too, where pci_get_function() could return a * different PF# depending on the passthrough configuration. We need to * use the real PF# in all our communication with the firmware. */ j = t4_read_reg(sc, A_PL_WHOAMI); sc->pf = chip_id(sc) <= CHELSIO_T5 ? G_SOURCEPF(j) : G_T6_SOURCEPF(j); sc->mbox = sc->pf; t4_init_devnames(sc); if (sc->names == NULL) { rc = ENOTSUP; goto done; /* error message displayed already */ } /* * Do this really early, with the memory windows set up even before the * character device. The userland tool's register i/o and mem read * will work even in "recovery mode". */ setup_memwin(sc); if (t4_init_devlog_params(sc, 0) == 0) fixup_devlog_params(sc); make_dev_args_init(&mda); mda.mda_devsw = &t4_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; rc = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); if (rc != 0) device_printf(dev, "failed to create nexus char device: %d.\n", rc); /* Go no further if recovery mode has been requested. */ if (TUNABLE_INT_FETCH("hw.cxgbe.sos", &i) && i != 0) { device_printf(dev, "recovery mode.\n"); goto done; } #if defined(__i386__) if ((cpu_feature & CPUID_CX8) == 0) { device_printf(dev, "64 bit atomics not available.\n"); rc = ENOTSUP; goto done; } #endif /* Contact the firmware and try to become the master driver. */ rc = contact_firmware(sc); if (rc != 0) goto done; /* error message displayed already */ MPASS(sc->flags & FW_OK); rc = get_params__pre_init(sc); if (rc != 0) goto done; /* error message displayed already */ if (sc->flags & MASTER_PF) { rc = partition_resources(sc); if (rc != 0) goto done; /* error message displayed already */ t4_intr_clear(sc); } rc = get_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = set_params__post_init(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_map_bar_2(sc); if (rc != 0) goto done; /* error message displayed already */ rc = t4_create_dma_tag(sc); if (rc != 0) goto done; /* error message displayed already */ /* * First pass over all the ports - allocate VIs and initialize some * basic parameters like mac address, port type, etc. */ for_each_port(sc, i) { struct port_info *pi; pi = malloc(sizeof(*pi), M_CXGBE, M_ZERO | M_WAITOK); sc->port[i] = pi; /* These must be set before t4_port_init */ pi->adapter = sc; pi->port_id = i; /* * XXX: vi[0] is special so we can't delay this allocation until * pi->nvi's final value is known. */ pi->vi = malloc(sizeof(struct vi_info) * t4_num_vis, M_CXGBE, M_ZERO | M_WAITOK); /* * Allocate the "main" VI and initialize parameters * like mac addr. */ rc = -t4_port_init(sc, sc->mbox, sc->pf, 0, i); if (rc != 0) { device_printf(dev, "unable to initialize port %d: %d\n", i, rc); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); sc->port[i] = NULL; goto done; } snprintf(pi->lockname, sizeof(pi->lockname), "%sp%d", device_get_nameunit(dev), i); mtx_init(&pi->pi_lock, pi->lockname, 0, MTX_DEF); sc->chan_map[pi->tx_chan] = i; /* All VIs on this port share this media. */ ifmedia_init(&pi->media, IFM_IMASK, cxgbe_media_change, cxgbe_media_status); PORT_LOCK(pi); init_link_config(pi); fixup_link_config(pi); build_medialist(pi); if (fixed_ifmedia(pi)) pi->flags |= FIXED_IFMEDIA; PORT_UNLOCK(pi); pi->dev = device_add_child(dev, sc->names->ifnet_name, t4_ifnet_unit(sc, pi)); if (pi->dev == NULL) { device_printf(dev, "failed to add device for port %d.\n", i); rc = ENXIO; goto done; } pi->vi[0].dev = pi->dev; device_set_softc(pi->dev, pi); } /* * Interrupt type, # of interrupts, # of rx/tx queues, etc. */ nports = sc->params.nports; rc = cfg_itype_and_nqueues(sc, &iaq); if (rc != 0) goto done; /* error message displayed already */ num_vis = iaq.num_vis; sc->intr_type = iaq.intr_type; sc->intr_count = iaq.nirq; s = &sc->sge; s->nrxq = nports * iaq.nrxq; s->ntxq = nports * iaq.ntxq; if (num_vis > 1) { s->nrxq += nports * (num_vis - 1) * iaq.nrxq_vi; s->ntxq += nports * (num_vis - 1) * iaq.ntxq_vi; } s->neq = s->ntxq + s->nrxq; /* the free list in an rxq is an eq */ s->neq += nports; /* ctrl queues: 1 per port */ s->niq = s->nrxq + 1; /* 1 extra for firmware event queue */ #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (is_offload(sc) || is_ethoffload(sc)) { s->nofldtxq = nports * iaq.nofldtxq; if (num_vis > 1) s->nofldtxq += nports * (num_vis - 1) * iaq.nofldtxq_vi; s->neq += s->nofldtxq; s->ofld_txq = malloc(s->nofldtxq * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { s->nofldrxq = nports * iaq.nofldrxq; if (num_vis > 1) s->nofldrxq += nports * (num_vis - 1) * iaq.nofldrxq_vi; s->neq += s->nofldrxq; /* free list */ s->niq += s->nofldrxq; s->ofld_rxq = malloc(s->nofldrxq * sizeof(struct sge_ofld_rxq), M_CXGBE, M_ZERO | M_WAITOK); } #endif #ifdef DEV_NETMAP if (num_vis > 1) { s->nnmrxq = nports * (num_vis - 1) * iaq.nnmrxq_vi; s->nnmtxq = nports * (num_vis - 1) * iaq.nnmtxq_vi; } s->neq += s->nnmtxq + s->nnmrxq; s->niq += s->nnmrxq; s->nm_rxq = malloc(s->nnmrxq * sizeof(struct sge_nm_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->nm_txq = malloc(s->nnmtxq * sizeof(struct sge_nm_txq), M_CXGBE, M_ZERO | M_WAITOK); #endif s->ctrlq = malloc(nports * sizeof(struct sge_wrq), M_CXGBE, M_ZERO | M_WAITOK); s->rxq = malloc(s->nrxq * sizeof(struct sge_rxq), M_CXGBE, M_ZERO | M_WAITOK); s->txq = malloc(s->ntxq * sizeof(struct sge_txq), M_CXGBE, M_ZERO | M_WAITOK); s->iqmap = malloc(s->niq * sizeof(struct sge_iq *), M_CXGBE, M_ZERO | M_WAITOK); s->eqmap = malloc(s->neq * sizeof(struct sge_eq *), M_CXGBE, M_ZERO | M_WAITOK); sc->irq = malloc(sc->intr_count * sizeof(struct irq), M_CXGBE, M_ZERO | M_WAITOK); t4_init_l2t(sc, M_WAITOK); t4_init_smt(sc, M_WAITOK); t4_init_tx_sched(sc); #ifdef RATELIMIT t4_init_etid_table(sc); #endif #ifdef INET6 t4_init_clip_table(sc); #endif if (sc->vres.key.size != 0) sc->key_map = vmem_create("T4TLS key map", sc->vres.key.start, sc->vres.key.size, 32, 0, M_FIRSTFIT | M_WAITOK); /* * Second pass over the ports. This time we know the number of rx and * tx queues that each port should get. */ rqidx = tqidx = 0; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) ofld_tqidx = 0; #endif #ifdef TCP_OFFLOAD ofld_rqidx = 0; #endif #ifdef DEV_NETMAP nm_rqidx = nm_tqidx = 0; #endif for_each_port(sc, i) { struct port_info *pi = sc->port[i]; struct vi_info *vi; if (pi == NULL) continue; pi->nvi = num_vis; for_each_vi(pi, j, vi) { vi->pi = pi; vi->qsize_rxq = t4_qsize_rxq; vi->qsize_txq = t4_qsize_txq; vi->first_rxq = rqidx; vi->first_txq = tqidx; vi->tmr_idx = t4_tmr_idx; vi->pktc_idx = t4_pktc_idx; vi->nrxq = j == 0 ? iaq.nrxq : iaq.nrxq_vi; vi->ntxq = j == 0 ? iaq.ntxq : iaq.ntxq_vi; rqidx += vi->nrxq; tqidx += vi->ntxq; if (j == 0 && vi->ntxq > 1) vi->rsrv_noflowq = t4_rsrv_noflowq ? 1 : 0; else vi->rsrv_noflowq = 0; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) vi->first_ofld_txq = ofld_tqidx; vi->nofldtxq = j == 0 ? iaq.nofldtxq : iaq.nofldtxq_vi; ofld_tqidx += vi->nofldtxq; #endif #ifdef TCP_OFFLOAD vi->ofld_tmr_idx = t4_tmr_idx_ofld; vi->ofld_pktc_idx = t4_pktc_idx_ofld; vi->first_ofld_rxq = ofld_rqidx; vi->nofldrxq = j == 0 ? iaq.nofldrxq : iaq.nofldrxq_vi; ofld_rqidx += vi->nofldrxq; #endif #ifdef DEV_NETMAP if (j > 0) { vi->first_nm_rxq = nm_rqidx; vi->first_nm_txq = nm_tqidx; vi->nnmrxq = iaq.nnmrxq_vi; vi->nnmtxq = iaq.nnmtxq_vi; nm_rqidx += vi->nnmrxq; nm_tqidx += vi->nnmtxq; } #endif } } rc = t4_setup_intr_handlers(sc); if (rc != 0) { device_printf(dev, "failed to setup interrupt handlers: %d\n", rc); goto done; } rc = bus_generic_probe(dev); if (rc != 0) { device_printf(dev, "failed to probe child drivers: %d\n", rc); goto done; } /* * Ensure thread-safe mailbox access (in debug builds). * * So far this was the only thread accessing the mailbox but various * ifnets and sysctls are about to be created and their handlers/ioctls * will access the mailbox from different threads. */ sc->flags |= CHK_MBOX_ACCESS; rc = bus_generic_attach(dev); if (rc != 0) { device_printf(dev, "failed to attach all child ports: %d\n", rc); goto done; } device_printf(dev, "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n", sc->params.pci.speed, sc->params.pci.width, sc->params.nports, sc->intr_count, sc->intr_type == INTR_MSIX ? "MSI-X" : (sc->intr_type == INTR_MSI ? "MSI" : "INTx"), sc->intr_count > 1 ? "s" : "", sc->sge.neq, sc->sge.niq); t4_set_desc(sc); notify_siblings(dev, 0); done: if (rc != 0 && sc->cdev) { /* cdev was created and so cxgbetool works; recover that way. */ device_printf(dev, "error during attach, adapter is now in recovery mode.\n"); rc = 0; } if (rc != 0) t4_detach_common(dev); else t4_sysctls(sc); return (rc); } static int t4_child_location_str(device_t bus, device_t dev, char *buf, size_t buflen) { struct adapter *sc; struct port_info *pi; int i; sc = device_get_softc(bus); buf[0] = '\0'; for_each_port(sc, i) { pi = sc->port[i]; if (pi != NULL && pi->dev == dev) { snprintf(buf, buflen, "port=%d", pi->port_id); break; } } return (0); } static int t4_ready(device_t dev) { struct adapter *sc; sc = device_get_softc(dev); if (sc->flags & FW_OK) return (0); return (ENXIO); } static int t4_read_port_device(device_t dev, int port, device_t *child) { struct adapter *sc; struct port_info *pi; sc = device_get_softc(dev); if (port < 0 || port >= MAX_NPORTS) return (EINVAL); pi = sc->port[port]; if (pi == NULL || pi->dev == NULL) return (ENXIO); *child = pi->dev; return (0); } static int notify_siblings(device_t dev, int detaching) { device_t sibling; int error, i; error = 0; for (i = 0; i < PCI_FUNCMAX; i++) { if (i == pci_get_function(dev)) continue; sibling = pci_find_dbsf(pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), i); if (sibling == NULL || !device_is_attached(sibling)) continue; if (detaching) error = T4_DETACH_CHILD(sibling); else (void)T4_ATTACH_CHILD(sibling); if (error) break; } return (error); } /* * Idempotent */ static int t4_detach(device_t dev) { struct adapter *sc; int rc; sc = device_get_softc(dev); rc = notify_siblings(dev, 1); if (rc) { device_printf(dev, "failed to detach sibling devices: %d\n", rc); return (rc); } return (t4_detach_common(dev)); } int t4_detach_common(device_t dev) { struct adapter *sc; struct port_info *pi; int i, rc; sc = device_get_softc(dev); if (sc->cdev) { destroy_dev(sc->cdev); sc->cdev = NULL; } sc->flags &= ~CHK_MBOX_ACCESS; if (sc->flags & FULL_INIT_DONE) { if (!(sc->flags & IS_VF)) t4_intr_disable(sc); } if (device_is_attached(dev)) { rc = bus_generic_detach(dev); if (rc) { device_printf(dev, "failed to detach child devices: %d\n", rc); return (rc); } } for (i = 0; i < sc->intr_count; i++) t4_free_irq(sc, &sc->irq[i]); if ((sc->flags & (IS_VF | FW_OK)) == FW_OK) t4_free_tx_sched(sc); for (i = 0; i < MAX_NPORTS; i++) { pi = sc->port[i]; if (pi) { t4_free_vi(sc, sc->mbox, sc->pf, 0, pi->vi[0].viid); if (pi->dev) device_delete_child(dev, pi->dev); mtx_destroy(&pi->pi_lock); free(pi->vi, M_CXGBE); free(pi, M_CXGBE); } } device_delete_children(dev); if (sc->flags & FULL_INIT_DONE) adapter_full_uninit(sc); if ((sc->flags & (IS_VF | FW_OK)) == FW_OK) t4_fw_bye(sc, sc->mbox); if (sc->intr_type == INTR_MSI || sc->intr_type == INTR_MSIX) pci_release_msi(dev); if (sc->regs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->regs_rid, sc->regs_res); if (sc->udbs_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->udbs_rid, sc->udbs_res); if (sc->msix_res) bus_release_resource(dev, SYS_RES_MEMORY, sc->msix_rid, sc->msix_res); if (sc->l2t) t4_free_l2t(sc->l2t); if (sc->smt) t4_free_smt(sc->smt); #ifdef RATELIMIT t4_free_etid_table(sc); #endif if (sc->key_map) vmem_destroy(sc->key_map); #ifdef INET6 t4_destroy_clip_table(sc); #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) free(sc->sge.ofld_txq, M_CXGBE); #endif #ifdef TCP_OFFLOAD free(sc->sge.ofld_rxq, M_CXGBE); #endif #ifdef DEV_NETMAP free(sc->sge.nm_rxq, M_CXGBE); free(sc->sge.nm_txq, M_CXGBE); #endif free(sc->irq, M_CXGBE); free(sc->sge.rxq, M_CXGBE); free(sc->sge.txq, M_CXGBE); free(sc->sge.ctrlq, M_CXGBE); free(sc->sge.iqmap, M_CXGBE); free(sc->sge.eqmap, M_CXGBE); free(sc->tids.ftid_tab, M_CXGBE); free(sc->tids.hpftid_tab, M_CXGBE); free_hftid_hash(&sc->tids); free(sc->tids.atid_tab, M_CXGBE); free(sc->tids.tid_tab, M_CXGBE); free(sc->tt.tls_rx_ports, M_CXGBE); t4_destroy_dma_tag(sc); if (mtx_initialized(&sc->sc_lock)) { sx_xlock(&t4_list_lock); SLIST_REMOVE(&t4_list, sc, adapter, link); sx_xunlock(&t4_list_lock); mtx_destroy(&sc->sc_lock); } callout_drain(&sc->sfl_callout); if (mtx_initialized(&sc->tids.ftid_lock)) { mtx_destroy(&sc->tids.ftid_lock); cv_destroy(&sc->tids.ftid_cv); } if (mtx_initialized(&sc->tids.atid_lock)) mtx_destroy(&sc->tids.atid_lock); if (mtx_initialized(&sc->sfl_lock)) mtx_destroy(&sc->sfl_lock); if (mtx_initialized(&sc->ifp_lock)) mtx_destroy(&sc->ifp_lock); if (mtx_initialized(&sc->reg_lock)) mtx_destroy(&sc->reg_lock); if (rw_initialized(&sc->policy_lock)) { rw_destroy(&sc->policy_lock); #ifdef TCP_OFFLOAD if (sc->policy != NULL) free_offload_policy(sc->policy); #endif } for (i = 0; i < NUM_MEMWIN; i++) { struct memwin *mw = &sc->memwin[i]; if (rw_initialized(&mw->mw_lock)) rw_destroy(&mw->mw_lock); } bzero(sc, sizeof(*sc)); return (0); } static int cxgbe_probe(device_t dev) { char buf[128]; struct port_info *pi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d", pi->port_id); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } #define T4_CAP (IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | \ IFCAP_VLAN_HWCSUM | IFCAP_TSO | IFCAP_JUMBO_MTU | IFCAP_LRO | \ IFCAP_VLAN_HWTSO | IFCAP_LINKSTATE | IFCAP_HWCSUM_IPV6 | IFCAP_HWSTATS | \ IFCAP_HWRXTSTMP) #define T4_CAP_ENABLE (T4_CAP) static int cxgbe_vi_attach(device_t dev, struct vi_info *vi) { struct ifnet *ifp; struct sbuf *sb; vi->xact_addr_filt = -1; callout_init(&vi->tick, 1); /* Allocate an ifnet and set it up */ ifp = if_alloc_dev(IFT_ETHER, dev); if (ifp == NULL) { device_printf(dev, "Cannot allocate ifnet\n"); return (ENOMEM); } vi->ifp = ifp; ifp->if_softc = vi; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = cxgbe_init; ifp->if_ioctl = cxgbe_ioctl; ifp->if_transmit = cxgbe_transmit; ifp->if_qflush = cxgbe_qflush; ifp->if_get_counter = cxgbe_get_counter; #ifdef RATELIMIT ifp->if_snd_tag_alloc = cxgbe_snd_tag_alloc; ifp->if_snd_tag_modify = cxgbe_snd_tag_modify; ifp->if_snd_tag_query = cxgbe_snd_tag_query; ifp->if_snd_tag_free = cxgbe_snd_tag_free; #endif ifp->if_capabilities = T4_CAP; ifp->if_capenable = T4_CAP_ENABLE; #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) ifp->if_capabilities |= IFCAP_TOE; #endif #ifdef RATELIMIT if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0) { ifp->if_capabilities |= IFCAP_TXRTLMT; ifp->if_capenable |= IFCAP_TXRTLMT; } #endif ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6; ifp->if_hw_tsomax = IP_MAXPACKET; ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; #ifdef RATELIMIT if (is_ethoffload(vi->pi->adapter) && vi->nofldtxq != 0) ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO; #endif ifp->if_hw_tsomaxsegsize = 65536; ether_ifattach(ifp, vi->hw_addr); #ifdef DEV_NETMAP if (vi->nnmrxq != 0) cxgbe_nm_attach(vi); #endif sb = sbuf_new_auto(); sbuf_printf(sb, "%d txq, %d rxq (NIC)", vi->ntxq, vi->nrxq); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) switch (ifp->if_capabilities & (IFCAP_TOE | IFCAP_TXRTLMT)) { case IFCAP_TOE: sbuf_printf(sb, "; %d txq (TOE)", vi->nofldtxq); break; case IFCAP_TOE | IFCAP_TXRTLMT: sbuf_printf(sb, "; %d txq (TOE/ETHOFLD)", vi->nofldtxq); break; case IFCAP_TXRTLMT: sbuf_printf(sb, "; %d txq (ETHOFLD)", vi->nofldtxq); break; } #endif #ifdef TCP_OFFLOAD if (ifp->if_capabilities & IFCAP_TOE) sbuf_printf(sb, ", %d rxq (TOE)", vi->nofldrxq); #endif #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) sbuf_printf(sb, "; %d txq, %d rxq (netmap)", vi->nnmtxq, vi->nnmrxq); #endif sbuf_finish(sb); device_printf(dev, "%s\n", sbuf_data(sb)); sbuf_delete(sb); vi_sysctls(vi); return (0); } static int cxgbe_attach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; struct vi_info *vi; int i, rc; callout_init_mtx(&pi->tick, &pi->pi_lock, 0); rc = cxgbe_vi_attach(dev, &pi->vi[0]); if (rc) return (rc); for_each_vi(pi, i, vi) { if (i == 0) continue; vi->dev = device_add_child(dev, sc->names->vi_ifnet_name, -1); if (vi->dev == NULL) { device_printf(dev, "failed to add VI %d\n", i); continue; } device_set_softc(vi->dev, vi); } cxgbe_sysctls(pi); bus_generic_attach(dev); return (0); } static void cxgbe_vi_detach(struct vi_info *vi) { struct ifnet *ifp = vi->ifp; ether_ifdetach(ifp); /* Let detach proceed even if these fail. */ #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) cxgbe_nm_detach(vi); #endif cxgbe_uninit_synchronized(vi); callout_drain(&vi->tick); vi_full_uninit(vi); if_free(vi->ifp); vi->ifp = NULL; } static int cxgbe_detach(device_t dev) { struct port_info *pi = device_get_softc(dev); struct adapter *sc = pi->adapter; int rc; /* Detach the extra VIs first. */ rc = bus_generic_detach(dev); if (rc) return (rc); device_delete_children(dev); doom_vi(sc, &pi->vi[0]); if (pi->flags & HAS_TRACEQ) { sc->traceq = -1; /* cloner should not create ifnet */ t4_tracer_port_detach(sc); } cxgbe_vi_detach(&pi->vi[0]); callout_drain(&pi->tick); ifmedia_removeall(&pi->media); end_synchronized_op(sc, 0); return (0); } static void cxgbe_init(void *arg) { struct vi_info *vi = arg; struct adapter *sc = vi->pi->adapter; if (begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4init") != 0) return; cxgbe_init_synchronized(vi); end_synchronized_op(sc, 0); } static int cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, caddr_t data) { int rc = 0, mtu, flags; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifreq *ifr = (struct ifreq *)data; uint32_t mask; switch (cmd) { case SIOCSIFMTU: mtu = ifr->ifr_mtu; if (mtu < ETHERMIN || mtu > MAX_MTU) return (EINVAL); rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4mtu"); if (rc) return (rc); ifp->if_mtu = mtu; if (vi->flags & VI_INIT_DONE) { t4_update_fl_bufsize(ifp); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MTU); } end_synchronized_op(sc, 0); break; case SIOCSIFFLAGS: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4flg"); if (rc) return (rc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { flags = vi->if_flags; if ((ifp->if_flags ^ flags) & (IFF_PROMISC | IFF_ALLMULTI)) { rc = update_mac_settings(ifp, XGMAC_PROMISC | XGMAC_ALLMULTI); } } else { rc = cxgbe_init_synchronized(vi); } vi->if_flags = ifp->if_flags; } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { rc = cxgbe_uninit_synchronized(vi); } end_synchronized_op(sc, 0); break; case SIOCADDMULTI: case SIOCDELMULTI: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4multi"); if (rc) return (rc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_MCADDRS); end_synchronized_op(sc, 0); break; case SIOCSIFCAP: rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4cap"); if (rc) return (rc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO4; if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & ifp->if_capenable && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO6; if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; /* * Note that we leave CSUM_TSO alone (it is always set). The * kernel takes both IFCAP_TSOx and CSUM_TSO into account before * sending a TSO request our way, so it's sufficient to toggle * IFCAP_TSOx only. */ if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & ifp->if_capenable) && !(IFCAP_TXCSUM & ifp->if_capenable)) { if_printf(ifp, "enable txcsum first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO4; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & ifp->if_capenable) && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { if_printf(ifp, "enable txcsum6 first.\n"); rc = EAGAIN; goto fail; } ifp->if_capenable ^= IFCAP_TSO6; } if (mask & IFCAP_LRO) { #if defined(INET) || defined(INET6) int i; struct sge_rxq *rxq; ifp->if_capenable ^= IFCAP_LRO; for_each_rxq(vi, i, rxq) { if (ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; else rxq->iq.flags &= ~IQ_LRO_ENABLED; } #endif } #ifdef TCP_OFFLOAD if (mask & IFCAP_TOE) { int enable = (ifp->if_capenable ^ mask) & IFCAP_TOE; rc = toe_capability(vi, enable); if (rc != 0) goto fail; ifp->if_capenable ^= mask; } #endif if (mask & IFCAP_VLAN_HWTAGGING) { ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (ifp->if_drv_flags & IFF_DRV_RUNNING) rc = update_mac_settings(ifp, XGMAC_VLANEX); } if (mask & IFCAP_VLAN_MTU) { ifp->if_capenable ^= IFCAP_VLAN_MTU; /* Need to find out how to disable auto-mtu-inflation */ } if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_VLAN_HWCSUM) ifp->if_capenable ^= IFCAP_VLAN_HWCSUM; #ifdef RATELIMIT if (mask & IFCAP_TXRTLMT) ifp->if_capenable ^= IFCAP_TXRTLMT; #endif if (mask & IFCAP_HWRXTSTMP) { int i; struct sge_rxq *rxq; ifp->if_capenable ^= IFCAP_HWRXTSTMP; for_each_rxq(vi, i, rxq) { if (ifp->if_capenable & IFCAP_HWRXTSTMP) rxq->iq.flags |= IQ_RX_TIMESTAMP; else rxq->iq.flags &= ~IQ_RX_TIMESTAMP; } } #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); #endif fail: end_synchronized_op(sc, 0); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: ifmedia_ioctl(ifp, ifr, &pi->media, cmd); break; case SIOCGI2C: { struct ifi2creq i2c; rc = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (rc != 0) break; if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) { rc = EPERM; break; } if (i2c.len > sizeof(i2c.data)) { rc = EINVAL; break; } rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4i2c"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, pi->port_id, i2c.dev_addr, i2c.offset, i2c.len, &i2c.data[0]); end_synchronized_op(sc, 0); if (rc == 0) rc = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); break; } default: rc = ether_ioctl(ifp, cmd, data); } return (rc); } static int cxgbe_transmit(struct ifnet *ifp, struct mbuf *m) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_txq *txq; void *items[1]; int rc; M_ASSERTPKTHDR(m); MPASS(m->m_nextpkt == NULL); /* not quite ready for this yet */ if (__predict_false(pi->link_cfg.link_ok == false)) { m_freem(m); return (ENETDOWN); } rc = parse_pkt(sc, &m); if (__predict_false(rc != 0)) { MPASS(m == NULL); /* was freed already */ atomic_add_int(&pi->tx_parse_error, 1); /* rare, atomic is ok */ return (rc); } #ifdef RATELIMIT - if (m->m_pkthdr.snd_tag != NULL) { - /* EAGAIN tells the stack we are not the correct interface. */ - if (__predict_false(ifp != m->m_pkthdr.snd_tag->ifp)) { - m_freem(m); - return (EAGAIN); - } - + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { + MPASS(m->m_pkthdr.snd_tag->ifp == ifp); return (ethofld_transmit(ifp, m)); } #endif /* Select a txq. */ txq = &sc->sge.txq[vi->first_txq]; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) txq += ((m->m_pkthdr.flowid % (vi->ntxq - vi->rsrv_noflowq)) + vi->rsrv_noflowq); items[0] = m; rc = mp_ring_enqueue(txq->r, items, 1, 4096); if (__predict_false(rc != 0)) m_freem(m); return (rc); } static void cxgbe_qflush(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct sge_txq *txq; int i; /* queues do not exist if !VI_INIT_DONE. */ if (vi->flags & VI_INIT_DONE) { for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_QFLUSH; TXQ_UNLOCK(txq); while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 0); pause("qflush", 1); } TXQ_LOCK(txq); txq->eq.flags &= ~EQ_QFLUSH; TXQ_UNLOCK(txq); } } if_qflush(ifp); } static uint64_t vi_get_counter(struct ifnet *ifp, ift_counter c) { struct vi_info *vi = ifp->if_softc; struct fw_vi_stats_vf *s = &vi->stats; vi_refresh_stats(vi->pi->adapter, vi); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_bcast_frames + s->rx_mcast_frames + s->rx_ucast_frames); case IFCOUNTER_IERRORS: return (s->rx_err_frames); case IFCOUNTER_OPACKETS: return (s->tx_bcast_frames + s->tx_mcast_frames + s->tx_ucast_frames + s->tx_offload_frames); case IFCOUNTER_OERRORS: return (s->tx_drop_frames); case IFCOUNTER_IBYTES: return (s->rx_bcast_bytes + s->rx_mcast_bytes + s->rx_ucast_bytes); case IFCOUNTER_OBYTES: return (s->tx_bcast_bytes + s->tx_mcast_bytes + s->tx_ucast_bytes + s->tx_offload_bytes); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = 0; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->drops); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } uint64_t cxgbe_get_counter(struct ifnet *ifp, ift_counter c) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct port_stats *s = &pi->stats; if (pi->nvi > 1 || sc->flags & IS_VF) return (vi_get_counter(ifp, c)); cxgbe_refresh_stats(sc, pi); switch (c) { case IFCOUNTER_IPACKETS: return (s->rx_frames); case IFCOUNTER_IERRORS: return (s->rx_jabber + s->rx_runt + s->rx_too_long + s->rx_fcs_err + s->rx_len_err); case IFCOUNTER_OPACKETS: return (s->tx_frames); case IFCOUNTER_OERRORS: return (s->tx_error_frames); case IFCOUNTER_IBYTES: return (s->rx_octets); case IFCOUNTER_OBYTES: return (s->tx_octets); case IFCOUNTER_IMCASTS: return (s->rx_mcast_frames); case IFCOUNTER_OMCASTS: return (s->tx_mcast_frames); case IFCOUNTER_IQDROPS: return (s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 + s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 + s->rx_trunc3 + pi->tnl_cong_drops); case IFCOUNTER_OQDROPS: { uint64_t drops; drops = s->tx_drop; if (vi->flags & VI_INIT_DONE) { int i; struct sge_txq *txq; for_each_txq(vi, i, txq) drops += counter_u64_fetch(txq->r->drops); } return (drops); } default: return (if_get_counter_default(ifp, c)); } } /* * The kernel picks a media from the list we had provided but we still validate * the requeste. */ int cxgbe_media_change(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct ifmedia *ifm = &pi->media; struct link_config *lc = &pi->link_cfg; struct adapter *sc = pi->adapter; int rc; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mec"); if (rc != 0) return (rc); PORT_LOCK(pi); if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO) { /* ifconfig .. media autoselect */ if (!(lc->supported & FW_PORT_CAP32_ANEG)) { rc = ENOTSUP; /* AN not supported by transceiver */ goto done; } lc->requested_aneg = AUTONEG_ENABLE; lc->requested_speed = 0; lc->requested_fc |= PAUSE_AUTONEG; } else { lc->requested_aneg = AUTONEG_DISABLE; lc->requested_speed = ifmedia_baudrate(ifm->ifm_media) / 1000000; lc->requested_fc = 0; if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_RXPAUSE) lc->requested_fc |= PAUSE_RX; if (IFM_OPTIONS(ifm->ifm_media) & IFM_ETH_TXPAUSE) lc->requested_fc |= PAUSE_TX; } if (pi->up_vis > 0) { fixup_link_config(pi); rc = apply_link_config(pi); } done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); return (rc); } /* * Base media word (without ETHER, pause, link active, etc.) for the port at the * given speed. */ static int port_mword(struct port_info *pi, uint32_t speed) { MPASS(speed & M_FW_PORT_CAP32_SPEED); MPASS(powerof2(speed)); switch(pi->port_type) { case FW_PORT_TYPE_BT_SGMII: case FW_PORT_TYPE_BT_XFI: case FW_PORT_TYPE_BT_XAUI: /* BaseT */ switch (speed) { case FW_PORT_CAP32_SPEED_100M: return (IFM_100_T); case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_T); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_T); } break; case FW_PORT_TYPE_KX4: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_KX4); break; case FW_PORT_TYPE_CX4: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_CX4); break; case FW_PORT_TYPE_KX: if (speed == FW_PORT_CAP32_SPEED_1G) return (IFM_1000_KX); break; case FW_PORT_TYPE_KR: case FW_PORT_TYPE_BP_AP: case FW_PORT_TYPE_BP4_AP: case FW_PORT_TYPE_BP40_BA: case FW_PORT_TYPE_KR4_100G: case FW_PORT_TYPE_KR_SFP28: case FW_PORT_TYPE_KR_XLAUI: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_KX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_KR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_KR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_KR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_KR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_KR4); } break; case FW_PORT_TYPE_FIBER_XFI: case FW_PORT_TYPE_FIBER_XAUI: case FW_PORT_TYPE_SFP: case FW_PORT_TYPE_QSFP_10G: case FW_PORT_TYPE_QSA: case FW_PORT_TYPE_QSFP: case FW_PORT_TYPE_CR4_QSFP: case FW_PORT_TYPE_CR_QSFP: case FW_PORT_TYPE_CR2_QSFP: case FW_PORT_TYPE_SFP28: /* Pluggable transceiver */ switch (pi->mod_type) { case FW_PORT_MOD_TYPE_LR: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_LX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_LR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_LR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_LR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_LR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_LR4); } break; case FW_PORT_MOD_TYPE_SR: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_SX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_SR); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_SR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_SR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_SR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_SR4); } break; case FW_PORT_MOD_TYPE_ER: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_ER); break; case FW_PORT_MOD_TYPE_TWINAX_PASSIVE: case FW_PORT_MOD_TYPE_TWINAX_ACTIVE: switch (speed) { case FW_PORT_CAP32_SPEED_1G: return (IFM_1000_CX); case FW_PORT_CAP32_SPEED_10G: return (IFM_10G_TWINAX); case FW_PORT_CAP32_SPEED_25G: return (IFM_25G_CR); case FW_PORT_CAP32_SPEED_40G: return (IFM_40G_CR4); case FW_PORT_CAP32_SPEED_50G: return (IFM_50G_CR2); case FW_PORT_CAP32_SPEED_100G: return (IFM_100G_CR4); } break; case FW_PORT_MOD_TYPE_LRM: if (speed == FW_PORT_CAP32_SPEED_10G) return (IFM_10G_LRM); break; case FW_PORT_MOD_TYPE_NA: MPASS(0); /* Not pluggable? */ /* fall throough */ case FW_PORT_MOD_TYPE_ERROR: case FW_PORT_MOD_TYPE_UNKNOWN: case FW_PORT_MOD_TYPE_NOTSUPPORTED: break; case FW_PORT_MOD_TYPE_NONE: return (IFM_NONE); } break; case FW_PORT_TYPE_NONE: return (IFM_NONE); } return (IFM_UNKNOWN); } void cxgbe_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4med") != 0) return; PORT_LOCK(pi); if (pi->up_vis == 0) { /* * If all the interfaces are administratively down the firmware * does not report transceiver changes. Refresh port info here * so that ifconfig displays accurate ifmedia at all times. * This is the only reason we have a synchronized op in this * function. Just PORT_LOCK would have been enough otherwise. */ t4_update_port_info(pi); build_medialist(pi); } /* ifm_status */ ifmr->ifm_status = IFM_AVALID; if (lc->link_ok == false) goto done; ifmr->ifm_status |= IFM_ACTIVE; /* ifm_active */ ifmr->ifm_active = IFM_ETHER | IFM_FDX; ifmr->ifm_active &= ~(IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE); if (lc->fc & PAUSE_RX) ifmr->ifm_active |= IFM_ETH_RXPAUSE; if (lc->fc & PAUSE_TX) ifmr->ifm_active |= IFM_ETH_TXPAUSE; ifmr->ifm_active |= port_mword(pi, speed_to_fwcap(lc->speed)); done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } static int vcxgbe_probe(device_t dev) { char buf[128]; struct vi_info *vi = device_get_softc(dev); snprintf(buf, sizeof(buf), "port %d vi %td", vi->pi->port_id, vi - vi->pi->vi); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } static int alloc_extra_vi(struct adapter *sc, struct port_info *pi, struct vi_info *vi) { int func, index, rc; uint32_t param, val; ASSERT_SYNCHRONIZED_OP(sc); index = vi - pi->vi; MPASS(index > 0); /* This function deals with _extra_ VIs only */ KASSERT(index < nitems(vi_mac_funcs), ("%s: VI %s doesn't have a MAC func", __func__, device_get_nameunit(vi->dev))); func = vi_mac_funcs[index]; rc = t4_alloc_vi_func(sc, sc->mbox, pi->tx_chan, sc->pf, 0, 1, vi->hw_addr, &vi->rss_size, &vi->vfvld, &vi->vin, func, 0); if (rc < 0) { device_printf(vi->dev, "failed to allocate virtual interface %d" "for port %d: %d\n", index, pi->port_id, -rc); return (-rc); } vi->viid = rc; if (vi->rss_size == 1) { /* * This VI didn't get a slice of the RSS table. Reduce the * number of VIs being created (hw.cxgbe.num_vis) or modify the * configuration file (nvi, rssnvi for this PF) if this is a * problem. */ device_printf(vi->dev, "RSS table not available.\n"); vi->rss_base = 0xffff; return (0); } param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_RSSINFO) | V_FW_PARAMS_PARAM_YZ(vi->viid); rc = t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc) vi->rss_base = 0xffff; else { MPASS((val >> 16) == vi->rss_size); vi->rss_base = val & 0xffff; } return (0); } static int vcxgbe_attach(device_t dev) { struct vi_info *vi; struct port_info *pi; struct adapter *sc; int rc; vi = device_get_softc(dev); pi = vi->pi; sc = pi->adapter; rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4via"); if (rc) return (rc); rc = alloc_extra_vi(sc, pi, vi); end_synchronized_op(sc, 0); if (rc) return (rc); rc = cxgbe_vi_attach(dev, vi); if (rc) { t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); return (rc); } return (0); } static int vcxgbe_detach(device_t dev) { struct vi_info *vi; struct adapter *sc; vi = device_get_softc(dev); sc = vi->pi->adapter; doom_vi(sc, vi); cxgbe_vi_detach(vi); t4_free_vi(sc, sc->mbox, sc->pf, 0, vi->viid); end_synchronized_op(sc, 0); return (0); } static struct callout fatal_callout; static void delayed_panic(void *arg) { struct adapter *sc = arg; panic("%s: panic on fatal error", device_get_nameunit(sc->dev)); } void t4_fatal_err(struct adapter *sc, bool fw_error) { t4_shutdown_adapter(sc); log(LOG_ALERT, "%s: encountered fatal error, adapter stopped.\n", device_get_nameunit(sc->dev)); if (fw_error) { ASSERT_SYNCHRONIZED_OP(sc); sc->flags |= ADAP_ERR; } else { ADAPTER_LOCK(sc); sc->flags |= ADAP_ERR; ADAPTER_UNLOCK(sc); } if (t4_panic_on_fatal_err) { log(LOG_ALERT, "%s: panic on fatal error after 30s", device_get_nameunit(sc->dev)); callout_reset(&fatal_callout, hz * 30, delayed_panic, sc); } } void t4_add_adapter(struct adapter *sc) { sx_xlock(&t4_list_lock); SLIST_INSERT_HEAD(&t4_list, sc, link); sx_xunlock(&t4_list_lock); } int t4_map_bars_0_and_4(struct adapter *sc) { sc->regs_rid = PCIR_BAR(0); sc->regs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->regs_rid, RF_ACTIVE); if (sc->regs_res == NULL) { device_printf(sc->dev, "cannot map registers.\n"); return (ENXIO); } sc->bt = rman_get_bustag(sc->regs_res); sc->bh = rman_get_bushandle(sc->regs_res); sc->mmio_len = rman_get_size(sc->regs_res); setbit(&sc->doorbells, DOORBELL_KDB); sc->msix_rid = PCIR_BAR(4); sc->msix_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->msix_rid, RF_ACTIVE); if (sc->msix_res == NULL) { device_printf(sc->dev, "cannot map MSI-X BAR.\n"); return (ENXIO); } return (0); } int t4_map_bar_2(struct adapter *sc) { /* * T4: only iWARP driver uses the userspace doorbells. There is no need * to map it if RDMA is disabled. */ if (is_t4(sc) && sc->rdmacaps == 0) return (0); sc->udbs_rid = PCIR_BAR(2); sc->udbs_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &sc->udbs_rid, RF_ACTIVE); if (sc->udbs_res == NULL) { device_printf(sc->dev, "cannot map doorbell BAR.\n"); return (ENXIO); } sc->udbs_base = rman_get_virtual(sc->udbs_res); if (chip_id(sc) >= CHELSIO_T5) { setbit(&sc->doorbells, DOORBELL_UDB); #if defined(__i386__) || defined(__amd64__) if (t5_write_combine) { int rc, mode; /* * Enable write combining on BAR2. This is the * userspace doorbell BAR and is split into 128B * (UDBS_SEG_SIZE) doorbell regions, each associated * with an egress queue. The first 64B has the doorbell * and the second 64B can be used to submit a tx work * request with an implicit doorbell. */ rc = pmap_change_attr((vm_offset_t)sc->udbs_base, rman_get_size(sc->udbs_res), PAT_WRITE_COMBINING); if (rc == 0) { clrbit(&sc->doorbells, DOORBELL_UDB); setbit(&sc->doorbells, DOORBELL_WCWR); setbit(&sc->doorbells, DOORBELL_UDBWC); } else { device_printf(sc->dev, "couldn't enable write combining: %d\n", rc); } mode = is_t5(sc) ? V_STATMODE(0) : V_T6_STATMODE(0); t4_write_reg(sc, A_SGE_STAT_CFG, V_STATSOURCE_T5(7) | mode); } #endif } sc->iwt.wc_en = isset(&sc->doorbells, DOORBELL_UDBWC) ? 1 : 0; return (0); } struct memwin_init { uint32_t base; uint32_t aperture; }; static const struct memwin_init t4_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T4, MEMWIN2_APERTURE_T4 } }; static const struct memwin_init t5_memwin[NUM_MEMWIN] = { { MEMWIN0_BASE, MEMWIN0_APERTURE }, { MEMWIN1_BASE, MEMWIN1_APERTURE }, { MEMWIN2_BASE_T5, MEMWIN2_APERTURE_T5 }, }; static void setup_memwin(struct adapter *sc) { const struct memwin_init *mw_init; struct memwin *mw; int i; uint32_t bar0; if (is_t4(sc)) { /* * Read low 32b of bar0 indirectly via the hardware backdoor * mechanism. Works from within PCI passthrough environments * too, where rman_get_start() can return a different value. We * need to program the T4 memory window decoders with the actual * addresses that will be coming across the PCIe link. */ bar0 = t4_hw_pci_read_cfg4(sc, PCIR_BAR(0)); bar0 &= (uint32_t) PCIM_BAR_MEM_BASE; mw_init = &t4_memwin[0]; } else { /* T5+ use the relative offset inside the PCIe BAR */ bar0 = 0; mw_init = &t5_memwin[0]; } for (i = 0, mw = &sc->memwin[0]; i < NUM_MEMWIN; i++, mw_init++, mw++) { rw_init(&mw->mw_lock, "memory window access"); mw->mw_base = mw_init->base; mw->mw_aperture = mw_init->aperture; mw->mw_curpos = 0; t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, i), (mw->mw_base + bar0) | V_BIR(0) | V_WINDOW(ilog2(mw->mw_aperture) - 10)); rw_wlock(&mw->mw_lock); position_memwin(sc, i, 0); rw_wunlock(&mw->mw_lock); } /* flush */ t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_BASE_WIN, 2)); } /* * Positions the memory window at the given address in the card's address space. * There are some alignment requirements and the actual position may be at an * address prior to the requested address. mw->mw_curpos always has the actual * position of the window. */ static void position_memwin(struct adapter *sc, int idx, uint32_t addr) { struct memwin *mw; uint32_t pf; uint32_t reg; MPASS(idx >= 0 && idx < NUM_MEMWIN); mw = &sc->memwin[idx]; rw_assert(&mw->mw_lock, RA_WLOCKED); if (is_t4(sc)) { pf = 0; mw->mw_curpos = addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); mw->mw_curpos = addr & ~0x7f; /* start must be 128B aligned */ } reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, idx); t4_write_reg(sc, reg, mw->mw_curpos | pf); t4_read_reg(sc, reg); /* flush */ } int rw_via_memwin(struct adapter *sc, int idx, uint32_t addr, uint32_t *val, int len, int rw) { struct memwin *mw; uint32_t mw_end, v; MPASS(idx >= 0 && idx < NUM_MEMWIN); /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len <= 0) return (EINVAL); mw = &sc->memwin[idx]; while (len > 0) { rw_rlock(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; if (addr >= mw_end || addr < mw->mw_curpos) { /* Will need to reposition the window */ if (!rw_try_upgrade(&mw->mw_lock)) { rw_runlock(&mw->mw_lock); rw_wlock(&mw->mw_lock); } rw_assert(&mw->mw_lock, RA_WLOCKED); position_memwin(sc, idx, addr); rw_downgrade(&mw->mw_lock); mw_end = mw->mw_curpos + mw->mw_aperture; } rw_assert(&mw->mw_lock, RA_RLOCKED); while (addr < mw_end && len > 0) { if (rw == 0) { v = t4_read_reg(sc, mw->mw_base + addr - mw->mw_curpos); *val++ = le32toh(v); } else { v = *val++; t4_write_reg(sc, mw->mw_base + addr - mw->mw_curpos, htole32(v)); } addr += 4; len -= 4; } rw_runlock(&mw->mw_lock); } return (0); } int alloc_atid_tab(struct tid_info *t, int flags) { int i; MPASS(t->natids > 0); MPASS(t->atid_tab == NULL); t->atid_tab = malloc(t->natids * sizeof(*t->atid_tab), M_CXGBE, M_ZERO | flags); if (t->atid_tab == NULL) return (ENOMEM); mtx_init(&t->atid_lock, "atid lock", NULL, MTX_DEF); t->afree = t->atid_tab; t->atids_in_use = 0; for (i = 1; i < t->natids; i++) t->atid_tab[i - 1].next = &t->atid_tab[i]; t->atid_tab[t->natids - 1].next = NULL; return (0); } void free_atid_tab(struct tid_info *t) { KASSERT(t->atids_in_use == 0, ("%s: %d atids still in use.", __func__, t->atids_in_use)); if (mtx_initialized(&t->atid_lock)) mtx_destroy(&t->atid_lock); free(t->atid_tab, M_CXGBE); t->atid_tab = NULL; } int alloc_atid(struct adapter *sc, void *ctx) { struct tid_info *t = &sc->tids; int atid = -1; mtx_lock(&t->atid_lock); if (t->afree) { union aopen_entry *p = t->afree; atid = p - t->atid_tab; MPASS(atid <= M_TID_TID); t->afree = p->next; p->data = ctx; t->atids_in_use++; } mtx_unlock(&t->atid_lock); return (atid); } void * lookup_atid(struct adapter *sc, int atid) { struct tid_info *t = &sc->tids; return (t->atid_tab[atid].data); } void free_atid(struct adapter *sc, int atid) { struct tid_info *t = &sc->tids; union aopen_entry *p = &t->atid_tab[atid]; mtx_lock(&t->atid_lock); p->next = t->afree; t->afree = p; t->atids_in_use--; mtx_unlock(&t->atid_lock); } static void queue_tid_release(struct adapter *sc, int tid) { CXGBE_UNIMPLEMENTED("deferred tid release"); } void release_tid(struct adapter *sc, int tid, struct sge_wrq *ctrlq) { struct wrqe *wr; struct cpl_tid_release *req; wr = alloc_wrqe(sizeof(*req), ctrlq); if (wr == NULL) { queue_tid_release(sc, tid); /* defer */ return; } req = wrtod(wr); INIT_TP_WR_MIT_CPL(req, CPL_TID_RELEASE, tid); t4_wrq_tx(sc, wr); } static int t4_range_cmp(const void *a, const void *b) { return ((const struct t4_range *)a)->start - ((const struct t4_range *)b)->start; } /* * Verify that the memory range specified by the addr/len pair is valid within * the card's address space. */ static int validate_mem_range(struct adapter *sc, uint32_t addr, uint32_t len) { struct t4_range mem_ranges[4], *r, *next; uint32_t em, addr_len; int i, n, remaining; /* Memory can only be accessed in naturally aligned 4 byte units */ if (addr & 3 || len & 3 || len == 0) return (EINVAL); /* Enabled memories */ em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); r = &mem_ranges[0]; n = 0; bzero(r, sizeof(mem_ranges)); if (em & F_EDRAM0_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); r->size = G_EDRAM0_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM0_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EDRAM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); r->size = G_EDRAM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EDRAM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (em & F_EXT_MEM_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); r->size = G_EXT_MEM_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } if (is_t5(sc) && em & F_EXT_MEM1_ENABLE) { addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); r->size = G_EXT_MEM1_SIZE(addr_len) << 20; if (r->size > 0) { r->start = G_EXT_MEM1_BASE(addr_len) << 20; if (addr >= r->start && addr + len <= r->start + r->size) return (0); r++; n++; } } MPASS(n <= nitems(mem_ranges)); if (n > 1) { /* Sort and merge the ranges. */ qsort(mem_ranges, n, sizeof(struct t4_range), t4_range_cmp); /* Start from index 0 and examine the next n - 1 entries. */ r = &mem_ranges[0]; for (remaining = n - 1; remaining > 0; remaining--, r++) { MPASS(r->size > 0); /* r is a valid entry. */ next = r + 1; MPASS(next->size > 0); /* and so is the next one. */ while (r->start + r->size >= next->start) { /* Merge the next one into the current entry. */ r->size = max(r->start + r->size, next->start + next->size) - r->start; n--; /* One fewer entry in total. */ if (--remaining == 0) goto done; /* short circuit */ next++; } if (next != r + 1) { /* * Some entries were merged into r and next * points to the first valid entry that couldn't * be merged. */ MPASS(next->size > 0); /* must be valid */ memcpy(r + 1, next, remaining * sizeof(*r)); #ifdef INVARIANTS /* * This so that the foo->size assertion in the * next iteration of the loop do the right * thing for entries that were pulled up and are * no longer valid. */ MPASS(n < nitems(mem_ranges)); bzero(&mem_ranges[n], (nitems(mem_ranges) - n) * sizeof(struct t4_range)); #endif } } done: /* Done merging the ranges. */ MPASS(n > 0); r = &mem_ranges[0]; for (i = 0; i < n; i++, r++) { if (addr >= r->start && addr + len <= r->start + r->size) return (0); } } return (EFAULT); } static int fwmtype_to_hwmtype(int mtype) { switch (mtype) { case FW_MEMTYPE_EDC0: return (MEM_EDC0); case FW_MEMTYPE_EDC1: return (MEM_EDC1); case FW_MEMTYPE_EXTMEM: return (MEM_MC0); case FW_MEMTYPE_EXTMEM1: return (MEM_MC1); default: panic("%s: cannot translate fw mtype %d.", __func__, mtype); } } /* * Verify that the memory range specified by the memtype/offset/len pair is * valid and lies entirely within the memtype specified. The global address of * the start of the range is returned in addr. */ static int validate_mt_off_len(struct adapter *sc, int mtype, uint32_t off, uint32_t len, uint32_t *addr) { uint32_t em, addr_len, maddr; /* Memory can only be accessed in naturally aligned 4 byte units */ if (off & 3 || len & 3 || len == 0) return (EINVAL); em = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); switch (fwmtype_to_hwmtype(mtype)) { case MEM_EDC0: if (!(em & F_EDRAM0_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM0_BAR); maddr = G_EDRAM0_BASE(addr_len) << 20; break; case MEM_EDC1: if (!(em & F_EDRAM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EDRAM1_BAR); maddr = G_EDRAM1_BASE(addr_len) << 20; break; case MEM_MC: if (!(em & F_EXT_MEM_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); maddr = G_EXT_MEM_BASE(addr_len) << 20; break; case MEM_MC1: if (!is_t5(sc) || !(em & F_EXT_MEM1_ENABLE)) return (EINVAL); addr_len = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); maddr = G_EXT_MEM1_BASE(addr_len) << 20; break; default: return (EINVAL); } *addr = maddr + off; /* global address */ return (validate_mem_range(sc, *addr, len)); } static int fixup_devlog_params(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; int rc; rc = validate_mt_off_len(sc, dparams->memtype, dparams->start, dparams->size, &dparams->addr); return (rc); } static void update_nirq(struct intrs_and_queues *iaq, int nports) { int extra = T4_EXTRA_INTR; iaq->nirq = extra; iaq->nirq += nports * (iaq->nrxq + iaq->nofldrxq); iaq->nirq += nports * (iaq->num_vis - 1) * max(iaq->nrxq_vi, iaq->nnmrxq_vi); iaq->nirq += nports * (iaq->num_vis - 1) * iaq->nofldrxq_vi; } /* * Adjust requirements to fit the number of interrupts available. */ static void calculate_iaq(struct adapter *sc, struct intrs_and_queues *iaq, int itype, int navail) { int old_nirq; const int nports = sc->params.nports; MPASS(nports > 0); MPASS(navail > 0); bzero(iaq, sizeof(*iaq)); iaq->intr_type = itype; iaq->num_vis = t4_num_vis; iaq->ntxq = t4_ntxq; iaq->ntxq_vi = t4_ntxq_vi; iaq->nrxq = t4_nrxq; iaq->nrxq_vi = t4_nrxq_vi; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (is_offload(sc) || is_ethoffload(sc)) { iaq->nofldtxq = t4_nofldtxq; iaq->nofldtxq_vi = t4_nofldtxq_vi; } #endif #ifdef TCP_OFFLOAD if (is_offload(sc)) { iaq->nofldrxq = t4_nofldrxq; iaq->nofldrxq_vi = t4_nofldrxq_vi; } #endif #ifdef DEV_NETMAP iaq->nnmtxq_vi = t4_nnmtxq_vi; iaq->nnmrxq_vi = t4_nnmrxq_vi; #endif update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { /* * This is the normal case -- there are enough interrupts for * everything. */ goto done; } /* * If extra VIs have been configured try reducing their count and see if * that works. */ while (iaq->num_vis > 1) { iaq->num_vis--; update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { device_printf(sc->dev, "virtual interfaces per port " "reduced to %d from %d. nrxq=%u, nofldrxq=%u, " "nrxq_vi=%u nofldrxq_vi=%u, nnmrxq_vi=%u. " "itype %d, navail %u, nirq %d.\n", iaq->num_vis, t4_num_vis, iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi, itype, navail, iaq->nirq); goto done; } } /* * Extra VIs will not be created. Log a message if they were requested. */ MPASS(iaq->num_vis == 1); iaq->ntxq_vi = iaq->nrxq_vi = 0; iaq->nofldtxq_vi = iaq->nofldrxq_vi = 0; iaq->nnmtxq_vi = iaq->nnmrxq_vi = 0; if (iaq->num_vis != t4_num_vis) { device_printf(sc->dev, "extra virtual interfaces disabled. " "nrxq=%u, nofldrxq=%u, nrxq_vi=%u nofldrxq_vi=%u, " "nnmrxq_vi=%u. itype %d, navail %u, nirq %d.\n", iaq->nrxq, iaq->nofldrxq, iaq->nrxq_vi, iaq->nofldrxq_vi, iaq->nnmrxq_vi, itype, navail, iaq->nirq); } /* * Keep reducing the number of NIC rx queues to the next lower power of * 2 (for even RSS distribution) and halving the TOE rx queues and see * if that works. */ do { if (iaq->nrxq > 1) { do { iaq->nrxq--; } while (!powerof2(iaq->nrxq)); } if (iaq->nofldrxq > 1) iaq->nofldrxq >>= 1; old_nirq = iaq->nirq; update_nirq(iaq, nports); if (iaq->nirq <= navail && (itype != INTR_MSI || powerof2(iaq->nirq))) { device_printf(sc->dev, "running with reduced number of " "rx queues because of shortage of interrupts. " "nrxq=%u, nofldrxq=%u. " "itype %d, navail %u, nirq %d.\n", iaq->nrxq, iaq->nofldrxq, itype, navail, iaq->nirq); goto done; } } while (old_nirq != iaq->nirq); /* One interrupt for everything. Ugh. */ device_printf(sc->dev, "running with minimal number of queues. " "itype %d, navail %u.\n", itype, navail); iaq->nirq = 1; MPASS(iaq->nrxq == 1); iaq->ntxq = 1; if (iaq->nofldrxq > 1) iaq->nofldtxq = 1; done: MPASS(iaq->num_vis > 0); if (iaq->num_vis > 1) { MPASS(iaq->nrxq_vi > 0); MPASS(iaq->ntxq_vi > 0); } MPASS(iaq->nirq > 0); MPASS(iaq->nrxq > 0); MPASS(iaq->ntxq > 0); if (itype == INTR_MSI) { MPASS(powerof2(iaq->nirq)); } } static int cfg_itype_and_nqueues(struct adapter *sc, struct intrs_and_queues *iaq) { int rc, itype, navail, nalloc; for (itype = INTR_MSIX; itype; itype >>= 1) { if ((itype & t4_intr_types) == 0) continue; /* not allowed */ if (itype == INTR_MSIX) navail = pci_msix_count(sc->dev); else if (itype == INTR_MSI) navail = pci_msi_count(sc->dev); else navail = 1; restart: if (navail == 0) continue; calculate_iaq(sc, iaq, itype, navail); nalloc = iaq->nirq; rc = 0; if (itype == INTR_MSIX) rc = pci_alloc_msix(sc->dev, &nalloc); else if (itype == INTR_MSI) rc = pci_alloc_msi(sc->dev, &nalloc); if (rc == 0 && nalloc > 0) { if (nalloc == iaq->nirq) return (0); /* * Didn't get the number requested. Use whatever number * the kernel is willing to allocate. */ device_printf(sc->dev, "fewer vectors than requested, " "type=%d, req=%d, rcvd=%d; will downshift req.\n", itype, iaq->nirq, nalloc); pci_release_msi(sc->dev); navail = nalloc; goto restart; } device_printf(sc->dev, "failed to allocate vectors:%d, type=%d, req=%d, rcvd=%d\n", itype, rc, iaq->nirq, nalloc); } device_printf(sc->dev, "failed to find a usable interrupt type. " "allowed=%d, msi-x=%d, msi=%d, intx=1", t4_intr_types, pci_msix_count(sc->dev), pci_msi_count(sc->dev)); return (ENXIO); } #define FW_VERSION(chip) ( \ V_FW_HDR_FW_VER_MAJOR(chip##FW_VERSION_MAJOR) | \ V_FW_HDR_FW_VER_MINOR(chip##FW_VERSION_MINOR) | \ V_FW_HDR_FW_VER_MICRO(chip##FW_VERSION_MICRO) | \ V_FW_HDR_FW_VER_BUILD(chip##FW_VERSION_BUILD)) #define FW_INTFVER(chip, intf) (chip##FW_HDR_INTFVER_##intf) /* Just enough of fw_hdr to cover all version info. */ struct fw_h { __u8 ver; __u8 chip; __be16 len512; __be32 fw_ver; __be32 tp_microcode_ver; __u8 intfver_nic; __u8 intfver_vnic; __u8 intfver_ofld; __u8 intfver_ri; __u8 intfver_iscsipdu; __u8 intfver_iscsi; __u8 intfver_fcoepdu; __u8 intfver_fcoe; }; /* Spot check a couple of fields. */ CTASSERT(offsetof(struct fw_h, fw_ver) == offsetof(struct fw_hdr, fw_ver)); CTASSERT(offsetof(struct fw_h, intfver_nic) == offsetof(struct fw_hdr, intfver_nic)); CTASSERT(offsetof(struct fw_h, intfver_fcoe) == offsetof(struct fw_hdr, intfver_fcoe)); struct fw_info { uint8_t chip; char *kld_name; char *fw_mod_name; struct fw_h fw_h; } fw_info[] = { { .chip = CHELSIO_T4, .kld_name = "t4fw_cfg", .fw_mod_name = "t4fw", .fw_h = { .chip = FW_HDR_CHIP_T4, .fw_ver = htobe32(FW_VERSION(T4)), .intfver_nic = FW_INTFVER(T4, NIC), .intfver_vnic = FW_INTFVER(T4, VNIC), .intfver_ofld = FW_INTFVER(T4, OFLD), .intfver_ri = FW_INTFVER(T4, RI), .intfver_iscsipdu = FW_INTFVER(T4, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T4, ISCSI), .intfver_fcoepdu = FW_INTFVER(T4, FCOEPDU), .intfver_fcoe = FW_INTFVER(T4, FCOE), }, }, { .chip = CHELSIO_T5, .kld_name = "t5fw_cfg", .fw_mod_name = "t5fw", .fw_h = { .chip = FW_HDR_CHIP_T5, .fw_ver = htobe32(FW_VERSION(T5)), .intfver_nic = FW_INTFVER(T5, NIC), .intfver_vnic = FW_INTFVER(T5, VNIC), .intfver_ofld = FW_INTFVER(T5, OFLD), .intfver_ri = FW_INTFVER(T5, RI), .intfver_iscsipdu = FW_INTFVER(T5, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T5, ISCSI), .intfver_fcoepdu = FW_INTFVER(T5, FCOEPDU), .intfver_fcoe = FW_INTFVER(T5, FCOE), }, }, { .chip = CHELSIO_T6, .kld_name = "t6fw_cfg", .fw_mod_name = "t6fw", .fw_h = { .chip = FW_HDR_CHIP_T6, .fw_ver = htobe32(FW_VERSION(T6)), .intfver_nic = FW_INTFVER(T6, NIC), .intfver_vnic = FW_INTFVER(T6, VNIC), .intfver_ofld = FW_INTFVER(T6, OFLD), .intfver_ri = FW_INTFVER(T6, RI), .intfver_iscsipdu = FW_INTFVER(T6, ISCSIPDU), .intfver_iscsi = FW_INTFVER(T6, ISCSI), .intfver_fcoepdu = FW_INTFVER(T6, FCOEPDU), .intfver_fcoe = FW_INTFVER(T6, FCOE), }, } }; static struct fw_info * find_fw_info(int chip) { int i; for (i = 0; i < nitems(fw_info); i++) { if (fw_info[i].chip == chip) return (&fw_info[i]); } return (NULL); } /* * Is the given firmware API compatible with the one the driver was compiled * with? */ static int fw_compatible(const struct fw_h *hdr1, const struct fw_h *hdr2) { /* short circuit if it's the exact same firmware version */ if (hdr1->chip == hdr2->chip && hdr1->fw_ver == hdr2->fw_ver) return (1); /* * XXX: Is this too conservative? Perhaps I should limit this to the * features that are supported in the driver. */ #define SAME_INTF(x) (hdr1->intfver_##x == hdr2->intfver_##x) if (hdr1->chip == hdr2->chip && SAME_INTF(nic) && SAME_INTF(vnic) && SAME_INTF(ofld) && SAME_INTF(ri) && SAME_INTF(iscsipdu) && SAME_INTF(iscsi) && SAME_INTF(fcoepdu) && SAME_INTF(fcoe)) return (1); #undef SAME_INTF return (0); } static int load_fw_module(struct adapter *sc, const struct firmware **dcfg, const struct firmware **fw) { struct fw_info *fw_info; *dcfg = NULL; if (fw != NULL) *fw = NULL; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } *dcfg = firmware_get(fw_info->kld_name); if (*dcfg != NULL) { if (fw != NULL) *fw = firmware_get(fw_info->fw_mod_name); return (0); } return (ENOENT); } static void unload_fw_module(struct adapter *sc, const struct firmware *dcfg, const struct firmware *fw) { if (fw != NULL) firmware_put(fw, FIRMWARE_UNLOAD); if (dcfg != NULL) firmware_put(dcfg, FIRMWARE_UNLOAD); } /* * Return values: * 0 means no firmware install attempted. * ERESTART means a firmware install was attempted and was successful. * +ve errno means a firmware install was attempted but failed. */ static int install_kld_firmware(struct adapter *sc, struct fw_h *card_fw, const struct fw_h *drv_fw, const char *reason, int *already) { const struct firmware *cfg, *fw; const uint32_t c = be32toh(card_fw->fw_ver); uint32_t d, k; int rc, fw_install; struct fw_h bundled_fw; bool load_attempted; cfg = fw = NULL; load_attempted = false; fw_install = t4_fw_install < 0 ? -t4_fw_install : t4_fw_install; memcpy(&bundled_fw, drv_fw, sizeof(bundled_fw)); if (t4_fw_install < 0) { rc = load_fw_module(sc, &cfg, &fw); if (rc != 0 || fw == NULL) { device_printf(sc->dev, "failed to load firmware module: %d. cfg %p, fw %p;" " will use compiled-in firmware version for" "hw.cxgbe.fw_install checks.\n", rc, cfg, fw); } else { memcpy(&bundled_fw, fw->data, sizeof(bundled_fw)); } load_attempted = true; } d = be32toh(bundled_fw.fw_ver); if (reason != NULL) goto install; if ((sc->flags & FW_OK) == 0) { if (c == 0xffffffff) { reason = "missing"; goto install; } rc = 0; goto done; } if (!fw_compatible(card_fw, &bundled_fw)) { reason = "incompatible or unusable"; goto install; } if (d > c) { reason = "older than the version bundled with this driver"; goto install; } if (fw_install == 2 && d != c) { reason = "different than the version bundled with this driver"; goto install; } /* No reason to do anything to the firmware already on the card. */ rc = 0; goto done; install: rc = 0; if ((*already)++) goto done; if (fw_install == 0) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver is prohibited from installing a firmware " "on the card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); goto done; } /* * We'll attempt to install a firmware. Load the module first (if it * hasn't been loaded already). */ if (!load_attempted) { rc = load_fw_module(sc, &cfg, &fw); if (rc != 0 || fw == NULL) { device_printf(sc->dev, "failed to load firmware module: %d. cfg %p, fw %p\n", rc, cfg, fw); /* carry on */ } } if (fw == NULL) { device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "but the driver cannot take corrective action because it " "is unable to load the firmware module.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason); rc = sc->flags & FW_OK ? 0 : ENOENT; goto done; } k = be32toh(((const struct fw_hdr *)fw->data)->fw_ver); if (k != d) { MPASS(t4_fw_install > 0); device_printf(sc->dev, "firmware in KLD (%u.%u.%u.%u) is not what the driver was " "expecting (%u.%u.%u.%u) and will not be used.\n", G_FW_HDR_FW_VER_MAJOR(k), G_FW_HDR_FW_VER_MINOR(k), G_FW_HDR_FW_VER_MICRO(k), G_FW_HDR_FW_VER_BUILD(k), G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d)); rc = sc->flags & FW_OK ? 0 : EINVAL; goto done; } device_printf(sc->dev, "firmware on card (%u.%u.%u.%u) is %s, " "installing firmware %u.%u.%u.%u on card.\n", G_FW_HDR_FW_VER_MAJOR(c), G_FW_HDR_FW_VER_MINOR(c), G_FW_HDR_FW_VER_MICRO(c), G_FW_HDR_FW_VER_BUILD(c), reason, G_FW_HDR_FW_VER_MAJOR(d), G_FW_HDR_FW_VER_MINOR(d), G_FW_HDR_FW_VER_MICRO(d), G_FW_HDR_FW_VER_BUILD(d)); rc = -t4_fw_upgrade(sc, sc->mbox, fw->data, fw->datasize, 0); if (rc != 0) { device_printf(sc->dev, "failed to install firmware: %d\n", rc); } else { /* Installed successfully, update the cached header too. */ rc = ERESTART; memcpy(card_fw, fw->data, sizeof(*card_fw)); } done: unload_fw_module(sc, cfg, fw); return (rc); } /* * Establish contact with the firmware and attempt to become the master driver. * * A firmware will be installed to the card if needed (if the driver is allowed * to do so). */ static int contact_firmware(struct adapter *sc) { int rc, already = 0; enum dev_state state; struct fw_info *fw_info; struct fw_hdr *card_fw; /* fw on the card */ const struct fw_h *drv_fw; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); return (EINVAL); } drv_fw = &fw_info->fw_h; /* Read the header of the firmware on the card */ card_fw = malloc(sizeof(*card_fw), M_CXGBE, M_ZERO | M_WAITOK); restart: rc = -t4_get_fw_hdr(sc, card_fw); if (rc != 0) { device_printf(sc->dev, "unable to read firmware header from card's flash: %d\n", rc); goto done; } rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL, &already); if (rc == ERESTART) goto restart; if (rc != 0) goto done; rc = t4_fw_hello(sc, sc->mbox, sc->mbox, MASTER_MAY, &state); if (rc < 0 || state == DEV_STATE_ERR) { rc = -rc; device_printf(sc->dev, "failed to connect to the firmware: %d, %d. " "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); #if 0 if (install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, "not responding properly to HELLO", &already) == ERESTART) goto restart; #endif goto done; } MPASS(be32toh(card_fw->flags) & FW_HDR_FLAGS_RESET_HALT); sc->flags |= FW_OK; /* The firmware responded to the FW_HELLO. */ if (rc == sc->pf) { sc->flags |= MASTER_PF; rc = install_kld_firmware(sc, (struct fw_h *)card_fw, drv_fw, NULL, &already); if (rc == ERESTART) rc = 0; else if (rc != 0) goto done; } else if (state == DEV_STATE_UNINIT) { /* * We didn't get to be the master so we definitely won't be * configuring the chip. It's a bug if someone else hasn't * configured it already. */ device_printf(sc->dev, "couldn't be master(%d), " "device not already initialized either(%d). " "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); rc = EPROTO; goto done; } else { /* * Some other PF is the master and has configured the chip. * This is allowed but untested. */ device_printf(sc->dev, "PF%d is master, device state %d. " "PCIE_FW 0x%08x\n", rc, state, t4_read_reg(sc, A_PCIE_FW)); snprintf(sc->cfg_file, sizeof(sc->cfg_file), "pf%d", rc); sc->cfcsum = 0; rc = 0; } done: if (rc != 0 && sc->flags & FW_OK) { t4_fw_bye(sc, sc->mbox); sc->flags &= ~FW_OK; } free(card_fw, M_CXGBE); return (rc); } static int copy_cfg_file_to_card(struct adapter *sc, char *cfg_file, uint32_t mtype, uint32_t moff) { struct fw_info *fw_info; const struct firmware *dcfg, *rcfg = NULL; const uint32_t *cfdata; uint32_t cflen, addr; int rc; load_fw_module(sc, &dcfg, NULL); /* Card specific interpretation of "default". */ if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { if (pci_get_device(sc->dev) == 0x440a) snprintf(cfg_file, sizeof(t4_cfg_file), UWIRE_CF); if (is_fpga(sc)) snprintf(cfg_file, sizeof(t4_cfg_file), FPGA_CF); } if (strncmp(cfg_file, DEFAULT_CF, sizeof(t4_cfg_file)) == 0) { if (dcfg == NULL) { device_printf(sc->dev, "KLD with default config is not available.\n"); rc = ENOENT; goto done; } cfdata = dcfg->data; cflen = dcfg->datasize & ~3; } else { char s[32]; fw_info = find_fw_info(chip_id(sc)); if (fw_info == NULL) { device_printf(sc->dev, "unable to look up firmware information for chip %d.\n", chip_id(sc)); rc = EINVAL; goto done; } snprintf(s, sizeof(s), "%s_%s", fw_info->kld_name, cfg_file); rcfg = firmware_get(s); if (rcfg == NULL) { device_printf(sc->dev, "unable to load module \"%s\" for configuration " "profile \"%s\".\n", s, cfg_file); rc = ENOENT; goto done; } cfdata = rcfg->data; cflen = rcfg->datasize & ~3; } if (cflen > FLASH_CFG_MAX_SIZE) { device_printf(sc->dev, "config file too long (%d, max allowed is %d).\n", cflen, FLASH_CFG_MAX_SIZE); rc = EINVAL; goto done; } rc = validate_mt_off_len(sc, mtype, moff, cflen, &addr); if (rc != 0) { device_printf(sc->dev, "%s: addr (%d/0x%x) or len %d is not valid: %d.\n", __func__, mtype, moff, cflen, rc); rc = EINVAL; goto done; } write_via_memwin(sc, 2, addr, cfdata, cflen); done: if (rcfg != NULL) firmware_put(rcfg, FIRMWARE_UNLOAD); unload_fw_module(sc, dcfg, NULL); return (rc); } struct caps_allowed { uint16_t nbmcaps; uint16_t linkcaps; uint16_t switchcaps; uint16_t niccaps; uint16_t toecaps; uint16_t rdmacaps; uint16_t cryptocaps; uint16_t iscsicaps; uint16_t fcoecaps; }; #define FW_PARAM_DEV(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_##param)) #define FW_PARAM_PFVF(param) \ (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_PFVF) | \ V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_PFVF_##param)) /* * Provide a configuration profile to the firmware and have it initialize the * chip accordingly. This may involve uploading a configuration file to the * card. */ static int apply_cfg_and_initialize(struct adapter *sc, char *cfg_file, const struct caps_allowed *caps_allowed) { int rc; struct fw_caps_config_cmd caps; uint32_t mtype, moff, finicsum, cfcsum, param, val; rc = -t4_fw_reset(sc, sc->mbox, F_PIORSTMODE | F_PIORST); if (rc != 0) { device_printf(sc->dev, "firmware reset failed: %d.\n", rc); return (rc); } bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); if (strncmp(cfg_file, BUILTIN_CF, sizeof(t4_cfg_file)) == 0) { mtype = 0; moff = 0; caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); } else if (strncmp(cfg_file, FLASH_CF, sizeof(t4_cfg_file)) == 0) { mtype = FW_MEMTYPE_FLASH; moff = t4_flash_cfg_addr(sc); caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); } else { /* * Ask the firmware where it wants us to upload the config file. */ param = FW_PARAM_DEV(CF); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* No support for config file? Shouldn't happen. */ device_printf(sc->dev, "failed to query config file location: %d.\n", rc); goto done; } mtype = G_FW_PARAMS_PARAM_Y(val); moff = G_FW_PARAMS_PARAM_Z(val) << 16; caps.cfvalid_to_len16 = htobe32(F_FW_CAPS_CONFIG_CMD_CFVALID | V_FW_CAPS_CONFIG_CMD_MEMTYPE_CF(mtype) | V_FW_CAPS_CONFIG_CMD_MEMADDR64K_CF(moff >> 16) | FW_LEN16(caps)); rc = copy_cfg_file_to_card(sc, cfg_file, mtype, moff); if (rc != 0) { device_printf(sc->dev, "failed to upload config file to card: %d.\n", rc); goto done; } } rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to pre-process config file: %d " "(mtype %d, moff 0x%x).\n", rc, mtype, moff); goto done; } finicsum = be32toh(caps.finicsum); cfcsum = be32toh(caps.cfcsum); /* actual */ if (finicsum != cfcsum) { device_printf(sc->dev, "WARNING: config file checksum mismatch: %08x %08x\n", finicsum, cfcsum); } sc->cfcsum = cfcsum; snprintf(sc->cfg_file, sizeof(sc->cfg_file), "%s", cfg_file); /* * Let the firmware know what features will (not) be used so it can tune * things accordingly. */ #define LIMIT_CAPS(x) do { \ caps.x##caps &= htobe16(caps_allowed->x##caps); \ } while (0) LIMIT_CAPS(nbm); LIMIT_CAPS(link); LIMIT_CAPS(switch); LIMIT_CAPS(nic); LIMIT_CAPS(toe); LIMIT_CAPS(rdma); LIMIT_CAPS(crypto); LIMIT_CAPS(iscsi); LIMIT_CAPS(fcoe); #undef LIMIT_CAPS if (caps.niccaps & htobe16(FW_CAPS_CONFIG_NIC_HASHFILTER)) { /* * TOE and hashfilters are mutually exclusive. It is a config * file or firmware bug if both are reported as available. Try * to cope with the situation in non-debug builds by disabling * TOE. */ MPASS(caps.toecaps == 0); caps.toecaps = 0; caps.rdmacaps = 0; caps.iscsicaps = 0; } caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), NULL); if (rc != 0) { device_printf(sc->dev, "failed to process config file: %d.\n", rc); goto done; } t4_tweak_chip_settings(sc); set_params__pre_init(sc); /* get basic stuff going */ rc = -t4_fw_initialize(sc, sc->mbox); if (rc != 0) { device_printf(sc->dev, "fw_initialize failed: %d.\n", rc); goto done; } done: return (rc); } /* * Partition chip resources for use between various PFs, VFs, etc. */ static int partition_resources(struct adapter *sc) { char cfg_file[sizeof(t4_cfg_file)]; struct caps_allowed caps_allowed; int rc; bool fallback; /* Only the master driver gets to configure the chip resources. */ MPASS(sc->flags & MASTER_PF); #define COPY_CAPS(x) do { \ caps_allowed.x##caps = t4_##x##caps_allowed; \ } while (0) bzero(&caps_allowed, sizeof(caps_allowed)); COPY_CAPS(nbm); COPY_CAPS(link); COPY_CAPS(switch); COPY_CAPS(nic); COPY_CAPS(toe); COPY_CAPS(rdma); COPY_CAPS(crypto); COPY_CAPS(iscsi); COPY_CAPS(fcoe); fallback = sc->debug_flags & DF_DISABLE_CFG_RETRY ? false : true; snprintf(cfg_file, sizeof(cfg_file), "%s", t4_cfg_file); retry: rc = apply_cfg_and_initialize(sc, cfg_file, &caps_allowed); if (rc != 0 && fallback) { device_printf(sc->dev, "failed (%d) to configure card with \"%s\" profile, " "will fall back to a basic configuration and retry.\n", rc, cfg_file); snprintf(cfg_file, sizeof(cfg_file), "%s", BUILTIN_CF); bzero(&caps_allowed, sizeof(caps_allowed)); COPY_CAPS(nbm); COPY_CAPS(link); COPY_CAPS(switch); COPY_CAPS(nic); fallback = false; goto retry; } #undef COPY_CAPS return (rc); } /* * Retrieve parameters that are needed (or nice to have) very early. */ static int get_params__pre_init(struct adapter *sc) { int rc; uint32_t param[2], val[2]; t4_get_version_info(sc); snprintf(sc->fw_version, sizeof(sc->fw_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MINOR(sc->params.fw_vers), G_FW_HDR_FW_VER_MICRO(sc->params.fw_vers), G_FW_HDR_FW_VER_BUILD(sc->params.fw_vers)); snprintf(sc->bs_version, sizeof(sc->bs_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.bs_vers), G_FW_HDR_FW_VER_MINOR(sc->params.bs_vers), G_FW_HDR_FW_VER_MICRO(sc->params.bs_vers), G_FW_HDR_FW_VER_BUILD(sc->params.bs_vers)); snprintf(sc->tp_version, sizeof(sc->tp_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MINOR(sc->params.tp_vers), G_FW_HDR_FW_VER_MICRO(sc->params.tp_vers), G_FW_HDR_FW_VER_BUILD(sc->params.tp_vers)); snprintf(sc->er_version, sizeof(sc->er_version), "%u.%u.%u.%u", G_FW_HDR_FW_VER_MAJOR(sc->params.er_vers), G_FW_HDR_FW_VER_MINOR(sc->params.er_vers), G_FW_HDR_FW_VER_MICRO(sc->params.er_vers), G_FW_HDR_FW_VER_BUILD(sc->params.er_vers)); param[0] = FW_PARAM_DEV(PORTVEC); param[1] = FW_PARAM_DEV(CCLK); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (pre_init): %d.\n", rc); return (rc); } sc->params.portvec = val[0]; sc->params.nports = bitcount32(val[0]); sc->params.vpd.cclk = val[1]; /* Read device log parameters. */ rc = -t4_init_devlog_params(sc, 1); if (rc == 0) fixup_devlog_params(sc); else { device_printf(sc->dev, "failed to get devlog parameters: %d.\n", rc); rc = 0; /* devlog isn't critical for device operation */ } return (rc); } /* * Any params that need to be set before FW_INITIALIZE. */ static int set_params__pre_init(struct adapter *sc) { int rc = 0; uint32_t param, val; if (chip_id(sc) >= CHELSIO_T6) { param = FW_PARAM_DEV(HPFILTER_REGION_SUPPORT); val = 1; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); /* firmwares < 1.20.1.0 do not have this param. */ if (rc == FW_EINVAL && sc->params.fw_vers < (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) | V_FW_HDR_FW_VER_MICRO(1) | V_FW_HDR_FW_VER_BUILD(0))) { rc = 0; } if (rc != 0) { device_printf(sc->dev, "failed to enable high priority filters :%d.\n", rc); } } /* Enable opaque VIIDs with firmwares that support it. */ param = FW_PARAM_DEV(OPAQUE_VIID_SMT_EXTN); val = 1; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc == 0 && val == 1) sc->params.viid_smt_extn_support = true; else sc->params.viid_smt_extn_support = false; return (rc); } /* * Retrieve various parameters that are of interest to the driver. The device * has been initialized by the firmware at this point. */ static int get_params__post_init(struct adapter *sc) { int rc; uint32_t param[7], val[7]; struct fw_caps_config_cmd caps; param[0] = FW_PARAM_PFVF(IQFLINT_START); param[1] = FW_PARAM_PFVF(EQ_START); param[2] = FW_PARAM_PFVF(FILTER_START); param[3] = FW_PARAM_PFVF(FILTER_END); param[4] = FW_PARAM_PFVF(L2T_START); param[5] = FW_PARAM_PFVF(L2T_END); param[6] = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_VDD); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 7, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query parameters (post_init): %d.\n", rc); return (rc); } sc->sge.iq_start = val[0]; sc->sge.eq_start = val[1]; if ((int)val[3] > (int)val[2]) { sc->tids.ftid_base = val[2]; sc->tids.ftid_end = val[3]; sc->tids.nftids = val[3] - val[2] + 1; } sc->vres.l2t.start = val[4]; sc->vres.l2t.size = val[5] - val[4] + 1; KASSERT(sc->vres.l2t.size <= L2T_SIZE, ("%s: L2 table size (%u) larger than expected (%u)", __func__, sc->vres.l2t.size, L2T_SIZE)); sc->params.core_vdd = val[6]; if (chip_id(sc) >= CHELSIO_T6) { sc->tids.tid_base = t4_read_reg(sc, A_LE_DB_ACTIVE_TABLE_START_INDEX); param[0] = FW_PARAM_PFVF(HPFILTER_START); param[1] = FW_PARAM_PFVF(HPFILTER_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query hpfilter parameters: %d.\n", rc); return (rc); } if ((int)val[1] > (int)val[0]) { sc->tids.hpftid_base = val[0]; sc->tids.hpftid_end = val[1]; sc->tids.nhpftids = val[1] - val[0] + 1; /* * These should go off if the layout changes and the * driver needs to catch up. */ MPASS(sc->tids.hpftid_base == 0); MPASS(sc->tids.tid_base == sc->tids.nhpftids); } } /* * MPSBGMAP is queried separately because only recent firmwares support * it as a parameter and we don't want the compound query above to fail * on older firmwares. */ param[0] = FW_PARAM_DEV(MPSBGMAP); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.mps_bg_map = val[0]; else sc->params.mps_bg_map = 0; /* * Determine whether the firmware supports the filter2 work request. * This is queried separately for the same reason as MPSBGMAP above. */ param[0] = FW_PARAM_DEV(FILTER2_WR); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.filter2_wr_support = val[0] != 0; else sc->params.filter2_wr_support = 0; /* * Find out whether we're allowed to use the ULPTX MEMWRITE DSGL. * This is queried separately for the same reason as other params above. */ param[0] = FW_PARAM_DEV(ULPTX_MEMWRITE_DSGL); val[0] = 0; rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc == 0) sc->params.ulptx_memwrite_dsgl = val[0] != 0; else sc->params.ulptx_memwrite_dsgl = false; /* get capabilites */ bzero(&caps, sizeof(caps)); caps.op_to_write = htobe32(V_FW_CMD_OP(FW_CAPS_CONFIG_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ); caps.cfvalid_to_len16 = htobe32(FW_LEN16(caps)); rc = -t4_wr_mbox(sc, sc->mbox, &caps, sizeof(caps), &caps); if (rc != 0) { device_printf(sc->dev, "failed to get card capabilities: %d.\n", rc); return (rc); } #define READ_CAPS(x) do { \ sc->x = htobe16(caps.x); \ } while (0) READ_CAPS(nbmcaps); READ_CAPS(linkcaps); READ_CAPS(switchcaps); READ_CAPS(niccaps); READ_CAPS(toecaps); READ_CAPS(rdmacaps); READ_CAPS(cryptocaps); READ_CAPS(iscsicaps); READ_CAPS(fcoecaps); if (sc->niccaps & FW_CAPS_CONFIG_NIC_HASHFILTER) { MPASS(chip_id(sc) > CHELSIO_T4); MPASS(sc->toecaps == 0); sc->toecaps = 0; param[0] = FW_PARAM_DEV(NTID); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query HASHFILTER parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; if (sc->params.fw_vers < (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) | V_FW_HDR_FW_VER_MICRO(5) | V_FW_HDR_FW_VER_BUILD(0))) { MPASS(sc->tids.ntids >= sc->tids.nhpftids); sc->tids.ntids -= sc->tids.nhpftids; } sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); sc->params.hash_filter = 1; } if (sc->niccaps & FW_CAPS_CONFIG_NIC_ETHOFLD) { param[0] = FW_PARAM_PFVF(ETHOFLD_START); param[1] = FW_PARAM_PFVF(ETHOFLD_END); param[2] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 3, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query NIC parameters: %d.\n", rc); return (rc); } if ((int)val[1] > (int)val[0]) { sc->tids.etid_base = val[0]; sc->tids.etid_end = val[1]; sc->tids.netids = val[1] - val[0] + 1; sc->params.eo_wr_cred = val[2]; sc->params.ethoffload = 1; } } if (sc->toecaps) { /* query offload-related parameters */ param[0] = FW_PARAM_DEV(NTID); param[1] = FW_PARAM_PFVF(SERVER_START); param[2] = FW_PARAM_PFVF(SERVER_END); param[3] = FW_PARAM_PFVF(TDDP_START); param[4] = FW_PARAM_PFVF(TDDP_END); param[5] = FW_PARAM_DEV(FLOWC_BUFFIFO_SZ); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TOE parameters: %d.\n", rc); return (rc); } sc->tids.ntids = val[0]; if (sc->params.fw_vers < (V_FW_HDR_FW_VER_MAJOR(1) | V_FW_HDR_FW_VER_MINOR(20) | V_FW_HDR_FW_VER_MICRO(5) | V_FW_HDR_FW_VER_BUILD(0))) { MPASS(sc->tids.ntids >= sc->tids.nhpftids); sc->tids.ntids -= sc->tids.nhpftids; } sc->tids.natids = min(sc->tids.ntids / 2, MAX_ATIDS); if ((int)val[2] > (int)val[1]) { sc->tids.stid_base = val[1]; sc->tids.nstids = val[2] - val[1] + 1; } sc->vres.ddp.start = val[3]; sc->vres.ddp.size = val[4] - val[3] + 1; sc->params.ofldq_wr_cred = val[5]; sc->params.offload = 1; } else { /* * The firmware attempts memfree TOE configuration for -SO cards * and will report toecaps=0 if it runs out of resources (this * depends on the config file). It may not report 0 for other * capabilities dependent on the TOE in this case. Set them to * 0 here so that the driver doesn't bother tracking resources * that will never be used. */ sc->iscsicaps = 0; sc->rdmacaps = 0; } if (sc->rdmacaps) { param[0] = FW_PARAM_PFVF(STAG_START); param[1] = FW_PARAM_PFVF(STAG_END); param[2] = FW_PARAM_PFVF(RQ_START); param[3] = FW_PARAM_PFVF(RQ_END); param[4] = FW_PARAM_PFVF(PBL_START); param[5] = FW_PARAM_PFVF(PBL_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(1): %d.\n", rc); return (rc); } sc->vres.stag.start = val[0]; sc->vres.stag.size = val[1] - val[0] + 1; sc->vres.rq.start = val[2]; sc->vres.rq.size = val[3] - val[2] + 1; sc->vres.pbl.start = val[4]; sc->vres.pbl.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SQRQ_START); param[1] = FW_PARAM_PFVF(SQRQ_END); param[2] = FW_PARAM_PFVF(CQ_START); param[3] = FW_PARAM_PFVF(CQ_END); param[4] = FW_PARAM_PFVF(OCQ_START); param[5] = FW_PARAM_PFVF(OCQ_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 6, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(2): %d.\n", rc); return (rc); } sc->vres.qp.start = val[0]; sc->vres.qp.size = val[1] - val[0] + 1; sc->vres.cq.start = val[2]; sc->vres.cq.size = val[3] - val[2] + 1; sc->vres.ocq.start = val[4]; sc->vres.ocq.size = val[5] - val[4] + 1; param[0] = FW_PARAM_PFVF(SRQ_START); param[1] = FW_PARAM_PFVF(SRQ_END); param[2] = FW_PARAM_DEV(MAXORDIRD_QP); param[3] = FW_PARAM_DEV(MAXIRD_ADAPTER); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 4, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query RDMA parameters(3): %d.\n", rc); return (rc); } sc->vres.srq.start = val[0]; sc->vres.srq.size = val[1] - val[0] + 1; sc->params.max_ordird_qp = val[2]; sc->params.max_ird_adapter = val[3]; } if (sc->iscsicaps) { param[0] = FW_PARAM_PFVF(ISCSI_START); param[1] = FW_PARAM_PFVF(ISCSI_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query iSCSI parameters: %d.\n", rc); return (rc); } sc->vres.iscsi.start = val[0]; sc->vres.iscsi.size = val[1] - val[0] + 1; } if (sc->cryptocaps & FW_CAPS_CONFIG_TLSKEYS) { param[0] = FW_PARAM_PFVF(TLS_START); param[1] = FW_PARAM_PFVF(TLS_END); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); if (rc != 0) { device_printf(sc->dev, "failed to query TLS parameters: %d.\n", rc); return (rc); } sc->vres.key.start = val[0]; sc->vres.key.size = val[1] - val[0] + 1; } t4_init_sge_params(sc); /* * We've got the params we wanted to query via the firmware. Now grab * some others directly from the chip. */ rc = t4_read_chip_settings(sc); return (rc); } static int set_params__post_init(struct adapter *sc) { uint32_t param, val; #ifdef TCP_OFFLOAD int i, v, shift; #endif /* ask for encapsulated CPLs */ param = FW_PARAM_PFVF(CPLFW4MSG_ENCAP); val = 1; (void)t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); /* Enable 32b port caps if the firmware supports it. */ param = FW_PARAM_PFVF(PORT_CAPS32); val = 1; if (t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val) == 0) sc->params.port_caps32 = 1; /* Let filter + maskhash steer to a part of the VI's RSS region. */ val = 1 << (G_MASKSIZE(t4_read_reg(sc, A_TP_RSS_CONFIG_TNL)) - 1); t4_set_reg_field(sc, A_TP_RSS_CONFIG_TNL, V_MASKFILTER(M_MASKFILTER), V_MASKFILTER(val - 1)); #ifdef TCP_OFFLOAD /* * Override the TOE timers with user provided tunables. This is not the * recommended way to change the timers (the firmware config file is) so * these tunables are not documented. * * All the timer tunables are in microseconds. */ if (t4_toe_keepalive_idle != 0) { v = us_to_tcp_ticks(sc, t4_toe_keepalive_idle); v &= M_KEEPALIVEIDLE; t4_set_reg_field(sc, A_TP_KEEP_IDLE, V_KEEPALIVEIDLE(M_KEEPALIVEIDLE), V_KEEPALIVEIDLE(v)); } if (t4_toe_keepalive_interval != 0) { v = us_to_tcp_ticks(sc, t4_toe_keepalive_interval); v &= M_KEEPALIVEINTVL; t4_set_reg_field(sc, A_TP_KEEP_INTVL, V_KEEPALIVEINTVL(M_KEEPALIVEINTVL), V_KEEPALIVEINTVL(v)); } if (t4_toe_keepalive_count != 0) { v = t4_toe_keepalive_count & M_KEEPALIVEMAXR2; t4_set_reg_field(sc, A_TP_SHIFT_CNT, V_KEEPALIVEMAXR1(M_KEEPALIVEMAXR1) | V_KEEPALIVEMAXR2(M_KEEPALIVEMAXR2), V_KEEPALIVEMAXR1(1) | V_KEEPALIVEMAXR2(v)); } if (t4_toe_rexmt_min != 0) { v = us_to_tcp_ticks(sc, t4_toe_rexmt_min); v &= M_RXTMIN; t4_set_reg_field(sc, A_TP_RXT_MIN, V_RXTMIN(M_RXTMIN), V_RXTMIN(v)); } if (t4_toe_rexmt_max != 0) { v = us_to_tcp_ticks(sc, t4_toe_rexmt_max); v &= M_RXTMAX; t4_set_reg_field(sc, A_TP_RXT_MAX, V_RXTMAX(M_RXTMAX), V_RXTMAX(v)); } if (t4_toe_rexmt_count != 0) { v = t4_toe_rexmt_count & M_RXTSHIFTMAXR2; t4_set_reg_field(sc, A_TP_SHIFT_CNT, V_RXTSHIFTMAXR1(M_RXTSHIFTMAXR1) | V_RXTSHIFTMAXR2(M_RXTSHIFTMAXR2), V_RXTSHIFTMAXR1(1) | V_RXTSHIFTMAXR2(v)); } for (i = 0; i < nitems(t4_toe_rexmt_backoff); i++) { if (t4_toe_rexmt_backoff[i] != -1) { v = t4_toe_rexmt_backoff[i] & M_TIMERBACKOFFINDEX0; shift = (i & 3) << 3; t4_set_reg_field(sc, A_TP_TCP_BACKOFF_REG0 + (i & ~3), M_TIMERBACKOFFINDEX0 << shift, v << shift); } } #endif return (0); } #undef FW_PARAM_PFVF #undef FW_PARAM_DEV static void t4_set_desc(struct adapter *sc) { char buf[128]; struct adapter_params *p = &sc->params; snprintf(buf, sizeof(buf), "Chelsio %s", p->vpd.id); device_set_desc_copy(sc->dev, buf); } static inline void ifmedia_add4(struct ifmedia *ifm, int m) { ifmedia_add(ifm, m, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_TXPAUSE, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_RXPAUSE, 0, NULL); ifmedia_add(ifm, m | IFM_ETH_TXPAUSE | IFM_ETH_RXPAUSE, 0, NULL); } /* * This is the selected media, which is not quite the same as the active media. * The media line in ifconfig is "media: Ethernet selected (active)" if selected * and active are not the same, and "media: Ethernet selected" otherwise. */ static void set_current_media(struct port_info *pi) { struct link_config *lc; struct ifmedia *ifm; int mword; u_int speed; PORT_LOCK_ASSERT_OWNED(pi); /* Leave current media alone if it's already set to IFM_NONE. */ ifm = &pi->media; if (ifm->ifm_cur != NULL && IFM_SUBTYPE(ifm->ifm_cur->ifm_media) == IFM_NONE) return; lc = &pi->link_cfg; if (lc->requested_aneg != AUTONEG_DISABLE && lc->supported & FW_PORT_CAP32_ANEG) { ifmedia_set(ifm, IFM_ETHER | IFM_AUTO); return; } mword = IFM_ETHER | IFM_FDX; if (lc->requested_fc & PAUSE_TX) mword |= IFM_ETH_TXPAUSE; if (lc->requested_fc & PAUSE_RX) mword |= IFM_ETH_RXPAUSE; if (lc->requested_speed == 0) speed = port_top_speed(pi) * 1000; /* Gbps -> Mbps */ else speed = lc->requested_speed; mword |= port_mword(pi, speed_to_fwcap(speed)); ifmedia_set(ifm, mword); } /* * Returns true if the ifmedia list for the port cannot change. */ static bool fixed_ifmedia(struct port_info *pi) { return (pi->port_type == FW_PORT_TYPE_BT_SGMII || pi->port_type == FW_PORT_TYPE_BT_XFI || pi->port_type == FW_PORT_TYPE_BT_XAUI || pi->port_type == FW_PORT_TYPE_KX4 || pi->port_type == FW_PORT_TYPE_KX || pi->port_type == FW_PORT_TYPE_KR || pi->port_type == FW_PORT_TYPE_BP_AP || pi->port_type == FW_PORT_TYPE_BP4_AP || pi->port_type == FW_PORT_TYPE_BP40_BA || pi->port_type == FW_PORT_TYPE_KR4_100G || pi->port_type == FW_PORT_TYPE_KR_SFP28 || pi->port_type == FW_PORT_TYPE_KR_XLAUI); } static void build_medialist(struct port_info *pi) { uint32_t ss, speed; int unknown, mword, bit; struct link_config *lc; struct ifmedia *ifm; PORT_LOCK_ASSERT_OWNED(pi); if (pi->flags & FIXED_IFMEDIA) return; /* * Rebuild the ifmedia list. */ ifm = &pi->media; ifmedia_removeall(ifm); lc = &pi->link_cfg; ss = G_FW_PORT_CAP32_SPEED(lc->supported); /* Supported Speeds */ if (__predict_false(ss == 0)) { /* not supposed to happen. */ MPASS(ss != 0); no_media: MPASS(LIST_EMPTY(&ifm->ifm_list)); ifmedia_add(ifm, IFM_ETHER | IFM_NONE, 0, NULL); ifmedia_set(ifm, IFM_ETHER | IFM_NONE); return; } unknown = 0; for (bit = S_FW_PORT_CAP32_SPEED; bit < fls(ss); bit++) { speed = 1 << bit; MPASS(speed & M_FW_PORT_CAP32_SPEED); if (ss & speed) { mword = port_mword(pi, speed); if (mword == IFM_NONE) { goto no_media; } else if (mword == IFM_UNKNOWN) unknown++; else ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | mword); } } if (unknown > 0) /* Add one unknown for all unknown media types. */ ifmedia_add4(ifm, IFM_ETHER | IFM_FDX | IFM_UNKNOWN); if (lc->supported & FW_PORT_CAP32_ANEG) ifmedia_add(ifm, IFM_ETHER | IFM_AUTO, 0, NULL); set_current_media(pi); } /* * Initialize the requested fields in the link config based on driver tunables. */ static void init_link_config(struct port_info *pi) { struct link_config *lc = &pi->link_cfg; PORT_LOCK_ASSERT_OWNED(pi); lc->requested_speed = 0; if (t4_autoneg == 0) lc->requested_aneg = AUTONEG_DISABLE; else if (t4_autoneg == 1) lc->requested_aneg = AUTONEG_ENABLE; else lc->requested_aneg = AUTONEG_AUTO; lc->requested_fc = t4_pause_settings & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG); if (t4_fec == -1 || t4_fec & FEC_AUTO) lc->requested_fec = FEC_AUTO; else { lc->requested_fec = FEC_NONE; if (t4_fec & FEC_RS) lc->requested_fec |= FEC_RS; if (t4_fec & FEC_BASER_RS) lc->requested_fec |= FEC_BASER_RS; } } /* * Makes sure that all requested settings comply with what's supported by the * port. Returns the number of settings that were invalid and had to be fixed. */ static int fixup_link_config(struct port_info *pi) { int n = 0; struct link_config *lc = &pi->link_cfg; uint32_t fwspeed; PORT_LOCK_ASSERT_OWNED(pi); /* Speed (when not autonegotiating) */ if (lc->requested_speed != 0) { fwspeed = speed_to_fwcap(lc->requested_speed); if ((fwspeed & lc->supported) == 0) { n++; lc->requested_speed = 0; } } /* Link autonegotiation */ MPASS(lc->requested_aneg == AUTONEG_ENABLE || lc->requested_aneg == AUTONEG_DISABLE || lc->requested_aneg == AUTONEG_AUTO); if (lc->requested_aneg == AUTONEG_ENABLE && !(lc->supported & FW_PORT_CAP32_ANEG)) { n++; lc->requested_aneg = AUTONEG_AUTO; } /* Flow control */ MPASS((lc->requested_fc & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) == 0); if (lc->requested_fc & PAUSE_TX && !(lc->supported & FW_PORT_CAP32_FC_TX)) { n++; lc->requested_fc &= ~PAUSE_TX; } if (lc->requested_fc & PAUSE_RX && !(lc->supported & FW_PORT_CAP32_FC_RX)) { n++; lc->requested_fc &= ~PAUSE_RX; } if (!(lc->requested_fc & PAUSE_AUTONEG) && !(lc->supported & FW_PORT_CAP32_FORCE_PAUSE)) { n++; lc->requested_fc |= PAUSE_AUTONEG; } /* FEC */ if ((lc->requested_fec & FEC_RS && !(lc->supported & FW_PORT_CAP32_FEC_RS)) || (lc->requested_fec & FEC_BASER_RS && !(lc->supported & FW_PORT_CAP32_FEC_BASER_RS))) { n++; lc->requested_fec = FEC_AUTO; } return (n); } /* * Apply the requested L1 settings, which are expected to be valid, to the * hardware. */ static int apply_link_config(struct port_info *pi) { struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; #ifdef INVARIANTS ASSERT_SYNCHRONIZED_OP(sc); PORT_LOCK_ASSERT_OWNED(pi); if (lc->requested_aneg == AUTONEG_ENABLE) MPASS(lc->supported & FW_PORT_CAP32_ANEG); if (!(lc->requested_fc & PAUSE_AUTONEG)) MPASS(lc->supported & FW_PORT_CAP32_FORCE_PAUSE); if (lc->requested_fc & PAUSE_TX) MPASS(lc->supported & FW_PORT_CAP32_FC_TX); if (lc->requested_fc & PAUSE_RX) MPASS(lc->supported & FW_PORT_CAP32_FC_RX); if (lc->requested_fec & FEC_RS) MPASS(lc->supported & FW_PORT_CAP32_FEC_RS); if (lc->requested_fec & FEC_BASER_RS) MPASS(lc->supported & FW_PORT_CAP32_FEC_BASER_RS); #endif rc = -t4_link_l1cfg(sc, sc->mbox, pi->tx_chan, lc); if (rc != 0) { /* Don't complain if the VF driver gets back an EPERM. */ if (!(sc->flags & IS_VF) || rc != FW_EPERM) device_printf(pi->dev, "l1cfg failed: %d\n", rc); } else { /* * An L1_CFG will almost always result in a link-change event if * the link is up, and the driver will refresh the actual * fec/fc/etc. when the notification is processed. If the link * is down then the actual settings are meaningless. * * This takes care of the case where a change in the L1 settings * may not result in a notification. */ if (lc->link_ok && !(lc->requested_fc & PAUSE_AUTONEG)) lc->fc = lc->requested_fc & (PAUSE_TX | PAUSE_RX); } return (rc); } #define FW_MAC_EXACT_CHUNK 7 /* * Program the port's XGMAC based on parameters in ifnet. The caller also * indicates which parameters should be programmed (the rest are left alone). */ int update_mac_settings(struct ifnet *ifp, int flags) { int rc = 0; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); if (flags & XGMAC_MTU) mtu = ifp->if_mtu; if (flags & XGMAC_PROMISC) promisc = ifp->if_flags & IFF_PROMISC ? 1 : 0; if (flags & XGMAC_ALLMULTI) allmulti = ifp->if_flags & IFF_ALLMULTI ? 1 : 0; if (flags & XGMAC_VLANEX) vlanex = ifp->if_capenable & IFCAP_VLAN_HWTAGGING ? 1 : 0; if (flags & (XGMAC_MTU|XGMAC_PROMISC|XGMAC_ALLMULTI|XGMAC_VLANEX)) { rc = -t4_set_rxmode(sc, sc->mbox, vi->viid, mtu, promisc, allmulti, 1, vlanex, false); if (rc) { if_printf(ifp, "set_rxmode (%x) failed: %d\n", flags, rc); return (rc); } } if (flags & XGMAC_UCADDR) { uint8_t ucaddr[ETHER_ADDR_LEN]; bcopy(IF_LLADDR(ifp), ucaddr, sizeof(ucaddr)); rc = t4_change_mac(sc, sc->mbox, vi->viid, vi->xact_addr_filt, ucaddr, true, &vi->smt_idx); if (rc < 0) { rc = -rc; if_printf(ifp, "change_mac failed: %d\n", rc); return (rc); } else { vi->xact_addr_filt = rc; rc = 0; } } if (flags & XGMAC_MCADDRS) { const uint8_t *mcaddr[FW_MAC_EXACT_CHUNK]; int del = 1; uint64_t hash = 0; struct ifmultiaddr *ifma; int i = 0, j; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mcaddr[i] = LLADDR((struct sockaddr_dl *)ifma->ifma_addr); MPASS(ETHER_IS_MULTICAST(mcaddr[i])); i++; if (i == FW_MAC_EXACT_CHUNK) { rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } del = 0; i = 0; } } if (i > 0) { rc = t4_alloc_mac_filt(sc, sc->mbox, vi->viid, del, i, mcaddr, NULL, &hash, 0); if (rc < 0) { rc = -rc; for (j = 0; j < i; j++) { if_printf(ifp, "failed to add mc address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", mcaddr[j][0], mcaddr[j][1], mcaddr[j][2], mcaddr[j][3], mcaddr[j][4], mcaddr[j][5], rc); } goto mcfail; } } rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, hash, 0); if (rc != 0) if_printf(ifp, "failed to set mc address hash: %d", rc); mcfail: if_maddr_runlock(ifp); } return (rc); } /* * {begin|end}_synchronized_op must be called from the same thread. */ int begin_synchronized_op(struct adapter *sc, struct vi_info *vi, int flags, char *wmesg) { int rc, pri; #ifdef WITNESS /* the caller thinks it's ok to sleep, but is it really? */ if (flags & SLEEP_OK) WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "begin_synchronized_op"); #endif if (INTR_OK) pri = PCATCH; else pri = 0; ADAPTER_LOCK(sc); for (;;) { if (vi && IS_DOOMED(vi)) { rc = ENXIO; goto done; } if (!IS_BUSY(sc)) { rc = 0; break; } if (!(flags & SLEEP_OK)) { rc = EBUSY; goto done; } if (mtx_sleep(&sc->flags, &sc->sc_lock, pri, wmesg, 0)) { rc = EINTR; goto done; } } KASSERT(!IS_BUSY(sc), ("%s: controller busy.", __func__)); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = wmesg; sc->last_op_thr = curthread; sc->last_op_flags = flags; #endif done: if (!(flags & HOLD_LOCK) || rc) ADAPTER_UNLOCK(sc); return (rc); } /* * Tell if_ioctl and if_init that the VI is going away. This is * special variant of begin_synchronized_op and must be paired with a * call to end_synchronized_op. */ void doom_vi(struct adapter *sc, struct vi_info *vi) { ADAPTER_LOCK(sc); SET_DOOMED(vi); wakeup(&sc->flags); while (IS_BUSY(sc)) mtx_sleep(&sc->flags, &sc->sc_lock, 0, "t4detach", 0); SET_BUSY(sc); #ifdef INVARIANTS sc->last_op = "t4detach"; sc->last_op_thr = curthread; sc->last_op_flags = 0; #endif ADAPTER_UNLOCK(sc); } /* * {begin|end}_synchronized_op must be called from the same thread. */ void end_synchronized_op(struct adapter *sc, int flags) { if (flags & LOCK_HELD) ADAPTER_LOCK_ASSERT_OWNED(sc); else ADAPTER_LOCK(sc); KASSERT(IS_BUSY(sc), ("%s: controller not busy.", __func__)); CLR_BUSY(sc); wakeup(&sc->flags); ADAPTER_UNLOCK(sc); } static int cxgbe_init_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; int rc = 0, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return (0); /* already running */ if (!(sc->flags & FULL_INIT_DONE) && ((rc = adapter_full_init(sc)) != 0)) return (rc); /* error message displayed already */ if (!(vi->flags & VI_INIT_DONE) && ((rc = vi_full_init(vi)) != 0)) return (rc); /* error message displayed already */ rc = update_mac_settings(ifp, XGMAC_ALL); if (rc) goto done; /* error message displayed already */ PORT_LOCK(pi); if (pi->up_vis == 0) { t4_update_port_info(pi); fixup_link_config(pi); build_medialist(pi); apply_link_config(pi); } rc = -t4_enable_vi(sc, sc->mbox, vi->viid, true, true); if (rc != 0) { if_printf(ifp, "enable_vi failed: %d\n", rc); PORT_UNLOCK(pi); goto done; } /* * Can't fail from this point onwards. Review cxgbe_uninit_synchronized * if this changes. */ for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags |= EQ_ENABLED; TXQ_UNLOCK(txq); } /* * The first iq of the first port to come up is used for tracing. */ if (sc->traceq < 0 && IS_MAIN_VI(vi)) { sc->traceq = sc->sge.rxq[vi->first_rxq].iq.abs_id; t4_write_reg(sc, is_t4(sc) ? A_MPS_TRC_RSS_CONTROL : A_MPS_T5_TRC_RSS_CONTROL, V_RSSCONTROL(pi->tx_chan) | V_QUEUENUMBER(sc->traceq)); pi->flags |= HAS_TRACEQ; } /* all ok */ pi->up_vis++; ifp->if_drv_flags |= IFF_DRV_RUNNING; if (pi->nvi > 1 || sc->flags & IS_VF) callout_reset(&vi->tick, hz, vi_tick, vi); else callout_reset(&pi->tick, hz, cxgbe_tick, pi); if (pi->link_cfg.link_ok) t4_os_link_changed(pi); PORT_UNLOCK(pi); done: if (rc != 0) cxgbe_uninit_synchronized(vi); return (rc); } /* * Idempotent. */ static int cxgbe_uninit_synchronized(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; int rc, i; struct sge_txq *txq; ASSERT_SYNCHRONIZED_OP(sc); if (!(vi->flags & VI_INIT_DONE)) { if (__predict_false(ifp->if_drv_flags & IFF_DRV_RUNNING)) { KASSERT(0, ("uninited VI is running")); if_printf(ifp, "uninited VI with running ifnet. " "vi->flags 0x%016lx, if_flags 0x%08x, " "if_drv_flags 0x%08x\n", vi->flags, ifp->if_flags, ifp->if_drv_flags); } return (0); } /* * Disable the VI so that all its data in either direction is discarded * by the MPS. Leave everything else (the queues, interrupts, and 1Hz * tick) intact as the TP can deliver negative advice or data that it's * holding in its RAM (for an offloaded connection) even after the VI is * disabled. */ rc = -t4_enable_vi(sc, sc->mbox, vi->viid, false, false); if (rc) { if_printf(ifp, "disable_vi failed: %d\n", rc); return (rc); } for_each_txq(vi, i, txq) { TXQ_LOCK(txq); txq->eq.flags &= ~EQ_ENABLED; TXQ_UNLOCK(txq); } PORT_LOCK(pi); if (pi->nvi > 1 || sc->flags & IS_VF) callout_stop(&vi->tick); else callout_stop(&pi->tick); if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { PORT_UNLOCK(pi); return (0); } ifp->if_drv_flags &= ~IFF_DRV_RUNNING; pi->up_vis--; if (pi->up_vis > 0) { PORT_UNLOCK(pi); return (0); } pi->link_cfg.link_ok = false; pi->link_cfg.speed = 0; pi->link_cfg.link_down_rc = 255; t4_os_link_changed(pi); PORT_UNLOCK(pi); return (0); } /* * It is ok for this function to fail midway and return right away. t4_detach * will walk the entire sc->irq list and clean up whatever is valid. */ int t4_setup_intr_handlers(struct adapter *sc) { int rc, rid, p, q, v; char s[8]; struct irq *irq; struct port_info *pi; struct vi_info *vi; struct sge *sge = &sc->sge; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; #endif #ifdef RSS int nbuckets = rss_getnumbuckets(); #endif /* * Setup interrupts. */ irq = &sc->irq[0]; rid = sc->intr_type == INTR_INTX ? 0 : 1; if (forwarding_intr_to_fwq(sc)) return (t4_alloc_irq(sc, irq, rid, t4_intr_all, sc, "all")); /* Multiple interrupts. */ if (sc->flags & IS_VF) KASSERT(sc->intr_count >= T4VF_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); else KASSERT(sc->intr_count >= T4_EXTRA_INTR + sc->params.nports, ("%s: too few intr.", __func__)); /* The first one is always error intr on PFs */ if (!(sc->flags & IS_VF)) { rc = t4_alloc_irq(sc, irq, rid, t4_intr_err, sc, "err"); if (rc != 0) return (rc); irq++; rid++; } /* The second one is always the firmware event queue (first on VFs) */ rc = t4_alloc_irq(sc, irq, rid, t4_intr_evt, &sge->fwq, "evt"); if (rc != 0) return (rc); irq++; rid++; for_each_port(sc, p) { pi = sc->port[p]; for_each_vi(pi, v, vi) { vi->first_intr = rid - 1; if (vi->nnmrxq > 0) { int n = max(vi->nrxq, vi->nnmrxq); rxq = &sge->rxq[vi->first_rxq]; #ifdef DEV_NETMAP nm_rxq = &sge->nm_rxq[vi->first_nm_rxq]; #endif for (q = 0; q < n; q++) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); if (q < vi->nrxq) irq->rxq = rxq++; #ifdef DEV_NETMAP if (q < vi->nnmrxq) irq->nm_rxq = nm_rxq++; if (irq->nm_rxq != NULL && irq->rxq == NULL) { /* Netmap rx only */ rc = t4_alloc_irq(sc, irq, rid, t4_nm_intr, irq->nm_rxq, s); } if (irq->nm_rxq != NULL && irq->rxq != NULL) { /* NIC and Netmap rx */ rc = t4_alloc_irq(sc, irq, rid, t4_vi_intr, irq, s); } #endif if (irq->rxq != NULL && irq->nm_rxq == NULL) { /* NIC rx only */ rc = t4_alloc_irq(sc, irq, rid, t4_intr, irq->rxq, s); } if (rc != 0) return (rc); #ifdef RSS if (q < vi->nrxq) { bus_bind_intr(sc->dev, irq->res, rss_getcpu(q % nbuckets)); } #endif irq++; rid++; vi->nintr++; } } else { for_each_rxq(vi, q, rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'a' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, rxq, s); if (rc != 0) return (rc); #ifdef RSS bus_bind_intr(sc->dev, irq->res, rss_getcpu(q % nbuckets)); #endif irq++; rid++; vi->nintr++; } } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, q, ofld_rxq) { snprintf(s, sizeof(s), "%x%c%x", p, 'A' + v, q); rc = t4_alloc_irq(sc, irq, rid, t4_intr, ofld_rxq, s); if (rc != 0) return (rc); irq++; rid++; vi->nintr++; } #endif } } MPASS(irq == &sc->irq[sc->intr_count]); return (0); } int adapter_full_init(struct adapter *sc) { int rc, i; #ifdef RSS uint32_t raw_rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; uint32_t rss_key[RSS_KEYSIZE / sizeof(uint32_t)]; #endif ASSERT_SYNCHRONIZED_OP(sc); ADAPTER_LOCK_ASSERT_NOTOWNED(sc); KASSERT((sc->flags & FULL_INIT_DONE) == 0, ("%s: FULL_INIT_DONE already", __func__)); /* * queues that belong to the adapter (not any particular port). */ rc = t4_setup_adapter_queues(sc); if (rc != 0) goto done; for (i = 0; i < nitems(sc->tq); i++) { sc->tq[i] = taskqueue_create("t4 taskq", M_NOWAIT, taskqueue_thread_enqueue, &sc->tq[i]); if (sc->tq[i] == NULL) { device_printf(sc->dev, "failed to allocate task queue %d\n", i); rc = ENOMEM; goto done; } taskqueue_start_threads(&sc->tq[i], 1, PI_NET, "%s tq%d", device_get_nameunit(sc->dev), i); } #ifdef RSS MPASS(RSS_KEYSIZE == 40); rss_getkey((void *)&raw_rss_key[0]); for (i = 0; i < nitems(rss_key); i++) { rss_key[i] = htobe32(raw_rss_key[nitems(rss_key) - 1 - i]); } t4_write_rss_key(sc, &rss_key[0], -1, 1); #endif if (!(sc->flags & IS_VF)) t4_intr_enable(sc); sc->flags |= FULL_INIT_DONE; done: if (rc != 0) adapter_full_uninit(sc); return (rc); } int adapter_full_uninit(struct adapter *sc) { int i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); t4_teardown_adapter_queues(sc); for (i = 0; i < nitems(sc->tq) && sc->tq[i]; i++) { taskqueue_free(sc->tq[i]); sc->tq[i] = NULL; } sc->flags &= ~FULL_INIT_DONE; return (0); } #ifdef RSS #define SUPPORTED_RSS_HASHTYPES (RSS_HASHTYPE_RSS_IPV4 | \ RSS_HASHTYPE_RSS_TCP_IPV4 | RSS_HASHTYPE_RSS_IPV6 | \ RSS_HASHTYPE_RSS_TCP_IPV6 | RSS_HASHTYPE_RSS_UDP_IPV4 | \ RSS_HASHTYPE_RSS_UDP_IPV6) /* Translates kernel hash types to hardware. */ static int hashconfig_to_hashen(int hashconfig) { int hashen = 0; if (hashconfig & RSS_HASHTYPE_RSS_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV4) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_UDP_IPV6) { hashen |= F_FW_RSS_VI_CONFIG_CMD_UDPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; } if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV4) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN; if (hashconfig & RSS_HASHTYPE_RSS_TCP_IPV6) hashen |= F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN; return (hashen); } /* Translates hardware hash types to kernel. */ static int hashen_to_hashconfig(int hashen) { int hashconfig = 0; if (hashen & F_FW_RSS_VI_CONFIG_CMD_UDPEN) { /* * If UDP hashing was enabled it must have been enabled for * either IPv4 or IPv6 (inclusive or). Enabling UDP without * enabling any 4-tuple hash is nonsense configuration. */ MPASS(hashen & (F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN)); if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_UDP_IPV6; } if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN) hashconfig |= RSS_HASHTYPE_RSS_TCP_IPV6; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV4; if (hashen & F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN) hashconfig |= RSS_HASHTYPE_RSS_IPV6; return (hashconfig); } #endif int vi_full_init(struct vi_info *vi) { struct adapter *sc = vi->pi->adapter; struct ifnet *ifp = vi->ifp; uint16_t *rss; struct sge_rxq *rxq; int rc, i, j; #ifdef RSS int nbuckets = rss_getnumbuckets(); int hashconfig = rss_gethashconfig(); int extra; #endif ASSERT_SYNCHRONIZED_OP(sc); KASSERT((vi->flags & VI_INIT_DONE) == 0, ("%s: VI_INIT_DONE already", __func__)); sysctl_ctx_init(&vi->ctx); vi->flags |= VI_SYSCTL_CTX; /* * Allocate tx/rx/fl queues for this VI. */ rc = t4_setup_vi_queues(vi); if (rc != 0) goto done; /* error message displayed already */ /* * Setup RSS for this VI. Save a copy of the RSS table for later use. */ if (vi->nrxq > vi->rss_size) { if_printf(ifp, "nrxq (%d) > hw RSS table size (%d); " "some queues will never receive traffic.\n", vi->nrxq, vi->rss_size); } else if (vi->rss_size % vi->nrxq) { if_printf(ifp, "nrxq (%d), hw RSS table size (%d); " "expect uneven traffic distribution.\n", vi->nrxq, vi->rss_size); } #ifdef RSS if (vi->nrxq != nbuckets) { if_printf(ifp, "nrxq (%d) != kernel RSS buckets (%d);" "performance will be impacted.\n", vi->nrxq, nbuckets); } #endif rss = malloc(vi->rss_size * sizeof (*rss), M_CXGBE, M_ZERO | M_WAITOK); for (i = 0; i < vi->rss_size;) { #ifdef RSS j = rss_get_indirection_to_bucket(i); j %= vi->nrxq; rxq = &sc->sge.rxq[vi->first_rxq + j]; rss[i++] = rxq->iq.abs_id; #else for_each_rxq(vi, j, rxq) { rss[i++] = rxq->iq.abs_id; if (i == vi->rss_size) break; } #endif } rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size, rss, vi->rss_size); if (rc != 0) { free(rss, M_CXGBE); if_printf(ifp, "rss_config failed: %d\n", rc); goto done; } #ifdef RSS vi->hashen = hashconfig_to_hashen(hashconfig); /* * We may have had to enable some hashes even though the global config * wants them disabled. This is a potential problem that must be * reported to the user. */ extra = hashen_to_hashconfig(vi->hashen) ^ hashconfig; /* * If we consider only the supported hash types, then the enabled hashes * are a superset of the requested hashes. In other words, there cannot * be any supported hash that was requested but not enabled, but there * can be hashes that were not requested but had to be enabled. */ extra &= SUPPORTED_RSS_HASHTYPES; MPASS((extra & hashconfig) == 0); if (extra) { if_printf(ifp, "global RSS config (0x%x) cannot be accommodated.\n", hashconfig); } if (extra & RSS_HASHTYPE_RSS_IPV4) if_printf(ifp, "IPv4 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV4) if_printf(ifp, "TCP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_IPV6) if_printf(ifp, "IPv6 2-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_TCP_IPV6) if_printf(ifp, "TCP/IPv6 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV4) if_printf(ifp, "UDP/IPv4 4-tuple hashing forced on.\n"); if (extra & RSS_HASHTYPE_RSS_UDP_IPV6) if_printf(ifp, "UDP/IPv6 4-tuple hashing forced on.\n"); #else vi->hashen = F_FW_RSS_VI_CONFIG_CMD_IP6FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP6TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4FOURTUPEN | F_FW_RSS_VI_CONFIG_CMD_IP4TWOTUPEN | F_FW_RSS_VI_CONFIG_CMD_UDPEN; #endif rc = -t4_config_vi_rss(sc, sc->mbox, vi->viid, vi->hashen, rss[0], 0, 0); if (rc != 0) { free(rss, M_CXGBE); if_printf(ifp, "rss hash/defaultq config failed: %d\n", rc); goto done; } vi->rss = rss; vi->flags |= VI_INIT_DONE; done: if (rc != 0) vi_full_uninit(vi); return (rc); } /* * Idempotent. */ int vi_full_uninit(struct vi_info *vi) { struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int i; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_wrq *ofld_txq; #endif if (vi->flags & VI_INIT_DONE) { /* Need to quiesce queues. */ /* XXX: Only for the first VI? */ if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF)) quiesce_wrq(sc, &sc->sge.ctrlq[pi->port_id]); for_each_txq(vi, i, txq) { quiesce_txq(sc, txq); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, i, ofld_txq) { quiesce_wrq(sc, ofld_txq); } #endif for_each_rxq(vi, i, rxq) { quiesce_iq(sc, &rxq->iq); quiesce_fl(sc, &rxq->fl); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { quiesce_iq(sc, &ofld_rxq->iq); quiesce_fl(sc, &ofld_rxq->fl); } #endif free(vi->rss, M_CXGBE); free(vi->nm_rss, M_CXGBE); } t4_teardown_vi_queues(vi); vi->flags &= ~VI_INIT_DONE; return (0); } static void quiesce_txq(struct adapter *sc, struct sge_txq *txq) { struct sge_eq *eq = &txq->eq; struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; (void) sc; /* unused */ #ifdef INVARIANTS TXQ_LOCK(txq); MPASS((eq->flags & EQ_ENABLED) == 0); TXQ_UNLOCK(txq); #endif /* Wait for the mp_ring to empty. */ while (!mp_ring_is_idle(txq->r)) { mp_ring_check_drainage(txq->r, 0); pause("rquiesce", 1); } /* Then wait for the hardware to finish. */ while (spg->cidx != htobe16(eq->pidx)) pause("equiesce", 1); /* Finally, wait for the driver to reclaim all descriptors. */ while (eq->cidx != eq->pidx) pause("dquiesce", 1); } static void quiesce_wrq(struct adapter *sc, struct sge_wrq *wrq) { /* XXXTX */ } static void quiesce_iq(struct adapter *sc, struct sge_iq *iq) { (void) sc; /* unused */ /* Synchronize with the interrupt handler */ while (!atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_DISABLED)) pause("iqfree", 1); } static void quiesce_fl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); fl->flags |= FL_DOOMED; FL_UNLOCK(fl); callout_stop(&sc->sfl_callout); mtx_unlock(&sc->sfl_lock); KASSERT((fl->flags & FL_STARVING) == 0, ("%s: still starving", __func__)); } static int t4_alloc_irq(struct adapter *sc, struct irq *irq, int rid, driver_intr_t *handler, void *arg, char *name) { int rc; irq->rid = rid; irq->res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->rid, RF_SHAREABLE | RF_ACTIVE); if (irq->res == NULL) { device_printf(sc->dev, "failed to allocate IRQ for rid %d, name %s.\n", rid, name); return (ENOMEM); } rc = bus_setup_intr(sc->dev, irq->res, INTR_MPSAFE | INTR_TYPE_NET, NULL, handler, arg, &irq->tag); if (rc != 0) { device_printf(sc->dev, "failed to setup interrupt for rid %d, name %s: %d\n", rid, name, rc); } else if (name) bus_describe_intr(sc->dev, irq->res, irq->tag, "%s", name); return (rc); } static int t4_free_irq(struct adapter *sc, struct irq *irq) { if (irq->tag) bus_teardown_intr(sc->dev, irq->res, irq->tag); if (irq->res) bus_release_resource(sc->dev, SYS_RES_IRQ, irq->rid, irq->res); bzero(irq, sizeof(*irq)); return (0); } static void get_regs(struct adapter *sc, struct t4_regdump *regs, uint8_t *buf) { regs->version = chip_id(sc) | chip_rev(sc) << 10; t4_get_regs(sc, buf, regs->len); } #define A_PL_INDIR_CMD 0x1f8 #define S_PL_AUTOINC 31 #define M_PL_AUTOINC 0x1U #define V_PL_AUTOINC(x) ((x) << S_PL_AUTOINC) #define G_PL_AUTOINC(x) (((x) >> S_PL_AUTOINC) & M_PL_AUTOINC) #define S_PL_VFID 20 #define M_PL_VFID 0xffU #define V_PL_VFID(x) ((x) << S_PL_VFID) #define G_PL_VFID(x) (((x) >> S_PL_VFID) & M_PL_VFID) #define S_PL_ADDR 0 #define M_PL_ADDR 0xfffffU #define V_PL_ADDR(x) ((x) << S_PL_ADDR) #define G_PL_ADDR(x) (((x) >> S_PL_ADDR) & M_PL_ADDR) #define A_PL_INDIR_DATA 0x1fc static uint64_t read_vf_stat(struct adapter *sc, u_int vin, int reg) { u32 stats[2]; mtx_assert(&sc->reg_lock, MA_OWNED); if (sc->flags & IS_VF) { stats[0] = t4_read_reg(sc, VF_MPS_REG(reg)); stats[1] = t4_read_reg(sc, VF_MPS_REG(reg + 4)); } else { t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(reg))); stats[0] = t4_read_reg(sc, A_PL_INDIR_DATA); stats[1] = t4_read_reg(sc, A_PL_INDIR_DATA); } return (((uint64_t)stats[1]) << 32 | stats[0]); } static void t4_get_vi_stats(struct adapter *sc, u_int vin, struct fw_vi_stats_vf *stats) { #define GET_STAT(name) \ read_vf_stat(sc, vin, A_MPS_VF_STAT_##name##_L) stats->tx_bcast_bytes = GET_STAT(TX_VF_BCAST_BYTES); stats->tx_bcast_frames = GET_STAT(TX_VF_BCAST_FRAMES); stats->tx_mcast_bytes = GET_STAT(TX_VF_MCAST_BYTES); stats->tx_mcast_frames = GET_STAT(TX_VF_MCAST_FRAMES); stats->tx_ucast_bytes = GET_STAT(TX_VF_UCAST_BYTES); stats->tx_ucast_frames = GET_STAT(TX_VF_UCAST_FRAMES); stats->tx_drop_frames = GET_STAT(TX_VF_DROP_FRAMES); stats->tx_offload_bytes = GET_STAT(TX_VF_OFFLOAD_BYTES); stats->tx_offload_frames = GET_STAT(TX_VF_OFFLOAD_FRAMES); stats->rx_bcast_bytes = GET_STAT(RX_VF_BCAST_BYTES); stats->rx_bcast_frames = GET_STAT(RX_VF_BCAST_FRAMES); stats->rx_mcast_bytes = GET_STAT(RX_VF_MCAST_BYTES); stats->rx_mcast_frames = GET_STAT(RX_VF_MCAST_FRAMES); stats->rx_ucast_bytes = GET_STAT(RX_VF_UCAST_BYTES); stats->rx_ucast_frames = GET_STAT(RX_VF_UCAST_FRAMES); stats->rx_err_frames = GET_STAT(RX_VF_ERR_FRAMES); #undef GET_STAT } static void t4_clr_vi_stats(struct adapter *sc, u_int vin) { int reg; t4_write_reg(sc, A_PL_INDIR_CMD, V_PL_AUTOINC(1) | V_PL_VFID(vin) | V_PL_ADDR(VF_MPS_REG(A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L))); for (reg = A_MPS_VF_STAT_TX_VF_BCAST_BYTES_L; reg <= A_MPS_VF_STAT_RX_VF_ERR_FRAMES_H; reg += 4) t4_write_reg(sc, A_PL_INDIR_DATA, 0); } static void vi_refresh_stats(struct adapter *sc, struct vi_info *vi) { struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ if (!(vi->flags & VI_INIT_DONE)) return; getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &vi->last_refreshed, <)) return; mtx_lock(&sc->reg_lock); t4_get_vi_stats(sc, vi->vin, &vi->stats); getmicrotime(&vi->last_refreshed); mtx_unlock(&sc->reg_lock); } static void cxgbe_refresh_stats(struct adapter *sc, struct port_info *pi) { u_int i, v, tnl_cong_drops, bg_map; struct timeval tv; const struct timeval interval = {0, 250000}; /* 250ms */ getmicrotime(&tv); timevalsub(&tv, &interval); if (timevalcmp(&tv, &pi->last_refreshed, <)) return; tnl_cong_drops = 0; t4_get_port_stats(sc, pi->tx_chan, &pi->stats); bg_map = pi->mps_bg_map; while (bg_map) { i = ffs(bg_map) - 1; mtx_lock(&sc->reg_lock); t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); mtx_unlock(&sc->reg_lock); tnl_cong_drops += v; bg_map &= ~(1 << i); } pi->tnl_cong_drops = tnl_cong_drops; getmicrotime(&pi->last_refreshed); } static void cxgbe_tick(void *arg) { struct port_info *pi = arg; struct adapter *sc = pi->adapter; PORT_LOCK_ASSERT_OWNED(pi); cxgbe_refresh_stats(sc, pi); callout_schedule(&pi->tick, hz); } void vi_tick(void *arg) { struct vi_info *vi = arg; struct adapter *sc = vi->pi->adapter; vi_refresh_stats(sc, vi); callout_schedule(&vi->tick, hz); } /* * Should match fw_caps_config_ enums in t4fw_interface.h */ static char *caps_decoder[] = { "\20\001IPMI\002NCSI", /* 0: NBM */ "\20\001PPP\002QFC\003DCBX", /* 1: link */ "\20\001INGRESS\002EGRESS", /* 2: switch */ "\20\001NIC\002VM\003IDS\004UM\005UM_ISGL" /* 3: NIC */ "\006HASHFILTER\007ETHOFLD", "\20\001TOE", /* 4: TOE */ "\20\001RDDP\002RDMAC", /* 5: RDMA */ "\20\001INITIATOR_PDU\002TARGET_PDU" /* 6: iSCSI */ "\003INITIATOR_CNXOFLD\004TARGET_CNXOFLD" "\005INITIATOR_SSNOFLD\006TARGET_SSNOFLD" "\007T10DIF" "\010INITIATOR_CMDOFLD\011TARGET_CMDOFLD", "\20\001LOOKASIDE\002TLSKEYS", /* 7: Crypto */ "\20\001INITIATOR\002TARGET\003CTRL_OFLD" /* 8: FCoE */ "\004PO_INITIATOR\005PO_TARGET", }; void t4_sysctls(struct adapter *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *c0; static char *doorbells = {"\20\1UDB\2WCWR\3UDBWC\4KDB"}; ctx = device_get_sysctl_ctx(sc->dev); /* * dev.t4nex.X. */ oid = device_get_sysctl_tree(sc->dev); c0 = children = SYSCTL_CHILDREN(oid); sc->sc_do_rxcopy = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "do_rx_copy", CTLFLAG_RW, &sc->sc_do_rxcopy, 1, "Do RX copy of small frames"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nports", CTLFLAG_RD, NULL, sc->params.nports, "# of ports"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "doorbells", CTLTYPE_STRING | CTLFLAG_RD, doorbells, (uintptr_t)&sc->doorbells, sysctl_bitfield_8b, "A", "available doorbells"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_clock", CTLFLAG_RD, NULL, sc->params.vpd.cclk, "core clock frequency (in KHz)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_timers", CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.timer_val, sizeof(sc->params.sge.timer_val), sysctl_int_array, "A", "interrupt holdoff timer values (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pkt_counts", CTLTYPE_STRING | CTLFLAG_RD, sc->params.sge.counter_val, sizeof(sc->params.sge.counter_val), sysctl_int_array, "A", "interrupt holdoff packet counter values"); t4_sge_sysctls(sc, ctx, children); sc->lro_timeout = 100; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "lro_timeout", CTLFLAG_RW, &sc->lro_timeout, 0, "lro inactive-flush timeout (in us)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dflags", CTLFLAG_RW, &sc->debug_flags, 0, "flags to enable runtime debugging"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "tp_version", CTLFLAG_RD, sc->tp_version, 0, "TP microcode version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version, 0, "firmware version"); if (sc->flags & IS_VF) return; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "hw_revision", CTLFLAG_RD, NULL, chip_rev(sc), "chip hardware revision"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "sn", CTLFLAG_RD, sc->params.vpd.sn, 0, "serial number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "pn", CTLFLAG_RD, sc->params.vpd.pn, 0, "part number"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "ec", CTLFLAG_RD, sc->params.vpd.ec, 0, "engineering change"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "md_version", CTLFLAG_RD, sc->params.vpd.md, 0, "manufacturing diags version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "na", CTLFLAG_RD, sc->params.vpd.na, 0, "network address"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "er_version", CTLFLAG_RD, sc->er_version, 0, "expansion ROM version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "bs_version", CTLFLAG_RD, sc->bs_version, 0, "bootstrap firmware version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "scfg_version", CTLFLAG_RD, NULL, sc->params.scfg_vers, "serial config version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "vpd_version", CTLFLAG_RD, NULL, sc->params.vpd_vers, "VPD version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "cf", CTLFLAG_RD, sc->cfg_file, 0, "configuration file"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cfcsum", CTLFLAG_RD, NULL, sc->cfcsum, "config file checksum"); #define SYSCTL_CAP(name, n, text) \ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, #name, \ CTLTYPE_STRING | CTLFLAG_RD, caps_decoder[n], (uintptr_t)&sc->name, \ sysctl_bitfield_16b, "A", "available " text " capabilities") SYSCTL_CAP(nbmcaps, 0, "NBM"); SYSCTL_CAP(linkcaps, 1, "link"); SYSCTL_CAP(switchcaps, 2, "switch"); SYSCTL_CAP(niccaps, 3, "NIC"); SYSCTL_CAP(toecaps, 4, "TCP offload"); SYSCTL_CAP(rdmacaps, 5, "RDMA"); SYSCTL_CAP(iscsicaps, 6, "iSCSI"); SYSCTL_CAP(cryptocaps, 7, "crypto"); SYSCTL_CAP(fcoecaps, 8, "FCoE"); #undef SYSCTL_CAP SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nfilters", CTLFLAG_RD, NULL, sc->tids.nftids, "number of filters"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, sc, 0, sysctl_temperature, "I", "chip temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "loadavg", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_loadavg, "A", "microprocessor load averages (debug firmwares only)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "core_vdd", CTLFLAG_RD, &sc->params.core_vdd, 0, "core Vdd (in mV)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "local_cpus", CTLTYPE_STRING | CTLFLAG_RD, sc, LOCAL_CPUS, sysctl_cpus, "A", "local CPUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_cpus", CTLTYPE_STRING | CTLFLAG_RD, sc, INTR_CPUS, sysctl_cpus, "A", "preferred CPUs for interrupts"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "swintr", CTLFLAG_RW, &sc->swintr, 0, "software triggered interrupts"); /* * dev.t4nex.X.misc. Marked CTLFLAG_SKIP to avoid information overload. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "misc", CTLFLAG_RD | CTLFLAG_SKIP, NULL, "logs and miscellaneous information"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cctrl", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cctrl, "A", "congestion control"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp0", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_ibq_obq, "A", "CIM IBQ 0 (TP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_tp1", CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_cim_ibq_obq, "A", "CIM IBQ 1 (TP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ulp", CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_cim_ibq_obq, "A", "CIM IBQ 2 (ULP)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge0", CTLTYPE_STRING | CTLFLAG_RD, sc, 3, sysctl_cim_ibq_obq, "A", "CIM IBQ 3 (SGE0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_sge1", CTLTYPE_STRING | CTLFLAG_RD, sc, 4, sysctl_cim_ibq_obq, "A", "CIM IBQ 4 (SGE1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ibq_ncsi", CTLTYPE_STRING | CTLFLAG_RD, sc, 5, sysctl_cim_ibq_obq, "A", "CIM IBQ 5 (NCSI)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_la, "A", "CIM logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_ma_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_ma_la, "A", "CIM MA logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp0", CTLTYPE_STRING | CTLFLAG_RD, sc, 0 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 0 (ULP0)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp1", CTLTYPE_STRING | CTLFLAG_RD, sc, 1 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 1 (ULP1)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp2", CTLTYPE_STRING | CTLFLAG_RD, sc, 2 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 2 (ULP2)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ulp3", CTLTYPE_STRING | CTLFLAG_RD, sc, 3 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 3 (ULP3)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge", CTLTYPE_STRING | CTLFLAG_RD, sc, 4 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 4 (SGE)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_ncsi", CTLTYPE_STRING | CTLFLAG_RD, sc, 5 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 5 (NCSI)"); if (chip_id(sc) > CHELSIO_T4) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge0_rx", CTLTYPE_STRING | CTLFLAG_RD, sc, 6 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 6 (SGE0-RX)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_obq_sge1_rx", CTLTYPE_STRING | CTLFLAG_RD, sc, 7 + CIM_NUM_IBQ, sysctl_cim_ibq_obq, "A", "CIM OBQ 7 (SGE1-RX)"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_pif_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_pif_la, "A", "CIM PIF logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cim_qcfg", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cim_qcfg, "A", "CIM queue configuration"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cpl_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_cpl_stats, "A", "CPL statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ddp_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_ddp_stats, "A", "non-TCP DDP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "devlog", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_devlog, "A", "firmware's device log"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fcoe_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_fcoe_stats, "A", "FCoE statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "hw_sched", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_hw_sched, "A", "hardware scheduler "); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "l2t", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_l2t, "A", "hardware L2 table"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "smt", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_smt, "A", "hardware source MAC table"); #ifdef INET6 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "clip", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_clip, "A", "active CLIP table entries"); #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "lb_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_lb_stats, "A", "loopback statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "meminfo", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_meminfo, "A", "memory regions"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "mps_tcam", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, chip_id(sc) <= CHELSIO_T5 ? sysctl_mps_tcam : sysctl_mps_tcam_t6, "A", "MPS TCAM entries"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "path_mtus", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_path_mtus, "A", "path MTUs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pm_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_pm_stats, "A", "PM statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_rdma_stats, "A", "RDMA statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tcp_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tcp_stats, "A", "TCP statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tids", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tids, "A", "TID information"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_err_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_err_stats, "A", "TP error statistics"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la_mask", CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tp_la_mask, "I", "TP logic analyzer event capture mask"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tp_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_la, "A", "TP logic analyzer"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tx_rate", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tx_rate, "A", "Tx rate"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "ulprx_la", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_ulprx_la, "A", "ULPRX logic analyzer"); if (chip_id(sc) >= CHELSIO_T5) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "wcwr_stats", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_wcwr_stats, "A", "write combined work requests"); } #ifdef TCP_OFFLOAD if (is_offload(sc)) { int i; char s[4]; /* * dev.t4nex.X.toe. */ oid = SYSCTL_ADD_NODE(ctx, c0, OID_AUTO, "toe", CTLFLAG_RD, NULL, "TOE parameters"); children = SYSCTL_CHILDREN(oid); sc->tt.cong_algorithm = -1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_algorithm", CTLFLAG_RW, &sc->tt.cong_algorithm, 0, "congestion control " "(-1 = default, 0 = reno, 1 = tahoe, 2 = newreno, " "3 = highspeed)"); sc->tt.sndbuf = 256 * 1024; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sndbuf", CTLFLAG_RW, &sc->tt.sndbuf, 0, "max hardware send buffer size"); sc->tt.ddp = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ddp", CTLFLAG_RW, &sc->tt.ddp, 0, "DDP allowed"); sc->tt.rx_coalesce = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_coalesce", CTLFLAG_RW, &sc->tt.rx_coalesce, 0, "receive coalescing"); sc->tt.tls = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tls", CTLFLAG_RW, &sc->tt.tls, 0, "Inline TLS allowed"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "tls_rx_ports", CTLTYPE_INT | CTLFLAG_RW, sc, 0, sysctl_tls_rx_ports, "I", "TCP ports that use inline TLS+TOE RX"); sc->tt.tx_align = 1; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_align", CTLFLAG_RW, &sc->tt.tx_align, 0, "chop and align payload"); sc->tt.tx_zcopy = 0; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_zcopy", CTLFLAG_RW, &sc->tt.tx_zcopy, 0, "Enable zero-copy aio_write(2)"); sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cop_managed_offloading", CTLFLAG_RW, &sc->tt.cop_managed_offloading, 0, "COP (Connection Offload Policy) controls all TOE offload"); sc->tt.autorcvbuf_inc = 16 * 1024; SYSCTL_ADD_INT(ctx, children, OID_AUTO, "autorcvbuf_inc", CTLFLAG_RW, &sc->tt.autorcvbuf_inc, 0, "autorcvbuf increment"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A", "TP timer tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timestamp_tick", CTLTYPE_STRING | CTLFLAG_RD, sc, 1, sysctl_tp_tick, "A", "TCP timestamp tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_tick", CTLTYPE_STRING | CTLFLAG_RD, sc, 2, sysctl_tp_tick, "A", "DACK tick (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dack_timer", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, sysctl_tp_dack_timer, "IU", "DACK timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_min", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MIN, sysctl_tp_timer, "LU", "Minimum retransmit interval (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_max", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_RXT_MAX, sysctl_tp_timer, "LU", "Maximum retransmit interval (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_min", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MIN, sysctl_tp_timer, "LU", "Persist timer min (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "persist_max", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_PERS_MAX, sysctl_tp_timer, "LU", "Persist timer max (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_idle", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_IDLE, sysctl_tp_timer, "LU", "Keepalive idle timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_interval", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_KEEP_INTVL, sysctl_tp_timer, "LU", "Keepalive interval timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "initial_srtt", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_INIT_SRTT, sysctl_tp_timer, "LU", "Initial SRTT (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "finwait2_timer", CTLTYPE_ULONG | CTLFLAG_RD, sc, A_TP_FINWAIT2_TIMER, sysctl_tp_timer, "LU", "FINWAIT2 timer (us)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "syn_rexmt_count", CTLTYPE_UINT | CTLFLAG_RD, sc, S_SYNSHIFTMAX, sysctl_tp_shift_cnt, "IU", "Number of SYN retransmissions before abort"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rexmt_count", CTLTYPE_UINT | CTLFLAG_RD, sc, S_RXTSHIFTMAXR2, sysctl_tp_shift_cnt, "IU", "Number of retransmissions before abort"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "keepalive_count", CTLTYPE_UINT | CTLFLAG_RD, sc, S_KEEPALIVEMAXR2, sysctl_tp_shift_cnt, "IU", "Number of keepalive probes before abort"); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "rexmt_backoff", CTLFLAG_RD, NULL, "TOE retransmit backoffs"); children = SYSCTL_CHILDREN(oid); for (i = 0; i < 16; i++) { snprintf(s, sizeof(s), "%u", i); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, s, CTLTYPE_UINT | CTLFLAG_RD, sc, i, sysctl_tp_backoff, "IU", "TOE retransmit backoff"); } } #endif } void vi_sysctls(struct vi_info *vi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children; ctx = device_get_sysctl_ctx(vi->dev); /* * dev.v?(cxgbe|cxl).X. */ oid = device_get_sysctl_tree(vi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "viid", CTLFLAG_RD, NULL, vi->viid, "VI identifer"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nrxq", CTLFLAG_RD, &vi->nrxq, 0, "# of rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "ntxq", CTLFLAG_RD, &vi->ntxq, 0, "# of tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_rxq", CTLFLAG_RD, &vi->first_rxq, 0, "index of first rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_txq", CTLFLAG_RD, &vi->first_txq, 0, "index of first tx queue"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_base", CTLFLAG_RD, NULL, vi->rss_base, "start of RSS indirection table"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rss_size", CTLFLAG_RD, NULL, vi->rss_size, "size of RSS indirection table"); if (IS_MAIN_VI(vi)) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rsrv_noflowq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_noflowq, "IU", "Reserve queue 0 for non-flowid packets"); } #ifdef TCP_OFFLOAD if (vi->nofldrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldrxq", CTLFLAG_RD, &vi->nofldrxq, 0, "# of rx queues for offloaded TCP connections"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_rxq", CTLFLAG_RD, &vi->first_ofld_rxq, 0, "index of first TOE rx queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx_ofld", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_tmr_idx_ofld, "I", "holdoff timer index for TOE queues"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx_ofld", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_pktc_idx_ofld, "I", "holdoff packet counter index for TOE queues"); } #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) if (vi->nofldtxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nofldtxq", CTLFLAG_RD, &vi->nofldtxq, 0, "# of tx queues for TOE/ETHOFLD"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_ofld_txq", CTLFLAG_RD, &vi->first_ofld_txq, 0, "index of first TOE/ETHOFLD tx queue"); } #endif #ifdef DEV_NETMAP if (vi->nnmrxq != 0) { SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmrxq", CTLFLAG_RD, &vi->nnmrxq, 0, "# of netmap rx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "nnmtxq", CTLFLAG_RD, &vi->nnmtxq, 0, "# of netmap tx queues"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_rxq", CTLFLAG_RD, &vi->first_nm_rxq, 0, "index of first netmap rx queue"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "first_nm_txq", CTLFLAG_RD, &vi->first_nm_txq, 0, "index of first netmap tx queue"); } #endif SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_tmr_idx", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_tmr_idx, "I", "holdoff timer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "holdoff_pktc_idx", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_holdoff_pktc_idx, "I", "holdoff packet counter index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_rxq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_rxq, "I", "rx queue size"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "qsize_txq", CTLTYPE_INT | CTLFLAG_RW, vi, 0, sysctl_qsize_txq, "I", "tx queue size"); } static void cxgbe_sysctls(struct port_info *pi) { struct sysctl_ctx_list *ctx; struct sysctl_oid *oid; struct sysctl_oid_list *children, *children2; struct adapter *sc = pi->adapter; int i; char name[16]; static char *tc_flags = {"\20\1USER\2SYNC\3ASYNC\4ERR"}; ctx = device_get_sysctl_ctx(pi->dev); /* * dev.cxgbe.X. */ oid = device_get_sysctl_tree(pi->dev); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "linkdnrc", CTLTYPE_STRING | CTLFLAG_RD, pi, 0, sysctl_linkdnrc, "A", "reason why link is down"); if (pi->port_type == FW_PORT_TYPE_BT_XAUI) { SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, pi, 0, sysctl_btphy, "I", "PHY temperature (in Celsius)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fw_version", CTLTYPE_INT | CTLFLAG_RD, pi, 1, sysctl_btphy, "I", "PHY firmware version"); } SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pause_settings", CTLTYPE_STRING | CTLFLAG_RW, pi, 0, sysctl_pause_settings, "A", "PAUSE settings (bit 0 = rx_pause, 1 = tx_pause, 2 = pause_autoneg)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "fec", CTLTYPE_STRING | CTLFLAG_RW, pi, 0, sysctl_fec, "A", "Forward Error Correction (bit 0 = RS, bit 1 = BASER_RS)"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "autoneg", CTLTYPE_INT | CTLFLAG_RW, pi, 0, sysctl_autoneg, "I", "autonegotiation (-1 = not supported)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "max_speed", CTLFLAG_RD, NULL, port_top_speed(pi), "max speed (in Gbps)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "mps_bg_map", CTLFLAG_RD, NULL, pi->mps_bg_map, "MPS buffer group map"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_e_chan_map", CTLFLAG_RD, NULL, pi->rx_e_chan_map, "TP rx e-channel map"); if (sc->flags & IS_VF) return; /* * dev.(cxgbe|cxl).X.tc. */ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "tc", CTLFLAG_RD, NULL, "Tx scheduler traffic classes (cl_rl)"); children2 = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "pktsize", CTLFLAG_RW, &pi->sched_params->pktsize, 0, "pktsize for per-flow cl-rl (0 means up to the driver )"); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "burstsize", CTLFLAG_RW, &pi->sched_params->burstsize, 0, "burstsize for per-flow cl-rl (0 means up to the driver)"); for (i = 0; i < sc->chip_params->nsched_cls; i++) { struct tx_cl_rl_params *tc = &pi->sched_params->cl_rl[i]; snprintf(name, sizeof(name), "%d", i); children2 = SYSCTL_CHILDREN(SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL, "traffic class")); SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "flags", CTLTYPE_STRING | CTLFLAG_RD, tc_flags, (uintptr_t)&tc->flags, sysctl_bitfield_8b, "A", "flags"); SYSCTL_ADD_UINT(ctx, children2, OID_AUTO, "refcount", CTLFLAG_RD, &tc->refcount, 0, "references to this class"); SYSCTL_ADD_PROC(ctx, children2, OID_AUTO, "params", CTLTYPE_STRING | CTLFLAG_RD, sc, (pi->port_id << 16) | i, sysctl_tc_params, "A", "traffic class parameters"); } /* * dev.cxgbe.X.stats. */ oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "stats", CTLFLAG_RD, NULL, "port statistics"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_parse_error", CTLFLAG_RD, &pi->tx_parse_error, 0, "# of tx packets with invalid length or # of segments"); #define SYSCTL_ADD_T4_REG64(pi, name, desc, reg) \ SYSCTL_ADD_OID(ctx, children, OID_AUTO, name, \ CTLTYPE_U64 | CTLFLAG_RD, sc, reg, \ sysctl_handle_t4_reg64, "QU", desc) SYSCTL_ADD_T4_REG64(pi, "tx_octets", "# of octets in good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BYTES_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames", "total # of good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_FRAMES_L)); SYSCTL_ADD_T4_REG64(pi, "tx_bcast_frames", "# of broadcast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_BCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_mcast_frames", "# of multicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_MCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ucast_frames", "# of unicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_UCAST_L)); SYSCTL_ADD_T4_REG64(pi, "tx_error_frames", "# of error frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_64", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_64B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_65_127", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_65B_127B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_128_255", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_128B_255B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_256_511", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_256B_511B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_512_1023", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_512B_1023B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_1024_1518", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1024B_1518B_L)); SYSCTL_ADD_T4_REG64(pi, "tx_frames_1519_max", "# of tx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_1519B_MAX_L)); SYSCTL_ADD_T4_REG64(pi, "tx_drop", "# of dropped tx frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_DROP_L)); SYSCTL_ADD_T4_REG64(pi, "tx_pause", "# of pause frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PAUSE_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp0", "# of PPP prio 0 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP0_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp1", "# of PPP prio 1 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP1_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp2", "# of PPP prio 2 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP2_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp3", "# of PPP prio 3 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP3_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp4", "# of PPP prio 4 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP4_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp5", "# of PPP prio 5 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP5_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp6", "# of PPP prio 6 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP6_L)); SYSCTL_ADD_T4_REG64(pi, "tx_ppp7", "# of PPP prio 7 frames transmitted", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_TX_PORT_PPP7_L)); SYSCTL_ADD_T4_REG64(pi, "rx_octets", "# of octets in good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BYTES_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames", "total # of good frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_FRAMES_L)); SYSCTL_ADD_T4_REG64(pi, "rx_bcast_frames", "# of broadcast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_BCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_mcast_frames", "# of multicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ucast_frames", "# of unicast frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_UCAST_L)); SYSCTL_ADD_T4_REG64(pi, "rx_too_long", "# of frames exceeding MTU", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_jabber", "# of jabber frames", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_MTU_CRC_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_fcs_err", "# of frames received with bad FCS", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_CRC_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_len_err", "# of frames received with length error", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LEN_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_symbol_err", "symbol errors", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_SYM_ERROR_L)); SYSCTL_ADD_T4_REG64(pi, "rx_runt", "# of short frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_LESS_64B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_64", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_64B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_65_127", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_65B_127B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_128_255", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_128B_255B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_256_511", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_256B_511B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_512_1023", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_512B_1023B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_1024_1518", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1024B_1518B_L)); SYSCTL_ADD_T4_REG64(pi, "rx_frames_1519_max", "# of rx frames in this range", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_1519B_MAX_L)); SYSCTL_ADD_T4_REG64(pi, "rx_pause", "# of pause frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PAUSE_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp0", "# of PPP prio 0 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP0_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp1", "# of PPP prio 1 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP1_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp2", "# of PPP prio 2 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP2_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp3", "# of PPP prio 3 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP3_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp4", "# of PPP prio 4 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP4_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp5", "# of PPP prio 5 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP5_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp6", "# of PPP prio 6 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP6_L)); SYSCTL_ADD_T4_REG64(pi, "rx_ppp7", "# of PPP prio 7 frames received", PORT_REG(pi->tx_chan, A_MPS_PORT_STAT_RX_PORT_PPP7_L)); #undef SYSCTL_ADD_T4_REG64 #define SYSCTL_ADD_T4_PORTSTAT(name, desc) \ SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, #name, CTLFLAG_RD, \ &pi->stats.name, desc) /* We get these from port_stats and they may be stale by up to 1s */ SYSCTL_ADD_T4_PORTSTAT(rx_ovflow0, "# drops due to buffer-group 0 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow1, "# drops due to buffer-group 1 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow2, "# drops due to buffer-group 2 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_ovflow3, "# drops due to buffer-group 3 overflows"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc0, "# of buffer-group 0 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc1, "# of buffer-group 1 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc2, "# of buffer-group 2 truncated packets"); SYSCTL_ADD_T4_PORTSTAT(rx_trunc3, "# of buffer-group 3 truncated packets"); #undef SYSCTL_ADD_T4_PORTSTAT SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tx_tls_records", CTLFLAG_RD, &pi->tx_tls_records, "# of TLS records transmitted"); SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tx_tls_octets", CTLFLAG_RD, &pi->tx_tls_octets, "# of payload octets in transmitted TLS records"); SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_tls_records", CTLFLAG_RD, &pi->rx_tls_records, "# of TLS records received"); SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "rx_tls_octets", CTLFLAG_RD, &pi->rx_tls_octets, "# of payload octets in received TLS records"); } static int sysctl_int_array(SYSCTL_HANDLER_ARGS) { int rc, *i, space = 0; struct sbuf sb; sbuf_new_for_sysctl(&sb, NULL, 64, req); for (i = arg1; arg2; arg2 -= sizeof(int), i++) { if (space) sbuf_printf(&sb, " "); sbuf_printf(&sb, "%d", *i); space = 1; } rc = sbuf_finish(&sb); sbuf_delete(&sb); return (rc); } static int sysctl_bitfield_8b(SYSCTL_HANDLER_ARGS) { int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", *(uint8_t *)(uintptr_t)arg2, (char *)arg1); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_bitfield_16b(SYSCTL_HANDLER_ARGS) { int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "%b", *(uint16_t *)(uintptr_t)arg2, (char *)arg1); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_btphy(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; int op = arg2; struct adapter *sc = pi->adapter; u_int v; int rc; rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4btt"); if (rc) return (rc); /* XXX: magic numbers */ rc = -t4_mdio_rd(sc, sc->mbox, pi->mdio_addr, 0x1e, op ? 0x20 : 0xc820, &v); end_synchronized_op(sc, 0); if (rc) return (rc); if (op == 0) v /= 256; rc = sysctl_handle_int(oidp, &v, 0, req); return (rc); } static int sysctl_noflowq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; int rc, val; val = vi->rsrv_noflowq; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if ((val >= 1) && (vi->ntxq > 1)) vi->rsrv_noflowq = 1; else vi->rsrv_noflowq = 0; return (rc); } static int sysctl_holdoff_tmr_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int idx, rc, i; struct sge_rxq *rxq; uint8_t v; idx = vi->tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4tmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->pktc_idx != -1); for_each_rxq(vi, i, rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&rxq->iq.intr_params, v); #else rxq->iq.intr_params = v; #endif } vi->tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int idx, rc; idx = vi->pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4pktc"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_rxq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int qsize, rc; qsize = vi->qsize_rxq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || (qsize & 7)) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4rxqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_rxq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_qsize_txq(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int qsize, rc; qsize = vi->qsize_txq; rc = sysctl_handle_int(oidp, &qsize, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (qsize < 128 || qsize > 65536) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4txqs"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->qsize_txq = qsize; end_synchronized_op(sc, LOCK_HELD); return (rc); } static int sysctl_pause_settings(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1RX\2TX\3AUTO"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); if (lc->link_ok) { sbuf_printf(sb, "%b", (lc->fc & (PAUSE_TX | PAUSE_RX)) | (lc->requested_fc & PAUSE_AUTONEG), bits); } else { sbuf_printf(sb, "%b", lc->requested_fc & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG), bits); } rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[2]; int n; s[0] = '0' + (lc->requested_fc & (PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)); s[1] = 0; rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); if (s[1] != 0) return (EINVAL); if (s[0] < '0' || s[0] > '9') return (EINVAL); /* not a number */ n = s[0] - '0'; if (n & ~(PAUSE_TX | PAUSE_RX | PAUSE_AUTONEG)) return (EINVAL); /* some other bit is set too */ rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4PAUSE"); if (rc) return (rc); PORT_LOCK(pi); lc->requested_fc = n; fixup_link_config(pi); if (pi->up_vis > 0) rc = apply_link_config(pi); set_current_media(pi); PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } return (rc); } static int sysctl_fec(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc; int8_t old; if (req->newptr == NULL) { struct sbuf *sb; static char *bits = "\20\1RS\2BASE-R\3RSVD1\4RSVD2\5RSVD3\6AUTO"; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); /* * Display the requested_fec when the link is down -- the actual * FEC makes sense only when the link is up. */ if (lc->link_ok) { sbuf_printf(sb, "%b", (lc->fec & M_FW_PORT_CAP32_FEC) | (lc->requested_fec & FEC_AUTO), bits); } else { sbuf_printf(sb, "%b", lc->requested_fec, bits); } rc = sbuf_finish(sb); sbuf_delete(sb); } else { char s[3]; int n; snprintf(s, sizeof(s), "%d", lc->requested_fec == FEC_AUTO ? -1 : lc->requested_fec & M_FW_PORT_CAP32_FEC); rc = sysctl_handle_string(oidp, s, sizeof(s), req); if (rc != 0) return(rc); n = strtol(&s[0], NULL, 0); if (n < 0 || n & FEC_AUTO) n = FEC_AUTO; else { if (n & ~M_FW_PORT_CAP32_FEC) return (EINVAL);/* some other bit is set too */ if (!powerof2(n)) return (EINVAL);/* one bit can be set at most */ } rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4fec"); if (rc) return (rc); PORT_LOCK(pi); old = lc->requested_fec; if (n == FEC_AUTO) lc->requested_fec = FEC_AUTO; else if (n == 0) lc->requested_fec = FEC_NONE; else { if ((lc->supported | V_FW_PORT_CAP32_FEC(n)) != lc->supported) { rc = ENOTSUP; goto done; } lc->requested_fec = n; } fixup_link_config(pi); if (pi->up_vis > 0) { rc = apply_link_config(pi); if (rc != 0) { lc->requested_fec = old; if (rc == FW_EPROTO) rc = ENOTSUP; } } done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); } return (rc); } static int sysctl_autoneg(SYSCTL_HANDLER_ARGS) { struct port_info *pi = arg1; struct adapter *sc = pi->adapter; struct link_config *lc = &pi->link_cfg; int rc, val; if (lc->supported & FW_PORT_CAP32_ANEG) val = lc->requested_aneg == AUTONEG_DISABLE ? 0 : 1; else val = -1; rc = sysctl_handle_int(oidp, &val, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (val == 0) val = AUTONEG_DISABLE; else if (val == 1) val = AUTONEG_ENABLE; else val = AUTONEG_AUTO; rc = begin_synchronized_op(sc, &pi->vi[0], SLEEP_OK | INTR_OK, "t4aneg"); if (rc) return (rc); PORT_LOCK(pi); if (val == AUTONEG_ENABLE && !(lc->supported & FW_PORT_CAP32_ANEG)) { rc = ENOTSUP; goto done; } lc->requested_aneg = val; fixup_link_config(pi); if (pi->up_vis > 0) rc = apply_link_config(pi); set_current_media(pi); done: PORT_UNLOCK(pi); end_synchronized_op(sc, 0); return (rc); } static int sysctl_handle_t4_reg64(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int reg = arg2; uint64_t val; val = t4_read_reg64(sc, reg); return (sysctl_handle_64(oidp, &val, 0, req)); } static int sysctl_temperature(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc, t; uint32_t param, val; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4temp"); if (rc) return (rc); param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_DIAG) | V_FW_PARAMS_PARAM_Y(FW_PARAM_DEV_DIAG_TMP); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); end_synchronized_op(sc, 0); if (rc) return (rc); /* unknown is returned as 0 but we display -1 in that case */ t = val == 0 ? -1 : val; rc = sysctl_handle_int(oidp, &t, 0, req); return (rc); } static int sysctl_loadavg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint32_t param, val; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4lavg"); if (rc) return (rc); param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DEV) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DEV_LOAD); rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); end_synchronized_op(sc, 0); if (rc) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); if (val == 0xffffffff) { /* Only debug and custom firmwares report load averages. */ sbuf_printf(sb, "not available"); } else { sbuf_printf(sb, "%d %d %d", val & 0xff, (val >> 8) & 0xff, (val >> 16) & 0xff); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cctrl(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t incr[NMTUS][NCCTRL_WIN]; static const char *dec_fac[] = { "0.5", "0.5625", "0.625", "0.6875", "0.75", "0.8125", "0.875", "0.9375" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); t4_read_cong_tbl(sc, incr); for (i = 0; i < NCCTRL_WIN; ++i) { sbuf_printf(sb, "%2d: %4u %4u %4u %4u %4u %4u %4u %4u\n", i, incr[0][i], incr[1][i], incr[2][i], incr[3][i], incr[4][i], incr[5][i], incr[6][i], incr[7][i]); sbuf_printf(sb, "%8u %4u %4u %4u %4u %4u %4u %4u %5u %s\n", incr[8][i], incr[9][i], incr[10][i], incr[11][i], incr[12][i], incr[13][i], incr[14][i], incr[15][i], sc->params.a_wnd[i], dec_fac[sc->params.b_wnd[i]]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static const char *qname[CIM_NUM_IBQ + CIM_NUM_OBQ_T5] = { "TP0", "TP1", "ULP", "SGE0", "SGE1", "NC-SI", /* ibq's */ "ULP0", "ULP1", "ULP2", "ULP3", "SGE", "NC-SI", /* obq's */ "SGE0-RX", "SGE1-RX" /* additional obq's (T5 onwards) */ }; static int sysctl_cim_ibq_obq(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n, qid = arg2; uint32_t *buf, *p; char *qtype; u_int cim_num_obq = sc->chip_params->cim_num_obq; KASSERT(qid >= 0 && qid < CIM_NUM_IBQ + cim_num_obq, ("%s: bad qid %d\n", __func__, qid)); if (qid < CIM_NUM_IBQ) { /* inbound queue */ qtype = "IBQ"; n = 4 * CIM_IBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_ibq(sc, qid, buf, n); } else { /* outbound queue */ qtype = "OBQ"; qid -= CIM_NUM_IBQ; n = 4 * cim_num_obq * CIM_OBQ_SIZE; buf = malloc(n * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); rc = t4_read_cim_obq(sc, qid, buf, n); } if (rc < 0) { rc = -rc; goto done; } n = rc * sizeof(uint32_t); /* rc has # of words actually read */ rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) goto done; sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) { rc = ENOMEM; goto done; } sbuf_printf(sb, "%s%d %s", qtype , qid, qname[arg2]); for (i = 0, p = buf; i < n; i += 16, p += 4) sbuf_printf(sb, "\n%#06x: %08x %08x %08x %08x", i, p[0], p[1], p[2], p[3]); rc = sbuf_finish(sb); sbuf_delete(sb); done: free(buf, M_CXGBE); return (rc); } static void sbuf_cim_la4(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg) { uint32_t *p; sbuf_printf(sb, "Status Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? "" : " LS0Stat LS0Addr LS0Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 8]; p += 8) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x", p[5] & 0xff, p[6], p[7]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x", (p[3] >> 8) & 0xff, p[3] & 0xff, p[4] >> 8, p[4] & 0xff, p[5] >> 8); sbuf_printf(sb, "\n %02x %x%07x %x%07x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4); } else { sbuf_printf(sb, "\n %02x %x%07x %x%07x %08x %08x " "%08x%08x%08x%08x", (p[0] >> 4) & 0xff, p[0] & 0xf, p[1] >> 4, p[1] & 0xf, p[2] >> 4, p[2] & 0xf, p[3], p[4], p[5], p[6], p[7]); } } } static void sbuf_cim_la6(struct adapter *sc, struct sbuf *sb, uint32_t *buf, uint32_t cfg) { uint32_t *p; sbuf_printf(sb, "Status Inst Data PC%s", cfg & F_UPDBGLACAPTPCONLY ? "" : " LS0Stat LS0Addr LS0Data LS1Stat LS1Addr LS1Data"); for (p = buf; p <= &buf[sc->params.cim_la_size - 10]; p += 10) { if (cfg & F_UPDBGLACAPTPCONLY) { sbuf_printf(sb, "\n %02x %08x %08x %08x", p[3] & 0xff, p[2], p[1], p[0]); sbuf_printf(sb, "\n %02x %02x%06x %02x%06x %02x%06x", (p[6] >> 8) & 0xff, p[6] & 0xff, p[5] >> 8, p[5] & 0xff, p[4] >> 8, p[4] & 0xff, p[3] >> 8); sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16); } else { sbuf_printf(sb, "\n %02x %04x%04x %04x%04x %04x%04x " "%08x %08x %08x %08x %08x %08x", (p[9] >> 16) & 0xff, p[9] & 0xffff, p[8] >> 16, p[8] & 0xffff, p[7] >> 16, p[7] & 0xffff, p[6] >> 16, p[2], p[1], p[0], p[5], p[4], p[3]); } } } static int sbuf_cim_la(struct adapter *sc, struct sbuf *sb, int flags) { uint32_t cfg, *buf; int rc; rc = -t4_cim_read(sc, A_UP_UP_DBG_LA_CFG, 1, &cfg); if (rc != 0) return (rc); MPASS(flags == M_WAITOK || flags == M_NOWAIT); buf = malloc(sc->params.cim_la_size * sizeof(uint32_t), M_CXGBE, M_ZERO | flags); if (buf == NULL) return (ENOMEM); rc = -t4_cim_read_la(sc, buf, NULL); if (rc != 0) goto done; if (chip_id(sc) < CHELSIO_T6) sbuf_cim_la4(sc, sb, buf, cfg); else sbuf_cim_la6(sc, sb, buf, cfg); done: free(buf, M_CXGBE); return (rc); } static int sysctl_cim_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); rc = sbuf_cim_la(sc, sb, M_WAITOK); if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } bool t4_os_dump_cimla(struct adapter *sc, int arg, bool verbose) { struct sbuf sb; int rc; if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb) return (false); rc = sbuf_cim_la(sc, &sb, M_NOWAIT); if (rc == 0) { rc = sbuf_finish(&sb); if (rc == 0) { log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s", device_get_nameunit(sc->dev), sbuf_data(&sb)); } } sbuf_delete(&sb); return (false); } static int sysctl_cim_ma_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_MALA_SIZE * 5 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_ma_la(sc, buf, buf + 5 * CIM_MALA_SIZE); p = buf; for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%02x%08x%08x%08x%08x", p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCnt ID Tag UE Data RDY VLD"); for (i = 0; i < CIM_MALA_SIZE; i++, p += 5) { sbuf_printf(sb, "\n%3u %2u %x %u %08x%08x %u %u", (p[2] >> 10) & 0xff, (p[2] >> 7) & 7, (p[2] >> 3) & 0xf, (p[2] >> 2) & 1, (p[1] >> 2) | ((p[2] & 3) << 30), (p[0] >> 2) | ((p[1] & 3) << 30), (p[0] >> 1) & 1, p[0] & 1); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_pif_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int i; struct sbuf *sb; uint32_t *buf, *p; int rc; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(2 * CIM_PIFLA_SIZE * 6 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_cim_read_pif_la(sc, buf, buf + 6 * CIM_PIFLA_SIZE, NULL, NULL); p = buf; sbuf_printf(sb, "Cntl ID DataBE Addr Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %04x %08x %08x%08x%08x%08x", (p[5] >> 22) & 0xff, (p[5] >> 16) & 0x3f, p[5] & 0xffff, p[4], p[3], p[2], p[1], p[0]); } sbuf_printf(sb, "\n\nCntl ID Data"); for (i = 0; i < CIM_PIFLA_SIZE; i++, p += 6) { sbuf_printf(sb, "\n %02x %02x %08x%08x%08x%08x", (p[4] >> 6) & 0xff, p[4] & 0x3f, p[3], p[2], p[1], p[0]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_cim_qcfg(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint16_t base[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t size[CIM_NUM_IBQ + CIM_NUM_OBQ_T5]; uint16_t thres[CIM_NUM_IBQ]; uint32_t obq_wr[2 * CIM_NUM_OBQ_T5], *wr = obq_wr; uint32_t stat[4 * (CIM_NUM_IBQ + CIM_NUM_OBQ_T5)], *p = stat; u_int cim_num_obq, ibq_rdaddr, obq_rdaddr, nq; cim_num_obq = sc->chip_params->cim_num_obq; if (is_t4(sc)) { ibq_rdaddr = A_UP_IBQ_0_RDADDR; obq_rdaddr = A_UP_OBQ_0_REALADDR; } else { ibq_rdaddr = A_UP_IBQ_0_SHADOW_RDADDR; obq_rdaddr = A_UP_OBQ_0_SHADOW_REALADDR; } nq = CIM_NUM_IBQ + cim_num_obq; rc = -t4_cim_read(sc, ibq_rdaddr, 4 * nq, stat); if (rc == 0) rc = -t4_cim_read(sc, obq_rdaddr, 2 * cim_num_obq, obq_wr); if (rc != 0) return (rc); t4_read_cimq_cfg(sc, base, size, thres); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, PAGE_SIZE, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, " Queue Base Size Thres RdPtr WrPtr SOP EOP Avail"); for (i = 0; i < CIM_NUM_IBQ; i++, p += 4) sbuf_printf(sb, "\n%7s %5x %5u %5u %6x %4x %4u %4u %5u", qname[i], base[i], size[i], thres[i], G_IBQRDADDR(p[0]), G_IBQWRADDR(p[1]), G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); for ( ; i < nq; i++, p += 4, wr += 2) sbuf_printf(sb, "\n%7s %5x %5u %12x %4x %4u %4u %5u", qname[i], base[i], size[i], G_QUERDADDR(p[0]) & 0x3fff, wr[0] - base[i], G_QUESOPCNT(p[3]), G_QUEEOPCNT(p[3]), G_QUEREMFLITS(p[2]) * 16); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpl_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_cpl_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_cpl_stats(sc, &stats, 0); mtx_unlock(&sc->reg_lock); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\nCPL requests: %10u %10u %10u %10u", stats.req[0], stats.req[1], stats.req[2], stats.req[3]); sbuf_printf(sb, "\nCPL responses: %10u %10u %10u %10u", stats.rsp[0], stats.rsp[1], stats.rsp[2], stats.rsp[3]); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\nCPL requests: %10u %10u", stats.req[0], stats.req[1]); sbuf_printf(sb, "\nCPL responses: %10u %10u", stats.rsp[0], stats.rsp[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ddp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_usm_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_usm_stats(sc, &stats, 1); sbuf_printf(sb, "Frames: %u\n", stats.frames); sbuf_printf(sb, "Octets: %ju\n", stats.octets); sbuf_printf(sb, "Drops: %u", stats.drops); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static const char * const devlog_level_strings[] = { [FW_DEVLOG_LEVEL_EMERG] = "EMERG", [FW_DEVLOG_LEVEL_CRIT] = "CRIT", [FW_DEVLOG_LEVEL_ERR] = "ERR", [FW_DEVLOG_LEVEL_NOTICE] = "NOTICE", [FW_DEVLOG_LEVEL_INFO] = "INFO", [FW_DEVLOG_LEVEL_DEBUG] = "DEBUG" }; static const char * const devlog_facility_strings[] = { [FW_DEVLOG_FACILITY_CORE] = "CORE", [FW_DEVLOG_FACILITY_CF] = "CF", [FW_DEVLOG_FACILITY_SCHED] = "SCHED", [FW_DEVLOG_FACILITY_TIMER] = "TIMER", [FW_DEVLOG_FACILITY_RES] = "RES", [FW_DEVLOG_FACILITY_HW] = "HW", [FW_DEVLOG_FACILITY_FLR] = "FLR", [FW_DEVLOG_FACILITY_DMAQ] = "DMAQ", [FW_DEVLOG_FACILITY_PHY] = "PHY", [FW_DEVLOG_FACILITY_MAC] = "MAC", [FW_DEVLOG_FACILITY_PORT] = "PORT", [FW_DEVLOG_FACILITY_VI] = "VI", [FW_DEVLOG_FACILITY_FILTER] = "FILTER", [FW_DEVLOG_FACILITY_ACL] = "ACL", [FW_DEVLOG_FACILITY_TM] = "TM", [FW_DEVLOG_FACILITY_QFC] = "QFC", [FW_DEVLOG_FACILITY_DCB] = "DCB", [FW_DEVLOG_FACILITY_ETH] = "ETH", [FW_DEVLOG_FACILITY_OFLD] = "OFLD", [FW_DEVLOG_FACILITY_RI] = "RI", [FW_DEVLOG_FACILITY_ISCSI] = "ISCSI", [FW_DEVLOG_FACILITY_FCOE] = "FCOE", [FW_DEVLOG_FACILITY_FOISCSI] = "FOISCSI", [FW_DEVLOG_FACILITY_FOFCOE] = "FOFCOE", [FW_DEVLOG_FACILITY_CHNET] = "CHNET", }; static int sbuf_devlog(struct adapter *sc, struct sbuf *sb, int flags) { int i, j, rc, nentries, first = 0; struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e *buf, *e; uint64_t ftstamp = UINT64_MAX; if (dparams->addr == 0) return (ENXIO); MPASS(flags == M_WAITOK || flags == M_NOWAIT); buf = malloc(dparams->size, M_CXGBE, M_ZERO | flags); if (buf == NULL) return (ENOMEM); rc = read_via_memwin(sc, 1, dparams->addr, (void *)buf, dparams->size); if (rc != 0) goto done; nentries = dparams->size / sizeof(struct fw_devlog_e); for (i = 0; i < nentries; i++) { e = &buf[i]; if (e->timestamp == 0) break; /* end */ e->timestamp = be64toh(e->timestamp); e->seqno = be32toh(e->seqno); for (j = 0; j < 8; j++) e->params[j] = be32toh(e->params[j]); if (e->timestamp < ftstamp) { ftstamp = e->timestamp; first = i; } } if (buf[first].timestamp == 0) goto done; /* nothing in the log */ sbuf_printf(sb, "%10s %15s %8s %8s %s\n", "Seq#", "Tstamp", "Level", "Facility", "Message"); i = first; do { e = &buf[i]; if (e->timestamp == 0) break; /* end */ sbuf_printf(sb, "%10d %15ju %8s %8s ", e->seqno, e->timestamp, (e->level < nitems(devlog_level_strings) ? devlog_level_strings[e->level] : "UNKNOWN"), (e->facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e->facility] : "UNKNOWN")); sbuf_printf(sb, e->fmt, e->params[0], e->params[1], e->params[2], e->params[3], e->params[4], e->params[5], e->params[6], e->params[7]); if (++i == nentries) i = 0; } while (i != first); done: free(buf, M_CXGBE); return (rc); } static int sysctl_devlog(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int rc; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); rc = sbuf_devlog(sc, sb, M_WAITOK); if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } void t4_os_dump_devlog(struct adapter *sc) { int rc; struct sbuf sb; if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb) return; rc = sbuf_devlog(sc, &sb, M_NOWAIT); if (rc == 0) { rc = sbuf_finish(&sb); if (rc == 0) { log(LOG_DEBUG, "%s: device log follows.\n%s", device_get_nameunit(sc->dev), sbuf_data(&sb)); } } sbuf_delete(&sb); } static int sysctl_fcoe_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_fcoe_stats stats[MAX_NCHAN]; int i, nchan = sc->chip_params->nchan; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nchan; i++) t4_get_fcoe_stats(sc, i, &stats[i], 1); if (nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp, stats[2].octets_ddp, stats[3].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp, stats[2].frames_ddp, stats[3].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u %16u %16u", stats[0].frames_drop, stats[1].frames_drop, stats[2].frames_drop, stats[3].frames_drop); } else { sbuf_printf(sb, " channel 0 channel 1"); sbuf_printf(sb, "\noctetsDDP: %16ju %16ju", stats[0].octets_ddp, stats[1].octets_ddp); sbuf_printf(sb, "\nframesDDP: %16u %16u", stats[0].frames_ddp, stats[1].frames_ddp); sbuf_printf(sb, "\nframesDrop: %16u %16u", stats[0].frames_drop, stats[1].frames_drop); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_hw_sched(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; unsigned int map, kbps, ipg, mode; unsigned int pace_tab[NTX_SCHED]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); map = t4_read_reg(sc, A_TP_TX_MOD_QUEUE_REQ_MAP); mode = G_TIMERMODE(t4_read_reg(sc, A_TP_MOD_CONFIG)); t4_read_pace_tbl(sc, pace_tab); sbuf_printf(sb, "Scheduler Mode Channel Rate (Kbps) " "Class IPG (0.1 ns) Flow IPG (us)"); for (i = 0; i < NTX_SCHED; ++i, map >>= 2) { t4_get_tx_sched(sc, i, &kbps, &ipg, 1); sbuf_printf(sb, "\n %u %-5s %u ", i, (mode & (1 << i)) ? "flow" : "class", map & 3); if (kbps) sbuf_printf(sb, "%9u ", kbps); else sbuf_printf(sb, " disabled "); if (ipg) sbuf_printf(sb, "%13u ", ipg); else sbuf_printf(sb, " disabled "); if (pace_tab[i]) sbuf_printf(sb, "%10u", pace_tab[i]); else sbuf_printf(sb, " disabled"); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_lb_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, j; uint64_t *p0, *p1; struct lb_port_stats s[2]; static const char *stat_name[] = { "OctetsOK:", "FramesOK:", "BcastFrames:", "McastFrames:", "UcastFrames:", "ErrorFrames:", "Frames64:", "Frames65To127:", "Frames128To255:", "Frames256To511:", "Frames512To1023:", "Frames1024To1518:", "Frames1519ToMax:", "FramesDropped:", "BG0FramesDropped:", "BG1FramesDropped:", "BG2FramesDropped:", "BG3FramesDropped:", "BG0FramesTrunc:", "BG1FramesTrunc:", "BG2FramesTrunc:", "BG3FramesTrunc:" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); memset(s, 0, sizeof(s)); for (i = 0; i < sc->chip_params->nchan; i += 2) { t4_get_lb_stats(sc, i, &s[0]); t4_get_lb_stats(sc, i + 1, &s[1]); p0 = &s[0].octets; p1 = &s[1].octets; sbuf_printf(sb, "%s Loopback %u" " Loopback %u", i == 0 ? "" : "\n", i, i + 1); for (j = 0; j < nitems(stat_name); j++) sbuf_printf(sb, "\n%-17s %20ju %20ju", stat_name[j], *p0++, *p1++); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_linkdnrc(SYSCTL_HANDLER_ARGS) { int rc = 0; struct port_info *pi = arg1; struct link_config *lc = &pi->link_cfg; struct sbuf *sb; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return(rc); sb = sbuf_new_for_sysctl(NULL, NULL, 64, req); if (sb == NULL) return (ENOMEM); if (lc->link_ok || lc->link_down_rc == 255) sbuf_printf(sb, "n/a"); else sbuf_printf(sb, "%s", t4_link_down_rc_str(lc->link_down_rc)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } struct mem_desc { unsigned int base; unsigned int limit; unsigned int idx; }; static int mem_desc_cmp(const void *a, const void *b) { return ((const struct mem_desc *)a)->base - ((const struct mem_desc *)b)->base; } static void mem_region_show(struct sbuf *sb, const char *name, unsigned int from, unsigned int to) { unsigned int size; if (from == to) return; size = to - from + 1; if (size == 0) return; /* XXX: need humanize_number(3) in libkern for a more readable 'size' */ sbuf_printf(sb, "%-15s %#x-%#x [%u]\n", name, from, to, size); } static int sysctl_meminfo(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i, n; uint32_t lo, hi, used, alloc; static const char *memory[] = {"EDC0:", "EDC1:", "MC:", "MC0:", "MC1:"}; static const char *region[] = { "DBQ contexts:", "IMSG contexts:", "FLM cache:", "TCBs:", "Pstructs:", "Timers:", "Rx FL:", "Tx FL:", "Pstruct FL:", "Tx payload:", "Rx payload:", "LE hash:", "iSCSI region:", "TDDP region:", "TPT region:", "STAG region:", "RQ region:", "RQUDP region:", "PBL region:", "TXPBL region:", "DBVFIFO region:", "ULPRX state:", "ULPTX state:", "On-chip queues:", "TLS keys:", }; struct mem_desc avail[4]; struct mem_desc mem[nitems(region) + 3]; /* up to 3 holes */ struct mem_desc *md = mem; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); for (i = 0; i < nitems(mem); i++) { mem[i].limit = 0; mem[i].idx = i; } /* Find and sort the populated memory ranges */ i = 0; lo = t4_read_reg(sc, A_MA_TARGET_MEM_ENABLE); if (lo & F_EDRAM0_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM0_BAR); avail[i].base = G_EDRAM0_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM0_SIZE(hi) << 20); avail[i].idx = 0; i++; } if (lo & F_EDRAM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EDRAM1_BAR); avail[i].base = G_EDRAM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EDRAM1_SIZE(hi) << 20); avail[i].idx = 1; i++; } if (lo & F_EXT_MEM_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY_BAR); avail[i].base = G_EXT_MEM_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM_SIZE(hi) << 20); avail[i].idx = is_t5(sc) ? 3 : 2; /* Call it MC0 for T5 */ i++; } if (is_t5(sc) && lo & F_EXT_MEM1_ENABLE) { hi = t4_read_reg(sc, A_MA_EXT_MEMORY1_BAR); avail[i].base = G_EXT_MEM1_BASE(hi) << 20; avail[i].limit = avail[i].base + (G_EXT_MEM1_SIZE(hi) << 20); avail[i].idx = 4; i++; } if (!i) /* no memory available */ return 0; qsort(avail, i, sizeof(struct mem_desc), mem_desc_cmp); (md++)->base = t4_read_reg(sc, A_SGE_DBQ_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_IMSG_CTXT_BADDR); (md++)->base = t4_read_reg(sc, A_SGE_FLM_CACHE_BADDR); (md++)->base = t4_read_reg(sc, A_TP_CMM_TCB_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_TIMER_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_RX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_TX_FLST_BASE); (md++)->base = t4_read_reg(sc, A_TP_CMM_MM_PS_FLST_BASE); /* the next few have explicit upper bounds */ md->base = t4_read_reg(sc, A_TP_PMM_TX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE) * G_PMTXMAXPAGE(t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE)); md++; md->base = t4_read_reg(sc, A_TP_PMM_RX_BASE); md->limit = md->base - 1 + t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) * G_PMRXMAXPAGE(t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE)); md++; if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { if (chip_id(sc) <= CHELSIO_T5) md->base = t4_read_reg(sc, A_LE_DB_HASH_TID_BASE); else md->base = t4_read_reg(sc, A_LE_DB_HASH_TBL_BASE_ADDR); md->limit = 0; } else { md->base = 0; md->idx = nitems(region); /* hide it */ } md++; #define ulp_region(reg) \ md->base = t4_read_reg(sc, A_ULP_ ## reg ## _LLIMIT);\ (md++)->limit = t4_read_reg(sc, A_ULP_ ## reg ## _ULIMIT) ulp_region(RX_ISCSI); ulp_region(RX_TDDP); ulp_region(TX_TPT); ulp_region(RX_STAG); ulp_region(RX_RQ); ulp_region(RX_RQUDP); ulp_region(RX_PBL); ulp_region(TX_PBL); #undef ulp_region md->base = 0; md->idx = nitems(region); if (!is_t4(sc)) { uint32_t size = 0; uint32_t sge_ctrl = t4_read_reg(sc, A_SGE_CONTROL2); uint32_t fifo_size = t4_read_reg(sc, A_SGE_DBVFIFO_SIZE); if (is_t5(sc)) { if (sge_ctrl & F_VFIFO_ENABLE) size = G_DBVFIFO_SIZE(fifo_size); } else size = G_T6_DBVFIFO_SIZE(fifo_size); if (size) { md->base = G_BASEADDR(t4_read_reg(sc, A_SGE_DBVFIFO_BADDR)); md->limit = md->base + (size << 2) - 1; } } md++; md->base = t4_read_reg(sc, A_ULP_RX_CTX_BASE); md->limit = 0; md++; md->base = t4_read_reg(sc, A_ULP_TX_ERR_TABLE_BASE); md->limit = 0; md++; md->base = sc->vres.ocq.start; if (sc->vres.ocq.size) md->limit = md->base + sc->vres.ocq.size - 1; else md->idx = nitems(region); /* hide it */ md++; md->base = sc->vres.key.start; if (sc->vres.key.size) md->limit = md->base + sc->vres.key.size - 1; else md->idx = nitems(region); /* hide it */ md++; /* add any address-space holes, there can be up to 3 */ for (n = 0; n < i - 1; n++) if (avail[n].limit < avail[n + 1].base) (md++)->base = avail[n].limit; if (avail[n].limit) (md++)->base = avail[n].limit; n = md - mem; qsort(mem, n, sizeof(struct mem_desc), mem_desc_cmp); for (lo = 0; lo < i; lo++) mem_region_show(sb, memory[avail[lo].idx], avail[lo].base, avail[lo].limit - 1); sbuf_printf(sb, "\n"); for (i = 0; i < n; i++) { if (mem[i].idx >= nitems(region)) continue; /* skip holes */ if (!mem[i].limit) mem[i].limit = i < n - 1 ? mem[i + 1].base - 1 : ~0; mem_region_show(sb, region[mem[i].idx], mem[i].base, mem[i].limit); } sbuf_printf(sb, "\n"); lo = t4_read_reg(sc, A_CIM_SDRAM_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_SDRAM_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP RAM:", lo, hi); lo = t4_read_reg(sc, A_CIM_EXTMEM2_BASE_ADDR); hi = t4_read_reg(sc, A_CIM_EXTMEM2_ADDR_SIZE) + lo - 1; mem_region_show(sb, "uP Extmem2:", lo, hi); lo = t4_read_reg(sc, A_TP_PMM_RX_MAX_PAGE); sbuf_printf(sb, "\n%u Rx pages of size %uKiB for %u channels\n", G_PMRXMAXPAGE(lo), t4_read_reg(sc, A_TP_PMM_RX_PAGE_SIZE) >> 10, (lo & F_PMRXNUMCHN) ? 2 : 1); lo = t4_read_reg(sc, A_TP_PMM_TX_MAX_PAGE); hi = t4_read_reg(sc, A_TP_PMM_TX_PAGE_SIZE); sbuf_printf(sb, "%u Tx pages of size %u%ciB for %u channels\n", G_PMTXMAXPAGE(lo), hi >= (1 << 20) ? (hi >> 20) : (hi >> 10), hi >= (1 << 20) ? 'M' : 'K', 1 << G_PMTXNUMCHN(lo)); sbuf_printf(sb, "%u p-structs\n", t4_read_reg(sc, A_TP_CMM_MM_MAX_PSTRUCT)); for (i = 0; i < 4; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_MAC_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV0 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nPort %d using %u pages out of %u allocated", i, used, alloc); } for (i = 0; i < sc->chip_params->nchan; i++) { if (chip_id(sc) > CHELSIO_T5) lo = t4_read_reg(sc, A_MPS_RX_LPBK_BG_PG_CNT0 + i * 4); else lo = t4_read_reg(sc, A_MPS_RX_PG_RSV4 + i * 4); if (is_t5(sc)) { used = G_T5_USED(lo); alloc = G_T5_ALLOC(lo); } else { used = G_USED(lo); alloc = G_ALLOC(lo); } /* For T6 these are MAC buffer groups */ sbuf_printf(sb, "\nLoopback %d using %u pages out of %u allocated", i, used, alloc); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static inline void tcamxy2valmask(uint64_t x, uint64_t y, uint8_t *addr, uint64_t *mask) { *mask = x | y; y = htobe64(y); memcpy(addr, (char *)&y + 2, ETHER_ADDR_LEN); } static int sysctl_mps_tcam(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) <= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask Vld Ports PF" " VF Replication P0 P1 P2 P3 ML"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint64_t tcamx, tcamy, mask; uint32_t cls_lo, cls_hi; uint8_t addr[ETHER_ADDR_LEN]; tcamy = t4_read_reg64(sc, MPS_CLS_TCAM_Y_L(i)); tcamx = t4_read_reg64(sc, MPS_CLS_TCAM_X_L(i)); if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x %012jx" " %c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, (cls_lo & F_SRAM_VLD) ? 'Y' : 'N', G_PORTMAP(cls_hi), G_PF(cls_lo), (cls_lo & F_VF_VALID) ? G_VF(cls_lo) : -1); if (cls_lo & F_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4mps"); if (rc) break; rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) { sbuf_printf(sb, "%36d", rc); rc = 0; } else { sbuf_printf(sb, " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%36s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#3x", G_SRAM_PRIO0(cls_lo), G_SRAM_PRIO1(cls_lo), G_SRAM_PRIO2(cls_lo), G_SRAM_PRIO3(cls_lo), (cls_lo >> S_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_mps_tcam_t6(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; MPASS(chip_id(sc) > CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); sbuf_printf(sb, "Idx Ethernet address Mask VNI Mask" " IVLAN Vld DIP_Hit Lookup Port Vld Ports PF VF" " Replication" " P0 P1 P2 P3 ML\n"); for (i = 0; i < sc->chip_params->mps_tcam_size; i++) { uint8_t dip_hit, vlan_vld, lookup_type, port_num; uint16_t ivlan; uint64_t tcamx, tcamy, val, mask; uint32_t cls_lo, cls_hi, ctl, data2, vnix, vniy; uint8_t addr[ETHER_ADDR_LEN]; ctl = V_CTLREQID(1) | V_CTLCMDTYPE(0) | V_CTLXYBITSEL(0); if (i < 256) ctl |= V_CTLTCAMINDEX(i) | V_CTLTCAMSEL(0); else ctl |= V_CTLTCAMINDEX(i - 256) | V_CTLTCAMSEL(1); t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamy = G_DMACH(val) << 32; tcamy |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); lookup_type = G_DATALKPTYPE(data2); port_num = G_DATAPORTNUM(data2); if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI */ vniy = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); dip_hit = data2 & F_DATADIPHIT; vlan_vld = 0; } else { vniy = 0; dip_hit = 0; vlan_vld = data2 & F_DATAVIDH2; ivlan = G_VIDL(val); } ctl |= V_CTLXYBITSEL(1); t4_write_reg(sc, A_MPS_CLS_TCAM_DATA2_CTL, ctl); val = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA1_REQ_ID1); tcamx = G_DMACH(val) << 32; tcamx |= t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA0_REQ_ID1); data2 = t4_read_reg(sc, A_MPS_CLS_TCAM_RDATA2_REQ_ID1); if (lookup_type && lookup_type != M_DATALKPTYPE) { /* Inner header VNI mask */ vnix = ((data2 & F_DATAVIDH2) << 23) | (G_DATAVIDH1(data2) << 16) | G_VIDL(val); } else vnix = 0; if (tcamx & tcamy) continue; tcamxy2valmask(tcamx, tcamy, addr, &mask); cls_lo = t4_read_reg(sc, MPS_CLS_SRAM_L(i)); cls_hi = t4_read_reg(sc, MPS_CLS_SRAM_H(i)); if (lookup_type && lookup_type != M_DATALKPTYPE) { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx %06x %06x - - %3c" " 'I' %4x %3c %#x%4u%4d", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask, vniy, vnix, dip_hit ? 'Y' : 'N', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } else { sbuf_printf(sb, "\n%3u %02x:%02x:%02x:%02x:%02x:%02x " "%012jx - - ", i, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5], (uintmax_t)mask); if (vlan_vld) sbuf_printf(sb, "%4u Y ", ivlan); else sbuf_printf(sb, " - N "); sbuf_printf(sb, "- %3c %4x %3c %#x%4u%4d", lookup_type ? 'I' : 'O', port_num, cls_lo & F_T6_SRAM_VLD ? 'Y' : 'N', G_PORTMAP(cls_hi), G_T6_PF(cls_lo), cls_lo & F_T6_VF_VALID ? G_T6_VF(cls_lo) : -1); } if (cls_lo & F_T6_REPLICATE) { struct fw_ldst_cmd ldst_cmd; memset(&ldst_cmd, 0, sizeof(ldst_cmd)); ldst_cmd.op_to_addrspace = htobe32(V_FW_CMD_OP(FW_LDST_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_READ | V_FW_LDST_CMD_ADDRSPACE(FW_LDST_ADDRSPC_MPS)); ldst_cmd.cycles_to_len16 = htobe32(FW_LEN16(ldst_cmd)); ldst_cmd.u.mps.rplc.fid_idx = htobe16(V_FW_LDST_CMD_FID(FW_LDST_MPS_RPLC) | V_FW_LDST_CMD_IDX(i)); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t6mps"); if (rc) break; rc = -t4_wr_mbox(sc, sc->mbox, &ldst_cmd, sizeof(ldst_cmd), &ldst_cmd); end_synchronized_op(sc, 0); if (rc != 0) { sbuf_printf(sb, "%72d", rc); rc = 0; } else { sbuf_printf(sb, " %08x %08x %08x %08x" " %08x %08x %08x %08x", be32toh(ldst_cmd.u.mps.rplc.rplc255_224), be32toh(ldst_cmd.u.mps.rplc.rplc223_192), be32toh(ldst_cmd.u.mps.rplc.rplc191_160), be32toh(ldst_cmd.u.mps.rplc.rplc159_128), be32toh(ldst_cmd.u.mps.rplc.rplc127_96), be32toh(ldst_cmd.u.mps.rplc.rplc95_64), be32toh(ldst_cmd.u.mps.rplc.rplc63_32), be32toh(ldst_cmd.u.mps.rplc.rplc31_0)); } } else sbuf_printf(sb, "%72s", ""); sbuf_printf(sb, "%4u%3u%3u%3u %#x", G_T6_SRAM_PRIO0(cls_lo), G_T6_SRAM_PRIO1(cls_lo), G_T6_SRAM_PRIO2(cls_lo), G_T6_SRAM_PRIO3(cls_lo), (cls_lo >> S_T6_MULTILISTEN0) & 0xf); } if (rc) (void) sbuf_finish(sb); else rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_path_mtus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; uint16_t mtus[NMTUS]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_read_mtu_tbl(sc, mtus, NULL); sbuf_printf(sb, "%u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u", mtus[0], mtus[1], mtus[2], mtus[3], mtus[4], mtus[5], mtus[6], mtus[7], mtus[8], mtus[9], mtus[10], mtus[11], mtus[12], mtus[13], mtus[14], mtus[15]); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_pm_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, i; uint32_t tx_cnt[MAX_PM_NSTATS], rx_cnt[MAX_PM_NSTATS]; uint64_t tx_cyc[MAX_PM_NSTATS], rx_cyc[MAX_PM_NSTATS]; static const char *tx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Bypass + mem:", "Tx FIFO wait", NULL, "Tx latency" }; static const char *rx_stats[MAX_PM_NSTATS] = { "Read:", "Write bypass:", "Write mem:", "Flush:", "Rx FIFO wait", NULL, "Rx latency" }; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_pmtx_get_stats(sc, tx_cnt, tx_cyc); t4_pmrx_get_stats(sc, rx_cnt, rx_cyc); sbuf_printf(sb, " Tx pcmds Tx bytes"); for (i = 0; i < 4; i++) { sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); } sbuf_printf(sb, "\n Rx pcmds Rx bytes"); for (i = 0; i < 4; i++) { sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); } if (chip_id(sc) > CHELSIO_T5) { sbuf_printf(sb, "\n Total wait Total occupancy"); sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); i += 2; MPASS(i < nitems(tx_stats)); sbuf_printf(sb, "\n Reads Total wait"); sbuf_printf(sb, "\n%-13s %10u %20ju", tx_stats[i], tx_cnt[i], tx_cyc[i]); sbuf_printf(sb, "\n%-13s %10u %20ju", rx_stats[i], rx_cnt[i], rx_cyc[i]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_rdma_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_rdma_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_rdma_stats(sc, &stats, 0); mtx_unlock(&sc->reg_lock); sbuf_printf(sb, "NoRQEModDefferals: %u\n", stats.rqe_dfr_mod); sbuf_printf(sb, "NoRQEPktDefferals: %u", stats.rqe_dfr_pkt); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tcp_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_tcp_stats v4, v6; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_tcp_stats(sc, &v4, &v6, 0); mtx_unlock(&sc->reg_lock); sbuf_printf(sb, " IP IPv6\n"); sbuf_printf(sb, "OutRsts: %20u %20u\n", v4.tcp_out_rsts, v6.tcp_out_rsts); sbuf_printf(sb, "InSegs: %20ju %20ju\n", v4.tcp_in_segs, v6.tcp_in_segs); sbuf_printf(sb, "OutSegs: %20ju %20ju\n", v4.tcp_out_segs, v6.tcp_out_segs); sbuf_printf(sb, "RetransSegs: %20ju %20ju", v4.tcp_retrans_segs, v6.tcp_retrans_segs); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tids(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tid_info *t = &sc->tids; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); if (t->natids) { sbuf_printf(sb, "ATID range: 0-%u, in use: %u\n", t->natids - 1, t->atids_in_use); } if (t->nhpftids) { sbuf_printf(sb, "HPFTID range: %u-%u, in use: %u\n", t->hpftid_base, t->hpftid_end, t->hpftids_in_use); } if (t->ntids) { sbuf_printf(sb, "TID range: "); if (t4_read_reg(sc, A_LE_DB_CONFIG) & F_HASHEN) { uint32_t b, hb; if (chip_id(sc) <= CHELSIO_T5) { b = t4_read_reg(sc, A_LE_DB_SERVER_INDEX) / 4; hb = t4_read_reg(sc, A_LE_DB_TID_HASHBASE) / 4; } else { b = t4_read_reg(sc, A_LE_DB_SRVR_START_INDEX); hb = t4_read_reg(sc, A_T6_LE_DB_HASH_TID_BASE); } if (b) sbuf_printf(sb, "%u-%u, ", t->tid_base, b - 1); sbuf_printf(sb, "%u-%u", hb, t->ntids - 1); } else sbuf_printf(sb, "%u-%u", t->tid_base, t->ntids - 1); sbuf_printf(sb, ", in use: %u\n", atomic_load_acq_int(&t->tids_in_use)); } if (t->nstids) { sbuf_printf(sb, "STID range: %u-%u, in use: %u\n", t->stid_base, t->stid_base + t->nstids - 1, t->stids_in_use); } if (t->nftids) { sbuf_printf(sb, "FTID range: %u-%u, in use: %u\n", t->ftid_base, t->ftid_end, t->ftids_in_use); } if (t->netids) { sbuf_printf(sb, "ETID range: %u-%u, in use: %u\n", t->etid_base, t->etid_base + t->netids - 1, t->etids_in_use); } sbuf_printf(sb, "HW TID usage: %u IP users, %u IPv6 users", t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV4), t4_read_reg(sc, A_LE_DB_ACT_CNT_IPV6)); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_err_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; struct tp_err_stats stats; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); mtx_lock(&sc->reg_lock); t4_tp_get_err_stats(sc, &stats, 0); mtx_unlock(&sc->reg_lock); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "macInErrs: %10u %10u %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1], stats.mac_in_errs[2], stats.mac_in_errs[3]); sbuf_printf(sb, "hdrInErrs: %10u %10u %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1], stats.hdr_in_errs[2], stats.hdr_in_errs[3]); sbuf_printf(sb, "tcpInErrs: %10u %10u %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1], stats.tcp_in_errs[2], stats.tcp_in_errs[3]); sbuf_printf(sb, "tcp6InErrs: %10u %10u %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1], stats.tcp6_in_errs[2], stats.tcp6_in_errs[3]); sbuf_printf(sb, "tnlCongDrops: %10u %10u %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1], stats.tnl_cong_drops[2], stats.tnl_cong_drops[3]); sbuf_printf(sb, "tnlTxDrops: %10u %10u %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1], stats.tnl_tx_drops[2], stats.tnl_tx_drops[3]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1], stats.ofld_vlan_drops[2], stats.ofld_vlan_drops[3]); sbuf_printf(sb, "ofldChanDrops: %10u %10u %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1], stats.ofld_chan_drops[2], stats.ofld_chan_drops[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "macInErrs: %10u %10u\n", stats.mac_in_errs[0], stats.mac_in_errs[1]); sbuf_printf(sb, "hdrInErrs: %10u %10u\n", stats.hdr_in_errs[0], stats.hdr_in_errs[1]); sbuf_printf(sb, "tcpInErrs: %10u %10u\n", stats.tcp_in_errs[0], stats.tcp_in_errs[1]); sbuf_printf(sb, "tcp6InErrs: %10u %10u\n", stats.tcp6_in_errs[0], stats.tcp6_in_errs[1]); sbuf_printf(sb, "tnlCongDrops: %10u %10u\n", stats.tnl_cong_drops[0], stats.tnl_cong_drops[1]); sbuf_printf(sb, "tnlTxDrops: %10u %10u\n", stats.tnl_tx_drops[0], stats.tnl_tx_drops[1]); sbuf_printf(sb, "ofldVlanDrops: %10u %10u\n", stats.ofld_vlan_drops[0], stats.ofld_vlan_drops[1]); sbuf_printf(sb, "ofldChanDrops: %10u %10u\n\n", stats.ofld_chan_drops[0], stats.ofld_chan_drops[1]); } sbuf_printf(sb, "ofldNoNeigh: %u\nofldCongDefer: %u", stats.ofld_no_neigh, stats.ofld_cong_defer); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_tp_la_mask(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tp_params *tpp = &sc->params.tp; u_int mask; int rc; mask = tpp->la_mask >> 16; rc = sysctl_handle_int(oidp, &mask, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (mask > 0xffff) return (EINVAL); tpp->la_mask = mask << 16; t4_set_reg_field(sc, A_TP_DBG_LA_CONFIG, 0xffff0000U, tpp->la_mask); return (0); } struct field_desc { const char *name; u_int start; u_int width; }; static void field_desc_show(struct sbuf *sb, uint64_t v, const struct field_desc *f) { char buf[32]; int line_size = 0; while (f->name) { uint64_t mask = (1ULL << f->width) - 1; int len = snprintf(buf, sizeof(buf), "%s: %ju", f->name, ((uintmax_t)v >> f->start) & mask); if (line_size + len >= 79) { line_size = 8; sbuf_printf(sb, "\n "); } sbuf_printf(sb, "%s ", buf); line_size += len + 1; f++; } sbuf_printf(sb, "\n"); } static const struct field_desc tp_la0[] = { { "RcfOpCodeOut", 60, 4 }, { "State", 56, 4 }, { "WcfState", 52, 4 }, { "RcfOpcSrcOut", 50, 2 }, { "CRxError", 49, 1 }, { "ERxError", 48, 1 }, { "SanityFailed", 47, 1 }, { "SpuriousMsg", 46, 1 }, { "FlushInputMsg", 45, 1 }, { "FlushInputCpl", 44, 1 }, { "RssUpBit", 43, 1 }, { "RssFilterHit", 42, 1 }, { "Tid", 32, 10 }, { "InitTcb", 31, 1 }, { "LineNumber", 24, 7 }, { "Emsg", 23, 1 }, { "EdataOut", 22, 1 }, { "Cmsg", 21, 1 }, { "CdataOut", 20, 1 }, { "EreadPdu", 19, 1 }, { "CreadPdu", 18, 1 }, { "TunnelPkt", 17, 1 }, { "RcfPeerFin", 16, 1 }, { "RcfReasonOut", 12, 4 }, { "TxCchannel", 10, 2 }, { "RcfTxChannel", 8, 2 }, { "RxEchannel", 6, 2 }, { "RcfRxChannel", 5, 1 }, { "RcfDataOutSrdy", 4, 1 }, { "RxDvld", 3, 1 }, { "RxOoDvld", 2, 1 }, { "RxCongestion", 1, 1 }, { "TxCongestion", 0, 1 }, { NULL } }; static const struct field_desc tp_la1[] = { { "CplCmdIn", 56, 8 }, { "CplCmdOut", 48, 8 }, { "ESynOut", 47, 1 }, { "EAckOut", 46, 1 }, { "EFinOut", 45, 1 }, { "ERstOut", 44, 1 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static const struct field_desc tp_la2[] = { { "CplCmdIn", 56, 8 }, { "MpsVfVld", 55, 1 }, { "MpsPf", 52, 3 }, { "MpsVf", 44, 8 }, { "SynIn", 43, 1 }, { "AckIn", 42, 1 }, { "FinIn", 41, 1 }, { "RstIn", 40, 1 }, { "DataIn", 39, 1 }, { "DataInVld", 38, 1 }, { "PadIn", 37, 1 }, { "RxBufEmpty", 36, 1 }, { "RxDdp", 35, 1 }, { "RxFbCongestion", 34, 1 }, { "TxFbCongestion", 33, 1 }, { "TxPktSumSrdy", 32, 1 }, { "RcfUlpType", 28, 4 }, { "Eread", 27, 1 }, { "Ebypass", 26, 1 }, { "Esave", 25, 1 }, { "Static0", 24, 1 }, { "Cread", 23, 1 }, { "Cbypass", 22, 1 }, { "Csave", 21, 1 }, { "CPktOut", 20, 1 }, { "RxPagePoolFull", 18, 2 }, { "RxLpbkPkt", 17, 1 }, { "TxLpbkPkt", 16, 1 }, { "RxVfValid", 15, 1 }, { "SynLearned", 14, 1 }, { "SetDelEntry", 13, 1 }, { "SetInvEntry", 12, 1 }, { "CpcmdDvld", 11, 1 }, { "CpcmdSave", 10, 1 }, { "RxPstructsFull", 8, 2 }, { "EpcmdDvld", 7, 1 }, { "EpcmdFlush", 6, 1 }, { "EpcmdTrimPrefix", 5, 1 }, { "EpcmdTrimPostfix", 4, 1 }, { "ERssIp4Pkt", 3, 1 }, { "ERssIp6Pkt", 2, 1 }, { "ERssTcpUdpPkt", 1, 1 }, { "ERssFceFipPkt", 0, 1 }, { NULL } }; static void tp_la_show(struct sbuf *sb, uint64_t *p, int idx) { field_desc_show(sb, *p, tp_la0); } static void tp_la_show2(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], tp_la0); } static void tp_la_show3(struct sbuf *sb, uint64_t *p, int idx) { if (idx) sbuf_printf(sb, "\n"); field_desc_show(sb, p[0], tp_la0); if (idx < (TPLA_SIZE / 2 - 1) || p[1] != ~0ULL) field_desc_show(sb, p[1], (p[0] & (1 << 17)) ? tp_la2 : tp_la1); } static int sysctl_tp_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint64_t *buf, *p; int rc; u_int i, inc; void (*show_func)(struct sbuf *, uint64_t *, int); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(TPLA_SIZE * sizeof(uint64_t), M_CXGBE, M_ZERO | M_WAITOK); t4_tp_read_la(sc, buf, NULL); p = buf; switch (G_DBGLAMODE(t4_read_reg(sc, A_TP_DBG_LA_CONFIG))) { case 2: inc = 2; show_func = tp_la_show2; break; case 3: inc = 2; show_func = tp_la_show3; break; default: inc = 1; show_func = tp_la_show; } for (i = 0; i < TPLA_SIZE / inc; i++, p += inc) (*show_func)(sb, p, i); rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_tx_rate(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc; u64 nrate[MAX_NCHAN], orate[MAX_NCHAN]; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 256, req); if (sb == NULL) return (ENOMEM); t4_get_chan_txrate(sc, nrate, orate); if (sc->chip_params->nchan > 2) { sbuf_printf(sb, " channel 0 channel 1" " channel 2 channel 3\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju %10ju %10ju\n", nrate[0], nrate[1], nrate[2], nrate[3]); sbuf_printf(sb, "Offload B/s: %10ju %10ju %10ju %10ju", orate[0], orate[1], orate[2], orate[3]); } else { sbuf_printf(sb, " channel 0 channel 1\n"); sbuf_printf(sb, "NIC B/s: %10ju %10ju\n", nrate[0], nrate[1]); sbuf_printf(sb, "Offload B/s: %10ju %10ju", orate[0], orate[1]); } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_ulprx_la(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; uint32_t *buf, *p; int rc, i; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); buf = malloc(ULPRX_LA_SIZE * 8 * sizeof(uint32_t), M_CXGBE, M_ZERO | M_WAITOK); t4_ulprx_read_la(sc, buf); p = buf; sbuf_printf(sb, " Pcmd Type Message" " Data"); for (i = 0; i < ULPRX_LA_SIZE; i++, p += 8) { sbuf_printf(sb, "\n%08x%08x %4x %08x %08x%08x%08x%08x", p[1], p[0], p[2], p[3], p[7], p[6], p[5], p[4]); } rc = sbuf_finish(sb); sbuf_delete(sb); free(buf, M_CXGBE); return (rc); } static int sysctl_wcwr_stats(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct sbuf *sb; int rc, v; MPASS(chip_id(sc) >= CHELSIO_T5); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); v = t4_read_reg(sc, A_SGE_STAT_CFG); if (G_STATSOURCE_T5(v) == 7) { int mode; mode = is_t5(sc) ? G_STATMODE(v) : G_T6_STATMODE(v); if (mode == 0) { sbuf_printf(sb, "total %d, incomplete %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } else if (mode == 1) { sbuf_printf(sb, "total %d, data overflow %d", t4_read_reg(sc, A_SGE_STAT_TOTAL), t4_read_reg(sc, A_SGE_STAT_MATCH)); } else { sbuf_printf(sb, "unknown mode %d", mode); } } rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } static int sysctl_cpus(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; enum cpu_sets op = arg2; cpuset_t cpuset; struct sbuf *sb; int i, rc; MPASS(op == LOCAL_CPUS || op == INTR_CPUS); CPU_ZERO(&cpuset); rc = bus_get_cpus(sc->dev, op, sizeof(cpuset), &cpuset); if (rc != 0) return (rc); rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); CPU_FOREACH(i) sbuf_printf(sb, "%d ", i); rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } #ifdef TCP_OFFLOAD static int sysctl_tls_rx_ports(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int *old_ports, *new_ports; int i, new_count, rc; if (req->newptr == NULL && req->oldptr == NULL) return (SYSCTL_OUT(req, NULL, imax(sc->tt.num_tls_rx_ports, 1) * sizeof(sc->tt.tls_rx_ports[0]))); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tlsrx"); if (rc) return (rc); if (sc->tt.num_tls_rx_ports == 0) { i = -1; rc = SYSCTL_OUT(req, &i, sizeof(i)); } else rc = SYSCTL_OUT(req, sc->tt.tls_rx_ports, sc->tt.num_tls_rx_ports * sizeof(sc->tt.tls_rx_ports[0])); if (rc == 0 && req->newptr != NULL) { new_count = req->newlen / sizeof(new_ports[0]); new_ports = malloc(new_count * sizeof(new_ports[0]), M_CXGBE, M_WAITOK); rc = SYSCTL_IN(req, new_ports, new_count * sizeof(new_ports[0])); if (rc) goto err; /* Allow setting to a single '-1' to clear the list. */ if (new_count == 1 && new_ports[0] == -1) { ADAPTER_LOCK(sc); old_ports = sc->tt.tls_rx_ports; sc->tt.tls_rx_ports = NULL; sc->tt.num_tls_rx_ports = 0; ADAPTER_UNLOCK(sc); free(old_ports, M_CXGBE); } else { for (i = 0; i < new_count; i++) { if (new_ports[i] < 1 || new_ports[i] > IPPORT_MAX) { rc = EINVAL; goto err; } } ADAPTER_LOCK(sc); old_ports = sc->tt.tls_rx_ports; sc->tt.tls_rx_ports = new_ports; sc->tt.num_tls_rx_ports = new_count; ADAPTER_UNLOCK(sc); free(old_ports, M_CXGBE); new_ports = NULL; } err: free(new_ports, M_CXGBE); } end_synchronized_op(sc, 0); return (rc); } static void unit_conv(char *buf, size_t len, u_int val, u_int factor) { u_int rem = val % factor; if (rem == 0) snprintf(buf, len, "%u", val / factor); else { while (rem % 10 == 0) rem /= 10; snprintf(buf, len, "%u.%u", val / factor, rem); } } static int sysctl_tp_tick(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; char buf[16]; u_int res, re; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION); switch (arg2) { case 0: /* timer_tick */ re = G_TIMERRESOLUTION(res); break; case 1: /* TCP timestamp tick */ re = G_TIMESTAMPRESOLUTION(res); break; case 2: /* DACK tick */ re = G_DELAYEDACKRESOLUTION(res); break; default: return (EDOOFUS); } unit_conv(buf, sizeof(buf), (cclk_ps << re), 1000000); return (sysctl_handle_string(oidp, buf, sizeof(buf), req)); } static int sysctl_tp_dack_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; u_int res, dack_re, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; res = t4_read_reg(sc, A_TP_TIMER_RESOLUTION); dack_re = G_DELAYEDACKRESOLUTION(res); v = ((cclk_ps << dack_re) / 1000000) * t4_read_reg(sc, A_TP_DACK_TIMER); return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_tp_timer(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int reg = arg2; u_int tre; u_long tp_tick_us, v; u_int cclk_ps = 1000000000 / sc->params.vpd.cclk; MPASS(reg == A_TP_RXT_MIN || reg == A_TP_RXT_MAX || reg == A_TP_PERS_MIN || reg == A_TP_PERS_MAX || reg == A_TP_KEEP_IDLE || reg == A_TP_KEEP_INTVL || reg == A_TP_INIT_SRTT || reg == A_TP_FINWAIT2_TIMER); tre = G_TIMERRESOLUTION(t4_read_reg(sc, A_TP_TIMER_RESOLUTION)); tp_tick_us = (cclk_ps << tre) / 1000000; if (reg == A_TP_INIT_SRTT) v = tp_tick_us * G_INITSRTT(t4_read_reg(sc, reg)); else v = tp_tick_us * t4_read_reg(sc, reg); return (sysctl_handle_long(oidp, &v, 0, req)); } /* * All fields in TP_SHIFT_CNT are 4b and the starting location of the field is * passed to this function. */ static int sysctl_tp_shift_cnt(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int idx = arg2; u_int v; MPASS(idx >= 0 && idx <= 24); v = (t4_read_reg(sc, A_TP_SHIFT_CNT) >> idx) & 0xf; return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_tp_backoff(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; int idx = arg2; u_int shift, v, r; MPASS(idx >= 0 && idx < 16); r = A_TP_TCP_BACKOFF_REG0 + (idx & ~3); shift = (idx & 3) << 3; v = (t4_read_reg(sc, r) >> shift) & M_TIMERBACKOFFINDEX0; return (sysctl_handle_int(oidp, &v, 0, req)); } static int sysctl_holdoff_tmr_idx_ofld(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int idx, rc, i; struct sge_ofld_rxq *ofld_rxq; uint8_t v; idx = vi->ofld_tmr_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < 0 || idx >= SGE_NTIMERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4otmr"); if (rc) return (rc); v = V_QINTR_TIMER_IDX(idx) | V_QINTR_CNT_EN(vi->ofld_pktc_idx != -1); for_each_ofld_rxq(vi, i, ofld_rxq) { #ifdef atomic_store_rel_8 atomic_store_rel_8(&ofld_rxq->iq.intr_params, v); #else ofld_rxq->iq.intr_params = v; #endif } vi->ofld_tmr_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (0); } static int sysctl_holdoff_pktc_idx_ofld(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct adapter *sc = vi->pi->adapter; int idx, rc; idx = vi->ofld_pktc_idx; rc = sysctl_handle_int(oidp, &idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (idx < -1 || idx >= SGE_NCOUNTERS) return (EINVAL); rc = begin_synchronized_op(sc, vi, HOLD_LOCK | SLEEP_OK | INTR_OK, "t4opktc"); if (rc) return (rc); if (vi->flags & VI_INIT_DONE) rc = EBUSY; /* cannot be changed once the queues are created */ else vi->ofld_pktc_idx = idx; end_synchronized_op(sc, LOCK_HELD); return (rc); } #endif static int get_sge_context(struct adapter *sc, struct t4_sge_context *cntxt) { int rc; if (cntxt->cid > M_CTXTQID) return (EINVAL); if (cntxt->mem_id != CTXT_EGRESS && cntxt->mem_id != CTXT_INGRESS && cntxt->mem_id != CTXT_FLM && cntxt->mem_id != CTXT_CNM) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ctxt"); if (rc) return (rc); if (sc->flags & FW_OK) { rc = -t4_sge_ctxt_rd(sc, sc->mbox, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); if (rc == 0) goto done; } /* * Read via firmware failed or wasn't even attempted. Read directly via * the backdoor. */ rc = -t4_sge_ctxt_rd_bd(sc, cntxt->cid, cntxt->mem_id, &cntxt->data[0]); done: end_synchronized_op(sc, 0); return (rc); } static int load_fw(struct adapter *sc, struct t4_data *fw) { int rc; uint8_t *fw_data; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldfw"); if (rc) return (rc); /* * The firmware, with the sole exception of the memory parity error * handler, runs from memory and not flash. It is almost always safe to * install a new firmware on a running system. Just set bit 1 in * hw.cxgbe.dflags or dev...dflags first. */ if (sc->flags & FULL_INIT_DONE && (sc->debug_flags & DF_LOAD_FW_ANYTIME) == 0) { rc = EBUSY; goto done; } fw_data = malloc(fw->len, M_CXGBE, M_WAITOK); if (fw_data == NULL) { rc = ENOMEM; goto done; } rc = copyin(fw->data, fw_data, fw->len); if (rc == 0) rc = -t4_load_fw(sc, fw_data, fw->len); free(fw_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_cfg(struct adapter *sc, struct t4_data *cfg) { int rc; uint8_t *cfg_data = NULL; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf"); if (rc) return (rc); if (cfg->len == 0) { /* clear */ rc = -t4_load_cfg(sc, NULL, 0); goto done; } cfg_data = malloc(cfg->len, M_CXGBE, M_WAITOK); if (cfg_data == NULL) { rc = ENOMEM; goto done; } rc = copyin(cfg->data, cfg_data, cfg->len); if (rc == 0) rc = -t4_load_cfg(sc, cfg_data, cfg->len); free(cfg_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_boot(struct adapter *sc, struct t4_bootrom *br) { int rc; uint8_t *br_data = NULL; u_int offset; if (br->len > 1024 * 1024) return (EFBIG); if (br->pf_offset == 0) { /* pfidx */ if (br->pfidx_addr > 7) return (EINVAL); offset = G_OFFSET(t4_read_reg(sc, PF_REG(br->pfidx_addr, A_PCIE_PF_EXPROM_OFST))); } else if (br->pf_offset == 1) { /* offset */ offset = G_OFFSET(br->pfidx_addr); } else { return (EINVAL); } rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldbr"); if (rc) return (rc); if (br->len == 0) { /* clear */ rc = -t4_load_boot(sc, NULL, offset, 0); goto done; } br_data = malloc(br->len, M_CXGBE, M_WAITOK); if (br_data == NULL) { rc = ENOMEM; goto done; } rc = copyin(br->data, br_data, br->len); if (rc == 0) rc = -t4_load_boot(sc, br_data, offset, br->len); free(br_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int load_bootcfg(struct adapter *sc, struct t4_data *bc) { int rc; uint8_t *bc_data = NULL; rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4ldcf"); if (rc) return (rc); if (bc->len == 0) { /* clear */ rc = -t4_load_bootcfg(sc, NULL, 0); goto done; } bc_data = malloc(bc->len, M_CXGBE, M_WAITOK); if (bc_data == NULL) { rc = ENOMEM; goto done; } rc = copyin(bc->data, bc_data, bc->len); if (rc == 0) rc = -t4_load_bootcfg(sc, bc_data, bc->len); free(bc_data, M_CXGBE); done: end_synchronized_op(sc, 0); return (rc); } static int cudbg_dump(struct adapter *sc, struct t4_cudbg_dump *dump) { int rc; struct cudbg_init *cudbg; void *handle, *buf; /* buf is large, don't block if no memory is available */ buf = malloc(dump->len, M_CXGBE, M_NOWAIT | M_ZERO); if (buf == NULL) return (ENOMEM); handle = cudbg_alloc_handle(); if (handle == NULL) { rc = ENOMEM; goto done; } cudbg = cudbg_get_init(handle); cudbg->adap = sc; cudbg->print = (cudbg_print_cb)printf; #ifndef notyet device_printf(sc->dev, "%s: wr_flash %u, len %u, data %p.\n", __func__, dump->wr_flash, dump->len, dump->data); #endif if (dump->wr_flash) cudbg->use_flash = 1; MPASS(sizeof(cudbg->dbg_bitmap) == sizeof(dump->bitmap)); memcpy(cudbg->dbg_bitmap, dump->bitmap, sizeof(cudbg->dbg_bitmap)); rc = cudbg_collect(handle, buf, &dump->len); if (rc != 0) goto done; rc = copyout(buf, dump->data, dump->len); done: cudbg_free_handle(handle); free(buf, M_CXGBE); return (rc); } static void free_offload_policy(struct t4_offload_policy *op) { struct offload_rule *r; int i; if (op == NULL) return; r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { free(r->bpf_prog.bf_insns, M_CXGBE); } free(op->rule, M_CXGBE); free(op, M_CXGBE); } static int set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop) { int i, rc, len; struct t4_offload_policy *op, *old; struct bpf_program *bf; const struct offload_settings *s; struct offload_rule *r; void *u; if (!is_offload(sc)) return (ENODEV); if (uop->nrules == 0) { /* Delete installed policies. */ op = NULL; goto set_policy; } if (uop->nrules > 256) { /* arbitrary */ return (E2BIG); } /* Copy userspace offload policy to kernel */ op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK); op->nrules = uop->nrules; len = op->nrules * sizeof(struct offload_rule); op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); rc = copyin(uop->rule, op->rule, len); if (rc) { free(op->rule, M_CXGBE); free(op, M_CXGBE); return (rc); } r = &op->rule[0]; for (i = 0; i < op->nrules; i++, r++) { /* Validate open_type */ if (r->open_type != OPEN_TYPE_LISTEN && r->open_type != OPEN_TYPE_ACTIVE && r->open_type != OPEN_TYPE_PASSIVE && r->open_type != OPEN_TYPE_DONTCARE) { error: /* * Rules 0 to i have malloc'd filters that need to be * freed. Rules i+1 to nrules have userspace pointers * and should be left alone. */ op->nrules = i; free_offload_policy(op); return (rc); } /* Validate settings */ s = &r->settings; if ((s->offload != 0 && s->offload != 1) || s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED || s->sched_class < -1 || s->sched_class >= sc->chip_params->nsched_cls) { rc = EINVAL; goto error; } bf = &r->bpf_prog; u = bf->bf_insns; /* userspace ptr */ bf->bf_insns = NULL; if (bf->bf_len == 0) { /* legal, matches everything */ continue; } len = bf->bf_len * sizeof(*bf->bf_insns); bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); rc = copyin(u, bf->bf_insns, len); if (rc != 0) goto error; if (!bpf_validate(bf->bf_insns, bf->bf_len)) { rc = EINVAL; goto error; } } set_policy: rw_wlock(&sc->policy_lock); old = sc->policy; sc->policy = op; rw_wunlock(&sc->policy_lock); free_offload_policy(old); return (0); } #define MAX_READ_BUF_SIZE (128 * 1024) static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) { uint32_t addr, remaining, n; uint32_t *buf; int rc; uint8_t *dst; rc = validate_mem_range(sc, mr->addr, mr->len); if (rc != 0) return (rc); buf = malloc(min(mr->len, MAX_READ_BUF_SIZE), M_CXGBE, M_WAITOK); addr = mr->addr; remaining = mr->len; dst = (void *)mr->data; while (remaining) { n = min(remaining, MAX_READ_BUF_SIZE); read_via_memwin(sc, 2, addr, buf, n); rc = copyout(buf, dst, n); if (rc != 0) break; dst += n; remaining -= n; addr += n; } free(buf, M_CXGBE); return (rc); } #undef MAX_READ_BUF_SIZE static int read_i2c(struct adapter *sc, struct t4_i2c_data *i2cd) { int rc; if (i2cd->len == 0 || i2cd->port_id >= sc->params.nports) return (EINVAL); if (i2cd->len > sizeof(i2cd->data)) return (EFBIG); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4i2crd"); if (rc) return (rc); rc = -t4_i2c_rd(sc, sc->mbox, i2cd->port_id, i2cd->dev_addr, i2cd->offset, i2cd->len, &i2cd->data[0]); end_synchronized_op(sc, 0); return (rc); } int t4_os_find_pci_capability(struct adapter *sc, int cap) { int i; return (pci_find_cap(sc->dev, cap, &i) == 0 ? i : 0); } int t4_os_pci_save_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_save(dev, dinfo, 0); return (0); } int t4_os_pci_restore_state(struct adapter *sc) { device_t dev; struct pci_devinfo *dinfo; dev = sc->dev; dinfo = device_get_ivars(dev); pci_cfg_restore(dev, dinfo); return (0); } void t4_os_portmod_changed(struct port_info *pi) { struct adapter *sc = pi->adapter; struct vi_info *vi; struct ifnet *ifp; static const char *mod_str[] = { NULL, "LR", "SR", "ER", "TWINAX", "active TWINAX", "LRM" }; KASSERT((pi->flags & FIXED_IFMEDIA) == 0, ("%s: port_type %u", __func__, pi->port_type)); vi = &pi->vi[0]; if (begin_synchronized_op(sc, vi, HOLD_LOCK, "t4mod") == 0) { PORT_LOCK(pi); build_medialist(pi); if (pi->mod_type != FW_PORT_MOD_TYPE_NONE) { fixup_link_config(pi); apply_link_config(pi); } PORT_UNLOCK(pi); end_synchronized_op(sc, LOCK_HELD); } ifp = vi->ifp; if (pi->mod_type == FW_PORT_MOD_TYPE_NONE) if_printf(ifp, "transceiver unplugged.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_UNKNOWN) if_printf(ifp, "unknown transceiver inserted.\n"); else if (pi->mod_type == FW_PORT_MOD_TYPE_NOTSUPPORTED) if_printf(ifp, "unsupported transceiver inserted.\n"); else if (pi->mod_type > 0 && pi->mod_type < nitems(mod_str)) { if_printf(ifp, "%dGbps %s transceiver inserted.\n", port_top_speed(pi), mod_str[pi->mod_type]); } else { if_printf(ifp, "transceiver (type %d) inserted.\n", pi->mod_type); } } void t4_os_link_changed(struct port_info *pi) { struct vi_info *vi; struct ifnet *ifp; struct link_config *lc; int v; PORT_LOCK_ASSERT_OWNED(pi); for_each_vi(pi, v, vi) { ifp = vi->ifp; if (ifp == NULL) continue; lc = &pi->link_cfg; if (lc->link_ok) { ifp->if_baudrate = IF_Mbps(lc->speed); if_link_state_change(ifp, LINK_STATE_UP); } else { if_link_state_change(ifp, LINK_STATE_DOWN); } } } void t4_iterate(void (*func)(struct adapter *, void *), void *arg) { struct adapter *sc; sx_slock(&t4_list_lock); SLIST_FOREACH(sc, &t4_list, link) { /* * func should not make any assumptions about what state sc is * in - the only guarantee is that sc->sc_lock is a valid lock. */ func(sc, arg); } sx_sunlock(&t4_list_lock); } static int t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, int fflag, struct thread *td) { int rc; struct adapter *sc = dev->si_drv1; rc = priv_check(td, PRIV_DRIVER); if (rc != 0) return (rc); switch (cmd) { case CHELSIO_T4_GETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) edata->val = t4_read_reg(sc, edata->addr); else if (edata->size == 8) edata->val = t4_read_reg64(sc, edata->addr); else return (EINVAL); break; } case CHELSIO_T4_SETREG: { struct t4_reg *edata = (struct t4_reg *)data; if ((edata->addr & 0x3) != 0 || edata->addr >= sc->mmio_len) return (EFAULT); if (edata->size == 4) { if (edata->val & 0xffffffff00000000) return (EINVAL); t4_write_reg(sc, edata->addr, (uint32_t) edata->val); } else if (edata->size == 8) t4_write_reg64(sc, edata->addr, edata->val); else return (EINVAL); break; } case CHELSIO_T4_REGDUMP: { struct t4_regdump *regs = (struct t4_regdump *)data; int reglen = t4_get_regs_len(sc); uint8_t *buf; if (regs->len < reglen) { regs->len = reglen; /* hint to the caller */ return (ENOBUFS); } regs->len = reglen; buf = malloc(reglen, M_CXGBE, M_WAITOK | M_ZERO); get_regs(sc, regs, buf); rc = copyout(buf, regs->data, reglen); free(buf, M_CXGBE); break; } case CHELSIO_T4_GET_FILTER_MODE: rc = get_filter_mode(sc, (uint32_t *)data); break; case CHELSIO_T4_SET_FILTER_MODE: rc = set_filter_mode(sc, *(uint32_t *)data); break; case CHELSIO_T4_GET_FILTER: rc = get_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_SET_FILTER: rc = set_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_DEL_FILTER: rc = del_filter(sc, (struct t4_filter *)data); break; case CHELSIO_T4_GET_SGE_CONTEXT: rc = get_sge_context(sc, (struct t4_sge_context *)data); break; case CHELSIO_T4_LOAD_FW: rc = load_fw(sc, (struct t4_data *)data); break; case CHELSIO_T4_GET_MEM: rc = read_card_mem(sc, 2, (struct t4_mem_range *)data); break; case CHELSIO_T4_GET_I2C: rc = read_i2c(sc, (struct t4_i2c_data *)data); break; case CHELSIO_T4_CLEAR_STATS: { int i, v, bg_map; u_int port_id = *(uint32_t *)data; struct port_info *pi; struct vi_info *vi; if (port_id >= sc->params.nports) return (EINVAL); pi = sc->port[port_id]; if (pi == NULL) return (EIO); /* MAC stats */ t4_clr_port_stats(sc, pi->tx_chan); pi->tx_parse_error = 0; pi->tnl_cong_drops = 0; mtx_lock(&sc->reg_lock); for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) t4_clr_vi_stats(sc, vi->vin); } bg_map = pi->mps_bg_map; v = 0; /* reuse */ while (bg_map) { i = ffs(bg_map) - 1; t4_write_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v, 1, A_TP_MIB_TNL_CNG_DROP_0 + i); bg_map &= ~(1 << i); } mtx_unlock(&sc->reg_lock); /* * Since this command accepts a port, clear stats for * all VIs on this port. */ for_each_vi(pi, v, vi) { if (vi->flags & VI_INIT_DONE) { struct sge_rxq *rxq; struct sge_txq *txq; struct sge_wrq *wrq; for_each_rxq(vi, i, rxq) { #if defined(INET) || defined(INET6) rxq->lro.lro_queued = 0; rxq->lro.lro_flushed = 0; #endif rxq->rxcsum = 0; rxq->vlan_extraction = 0; } for_each_txq(vi, i, txq) { txq->txcsum = 0; txq->tso_wrs = 0; txq->vlan_insertion = 0; txq->imm_wrs = 0; txq->sgl_wrs = 0; txq->txpkt_wrs = 0; txq->txpkts0_wrs = 0; txq->txpkts1_wrs = 0; txq->txpkts0_pkts = 0; txq->txpkts1_pkts = 0; txq->raw_wrs = 0; mp_ring_reset_stats(txq->r); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) /* nothing to clear for each ofld_rxq */ for_each_ofld_txq(vi, i, wrq) { wrq->tx_wrs_direct = 0; wrq->tx_wrs_copied = 0; } #endif if (IS_MAIN_VI(vi)) { wrq = &sc->sge.ctrlq[pi->port_id]; wrq->tx_wrs_direct = 0; wrq->tx_wrs_copied = 0; } } } break; } case CHELSIO_T4_SCHED_CLASS: rc = t4_set_sched_class(sc, (struct t4_sched_params *)data); break; case CHELSIO_T4_SCHED_QUEUE: rc = t4_set_sched_queue(sc, (struct t4_sched_queue *)data); break; case CHELSIO_T4_GET_TRACER: rc = t4_get_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_SET_TRACER: rc = t4_set_tracer(sc, (struct t4_tracer *)data); break; case CHELSIO_T4_LOAD_CFG: rc = load_cfg(sc, (struct t4_data *)data); break; case CHELSIO_T4_LOAD_BOOT: rc = load_boot(sc, (struct t4_bootrom *)data); break; case CHELSIO_T4_LOAD_BOOTCFG: rc = load_bootcfg(sc, (struct t4_data *)data); break; case CHELSIO_T4_CUDBG_DUMP: rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data); break; case CHELSIO_T4_SET_OFLD_POLICY: rc = set_offload_policy(sc, (struct t4_offload_policy *)data); break; default: rc = ENOTTY; } return (rc); } #ifdef TCP_OFFLOAD static int toe_capability(struct vi_info *vi, int enable) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; ASSERT_SYNCHRONIZED_OP(sc); if (!is_offload(sc)) return (ENODEV); if (enable) { if ((vi->ifp->if_capenable & IFCAP_TOE) != 0) { /* TOE is already enabled. */ return (0); } /* * We need the port's queues around so that we're able to send * and receive CPLs to/from the TOE even if the ifnet for this * port has never been UP'd administratively. */ if (!(vi->flags & VI_INIT_DONE)) { rc = vi_full_init(vi); if (rc) return (rc); } if (!(pi->vi[0].flags & VI_INIT_DONE)) { rc = vi_full_init(&pi->vi[0]); if (rc) return (rc); } if (isset(&sc->offload_map, pi->port_id)) { /* TOE is enabled on another VI of this port. */ pi->uld_vis++; return (0); } if (!uld_active(sc, ULD_TOM)) { rc = t4_activate_uld(sc, ULD_TOM); if (rc == EAGAIN) { log(LOG_WARNING, "You must kldload t4_tom.ko before trying " "to enable TOE on a cxgbe interface.\n"); } if (rc != 0) return (rc); KASSERT(sc->tom_softc != NULL, ("%s: TOM activated but softc NULL", __func__)); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM activated but flag not set", __func__)); } /* Activate iWARP and iSCSI too, if the modules are loaded. */ if (!uld_active(sc, ULD_IWARP)) (void) t4_activate_uld(sc, ULD_IWARP); if (!uld_active(sc, ULD_ISCSI)) (void) t4_activate_uld(sc, ULD_ISCSI); pi->uld_vis++; setbit(&sc->offload_map, pi->port_id); } else { pi->uld_vis--; if (!isset(&sc->offload_map, pi->port_id) || pi->uld_vis > 0) return (0); KASSERT(uld_active(sc, ULD_TOM), ("%s: TOM never initialized?", __func__)); clrbit(&sc->offload_map, pi->port_id); } return (0); } /* * Add an upper layer driver to the global list. */ int t4_register_uld(struct uld_info *ui) { int rc = 0; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u->uld_id == ui->uld_id) { rc = EEXIST; goto done; } } SLIST_INSERT_HEAD(&t4_uld_list, ui, link); ui->refcount = 0; done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_unregister_uld(struct uld_info *ui) { int rc = EINVAL; struct uld_info *u; sx_xlock(&t4_uld_list_lock); SLIST_FOREACH(u, &t4_uld_list, link) { if (u == ui) { if (ui->refcount > 0) { rc = EBUSY; goto done; } SLIST_REMOVE(&t4_uld_list, ui, uld_info, link); rc = 0; goto done; } } done: sx_xunlock(&t4_uld_list_lock); return (rc); } int t4_activate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = EAGAIN; /* kldoad the module with this ULD and try again. */ sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { if (!(sc->flags & FULL_INIT_DONE)) { rc = adapter_full_init(sc); if (rc != 0) break; } rc = ui->activate(sc); if (rc == 0) { setbit(&sc->active_ulds, id); ui->refcount++; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } int t4_deactivate_uld(struct adapter *sc, int id) { int rc; struct uld_info *ui; ASSERT_SYNCHRONIZED_OP(sc); if (id < 0 || id > ULD_MAX) return (EINVAL); rc = ENXIO; sx_slock(&t4_uld_list_lock); SLIST_FOREACH(ui, &t4_uld_list, link) { if (ui->uld_id == id) { rc = ui->deactivate(sc); if (rc == 0) { clrbit(&sc->active_ulds, id); ui->refcount--; } break; } } sx_sunlock(&t4_uld_list_lock); return (rc); } int uld_active(struct adapter *sc, int uld_id) { MPASS(uld_id >= 0 && uld_id <= ULD_MAX); return (isset(&sc->active_ulds, uld_id)); } #endif /* * t = ptr to tunable. * nc = number of CPUs. * c = compiled in default for that tunable. */ static void calculate_nqueues(int *t, int nc, const int c) { int nq; if (*t > 0) return; nq = *t < 0 ? -*t : c; *t = min(nc, nq); } /* * Come up with reasonable defaults for some of the tunables, provided they're * not set by the user (in which case we'll use the values as is). */ static void tweak_tunables(void) { int nc = mp_ncpus; /* our snapshot of the number of CPUs */ if (t4_ntxq < 1) { #ifdef RSS t4_ntxq = rss_getnumbuckets(); #else calculate_nqueues(&t4_ntxq, nc, NTXQ); #endif } calculate_nqueues(&t4_ntxq_vi, nc, NTXQ_VI); if (t4_nrxq < 1) { #ifdef RSS t4_nrxq = rss_getnumbuckets(); #else calculate_nqueues(&t4_nrxq, nc, NRXQ); #endif } calculate_nqueues(&t4_nrxq_vi, nc, NRXQ_VI); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) calculate_nqueues(&t4_nofldtxq, nc, NOFLDTXQ); calculate_nqueues(&t4_nofldtxq_vi, nc, NOFLDTXQ_VI); #endif #ifdef TCP_OFFLOAD calculate_nqueues(&t4_nofldrxq, nc, NOFLDRXQ); calculate_nqueues(&t4_nofldrxq_vi, nc, NOFLDRXQ_VI); if (t4_toecaps_allowed == -1) t4_toecaps_allowed = FW_CAPS_CONFIG_TOE; if (t4_rdmacaps_allowed == -1) { t4_rdmacaps_allowed = FW_CAPS_CONFIG_RDMA_RDDP | FW_CAPS_CONFIG_RDMA_RDMAC; } if (t4_iscsicaps_allowed == -1) { t4_iscsicaps_allowed = FW_CAPS_CONFIG_ISCSI_INITIATOR_PDU | FW_CAPS_CONFIG_ISCSI_TARGET_PDU | FW_CAPS_CONFIG_ISCSI_T10DIF; } if (t4_tmr_idx_ofld < 0 || t4_tmr_idx_ofld >= SGE_NTIMERS) t4_tmr_idx_ofld = TMR_IDX_OFLD; if (t4_pktc_idx_ofld < -1 || t4_pktc_idx_ofld >= SGE_NCOUNTERS) t4_pktc_idx_ofld = PKTC_IDX_OFLD; #else if (t4_toecaps_allowed == -1) t4_toecaps_allowed = 0; if (t4_rdmacaps_allowed == -1) t4_rdmacaps_allowed = 0; if (t4_iscsicaps_allowed == -1) t4_iscsicaps_allowed = 0; #endif #ifdef DEV_NETMAP calculate_nqueues(&t4_nnmtxq_vi, nc, NNMTXQ_VI); calculate_nqueues(&t4_nnmrxq_vi, nc, NNMRXQ_VI); #endif if (t4_tmr_idx < 0 || t4_tmr_idx >= SGE_NTIMERS) t4_tmr_idx = TMR_IDX; if (t4_pktc_idx < -1 || t4_pktc_idx >= SGE_NCOUNTERS) t4_pktc_idx = PKTC_IDX; if (t4_qsize_txq < 128) t4_qsize_txq = 128; if (t4_qsize_rxq < 128) t4_qsize_rxq = 128; while (t4_qsize_rxq & 7) t4_qsize_rxq++; t4_intr_types &= INTR_MSIX | INTR_MSI | INTR_INTX; /* * Number of VIs to create per-port. The first VI is the "main" regular * VI for the port. The rest are additional virtual interfaces on the * same physical port. Note that the main VI does not have native * netmap support but the extra VIs do. * * Limit the number of VIs per port to the number of available * MAC addresses per port. */ if (t4_num_vis < 1) t4_num_vis = 1; if (t4_num_vis > nitems(vi_mac_funcs)) { t4_num_vis = nitems(vi_mac_funcs); printf("cxgbe: number of VIs limited to %d\n", t4_num_vis); } if (pcie_relaxed_ordering < 0 || pcie_relaxed_ordering > 2) { pcie_relaxed_ordering = 1; #if defined(__i386__) || defined(__amd64__) if (cpu_vendor_id == CPU_VENDOR_INTEL) pcie_relaxed_ordering = 0; #endif } } #ifdef DDB static void t4_dump_tcb(struct adapter *sc, int tid) { uint32_t base, i, j, off, pf, reg, save, tcb_addr, win_pos; reg = PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2); save = t4_read_reg(sc, reg); base = sc->memwin[2].mw_base; /* Dump TCB for the tid */ tcb_addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE); tcb_addr += tid * TCB_SIZE; if (is_t4(sc)) { pf = 0; win_pos = tcb_addr & ~0xf; /* start must be 16B aligned */ } else { pf = V_PFNUM(sc->pf); win_pos = tcb_addr & ~0x7f; /* start must be 128B aligned */ } t4_write_reg(sc, reg, win_pos | pf); t4_read_reg(sc, reg); off = tcb_addr - win_pos; for (i = 0; i < 4; i++) { uint32_t buf[8]; for (j = 0; j < 8; j++, off += 4) buf[j] = htonl(t4_read_reg(sc, base + off)); db_printf("%08x %08x %08x %08x %08x %08x %08x %08x\n", buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); } t4_write_reg(sc, reg, save); t4_read_reg(sc, reg); } static void t4_dump_devlog(struct adapter *sc) { struct devlog_params *dparams = &sc->params.devlog; struct fw_devlog_e e; int i, first, j, m, nentries, rc; uint64_t ftstamp = UINT64_MAX; if (dparams->start == 0) { db_printf("devlog params not valid\n"); return; } nentries = dparams->size / sizeof(struct fw_devlog_e); m = fwmtype_to_hwmtype(dparams->memtype); /* Find the first entry. */ first = -1; for (i = 0; i < nentries && !db_pager_quit; i++) { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) break; if (e.timestamp == 0) break; e.timestamp = be64toh(e.timestamp); if (e.timestamp < ftstamp) { ftstamp = e.timestamp; first = i; } } if (first == -1) return; i = first; do { rc = -t4_mem_read(sc, m, dparams->start + i * sizeof(e), sizeof(e), (void *)&e); if (rc != 0) return; if (e.timestamp == 0) return; e.timestamp = be64toh(e.timestamp); e.seqno = be32toh(e.seqno); for (j = 0; j < 8; j++) e.params[j] = be32toh(e.params[j]); db_printf("%10d %15ju %8s %8s ", e.seqno, e.timestamp, (e.level < nitems(devlog_level_strings) ? devlog_level_strings[e.level] : "UNKNOWN"), (e.facility < nitems(devlog_facility_strings) ? devlog_facility_strings[e.facility] : "UNKNOWN")); db_printf(e.fmt, e.params[0], e.params[1], e.params[2], e.params[3], e.params[4], e.params[5], e.params[6], e.params[7]); if (++i == nentries) i = 0; } while (i != first && !db_pager_quit); } static struct command_table db_t4_table = LIST_HEAD_INITIALIZER(db_t4_table); _DB_SET(_show, t4, NULL, db_show_table, 0, &db_t4_table); DB_FUNC(devlog, db_show_devlog, db_t4_table, CS_OWN, NULL) { device_t dev; int t; bool valid; valid = false; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); valid = true; } db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 devlog \n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } t4_dump_devlog(device_get_softc(dev)); } DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL) { device_t dev; int radix, tid, t; bool valid; valid = false; radix = db_radix; db_radix = 10; t = db_read_token(); if (t == tIDENT) { dev = device_lookup_by_name(db_tok_string); t = db_read_token(); if (t == tNUMBER) { tid = db_tok_number; valid = true; } } db_radix = radix; db_skip_to_eol(); if (!valid) { db_printf("usage: show t4 tcb \n"); return; } if (dev == NULL) { db_printf("device not found\n"); return; } if (tid < 0) { db_printf("invalid tid\n"); return; } t4_dump_tcb(device_get_softc(dev), tid); } #endif /* * Borrowed from cesa_prep_aes_key(). * * NB: The crypto engine wants the words in the decryption key in reverse * order. */ void t4_aes_getdeckey(void *dec_key, const void *enc_key, unsigned int kbits) { uint32_t ek[4 * (RIJNDAEL_MAXNR + 1)]; uint32_t *dkey; int i; rijndaelKeySetupEnc(ek, enc_key, kbits); dkey = dec_key; dkey += (kbits / 8) / 4; switch (kbits) { case 128: for (i = 0; i < 4; i++) *--dkey = htobe32(ek[4 * 10 + i]); break; case 192: for (i = 0; i < 2; i++) *--dkey = htobe32(ek[4 * 11 + 2 + i]); for (i = 0; i < 4; i++) *--dkey = htobe32(ek[4 * 12 + i]); break; case 256: for (i = 0; i < 4; i++) *--dkey = htobe32(ek[4 * 13 + i]); for (i = 0; i < 4; i++) *--dkey = htobe32(ek[4 * 14 + i]); break; } MPASS(dkey == dec_key); } static struct sx mlu; /* mod load unload */ SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload"); static int mod_event(module_t mod, int cmd, void *arg) { int rc = 0; static int loaded = 0; switch (cmd) { case MOD_LOAD: sx_xlock(&mlu); if (loaded++ == 0) { t4_sge_modload(); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, t4_filter_rpl, CPL_COOKIE_FILTER); t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl, CPL_COOKIE_FILTER); t4_register_shared_cpl_handler(CPL_ACT_OPEN_RPL, t4_hashfilter_ao_rpl, CPL_COOKIE_HASHFILTER); t4_register_shared_cpl_handler(CPL_SET_TCB_RPL, t4_hashfilter_tcb_rpl, CPL_COOKIE_HASHFILTER); t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, t4_del_hashfilter_rpl, CPL_COOKIE_HASHFILTER); t4_register_cpl_handler(CPL_TRACE_PKT, t4_trace_pkt); t4_register_cpl_handler(CPL_T5_TRACE_PKT, t5_trace_pkt); t4_register_cpl_handler(CPL_SMT_WRITE_RPL, do_smt_write_rpl); sx_init(&t4_list_lock, "T4/T5 adapters"); SLIST_INIT(&t4_list); callout_init(&fatal_callout, 1); #ifdef TCP_OFFLOAD sx_init(&t4_uld_list_lock, "T4/T5 ULDs"); SLIST_INIT(&t4_uld_list); #endif #ifdef INET6 t4_clip_modload(); #endif t4_tracer_modload(); tweak_tunables(); } sx_xunlock(&mlu); break; case MOD_UNLOAD: sx_xlock(&mlu); if (--loaded == 0) { int tries; sx_slock(&t4_list_lock); if (!SLIST_EMPTY(&t4_list)) { rc = EBUSY; sx_sunlock(&t4_list_lock); goto done_unload; } #ifdef TCP_OFFLOAD sx_slock(&t4_uld_list_lock); if (!SLIST_EMPTY(&t4_uld_list)) { rc = EBUSY; sx_sunlock(&t4_uld_list_lock); sx_sunlock(&t4_list_lock); goto done_unload; } #endif tries = 0; while (tries++ < 5 && t4_sge_extfree_refs() != 0) { uprintf("%ju clusters with custom free routine " "still is use.\n", t4_sge_extfree_refs()); pause("t4unload", 2 * hz); } #ifdef TCP_OFFLOAD sx_sunlock(&t4_uld_list_lock); #endif sx_sunlock(&t4_list_lock); if (t4_sge_extfree_refs() == 0) { t4_tracer_modunload(); #ifdef INET6 t4_clip_modunload(); #endif #ifdef TCP_OFFLOAD sx_destroy(&t4_uld_list_lock); #endif sx_destroy(&t4_list_lock); t4_sge_modunload(); loaded = 0; } else { rc = EBUSY; loaded++; /* undo earlier decrement */ } } done_unload: sx_xunlock(&mlu); break; } return (rc); } static devclass_t t4_devclass, t5_devclass, t6_devclass; static devclass_t cxgbe_devclass, cxl_devclass, cc_devclass; static devclass_t vcxgbe_devclass, vcxl_devclass, vcc_devclass; DRIVER_MODULE(t4nex, pci, t4_driver, t4_devclass, mod_event, 0); MODULE_VERSION(t4nex, 1); MODULE_DEPEND(t4nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t4nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t5nex, pci, t5_driver, t5_devclass, mod_event, 0); MODULE_VERSION(t5nex, 1); MODULE_DEPEND(t5nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t5nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(t6nex, pci, t6_driver, t6_devclass, mod_event, 0); MODULE_VERSION(t6nex, 1); MODULE_DEPEND(t6nex, firmware, 1, 1, 1); #ifdef DEV_NETMAP MODULE_DEPEND(t6nex, netmap, 1, 1, 1); #endif /* DEV_NETMAP */ DRIVER_MODULE(cxgbe, t4nex, cxgbe_driver, cxgbe_devclass, 0, 0); MODULE_VERSION(cxgbe, 1); DRIVER_MODULE(cxl, t5nex, cxl_driver, cxl_devclass, 0, 0); MODULE_VERSION(cxl, 1); DRIVER_MODULE(cc, t6nex, cc_driver, cc_devclass, 0, 0); MODULE_VERSION(cc, 1); DRIVER_MODULE(vcxgbe, cxgbe, vcxgbe_driver, vcxgbe_devclass, 0, 0); MODULE_VERSION(vcxgbe, 1); DRIVER_MODULE(vcxl, cxl, vcxl_driver, vcxl_devclass, 0, 0); MODULE_VERSION(vcxl, 1); DRIVER_MODULE(vcc, cc, vcc_driver, vcc_devclass, 0, 0); MODULE_VERSION(vcc, 1); Index: head/sys/dev/cxgbe/t4_sched.c =================================================================== --- head/sys/dev/cxgbe/t4_sched.c (revision 348253) +++ head/sys/dev/cxgbe/t4_sched.c (revision 348254) @@ -1,906 +1,906 @@ /*- * Copyright (c) 2017 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" static int in_range(int val, int lo, int hi) { return (val < 0 || (val <= hi && val >= lo)); } static int set_sched_class_config(struct adapter *sc, int minmax) { int rc; if (minmax < 0) return (EINVAL); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc"); if (rc) return (rc); rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1); end_synchronized_op(sc, 0); return (rc); } static int set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p, int sleep_ok) { int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode; struct port_info *pi; struct tx_cl_rl_params *tc, old; bool check_pktsize = false; if (p->level == SCHED_CLASS_LEVEL_CL_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL; else if (p->level == SCHED_CLASS_LEVEL_CL_WRR) fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR; else if (p->level == SCHED_CLASS_LEVEL_CH_RL) fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL; else return (EINVAL); if (p->level == SCHED_CLASS_LEVEL_CL_RL) { if (p->mode == SCHED_CLASS_MODE_CLASS) fw_mode = FW_SCHED_PARAMS_MODE_CLASS; else if (p->mode == SCHED_CLASS_MODE_FLOW) { check_pktsize = true; fw_mode = FW_SCHED_PARAMS_MODE_FLOW; } else return (EINVAL); } else fw_mode = 0; /* Valid channel must always be provided. */ if (p->channel < 0) return (EINVAL); if (!in_range(p->channel, 0, sc->chip_params->nchan - 1)) return (ERANGE); pi = sc->port[sc->chan_map[p->channel]]; if (pi == NULL) return (ENXIO); MPASS(pi->tx_chan == p->channel); top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */ if (p->level == SCHED_CLASS_LEVEL_CL_RL || p->level == SCHED_CLASS_LEVEL_CH_RL) { /* * Valid rate (mode, unit and values) must be provided. */ if (p->minrate < 0) p->minrate = 0; if (p->maxrate < 0) return (EINVAL); if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) { fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; /* ratemode could be relative (%) or absolute. */ if (p->ratemode == SCHED_CLASS_RATEMODE_REL) { fw_ratemode = FW_SCHED_PARAMS_RATE_REL; /* maxrate is % of port bandwidth. */ if (!in_range(p->minrate, 0, 100) || !in_range(p->maxrate, 0, 100)) { return (ERANGE); } } else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) { fw_ratemode = FW_SCHED_PARAMS_RATE_ABS; /* maxrate is absolute value in kbps. */ if (!in_range(p->minrate, 0, top_speed) || !in_range(p->maxrate, 0, top_speed)) { return (ERANGE); } } else return (EINVAL); } else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) { /* maxrate is the absolute value in pps. */ check_pktsize = true; fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE; } else return (EINVAL); } else { MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR); /* * Valid weight must be provided. */ if (p->weight < 0) return (EINVAL); if (!in_range(p->weight, 1, 99)) return (ERANGE); fw_rateunit = 0; fw_ratemode = 0; } if (p->level == SCHED_CLASS_LEVEL_CL_RL || p->level == SCHED_CLASS_LEVEL_CL_WRR) { /* * Valid scheduling class must be provided. */ if (p->cl < 0) return (EINVAL); if (!in_range(p->cl, 0, sc->chip_params->nsched_cls - 1)) return (ERANGE); } if (check_pktsize) { if (p->pktsize < 0) return (EINVAL); if (!in_range(p->pktsize, 64, pi->vi[0].ifp->if_mtu)) return (ERANGE); } if (p->level == SCHED_CLASS_LEVEL_CL_RL) { tc = &pi->sched_params->cl_rl[p->cl]; mtx_lock(&sc->tc_lock); if (tc->refcount > 0 || tc->flags & (CLRL_SYNC | CLRL_ASYNC)) rc = EBUSY; else { tc->flags |= CLRL_SYNC | CLRL_USER; tc->ratemode = fw_ratemode; tc->rateunit = fw_rateunit; tc->mode = fw_mode; tc->maxrate = p->maxrate; tc->pktsize = p->pktsize; rc = 0; old= *tc; } mtx_unlock(&sc->tc_lock); if (rc != 0) return (rc); } rc = begin_synchronized_op(sc, NULL, sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp"); if (rc != 0) { if (p->level == SCHED_CLASS_LEVEL_CL_RL) { mtx_lock(&sc->tc_lock); *tc = old; mtx_unlock(&sc->tc_lock); } return (rc); } rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level, fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl, p->minrate, p->maxrate, p->weight, p->pktsize, 0, sleep_ok); end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD); if (p->level == SCHED_CLASS_LEVEL_CL_RL) { mtx_lock(&sc->tc_lock); MPASS(tc->flags & CLRL_SYNC); MPASS(tc->flags & CLRL_USER); MPASS(tc->refcount == 0); tc->flags &= ~CLRL_SYNC; if (rc == 0) tc->flags &= ~CLRL_ERR; else tc->flags |= CLRL_ERR; mtx_unlock(&sc->tc_lock); } return (rc); } static void update_tx_sched(void *context, int pending) { int i, j, rc; struct port_info *pi; struct tx_cl_rl_params *tc; struct adapter *sc = context; const int n = sc->chip_params->nsched_cls; mtx_lock(&sc->tc_lock); for_each_port(sc, i) { pi = sc->port[i]; tc = &pi->sched_params->cl_rl[0]; for (j = 0; j < n; j++, tc++) { MPASS(mtx_owned(&sc->tc_lock)); if ((tc->flags & CLRL_ASYNC) == 0) continue; mtx_unlock(&sc->tc_lock); if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4utxs") != 0) { mtx_lock(&sc->tc_lock); continue; } rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit, tc->ratemode, pi->tx_chan, j, 0, tc->maxrate, 0, tc->pktsize, tc->burstsize, 1); end_synchronized_op(sc, 0); mtx_lock(&sc->tc_lock); MPASS(tc->flags & CLRL_ASYNC); tc->flags &= ~CLRL_ASYNC; if (rc == 0) tc->flags &= ~CLRL_ERR; else tc->flags |= CLRL_ERR; } } mtx_unlock(&sc->tc_lock); } int t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p) { if (p->type != SCHED_CLASS_TYPE_PACKET) return (EINVAL); if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG) return (set_sched_class_config(sc, p->u.config.minmax)); if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS) return (set_sched_class_params(sc, &p->u.params, 1)); return (EINVAL); } static int bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx) { struct tx_cl_rl_params *tc0, *tc; int rc, old_idx; uint32_t fw_mnem, fw_class; if (!(txq->eq.flags & EQ_ALLOCATED)) return (EAGAIN); mtx_lock(&sc->tc_lock); if (txq->tc_idx == -2) { rc = EBUSY; /* Another bind/unbind in progress already. */ goto done; } if (idx == txq->tc_idx) { rc = 0; /* No change, nothing to do. */ goto done; } tc0 = &sc->port[txq->eq.tx_chan]->sched_params->cl_rl[0]; if (idx != -1) { /* * Bind to a different class at index idx. */ tc = &tc0[idx]; if (tc->flags & CLRL_ERR) { rc = ENXIO; goto done; } else { /* * Ok to proceed. Place a reference on the new class * while still holding on to the reference on the * previous class, if any. */ tc->refcount++; } } /* Mark as busy before letting go of the lock. */ old_idx = txq->tc_idx; txq->tc_idx = -2; mtx_unlock(&sc->tc_lock); rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq"); if (rc != 0) return (rc); fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) | V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id)); fw_class = idx < 0 ? 0xffffffff : idx; rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem, &fw_class); end_synchronized_op(sc, 0); mtx_lock(&sc->tc_lock); MPASS(txq->tc_idx == -2); if (rc == 0) { /* * Unbind, bind, or bind to a different class succeeded. Remove * the reference on the old traffic class, if any. */ if (old_idx != -1) { tc = &tc0[old_idx]; MPASS(tc->refcount > 0); tc->refcount--; } txq->tc_idx = idx; } else { /* * Unbind, bind, or bind to a different class failed. Remove * the anticipatory reference on the new traffic class, if any. */ if (idx != -1) { tc = &tc0[idx]; MPASS(tc->refcount > 0); tc->refcount--; } txq->tc_idx = old_idx; } done: MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->chip_params->nsched_cls); mtx_unlock(&sc->tc_lock); return (rc); } int t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p) { struct port_info *pi = NULL; struct vi_info *vi; struct sge_txq *txq; int i, rc; if (p->port >= sc->params.nports) return (EINVAL); /* * XXX: cxgbetool allows the user to specify the physical port only. So * we always operate on the main VI. */ pi = sc->port[p->port]; vi = &pi->vi[0]; /* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */ if (!(vi->flags & VI_INIT_DONE)) return (EAGAIN); MPASS(vi->ntxq > 0); if (!in_range(p->queue, 0, vi->ntxq - 1) || !in_range(p->cl, 0, sc->chip_params->nsched_cls - 1)) return (EINVAL); if (p->queue < 0) { /* * Change the scheduling on all the TX queues for the * interface. */ for_each_txq(vi, i, txq) { rc = bind_txq_to_traffic_class(sc, txq, p->cl); if (rc != 0) break; } } else { /* * If op.queue is non-negative, then we're only changing the * scheduling on a single specified TX queue. */ txq = &sc->sge.txq[vi->first_txq + p->queue]; rc = bind_txq_to_traffic_class(sc, txq, p->cl); } return (rc); } int t4_init_tx_sched(struct adapter *sc) { int i, j; const int n = sc->chip_params->nsched_cls; struct port_info *pi; struct tx_cl_rl_params *tc; mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF); TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc); for_each_port(sc, i) { pi = sc->port[i]; pi->sched_params = malloc(sizeof(*pi->sched_params) + n * sizeof(*tc), M_CXGBE, M_ZERO | M_WAITOK); tc = &pi->sched_params->cl_rl[0]; for (j = 0; j < n; j++, tc++) { tc->refcount = 0; tc->ratemode = FW_SCHED_PARAMS_RATE_ABS; tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; tc->mode = FW_SCHED_PARAMS_MODE_CLASS; tc->maxrate = 1000 * 1000; /* 1 Gbps. Arbitrary */ if (t4_sched_params_cl_rl_kbps(sc, pi->tx_chan, j, tc->mode, tc->maxrate, tc->pktsize, 1) != 0) tc->flags = CLRL_ERR; } } return (0); } int t4_free_tx_sched(struct adapter *sc) { int i; taskqueue_drain(taskqueue_thread, &sc->tc_task); for_each_port(sc, i) { if (sc->port[i] != NULL) free(sc->port[i]->sched_params, M_CXGBE); } if (mtx_initialized(&sc->tc_lock)) mtx_destroy(&sc->tc_lock); return (0); } void t4_update_tx_sched(struct adapter *sc) { taskqueue_enqueue(taskqueue_thread, &sc->tc_task); } int t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate, int *tc_idx) { int rc = 0, fa = -1, i, pktsize, burstsize; bool update; struct tx_cl_rl_params *tc; struct port_info *pi; MPASS(port_id >= 0 && port_id < sc->params.nports); pi = sc->port[port_id]; if (pi->sched_params->pktsize > 0) pktsize = pi->sched_params->pktsize; else pktsize = pi->vi[0].ifp->if_mtu; if (pi->sched_params->burstsize > 0) burstsize = pi->sched_params->burstsize; else burstsize = pktsize * 4; tc = &pi->sched_params->cl_rl[0]; update = false; mtx_lock(&sc->tc_lock); for (i = 0; i < sc->chip_params->nsched_cls; i++, tc++) { if (fa < 0 && tc->refcount == 0 && !(tc->flags & CLRL_USER)) fa = i; /* first available */ if (tc->ratemode == FW_SCHED_PARAMS_RATE_ABS && tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE && tc->mode == FW_SCHED_PARAMS_MODE_FLOW && tc->maxrate == maxrate && tc->pktsize == pktsize && tc->burstsize == burstsize) { tc->refcount++; *tc_idx = i; if ((tc->flags & (CLRL_ERR | CLRL_ASYNC | CLRL_SYNC)) == CLRL_ERR) { update = true; } goto done; } } /* Not found */ MPASS(i == sc->chip_params->nsched_cls); if (fa != -1) { tc = &pi->sched_params->cl_rl[fa]; tc->refcount = 1; tc->ratemode = FW_SCHED_PARAMS_RATE_ABS; tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE; tc->mode = FW_SCHED_PARAMS_MODE_FLOW; tc->maxrate = maxrate; tc->pktsize = pktsize; tc->burstsize = burstsize; *tc_idx = fa; update = true; } else { *tc_idx = -1; rc = ENOSPC; } done: mtx_unlock(&sc->tc_lock); if (update) { tc->flags |= CLRL_ASYNC; t4_update_tx_sched(sc); } return (rc); } void t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx) { struct tx_cl_rl_params *tc; MPASS(port_id >= 0 && port_id < sc->params.nports); MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); mtx_lock(&sc->tc_lock); tc = &sc->port[port_id]->sched_params->cl_rl[tc_idx]; MPASS(tc->refcount > 0); tc->refcount--; mtx_unlock(&sc->tc_lock); } int sysctl_tc(SYSCTL_HANDLER_ARGS) { struct vi_info *vi = arg1; struct port_info *pi; struct adapter *sc; struct sge_txq *txq; int qidx = arg2, rc, tc_idx; MPASS(qidx >= 0 && qidx < vi->ntxq); pi = vi->pi; sc = pi->adapter; txq = &sc->sge.txq[vi->first_txq + qidx]; tc_idx = txq->tc_idx; rc = sysctl_handle_int(oidp, &tc_idx, 0, req); if (rc != 0 || req->newptr == NULL) return (rc); if (sc->flags & IS_VF) return (EPERM); if (!in_range(tc_idx, 0, sc->chip_params->nsched_cls - 1)) return (EINVAL); return (bind_txq_to_traffic_class(sc, txq, tc_idx)); } int sysctl_tc_params(SYSCTL_HANDLER_ARGS) { struct adapter *sc = arg1; struct tx_cl_rl_params tc; struct sbuf *sb; int i, rc, port_id, mbps, gbps; rc = sysctl_wire_old_buffer(req, 0); if (rc != 0) return (rc); sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req); if (sb == NULL) return (ENOMEM); port_id = arg2 >> 16; MPASS(port_id < sc->params.nports); MPASS(sc->port[port_id] != NULL); i = arg2 & 0xffff; MPASS(i < sc->chip_params->nsched_cls); mtx_lock(&sc->tc_lock); tc = sc->port[port_id]->sched_params->cl_rl[i]; mtx_unlock(&sc->tc_lock); switch (tc.rateunit) { case SCHED_CLASS_RATEUNIT_BITS: switch (tc.ratemode) { case SCHED_CLASS_RATEMODE_REL: /* XXX: top speed or actual link speed? */ gbps = port_top_speed(sc->port[port_id]); sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps); break; case SCHED_CLASS_RATEMODE_ABS: mbps = tc.maxrate / 1000; gbps = tc.maxrate / 1000000; if (tc.maxrate == gbps * 1000000) sbuf_printf(sb, "%uGbps", gbps); else if (tc.maxrate == mbps * 1000) sbuf_printf(sb, "%uMbps", mbps); else sbuf_printf(sb, "%uKbps", tc.maxrate); break; default: rc = ENXIO; goto done; } break; case SCHED_CLASS_RATEUNIT_PKTS: sbuf_printf(sb, "%upps", tc.maxrate); break; default: rc = ENXIO; goto done; } switch (tc.mode) { case SCHED_CLASS_MODE_CLASS: sbuf_printf(sb, " aggregate"); break; case SCHED_CLASS_MODE_FLOW: sbuf_printf(sb, " per-flow"); if (tc.pktsize > 0) sbuf_printf(sb, " pkt-size %u", tc.pktsize); if (tc.burstsize > 0) sbuf_printf(sb, " burst-size %u", tc.burstsize); break; default: rc = ENXIO; goto done; } done: if (rc == 0) rc = sbuf_finish(sb); sbuf_delete(sb); return (rc); } #ifdef RATELIMIT void t4_init_etid_table(struct adapter *sc) { int i; struct tid_info *t; if (!is_ethoffload(sc)) return; t = &sc->tids; MPASS(t->netids > 0); mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF); t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE, M_ZERO | M_WAITOK); t->efree = t->etid_tab; t->etids_in_use = 0; for (i = 1; i < t->netids; i++) t->etid_tab[i - 1].next = &t->etid_tab[i]; t->etid_tab[t->netids - 1].next = NULL; } void t4_free_etid_table(struct adapter *sc) { struct tid_info *t; if (!is_ethoffload(sc)) return; t = &sc->tids; MPASS(t->netids > 0); free(t->etid_tab, M_CXGBE); t->etid_tab = NULL; if (mtx_initialized(&t->etid_lock)) mtx_destroy(&t->etid_lock); } /* etid services */ static int alloc_etid(struct adapter *, struct cxgbe_snd_tag *); static void free_etid(struct adapter *, int); static int alloc_etid(struct adapter *sc, struct cxgbe_snd_tag *cst) { struct tid_info *t = &sc->tids; int etid = -1; mtx_lock(&t->etid_lock); if (t->efree) { union etid_entry *p = t->efree; etid = p - t->etid_tab + t->etid_base; t->efree = p->next; p->cst = cst; t->etids_in_use++; } mtx_unlock(&t->etid_lock); return (etid); } struct cxgbe_snd_tag * lookup_etid(struct adapter *sc, int etid) { struct tid_info *t = &sc->tids; return (t->etid_tab[etid - t->etid_base].cst); } static void free_etid(struct adapter *sc, int etid) { struct tid_info *t = &sc->tids; union etid_entry *p = &t->etid_tab[etid - t->etid_base]; mtx_lock(&t->etid_lock); p->next = t->efree; t->efree = p; t->etids_in_use--; mtx_unlock(&t->etid_lock); } int cxgbe_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **pt) { int rc, schedcl; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct cxgbe_snd_tag *cst; if (params->hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT) return (ENOTSUP); rc = t4_reserve_cl_rl_kbps(sc, pi->port_id, (params->rate_limit.max_rate * 8ULL / 1000), &schedcl); if (rc != 0) return (rc); MPASS(schedcl >= 0 && schedcl < sc->chip_params->nsched_cls); cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT); if (cst == NULL) { failed: t4_release_cl_rl(sc, pi->port_id, schedcl); return (ENOMEM); } cst->etid = alloc_etid(sc, cst); if (cst->etid < 0) { free(cst, M_CXGBE); goto failed; } mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF); mbufq_init(&cst->pending_tx, INT_MAX); mbufq_init(&cst->pending_fwack, INT_MAX); - cst->com.ifp = ifp; + m_snd_tag_init(&cst->com, ifp); cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF; cst->adapter = sc; cst->port_id = pi->port_id; cst->schedcl = schedcl; cst->max_rate = params->rate_limit.max_rate; cst->tx_credits = sc->params.eo_wr_cred; cst->tx_total = cst->tx_credits; cst->plen = 0; cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); /* * Queues will be selected later when the connection flowid is available. */ *pt = &cst->com; return (0); } /* * Change in parameters, no change in ifp. */ int cxgbe_snd_tag_modify(struct m_snd_tag *mst, union if_snd_tag_modify_params *params) { int rc, schedcl; struct cxgbe_snd_tag *cst = mst_to_cst(mst); struct adapter *sc = cst->adapter; /* XXX: is schedcl -1 ok here? */ MPASS(cst->schedcl >= 0 && cst->schedcl < sc->chip_params->nsched_cls); mtx_lock(&cst->lock); MPASS(cst->flags & EO_SND_TAG_REF); rc = t4_reserve_cl_rl_kbps(sc, cst->port_id, (params->rate_limit.max_rate * 8ULL / 1000), &schedcl); if (rc != 0) return (rc); MPASS(schedcl >= 0 && schedcl < sc->chip_params->nsched_cls); t4_release_cl_rl(sc, cst->port_id, cst->schedcl); cst->schedcl = schedcl; cst->max_rate = params->rate_limit.max_rate; mtx_unlock(&cst->lock); return (0); } int cxgbe_snd_tag_query(struct m_snd_tag *mst, union if_snd_tag_query_params *params) { struct cxgbe_snd_tag *cst = mst_to_cst(mst); params->rate_limit.max_rate = cst->max_rate; #define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total) params->rate_limit.queue_level = (cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE; return (0); } /* * Unlocks cst and frees it. */ void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *cst) { struct adapter *sc = cst->adapter; mtx_assert(&cst->lock, MA_OWNED); MPASS((cst->flags & EO_SND_TAG_REF) == 0); MPASS(cst->tx_credits == cst->tx_total); MPASS(cst->plen == 0); MPASS(mbufq_first(&cst->pending_tx) == NULL); MPASS(mbufq_first(&cst->pending_fwack) == NULL); if (cst->etid >= 0) free_etid(sc, cst->etid); if (cst->schedcl != -1) t4_release_cl_rl(sc, cst->port_id, cst->schedcl); mtx_unlock(&cst->lock); mtx_destroy(&cst->lock); free(cst, M_CXGBE); } void cxgbe_snd_tag_free(struct m_snd_tag *mst) { struct cxgbe_snd_tag *cst = mst_to_cst(mst); mtx_lock(&cst->lock); /* The kernel is done with the snd_tag. Remove its reference. */ MPASS(cst->flags & EO_SND_TAG_REF); cst->flags &= ~EO_SND_TAG_REF; if (cst->ncompl == 0) { /* * No fw4_ack in flight. Free the tag right away if there are * no outstanding credits. Request the firmware to return all * credits for the etid otherwise. */ if (cst->tx_credits == cst->tx_total) { cxgbe_snd_tag_free_locked(cst); return; /* cst is gone. */ } send_etid_flush_wr(cst); } mtx_unlock(&cst->lock); } #endif Index: head/sys/dev/cxgbe/t4_sge.c =================================================================== --- head/sys/dev/cxgbe/t4_sge.c (revision 348253) +++ head/sys/dev/cxgbe/t4_sge.c (revision 348254) @@ -1,6054 +1,6087 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Chelsio Communications, Inc. * All rights reserved. * Written by: Navdeep Parhar * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_NETMAP #include #include #include #include #include #endif #include "common/common.h" #include "common/t4_regs.h" #include "common/t4_regs_values.h" #include "common/t4_msg.h" #include "t4_l2t.h" #include "t4_mp_ring.h" #ifdef T4_PKT_TIMESTAMP #define RX_COPY_THRESHOLD (MINCLSIZE - 8) #else #define RX_COPY_THRESHOLD MINCLSIZE #endif /* Internal mbuf flags stored in PH_loc.eight[1]. */ #define MC_RAW_WR 0x02 /* * Ethernet frames are DMA'd at this byte offset into the freelist buffer. * 0-7 are valid values. */ static int fl_pktshift = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pktshift, CTLFLAG_RDTUN, &fl_pktshift, 0, "payload DMA offset in rx buffer (bytes)"); /* * Pad ethernet payload up to this boundary. * -1: driver should figure out a good value. * 0: disable padding. * Any power of 2 from 32 to 4096 (both inclusive) is also a valid value. */ int fl_pad = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pad, CTLFLAG_RDTUN, &fl_pad, 0, "payload pad boundary (bytes)"); /* * Status page length. * -1: driver should figure out a good value. * 64 or 128 are the only other valid values. */ static int spg_len = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, spg_len, CTLFLAG_RDTUN, &spg_len, 0, "status page size (bytes)"); /* * Congestion drops. * -1: no congestion feedback (not recommended). * 0: backpressure the channel instead of dropping packets right away. * 1: no backpressure, drop packets for the congested queue immediately. */ static int cong_drop = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, cong_drop, CTLFLAG_RDTUN, &cong_drop, 0, "Congestion control for RX queues (0 = backpressure, 1 = drop"); /* * Deliver multiple frames in the same free list buffer if they fit. * -1: let the driver decide whether to enable buffer packing or not. * 0: disable buffer packing. * 1: enable buffer packing. */ static int buffer_packing = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, buffer_packing, CTLFLAG_RDTUN, &buffer_packing, 0, "Enable buffer packing"); /* * Start next frame in a packed buffer at this boundary. * -1: driver should figure out a good value. * T4: driver will ignore this and use the same value as fl_pad above. * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. */ static int fl_pack = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, fl_pack, CTLFLAG_RDTUN, &fl_pack, 0, "payload pack boundary (bytes)"); /* * Allow the driver to create mbuf(s) in a cluster allocated for rx. * 0: never; always allocate mbufs from the zone_mbuf UMA zone. * 1: ok to create mbuf(s) within a cluster if there is room. */ static int allow_mbufs_in_cluster = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, allow_mbufs_in_cluster, CTLFLAG_RDTUN, &allow_mbufs_in_cluster, 0, "Allow driver to create mbufs within a rx cluster"); /* * Largest rx cluster size that the driver is allowed to allocate. */ static int largest_rx_cluster = MJUM16BYTES; SYSCTL_INT(_hw_cxgbe, OID_AUTO, largest_rx_cluster, CTLFLAG_RDTUN, &largest_rx_cluster, 0, "Largest rx cluster (bytes)"); /* * Size of cluster allocation that's most likely to succeed. The driver will * fall back to this size if it fails to allocate clusters larger than this. */ static int safest_rx_cluster = PAGE_SIZE; SYSCTL_INT(_hw_cxgbe, OID_AUTO, safest_rx_cluster, CTLFLAG_RDTUN, &safest_rx_cluster, 0, "Safe rx cluster (bytes)"); #ifdef RATELIMIT /* * Knob to control TCP timestamp rewriting, and the granularity of the tick used * for rewriting. -1 and 0-3 are all valid values. * -1: hardware should leave the TCP timestamps alone. * 0: 1ms * 1: 100us * 2: 10us * 3: 1us */ static int tsclk = -1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tsclk, CTLFLAG_RDTUN, &tsclk, 0, "Control TCP timestamp rewriting when using pacing"); static int eo_max_backlog = 1024 * 1024; SYSCTL_INT(_hw_cxgbe, OID_AUTO, eo_max_backlog, CTLFLAG_RDTUN, &eo_max_backlog, 0, "Maximum backlog of ratelimited data per flow"); #endif /* * The interrupt holdoff timers are multiplied by this value on T6+. * 1 and 3-17 (both inclusive) are legal values. */ static int tscale = 1; SYSCTL_INT(_hw_cxgbe, OID_AUTO, tscale, CTLFLAG_RDTUN, &tscale, 0, "Interrupt holdoff timer scale on T6+"); /* * Number of LRO entries in the lro_ctrl structure per rx queue. */ static int lro_entries = TCP_LRO_ENTRIES; SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_entries, CTLFLAG_RDTUN, &lro_entries, 0, "Number of LRO entries per RX queue"); /* * This enables presorting of frames before they're fed into tcp_lro_rx. */ static int lro_mbufs = 0; SYSCTL_INT(_hw_cxgbe, OID_AUTO, lro_mbufs, CTLFLAG_RDTUN, &lro_mbufs, 0, "Enable presorting of LRO frames"); struct txpkts { u_int wr_type; /* type 0 or type 1 */ u_int npkt; /* # of packets in this work request */ u_int plen; /* total payload (sum of all packets) */ u_int len16; /* # of 16B pieces used by this work request */ }; /* A packet's SGL. This + m_pkthdr has all info needed for tx */ struct sgl { struct sglist sg; struct sglist_seg seg[TX_SGL_SEGS]; }; static int service_iq(struct sge_iq *, int); static int service_iq_fl(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int); static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *); static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t, uint16_t, char *); static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, bus_addr_t *, void **); static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t, void *); static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *, int, int); static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *); static void add_iq_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_iq *); static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *, struct sysctl_oid *, struct sge_fl *); static int alloc_fwq(struct adapter *); static int free_fwq(struct adapter *); static int alloc_ctrlq(struct adapter *, struct sge_wrq *, int, struct sysctl_oid *); static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int, struct sysctl_oid *); static int free_rxq(struct vi_info *, struct sge_rxq *); #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int, struct sysctl_oid *); static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *); #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int, struct sysctl_oid *); static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *); static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int, struct sysctl_oid *); static int free_nm_txq(struct vi_info *, struct sge_nm_txq *); #endif static int ctrl_eq_alloc(struct adapter *, struct sge_eq *); static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #if defined(TCP_OFFLOAD) || defined(RATELIMIT) static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *); #endif static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *); static int free_eq(struct adapter *, struct sge_eq *); static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *, struct sysctl_oid *); static int free_wrq(struct adapter *, struct sge_wrq *); static int alloc_txq(struct vi_info *, struct sge_txq *, int, struct sysctl_oid *); static int free_txq(struct vi_info *, struct sge_txq *); static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int); static inline void ring_fl_db(struct adapter *, struct sge_fl *); static int refill_fl(struct adapter *, struct sge_fl *, int); static void refill_sfl(void *); static int alloc_fl_sdesc(struct sge_fl *); static void free_fl_sdesc(struct adapter *, struct sge_fl *); static void find_best_refill_source(struct adapter *, struct sge_fl *, int); static void find_safe_refill_source(struct adapter *, struct sge_fl *); static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static inline void get_pkt_gl(struct mbuf *, struct sglist *); static inline u_int txpkt_len16(u_int, u_int); static inline u_int txpkt_vm_len16(u_int, u_int); static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int); static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *, struct mbuf *, u_int); static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *, struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int); static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int); static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int); static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *, struct mbuf *, const struct txpkts *, u_int); static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int); static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int); static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int); static inline uint16_t read_hw_cidx(struct sge_eq *); static inline u_int reclaimable_tx_desc(struct sge_eq *); static inline u_int total_available_tx_desc(struct sge_eq *); static u_int reclaim_tx_descs(struct sge_txq *, u_int); static void tx_reclaim(void *, int); static __be64 get_flit(struct sglist_seg *, int, int); static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *, struct mbuf *); static int handle_fw_msg(struct sge_iq *, const struct rss_header *, struct mbuf *); static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *); static void wrq_tx_drain(void *, int); static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *); static int sysctl_uint16(SYSCTL_HANDLER_ARGS); static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS); #ifdef RATELIMIT static inline u_int txpkt_eo_len16(u_int, u_int, u_int); static int ethofld_fw4_ack(struct sge_iq *, const struct rss_header *, struct mbuf *); #endif static counter_u64_t extfree_refs; static counter_u64_t extfree_rels; an_handler_t t4_an_handler; fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES]; cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS]; cpl_handler_t set_tcb_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t l2t_write_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t act_open_rpl_handlers[NUM_CPL_COOKIES]; cpl_handler_t abort_rpl_rss_handlers[NUM_CPL_COOKIES]; cpl_handler_t fw4_ack_handlers[NUM_CPL_COOKIES]; void t4_register_an_handler(an_handler_t h) { uintptr_t *loc; MPASS(h == NULL || t4_an_handler == NULL); loc = (uintptr_t *)&t4_an_handler; atomic_store_rel_ptr(loc, (uintptr_t)h); } void t4_register_fw_msg_handler(int type, fw_msg_handler_t h) { uintptr_t *loc; MPASS(type < nitems(t4_fw_msg_handler)); MPASS(h == NULL || t4_fw_msg_handler[type] == NULL); /* * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL * handler dispatch table. Reject any attempt to install a handler for * this subtype. */ MPASS(type != FW_TYPE_RSSCPL); MPASS(type != FW6_TYPE_RSSCPL); loc = (uintptr_t *)&t4_fw_msg_handler[type]; atomic_store_rel_ptr(loc, (uintptr_t)h); } void t4_register_cpl_handler(int opcode, cpl_handler_t h) { uintptr_t *loc; MPASS(opcode < nitems(t4_cpl_handler)); MPASS(h == NULL || t4_cpl_handler[opcode] == NULL); loc = (uintptr_t *)&t4_cpl_handler[opcode]; atomic_store_rel_ptr(loc, (uintptr_t)h); } static int set_tcb_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_set_tcb_rpl *cpl = (const void *)(rss + 1); u_int tid; int cookie; MPASS(m == NULL); tid = GET_TID(cpl); if (is_hpftid(iq->adapter, tid) || is_ftid(iq->adapter, tid)) { /* * The return code for filter-write is put in the CPL cookie so * we have to rely on the hardware tid (is_ftid) to determine * that this is a response to a filter. */ cookie = CPL_COOKIE_FILTER; } else { cookie = G_COOKIE(cpl->cookie); } MPASS(cookie > CPL_COOKIE_RESERVED); MPASS(cookie < nitems(set_tcb_rpl_handlers)); return (set_tcb_rpl_handlers[cookie](iq, rss, m)); } static int l2t_write_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1); unsigned int cookie; MPASS(m == NULL); cookie = GET_TID(rpl) & F_SYNC_WR ? CPL_COOKIE_TOM : CPL_COOKIE_FILTER; return (l2t_write_rpl_handlers[cookie](iq, rss, m)); } static int act_open_rpl_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_act_open_rpl *cpl = (const void *)(rss + 1); u_int cookie = G_TID_COOKIE(G_AOPEN_ATID(be32toh(cpl->atid_status))); MPASS(m == NULL); MPASS(cookie != CPL_COOKIE_RESERVED); return (act_open_rpl_handlers[cookie](iq, rss, m)); } static int abort_rpl_rss_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; u_int cookie; MPASS(m == NULL); if (is_hashfilter(sc)) cookie = CPL_COOKIE_HASHFILTER; else cookie = CPL_COOKIE_TOM; return (abort_rpl_rss_handlers[cookie](iq, rss, m)); } static int fw4_ack_handler(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); u_int cookie; MPASS(m == NULL); if (is_etid(sc, tid)) cookie = CPL_COOKIE_ETHOFLD; else cookie = CPL_COOKIE_TOM; return (fw4_ack_handlers[cookie](iq, rss, m)); } static void t4_init_shared_cpl_handlers(void) { t4_register_cpl_handler(CPL_SET_TCB_RPL, set_tcb_rpl_handler); t4_register_cpl_handler(CPL_L2T_WRITE_RPL, l2t_write_rpl_handler); t4_register_cpl_handler(CPL_ACT_OPEN_RPL, act_open_rpl_handler); t4_register_cpl_handler(CPL_ABORT_RPL_RSS, abort_rpl_rss_handler); t4_register_cpl_handler(CPL_FW4_ACK, fw4_ack_handler); } void t4_register_shared_cpl_handler(int opcode, cpl_handler_t h, int cookie) { uintptr_t *loc; MPASS(opcode < nitems(t4_cpl_handler)); MPASS(cookie > CPL_COOKIE_RESERVED); MPASS(cookie < NUM_CPL_COOKIES); MPASS(t4_cpl_handler[opcode] != NULL); switch (opcode) { case CPL_SET_TCB_RPL: loc = (uintptr_t *)&set_tcb_rpl_handlers[cookie]; break; case CPL_L2T_WRITE_RPL: loc = (uintptr_t *)&l2t_write_rpl_handlers[cookie]; break; case CPL_ACT_OPEN_RPL: loc = (uintptr_t *)&act_open_rpl_handlers[cookie]; break; case CPL_ABORT_RPL_RSS: loc = (uintptr_t *)&abort_rpl_rss_handlers[cookie]; break; case CPL_FW4_ACK: loc = (uintptr_t *)&fw4_ack_handlers[cookie]; break; default: MPASS(0); return; } MPASS(h == NULL || *loc == (uintptr_t)NULL); atomic_store_rel_ptr(loc, (uintptr_t)h); } /* * Called on MOD_LOAD. Validates and calculates the SGE tunables. */ void t4_sge_modload(void) { if (fl_pktshift < 0 || fl_pktshift > 7) { printf("Invalid hw.cxgbe.fl_pktshift value (%d)," " using 0 instead.\n", fl_pktshift); fl_pktshift = 0; } if (spg_len != 64 && spg_len != 128) { int len; #if defined(__i386__) || defined(__amd64__) len = cpu_clflush_line_size > 64 ? 128 : 64; #else len = 64; #endif if (spg_len != -1) { printf("Invalid hw.cxgbe.spg_len value (%d)," " using %d instead.\n", spg_len, len); } spg_len = len; } if (cong_drop < -1 || cong_drop > 1) { printf("Invalid hw.cxgbe.cong_drop value (%d)," " using 0 instead.\n", cong_drop); cong_drop = 0; } if (tscale != 1 && (tscale < 3 || tscale > 17)) { printf("Invalid hw.cxgbe.tscale value (%d)," " using 1 instead.\n", tscale); tscale = 1; } extfree_refs = counter_u64_alloc(M_WAITOK); extfree_rels = counter_u64_alloc(M_WAITOK); counter_u64_zero(extfree_refs); counter_u64_zero(extfree_rels); t4_init_shared_cpl_handlers(); t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg); t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg); t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update); t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx); #ifdef RATELIMIT t4_register_shared_cpl_handler(CPL_FW4_ACK, ethofld_fw4_ack, CPL_COOKIE_ETHOFLD); #endif t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl); t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl); } void t4_sge_modunload(void) { counter_u64_free(extfree_refs); counter_u64_free(extfree_rels); } uint64_t t4_sge_extfree_refs(void) { uint64_t refs, rels; rels = counter_u64_fetch(extfree_rels); refs = counter_u64_fetch(extfree_refs); return (refs - rels); } static inline void setup_pad_and_pack_boundaries(struct adapter *sc) { uint32_t v, m; int pad, pack, pad_shift; pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT : X_INGPADBOUNDARY_SHIFT; pad = fl_pad; if (fl_pad < (1 << pad_shift) || fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) || !powerof2(fl_pad)) { /* * If there is any chance that we might use buffer packing and * the chip is a T4, then pick 64 as the pad/pack boundary. Set * it to the minimum allowed in all other cases. */ pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift; /* * For fl_pad = 0 we'll still write a reasonable value to the * register but all the freelists will opt out of padding. * We'll complain here only if the user tried to set it to a * value greater than 0 that was invalid. */ if (fl_pad > 0) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value" " (%d), using %d instead.\n", fl_pad, pad); } } m = V_INGPADBOUNDARY(M_INGPADBOUNDARY); v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); if (is_t4(sc)) { if (fl_pack != -1 && fl_pack != pad) { /* Complain but carry on. */ device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored," " using %d instead.\n", fl_pack, pad); } return; } pack = fl_pack; if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || !powerof2(fl_pack)) { pack = max(sc->params.pci.mps, CACHE_LINE_SIZE); MPASS(powerof2(pack)); if (pack < 16) pack = 16; if (pack == 32) pack = 64; if (pack > 4096) pack = 4096; if (fl_pack != -1) { device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value" " (%d), using %d instead.\n", fl_pack, pack); } } m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); if (pack == 16) v = V_INGPACKBOUNDARY(0); else v = V_INGPACKBOUNDARY(ilog2(pack) - 5); MPASS(!is_t4(sc)); /* T4 doesn't have SGE_CONTROL2 */ t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); } /* * adap->params.vpd.cclk must be set up before this is called. */ void t4_tweak_chip_settings(struct adapter *sc) { int i; uint32_t v, m; int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200}; int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk; int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */ uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sge_flbuf_sizes[] = { MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, MJUMPAGESIZE - CL_METADATA_SIZE, MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE, #endif MJUM9BYTES, MJUM16BYTES, MCLBYTES - MSIZE - CL_METADATA_SIZE, MJUM9BYTES - CL_METADATA_SIZE, MJUM16BYTES - CL_METADATA_SIZE, }; KASSERT(sc->flags & MASTER_PF, ("%s: trying to change chip settings when not master.", __func__)); m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); t4_set_reg_field(sc, A_SGE_CONTROL, m, v); setup_pad_and_pack_boundaries(sc); v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10); t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v); KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES, ("%s: hw buffer size table too big", __func__)); t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0, 4096); t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE1, 65536); for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) { t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE15 - (4 * i), sge_flbuf_sizes[i]); } v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) | V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]); t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v); KASSERT(intr_timer[0] <= timer_max, ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0], timer_max)); for (i = 1; i < nitems(intr_timer); i++) { KASSERT(intr_timer[i] >= intr_timer[i - 1], ("%s: timers not listed in increasing order (%d)", __func__, i)); while (intr_timer[i] > timer_max) { if (i == nitems(intr_timer) - 1) { intr_timer[i] = timer_max; break; } intr_timer[i] += intr_timer[i - 1]; intr_timer[i] /= 2; } } v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) | V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1])); t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v); v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) | V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3])); t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v); v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) | V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5])); t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v); if (chip_id(sc) >= CHELSIO_T6) { m = V_TSCALE(M_TSCALE); if (tscale == 1) v = 0; else v = V_TSCALE(tscale - 2); t4_set_reg_field(sc, A_SGE_ITP_CONTROL, m, v); if (sc->debug_flags & DF_DISABLE_TCB_CACHE) { m = V_RDTHRESHOLD(M_RDTHRESHOLD) | F_WRTHRTHRESHEN | V_WRTHRTHRESH(M_WRTHRTHRESH); t4_tp_pio_read(sc, &v, 1, A_TP_CMM_CONFIG, 1); v &= ~m; v |= V_RDTHRESHOLD(1) | F_WRTHRTHRESHEN | V_WRTHRTHRESH(16); t4_tp_pio_write(sc, &v, 1, A_TP_CMM_CONFIG, 1); } } /* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */ v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v); /* * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP. These have been * chosen with MAXPHYS = 128K in mind. The largest DDP buffer that we * may have to deal with is MAXPHYS + 1 page. */ v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4); t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v); /* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */ m = v = F_TDDPTAGTCB | F_ISCSITAGTCB; t4_set_reg_field(sc, A_ULP_RX_CTL, m, v); m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; t4_set_reg_field(sc, A_TP_PARA_REG5, m, v); } /* * SGE wants the buffer to be at least 64B and then a multiple of 16. If * padding is in use, the buffer's start and end need to be aligned to the pad * boundary as well. We'll just make sure that the size is a multiple of the * boundary here, it is up to the buffer allocation code to make sure the start * of the buffer is aligned as well. */ static inline int hwsz_ok(struct adapter *sc, int hwsz) { int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1; return (hwsz >= 64 && (hwsz & mask) == 0); } /* * XXX: driver really should be able to deal with unexpected settings. */ int t4_read_chip_settings(struct adapter *sc) { struct sge *s = &sc->sge; struct sge_params *sp = &sc->params.sge; int i, j, n, rc = 0; uint32_t m, v, r; uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE); static int sw_buf_sizes[] = { /* Sorted by size */ MCLBYTES, #if MJUMPAGESIZE != MCLBYTES MJUMPAGESIZE, #endif MJUM9BYTES, MJUM16BYTES }; struct sw_zone_info *swz, *safe_swz; struct hw_buf_info *hwb; m = F_RXPKTCPLMODE; v = F_RXPKTCPLMODE; r = sc->params.sge.sge_control; if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); rc = EINVAL; } /* * If this changes then every single use of PAGE_SHIFT in the driver * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift. */ if (sp->page_shift != PAGE_SHIFT) { device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r); rc = EINVAL; } /* Filter out unusable hw buffer sizes entirely (mark with -2). */ hwb = &s->hw_buf_info[0]; for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) { r = sc->params.sge.sge_fl_buffer_size[i]; hwb->size = r; hwb->zidx = hwsz_ok(sc, r) ? -1 : -2; hwb->next = -1; } /* * Create a sorted list in decreasing order of hw buffer sizes (and so * increasing order of spare area) for each software zone. * * If padding is enabled then the start and end of the buffer must align * to the pad boundary; if packing is enabled then they must align with * the pack boundary as well. Allocations from the cluster zones are * aligned to min(size, 4K), so the buffer starts at that alignment and * ends at hwb->size alignment. If mbuf inlining is allowed the * starting alignment will be reduced to MSIZE and the driver will * exercise appropriate caution when deciding on the best buffer layout * to use. */ n = 0; /* no usable buffer size to begin with */ swz = &s->sw_zone_info[0]; safe_swz = NULL; for (i = 0; i < SW_ZONE_SIZES; i++, swz++) { int8_t head = -1, tail = -1; swz->size = sw_buf_sizes[i]; swz->zone = m_getzone(swz->size); swz->type = m_gettype(swz->size); if (swz->size < PAGE_SIZE) { MPASS(powerof2(swz->size)); if (fl_pad && (swz->size % sp->pad_boundary != 0)) continue; } if (swz->size == safest_rx_cluster) safe_swz = swz; hwb = &s->hw_buf_info[0]; for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) { if (hwb->zidx != -1 || hwb->size > swz->size) continue; #ifdef INVARIANTS if (fl_pad) MPASS(hwb->size % sp->pad_boundary == 0); #endif hwb->zidx = i; if (head == -1) head = tail = j; else if (hwb->size < s->hw_buf_info[tail].size) { s->hw_buf_info[tail].next = j; tail = j; } else { int8_t *cur; struct hw_buf_info *t; for (cur = &head; *cur != -1; cur = &t->next) { t = &s->hw_buf_info[*cur]; if (hwb->size == t->size) { hwb->zidx = -2; break; } if (hwb->size > t->size) { hwb->next = *cur; *cur = j; break; } } } } swz->head_hwidx = head; swz->tail_hwidx = tail; if (tail != -1) { n++; if (swz->size - s->hw_buf_info[tail].size >= CL_METADATA_SIZE) sc->flags |= BUF_PACKING_OK; } } if (n == 0) { device_printf(sc->dev, "no usable SGE FL buffer size.\n"); rc = EINVAL; } s->safe_hwidx1 = -1; s->safe_hwidx2 = -1; if (safe_swz != NULL) { s->safe_hwidx1 = safe_swz->head_hwidx; for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) { int spare; hwb = &s->hw_buf_info[i]; #ifdef INVARIANTS if (fl_pad) MPASS(hwb->size % sp->pad_boundary == 0); #endif spare = safe_swz->size - hwb->size; if (spare >= CL_METADATA_SIZE) { s->safe_hwidx2 = i; break; } } } if (sc->flags & IS_VF) return (0); v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6); r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ); if (r != v) { device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r); rc = EINVAL; } m = v = F_TDDPTAGTCB; r = t4_read_reg(sc, A_ULP_RX_CTL); if ((r & m) != v) { device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r); rc = EINVAL; } m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET; r = t4_read_reg(sc, A_TP_PARA_REG5); if ((r & m) != v) { device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r); rc = EINVAL; } t4_init_tp_params(sc, 1); t4_read_mtu_tbl(sc, sc->params.mtus, NULL); t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd); return (rc); } int t4_create_dma_tag(struct adapter *sc) { int rc; rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE, BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmat); if (rc != 0) { device_printf(sc->dev, "failed to create main DMA tag: %d\n", rc); } return (rc); } void t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) { struct sge_params *sp = &sc->params.sge; SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes", CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A", "freelist buffer sizes"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD, NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, NULL, sp->pad_boundary, "payload pad boundary (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, NULL, sp->spg_len, "status page size (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, NULL, cong_drop, "congestion drop setting"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, NULL, sp->pack_boundary, "payload pack boundary (bytes)"); } int t4_destroy_dma_tag(struct adapter *sc) { if (sc->dmat) bus_dma_tag_destroy(sc->dmat); return (0); } /* * Allocate and initialize the firmware event queue, control queues, and special * purpose rx queues owned by the adapter. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. */ int t4_setup_adapter_queues(struct adapter *sc) { struct sysctl_oid *oid; struct sysctl_oid_list *children; int rc, i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); sysctl_ctx_init(&sc->ctx); sc->flags |= ADAP_SYSCTL_CTX; /* * Firmware event queue */ rc = alloc_fwq(sc); if (rc != 0) return (rc); /* * That's all for the VF driver. */ if (sc->flags & IS_VF) return (rc); oid = device_get_sysctl_tree(sc->dev); children = SYSCTL_CHILDREN(oid); /* * XXX: General purpose rx queues, one per port. */ /* * Control queues, one per port. */ oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD, NULL, "control queues"); for_each_port(sc, i) { struct sge_wrq *ctrlq = &sc->sge.ctrlq[i]; rc = alloc_ctrlq(sc, ctrlq, i, oid); if (rc != 0) return (rc); } return (rc); } /* * Idempotent */ int t4_teardown_adapter_queues(struct adapter *sc) { int i; ADAPTER_LOCK_ASSERT_NOTOWNED(sc); /* Do this before freeing the queue */ if (sc->flags & ADAP_SYSCTL_CTX) { sysctl_ctx_free(&sc->ctx); sc->flags &= ~ADAP_SYSCTL_CTX; } if (!(sc->flags & IS_VF)) { for_each_port(sc, i) free_wrq(sc, &sc->sge.ctrlq[i]); } free_fwq(sc); return (0); } /* Maximum payload that can be delivered with a single iq descriptor */ static inline int mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) { int payload; #ifdef TCP_OFFLOAD if (toe) { int rxcs = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)); /* Note that COP can set rx_coalesce on/off per connection. */ payload = max(mtu, rxcs); } else { #endif /* large enough even when hw VLAN extraction is disabled */ payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + mtu; #ifdef TCP_OFFLOAD } #endif return (payload); } int t4_setup_vi_queues(struct vi_info *vi) { int rc = 0, i, intr_idx, iqidx; struct sge_rxq *rxq; struct sge_txq *txq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct sge_wrq *ofld_txq; #endif #ifdef DEV_NETMAP int saved_idx; struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif char name[16]; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct ifnet *ifp = vi->ifp; struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); int maxp, mtu = ifp->if_mtu; /* Interrupt vector to start from (when using multiple vectors) */ intr_idx = vi->first_intr; #ifdef DEV_NETMAP saved_idx = intr_idx; if (ifp->if_capabilities & IFCAP_NETMAP) { /* netmap is supported with direct interrupts only. */ MPASS(!forwarding_intr_to_fwq(sc)); /* * We don't have buffers to back the netmap rx queues * right now so we create the queues in a way that * doesn't set off any congestion signal in the chip. */ oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_nm_rxq(vi, i, nm_rxq) { rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq", CTLFLAG_RD, NULL, "tx queues"); for_each_nm_txq(vi, i, nm_txq) { iqidx = vi->first_nm_rxq + (i % vi->nnmrxq); rc = alloc_nm_txq(vi, nm_txq, iqidx, i, oid); if (rc != 0) goto done; } } /* Normal rx queues and netmap rx queues share the same interrupts. */ intr_idx = saved_idx; #endif /* * Allocate rx queues first because a default iqid is required when * creating a tx queue. */ maxp = mtu_to_max_payload(sc, mtu, 0); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); for_each_rxq(vi, i, rxq) { init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq); snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(vi->dev), i); init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name); rc = alloc_rxq(vi, rxq, forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } #ifdef DEV_NETMAP if (ifp->if_capabilities & IFCAP_NETMAP) intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq); #endif #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq", CTLFLAG_RD, NULL, "rx queues for offloaded TCP connections"); for_each_ofld_rxq(vi, i, ofld_rxq) { init_iq(&ofld_rxq->iq, sc, vi->ofld_tmr_idx, vi->ofld_pktc_idx, vi->qsize_rxq); snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(vi->dev), i); init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name); rc = alloc_ofld_rxq(vi, ofld_rxq, forwarding_intr_to_fwq(sc) ? -1 : intr_idx, i, oid); if (rc != 0) goto done; intr_idx++; } #endif /* * Now the tx queues. */ oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD, NULL, "tx queues"); for_each_txq(vi, i, txq) { iqidx = vi->first_rxq + (i % vi->nrxq); snprintf(name, sizeof(name), "%s txq%d", device_get_nameunit(vi->dev), i); init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, sc->sge.rxq[iqidx].iq.cntxt_id, name); rc = alloc_txq(vi, txq, i, oid); if (rc != 0) goto done; } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq", CTLFLAG_RD, NULL, "tx queues for TOE/ETHOFLD"); for_each_ofld_txq(vi, i, ofld_txq) { struct sysctl_oid *oid2; snprintf(name, sizeof(name), "%s ofld_txq%d", device_get_nameunit(vi->dev), i); if (vi->nofldrxq > 0) { iqidx = vi->first_ofld_rxq + (i % vi->nofldrxq); init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, sc->sge.ofld_rxq[iqidx].iq.cntxt_id, name); } else { iqidx = vi->first_rxq + (i % vi->nrxq); init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan, sc->sge.rxq[iqidx].iq.cntxt_id, name); } snprintf(name, sizeof(name), "%d", i); oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO, name, CTLFLAG_RD, NULL, "offload tx queue"); rc = alloc_wrq(sc, vi, ofld_txq, oid2); if (rc != 0) goto done; } #endif done: if (rc) t4_teardown_vi_queues(vi); return (rc); } /* * Idempotent */ int t4_teardown_vi_queues(struct vi_info *vi) { int i; struct sge_rxq *rxq; struct sge_txq *txq; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_wrq *ofld_txq; #endif #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif #ifdef DEV_NETMAP struct sge_nm_rxq *nm_rxq; struct sge_nm_txq *nm_txq; #endif /* Do this before freeing the queues */ if (vi->flags & VI_SYSCTL_CTX) { sysctl_ctx_free(&vi->ctx); vi->flags &= ~VI_SYSCTL_CTX; } #ifdef DEV_NETMAP if (vi->ifp->if_capabilities & IFCAP_NETMAP) { for_each_nm_txq(vi, i, nm_txq) { free_nm_txq(vi, nm_txq); } for_each_nm_rxq(vi, i, nm_rxq) { free_nm_rxq(vi, nm_rxq); } } #endif /* * Take down all the tx queues first, as they reference the rx queues * (for egress updates, etc.). */ for_each_txq(vi, i, txq) { free_txq(vi, txq); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) for_each_ofld_txq(vi, i, ofld_txq) { free_wrq(sc, ofld_txq); } #endif /* * Then take down the rx queues. */ for_each_rxq(vi, i, rxq) { free_rxq(vi, rxq); } #ifdef TCP_OFFLOAD for_each_ofld_rxq(vi, i, ofld_rxq) { free_ofld_rxq(vi, ofld_rxq); } #endif return (0); } /* * Interrupt handler when the driver is using only 1 interrupt. This is a very * unusual scenario. * * a) Deals with errors, if any. * b) Services firmware event queue, which is taking interrupts for all other * queues. */ void t4_intr_all(void *arg) { struct adapter *sc = arg; struct sge_iq *fwq = &sc->sge.fwq; MPASS(sc->intr_count == 1); if (sc->intr_type == INTR_INTX) t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0); t4_intr_err(arg); t4_intr_evt(fwq); } /* * Interrupt handler for errors (installed directly when multiple interrupts are * being used, or called by t4_intr_all). */ void t4_intr_err(void *arg) { struct adapter *sc = arg; uint32_t v; const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0; if (sc->flags & ADAP_ERR) return; v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE)); if (v & F_PFSW) { sc->swintr++; t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v); } t4_slow_intr_handler(sc, verbose); } /* * Interrupt handler for iq-only queues. The firmware event queue is the only * such queue right now. */ void t4_intr_evt(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq(iq, 0); (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } /* * Interrupt handler for iq+fl queues. */ void t4_intr(void *arg) { struct sge_iq *iq = arg; if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) { service_iq_fl(iq, 0); (void) atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE); } } #ifdef DEV_NETMAP /* * Interrupt handler for netmap rx queues. */ void t4_nm_intr(void *arg) { struct sge_nm_rxq *nm_rxq = arg; if (atomic_cmpset_int(&nm_rxq->nm_state, NM_ON, NM_BUSY)) { service_nm_rxq(nm_rxq); (void) atomic_cmpset_int(&nm_rxq->nm_state, NM_BUSY, NM_ON); } } /* * Interrupt handler for vectors shared between NIC and netmap rx queues. */ void t4_vi_intr(void *arg) { struct irq *irq = arg; MPASS(irq->nm_rxq != NULL); t4_nm_intr(irq->nm_rxq); MPASS(irq->rxq != NULL); t4_intr(irq->rxq); } #endif /* * Deals with interrupts on an iq-only (no freelist) queue. */ static int service_iq(struct sge_iq *iq, int budget) { struct sge_iq *q; struct adapter *sc = iq->adapter; struct iq_desc *d = &iq->desc[iq->cidx]; int ndescs = 0, limit; int rsp_type; uint32_t lq; STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql); KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); KASSERT((iq->flags & IQ_HAS_FL) == 0, ("%s: called for iq %p with fl (iq->flags 0x%x)", __func__, iq, iq->flags)); MPASS((iq->flags & IQ_ADJ_CREDIT) == 0); MPASS((iq->flags & IQ_LRO_ENABLED) == 0); limit = budget ? budget : iq->qsize / 16; /* * We always come back and check the descriptor ring for new indirect * interrupts and other responses after running a single handler. */ for (;;) { while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: panic("%s: data for an iq (%p) with no freelist", __func__, iq); /* NOTREACHED */ case X_RSPD_TYPE_CPL: KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, d->rss.opcode)); t4_cpl_handler[d->rss.opcode](iq, &d->rss, NULL); break; case X_RSPD_TYPE_INTR: /* * There are 1K interrupt-capable queues (qids 0 * through 1023). A response type indicating a * forwarded interrupt with a qid >= 1K is an * iWARP async notification. */ if (__predict_true(lq >= 1024)) { t4_an_handler(iq, &d->rsp); break; } q = sc->sge.iqmap[lq - sc->sge.iq_start - sc->sge.iq_base]; if (atomic_cmpset_int(&q->state, IQS_IDLE, IQS_BUSY)) { if (service_iq_fl(q, q->qsize / 16) == 0) { (void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); } else { STAILQ_INSERT_TAIL(&iql, q, link); } } break; default: KASSERT(0, ("%s: illegal response type %d on iq %p", __func__, rsp_type, iq)); log(LOG_ERR, "%s: illegal response type %d on iq %p", device_get_nameunit(sc->dev), rsp_type, iq); break; } d++; if (__predict_false(++iq->cidx == iq->sidx)) { iq->cidx = 0; iq->gen ^= F_RSPD_GEN; d = &iq->desc[0]; } if (__predict_false(++ndescs == limit)) { t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndescs = 0; if (budget) { return (EINPROGRESS); } } } if (STAILQ_EMPTY(&iql)) break; /* * Process the head only, and send it to the back of the list if * it's still not done. */ q = STAILQ_FIRST(&iql); STAILQ_REMOVE_HEAD(&iql, link); if (service_iq_fl(q, q->qsize / 8) == 0) (void) atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE); else STAILQ_INSERT_TAIL(&iql, q, link); } t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); return (0); } static inline int sort_before_lro(struct lro_ctrl *lro) { return (lro->lro_mbuf_max != 0); } static inline uint64_t last_flit_to_ns(struct adapter *sc, uint64_t lf) { uint64_t n = be64toh(lf) & 0xfffffffffffffff; /* 60b, not 64b. */ if (n > UINT64_MAX / 1000000) return (n / sc->params.vpd.cclk * 1000000); else return (n * 1000000 / sc->params.vpd.cclk); } /* * Deals with interrupts on an iq+fl queue. */ static int service_iq_fl(struct sge_iq *iq, int budget) { struct sge_rxq *rxq = iq_to_rxq(iq); struct sge_fl *fl; struct adapter *sc = iq->adapter; struct iq_desc *d = &iq->desc[iq->cidx]; int ndescs = 0, limit; int rsp_type, refill, starved; uint32_t lq; uint16_t fl_hw_cidx; struct mbuf *m0; #if defined(INET) || defined(INET6) const struct timeval lro_timeout = {0, sc->lro_timeout}; struct lro_ctrl *lro = &rxq->lro; #endif KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq)); MPASS(iq->flags & IQ_HAS_FL); limit = budget ? budget : iq->qsize / 16; fl = &rxq->fl; fl_hw_cidx = fl->hw_cidx; /* stable snapshot */ #if defined(INET) || defined(INET6) if (iq->flags & IQ_ADJ_CREDIT) { MPASS(sort_before_lro(lro)); iq->flags &= ~IQ_ADJ_CREDIT; if ((d->rsp.u.type_gen & F_RSPD_GEN) != iq->gen) { tcp_lro_flush_all(lro); t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(1) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); return (0); } ndescs = 1; } #else MPASS((iq->flags & IQ_ADJ_CREDIT) == 0); #endif while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) { rmb(); refill = 0; m0 = NULL; rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen); lq = be32toh(d->rsp.pldbuflen_qid); switch (rsp_type) { case X_RSPD_TYPE_FLBUF: m0 = get_fl_payload(sc, fl, lq); if (__predict_false(m0 == NULL)) goto out; refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2; if (iq->flags & IQ_RX_TIMESTAMP) { /* * Fill up rcv_tstmp but do not set M_TSTMP. * rcv_tstmp is not in the format that the * kernel expects and we don't want to mislead * it. For now this is only for custom code * that knows how to interpret cxgbe's stamp. */ m0->m_pkthdr.rcv_tstmp = last_flit_to_ns(sc, d->rsp.u.last_flit); #ifdef notyet m0->m_flags |= M_TSTMP; #endif } /* fall through */ case X_RSPD_TYPE_CPL: KASSERT(d->rss.opcode < NUM_CPL_CMDS, ("%s: bad opcode %02x.", __func__, d->rss.opcode)); t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0); break; case X_RSPD_TYPE_INTR: /* * There are 1K interrupt-capable queues (qids 0 * through 1023). A response type indicating a * forwarded interrupt with a qid >= 1K is an * iWARP async notification. That is the only * acceptable indirect interrupt on this queue. */ if (__predict_false(lq < 1024)) { panic("%s: indirect interrupt on iq_fl %p " "with qid %u", __func__, iq, lq); } t4_an_handler(iq, &d->rsp); break; default: KASSERT(0, ("%s: illegal response type %d on iq %p", __func__, rsp_type, iq)); log(LOG_ERR, "%s: illegal response type %d on iq %p", device_get_nameunit(sc->dev), rsp_type, iq); break; } d++; if (__predict_false(++iq->cidx == iq->sidx)) { iq->cidx = 0; iq->gen ^= F_RSPD_GEN; d = &iq->desc[0]; } if (__predict_false(++ndescs == limit)) { t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID(iq->cntxt_id) | V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX))); ndescs = 0; #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED && !sort_before_lro(lro) && sc->lro_timeout != 0) { tcp_lro_flush_inactive(lro, &lro_timeout); } #endif if (budget) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); return (EINPROGRESS); } } if (refill) { FL_LOCK(fl); refill_fl(sc, fl, 32); FL_UNLOCK(fl); fl_hw_cidx = fl->hw_cidx; } } out: #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED) { if (ndescs > 0 && lro->lro_mbuf_count > 8) { MPASS(sort_before_lro(lro)); /* hold back one credit and don't flush LRO state */ iq->flags |= IQ_ADJ_CREDIT; ndescs--; } else { tcp_lro_flush_all(lro); } } #endif t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) | V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params)); FL_LOCK(fl); starved = refill_fl(sc, fl, 64); FL_UNLOCK(fl); if (__predict_false(starved != 0)) add_fl_to_sfl(sc, fl); return (0); } static inline int cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll) { int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0; if (rc) MPASS(cll->region3 >= CL_METADATA_SIZE); return (rc); } static inline struct cluster_metadata * cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll, caddr_t cl) { if (cl_has_metadata(fl, cll)) { struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; return ((struct cluster_metadata *)(cl + swz->size) - 1); } return (NULL); } static void rxb_free(struct mbuf *m) { uma_zone_t zone = m->m_ext.ext_arg1; void *cl = m->m_ext.ext_arg2; uma_zfree(zone, cl); counter_u64_add(extfree_rels, 1); } /* * The mbuf returned by this function could be allocated from zone_mbuf or * constructed in spare room in the cluster. * * The mbuf carries the payload in one of these ways * a) frame inside the mbuf (mbuf from zone_mbuf) * b) m_cljset (for clusters without metadata) zone_mbuf * c) m_extaddref (cluster with metadata) inline mbuf * d) m_extaddref (cluster with metadata) zone_mbuf */ static struct mbuf * get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset, int remaining) { struct mbuf *m; struct fl_sdesc *sd = &fl->sdesc[fl->cidx]; struct cluster_layout *cll = &sd->cll; struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx]; struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx]; struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl); int len, blen; caddr_t payload; blen = hwb->size - fl->rx_offset; /* max possible in this buf */ len = min(remaining, blen); payload = sd->cl + cll->region1 + fl->rx_offset; if (fl->flags & FL_BUF_PACKING) { const u_int l = fr_offset + len; const u_int pad = roundup2(l, fl->buf_boundary) - l; if (fl->rx_offset + len + pad < hwb->size) blen = len + pad; MPASS(fl->rx_offset + blen <= hwb->size); } else { MPASS(fl->rx_offset == 0); /* not packing */ } if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { /* * Copy payload into a freshly allocated mbuf. */ m = fr_offset == 0 ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; /* copy data to mbuf */ bcopy(payload, mtod(m, caddr_t), len); } else if (sd->nmbuf * MSIZE < cll->region1) { /* * There's spare room in the cluster for an mbuf. Create one * and associate it with the payload that's in the cluster. */ MPASS(clm != NULL); m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE); /* No bzero required */ if (m_init(m, M_NOWAIT, MT_DATA, fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE)) return (NULL); fl->mbuf_inlined++; m_extaddref(m, payload, blen, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { /* * Grab an mbuf from zone_mbuf and associate it with the * payload in the cluster. */ m = fr_offset == 0 ? m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); fl->mbuf_allocated++; if (clm != NULL) { m_extaddref(m, payload, blen, &clm->refcount, rxb_free, swz->zone, sd->cl); if (sd->nmbuf++ == 0) counter_u64_add(extfree_refs, 1); } else { m_cljset(m, sd->cl, swz->type); sd->cl = NULL; /* consumed, not a recycle candidate */ } } if (fr_offset == 0) m->m_pkthdr.len = remaining; m->m_len = len; if (fl->flags & FL_BUF_PACKING) { fl->rx_offset += blen; MPASS(fl->rx_offset <= hwb->size); if (fl->rx_offset < hwb->size) return (m); /* without advancing the cidx */ } if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } fl->rx_offset = 0; return (m); } static struct mbuf * get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf) { struct mbuf *m0, *m, **pnext; u_int remaining; const u_int total = G_RSPD_LEN(len_newbuf); if (__predict_false(fl->flags & FL_BUF_RESUME)) { M_ASSERTPKTHDR(fl->m0); MPASS(fl->m0->m_pkthdr.len == total); MPASS(fl->remaining < total); m0 = fl->m0; pnext = fl->pnext; remaining = fl->remaining; fl->flags &= ~FL_BUF_RESUME; goto get_segment; } if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) { fl->rx_offset = 0; if (__predict_false(++fl->cidx % 8 == 0)) { uint16_t cidx = fl->cidx / 8; if (__predict_false(cidx == fl->sidx)) fl->cidx = cidx = 0; fl->hw_cidx = cidx; } } /* * Payload starts at rx_offset in the current hw buffer. Its length is * 'len' and it may span multiple hw buffers. */ m0 = get_scatter_segment(sc, fl, 0, total); if (m0 == NULL) return (NULL); remaining = total - m0->m_len; pnext = &m0->m_next; while (remaining > 0) { get_segment: MPASS(fl->rx_offset == 0); m = get_scatter_segment(sc, fl, total - remaining, remaining); if (__predict_false(m == NULL)) { fl->m0 = m0; fl->pnext = pnext; fl->remaining = remaining; fl->flags |= FL_BUF_RESUME; return (NULL); } *pnext = m; pnext = &m->m_next; remaining -= m->m_len; } *pnext = NULL; M_ASSERTPKTHDR(m0); return (m0); } static int t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) { struct sge_rxq *rxq = iq_to_rxq(iq); struct ifnet *ifp = rxq->ifp; struct adapter *sc = iq->adapter; const struct cpl_rx_pkt *cpl = (const void *)(rss + 1); #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxq->lro; #endif static const int sw_hashtype[4][2] = { {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6}, {M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6}, {M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6}, }; KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__, rss->opcode)); m0->m_pkthdr.len -= sc->params.sge.fl_pktshift; m0->m_len -= sc->params.sge.fl_pktshift; m0->m_data += sc->params.sge.fl_pktshift; m0->m_pkthdr.rcvif = ifp; M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]); m0->m_pkthdr.flowid = be32toh(rss->hash_val); if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) { if (ifp->if_capenable & IFCAP_RXCSUM && cpl->l2info & htobe32(F_RXF_IP)) { m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); rxq->rxcsum++; } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && cpl->l2info & htobe32(F_RXF_IP6)) { m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR); rxq->rxcsum++; } if (__predict_false(cpl->ip_frag)) m0->m_pkthdr.csum_data = be16toh(cpl->csum); else m0->m_pkthdr.csum_data = 0xffff; } if (cpl->vlan_ex) { m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan); m0->m_flags |= M_VLANTAG; rxq->vlan_extraction++; } #ifdef NUMA m0->m_pkthdr.numa_domain = ifp->if_numa_domain; #endif #if defined(INET) || defined(INET6) if (iq->flags & IQ_LRO_ENABLED) { if (sort_before_lro(lro)) { tcp_lro_queue_mbuf(lro, m0); return (0); /* queued for sort, then LRO */ } if (tcp_lro_rx(lro, m0, 0) == 0) return (0); /* queued for LRO */ } #endif ifp->if_input(ifp, m0); return (0); } /* * Must drain the wrq or make sure that someone else will. */ static void wrq_tx_drain(void *arg, int n) { struct sge_wrq *wrq = arg; struct sge_eq *eq = &wrq->eq; EQ_LOCK(eq); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(wrq->adapter, wrq); EQ_UNLOCK(eq); } static void drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq) { struct sge_eq *eq = &wrq->eq; u_int available, dbdiff; /* # of hardware descriptors */ u_int n; struct wrqe *wr; struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ EQ_LOCK_ASSERT_OWNED(eq); MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs)); wr = STAILQ_FIRST(&wrq->wr_list); MPASS(wr != NULL); /* Must be called with something useful to do */ MPASS(eq->pidx == eq->dbidx); dbdiff = 0; do { eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; MPASS(wr->wrq == wrq); n = howmany(wr->wr_len, EQ_ESIZE); if (available < n) break; dst = (void *)&eq->desc[eq->pidx]; if (__predict_true(eq->sidx - eq->pidx > n)) { /* Won't wrap, won't end exactly at the status page. */ bcopy(&wr->wr[0], dst, wr->wr_len); eq->pidx += n; } else { int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE; bcopy(&wr->wr[0], dst, first_portion); if (wr->wr_len > first_portion) { bcopy(&wr->wr[first_portion], &eq->desc[0], wr->wr_len - first_portion); } eq->pidx = n - (eq->sidx - eq->pidx); } wrq->tx_wrs_copied++; if (available < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { /* * XXX: This is not 100% reliable with some * types of WRs. But this is a very unusual * situation for an ofld/ctrl queue anyway. */ dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); } dbdiff += n; if (dbdiff >= 16) { ring_eq_db(sc, eq, dbdiff); dbdiff = 0; } STAILQ_REMOVE_HEAD(&wrq->wr_list, link); free_wrqe(wr); MPASS(wrq->nwr_pending > 0); wrq->nwr_pending--; MPASS(wrq->ndesc_needed >= n); wrq->ndesc_needed -= n; } while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL); if (dbdiff) ring_eq_db(sc, eq, dbdiff); } /* * Doesn't fail. Holds on to work requests it can't send right away. */ void t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr) { #ifdef INVARIANTS struct sge_eq *eq = &wrq->eq; #endif EQ_LOCK_ASSERT_OWNED(eq); MPASS(wr != NULL); MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN); MPASS((wr->wr_len & 0x7) == 0); STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link); wrq->nwr_pending++; wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE); if (!TAILQ_EMPTY(&wrq->incomplete_wrs)) return; /* commit_wrq_wr will drain wr_list as well. */ drain_wrq_wr_list(sc, wrq); /* Doorbell must have caught up to the pidx. */ MPASS(eq->pidx == eq->dbidx); } void t4_update_fl_bufsize(struct ifnet *ifp) { struct vi_info *vi = ifp->if_softc; struct adapter *sc = vi->pi->adapter; struct sge_rxq *rxq; #ifdef TCP_OFFLOAD struct sge_ofld_rxq *ofld_rxq; #endif struct sge_fl *fl; int i, maxp, mtu = ifp->if_mtu; maxp = mtu_to_max_payload(sc, mtu, 0); for_each_rxq(vi, i, rxq) { fl = &rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #ifdef TCP_OFFLOAD maxp = mtu_to_max_payload(sc, mtu, 1); for_each_ofld_rxq(vi, i, ofld_rxq) { fl = &ofld_rxq->fl; FL_LOCK(fl); find_best_refill_source(sc, fl, maxp); FL_UNLOCK(fl); } #endif } static inline int mbuf_nsegs(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.l5hlen > 0, ("%s: mbuf %p missing information on # of segments.", __func__, m)); return (m->m_pkthdr.l5hlen); } static inline void set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs) { M_ASSERTPKTHDR(m); m->m_pkthdr.l5hlen = nsegs; } static inline int mbuf_cflags(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_loc.eight[4]); } static inline void set_mbuf_cflags(struct mbuf *m, uint8_t flags) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[4] = flags; } static inline int mbuf_len16(struct mbuf *m) { int n; M_ASSERTPKTHDR(m); n = m->m_pkthdr.PH_loc.eight[0]; MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); return (n); } static inline void set_mbuf_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[0] = len16; } #ifdef RATELIMIT static inline int mbuf_eo_nsegs(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_loc.eight[1]); } static inline void set_mbuf_eo_nsegs(struct mbuf *m, uint8_t nsegs) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[1] = nsegs; } static inline int mbuf_eo_len16(struct mbuf *m) { int n; M_ASSERTPKTHDR(m); n = m->m_pkthdr.PH_loc.eight[2]; MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16); return (n); } static inline void set_mbuf_eo_len16(struct mbuf *m, uint8_t len16) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[2] = len16; } static inline int mbuf_eo_tsclk_tsoff(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.PH_loc.eight[3]); } static inline void set_mbuf_eo_tsclk_tsoff(struct mbuf *m, uint8_t tsclk_tsoff) { M_ASSERTPKTHDR(m); m->m_pkthdr.PH_loc.eight[3] = tsclk_tsoff; } static inline int needs_eo(struct mbuf *m) { - return (m->m_pkthdr.snd_tag != NULL); + return (m->m_pkthdr.csum_flags & CSUM_SND_TAG); } #endif /* * Try to allocate an mbuf to contain a raw work request. To make it * easy to construct the work request, don't allocate a chain but a * single mbuf. */ struct mbuf * alloc_wr_mbuf(int len, int how) { struct mbuf *m; if (len <= MHLEN) m = m_gethdr(how, MT_DATA); else if (len <= MCLBYTES) m = m_getcl(how, MT_DATA, M_PKTHDR); else m = NULL; if (m == NULL) return (NULL); m->m_pkthdr.len = len; m->m_len = len; set_mbuf_cflags(m, MC_RAW_WR); set_mbuf_len16(m, howmany(len, 16)); return (m); } static inline int needs_tso(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & CSUM_TSO); } static inline int needs_l3_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)); } static inline int needs_l4_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)); } static inline int needs_tcp_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO)); } #ifdef RATELIMIT static inline int needs_udp_csum(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6)); } #endif static inline int needs_vlan_insertion(struct mbuf *m) { M_ASSERTPKTHDR(m); return (m->m_flags & M_VLANTAG); } static void * m_advance(struct mbuf **pm, int *poffset, int len) { struct mbuf *m = *pm; int offset = *poffset; uintptr_t p = 0; MPASS(len > 0); for (;;) { if (offset + len < m->m_len) { offset += len; p = mtod(m, uintptr_t) + offset; break; } len -= m->m_len - offset; m = m->m_next; offset = 0; MPASS(m != NULL); } *poffset = offset; *pm = m; return ((void *)p); } /* * Can deal with empty mbufs in the chain that have m_len = 0, but the chain * must have at least one mbuf that's not empty. It is possible for this * routine to return 0 if skip accounts for all the contents of the mbuf chain. */ static inline int count_mbuf_nsegs(struct mbuf *m, int skip) { vm_paddr_t lastb, next; vm_offset_t va; int len, nsegs; M_ASSERTPKTHDR(m); MPASS(m->m_pkthdr.len > 0); MPASS(m->m_pkthdr.len >= skip); nsegs = 0; lastb = 0; for (; m; m = m->m_next) { len = m->m_len; if (__predict_false(len == 0)) continue; if (skip >= len) { skip -= len; continue; } va = mtod(m, vm_offset_t) + skip; len -= skip; skip = 0; next = pmap_kextract(va); nsegs += sglist_count((void *)(uintptr_t)va, len); if (lastb + 1 == next) nsegs--; lastb = pmap_kextract(va + len - 1); } return (nsegs); } /* * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: * a) caller can assume it's been freed if this function returns with an error. * b) it may get defragged up if the gather list is too long for the hardware. */ int parse_pkt(struct adapter *sc, struct mbuf **mp) { struct mbuf *m0 = *mp, *m; int rc, nsegs, defragged = 0, offset; struct ether_header *eh; void *l3hdr; #if defined(INET) || defined(INET6) struct tcphdr *tcp; #endif uint16_t eh_type; M_ASSERTPKTHDR(m0); if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) { rc = EINVAL; fail: m_freem(m0); *mp = NULL; return (rc); } restart: /* * First count the number of gather list segments in the payload. * Defrag the mbuf if nsegs exceeds the hardware limit. */ M_ASSERTPKTHDR(m0); MPASS(m0->m_pkthdr.len > 0); nsegs = count_mbuf_nsegs(m0, 0); if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) { if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) { rc = EFBIG; goto fail; } *mp = m0 = m; /* update caller's copy after defrag */ goto restart; } if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) { m0 = m_pullup(m0, m0->m_pkthdr.len); if (m0 == NULL) { /* Should have left well enough alone. */ rc = EFBIG; goto fail; } *mp = m0; /* update caller's copy after pullup */ goto restart; } set_mbuf_nsegs(m0, nsegs); set_mbuf_cflags(m0, 0); if (sc->flags & IS_VF) set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0))); else set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0))); #ifdef RATELIMIT /* * Ethofld is limited to TCP and UDP for now, and only when L4 hw * checksumming is enabled. needs_l4_csum happens to check for all the * right things. */ - if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0))) + if (__predict_false(needs_eo(m0) && !needs_l4_csum(m0))) { + m_snd_tag_rele(m0->m_pkthdr.snd_tag); m0->m_pkthdr.snd_tag = NULL; + m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; + } #endif if (!needs_tso(m0) && #ifdef RATELIMIT !needs_eo(m0) && #endif !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0)))) return (0); m = m0; eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); if (eh_type == ETHERTYPE_VLAN) { struct ether_vlan_header *evh = (void *)eh; eh_type = ntohs(evh->evl_proto); m0->m_pkthdr.l2hlen = sizeof(*evh); } else m0->m_pkthdr.l2hlen = sizeof(*eh); offset = 0; l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen); switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6 = l3hdr; MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP); m0->m_pkthdr.l3hlen = sizeof(*ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; m0->m_pkthdr.l3hlen = ip->ip_hl * 4; break; } #endif default: panic("%s: ethertype 0x%04x unknown. if_cxgbe must be compiled" " with the same INET/INET6 options as the kernel.", __func__, eh_type); } #if defined(INET) || defined(INET6) if (needs_tcp_csum(m0)) { tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); m0->m_pkthdr.l4hlen = tcp->th_off * 4; #ifdef RATELIMIT if (tsclk >= 0 && *(uint32_t *)(tcp + 1) == ntohl(0x0101080a)) { set_mbuf_eo_tsclk_tsoff(m0, V_FW_ETH_TX_EO_WR_TSCLK(tsclk) | V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1)); } else set_mbuf_eo_tsclk_tsoff(m0, 0); } else if (needs_udp_csum(m)) { m0->m_pkthdr.l4hlen = sizeof(struct udphdr); #endif } #ifdef RATELIMIT if (needs_eo(m0)) { u_int immhdrs; /* EO WRs have the headers in the WR and not the GL. */ immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen; nsegs = count_mbuf_nsegs(m0, immhdrs); set_mbuf_eo_nsegs(m0, nsegs); set_mbuf_eo_len16(m0, txpkt_eo_len16(nsegs, immhdrs, needs_tso(m0))); } #endif #endif MPASS(m0 == *mp); return (0); } void * start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, available; struct wrqe *wr; void *w; MPASS(len16 > 0); ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC); EQ_LOCK(eq); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); if (!STAILQ_EMPTY(&wrq->wr_list)) { slowpath: EQ_UNLOCK(eq); wr = alloc_wrqe(len16 * 16, wrq); if (__predict_false(wr == NULL)) return (NULL); cookie->pidx = -1; cookie->ndesc = ndesc; return (&wr->wr); } eq->cidx = read_hw_cidx(eq); if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < ndesc) goto slowpath; cookie->pidx = eq->pidx; cookie->ndesc = ndesc; TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link); w = &eq->desc[eq->pidx]; IDXINCR(eq->pidx, ndesc, eq->sidx); if (__predict_false(cookie->pidx + ndesc > eq->sidx)) { w = &wrq->ss[0]; wrq->ss_pidx = cookie->pidx; wrq->ss_len = len16 * 16; } EQ_UNLOCK(eq); return (w); } void commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie) { struct sge_eq *eq = &wrq->eq; struct adapter *sc = wrq->adapter; int ndesc, pidx; struct wrq_cookie *prev, *next; if (cookie->pidx == -1) { struct wrqe *wr = __containerof(w, struct wrqe, wr); t4_wrq_tx(sc, wr); return; } if (__predict_false(w == &wrq->ss[0])) { int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE; MPASS(wrq->ss_len > n); /* WR had better wrap around. */ bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n); bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n); wrq->tx_wrs_ss++; } else wrq->tx_wrs_direct++; EQ_LOCK(eq); ndesc = cookie->ndesc; /* Can be more than SGE_MAX_WR_NDESC here. */ pidx = cookie->pidx; MPASS(pidx >= 0 && pidx < eq->sidx); prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link); next = TAILQ_NEXT(cookie, link); if (prev == NULL) { MPASS(pidx == eq->dbidx); if (next == NULL || ndesc >= 16) { int available; struct fw_eth_tx_pkt_wr *dst; /* any fw WR struct will do */ /* * Note that the WR via which we'll request tx updates * is at pidx and not eq->pidx, which has moved on * already. */ dst = (void *)&eq->desc[pidx]; available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; if (available < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { /* * XXX: This is not 100% reliable with some * types of WRs. But this is a very unusual * situation for an ofld/ctrl queue anyway. */ dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); } ring_eq_db(wrq->adapter, eq, ndesc); } else { MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc); next->pidx = pidx; next->ndesc += ndesc; } } else { MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc); prev->ndesc += ndesc; } TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link); if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list)) drain_wrq_wr_list(sc, wrq); #ifdef INVARIANTS if (TAILQ_EMPTY(&wrq->incomplete_wrs)) { /* Doorbell must have caught up to the pidx. */ MPASS(wrq->eq.pidx == wrq->eq.dbidx); } #endif EQ_UNLOCK(eq); } static u_int can_resume_eth_tx(struct mp_ring *r) { struct sge_eq *eq = r->cookie; return (total_available_tx_desc(eq) > eq->sidx / 8); } static inline int cannot_use_txpkts(struct mbuf *m) { /* maybe put a GL limit too, to avoid silliness? */ return (needs_tso(m) || (mbuf_cflags(m) & MC_RAW_WR) != 0); } static inline int discard_tx(struct sge_eq *eq) { return ((eq->flags & (EQ_ENABLED | EQ_QFLUSH)) != EQ_ENABLED); } static inline int wr_can_update_eq(struct fw_eth_tx_pkts_wr *wr) { switch (G_FW_WR_OP(be32toh(wr->op_pkd))) { case FW_ULPTX_WR: case FW_ETH_TX_PKT_WR: case FW_ETH_TX_PKTS_WR: case FW_ETH_TX_PKT_VM_WR: return (1); default: return (0); } } /* * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to * be consumed. Return the actual number consumed. 0 indicates a stall. */ static u_int eth_tx(struct mp_ring *r, u_int cidx, u_int pidx) { struct sge_txq *txq = r->cookie; struct sge_eq *eq = &txq->eq; struct ifnet *ifp = txq->ifp; struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; u_int total, remaining; /* # of packets */ u_int available, dbdiff; /* # of hardware descriptors */ u_int n, next_cidx; struct mbuf *m0, *tail; struct txpkts txp; struct fw_eth_tx_pkts_wr *wr; /* any fw WR struct will do */ remaining = IDXDIFF(pidx, cidx, r->size); MPASS(remaining > 0); /* Must not be called without work to do. */ total = 0; TXQ_LOCK(txq); if (__predict_false(discard_tx(eq))) { while (cidx != pidx) { m0 = r->items[cidx]; m_freem(m0); if (++cidx == r->size) cidx = 0; } reclaim_tx_descs(txq, 2048); total = remaining; goto done; } /* How many hardware descriptors do we have readily available. */ if (eq->pidx == eq->cidx) available = eq->sidx - 1; else available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1; dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx); while (remaining > 0) { m0 = r->items[cidx]; M_ASSERTPKTHDR(m0); MPASS(m0->m_nextpkt == NULL); if (available < SGE_MAX_WR_NDESC) { available += reclaim_tx_descs(txq, 64); if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16)) break; /* out of descriptors */ } next_cidx = cidx + 1; if (__predict_false(next_cidx == r->size)) next_cidx = 0; wr = (void *)&eq->desc[eq->pidx]; if (sc->flags & IS_VF) { total++; remaining--; ETHER_BPF_MTAP(ifp, m0); n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0, available); } else if (remaining > 1 && try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) { /* pkts at cidx, next_cidx should both be in txp. */ MPASS(txp.npkt == 2); tail = r->items[next_cidx]; MPASS(tail->m_nextpkt == NULL); ETHER_BPF_MTAP(ifp, m0); ETHER_BPF_MTAP(ifp, tail); m0->m_nextpkt = tail; if (__predict_false(++next_cidx == r->size)) next_cidx = 0; while (next_cidx != pidx) { if (add_to_txpkts(r->items[next_cidx], &txp, available) != 0) break; tail->m_nextpkt = r->items[next_cidx]; tail = tail->m_nextpkt; ETHER_BPF_MTAP(ifp, tail); if (__predict_false(++next_cidx == r->size)) next_cidx = 0; } n = write_txpkts_wr(txq, wr, m0, &txp, available); total += txp.npkt; remaining -= txp.npkt; } else if (mbuf_cflags(m0) & MC_RAW_WR) { total++; remaining--; n = write_raw_wr(txq, (void *)wr, m0, available); } else { total++; remaining--; ETHER_BPF_MTAP(ifp, m0); n = write_txpkt_wr(txq, (void *)wr, m0, available); } MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC); available -= n; dbdiff += n; IDXINCR(eq->pidx, n, eq->sidx); if (wr_can_update_eq(wr)) { if (total_available_tx_desc(eq) < eq->sidx / 4 && atomic_cmpset_int(&eq->equiq, 0, 1)) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ | F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) { wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ); eq->equeqidx = eq->pidx; } } if (dbdiff >= 16 && remaining >= 4) { ring_eq_db(sc, eq, dbdiff); available += reclaim_tx_descs(txq, 4 * dbdiff); dbdiff = 0; } cidx = next_cidx; } if (dbdiff != 0) { ring_eq_db(sc, eq, dbdiff); reclaim_tx_descs(txq, 32); } done: TXQ_UNLOCK(txq); return (total); } static inline void init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, int qsize) { KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS, ("%s: bad tmr_idx %d", __func__, tmr_idx)); KASSERT(pktc_idx < SGE_NCOUNTERS, /* -ve is ok, means don't use */ ("%s: bad pktc_idx %d", __func__, pktc_idx)); iq->flags = 0; iq->adapter = sc; iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx); iq->intr_pktc_idx = SGE_NCOUNTERS - 1; if (pktc_idx >= 0) { iq->intr_params |= F_QINTR_CNT_EN; iq->intr_pktc_idx = pktc_idx; } iq->qsize = roundup2(qsize, 16); /* See FW_IQ_CMD/iqsize */ iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE; } static inline void init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name) { fl->qsize = qsize; fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; strlcpy(fl->lockname, name, sizeof(fl->lockname)); if (sc->flags & BUF_PACKING_OK && ((!is_t4(sc) && buffer_packing) || /* T5+: enabled unless 0 */ (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */ fl->flags |= FL_BUF_PACKING; find_best_refill_source(sc, fl, maxp); find_safe_refill_source(sc, fl); } static inline void init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize, uint8_t tx_chan, uint16_t iqid, char *name) { KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype)); eq->flags = eqtype & EQ_TYPEMASK; eq->tx_chan = tx_chan; eq->iqid = iqid; eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE; strlcpy(eq->lockname, name, sizeof(eq->lockname)); } static int alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag, bus_dmamap_t *map, bus_addr_t *pa, void **va) { int rc; rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc); goto done; } rc = bus_dmamem_alloc(*tag, va, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map); if (rc != 0) { device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc); goto done; } rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0); if (rc != 0) { device_printf(sc->dev, "cannot load DMA map: %d\n", rc); goto done; } done: if (rc) free_ring(sc, *tag, *map, *pa, *va); return (rc); } static int free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map, bus_addr_t pa, void *va) { if (pa) bus_dmamap_unload(tag, map); if (va) bus_dmamem_free(tag, va, map); if (tag) bus_dma_tag_destroy(tag); return (0); } /* * Allocates the ring for an ingress queue and an optional freelist. If the * freelist is specified it will be allocated and then associated with the * ingress queue. * * Returns errno on failure. Resources allocated up to that point may still be * allocated. Caller is responsible for cleanup in case this function fails. * * If the ingress queue will take interrupts directly then the intr_idx * specifies the vector, starting from 0. -1 means the interrupts for this * queue should be forwarded to the fwq. */ static int alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl, int intr_idx, int cong) { int rc, i, cntxt_id; size_t len; struct fw_iq_cmd c; struct port_info *pi = vi->pi; struct adapter *sc = iq->adapter; struct sge_params *sp = &sc->params.sge; __be32 v = 0; len = iq->qsize * IQ_ESIZE; rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba, (void **)&iq->desc); if (rc != 0) return (rc); bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) | V_FW_IQ_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART | FW_LEN16(c)); /* Special handling for firmware event queue */ if (iq == &sc->sge.fwq) v |= F_FW_IQ_CMD_IQASYNCH; if (intr_idx < 0) { /* Forwarded interrupts, all headed to fwq */ v |= F_FW_IQ_CMD_IQANDST; v |= V_FW_IQ_CMD_IQANDSTINDEX(sc->sge.fwq.cntxt_id); } else { KASSERT(intr_idx < sc->intr_count, ("%s: invalid direct intr_idx %d", __func__, intr_idx)); v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx); } c.type_to_iqandstindex = htobe32(v | V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) | V_FW_IQ_CMD_VIID(vi->viid) | V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT)); c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) | F_FW_IQ_CMD_IQGTSMODE | V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) | V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4)); c.iqsize = htobe16(iq->qsize); c.iqaddr = htobe64(iq->ba); if (cong >= 0) c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN); if (fl) { mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF); len = fl->qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map, &fl->ba, (void **)&fl->desc); if (rc) return (rc); /* Allocate space for one software descriptor per buffer. */ rc = alloc_fl_sdesc(fl); if (rc != 0) { device_printf(sc->dev, "failed to setup fl software descriptors: %d\n", rc); return (rc); } if (fl->flags & FL_BUF_PACKING) { fl->lowat = roundup2(sp->fl_starve_threshold2, 8); fl->buf_boundary = sp->pack_boundary; } else { fl->lowat = roundup2(sp->fl_starve_threshold, 8); fl->buf_boundary = 16; } if (fl_pad && fl->buf_boundary < sp->pad_boundary) fl->buf_boundary = sp->pad_boundary; c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO | (fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) | (fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN : 0)); if (cong >= 0) { c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) | F_FW_IQ_CMD_FL0CONGCIF | F_FW_IQ_CMD_FL0CONGEN); } c.fl0dcaen_to_fl0cidxfthresh = htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) | V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ? X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B)); c.fl0size = htobe16(fl->qsize); c.fl0addr = htobe64(fl->ba); } rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create ingress queue: %d\n", rc); return (rc); } iq->cidx = 0; iq->gen = F_RSPD_GEN; iq->intr_next = iq->intr_params; iq->cntxt_id = be16toh(c.iqid); iq->abs_id = be16toh(c.physiqid); iq->flags |= IQ_ALLOCATED; cntxt_id = iq->cntxt_id - sc->sge.iq_start; if (cntxt_id >= sc->sge.niq) { panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.niq - 1); } sc->sge.iqmap[cntxt_id] = iq; if (fl) { u_int qid; iq->flags |= IQ_HAS_FL; fl->cntxt_id = be16toh(c.fl0id); fl->pidx = fl->cidx = 0; cntxt_id = fl->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) { panic("%s: fl->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); } sc->sge.eqmap[cntxt_id] = (void *)fl; qid = fl->cntxt_id; if (isset(&sc->doorbells, DOORBELL_UDB)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (qid >> s_qpp) << PAGE_SHIFT; qid &= mask; if (qid < PAGE_SIZE / UDBS_SEG_SIZE) { udb += qid << UDBS_SEG_SHIFT; qid = 0; } fl->udb = (volatile void *)udb; } fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db; FL_LOCK(fl); /* Enough to make sure the SGE doesn't think it's starved */ refill_fl(sc, fl, fl->lowat); FL_UNLOCK(fl); } if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) { uint32_t param, val; param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) | V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) | V_FW_PARAMS_PARAM_YZ(iq->cntxt_id); if (cong == 0) val = 1 << 19; else { val = 2 << 19; for (i = 0; i < 4; i++) { if (cong & (1 << i)) val |= 1 << (i << 2); } } rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, ¶m, &val); if (rc != 0) { /* report error but carry on */ device_printf(sc->dev, "failed to set congestion manager context for " "ingress queue %d: %d\n", iq->cntxt_id, rc); } } /* Enable IQ interrupts */ atomic_store_rel_int(&iq->state, IQS_IDLE); t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) | V_INGRESSQID(iq->cntxt_id)); return (0); } static int free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl) { int rc; struct adapter *sc = iq->adapter; device_t dev; if (sc == NULL) return (0); /* nothing to do */ dev = vi ? vi->dev : sc->dev; if (iq->flags & IQ_ALLOCATED) { rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id, fl ? fl->cntxt_id : 0xffff, 0xffff); if (rc != 0) { device_printf(dev, "failed to free queue %p: %d\n", iq, rc); return (rc); } iq->flags &= ~IQ_ALLOCATED; } free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc); bzero(iq, sizeof(*iq)); if (fl) { free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba, fl->desc); if (fl->sdesc) free_fl_sdesc(sc, fl); if (mtx_initialized(&fl->fl_lock)) mtx_destroy(&fl->fl_lock); bzero(fl, sizeof(*fl)); } return (0); } static void add_iq_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_iq *iq) { struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &iq->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, iq->qsize * IQ_ESIZE, "descriptor ring size in bytes"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &iq->abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &iq->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &iq->cidx, 0, sysctl_uint16, "I", "consumer index"); } static void add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, struct sge_fl *fl) { struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &fl->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, fl->sidx * EQ_ESIZE + sc->params.sge.spg_len, "desc ring size in bytes"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL, fl_pad ? 1 : 0, "padding enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL, fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, 0, "consumer index"); if (fl->flags & FL_BUF_PACKING) { SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset", CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset"); } SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx, 0, "producer index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated", CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined", CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated", CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled", CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled", CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)"); } static int alloc_fwq(struct adapter *sc) { int rc, intr_idx; struct sge_iq *fwq = &sc->sge.fwq; struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE); if (sc->flags & IS_VF) intr_idx = 0; else intr_idx = sc->intr_count > 1 ? 1 : 0; rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1); if (rc != 0) { device_printf(sc->dev, "failed to create firmware event queue: %d\n", rc); return (rc); } oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD, NULL, "firmware event queue"); add_iq_sysctls(&sc->ctx, oid, fwq); return (0); } static int free_fwq(struct adapter *sc) { return free_iq_fl(NULL, &sc->sge.fwq, NULL); } static int alloc_ctrlq(struct adapter *sc, struct sge_wrq *ctrlq, int idx, struct sysctl_oid *oid) { int rc; char name[16]; struct sysctl_oid_list *children; snprintf(name, sizeof(name), "%s ctrlq%d", device_get_nameunit(sc->dev), idx); init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[idx]->tx_chan, sc->sge.fwq.cntxt_id, name); children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "ctrl queue"); rc = alloc_wrq(sc, NULL, ctrlq, oid); return (rc); } int tnl_cong(struct port_info *pi, int drop) { if (drop == -1) return (-1); else if (drop == 1) return (0); else return (pi->rx_e_chan_map); } static int alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct adapter *sc = vi->pi->adapter; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx, tnl_cong(vi->pi, cong_drop)); if (rc != 0) return (rc); if (idx == 0) sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id; else KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id, ("iq_base mismatch")); KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF, ("PF with non-zero iq_base")); /* * The freelist is just barely above the starvation threshold right now, * fill it up a bit more. */ FL_LOCK(&rxq->fl); refill_fl(sc, &rxq->fl, 128); FL_UNLOCK(&rxq->fl); #if defined(INET) || defined(INET6) rc = tcp_lro_init_args(&rxq->lro, vi->ifp, lro_entries, lro_mbufs); if (rc != 0) return (rc); MPASS(rxq->lro.ifp == vi->ifp); /* also indicates LRO init'ed */ if (vi->ifp->if_capenable & IFCAP_LRO) rxq->iq.flags |= IQ_LRO_ENABLED; #endif if (vi->ifp->if_capenable & IFCAP_HWRXTSTMP) rxq->iq.flags |= IQ_RX_TIMESTAMP; rxq->ifp = vi->ifp; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); add_iq_sysctls(&vi->ctx, oid, &rxq->iq); #if defined(INET) || defined(INET6) SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD, &rxq->lro.lro_queued, 0, NULL); SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD, &rxq->lro.lro_flushed, 0, NULL); #endif SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD, &rxq->rxcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl); return (rc); } static int free_rxq(struct vi_info *vi, struct sge_rxq *rxq) { int rc; #if defined(INET) || defined(INET6) if (rxq->lro.ifp) { tcp_lro_free(&rxq->lro); rxq->lro.ifp = NULL; } #endif rc = free_iq_fl(vi, &rxq->iq, &rxq->fl); if (rc == 0) bzero(rxq, sizeof(*rxq)); return (rc); } #ifdef TCP_OFFLOAD static int alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { struct port_info *pi = vi->pi; int rc; struct sysctl_oid_list *children; char name[16]; rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx, 0); if (rc != 0) return (rc); children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); add_iq_sysctls(&vi->ctx, oid, &ofld_rxq->iq); add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl); return (rc); } static int free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq) { int rc; rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl); if (rc == 0) bzero(ofld_rxq, sizeof(*ofld_rxq)); return (rc); } #endif #ifdef DEV_NETMAP static int alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx, int idx, struct sysctl_oid *oid) { int rc; struct sysctl_oid_list *children; struct sysctl_ctx_list *ctx; char name[16]; size_t len; struct adapter *sc = vi->pi->adapter; struct netmap_adapter *na = NA(vi->ifp); MPASS(na != NULL); len = vi->qsize_rxq * IQ_ESIZE; rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map, &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc); if (rc != 0) return (rc); len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len; rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map, &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc); if (rc != 0) return (rc); nm_rxq->vi = vi; nm_rxq->nid = idx; nm_rxq->iq_cidx = 0; nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE; nm_rxq->iq_gen = F_RSPD_GEN; nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0; nm_rxq->fl_sidx = na->num_rx_desc; nm_rxq->intr_idx = intr_idx; nm_rxq->iq_cntxt_id = INVALID_NM_RXQ_CNTXT_ID; ctx = &vi->ctx; children = SYSCTL_CHILDREN(oid); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "rx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16, "I", "absolute id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I", "consumer index"); children = SYSCTL_CHILDREN(oid); oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL, "freelist"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &nm_rxq->fl_cidx, 0, "consumer index"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &nm_rxq->fl_pidx, 0, "producer index"); return (rc); } static int free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq) { struct adapter *sc = vi->pi->adapter; if (vi->flags & VI_INIT_DONE) MPASS(nm_rxq->iq_cntxt_id == INVALID_NM_RXQ_CNTXT_ID); else MPASS(nm_rxq->iq_cntxt_id == 0); free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba, nm_rxq->iq_desc); free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba, nm_rxq->fl_desc); return (0); } static int alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx, struct sysctl_oid *oid) { int rc; size_t len; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct netmap_adapter *na = NA(vi->ifp); char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len; rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map, &nm_txq->ba, (void **)&nm_txq->desc); if (rc) return (rc); nm_txq->pidx = nm_txq->cidx = 0; nm_txq->sidx = na->num_tx_desc; nm_txq->nid = idx; nm_txq->iqidx = iqidx; nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); nm_txq->cntxt_id = INVALID_NM_TXQ_CNTXT_ID; snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "netmap tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &nm_txq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I", "producer index"); return (rc); } static int free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq) { struct adapter *sc = vi->pi->adapter; if (vi->flags & VI_INIT_DONE) MPASS(nm_txq->cntxt_id == INVALID_NM_TXQ_CNTXT_ID); else MPASS(nm_txq->cntxt_id == 0); free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba, nm_txq->desc); return (0); } #endif /* * Returns a reasonable automatic cidx flush threshold for a given queue size. */ static u_int qsize_to_fthresh(int qsize) { u_int fthresh; while (!powerof2(qsize)) qsize++; fthresh = ilog2(qsize); if (fthresh > X_CIDXFLUSHTHRESH_128) fthresh = X_CIDXFLUSHTHRESH_128; return (fthresh); } static int ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ctrl_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) | V_FW_EQ_CTRL_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC | F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c)); c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid)); c.physeqid_pkd = htobe32(0); c.fetchszm_to_iqid = htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_CTRL_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) | V_FW_EQ_CTRL_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(sc->dev, "failed to create control queue %d: %d\n", eq->tx_chan, rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } static int eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_eth_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) | V_FW_EQ_ETH_CMD_VFN(0)); c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC | F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c)); c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE | F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid)); c.fetchszm_to_iqid = htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) | V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO | V_FW_EQ_ETH_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_ETH_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create Ethernet egress queue: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd)); eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #if defined(TCP_OFFLOAD) || defined(RATELIMIT) static int ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, cntxt_id; struct fw_eq_ofld_cmd c; int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; bzero(&c, sizeof(c)); c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST | F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) | V_FW_EQ_OFLD_CMD_VFN(0)); c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC | F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c)); c.fetchszm_to_iqid = htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_STATUS_PAGE) | V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid)); c.dcaen_to_eqsize = htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) | V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) | V_FW_EQ_OFLD_CMD_CIDXFTHRESH(qsize_to_fthresh(qsize)) | V_FW_EQ_OFLD_CMD_EQSIZE(qsize)); c.eqaddr = htobe64(eq->ba); rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c); if (rc != 0) { device_printf(vi->dev, "failed to create egress queue for TCP offload: %d\n", rc); return (rc); } eq->flags |= EQ_ALLOCATED; eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd)); cntxt_id = eq->cntxt_id - sc->sge.eq_start; if (cntxt_id >= sc->sge.neq) panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__, cntxt_id, sc->sge.neq - 1); sc->sge.eqmap[cntxt_id] = eq; return (rc); } #endif static int alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq) { int rc, qsize; size_t len; mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF); qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE; len = qsize * EQ_ESIZE; rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map, &eq->ba, (void **)&eq->desc); if (rc) return (rc); eq->pidx = eq->cidx = eq->dbidx = 0; /* Note that equeqidx is not used with sge_wrq (OFLD/CTRL) queues. */ eq->equeqidx = 0; eq->doorbells = sc->doorbells; switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = ctrl_eq_alloc(sc, eq); break; case EQ_ETH: rc = eth_eq_alloc(sc, vi, eq); break; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) case EQ_OFLD: rc = ofld_eq_alloc(sc, vi, eq); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to allocate egress queue(%d): %d\n", eq->flags & EQ_TYPEMASK, rc); } if (isset(&eq->doorbells, DOORBELL_UDB) || isset(&eq->doorbells, DOORBELL_UDBWC) || isset(&eq->doorbells, DOORBELL_WCWR)) { uint32_t s_qpp = sc->params.sge.eq_s_qpp; uint32_t mask = (1 << s_qpp) - 1; volatile uint8_t *udb; udb = sc->udbs_base + UDBS_DB_OFFSET; udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT; /* pg offset */ eq->udb_qid = eq->cntxt_id & mask; /* id in page */ if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE) clrbit(&eq->doorbells, DOORBELL_WCWR); else { udb += eq->udb_qid << UDBS_SEG_SHIFT; /* seg offset */ eq->udb_qid = 0; } eq->udb = (volatile void *)udb; } return (rc); } static int free_eq(struct adapter *sc, struct sge_eq *eq) { int rc; if (eq->flags & EQ_ALLOCATED) { switch (eq->flags & EQ_TYPEMASK) { case EQ_CTRL: rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; case EQ_ETH: rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #if defined(TCP_OFFLOAD) || defined(RATELIMIT) case EQ_OFLD: rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0, eq->cntxt_id); break; #endif default: panic("%s: invalid eq type %d.", __func__, eq->flags & EQ_TYPEMASK); } if (rc != 0) { device_printf(sc->dev, "failed to free egress queue (%d): %d\n", eq->flags & EQ_TYPEMASK, rc); return (rc); } eq->flags &= ~EQ_ALLOCATED; } free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc); if (mtx_initialized(&eq->eq_lock)) mtx_destroy(&eq->eq_lock); bzero(eq, sizeof(*eq)); return (0); } static int alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq, struct sysctl_oid *oid) { int rc; struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = alloc_eq(sc, vi, &wrq->eq); if (rc) return (rc); wrq->adapter = sc; TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq); TAILQ_INIT(&wrq->incomplete_wrs); STAILQ_INIT(&wrq->wr_list); wrq->nwr_pending = 0; wrq->ndesc_needed = 0; SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &wrq->eq.ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len, "desc ring size in bytes"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &wrq->eq.cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL, wrq->eq.sidx, "status page index"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD, &wrq->tx_wrs_direct, "# of work requests (direct)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD, &wrq->tx_wrs_copied, "# of work requests (copied)"); SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD, &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)"); return (rc); } static int free_wrq(struct adapter *sc, struct sge_wrq *wrq) { int rc; rc = free_eq(sc, &wrq->eq); if (rc) return (rc); bzero(wrq, sizeof(*wrq)); return (0); } static int alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx, struct sysctl_oid *oid) { int rc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; struct sge_eq *eq = &txq->eq; char name[16]; struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx, M_CXGBE, M_WAITOK); if (rc != 0) { device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc); return (rc); } rc = alloc_eq(sc, vi, eq); if (rc != 0) { mp_ring_free(txq->r); txq->r = NULL; return (rc); } /* Can't fail after this point. */ if (idx == 0) sc->sge.eq_base = eq->abs_id - eq->cntxt_id; else KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id, ("eq_base mismatch")); KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF, ("PF with non-zero eq_base")); TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq); txq->ifp = vi->ifp; txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK); if (sc->flags & IS_VF) txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) | V_TXPKT_INTF(pi->tx_chan)); else txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) | V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) | V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld)); txq->tc_idx = -1; txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE, M_ZERO | M_WAITOK); snprintf(name, sizeof(name), "%d", idx); oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL, "tx queue"); children = SYSCTL_CHILDREN(oid); SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD, &eq->ba, "bus address of descriptor ring"); SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL, eq->sidx * EQ_ESIZE + sc->params.sge.spg_len, "desc ring size in bytes"); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD, &eq->abs_id, 0, "absolute id of the queue"); SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD, &eq->cntxt_id, 0, "SGE context id of the queue"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx", CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I", "consumer index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx", CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I", "producer index"); SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL, eq->sidx, "status page index"); SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc", CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I", "traffic class (-1 means none)"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD, &txq->txcsum, "# of times hardware assisted with checksum"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion", CTLFLAG_RD, &txq->vlan_insertion, "# of times hardware inserted 802.1Q tag"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD, &txq->tso_wrs, "# of TSO work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD, &txq->imm_wrs, "# of work requests with immediate data"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD, &txq->sgl_wrs, "# of work requests with direct SGL"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD, &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs", CTLFLAG_RD, &txq->txpkts0_wrs, "# of txpkts (type 0) work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs", CTLFLAG_RD, &txq->txpkts1_wrs, "# of txpkts (type 1) work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts", CTLFLAG_RD, &txq->txpkts0_pkts, "# of frames tx'd using type0 txpkts work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts", CTLFLAG_RD, &txq->txpkts1_pkts, "# of frames tx'd using type1 txpkts work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD, &txq->raw_wrs, "# of raw work requests (non-packets)"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues", CTLFLAG_RD, &txq->r->enqueues, "# of enqueues to the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops", CTLFLAG_RD, &txq->r->drops, "# of drops in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts", CTLFLAG_RD, &txq->r->starts, "# of normal consumer starts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls", CTLFLAG_RD, &txq->r->stalls, "# of consumer stalls in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts", CTLFLAG_RD, &txq->r->restarts, "# of consumer restarts in the mp_ring for this queue"); SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications", CTLFLAG_RD, &txq->r->abdications, "# of consumer abdications in the mp_ring for this queue"); return (0); } static int free_txq(struct vi_info *vi, struct sge_txq *txq) { int rc; struct adapter *sc = vi->pi->adapter; struct sge_eq *eq = &txq->eq; rc = free_eq(sc, eq); if (rc) return (rc); sglist_free(txq->gl); free(txq->sdesc, M_CXGBE); mp_ring_free(txq->r); bzero(txq, sizeof(*txq)); return (0); } static void oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *ba = arg; KASSERT(nseg == 1, ("%s meant for single segment mappings only.", __func__)); *ba = error ? 0 : segs->ds_addr; } static inline void ring_fl_db(struct adapter *sc, struct sge_fl *fl) { uint32_t n, v; n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx); MPASS(n > 0); wmb(); v = fl->dbval | V_PIDX(n); if (fl->udb) *fl->udb = htole32(v); else t4_write_reg(sc, sc->sge_kdoorbell_reg, v); IDXINCR(fl->dbidx, n, fl->sidx); } /* * Fills up the freelist by allocating up to 'n' buffers. Buffers that are * recycled do not count towards this allocation budget. * * Returns non-zero to indicate that this freelist should be added to the list * of starving freelists. */ static int refill_fl(struct adapter *sc, struct sge_fl *fl, int n) { __be64 *d; struct fl_sdesc *sd; uintptr_t pa; caddr_t cl; struct cluster_layout *cll; struct sw_zone_info *swz; struct cluster_metadata *clm; uint16_t max_pidx; uint16_t hw_cidx = fl->hw_cidx; /* stable snapshot */ FL_LOCK_ASSERT_OWNED(fl); /* * We always stop at the beginning of the hardware descriptor that's just * before the one with the hw cidx. This is to avoid hw pidx = hw cidx, * which would mean an empty freelist to the chip. */ max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1; if (fl->pidx == max_pidx * 8) return (0); d = &fl->desc[fl->pidx]; sd = &fl->sdesc[fl->pidx]; cll = &fl->cll_def; /* default layout */ swz = &sc->sge.sw_zone_info[cll->zidx]; while (n > 0) { if (sd->cl != NULL) { if (sd->nmbuf == 0) { /* * Fast recycle without involving any atomics on * the cluster's metadata (if the cluster has * metadata). This happens when all frames * received in the cluster were small enough to * fit within a single mbuf each. */ fl->cl_fast_recycled++; #ifdef INVARIANTS clm = cl_metadata(sc, fl, &sd->cll, sd->cl); if (clm != NULL) MPASS(clm->refcount == 1); #endif goto recycled_fast; } /* * Cluster is guaranteed to have metadata. Clusters * without metadata always take the fast recycle path * when they're recycled. */ clm = cl_metadata(sc, fl, &sd->cll, sd->cl); MPASS(clm != NULL); if (atomic_fetchadd_int(&clm->refcount, -1) == 1) { fl->cl_recycled++; counter_u64_add(extfree_rels, 1); goto recycled; } sd->cl = NULL; /* gave up my reference */ } MPASS(sd->cl == NULL); alloc: cl = uma_zalloc(swz->zone, M_NOWAIT); if (__predict_false(cl == NULL)) { if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 || fl->cll_def.zidx == fl->cll_alt.zidx) break; /* fall back to the safe zone */ cll = &fl->cll_alt; swz = &sc->sge.sw_zone_info[cll->zidx]; goto alloc; } fl->cl_allocated++; n--; pa = pmap_kextract((vm_offset_t)cl); pa += cll->region1; sd->cl = cl; sd->cll = *cll; *d = htobe64(pa | cll->hwidx); clm = cl_metadata(sc, fl, cll, cl); if (clm != NULL) { recycled: #ifdef INVARIANTS clm->sd = sd; #endif clm->refcount = 1; } sd->nmbuf = 0; recycled_fast: d++; sd++; if (__predict_false(++fl->pidx % 8 == 0)) { uint16_t pidx = fl->pidx / 8; if (__predict_false(pidx == fl->sidx)) { fl->pidx = 0; pidx = 0; sd = fl->sdesc; d = fl->desc; } if (pidx == max_pidx) break; if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4) ring_fl_db(sc, fl); } } if (fl->pidx / 8 != fl->dbidx) ring_fl_db(sc, fl); return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING)); } /* * Attempt to refill all starving freelists. */ static void refill_sfl(void *arg) { struct adapter *sc = arg; struct sge_fl *fl, *fl_temp; mtx_assert(&sc->sfl_lock, MA_OWNED); TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) { FL_LOCK(fl); refill_fl(sc, fl, 64); if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) { TAILQ_REMOVE(&sc->sfl, fl, link); fl->flags &= ~FL_STARVING; } FL_UNLOCK(fl); } if (!TAILQ_EMPTY(&sc->sfl)) callout_schedule(&sc->sfl_callout, hz / 5); } static int alloc_fl_sdesc(struct sge_fl *fl) { fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE, M_ZERO | M_WAITOK); return (0); } static void free_fl_sdesc(struct adapter *sc, struct sge_fl *fl) { struct fl_sdesc *sd; struct cluster_metadata *clm; struct cluster_layout *cll; int i; sd = fl->sdesc; for (i = 0; i < fl->sidx * 8; i++, sd++) { if (sd->cl == NULL) continue; cll = &sd->cll; clm = cl_metadata(sc, fl, cll, sd->cl); if (sd->nmbuf == 0) uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) { uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl); counter_u64_add(extfree_rels, 1); } sd->cl = NULL; } free(fl->sdesc, M_CXGBE); fl->sdesc = NULL; } static inline void get_pkt_gl(struct mbuf *m, struct sglist *gl) { int rc; M_ASSERTPKTHDR(m); sglist_reset(gl); rc = sglist_append_mbuf(gl, m); if (__predict_false(rc != 0)) { panic("%s: mbuf %p (%d segs) was vetted earlier but now fails " "with %d.", __func__, m, mbuf_nsegs(m), rc); } KASSERT(gl->sg_nseg == mbuf_nsegs(m), ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m, mbuf_nsegs(m), gl->sg_nseg)); KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS), ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)); } /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. */ static inline u_int txpkt_len16(u_int nsegs, u_int tso) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); if (tso) n += sizeof(struct cpl_tx_pkt_lso_core); return (howmany(n, 16)); } /* * len16 for a txpkt_vm WR with a GL. Includes the firmware work * request header. */ static inline u_int txpkt_vm_len16(u_int nsegs, u_int tso) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct fw_eth_tx_pkt_vm_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); if (tso) n += sizeof(struct cpl_tx_pkt_lso_core); return (howmany(n, 16)); } /* * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts0_len16(u_int nsegs) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); return (howmany(n, 16)); } /* * len16 for a txpkts type 1 WR with a GL. Does not include the firmware work * request header. */ static inline u_int txpkts1_len16(void) { u_int n; n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl); return (howmany(n, 16)); } static inline u_int imm_payload(u_int ndesc) { u_int n; n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) - sizeof(struct cpl_tx_pkt_core); return (n); } /* * Write a VM txpkt WR for this packet to the hardware descriptors, update the * software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq, struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int csum_type, len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc <= available); /* Firmware work request header */ MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3[0] = 0; wr->r3[1] = 0; /* * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci. * vlantci is ignored unless the ethtype is 0x8100, so it's * simpler to always copy it rather than making it * conditional. Also, it seems that we do not have to set * vlantci or fake the ethtype when doing VLAN tag insertion. */ m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst); csum_type = -1; if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: mbuf %p needs TSO but missing header lengths", __func__, m0)); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) csum_type = TX_CSUM_TCPIP6; else csum_type = TX_CSUM_TCPIP; cpl = (void *)(lso + 1); txq->tso_wrs++; } else { if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP) csum_type = TX_CSUM_TCPIP; else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP) csum_type = TX_CSUM_UDPIP; else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP) csum_type = TX_CSUM_TCPIP6; else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP) csum_type = TX_CSUM_UDPIP6; #if defined(INET) else if (m0->m_pkthdr.csum_flags & CSUM_IP) { /* * XXX: The firmware appears to stomp on the * fragment/flags field of the IP header when * using TX_CSUM_IP. Fall back to doing * software checksums. */ u_short *sump; struct mbuf *m; int offset; m = m0; offset = 0; sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen + offsetof(struct ip, ip_sum)); *sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } #endif cpl = (void *)(wr + 1); } /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m0) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (csum_type >= 0) { KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0, ("%s: mbuf %p needs checksum offload but missing header lengths", __func__, m0)); if (chip_id(sc) <= CHELSIO_T5) { ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen - ETHER_HDR_LEN); } else { ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen - ETHER_HDR_LEN); } ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen); ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type); } else ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* SGL */ dst = (void *)(cpl + 1); /* * A packet using TSO will use up an entire descriptor for the * firmware work request header, LSO CPL, and TX_PKT_XT CPL. * If this descriptor is the last descriptor in the ring, wrap * around to the front of the ring explicitly for the start of * the sgl. */ if (dst == (void *)&eq->desc[eq->sidx]) { dst = (void *)&eq->desc[0]; write_gl_to_txd(txq, m0, &dst, 0); } else write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * Write a raw WR to the hardware descriptors, update the software * descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_raw_wr(struct sge_txq *txq, void *wr, struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct mbuf *m; caddr_t dst; int len16, ndesc; len16 = mbuf_len16(m0); ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc <= available); dst = wr; for (m = m0; m != NULL; m = m->m_next) copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); txq->raw_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * Write a txpkt WR for this packet to the hardware descriptors, update the * software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr, struct mbuf *m0, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; /* used in many unrelated places */ uint64_t ctrl1; int len16, ndesc, pktlen, nsegs; caddr_t dst; TXQ_LOCK_ASSERT_OWNED(txq); M_ASSERTPKTHDR(m0); MPASS(available > 0 && available < eq->sidx); len16 = mbuf_len16(m0); nsegs = mbuf_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); else if (pktlen <= imm_payload(2) && available >= 2) { /* Immediate data. Recalculate len16 and set nsegs to 0. */ ctrl += pktlen; len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + pktlen, 16); nsegs = 0; } ndesc = howmany(len16, EQ_ESIZE / 16); MPASS(ndesc <= available); /* Firmware work request header */ MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) | V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl)); ctrl = V_FW_WR_LEN16(len16); wr->equiq_to_len16 = htobe32(ctrl); wr->r3 = 0; if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: mbuf %p needs TSO but missing header lengths", __func__, m0)); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); cpl = (void *)(lso + 1); txq->tso_wrs++; } else cpl = (void *)(wr + 1); /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m0) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (needs_l4_csum(m0) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* SGL */ dst = (void *)(cpl + 1); if (nsegs > 0) { write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); txq->sgl_wrs++; } else { struct mbuf *m; for (m = m0; m != NULL; m = m->m_next) { copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len); #ifdef INVARIANTS pktlen -= m->m_len; #endif } #ifdef INVARIANTS KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen)); #endif txq->imm_wrs++; } txq->txpkt_wrs++; txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } static int try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available) { u_int needed, nsegs1, nsegs2, l1, l2; if (cannot_use_txpkts(m) || cannot_use_txpkts(n)) return (1); nsegs1 = mbuf_nsegs(m); nsegs2 = mbuf_nsegs(n); if (nsegs1 + nsegs2 == 2) { txp->wr_type = 1; l1 = l2 = txpkts1_len16(); } else { txp->wr_type = 0; l1 = txpkts0_len16(nsegs1); l2 = txpkts0_len16(nsegs2); } txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2; needed = howmany(txp->len16, EQ_ESIZE / 16); if (needed > SGE_MAX_WR_NDESC || needed > available) return (1); txp->plen = m->m_pkthdr.len + n->m_pkthdr.len; if (txp->plen > 65535) return (1); txp->npkt = 2; set_mbuf_len16(m, l1); set_mbuf_len16(n, l2); return (0); } static int add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available) { u_int plen, len16, needed, nsegs; MPASS(txp->wr_type == 0 || txp->wr_type == 1); if (cannot_use_txpkts(m)) return (1); nsegs = mbuf_nsegs(m); if (txp->wr_type == 1 && nsegs != 1) return (1); plen = txp->plen + m->m_pkthdr.len; if (plen > 65535) return (1); if (txp->wr_type == 0) len16 = txpkts0_len16(nsegs); else len16 = txpkts1_len16(); needed = howmany(txp->len16 + len16, EQ_ESIZE / 16); if (needed > SGE_MAX_WR_NDESC || needed > available) return (1); txp->npkt++; txp->plen = plen; txp->len16 += len16; set_mbuf_len16(m, len16); return (0); } /* * Write a txpkts WR for the packets in txp to the hardware descriptors, update * the software descriptor, and advance the pidx. It is guaranteed that enough * descriptors are available. * * The return value is the # of hardware descriptors used. */ static u_int write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr, struct mbuf *m0, const struct txpkts *txp, u_int available) { struct sge_eq *eq = &txq->eq; struct tx_sdesc *txsd; struct cpl_tx_pkt_core *cpl; uint32_t ctrl; uint64_t ctrl1; int ndesc, checkwrap; struct mbuf *m; void *flitp; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(txp->npkt > 0); MPASS(txp->plen < 65536); MPASS(m0 != NULL); MPASS(m0->m_nextpkt != NULL); MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16)); MPASS(available > 0 && available < eq->sidx); ndesc = howmany(txp->len16, EQ_ESIZE / 16); MPASS(ndesc <= available); MPASS(wr == (void *)&eq->desc[eq->pidx]); wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)); ctrl = V_FW_WR_LEN16(txp->len16); wr->equiq_to_len16 = htobe32(ctrl); wr->plen = htobe16(txp->plen); wr->npkt = txp->npkt; wr->r3 = 0; wr->type = txp->wr_type; flitp = wr + 1; /* * At this point we are 16B into a hardware descriptor. If checkwrap is * set then we know the WR is going to wrap around somewhere. We'll * check for that at appropriate points. */ checkwrap = eq->sidx - ndesc < eq->pidx; for (m = m0; m != NULL; m = m->m_nextpkt) { if (txp->wr_type == 0) { struct ulp_txpkt *ulpmc; struct ulptx_idata *ulpsc; /* ULP master command */ ulpmc = flitp; ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid)); ulpmc->len = htobe32(mbuf_len16(m)); /* ULP subcommand */ ulpsc = (void *)(ulpmc + 1); ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) | F_ULP_TX_SC_MORE); ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core)); cpl = (void *)(ulpsc + 1); if (checkwrap && (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx]) cpl = (void *)&eq->desc[0]; } else { cpl = flitp; } /* Checksum offload */ ctrl1 = 0; if (needs_l3_csum(m) == 0) ctrl1 |= F_TXPKT_IPCSUM_DIS; if (needs_l4_csum(m) == 0) ctrl1 |= F_TXPKT_L4CSUM_DIS; if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) txq->txcsum++; /* some hardware assistance provided */ /* VLAN tag insertion */ if (needs_vlan_insertion(m)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m->m_pkthdr.ether_vtag); txq->vlan_insertion++; } /* CPL header */ cpl->ctrl0 = txq->cpl_ctrl0; cpl->pack = 0; cpl->len = htobe16(m->m_pkthdr.len); cpl->ctrl1 = htobe64(ctrl1); flitp = cpl + 1; if (checkwrap && (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx]) flitp = (void *)&eq->desc[0]; write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap); } if (txp->wr_type == 0) { txq->txpkts0_pkts += txp->npkt; txq->txpkts0_wrs++; } else { txq->txpkts1_pkts += txp->npkt; txq->txpkts1_wrs++; } txsd = &txq->sdesc[eq->pidx]; txsd->m = m0; txsd->desc_used = ndesc; return (ndesc); } /* * If the SGL ends on an address that is not 16 byte aligned, this function will * add a 0 filled flit at the end. */ static void write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap) { struct sge_eq *eq = &txq->eq; struct sglist *gl = txq->gl; struct sglist_seg *seg; __be64 *flitp, *wrap; struct ulptx_sgl *usgl; int i, nflits, nsegs; KASSERT(((uintptr_t)(*to) & 0xf) == 0, ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to)); MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); get_pkt_gl(m, gl); nsegs = gl->sg_nseg; MPASS(nsegs > 0); nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2; flitp = (__be64 *)(*to); wrap = (__be64 *)(&eq->desc[eq->sidx]); seg = &gl->sg_segs[0]; usgl = (void *)flitp; /* * We start at a 16 byte boundary somewhere inside the tx descriptor * ring, so we're at least 16 bytes away from the status page. There is * no chance of a wrap around in the middle of usgl (which is 16 bytes). */ usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); usgl->len0 = htobe32(seg->ss_len); usgl->addr0 = htobe64(seg->ss_paddr); seg++; if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) { /* Won't wrap around at all */ for (i = 0; i < nsegs - 1; i++, seg++) { usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); flitp += nflits; } else { /* Will wrap somewhere in the rest of the SGL */ /* 2 flits already written, write the rest flit by flit */ flitp = (void *)(usgl + 1); for (i = 0; i < nflits - 2; i++) { if (flitp == wrap) flitp = (void *)eq->desc; *flitp++ = get_flit(seg, nsegs - 1, i); } } if (nflits & 1) { MPASS(((uintptr_t)flitp) & 0xf); *flitp++ = 0; } MPASS((((uintptr_t)flitp) & 0xf) == 0); if (__predict_false(flitp == wrap)) *to = (void *)eq->desc; else *to = (void *)flitp; } static inline void copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len) { MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]); MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]); if (__predict_true((uintptr_t)(*to) + len <= (uintptr_t)&eq->desc[eq->sidx])) { bcopy(from, *to, len); (*to) += len; } else { int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to); bcopy(from, *to, portion); from += portion; portion = len - portion; /* remaining */ bcopy(from, (void *)eq->desc, portion); (*to) = (caddr_t)eq->desc + portion; } } static inline void ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n) { u_int db; MPASS(n > 0); db = eq->doorbells; if (n > 1) clrbit(&db, DOORBELL_WCWR); wmb(); switch (ffs(db) - 1) { case DOORBELL_UDB: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); break; case DOORBELL_WCWR: { volatile uint64_t *dst, *src; int i; /* * Queues whose 128B doorbell segment fits in the page do not * use relative qid (udb_qid is always 0). Only queues with * doorbell segments can do WCWR. */ KASSERT(eq->udb_qid == 0 && n == 1, ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p", __func__, eq->doorbells, n, eq->dbidx, eq)); dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET - UDBS_DB_OFFSET); i = eq->dbidx; src = (void *)&eq->desc[i]; while (src != (void *)&eq->desc[i + 1]) *dst++ = *src++; wmb(); break; } case DOORBELL_UDBWC: *eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n)); wmb(); break; case DOORBELL_KDB: t4_write_reg(sc, sc->sge_kdoorbell_reg, V_QID(eq->cntxt_id) | V_PIDX(n)); break; } IDXINCR(eq->dbidx, n, eq->sidx); } static inline u_int reclaimable_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx; hw_cidx = read_hw_cidx(eq); return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx)); } static inline u_int total_available_tx_desc(struct sge_eq *eq) { uint16_t hw_cidx, pidx; hw_cidx = read_hw_cidx(eq); pidx = eq->pidx; if (pidx == hw_cidx) return (eq->sidx - 1); else return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1); } static inline uint16_t read_hw_cidx(struct sge_eq *eq) { struct sge_qstat *spg = (void *)&eq->desc[eq->sidx]; uint16_t cidx = spg->cidx; /* stable snapshot */ return (be16toh(cidx)); } /* * Reclaim 'n' descriptors approximately. */ static u_int reclaim_tx_descs(struct sge_txq *txq, u_int n) { struct tx_sdesc *txsd; struct sge_eq *eq = &txq->eq; u_int can_reclaim, reclaimed; TXQ_LOCK_ASSERT_OWNED(txq); MPASS(n > 0); reclaimed = 0; can_reclaim = reclaimable_tx_desc(eq); while (can_reclaim && reclaimed < n) { int ndesc; struct mbuf *m, *nextpkt; txsd = &txq->sdesc[eq->cidx]; ndesc = txsd->desc_used; /* Firmware doesn't return "partial" credits. */ KASSERT(can_reclaim >= ndesc, ("%s: unexpected number of credits: %d, %d", __func__, can_reclaim, ndesc)); KASSERT(ndesc != 0, ("%s: descriptor with no credits: cidx %d", __func__, eq->cidx)); for (m = txsd->m; m != NULL; m = nextpkt) { nextpkt = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } reclaimed += ndesc; can_reclaim -= ndesc; IDXINCR(eq->cidx, ndesc, eq->sidx); } return (reclaimed); } static void tx_reclaim(void *arg, int n) { struct sge_txq *txq = arg; struct sge_eq *eq = &txq->eq; do { if (TXQ_TRYLOCK(txq) == 0) break; n = reclaim_tx_descs(txq, 32); if (eq->cidx == eq->pidx) eq->equeqidx = eq->pidx; TXQ_UNLOCK(txq); } while (n > 0); } static __be64 get_flit(struct sglist_seg *segs, int nsegs, int idx) { int i = (idx / 3) * 2; switch (idx % 3) { case 0: { uint64_t rc; rc = (uint64_t)segs[i].ss_len << 32; if (i + 1 < nsegs) rc |= (uint64_t)(segs[i + 1].ss_len); return (htobe64(rc)); } case 1: return (htobe64(segs[i].ss_paddr)); case 2: return (htobe64(segs[i + 1].ss_paddr)); } return (0); } static void find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp) { int8_t zidx, hwidx, idx; uint16_t region1, region3; int spare, spare_needed, n; struct sw_zone_info *swz; struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0]; /* * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize * large enough for the max payload and cluster metadata. Otherwise * settle for the largest bufsize that leaves enough room in the cluster * for metadata. * * Without buffer packing: Look for the smallest zone which has a * bufsize large enough for the max payload. Settle for the largest * bufsize available if there's nothing big enough for max payload. */ spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0; swz = &sc->sge.sw_zone_info[0]; hwidx = -1; for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) { if (swz->size > largest_rx_cluster) { if (__predict_true(hwidx != -1)) break; /* * This is a misconfiguration. largest_rx_cluster is * preventing us from finding a refill source. See * dev.t5nex..buffer_sizes to figure out why. */ device_printf(sc->dev, "largest_rx_cluster=%u leaves no" " refill source for fl %p (dma %u). Ignored.\n", largest_rx_cluster, fl, maxp); } for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (spare < spare_needed) continue; hwidx = idx; /* best option so far */ if (hwb->size >= maxp) { if ((fl->flags & FL_BUF_PACKING) == 0) goto done; /* stop looking (not packing) */ if (swz->size >= safest_rx_cluster) goto done; /* stop looking (packing) */ } break; /* keep looking, next zone */ } } done: /* A usable hwidx has been located. */ MPASS(hwidx != -1); hwb = &hwb_list[hwidx]; zidx = hwb->zidx; swz = &sc->sge.sw_zone_info[zidx]; region1 = 0; region3 = swz->size - hwb->size; /* * Stay within this zone and see if there is a better match when mbuf * inlining is allowed. Remember that the hwidx's are sorted in * decreasing order of size (so in increasing order of spare area). */ for (idx = hwidx; idx != -1; idx = hwb->next) { hwb = &hwb_list[idx]; spare = swz->size - hwb->size; if (allow_mbufs_in_cluster == 0 || hwb->size < maxp) break; /* * Do not inline mbufs if doing so would violate the pad/pack * boundary alignment requirement. */ if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0) continue; if (fl->flags & FL_BUF_PACKING && (MSIZE % sc->params.sge.pack_boundary) != 0) continue; if (spare < CL_METADATA_SIZE + MSIZE) continue; n = (spare - CL_METADATA_SIZE) / MSIZE; if (n > howmany(hwb->size, maxp)) break; hwidx = idx; if (fl->flags & FL_BUF_PACKING) { region1 = n * MSIZE; region3 = spare - region1; } else { region1 = MSIZE; region3 = spare - region1; break; } } KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES, ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp)); KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES, ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp)); KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 == sc->sge.sw_zone_info[zidx].size, ("%s: bad buffer layout for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); if (fl->flags & FL_BUF_PACKING || region1 > 0) { KASSERT(region3 >= CL_METADATA_SIZE, ("%s: no room for metadata. fl %p, maxp %d; " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); KASSERT(region1 % MSIZE == 0, ("%s: bad mbuf region for fl %p, maxp %d. " "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp, sc->sge.sw_zone_info[zidx].size, region1, sc->sge.hw_buf_info[hwidx].size, region3)); } fl->cll_def.zidx = zidx; fl->cll_def.hwidx = hwidx; fl->cll_def.region1 = region1; fl->cll_def.region3 = region3; } static void find_safe_refill_source(struct adapter *sc, struct sge_fl *fl) { struct sge *s = &sc->sge; struct hw_buf_info *hwb; struct sw_zone_info *swz; int spare; int8_t hwidx; if (fl->flags & FL_BUF_PACKING) hwidx = s->safe_hwidx2; /* with room for metadata */ else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) { hwidx = s->safe_hwidx2; hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; /* no good if there isn't room for an mbuf as well */ if (spare < CL_METADATA_SIZE + MSIZE) hwidx = s->safe_hwidx1; } else hwidx = s->safe_hwidx1; if (hwidx == -1) { /* No fallback source */ fl->cll_alt.hwidx = -1; fl->cll_alt.zidx = -1; return; } hwb = &s->hw_buf_info[hwidx]; swz = &s->sw_zone_info[hwb->zidx]; spare = swz->size - hwb->size; fl->cll_alt.hwidx = hwidx; fl->cll_alt.zidx = hwb->zidx; if (allow_mbufs_in_cluster && (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0)) fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE; else fl->cll_alt.region1 = 0; fl->cll_alt.region3 = spare - fl->cll_alt.region1; } static void add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl) { mtx_lock(&sc->sfl_lock); FL_LOCK(fl); if ((fl->flags & FL_DOOMED) == 0) { fl->flags |= FL_STARVING; TAILQ_INSERT_TAIL(&sc->sfl, fl, link); callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc); } FL_UNLOCK(fl); mtx_unlock(&sc->sfl_lock); } static void handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_wrq *wrq = (void *)eq; atomic_readandclear_int(&eq->equiq); taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task); } static void handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq) { struct sge_txq *txq = (void *)eq; MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH); atomic_readandclear_int(&eq->equiq); mp_ring_check_drainage(txq->r, 0); taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task); } static int handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1); unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid)); struct adapter *sc = iq->adapter; struct sge *s = &sc->sge; struct sge_eq *eq; static void (*h[])(struct adapter *, struct sge_eq *) = {NULL, &handle_wrq_egr_update, &handle_eth_egr_update, &handle_wrq_egr_update}; KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); eq = s->eqmap[qid - s->eq_start - s->eq_base]; (*h[eq->flags & EQ_TYPEMASK])(sc, eq); return (0); } /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */ CTASSERT(offsetof(struct cpl_fw4_msg, data) == \ offsetof(struct cpl_fw6_msg, data)); static int handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) { struct adapter *sc = iq->adapter; const struct cpl_fw6_msg *cpl = (const void *)(rss + 1); KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__, rss->opcode)); if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) { const struct rss_header *rss2; rss2 = (const struct rss_header *)&cpl->data[0]; return (t4_cpl_handler[rss2->opcode](iq, rss2, m)); } return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0])); } /** * t4_handle_wrerr_rpl - process a FW work request error message * @adap: the adapter * @rpl: start of the FW message */ static int t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl) { u8 opcode = *(const u8 *)rpl; const struct fw_error_cmd *e = (const void *)rpl; unsigned int i; if (opcode != FW_ERROR_CMD) { log(LOG_ERR, "%s: Received WRERR_RPL message with opcode %#x\n", device_get_nameunit(adap->dev), opcode); return (EINVAL); } log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev), G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" : "non-fatal"); switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) { case FW_ERROR_TYPE_EXCEPTION: log(LOG_ERR, "exception info:\n"); for (i = 0; i < nitems(e->u.exception.info); i++) log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ", be32toh(e->u.exception.info[i])); log(LOG_ERR, "\n"); break; case FW_ERROR_TYPE_HWMODULE: log(LOG_ERR, "HW module regaddr %08x regval %08x\n", be32toh(e->u.hwmodule.regaddr), be32toh(e->u.hwmodule.regval)); break; case FW_ERROR_TYPE_WR: log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n", be16toh(e->u.wr.cidx), G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)), G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)), be32toh(e->u.wr.eqid)); for (i = 0; i < nitems(e->u.wr.wrhdr); i++) log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ", e->u.wr.wrhdr[i]); log(LOG_ERR, "\n"); break; case FW_ERROR_TYPE_ACL: log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s", be16toh(e->u.acl.cidx), G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)), G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)), be32toh(e->u.acl.eqid), G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" : "MAC"); for (i = 0; i < nitems(e->u.acl.val); i++) log(LOG_ERR, " %02x", e->u.acl.val[i]); log(LOG_ERR, "\n"); break; default: log(LOG_ERR, "type %#x\n", G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))); return (EINVAL); } return (0); } static int sysctl_uint16(SYSCTL_HANDLER_ARGS) { uint16_t *id = arg1; int i = *id; return sysctl_handle_int(oidp, &i, 0, req); } static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS) { struct sge *s = arg1; struct hw_buf_info *hwb = &s->hw_buf_info[0]; struct sw_zone_info *swz = &s->sw_zone_info[0]; int i, rc; struct sbuf sb; char c; sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND); for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) { if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster) c = '*'; else c = '\0'; sbuf_printf(&sb, "%u%c ", hwb->size, c); } sbuf_trim(&sb); sbuf_finish(&sb); rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (rc); } #ifdef RATELIMIT /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. */ static inline u_int txpkt_eo_len16(u_int nsegs, u_int immhdrs, u_int tso) { u_int n; MPASS(immhdrs > 0); n = roundup2(sizeof(struct fw_eth_tx_eo_wr) + sizeof(struct cpl_tx_pkt_core) + immhdrs, 16); if (__predict_false(nsegs == 0)) goto done; nsegs--; /* first segment is part of ulptx_sgl */ n += sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); if (tso) n += sizeof(struct cpl_tx_pkt_lso_core); done: return (howmany(n, 16)); } #define ETID_FLOWC_NPARAMS 6 #define ETID_FLOWC_LEN (roundup2((sizeof(struct fw_flowc_wr) + \ ETID_FLOWC_NPARAMS * sizeof(struct fw_flowc_mnemval)), 16)) #define ETID_FLOWC_LEN16 (howmany(ETID_FLOWC_LEN, 16)) static int send_etid_flowc_wr(struct cxgbe_snd_tag *cst, struct port_info *pi, struct vi_info *vi) { struct wrq_cookie cookie; u_int pfvf = pi->adapter->pf << S_FW_VIID_PFN; struct fw_flowc_wr *flowc; mtx_assert(&cst->lock, MA_OWNED); MPASS((cst->flags & (EO_FLOWC_PENDING | EO_FLOWC_RPL_PENDING)) == EO_FLOWC_PENDING); flowc = start_wrq_wr(cst->eo_txq, ETID_FLOWC_LEN16, &cookie); if (__predict_false(flowc == NULL)) return (ENOMEM); bzero(flowc, ETID_FLOWC_LEN); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(ETID_FLOWC_NPARAMS) | V_FW_WR_COMPL(0)); flowc->flowid_len16 = htonl(V_FW_WR_LEN16(ETID_FLOWC_LEN16) | V_FW_WR_FLOWID(cst->etid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; flowc->mnemval[0].val = htobe32(pfvf); flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH; flowc->mnemval[1].val = htobe32(pi->tx_chan); flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT; flowc->mnemval[2].val = htobe32(pi->tx_chan); flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID; flowc->mnemval[3].val = htobe32(cst->iqid); flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_EOSTATE; flowc->mnemval[4].val = htobe32(FW_FLOWC_MNEM_EOSTATE_ESTABLISHED); flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; flowc->mnemval[5].val = htobe32(cst->schedcl); commit_wrq_wr(cst->eo_txq, flowc, &cookie); cst->flags &= ~EO_FLOWC_PENDING; cst->flags |= EO_FLOWC_RPL_PENDING; MPASS(cst->tx_credits >= ETID_FLOWC_LEN16); /* flowc is first WR. */ cst->tx_credits -= ETID_FLOWC_LEN16; return (0); } #define ETID_FLUSH_LEN16 (howmany(sizeof (struct fw_flowc_wr), 16)) void send_etid_flush_wr(struct cxgbe_snd_tag *cst) { struct fw_flowc_wr *flowc; struct wrq_cookie cookie; mtx_assert(&cst->lock, MA_OWNED); flowc = start_wrq_wr(cst->eo_txq, ETID_FLUSH_LEN16, &cookie); if (__predict_false(flowc == NULL)) CXGBE_UNIMPLEMENTED(__func__); bzero(flowc, ETID_FLUSH_LEN16 * 16); flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | V_FW_FLOWC_WR_NPARAMS(0) | F_FW_WR_COMPL); flowc->flowid_len16 = htobe32(V_FW_WR_LEN16(ETID_FLUSH_LEN16) | V_FW_WR_FLOWID(cst->etid)); commit_wrq_wr(cst->eo_txq, flowc, &cookie); cst->flags |= EO_FLUSH_RPL_PENDING; MPASS(cst->tx_credits >= ETID_FLUSH_LEN16); cst->tx_credits -= ETID_FLUSH_LEN16; cst->ncompl++; } static void write_ethofld_wr(struct cxgbe_snd_tag *cst, struct fw_eth_tx_eo_wr *wr, struct mbuf *m0, int compl) { struct cpl_tx_pkt_core *cpl; uint64_t ctrl1; uint32_t ctrl; /* used in many unrelated places */ int len16, pktlen, nsegs, immhdrs; caddr_t dst; uintptr_t p; struct ulptx_sgl *usgl; struct sglist sg; struct sglist_seg segs[38]; /* XXX: find real limit. XXX: get off the stack */ mtx_assert(&cst->lock, MA_OWNED); M_ASSERTPKTHDR(m0); KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && m0->m_pkthdr.l4hlen > 0, ("%s: ethofld mbuf %p is missing header lengths", __func__, m0)); len16 = mbuf_eo_len16(m0); nsegs = mbuf_eo_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); if (needs_tso(m0)) ctrl += sizeof(struct cpl_tx_pkt_lso_core); immhdrs = m0->m_pkthdr.l2hlen + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen; ctrl += immhdrs; wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_EO_WR) | V_FW_ETH_TX_EO_WR_IMMDLEN(ctrl) | V_FW_WR_COMPL(!!compl)); wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) | V_FW_WR_FLOWID(cst->etid)); wr->r3 = 0; if (needs_udp_csum(m0)) { wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG; wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen; wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen); wr->u.udpseg.udplen = m0->m_pkthdr.l4hlen; wr->u.udpseg.rtplen = 0; wr->u.udpseg.r4 = 0; wr->u.udpseg.mss = htobe16(pktlen - immhdrs); wr->u.udpseg.schedpktsize = wr->u.udpseg.mss; wr->u.udpseg.plen = htobe32(pktlen - immhdrs); cpl = (void *)(wr + 1); } else { MPASS(needs_tcp_csum(m0)); wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG; wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen; wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen); wr->u.tcpseg.tcplen = m0->m_pkthdr.l4hlen; wr->u.tcpseg.tsclk_tsoff = mbuf_eo_tsclk_tsoff(m0); wr->u.tcpseg.r4 = 0; wr->u.tcpseg.r5 = 0; wr->u.tcpseg.plen = htobe32(pktlen - immhdrs); if (needs_tso(m0)) { struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); wr->u.tcpseg.mss = htobe16(m0->m_pkthdr.tso_segsz); ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header)) ctrl |= V_LSO_ETHHDR_LEN(1); if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) ctrl |= F_LSO_IPV6; lso->lso_ctrl = htobe32(ctrl); lso->ipid_ofst = htobe16(0); lso->mss = htobe16(m0->m_pkthdr.tso_segsz); lso->seqno_offset = htobe32(0); lso->len = htobe32(pktlen); cpl = (void *)(lso + 1); } else { wr->u.tcpseg.mss = htobe16(0xffff); cpl = (void *)(wr + 1); } } /* Checksum offload must be requested for ethofld. */ ctrl1 = 0; MPASS(needs_l4_csum(m0)); /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); } /* CPL header */ cpl->ctrl0 = cst->ctrl0; cpl->pack = 0; cpl->len = htobe16(pktlen); cpl->ctrl1 = htobe64(ctrl1); /* Copy Ethernet, IP & TCP/UDP hdrs as immediate data */ p = (uintptr_t)(cpl + 1); m_copydata(m0, 0, immhdrs, (void *)p); /* SGL */ dst = (void *)(cpl + 1); if (nsegs > 0) { int i, pad; /* zero-pad upto next 16Byte boundary, if not 16Byte aligned */ p += immhdrs; pad = 16 - (immhdrs & 0xf); bzero((void *)p, pad); usgl = (void *)(p + pad); usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) | V_ULPTX_NSGE(nsegs)); sglist_init(&sg, nitems(segs), segs); for (; m0 != NULL; m0 = m0->m_next) { if (__predict_false(m0->m_len == 0)) continue; if (immhdrs >= m0->m_len) { immhdrs -= m0->m_len; continue; } sglist_append(&sg, mtod(m0, char *) + immhdrs, m0->m_len - immhdrs); immhdrs = 0; } MPASS(sg.sg_nseg == nsegs); /* * Zero pad last 8B in case the WR doesn't end on a 16B * boundary. */ *(uint64_t *)((char *)wr + len16 * 16 - 8) = 0; usgl->len0 = htobe32(segs[0].ss_len); usgl->addr0 = htobe64(segs[0].ss_paddr); for (i = 0; i < nsegs - 1; i++) { usgl->sge[i / 2].len[i & 1] = htobe32(segs[i + 1].ss_len); usgl->sge[i / 2].addr[i & 1] = htobe64(segs[i + 1].ss_paddr); } if (i & 1) usgl->sge[i / 2].len[1] = htobe32(0); } } static void ethofld_tx(struct cxgbe_snd_tag *cst) { struct mbuf *m; struct wrq_cookie cookie; int next_credits, compl; struct fw_eth_tx_eo_wr *wr; mtx_assert(&cst->lock, MA_OWNED); while ((m = mbufq_first(&cst->pending_tx)) != NULL) { M_ASSERTPKTHDR(m); /* How many len16 credits do we need to send this mbuf. */ next_credits = mbuf_eo_len16(m); MPASS(next_credits > 0); if (next_credits > cst->tx_credits) { /* * Tx will make progress eventually because there is at * least one outstanding fw4_ack that will return * credits and kick the tx. */ MPASS(cst->ncompl > 0); return; } wr = start_wrq_wr(cst->eo_txq, next_credits, &cookie); if (__predict_false(wr == NULL)) { /* XXX: wishful thinking, not a real assertion. */ MPASS(cst->ncompl > 0); return; } cst->tx_credits -= next_credits; cst->tx_nocompl += next_credits; compl = cst->ncompl == 0 || cst->tx_nocompl >= cst->tx_total / 2; ETHER_BPF_MTAP(cst->com.ifp, m); write_ethofld_wr(cst, wr, m, compl); commit_wrq_wr(cst->eo_txq, wr, &cookie); if (compl) { cst->ncompl++; cst->tx_nocompl = 0; } (void) mbufq_dequeue(&cst->pending_tx); + + /* + * Drop the mbuf's reference on the tag now rather + * than waiting until m_freem(). This ensures that + * cxgbe_snd_tag_free gets called when the inp drops + * its reference on the tag and there are no more + * mbufs in the pending_tx queue and can flush any + * pending requests. Otherwise if the last mbuf + * doesn't request a completion the etid will never be + * released. + */ + m->m_pkthdr.snd_tag = NULL; + m->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; + m_snd_tag_rele(&cst->com); + mbufq_enqueue(&cst->pending_fwack, m); } } int ethofld_transmit(struct ifnet *ifp, struct mbuf *m0) { struct cxgbe_snd_tag *cst; int rc; MPASS(m0->m_nextpkt == NULL); + MPASS(m0->m_pkthdr.csum_flags & CSUM_SND_TAG); MPASS(m0->m_pkthdr.snd_tag != NULL); cst = mst_to_cst(m0->m_pkthdr.snd_tag); mtx_lock(&cst->lock); MPASS(cst->flags & EO_SND_TAG_REF); if (__predict_false(cst->flags & EO_FLOWC_PENDING)) { struct vi_info *vi = ifp->if_softc; struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; const uint32_t rss_mask = vi->rss_size - 1; uint32_t rss_hash; cst->eo_txq = &sc->sge.ofld_txq[vi->first_ofld_txq]; if (M_HASHTYPE_ISHASH(m0)) rss_hash = m0->m_pkthdr.flowid; else rss_hash = arc4random(); /* We assume RSS hashing */ cst->iqid = vi->rss[rss_hash & rss_mask]; cst->eo_txq += rss_hash % vi->nofldtxq; rc = send_etid_flowc_wr(cst, pi, vi); if (rc != 0) goto done; } if (__predict_false(cst->plen + m0->m_pkthdr.len > eo_max_backlog)) { rc = ENOBUFS; goto done; } mbufq_enqueue(&cst->pending_tx, m0); cst->plen += m0->m_pkthdr.len; + /* + * Hold an extra reference on the tag while generating work + * requests to ensure that we don't try to free the tag during + * ethofld_tx() in case we are sending the final mbuf after + * the inp was freed. + */ + m_snd_tag_ref(&cst->com); ethofld_tx(cst); - rc = 0; + mtx_unlock(&cst->lock); + m_snd_tag_rele(&cst->com); + return (0); + done: mtx_unlock(&cst->lock); if (__predict_false(rc != 0)) m_freem(m0); return (rc); } static int ethofld_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0) { struct adapter *sc = iq->adapter; const struct cpl_fw4_ack *cpl = (const void *)(rss + 1); struct mbuf *m; u_int etid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl))); struct cxgbe_snd_tag *cst; uint8_t credits = cpl->credits; cst = lookup_etid(sc, etid); mtx_lock(&cst->lock); if (__predict_false(cst->flags & EO_FLOWC_RPL_PENDING)) { MPASS(credits >= ETID_FLOWC_LEN16); credits -= ETID_FLOWC_LEN16; cst->flags &= ~EO_FLOWC_RPL_PENDING; } KASSERT(cst->ncompl > 0, ("%s: etid %u (%p) wasn't expecting completion.", __func__, etid, cst)); cst->ncompl--; while (credits > 0) { m = mbufq_dequeue(&cst->pending_fwack); if (__predict_false(m == NULL)) { /* * The remaining credits are for the final flush that * was issued when the tag was freed by the kernel. */ MPASS((cst->flags & (EO_FLUSH_RPL_PENDING | EO_SND_TAG_REF)) == EO_FLUSH_RPL_PENDING); MPASS(credits == ETID_FLUSH_LEN16); MPASS(cst->tx_credits + cpl->credits == cst->tx_total); MPASS(cst->ncompl == 0); cst->flags &= ~EO_FLUSH_RPL_PENDING; cst->tx_credits += cpl->credits; -freetag: cxgbe_snd_tag_free_locked(cst); return (0); /* cst is gone. */ } KASSERT(m != NULL, ("%s: too many credits (%u, %u)", __func__, cpl->credits, credits)); KASSERT(credits >= mbuf_eo_len16(m), ("%s: too few credits (%u, %u, %u)", __func__, cpl->credits, credits, mbuf_eo_len16(m))); credits -= mbuf_eo_len16(m); cst->plen -= m->m_pkthdr.len; m_freem(m); } cst->tx_credits += cpl->credits; MPASS(cst->tx_credits <= cst->tx_total); - m = mbufq_first(&cst->pending_tx); - if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m)) - ethofld_tx(cst); - - if (__predict_false((cst->flags & EO_SND_TAG_REF) == 0) && - cst->ncompl == 0) { - if (cst->tx_credits == cst->tx_total) - goto freetag; - else { - MPASS((cst->flags & EO_FLUSH_RPL_PENDING) == 0); - send_etid_flush_wr(cst); - } + if (cst->flags & EO_SND_TAG_REF) { + /* + * As with ethofld_transmit(), hold an extra reference + * so that the tag is stable across ethold_tx(). + */ + m_snd_tag_ref(&cst->com); + m = mbufq_first(&cst->pending_tx); + if (m != NULL && cst->tx_credits >= mbuf_eo_len16(m)) + ethofld_tx(cst); + mtx_unlock(&cst->lock); + m_snd_tag_rele(&cst->com); + } else { + /* + * There shouldn't be any pending packets if the tag + * was freed by the kernel since any pending packet + * should hold a reference to the tag. + */ + MPASS(mbufq_first(&cst->pending_tx) == NULL); + mtx_unlock(&cst->lock); } - - mtx_unlock(&cst->lock); return (0); } #endif Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c =================================================================== --- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c (revision 348253) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c (revision 348254) @@ -1,4490 +1,4491 @@ /*- * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "en.h" #include #include #include #ifndef ETH_DRIVER_VERSION #define ETH_DRIVER_VERSION "3.5.1" #endif #define DRIVER_RELDATE "April 2019" static const char mlx5e_version[] = "mlx5en: Mellanox Ethernet driver " ETH_DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs); struct mlx5e_channel_param { struct mlx5e_rq_param rq; struct mlx5e_sq_param sq; struct mlx5e_cq_param rx_cq; struct mlx5e_cq_param tx_cq; }; struct media { u32 subtype; u64 baudrate; }; static const struct media mlx5e_mode_table[MLX5E_LINK_SPEEDS_NUMBER][MLX5E_LINK_MODES_NUMBER] = { [MLX5E_1000BASE_CX_SGMII][MLX5E_SGMII] = { .subtype = IFM_1000_CX_SGMII, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_1000BASE_KX][MLX5E_KX] = { .subtype = IFM_1000_KX, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_10GBASE_CX4][MLX5E_CX4] = { .subtype = IFM_10G_CX4, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_KX4][MLX5E_KX4] = { .subtype = IFM_10G_KX4, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_KR][MLX5E_KR] = { .subtype = IFM_10G_KR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_20GBASE_KR2][MLX5E_KR2] = { .subtype = IFM_20G_KR2, .baudrate = IF_Gbps(20ULL), }, [MLX5E_40GBASE_CR4][MLX5E_CR4] = { .subtype = IFM_40G_CR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_KR4][MLX5E_KR4] = { .subtype = IFM_40G_KR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_56GBASE_R4][MLX5E_R] = { .subtype = IFM_56G_R4, .baudrate = IF_Gbps(56ULL), }, [MLX5E_10GBASE_CR][MLX5E_CR1] = { .subtype = IFM_10G_CR1, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_SR][MLX5E_SR] = { .subtype = IFM_10G_SR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_ER_LR][MLX5E_ER] = { .subtype = IFM_10G_ER, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_ER_LR][MLX5E_LR] = { .subtype = IFM_10G_LR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_40GBASE_SR4][MLX5E_SR4] = { .subtype = IFM_40G_SR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_LR4_ER4][MLX5E_LR4] = { .subtype = IFM_40G_LR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_LR4_ER4][MLX5E_ER4] = { .subtype = IFM_40G_ER4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_100GBASE_CR4][MLX5E_CR4] = { .subtype = IFM_100G_CR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_SR4][MLX5E_SR4] = { .subtype = IFM_100G_SR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_KR4][MLX5E_KR4] = { .subtype = IFM_100G_KR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GBASE_LR4][MLX5E_LR4] = { .subtype = IFM_100G_LR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100BASE_TX][MLX5E_TX] = { .subtype = IFM_100_TX, .baudrate = IF_Mbps(100ULL), }, [MLX5E_1000BASE_T][MLX5E_T] = { .subtype = IFM_1000_T, .baudrate = IF_Mbps(1000ULL), }, [MLX5E_10GBASE_T][MLX5E_T] = { .subtype = IFM_10G_T, .baudrate = IF_Gbps(10ULL), }, [MLX5E_25GBASE_CR][MLX5E_CR] = { .subtype = IFM_25G_CR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GBASE_KR][MLX5E_KR] = { .subtype = IFM_25G_KR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GBASE_SR][MLX5E_SR] = { .subtype = IFM_25G_SR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_50GBASE_CR2][MLX5E_CR2] = { .subtype = IFM_50G_CR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GBASE_KR2][MLX5E_KR2] = { .subtype = IFM_50G_KR2, .baudrate = IF_Gbps(50ULL), }, }; static const struct media mlx5e_ext_mode_table[MLX5E_EXT_LINK_SPEEDS_NUMBER][MLX5E_LINK_MODES_NUMBER] = { [MLX5E_SGMII_100M][MLX5E_SGMII] = { .subtype = IFM_100_SGMII, .baudrate = IF_Mbps(100), }, [MLX5E_1000BASE_X_SGMII][MLX5E_KX] = { .subtype = IFM_1000_KX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_CX_SGMII] = { .subtype = IFM_1000_CX_SGMII, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_CX] = { .subtype = IFM_1000_CX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_LX] = { .subtype = IFM_1000_LX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_SX] = { .subtype = IFM_1000_SX, .baudrate = IF_Mbps(1000), }, [MLX5E_1000BASE_X_SGMII][MLX5E_T] = { .subtype = IFM_1000_T, .baudrate = IF_Mbps(1000), }, [MLX5E_5GBASE_R][MLX5E_T] = { .subtype = IFM_5000_T, .baudrate = IF_Mbps(5000), }, [MLX5E_5GBASE_R][MLX5E_KR] = { .subtype = IFM_5000_KR, .baudrate = IF_Mbps(5000), }, [MLX5E_5GBASE_R][MLX5E_KR1] = { .subtype = IFM_5000_KR1, .baudrate = IF_Mbps(5000), }, [MLX5E_5GBASE_R][MLX5E_KR_S] = { .subtype = IFM_5000_KR_S, .baudrate = IF_Mbps(5000), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_ER] = { .subtype = IFM_10G_ER, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_KR] = { .subtype = IFM_10G_KR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_LR] = { .subtype = IFM_10G_LR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_SR] = { .subtype = IFM_10G_SR, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_T] = { .subtype = IFM_10G_T, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_AOC] = { .subtype = IFM_10G_AOC, .baudrate = IF_Gbps(10ULL), }, [MLX5E_10GBASE_XFI_XAUI_1][MLX5E_CR1] = { .subtype = IFM_10G_CR1, .baudrate = IF_Gbps(10ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_CR4] = { .subtype = IFM_40G_CR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_KR4] = { .subtype = IFM_40G_KR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_LR4] = { .subtype = IFM_40G_LR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_SR4] = { .subtype = IFM_40G_SR4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_40GBASE_XLAUI_4_XLPPI_4][MLX5E_ER4] = { .subtype = IFM_40G_ER4, .baudrate = IF_Gbps(40ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR] = { .subtype = IFM_25G_CR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR] = { .subtype = IFM_25G_KR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_SR] = { .subtype = IFM_25G_SR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_ACC] = { .subtype = IFM_25G_ACC, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_AOC] = { .subtype = IFM_25G_AOC, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR1] = { .subtype = IFM_25G_CR1, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_CR_S] = { .subtype = IFM_25G_CR_S, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR1] = { .subtype = IFM_5000_KR1, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_KR_S] = { .subtype = IFM_25G_KR_S, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_LR] = { .subtype = IFM_25G_LR, .baudrate = IF_Gbps(25ULL), }, [MLX5E_25GAUI_1_25GBASE_CR_KR][MLX5E_T] = { .subtype = IFM_25G_T, .baudrate = IF_Gbps(25ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_CR2] = { .subtype = IFM_50G_CR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_KR2] = { .subtype = IFM_50G_KR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_SR2] = { .subtype = IFM_50G_SR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_2_LAUI_2_50GBASE_CR2_KR2][MLX5E_LR2] = { .subtype = IFM_50G_LR2, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_LR] = { .subtype = IFM_50G_LR, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_SR] = { .subtype = IFM_50G_SR, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_CP] = { .subtype = IFM_50G_CP, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_FR] = { .subtype = IFM_50G_FR, .baudrate = IF_Gbps(50ULL), }, [MLX5E_50GAUI_1_LAUI_1_50GBASE_CR_KR][MLX5E_KR_PAM4] = { .subtype = IFM_50G_KR_PAM4, .baudrate = IF_Gbps(50ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_CR4] = { .subtype = IFM_100G_CR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_KR4] = { .subtype = IFM_100G_KR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_LR4] = { .subtype = IFM_100G_LR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_CAUI_4_100GBASE_CR4_KR4][MLX5E_SR4] = { .subtype = IFM_100G_SR4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_SR2] = { .subtype = IFM_100G_SR2, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_CP2] = { .subtype = IFM_100G_CP2, .baudrate = IF_Gbps(100ULL), }, [MLX5E_100GAUI_2_100GBASE_CR2_KR2][MLX5E_KR2_PAM4] = { .subtype = IFM_100G_KR2_PAM4, .baudrate = IF_Gbps(100ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_DR4] = { .subtype = IFM_200G_DR4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_LR4] = { .subtype = IFM_200G_LR4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_SR4] = { .subtype = IFM_200G_SR4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_FR4] = { .subtype = IFM_200G_FR4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_CR4_PAM4] = { .subtype = IFM_200G_CR4_PAM4, .baudrate = IF_Gbps(200ULL), }, [MLX5E_200GAUI_4_200GBASE_CR4_KR4][MLX5E_KR4_PAM4] = { .subtype = IFM_200G_KR4_PAM4, .baudrate = IF_Gbps(200ULL), }, }; MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet"); static void mlx5e_update_carrier(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; u32 eth_proto_oper; int error; u8 port_state; u8 is_er_type; u8 i, j; bool ext; struct media media_entry = {}; port_state = mlx5_query_vport_state(mdev, MLX5_QUERY_VPORT_STATE_IN_OP_MOD_VNIC_VPORT, 0); if (port_state == VPORT_STATE_UP) { priv->media_status_last |= IFM_ACTIVE; } else { priv->media_status_last &= ~IFM_ACTIVE; priv->media_active_last = IFM_ETHER; if_link_state_change(priv->ifp, LINK_STATE_DOWN); return; } error = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); if (error) { priv->media_active_last = IFM_ETHER; priv->ifp->if_baudrate = 1; if_printf(priv->ifp, "%s: query port ptys failed: " "0x%x\n", __func__, error); return; } ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); eth_proto_oper = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_oper); i = ilog2(eth_proto_oper); for (j = 0; j != MLX5E_LINK_MODES_NUMBER; j++) { media_entry = ext ? mlx5e_ext_mode_table[i][j] : mlx5e_mode_table[i][j]; if (media_entry.baudrate != 0) break; } if (media_entry.subtype == 0) { if_printf(priv->ifp, "%s: Could not find operational " "media subtype\n", __func__); return; } switch (media_entry.subtype) { case IFM_10G_ER: error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type); if (error != 0) { if_printf(priv->ifp, "%s: query port pddr failed: %d\n", __func__, error); } if (error != 0 || is_er_type == 0) media_entry.subtype = IFM_10G_LR; break; case IFM_40G_LR4: error = mlx5_query_pddr_range_info(mdev, 1, &is_er_type); if (error != 0) { if_printf(priv->ifp, "%s: query port pddr failed: %d\n", __func__, error); } if (error == 0 && is_er_type != 0) media_entry.subtype = IFM_40G_ER4; break; } priv->media_active_last = media_entry.subtype | IFM_ETHER | IFM_FDX; priv->ifp->if_baudrate = media_entry.baudrate; if_link_state_change(priv->ifp, LINK_STATE_UP); } static void mlx5e_media_status(struct ifnet *dev, struct ifmediareq *ifmr) { struct mlx5e_priv *priv = dev->if_softc; ifmr->ifm_status = priv->media_status_last; ifmr->ifm_active = priv->media_active_last | (priv->params.rx_pauseframe_control ? IFM_ETH_RXPAUSE : 0) | (priv->params.tx_pauseframe_control ? IFM_ETH_TXPAUSE : 0); } static u32 mlx5e_find_link_mode(u32 subtype, bool ext) { u32 i; u32 j; u32 link_mode = 0; u32 speeds_num = 0; struct media media_entry = {}; switch (subtype) { case IFM_10G_LR: subtype = IFM_10G_ER; break; case IFM_40G_ER4: subtype = IFM_40G_LR4; break; } speeds_num = ext ? MLX5E_EXT_LINK_SPEEDS_NUMBER : MLX5E_LINK_SPEEDS_NUMBER; for (i = 0; i != speeds_num; i++) { for (j = 0; j < MLX5E_LINK_MODES_NUMBER ; ++j) { media_entry = ext ? mlx5e_ext_mode_table[i][j] : mlx5e_mode_table[i][j]; if (media_entry.baudrate == 0) continue; if (media_entry.subtype == subtype) { link_mode |= MLX5E_PROT_MASK(i); } } } return (link_mode); } static int mlx5e_set_port_pause_and_pfc(struct mlx5e_priv *priv) { return (mlx5_set_port_pause_and_pfc(priv->mdev, 1, priv->params.rx_pauseframe_control, priv->params.tx_pauseframe_control, priv->params.rx_priority_flow_control, priv->params.tx_priority_flow_control)); } static int mlx5e_set_port_pfc(struct mlx5e_priv *priv) { int error; if (priv->gone != 0) { error = -ENXIO; } else if (priv->params.rx_pauseframe_control || priv->params.tx_pauseframe_control) { if_printf(priv->ifp, "Global pauseframes must be disabled before " "enabling PFC.\n"); error = -EINVAL; } else { error = mlx5e_set_port_pause_and_pfc(priv); } return (error); } static int mlx5e_media_change(struct ifnet *dev) { struct mlx5e_priv *priv = dev->if_softc; struct mlx5_core_dev *mdev = priv->mdev; u32 eth_proto_cap; u32 link_mode; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; int was_opened; int locked; int error; bool ext; locked = PRIV_LOCKED(priv); if (!locked) PRIV_LOCK(priv); if (IFM_TYPE(priv->media.ifm_media) != IFM_ETHER) { error = EINVAL; goto done; } error = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1); if (error != 0) { if_printf(dev, "Query port media capability failed\n"); goto done; } ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); link_mode = mlx5e_find_link_mode(IFM_SUBTYPE(priv->media.ifm_media), ext); /* query supported capabilities */ eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_capability); /* check for autoselect */ if (IFM_SUBTYPE(priv->media.ifm_media) == IFM_AUTO) { link_mode = eth_proto_cap; if (link_mode == 0) { if_printf(dev, "Port media capability is zero\n"); error = EINVAL; goto done; } } else { link_mode = link_mode & eth_proto_cap; if (link_mode == 0) { if_printf(dev, "Not supported link mode requested\n"); error = EINVAL; goto done; } } if (priv->media.ifm_media & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) { /* check if PFC is enabled */ if (priv->params.rx_priority_flow_control || priv->params.tx_priority_flow_control) { if_printf(dev, "PFC must be disabled before enabling global pauseframes.\n"); error = EINVAL; goto done; } } /* update pauseframe control bits */ priv->params.rx_pauseframe_control = (priv->media.ifm_media & IFM_ETH_RXPAUSE) ? 1 : 0; priv->params.tx_pauseframe_control = (priv->media.ifm_media & IFM_ETH_TXPAUSE) ? 1 : 0; /* check if device is opened */ was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); /* reconfigure the hardware */ mlx5_set_port_status(mdev, MLX5_PORT_DOWN); mlx5_set_port_proto(mdev, link_mode, MLX5_PTYS_EN, ext); error = -mlx5e_set_port_pause_and_pfc(priv); if (was_opened) mlx5_set_port_status(mdev, MLX5_PORT_UP); done: if (!locked) PRIV_UNLOCK(priv); return (error); } static void mlx5e_update_carrier_work(struct work_struct *work) { struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, update_carrier_work); PRIV_LOCK(priv); if (test_bit(MLX5E_STATE_OPENED, &priv->state)) mlx5e_update_carrier(priv); PRIV_UNLOCK(priv); } #define MLX5E_PCIE_PERF_GET_64(a,b,c,d,e,f) \ s_debug->c = MLX5_GET64(mpcnt_reg, out, counter_set.f.c); #define MLX5E_PCIE_PERF_GET_32(a,b,c,d,e,f) \ s_debug->c = MLX5_GET(mpcnt_reg, out, counter_set.f.c); static void mlx5e_update_pcie_counters(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug; const unsigned sz = MLX5_ST_SZ_BYTES(mpcnt_reg); void *out; void *in; int err; /* allocate firmware request structures */ in = mlx5_vzalloc(sz); out = mlx5_vzalloc(sz); if (in == NULL || out == NULL) goto free_out; MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_PERFORMANCE_COUNTERS_64(MLX5E_PCIE_PERF_GET_64) MLX5E_PCIE_PERFORMANCE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_TIMERS_AND_STATES_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) MLX5_SET(mpcnt_reg, in, grp, MLX5_PCIE_LANE_COUNTERS_GROUP); err = mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_MPCNT, 0, 0); if (err != 0) goto free_out; MLX5E_PCIE_LANE_COUNTERS_32(MLX5E_PCIE_PERF_GET_32) free_out: /* free firmware request structures */ kvfree(in); kvfree(out); } /* * This function reads the physical port counters from the firmware * using a pre-defined layout defined by various MLX5E_PPORT_XXX() * macros. The output is converted from big-endian 64-bit values into * host endian ones and stored in the "priv->stats.pport" structure. */ static void mlx5e_update_pport_counters(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_pport_stats *s = &priv->stats.pport; struct mlx5e_port_stats_debug *s_debug = &priv->stats.port_stats_debug; u32 *in; u32 *out; const u64 *ptr; unsigned sz = MLX5_ST_SZ_BYTES(ppcnt_reg); unsigned x; unsigned y; unsigned z; /* allocate firmware request structures */ in = mlx5_vzalloc(sz); out = mlx5_vzalloc(sz); if (in == NULL || out == NULL) goto free_out; /* * Get pointer to the 64-bit counter set which is located at a * fixed offset in the output firmware request structure: */ ptr = (const uint64_t *)MLX5_ADDR_OF(ppcnt_reg, out, counter_set); MLX5_SET(ppcnt_reg, in, local_port, 1); /* read IEEE802_3 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_IEEE_802_3_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0, y = MLX5E_PPORT_PER_PRIO_STATS_NUM; x != MLX5E_PPORT_IEEE802_3_STATS_NUM; x++, y++) s->arg[y] = be64toh(ptr[x]); /* read RFC2819 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2819_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM; x++, y++) s->arg[y] = be64toh(ptr[x]); for (y = 0; x != MLX5E_PPORT_RFC2819_STATS_NUM + MLX5E_PPORT_RFC2819_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read RFC2863 counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_RFC_2863_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_RFC2863_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read physical layer stats counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_PHYSICAL_LAYER_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read Extended Ethernet counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_ETHERNET_EXTENDED_STATS_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); /* read Extended Statistical Group */ if (MLX5_CAP_GEN(mdev, pcam_reg) && MLX5_CAP_PCAM_FEATURE(mdev, ppcnt_statistical_group) && MLX5_CAP_PCAM_FEATURE(mdev, per_lane_error_counters)) { /* read Extended Statistical counter group using predefined counter layout */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PHYSICAL_LAYER_STATISTICAL_GROUP); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); for (x = 0; x != MLX5E_PPORT_STATISTICAL_DEBUG_NUM; x++, y++) s_debug->arg[y] = be64toh(ptr[x]); } /* read PCIE counters */ mlx5e_update_pcie_counters(priv); /* read per-priority counters */ MLX5_SET(ppcnt_reg, in, grp, MLX5_PER_PRIORITY_COUNTERS_GROUP); /* iterate all the priorities */ for (y = z = 0; z != MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO; z++) { MLX5_SET(ppcnt_reg, in, prio_tc, z); mlx5_core_access_reg(mdev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); /* read per priority stats counter group using predefined counter layout */ for (x = 0; x != (MLX5E_PPORT_PER_PRIO_STATS_NUM / MLX5E_PPORT_PER_PRIO_STATS_NUM_PRIO); x++, y++) s->arg[y] = be64toh(ptr[x]); } free_out: /* free firmware request structures */ kvfree(in); kvfree(out); } static void mlx5e_grp_vnic_env_update_stats(struct mlx5e_priv *priv) { u32 out[MLX5_ST_SZ_DW(query_vnic_env_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vnic_env_in)] = {}; if (!MLX5_CAP_GEN(priv->mdev, nic_receive_steering_discard)) return; MLX5_SET(query_vnic_env_in, in, opcode, MLX5_CMD_OP_QUERY_VNIC_ENV); MLX5_SET(query_vnic_env_in, in, op_mod, 0); MLX5_SET(query_vnic_env_in, in, other_vport, 0); if (mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)) != 0) return; priv->stats.vport.rx_steer_missed_packets = MLX5_GET64(query_vnic_env_out, out, vport_env.nic_receive_steering_discard); } /* * This function is called regularly to collect all statistics * counters from the firmware. The values can be viewed through the * sysctl interface. Execution is serialized using the priv's global * configuration lock. */ static void mlx5e_update_stats_locked(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_vport_stats *s = &priv->stats.vport; struct mlx5e_sq_stats *sq_stats; struct buf_ring *sq_br; #if (__FreeBSD_version < 1100000) struct ifnet *ifp = priv->ifp; #endif u32 in[MLX5_ST_SZ_DW(query_vport_counter_in)]; u32 *out; int outlen = MLX5_ST_SZ_BYTES(query_vport_counter_out); u64 tso_packets = 0; u64 tso_bytes = 0; u64 tx_queue_dropped = 0; u64 tx_defragged = 0; u64 tx_offload_none = 0; u64 lro_packets = 0; u64 lro_bytes = 0; u64 sw_lro_queued = 0; u64 sw_lro_flushed = 0; u64 rx_csum_none = 0; u64 rx_wqe_err = 0; u64 rx_packets = 0; u64 rx_bytes = 0; u32 rx_out_of_buffer = 0; int i; int j; out = mlx5_vzalloc(outlen); if (out == NULL) goto free_out; /* Collect firts the SW counters and then HW for consistency */ for (i = 0; i < priv->params.num_channels; i++) { struct mlx5e_channel *pch = priv->channel + i; struct mlx5e_rq *rq = &pch->rq; struct mlx5e_rq_stats *rq_stats = &pch->rq.stats; /* collect stats from LRO */ rq_stats->sw_lro_queued = rq->lro.lro_queued; rq_stats->sw_lro_flushed = rq->lro.lro_flushed; sw_lro_queued += rq_stats->sw_lro_queued; sw_lro_flushed += rq_stats->sw_lro_flushed; lro_packets += rq_stats->lro_packets; lro_bytes += rq_stats->lro_bytes; rx_csum_none += rq_stats->csum_none; rx_wqe_err += rq_stats->wqe_err; rx_packets += rq_stats->packets; rx_bytes += rq_stats->bytes; for (j = 0; j < priv->num_tc; j++) { sq_stats = &pch->sq[j].stats; sq_br = pch->sq[j].br; tso_packets += sq_stats->tso_packets; tso_bytes += sq_stats->tso_bytes; tx_queue_dropped += sq_stats->dropped; if (sq_br != NULL) tx_queue_dropped += sq_br->br_drops; tx_defragged += sq_stats->defragged; tx_offload_none += sq_stats->csum_offload_none; } } /* update counters */ s->tso_packets = tso_packets; s->tso_bytes = tso_bytes; s->tx_queue_dropped = tx_queue_dropped; s->tx_defragged = tx_defragged; s->lro_packets = lro_packets; s->lro_bytes = lro_bytes; s->sw_lro_queued = sw_lro_queued; s->sw_lro_flushed = sw_lro_flushed; s->rx_csum_none = rx_csum_none; s->rx_wqe_err = rx_wqe_err; s->rx_packets = rx_packets; s->rx_bytes = rx_bytes; mlx5e_grp_vnic_env_update_stats(priv); /* HW counters */ memset(in, 0, sizeof(in)); MLX5_SET(query_vport_counter_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_COUNTER); MLX5_SET(query_vport_counter_in, in, op_mod, 0); MLX5_SET(query_vport_counter_in, in, other_vport, 0); memset(out, 0, outlen); /* get number of out-of-buffer drops first */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0 && mlx5_vport_query_out_of_rx_buffer(mdev, priv->counter_set_id, &rx_out_of_buffer) == 0) { s->rx_out_of_buffer = rx_out_of_buffer; } /* get port statistics */ if (mlx5_cmd_exec(mdev, in, sizeof(in), out, outlen) == 0) { #define MLX5_GET_CTR(out, x) \ MLX5_GET64(query_vport_counter_out, out, x) s->rx_error_packets = MLX5_GET_CTR(out, received_errors.packets); s->rx_error_bytes = MLX5_GET_CTR(out, received_errors.octets); s->tx_error_packets = MLX5_GET_CTR(out, transmit_errors.packets); s->tx_error_bytes = MLX5_GET_CTR(out, transmit_errors.octets); s->rx_unicast_packets = MLX5_GET_CTR(out, received_eth_unicast.packets); s->rx_unicast_bytes = MLX5_GET_CTR(out, received_eth_unicast.octets); s->tx_unicast_packets = MLX5_GET_CTR(out, transmitted_eth_unicast.packets); s->tx_unicast_bytes = MLX5_GET_CTR(out, transmitted_eth_unicast.octets); s->rx_multicast_packets = MLX5_GET_CTR(out, received_eth_multicast.packets); s->rx_multicast_bytes = MLX5_GET_CTR(out, received_eth_multicast.octets); s->tx_multicast_packets = MLX5_GET_CTR(out, transmitted_eth_multicast.packets); s->tx_multicast_bytes = MLX5_GET_CTR(out, transmitted_eth_multicast.octets); s->rx_broadcast_packets = MLX5_GET_CTR(out, received_eth_broadcast.packets); s->rx_broadcast_bytes = MLX5_GET_CTR(out, received_eth_broadcast.octets); s->tx_broadcast_packets = MLX5_GET_CTR(out, transmitted_eth_broadcast.packets); s->tx_broadcast_bytes = MLX5_GET_CTR(out, transmitted_eth_broadcast.octets); s->tx_packets = s->tx_unicast_packets + s->tx_multicast_packets + s->tx_broadcast_packets; s->tx_bytes = s->tx_unicast_bytes + s->tx_multicast_bytes + s->tx_broadcast_bytes; /* Update calculated offload counters */ s->tx_csum_offload = s->tx_packets - tx_offload_none; s->rx_csum_good = s->rx_packets - s->rx_csum_none; } /* Get physical port counters */ mlx5e_update_pport_counters(priv); s->tx_jumbo_packets = priv->stats.port_stats_debug.tx_stat_p1519to2047octets + priv->stats.port_stats_debug.tx_stat_p2048to4095octets + priv->stats.port_stats_debug.tx_stat_p4096to8191octets + priv->stats.port_stats_debug.tx_stat_p8192to10239octets; #if (__FreeBSD_version < 1100000) /* no get_counters interface in fbsd 10 */ ifp->if_ipackets = s->rx_packets; ifp->if_ierrors = priv->stats.pport.in_range_len_errors + priv->stats.pport.out_of_range_len + priv->stats.pport.too_long_errors + priv->stats.pport.check_seq_err + priv->stats.pport.alignment_err; ifp->if_iqdrops = s->rx_out_of_buffer; ifp->if_opackets = s->tx_packets; ifp->if_oerrors = priv->stats.port_stats_debug.out_discards; ifp->if_snd.ifq_drops = s->tx_queue_dropped; ifp->if_ibytes = s->rx_bytes; ifp->if_obytes = s->tx_bytes; ifp->if_collisions = priv->stats.pport.collisions; #endif free_out: kvfree(out); /* Update diagnostics, if any */ if (priv->params_ethtool.diag_pci_enable || priv->params_ethtool.diag_general_enable) { int error = mlx5_core_get_diagnostics_full(mdev, priv->params_ethtool.diag_pci_enable ? &priv->params_pci : NULL, priv->params_ethtool.diag_general_enable ? &priv->params_general : NULL); if (error != 0) if_printf(priv->ifp, "Failed reading diagnostics: %d\n", error); } } static void mlx5e_update_stats_work(struct work_struct *work) { struct mlx5e_priv *priv; priv = container_of(work, struct mlx5e_priv, update_stats_work); PRIV_LOCK(priv); if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) mlx5e_update_stats_locked(priv); PRIV_UNLOCK(priv); } static void mlx5e_update_stats(void *arg) { struct mlx5e_priv *priv = arg; queue_work(priv->wq, &priv->update_stats_work); callout_reset(&priv->watchdog, hz, &mlx5e_update_stats, priv); } static void mlx5e_async_event_sub(struct mlx5e_priv *priv, enum mlx5_dev_event event) { switch (event) { case MLX5_DEV_EVENT_PORT_UP: case MLX5_DEV_EVENT_PORT_DOWN: queue_work(priv->wq, &priv->update_carrier_work); break; default: break; } } static void mlx5e_async_event(struct mlx5_core_dev *mdev, void *vpriv, enum mlx5_dev_event event, unsigned long param) { struct mlx5e_priv *priv = vpriv; mtx_lock(&priv->async_events_mtx); if (test_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state)) mlx5e_async_event_sub(priv, event); mtx_unlock(&priv->async_events_mtx); } static void mlx5e_enable_async_events(struct mlx5e_priv *priv) { set_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state); } static void mlx5e_disable_async_events(struct mlx5e_priv *priv) { mtx_lock(&priv->async_events_mtx); clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLE, &priv->state); mtx_unlock(&priv->async_events_mtx); } static void mlx5e_calibration_callout(void *arg); static int mlx5e_calibration_duration = 20; static int mlx5e_fast_calibration = 1; static int mlx5e_normal_calibration = 30; static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW, 0, "MLX5 timestamp calibration parameteres"); SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN, &mlx5e_calibration_duration, 0, "Duration of initial calibration"); SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN, &mlx5e_fast_calibration, 0, "Recalibration interval during initial calibration"); SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN, &mlx5e_normal_calibration, 0, "Recalibration interval during normal operations"); /* * Ignites the calibration process. */ static void mlx5e_reset_calibration_callout(struct mlx5e_priv *priv) { if (priv->clbr_done == 0) mlx5e_calibration_callout(priv); else callout_reset_curcpu(&priv->tstmp_clbr, (priv->clbr_done < mlx5e_calibration_duration ? mlx5e_fast_calibration : mlx5e_normal_calibration) * hz, mlx5e_calibration_callout, priv); } static uint64_t mlx5e_timespec2usec(const struct timespec *ts) { return ((uint64_t)ts->tv_sec * 1000000000 + ts->tv_nsec); } static uint64_t mlx5e_hw_clock(struct mlx5e_priv *priv) { struct mlx5_init_seg *iseg; uint32_t hw_h, hw_h1, hw_l; iseg = priv->mdev->iseg; do { hw_h = ioread32be(&iseg->internal_timer_h); hw_l = ioread32be(&iseg->internal_timer_l); hw_h1 = ioread32be(&iseg->internal_timer_h); } while (hw_h1 != hw_h); return (((uint64_t)hw_h << 32) | hw_l); } /* * The calibration callout, it runs either in the context of the * thread which enables calibration, or in callout. It takes the * snapshot of system and adapter clocks, then advances the pointers to * the calibration point to allow rx path to read the consistent data * lockless. */ static void mlx5e_calibration_callout(void *arg) { struct mlx5e_priv *priv; struct mlx5e_clbr_point *next, *curr; struct timespec ts; int clbr_curr_next; priv = arg; curr = &priv->clbr_points[priv->clbr_curr]; clbr_curr_next = priv->clbr_curr + 1; if (clbr_curr_next >= nitems(priv->clbr_points)) clbr_curr_next = 0; next = &priv->clbr_points[clbr_curr_next]; next->base_prev = curr->base_curr; next->clbr_hw_prev = curr->clbr_hw_curr; next->clbr_hw_curr = mlx5e_hw_clock(priv); if (((next->clbr_hw_curr - curr->clbr_hw_curr) >> MLX5E_TSTMP_PREC) == 0) { if (priv->clbr_done != 0) { if_printf(priv->ifp, "HW failed tstmp frozen %#jx %#jx," "disabling\n", next->clbr_hw_curr, curr->clbr_hw_prev); priv->clbr_done = 0; } atomic_store_rel_int(&curr->clbr_gen, 0); return; } nanouptime(&ts); next->base_curr = mlx5e_timespec2usec(&ts); curr->clbr_gen = 0; atomic_thread_fence_rel(); priv->clbr_curr = clbr_curr_next; atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen)); if (priv->clbr_done < mlx5e_calibration_duration) priv->clbr_done++; mlx5e_reset_calibration_callout(priv); } static const char *mlx5e_rq_stats_desc[] = { MLX5E_RQ_STATS(MLX5E_STATS_DESC) }; static int mlx5e_create_rq(struct mlx5e_channel *c, struct mlx5e_rq_param *param, struct mlx5e_rq *rq) { struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; char buffer[16]; void *rqc = param->rqc; void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq); int wq_sz; int err; int i; u32 nsegs, wqe_sz; err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs); if (err != 0) goto done; /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ nsegs * MLX5E_MAX_RX_BYTES, /* maxsize */ nsegs, /* nsegments */ nsegs * MLX5E_MAX_RX_BYTES, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &rq->dma_tag))) goto done; err = mlx5_wq_ll_create(mdev, ¶m->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); if (err) goto err_free_dma_tag; rq->wq.db = &rq->wq.db[MLX5_RCV_DBR]; err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs); if (err != 0) goto err_rq_wq_destroy; wq_sz = mlx5_wq_ll_get_size(&rq->wq); - err = -tcp_lro_init_args(&rq->lro, c->tag.m_snd_tag.ifp, TCP_LRO_ENTRIES, wq_sz); + err = -tcp_lro_init_args(&rq->lro, priv->ifp, TCP_LRO_ENTRIES, wq_sz); if (err) goto err_rq_wq_destroy; rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO); for (i = 0; i != wq_sz; i++) { struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i); int j; err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map); if (err != 0) { while (i--) bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map); goto err_rq_mbuf_free; } /* set value for constant fields */ for (j = 0; j < rq->nsegs; j++) wqe->data[j].lkey = c->mkey_be; } INIT_WORK(&rq->dim.work, mlx5e_dim_work); if (priv->params.rx_cq_moderation_mode < 2) { rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; } else { void *cqc = container_of(param, struct mlx5e_channel_param, rq)->rx_cq.cqc; switch (MLX5_GET(cqc, cqc, cq_period_mode)) { case MLX5_CQ_PERIOD_MODE_START_FROM_EQE: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; break; case MLX5_CQ_PERIOD_MODE_START_FROM_CQE: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE; break; default: rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; break; } } - rq->ifp = c->tag.m_snd_tag.ifp; + rq->ifp = priv->ifp; rq->channel = c; rq->ix = c->ix; snprintf(buffer, sizeof(buffer), "rxstat%d", c->ix); mlx5e_create_stats(&rq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), buffer, mlx5e_rq_stats_desc, MLX5E_RQ_STATS_NUM, rq->stats.arg); return (0); err_rq_mbuf_free: free(rq->mbuf, M_MLX5EN); tcp_lro_free(&rq->lro); err_rq_wq_destroy: mlx5_wq_destroy(&rq->wq_ctrl); err_free_dma_tag: bus_dma_tag_destroy(rq->dma_tag); done: return (err); } static void mlx5e_destroy_rq(struct mlx5e_rq *rq) { int wq_sz; int i; /* destroy all sysctl nodes */ sysctl_ctx_free(&rq->stats.ctx); /* free leftover LRO packets, if any */ tcp_lro_free(&rq->lro); wq_sz = mlx5_wq_ll_get_size(&rq->wq); for (i = 0; i != wq_sz; i++) { if (rq->mbuf[i].mbuf != NULL) { bus_dmamap_unload(rq->dma_tag, rq->mbuf[i].dma_map); m_freem(rq->mbuf[i].mbuf); } bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map); } free(rq->mbuf, M_MLX5EN); mlx5_wq_destroy(&rq->wq_ctrl); } static int mlx5e_enable_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; void *in; void *rqc; void *wq; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(create_rq_in) + sizeof(u64) * rq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); wq = MLX5_ADDR_OF(rqc, rqc, wq); memcpy(rqc, param->rqc, sizeof(param->rqc)); MLX5_SET(rqc, rqc, cqn, c->rq.cq.mcq.cqn); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); MLX5_SET(rqc, rqc, flush_in_error_en, 1); if (priv->counter_set_id >= 0) MLX5_SET(rqc, rqc, counter_set_id, priv->counter_set_id); MLX5_SET(wq, wq, log_wq_pg_sz, rq->wq_ctrl.buf.page_shift - PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, rq->wq_ctrl.db.dma); mlx5_fill_page_array(&rq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(wq, wq, pas)); err = mlx5_core_create_rq(mdev, in, inlen, &rq->rqn); kvfree(in); return (err); } static int mlx5e_modify_rq(struct mlx5e_rq *rq, int curr_state, int next_state) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; void *in; void *rqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_rq_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(modify_rq_in, in, rqn, rq->rqn); MLX5_SET(modify_rq_in, in, rq_state, curr_state); MLX5_SET(rqc, rqc, state, next_state); err = mlx5_core_modify_rq(mdev, in, inlen); kvfree(in); return (err); } static void mlx5e_disable_rq(struct mlx5e_rq *rq) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; mlx5_core_destroy_rq(mdev, rq->rqn); } static int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq) { struct mlx5e_channel *c = rq->channel; struct mlx5e_priv *priv = c->priv; struct mlx5_wq_ll *wq = &rq->wq; int i; for (i = 0; i < 1000; i++) { if (wq->cur_sz >= priv->params.min_rx_wqes) return (0); msleep(4); } return (-ETIMEDOUT); } static int mlx5e_open_rq(struct mlx5e_channel *c, struct mlx5e_rq_param *param, struct mlx5e_rq *rq) { int err; err = mlx5e_create_rq(c, param, rq); if (err) return (err); err = mlx5e_enable_rq(rq, param); if (err) goto err_destroy_rq; err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); if (err) goto err_disable_rq; c->rq.enabled = 1; return (0); err_disable_rq: mlx5e_disable_rq(rq); err_destroy_rq: mlx5e_destroy_rq(rq); return (err); } static void mlx5e_close_rq(struct mlx5e_rq *rq) { mtx_lock(&rq->mtx); rq->enabled = 0; callout_stop(&rq->watchdog); mtx_unlock(&rq->mtx); callout_drain(&rq->watchdog); mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); } static void mlx5e_close_rq_wait(struct mlx5e_rq *rq) { mlx5e_disable_rq(rq); mlx5e_close_cq(&rq->cq); cancel_work_sync(&rq->dim.work); mlx5e_destroy_rq(rq); } void mlx5e_free_sq_db(struct mlx5e_sq *sq) { int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); int x; for (x = 0; x != wq_sz; x++) { if (sq->mbuf[x].mbuf != NULL) { bus_dmamap_unload(sq->dma_tag, sq->mbuf[x].dma_map); m_freem(sq->mbuf[x].mbuf); } bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map); } free(sq->mbuf, M_MLX5EN); } int mlx5e_alloc_sq_db(struct mlx5e_sq *sq) { int wq_sz = mlx5_wq_cyc_get_size(&sq->wq); int err; int x; sq->mbuf = malloc(wq_sz * sizeof(sq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO); /* Create DMA descriptor MAPs */ for (x = 0; x != wq_sz; x++) { err = -bus_dmamap_create(sq->dma_tag, 0, &sq->mbuf[x].dma_map); if (err != 0) { while (x--) bus_dmamap_destroy(sq->dma_tag, sq->mbuf[x].dma_map); free(sq->mbuf, M_MLX5EN); return (err); } } return (0); } static const char *mlx5e_sq_stats_desc[] = { MLX5E_SQ_STATS(MLX5E_STATS_DESC) }; void mlx5e_update_sq_inline(struct mlx5e_sq *sq) { sq->max_inline = sq->priv->params.tx_max_inline; sq->min_inline_mode = sq->priv->params.tx_min_inline_mode; /* * Check if trust state is DSCP or if inline mode is NONE which * indicates CX-5 or newer hardware. */ if (sq->priv->params_ethtool.trust_state != MLX5_QPTS_TRUST_PCP || sq->min_inline_mode == MLX5_INLINE_MODE_NONE) { if (MLX5_CAP_ETH(sq->priv->mdev, wqe_vlan_insert)) sq->min_insert_caps = MLX5E_INSERT_VLAN | MLX5E_INSERT_NON_VLAN; else sq->min_insert_caps = MLX5E_INSERT_NON_VLAN; } else { sq->min_insert_caps = 0; } } static void mlx5e_refresh_sq_inline_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c) { int i; for (i = 0; i != c->num_tc; i++) { mtx_lock(&c->sq[i].lock); mlx5e_update_sq_inline(&c->sq[i]); mtx_unlock(&c->sq[i].lock); } } void mlx5e_refresh_sq_inline(struct mlx5e_priv *priv) { int i; /* check if channels are closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) mlx5e_refresh_sq_inline_sub(priv, &priv->channel[i]); } static int mlx5e_create_sq(struct mlx5e_channel *c, int tc, struct mlx5e_sq_param *param, struct mlx5e_sq *sq) { struct mlx5e_priv *priv = c->priv; struct mlx5_core_dev *mdev = priv->mdev; char buffer[16]; void *sqc = param->sqc; void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq); int err; /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */ MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */ MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &sq->dma_tag))) goto done; err = mlx5_alloc_map_uar(mdev, &sq->uar); if (err) goto err_free_dma_tag; err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) goto err_unmap_free_uar; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2; err = mlx5e_alloc_sq_db(sq); if (err) goto err_sq_wq_destroy; sq->mkey_be = c->mkey_be; sq->ifp = priv->ifp; sq->priv = priv; sq->tc = tc; mlx5e_update_sq_inline(sq); snprintf(buffer, sizeof(buffer), "txstat%dtc%d", c->ix, tc); mlx5e_create_stats(&sq->stats.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), buffer, mlx5e_sq_stats_desc, MLX5E_SQ_STATS_NUM, sq->stats.arg); return (0); err_sq_wq_destroy: mlx5_wq_destroy(&sq->wq_ctrl); err_unmap_free_uar: mlx5_unmap_free_uar(mdev, &sq->uar); err_free_dma_tag: bus_dma_tag_destroy(sq->dma_tag); done: return (err); } static void mlx5e_destroy_sq(struct mlx5e_sq *sq) { /* destroy all sysctl nodes */ sysctl_ctx_free(&sq->stats.ctx); mlx5e_free_sq_db(sq); mlx5_wq_destroy(&sq->wq_ctrl); mlx5_unmap_free_uar(sq->priv->mdev, &sq->uar); } int mlx5e_enable_sq(struct mlx5e_sq *sq, struct mlx5e_sq_param *param, int tis_num) { void *in; void *sqc; void *wq; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * sq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); wq = MLX5_ADDR_OF(sqc, sqc, wq); memcpy(sqc, param->sqc, sizeof(param->sqc)); MLX5_SET(sqc, sqc, tis_num_0, tis_num); MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, tis_lst_sz, 1); MLX5_SET(sqc, sqc, flush_in_error_en, 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, uar_page, sq->uar.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); mlx5_fill_page_array(&sq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(wq, wq, pas)); err = mlx5_core_create_sq(sq->priv->mdev, in, inlen, &sq->sqn); kvfree(in); return (err); } int mlx5e_modify_sq(struct mlx5e_sq *sq, int curr_state, int next_state) { void *in; void *sqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_sq_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); MLX5_SET(modify_sq_in, in, sqn, sq->sqn); MLX5_SET(modify_sq_in, in, sq_state, curr_state); MLX5_SET(sqc, sqc, state, next_state); err = mlx5_core_modify_sq(sq->priv->mdev, in, inlen); kvfree(in); return (err); } void mlx5e_disable_sq(struct mlx5e_sq *sq) { mlx5_core_destroy_sq(sq->priv->mdev, sq->sqn); } static int mlx5e_open_sq(struct mlx5e_channel *c, int tc, struct mlx5e_sq_param *param, struct mlx5e_sq *sq) { int err; err = mlx5e_create_sq(c, tc, param, sq); if (err) return (err); err = mlx5e_enable_sq(sq, param, c->priv->tisn[tc]); if (err) goto err_destroy_sq; err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); if (err) goto err_disable_sq; WRITE_ONCE(sq->running, 1); return (0); err_disable_sq: mlx5e_disable_sq(sq); err_destroy_sq: mlx5e_destroy_sq(sq); return (err); } static void mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep) { /* fill up remainder with NOPs */ while (sq->cev_counter != 0) { while (!mlx5e_sq_has_room_for(sq, 1)) { if (can_sleep != 0) { mtx_unlock(&sq->lock); msleep(4); mtx_lock(&sq->lock); } else { goto done; } } /* send a single NOP */ mlx5e_send_nop(sq, 1); atomic_thread_fence_rel(); } done: /* Check if we need to write the doorbell */ if (likely(sq->doorbell.d64 != 0)) { mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); sq->doorbell.d64 = 0; } } void mlx5e_sq_cev_timeout(void *arg) { struct mlx5e_sq *sq = arg; mtx_assert(&sq->lock, MA_OWNED); /* check next state */ switch (sq->cev_next_state) { case MLX5E_CEV_STATE_SEND_NOPS: /* fill TX ring with NOPs, if any */ mlx5e_sq_send_nops_locked(sq, 0); /* check if completed */ if (sq->cev_counter == 0) { sq->cev_next_state = MLX5E_CEV_STATE_INITIAL; return; } break; default: /* send NOPs on next timeout */ sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS; break; } /* restart timer */ callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq); } void mlx5e_drain_sq(struct mlx5e_sq *sq) { int error; struct mlx5_core_dev *mdev= sq->priv->mdev; /* * Check if already stopped. * * NOTE: Serialization of this function is managed by the * caller ensuring the priv's state lock is locked or in case * of rate limit support, a single thread manages drain and * resume of SQs. The "running" variable can therefore safely * be read without any locks. */ if (READ_ONCE(sq->running) == 0) return; /* don't put more packets into the SQ */ WRITE_ONCE(sq->running, 0); /* serialize access to DMA rings */ mtx_lock(&sq->lock); /* teardown event factor timer, if any */ sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS; callout_stop(&sq->cev_callout); /* send dummy NOPs in order to flush the transmit ring */ mlx5e_sq_send_nops_locked(sq, 1); mtx_unlock(&sq->lock); /* make sure it is safe to free the callout */ callout_drain(&sq->cev_callout); /* wait till SQ is empty or link is down */ mtx_lock(&sq->lock); while (sq->cc != sq->pc && (sq->priv->media_status_last & IFM_ACTIVE) != 0 && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { mtx_unlock(&sq->lock); msleep(1); sq->cq.mcq.comp(&sq->cq.mcq); mtx_lock(&sq->lock); } mtx_unlock(&sq->lock); /* error out remaining requests */ error = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR); if (error != 0) { if_printf(sq->ifp, "mlx5e_modify_sq() from RDY to ERR failed: %d\n", error); } /* wait till SQ is empty */ mtx_lock(&sq->lock); while (sq->cc != sq->pc && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { mtx_unlock(&sq->lock); msleep(1); sq->cq.mcq.comp(&sq->cq.mcq); mtx_lock(&sq->lock); } mtx_unlock(&sq->lock); } static void mlx5e_close_sq_wait(struct mlx5e_sq *sq) { mlx5e_drain_sq(sq); mlx5e_disable_sq(sq); mlx5e_destroy_sq(sq); } static int mlx5e_create_cq(struct mlx5e_priv *priv, struct mlx5e_cq_param *param, struct mlx5e_cq *cq, mlx5e_cq_comp_t *comp, int eq_ix) { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5_core_cq *mcq = &cq->mcq; int eqn_not_used; int irqn; int err; u32 i; param->wq.buf_numa_node = 0; param->wq.db_numa_node = 0; err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, &cq->wq_ctrl); if (err) return (err); mlx5_vector2eqn(mdev, eq_ix, &eqn_not_used, &irqn); mcq->cqe_sz = 64; mcq->set_ci_db = cq->wq_ctrl.db.db; mcq->arm_db = cq->wq_ctrl.db.db + 1; *mcq->set_ci_db = 0; *mcq->arm_db = 0; mcq->vector = eq_ix; mcq->comp = comp; mcq->event = mlx5e_cq_error_event; mcq->irqn = irqn; mcq->uar = &priv->cq_uar; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); cqe->op_own = 0xf1; } cq->priv = priv; return (0); } static void mlx5e_destroy_cq(struct mlx5e_cq *cq) { mlx5_wq_destroy(&cq->wq_ctrl); } static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param, int eq_ix) { struct mlx5_core_cq *mcq = &cq->mcq; void *in; void *cqc; int inlen; int irqn_not_used; int eqn; int err; inlen = MLX5_ST_SZ_BYTES(create_cq_in) + sizeof(u64) * cq->wq_ctrl.buf.npages; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); memcpy(cqc, param->cqc, sizeof(param->cqc)); mlx5_fill_page_array(&cq->wq_ctrl.buf, (__be64 *) MLX5_ADDR_OF(create_cq_in, in, pas)); mlx5_vector2eqn(cq->priv->mdev, eq_ix, &eqn, &irqn_not_used); MLX5_SET(cqc, cqc, c_eqn, eqn); MLX5_SET(cqc, cqc, uar_page, mcq->uar->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); err = mlx5_core_create_cq(cq->priv->mdev, mcq, in, inlen); kvfree(in); if (err) return (err); mlx5e_cq_arm(cq, MLX5_GET_DOORBELL_LOCK(&cq->priv->doorbell_lock)); return (0); } static void mlx5e_disable_cq(struct mlx5e_cq *cq) { mlx5_core_destroy_cq(cq->priv->mdev, &cq->mcq); } int mlx5e_open_cq(struct mlx5e_priv *priv, struct mlx5e_cq_param *param, struct mlx5e_cq *cq, mlx5e_cq_comp_t *comp, int eq_ix) { int err; err = mlx5e_create_cq(priv, param, cq, comp, eq_ix); if (err) return (err); err = mlx5e_enable_cq(cq, param, eq_ix); if (err) goto err_destroy_cq; return (0); err_destroy_cq: mlx5e_destroy_cq(cq); return (err); } void mlx5e_close_cq(struct mlx5e_cq *cq) { mlx5e_disable_cq(cq); mlx5e_destroy_cq(cq); } static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_channel_param *cparam) { int err; int tc; for (tc = 0; tc < c->num_tc; tc++) { /* open completion queue */ err = mlx5e_open_cq(c->priv, &cparam->tx_cq, &c->sq[tc].cq, &mlx5e_tx_cq_comp, c->ix); if (err) goto err_close_tx_cqs; } return (0); err_close_tx_cqs: for (tc--; tc >= 0; tc--) mlx5e_close_cq(&c->sq[tc].cq); return (err); } static void mlx5e_close_tx_cqs(struct mlx5e_channel *c) { int tc; for (tc = 0; tc < c->num_tc; tc++) mlx5e_close_cq(&c->sq[tc].cq); } static int mlx5e_open_sqs(struct mlx5e_channel *c, struct mlx5e_channel_param *cparam) { int err; int tc; for (tc = 0; tc < c->num_tc; tc++) { err = mlx5e_open_sq(c, tc, &cparam->sq, &c->sq[tc]); if (err) goto err_close_sqs; } return (0); err_close_sqs: for (tc--; tc >= 0; tc--) mlx5e_close_sq_wait(&c->sq[tc]); return (err); } static void mlx5e_close_sqs_wait(struct mlx5e_channel *c) { int tc; for (tc = 0; tc < c->num_tc; tc++) mlx5e_close_sq_wait(&c->sq[tc]); } static void mlx5e_chan_mtx_init(struct mlx5e_channel *c) { int tc; mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF); callout_init_mtx(&c->rq.watchdog, &c->rq.mtx, 0); for (tc = 0; tc < c->num_tc; tc++) { struct mlx5e_sq *sq = c->sq + tc; mtx_init(&sq->lock, "mlx5tx", MTX_NETWORK_LOCK " TX", MTX_DEF); mtx_init(&sq->comp_lock, "mlx5comp", MTX_NETWORK_LOCK " TX", MTX_DEF); callout_init_mtx(&sq->cev_callout, &sq->lock, 0); sq->cev_factor = c->priv->params_ethtool.tx_completion_fact; /* ensure the TX completion event factor is not zero */ if (sq->cev_factor == 0) sq->cev_factor = 1; } } static void mlx5e_chan_mtx_destroy(struct mlx5e_channel *c) { int tc; mtx_destroy(&c->rq.mtx); for (tc = 0; tc < c->num_tc; tc++) { mtx_destroy(&c->sq[tc].lock); mtx_destroy(&c->sq[tc].comp_lock); } } static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, struct mlx5e_channel_param *cparam, struct mlx5e_channel *c) { int err; memset(c, 0, sizeof(*c)); c->priv = priv; c->ix = ix; /* setup send tag */ - c->tag.m_snd_tag.ifp = priv->ifp; c->tag.type = IF_SND_TAG_TYPE_UNLIMITED; c->mkey_be = cpu_to_be32(priv->mr.key); c->num_tc = priv->num_tc; /* init mutexes */ mlx5e_chan_mtx_init(c); /* open transmit completion queue */ err = mlx5e_open_tx_cqs(c, cparam); if (err) goto err_free; /* open receive completion queue */ err = mlx5e_open_cq(c->priv, &cparam->rx_cq, &c->rq.cq, &mlx5e_rx_cq_comp, c->ix); if (err) goto err_close_tx_cqs; err = mlx5e_open_sqs(c, cparam); if (err) goto err_close_rx_cq; err = mlx5e_open_rq(c, &cparam->rq, &c->rq); if (err) goto err_close_sqs; /* poll receive queue initially */ c->rq.cq.mcq.comp(&c->rq.cq.mcq); return (0); err_close_sqs: mlx5e_close_sqs_wait(c); err_close_rx_cq: mlx5e_close_cq(&c->rq.cq); err_close_tx_cqs: mlx5e_close_tx_cqs(c); err_free: /* destroy mutexes */ mlx5e_chan_mtx_destroy(c); return (err); } static void mlx5e_close_channel(struct mlx5e_channel *c) { mlx5e_close_rq(&c->rq); } static void mlx5e_close_channel_wait(struct mlx5e_channel *c) { mlx5e_close_rq_wait(&c->rq); mlx5e_close_sqs_wait(c); mlx5e_close_tx_cqs(c); /* destroy mutexes */ mlx5e_chan_mtx_destroy(c); } static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs) { u32 r, n; r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz : MLX5E_SW2MB_MTU(priv->ifp->if_mtu); if (r > MJUM16BYTES) return (-ENOMEM); if (r > MJUM9BYTES) r = MJUM16BYTES; else if (r > MJUMPAGESIZE) r = MJUM9BYTES; else if (r > MCLBYTES) r = MJUMPAGESIZE; else r = MCLBYTES; /* * n + 1 must be a power of two, because stride size must be. * Stride size is 16 * (n + 1), as the first segment is * control. */ for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++) ; if (n > MLX5E_MAX_BUSDMA_RX_SEGS) return (-ENOMEM); *wqe_sz = r; *nsegs = n; return (0); } static void mlx5e_build_rq_param(struct mlx5e_priv *priv, struct mlx5e_rq_param *param) { void *rqc = param->rqc; void *wq = MLX5_ADDR_OF(rqc, rqc, wq); u32 wqe_sz, nsegs; mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST); MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) + nsegs * sizeof(struct mlx5_wqe_data_seg))); MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size); MLX5_SET(wq, wq, pd, priv->pdn); param->wq.buf_numa_node = 0; param->wq.db_numa_node = 0; param->wq.linear = 1; } static void mlx5e_build_sq_param(struct mlx5e_priv *priv, struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); MLX5_SET(wq, wq, log_wq_sz, priv->params.log_sq_size); MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); MLX5_SET(wq, wq, pd, priv->pdn); param->wq.buf_numa_node = 0; param->wq.db_numa_node = 0; param->wq.linear = 1; } static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { void *cqc = param->cqc; MLX5_SET(cqc, cqc, uar_page, priv->cq_uar.index); } static void mlx5e_get_default_profile(struct mlx5e_priv *priv, int mode, struct net_dim_cq_moder *ptr) { *ptr = net_dim_get_profile(mode, MLX5E_DIM_DEFAULT_PROFILE); /* apply LRO restrictions */ if (priv->params.hw_lro_en && ptr->pkts > MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO) { ptr->pkts = MLX5E_DIM_MAX_RX_CQ_MODERATION_PKTS_WITH_LRO; } } static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { struct net_dim_cq_moder curr; void *cqc = param->cqc; /* * We use MLX5_CQE_FORMAT_HASH because the RX hash mini CQE * format is more beneficial for FreeBSD use case. * * Adding support for MLX5_CQE_FORMAT_CSUM will require changes * in mlx5e_decompress_cqe. */ if (priv->params.cqe_zipping_en) { MLX5_SET(cqc, cqc, mini_cqe_res_format, MLX5_CQE_FORMAT_HASH); MLX5_SET(cqc, cqc, cqe_compression_en, 1); } MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_rq_size); switch (priv->params.rx_cq_moderation_mode) { case 0: MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts); MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 1: MLX5_SET(cqc, cqc, cq_period, priv->params.rx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.rx_cq_moderation_pkts); if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 2: mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE, &curr); MLX5_SET(cqc, cqc, cq_period, curr.usec); MLX5_SET(cqc, cqc, cq_max_count, curr.pkts); MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; case 3: mlx5e_get_default_profile(priv, NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE, &curr); MLX5_SET(cqc, cqc, cq_period, curr.usec); MLX5_SET(cqc, cqc, cq_max_count, curr.pkts); if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; default: break; } mlx5e_dim_build_cq_param(priv, param); mlx5e_build_common_cq_param(priv, param); } static void mlx5e_build_tx_cq_param(struct mlx5e_priv *priv, struct mlx5e_cq_param *param) { void *cqc = param->cqc; MLX5_SET(cqc, cqc, log_cq_size, priv->params.log_sq_size); MLX5_SET(cqc, cqc, cq_period, priv->params.tx_cq_moderation_usec); MLX5_SET(cqc, cqc, cq_max_count, priv->params.tx_cq_moderation_pkts); switch (priv->params.tx_cq_moderation_mode) { case 0: MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; default: if (MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; } mlx5e_build_common_cq_param(priv, param); } static void mlx5e_build_channel_param(struct mlx5e_priv *priv, struct mlx5e_channel_param *cparam) { memset(cparam, 0, sizeof(*cparam)); mlx5e_build_rq_param(priv, &cparam->rq); mlx5e_build_sq_param(priv, &cparam->sq); mlx5e_build_rx_cq_param(priv, &cparam->rx_cq); mlx5e_build_tx_cq_param(priv, &cparam->tx_cq); } static int mlx5e_open_channels(struct mlx5e_priv *priv) { struct mlx5e_channel_param cparam; int err; int i; int j; mlx5e_build_channel_param(priv, &cparam); for (i = 0; i < priv->params.num_channels; i++) { err = mlx5e_open_channel(priv, i, &cparam, &priv->channel[i]); if (err) goto err_close_channels; } for (j = 0; j < priv->params.num_channels; j++) { err = mlx5e_wait_for_min_rx_wqes(&priv->channel[j].rq); if (err) goto err_close_channels; } return (0); err_close_channels: while (i--) { mlx5e_close_channel(&priv->channel[i]); mlx5e_close_channel_wait(&priv->channel[i]); } return (err); } static void mlx5e_close_channels(struct mlx5e_priv *priv) { int i; for (i = 0; i < priv->params.num_channels; i++) mlx5e_close_channel(&priv->channel[i]); for (i = 0; i < priv->params.num_channels; i++) mlx5e_close_channel_wait(&priv->channel[i]); } static int mlx5e_refresh_sq_params(struct mlx5e_priv *priv, struct mlx5e_sq *sq) { if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) { uint8_t cq_mode; switch (priv->params.tx_cq_moderation_mode) { case 0: case 2: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE; break; default: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE; break; } return (mlx5_core_modify_cq_moderation_mode(priv->mdev, &sq->cq.mcq, priv->params.tx_cq_moderation_usec, priv->params.tx_cq_moderation_pkts, cq_mode)); } return (mlx5_core_modify_cq_moderation(priv->mdev, &sq->cq.mcq, priv->params.tx_cq_moderation_usec, priv->params.tx_cq_moderation_pkts)); } static int mlx5e_refresh_rq_params(struct mlx5e_priv *priv, struct mlx5e_rq *rq) { if (MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify)) { uint8_t cq_mode; uint8_t dim_mode; int retval; switch (priv->params.rx_cq_moderation_mode) { case 0: case 2: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE; dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_EQE; break; default: cq_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE; dim_mode = NET_DIM_CQ_PERIOD_MODE_START_FROM_CQE; break; } /* tear down dynamic interrupt moderation */ mtx_lock(&rq->mtx); rq->dim.mode = NET_DIM_CQ_PERIOD_MODE_DISABLED; mtx_unlock(&rq->mtx); /* wait for dynamic interrupt moderation work task, if any */ cancel_work_sync(&rq->dim.work); if (priv->params.rx_cq_moderation_mode >= 2) { struct net_dim_cq_moder curr; mlx5e_get_default_profile(priv, dim_mode, &curr); retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq, curr.usec, curr.pkts, cq_mode); /* set dynamic interrupt moderation mode and zero defaults */ mtx_lock(&rq->mtx); rq->dim.mode = dim_mode; rq->dim.state = 0; rq->dim.profile_ix = MLX5E_DIM_DEFAULT_PROFILE; mtx_unlock(&rq->mtx); } else { retval = mlx5_core_modify_cq_moderation_mode(priv->mdev, &rq->cq.mcq, priv->params.rx_cq_moderation_usec, priv->params.rx_cq_moderation_pkts, cq_mode); } return (retval); } return (mlx5_core_modify_cq_moderation(priv->mdev, &rq->cq.mcq, priv->params.rx_cq_moderation_usec, priv->params.rx_cq_moderation_pkts)); } static int mlx5e_refresh_channel_params_sub(struct mlx5e_priv *priv, struct mlx5e_channel *c) { int err; int i; err = mlx5e_refresh_rq_params(priv, &c->rq); if (err) goto done; for (i = 0; i != c->num_tc; i++) { err = mlx5e_refresh_sq_params(priv, &c->sq[i]); if (err) goto done; } done: return (err); } int mlx5e_refresh_channel_params(struct mlx5e_priv *priv) { int i; /* check if channels are closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return (EINVAL); for (i = 0; i < priv->params.num_channels; i++) { int err; err = mlx5e_refresh_channel_params_sub(priv, &priv->channel[i]); if (err) return (err); } return (0); } static int mlx5e_open_tis(struct mlx5e_priv *priv, int tc) { struct mlx5_core_dev *mdev = priv->mdev; u32 in[MLX5_ST_SZ_DW(create_tis_in)]; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); memset(in, 0, sizeof(in)); MLX5_SET(tisc, tisc, prio, tc); MLX5_SET(tisc, tisc, transport_domain, priv->tdn); return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc])); } static void mlx5e_close_tis(struct mlx5e_priv *priv, int tc) { mlx5_core_destroy_tis(priv->mdev, priv->tisn[tc]); } static int mlx5e_open_tises(struct mlx5e_priv *priv) { int num_tc = priv->num_tc; int err; int tc; for (tc = 0; tc < num_tc; tc++) { err = mlx5e_open_tis(priv, tc); if (err) goto err_close_tises; } return (0); err_close_tises: for (tc--; tc >= 0; tc--) mlx5e_close_tis(priv, tc); return (err); } static void mlx5e_close_tises(struct mlx5e_priv *priv) { int num_tc = priv->num_tc; int tc; for (tc = 0; tc < num_tc; tc++) mlx5e_close_tis(priv, tc); } static int mlx5e_open_rqt(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u32 *in; u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {0}; void *rqtc; int inlen; int err; int sz; int i; sz = 1 << priv->params.rx_hash_log_tbl_sz; inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz; in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); MLX5_SET(rqtc, rqtc, rqt_actual_size, sz); MLX5_SET(rqtc, rqtc, rqt_max_size, sz); for (i = 0; i < sz; i++) { int ix = i; #ifdef RSS ix = rss_get_indirection_to_bucket(ix); #endif /* ensure we don't overflow */ ix %= priv->params.num_channels; /* apply receive side scaling stride, if any */ ix -= ix % (int)priv->params.channels_rsss; MLX5_SET(rqtc, rqtc, rq_num[i], priv->channel[ix].rq.rqn); } MLX5_SET(create_rqt_in, in, opcode, MLX5_CMD_OP_CREATE_RQT); err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); if (!err) priv->rqtn = MLX5_GET(create_rqt_out, out, rqtn); kvfree(in); return (err); } static void mlx5e_close_rqt(struct mlx5e_priv *priv) { u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {0}; u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {0}; MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); MLX5_SET(destroy_rqt_in, in, rqtn, priv->rqtn); mlx5_cmd_exec(priv->mdev, in, sizeof(in), out, sizeof(out)); } static void mlx5e_build_tir_ctx(struct mlx5e_priv *priv, u32 * tirc, int tt) { void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); __be32 *hkey; MLX5_SET(tirc, tirc, transport_domain, priv->tdn); #define ROUGH_MAX_L2_L3_HDR_SZ 256 #define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP) #define MLX5_HASH_ALL (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP |\ MLX5_HASH_FIELD_SEL_L4_SPORT |\ MLX5_HASH_FIELD_SEL_L4_DPORT) #define MLX5_HASH_IP_IPSEC_SPI (MLX5_HASH_FIELD_SEL_SRC_IP |\ MLX5_HASH_FIELD_SEL_DST_IP |\ MLX5_HASH_FIELD_SEL_IPSEC_SPI) if (priv->params.hw_lro_en) { MLX5_SET(tirc, tirc, lro_enable_mask, MLX5_TIRC_LRO_ENABLE_MASK_IPV4_LRO | MLX5_TIRC_LRO_ENABLE_MASK_IPV6_LRO); MLX5_SET(tirc, tirc, lro_max_msg_sz, (priv->params.lro_wqe_sz - ROUGH_MAX_L2_L3_HDR_SZ) >> 8); /* TODO: add the option to choose timer value dynamically */ MLX5_SET(tirc, tirc, lro_timeout_period_usecs, MLX5_CAP_ETH(priv->mdev, lro_timer_supported_periods[2])); } /* setup parameters for hashing TIR type, if any */ switch (tt) { case MLX5E_TT_ANY: MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); MLX5_SET(tirc, tirc, inline_rqn, priv->channel[0].rq.rqn); break; default: MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); MLX5_SET(tirc, tirc, indirect_table, priv->rqtn); MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_TIRC_RX_HASH_FN_HASH_TOEPLITZ); hkey = (__be32 *) MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); #ifdef RSS /* * The FreeBSD RSS implementation does currently not * support symmetric Toeplitz hashes: */ MLX5_SET(tirc, tirc, rx_hash_symmetric, 0); rss_getkey((uint8_t *)hkey); #else MLX5_SET(tirc, tirc, rx_hash_symmetric, 1); hkey[0] = cpu_to_be32(0xD181C62C); hkey[1] = cpu_to_be32(0xF7F4DB5B); hkey[2] = cpu_to_be32(0x1983A2FC); hkey[3] = cpu_to_be32(0x943E1ADB); hkey[4] = cpu_to_be32(0xD9389E6B); hkey[5] = cpu_to_be32(0xD1039C2C); hkey[6] = cpu_to_be32(0xA74499AD); hkey[7] = cpu_to_be32(0x593D56D9); hkey[8] = cpu_to_be32(0xF3253C06); hkey[9] = cpu_to_be32(0x2ADC1FFC); #endif break; } switch (tt) { case MLX5E_TT_IPV4_TCP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV4)) { MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV6_TCP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_TCP_IPV6)) { MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV4_UDP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_UDP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV4)) { MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV6_UDP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_UDP); #ifdef RSS if (!(rss_gethashconfig() & RSS_HASHTYPE_RSS_UDP_IPV6)) { MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); } else #endif MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_ALL); break; case MLX5E_TT_IPV4_IPSEC_AH: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV6_IPSEC_AH: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV4_IPSEC_ESP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV6_IPSEC_ESP: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP_IPSEC_SPI); break; case MLX5E_TT_IPV4: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); break; case MLX5E_TT_IPV6: MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); MLX5_SET(rx_hash_field_select, hfso, selected_fields, MLX5_HASH_IP); break; default: break; } } static int mlx5e_open_tir(struct mlx5e_priv *priv, int tt) { struct mlx5_core_dev *mdev = priv->mdev; u32 *in; void *tirc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(create_tir_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); tirc = MLX5_ADDR_OF(create_tir_in, in, tir_context); mlx5e_build_tir_ctx(priv, tirc, tt); err = mlx5_core_create_tir(mdev, in, inlen, &priv->tirn[tt]); kvfree(in); return (err); } static void mlx5e_close_tir(struct mlx5e_priv *priv, int tt) { mlx5_core_destroy_tir(priv->mdev, priv->tirn[tt]); } static int mlx5e_open_tirs(struct mlx5e_priv *priv) { int err; int i; for (i = 0; i < MLX5E_NUM_TT; i++) { err = mlx5e_open_tir(priv, i); if (err) goto err_close_tirs; } return (0); err_close_tirs: for (i--; i >= 0; i--) mlx5e_close_tir(priv, i); return (err); } static void mlx5e_close_tirs(struct mlx5e_priv *priv) { int i; for (i = 0; i < MLX5E_NUM_TT; i++) mlx5e_close_tir(priv, i); } /* * SW MTU does not include headers, * HW MTU includes all headers and checksums. */ static int mlx5e_set_dev_port_mtu(struct ifnet *ifp, int sw_mtu) { struct mlx5e_priv *priv = ifp->if_softc; struct mlx5_core_dev *mdev = priv->mdev; int hw_mtu; int err; hw_mtu = MLX5E_SW2HW_MTU(sw_mtu); err = mlx5_set_port_mtu(mdev, hw_mtu); if (err) { if_printf(ifp, "%s: mlx5_set_port_mtu failed setting %d, err=%d\n", __func__, sw_mtu, err); return (err); } /* Update vport context MTU */ err = mlx5_set_vport_mtu(mdev, hw_mtu); if (err) { if_printf(ifp, "%s: Failed updating vport context with MTU size, err=%d\n", __func__, err); } ifp->if_mtu = sw_mtu; err = mlx5_query_vport_mtu(mdev, &hw_mtu); if (err || !hw_mtu) { /* fallback to port oper mtu */ err = mlx5_query_port_oper_mtu(mdev, &hw_mtu); } if (err) { if_printf(ifp, "Query port MTU, after setting new " "MTU value, failed\n"); return (err); } else if (MLX5E_HW2SW_MTU(hw_mtu) < sw_mtu) { err = -E2BIG, if_printf(ifp, "Port MTU %d is smaller than " "ifp mtu %d\n", hw_mtu, sw_mtu); } else if (MLX5E_HW2SW_MTU(hw_mtu) > sw_mtu) { err = -EINVAL; if_printf(ifp, "Port MTU %d is bigger than " "ifp mtu %d\n", hw_mtu, sw_mtu); } priv->params_ethtool.hw_mtu = hw_mtu; return (err); } int mlx5e_open_locked(struct ifnet *ifp) { struct mlx5e_priv *priv = ifp->if_softc; int err; u16 set_id; /* check if already opened */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) return (0); #ifdef RSS if (rss_getnumbuckets() > priv->params.num_channels) { if_printf(ifp, "NOTE: There are more RSS buckets(%u) than " "channels(%u) available\n", rss_getnumbuckets(), priv->params.num_channels); } #endif err = mlx5e_open_tises(priv); if (err) { if_printf(ifp, "%s: mlx5e_open_tises failed, %d\n", __func__, err); return (err); } err = mlx5_vport_alloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, &set_id); if (err) { if_printf(priv->ifp, "%s: mlx5_vport_alloc_q_counter failed: %d\n", __func__, err); goto err_close_tises; } /* store counter set ID */ priv->counter_set_id = set_id; err = mlx5e_open_channels(priv); if (err) { if_printf(ifp, "%s: mlx5e_open_channels failed, %d\n", __func__, err); goto err_dalloc_q_counter; } err = mlx5e_open_rqt(priv); if (err) { if_printf(ifp, "%s: mlx5e_open_rqt failed, %d\n", __func__, err); goto err_close_channels; } err = mlx5e_open_tirs(priv); if (err) { if_printf(ifp, "%s: mlx5e_open_tir failed, %d\n", __func__, err); goto err_close_rqls; } err = mlx5e_open_flow_table(priv); if (err) { if_printf(ifp, "%s: mlx5e_open_flow_table failed, %d\n", __func__, err); goto err_close_tirs; } err = mlx5e_add_all_vlan_rules(priv); if (err) { if_printf(ifp, "%s: mlx5e_add_all_vlan_rules failed, %d\n", __func__, err); goto err_close_flow_table; } set_bit(MLX5E_STATE_OPENED, &priv->state); mlx5e_update_carrier(priv); mlx5e_set_rx_mode_core(priv); return (0); err_close_flow_table: mlx5e_close_flow_table(priv); err_close_tirs: mlx5e_close_tirs(priv); err_close_rqls: mlx5e_close_rqt(priv); err_close_channels: mlx5e_close_channels(priv); err_dalloc_q_counter: mlx5_vport_dealloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id); err_close_tises: mlx5e_close_tises(priv); return (err); } static void mlx5e_open(void *arg) { struct mlx5e_priv *priv = arg; PRIV_LOCK(priv); if (mlx5_set_port_status(priv->mdev, MLX5_PORT_UP)) if_printf(priv->ifp, "%s: Setting port status to up failed\n", __func__); mlx5e_open_locked(priv->ifp); priv->ifp->if_drv_flags |= IFF_DRV_RUNNING; PRIV_UNLOCK(priv); } int mlx5e_close_locked(struct ifnet *ifp) { struct mlx5e_priv *priv = ifp->if_softc; /* check if already closed */ if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return (0); clear_bit(MLX5E_STATE_OPENED, &priv->state); mlx5e_set_rx_mode_core(priv); mlx5e_del_all_vlan_rules(priv); if_link_state_change(priv->ifp, LINK_STATE_DOWN); mlx5e_close_flow_table(priv); mlx5e_close_tirs(priv); mlx5e_close_rqt(priv); mlx5e_close_channels(priv); mlx5_vport_dealloc_q_counter(priv->mdev, MLX5_INTERFACE_PROTOCOL_ETH, priv->counter_set_id); mlx5e_close_tises(priv); return (0); } #if (__FreeBSD_version >= 1100000) static uint64_t mlx5e_get_counter(struct ifnet *ifp, ift_counter cnt) { struct mlx5e_priv *priv = ifp->if_softc; u64 retval; /* PRIV_LOCK(priv); XXX not allowed */ switch (cnt) { case IFCOUNTER_IPACKETS: retval = priv->stats.vport.rx_packets; break; case IFCOUNTER_IERRORS: retval = priv->stats.pport.in_range_len_errors + priv->stats.pport.out_of_range_len + priv->stats.pport.too_long_errors + priv->stats.pport.check_seq_err + priv->stats.pport.alignment_err; break; case IFCOUNTER_IQDROPS: retval = priv->stats.vport.rx_out_of_buffer; break; case IFCOUNTER_OPACKETS: retval = priv->stats.vport.tx_packets; break; case IFCOUNTER_OERRORS: retval = priv->stats.port_stats_debug.out_discards; break; case IFCOUNTER_IBYTES: retval = priv->stats.vport.rx_bytes; break; case IFCOUNTER_OBYTES: retval = priv->stats.vport.tx_bytes; break; case IFCOUNTER_IMCASTS: retval = priv->stats.vport.rx_multicast_packets; break; case IFCOUNTER_OMCASTS: retval = priv->stats.vport.tx_multicast_packets; break; case IFCOUNTER_OQDROPS: retval = priv->stats.vport.tx_queue_dropped; break; case IFCOUNTER_COLLISIONS: retval = priv->stats.pport.collisions; break; default: retval = if_get_counter_default(ifp, cnt); break; } /* PRIV_UNLOCK(priv); XXX not allowed */ return (retval); } #endif static void mlx5e_set_rx_mode(struct ifnet *ifp) { struct mlx5e_priv *priv = ifp->if_softc; queue_work(priv->wq, &priv->set_rx_mode_work); } static int mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct mlx5e_priv *priv; struct ifreq *ifr; struct ifi2creq i2c; int error = 0; int mask = 0; int size_read = 0; int module_status; int module_num; int max_mtu; uint8_t read_addr; priv = ifp->if_softc; /* check if detaching */ if (priv == NULL || priv->gone != 0) return (ENXIO); switch (command) { case SIOCSIFMTU: ifr = (struct ifreq *)data; PRIV_LOCK(priv); mlx5_query_port_max_mtu(priv->mdev, &max_mtu); if (ifr->ifr_mtu >= MLX5E_MTU_MIN && ifr->ifr_mtu <= MIN(MLX5E_MTU_MAX, max_mtu)) { int was_opened; was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); if (was_opened) mlx5e_close_locked(ifp); /* set new MTU */ mlx5e_set_dev_port_mtu(ifp, ifr->ifr_mtu); if (was_opened) mlx5e_open_locked(ifp); } else { error = EINVAL; if_printf(ifp, "Invalid MTU value. Min val: %d, Max val: %d\n", MLX5E_MTU_MIN, MIN(MLX5E_MTU_MAX, max_mtu)); } PRIV_UNLOCK(priv); break; case SIOCSIFFLAGS: if ((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { mlx5e_set_rx_mode(ifp); break; } PRIV_LOCK(priv); if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) mlx5e_open_locked(ifp); ifp->if_drv_flags |= IFF_DRV_RUNNING; mlx5_set_port_status(priv->mdev, MLX5_PORT_UP); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { mlx5_set_port_status(priv->mdev, MLX5_PORT_DOWN); if (test_bit(MLX5E_STATE_OPENED, &priv->state) != 0) mlx5e_close_locked(ifp); mlx5e_update_carrier(priv); ifp->if_drv_flags &= ~IFF_DRV_RUNNING; } } PRIV_UNLOCK(priv); break; case SIOCADDMULTI: case SIOCDELMULTI: mlx5e_set_rx_mode(ifp); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: ifr = (struct ifreq *)data; error = ifmedia_ioctl(ifp, ifr, &priv->media, command); break; case SIOCSIFCAP: ifr = (struct ifreq *)data; PRIV_LOCK(priv); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (IFCAP_TSO4 & ifp->if_capenable && !(IFCAP_TXCSUM & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO4; ifp->if_hwassist &= ~CSUM_IP_TSO; if_printf(ifp, "tso4 disabled due to -txcsum.\n"); } } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; ifp->if_hwassist ^= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); if (IFCAP_TSO6 & ifp->if_capenable && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { ifp->if_capenable &= ~IFCAP_TSO6; ifp->if_hwassist &= ~CSUM_IP6_TSO; if_printf(ifp, "tso6 disabled due to -txcsum6.\n"); } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; if (mask & IFCAP_TSO4) { if (!(IFCAP_TSO4 & ifp->if_capenable) && !(IFCAP_TXCSUM & ifp->if_capenable)) { if_printf(ifp, "enable txcsum first.\n"); error = EAGAIN; goto out; } ifp->if_capenable ^= IFCAP_TSO4; ifp->if_hwassist ^= CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { if (!(IFCAP_TSO6 & ifp->if_capenable) && !(IFCAP_TXCSUM_IPV6 & ifp->if_capenable)) { if_printf(ifp, "enable txcsum6 first.\n"); error = EAGAIN; goto out; } ifp->if_capenable ^= IFCAP_TSO6; ifp->if_hwassist ^= CSUM_IP6_TSO; } if (mask & IFCAP_VLAN_HWFILTER) { if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) mlx5e_disable_vlan_filter(priv); else mlx5e_enable_vlan_filter(priv); ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; } if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_WOL_MAGIC) ifp->if_capenable ^= IFCAP_WOL_MAGIC; VLAN_CAPABILITIES(ifp); /* turn off LRO means also turn of HW LRO - if it's on */ if (mask & IFCAP_LRO) { int was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state); bool need_restart = false; ifp->if_capenable ^= IFCAP_LRO; /* figure out if updating HW LRO is needed */ if (!(ifp->if_capenable & IFCAP_LRO)) { if (priv->params.hw_lro_en) { priv->params.hw_lro_en = false; need_restart = true; } } else { if (priv->params.hw_lro_en == false && priv->params_ethtool.hw_lro != 0) { priv->params.hw_lro_en = true; need_restart = true; } } if (was_opened && need_restart) { mlx5e_close_locked(ifp); mlx5e_open_locked(ifp); } } if (mask & IFCAP_HWRXTSTMP) { ifp->if_capenable ^= IFCAP_HWRXTSTMP; if (ifp->if_capenable & IFCAP_HWRXTSTMP) { if (priv->clbr_done == 0) mlx5e_reset_calibration_callout(priv); } else { callout_drain(&priv->tstmp_clbr); priv->clbr_done = 0; } } out: PRIV_UNLOCK(priv); break; case SIOCGI2C: ifr = (struct ifreq *)data; /* * Copy from the user-space address ifr_data to the * kernel-space address i2c */ error = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (error) break; if (i2c.len > sizeof(i2c.data)) { error = EINVAL; break; } PRIV_LOCK(priv); /* Get module_num which is required for the query_eeprom */ error = mlx5_query_module_num(priv->mdev, &module_num); if (error) { if_printf(ifp, "Query module num failed, eeprom " "reading is not supported\n"); error = EINVAL; goto err_i2c; } /* Check if module is present before doing an access */ module_status = mlx5_query_module_status(priv->mdev, module_num); if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED && module_status != MLX5_MODULE_STATUS_PLUGGED_DISABLED) { error = EINVAL; goto err_i2c; } /* * Currently 0XA0 and 0xA2 are the only addresses permitted. * The internal conversion is as follows: */ if (i2c.dev_addr == 0xA0) read_addr = MLX5E_I2C_ADDR_LOW; else if (i2c.dev_addr == 0xA2) read_addr = MLX5E_I2C_ADDR_HIGH; else { if_printf(ifp, "Query eeprom failed, " "Invalid Address: %X\n", i2c.dev_addr); error = EINVAL; goto err_i2c; } error = mlx5_query_eeprom(priv->mdev, read_addr, MLX5E_EEPROM_LOW_PAGE, (uint32_t)i2c.offset, (uint32_t)i2c.len, module_num, (uint32_t *)i2c.data, &size_read); if (error) { if_printf(ifp, "Query eeprom failed, eeprom " "reading is not supported\n"); error = EINVAL; goto err_i2c; } if (i2c.len > MLX5_EEPROM_MAX_BYTES) { error = mlx5_query_eeprom(priv->mdev, read_addr, MLX5E_EEPROM_LOW_PAGE, (uint32_t)(i2c.offset + size_read), (uint32_t)(i2c.len - size_read), module_num, (uint32_t *)(i2c.data + size_read), &size_read); } if (error) { if_printf(ifp, "Query eeprom failed, eeprom " "reading is not supported\n"); error = EINVAL; goto err_i2c; } error = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); err_i2c: PRIV_UNLOCK(priv); break; default: error = ether_ioctl(ifp, command, data); break; } return (error); } static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev) { /* * TODO: uncoment once FW really sets all these bits if * (!mdev->caps.eth.rss_ind_tbl_cap || !mdev->caps.eth.csum_cap || * !mdev->caps.eth.max_lso_cap || !mdev->caps.eth.vlan_cap || * !(mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_SCQE_BRK_MOD)) return * -ENOTSUPP; */ /* TODO: add more must-to-have features */ if (MLX5_CAP_GEN(mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) return (-ENODEV); return (0); } static u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev) { uint32_t bf_buf_size = (1U << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2U; bf_buf_size -= sizeof(struct mlx5e_tx_wqe) - 2; /* verify against driver hardware limit */ if (bf_buf_size > MLX5E_MAX_TX_INLINE) bf_buf_size = MLX5E_MAX_TX_INLINE; return (bf_buf_size); } static int mlx5e_build_ifp_priv(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv, int num_comp_vectors) { int err; /* * TODO: Consider link speed for setting "log_sq_size", * "log_rq_size" and "cq_moderation_xxx": */ priv->params.log_sq_size = MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE; priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; priv->params.rx_cq_moderation_usec = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE : MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC; priv->params.rx_cq_moderation_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ? 1 : 0; priv->params.rx_cq_moderation_pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS; priv->params.tx_cq_moderation_usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC; priv->params.tx_cq_moderation_pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS; priv->params.min_rx_wqes = MLX5E_PARAMS_DEFAULT_MIN_RX_WQES; priv->params.rx_hash_log_tbl_sz = (order_base_2(num_comp_vectors) > MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ) ? order_base_2(num_comp_vectors) : MLX5E_PARAMS_DEFAULT_RX_HASH_LOG_TBL_SZ; priv->params.num_tc = 1; priv->params.default_vlan_prio = 0; priv->counter_set_id = -1; priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev); err = mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode); if (err) return (err); /* * hw lro is currently defaulted to off. when it won't anymore we * will consider the HW capability: "!!MLX5_CAP_ETH(mdev, lro_cap)" */ priv->params.hw_lro_en = false; priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; /* * CQE zipping is currently defaulted to off. when it won't * anymore we will consider the HW capability: * "!!MLX5_CAP_GEN(mdev, cqe_compression)" */ priv->params.cqe_zipping_en = false; priv->mdev = mdev; priv->params.num_channels = num_comp_vectors; priv->params.channels_rsss = 1; priv->order_base_2_num_channels = order_base_2(num_comp_vectors); priv->queue_mapping_channel_mask = roundup_pow_of_two(num_comp_vectors) - 1; priv->num_tc = priv->params.num_tc; priv->default_vlan_prio = priv->params.default_vlan_prio; INIT_WORK(&priv->update_stats_work, mlx5e_update_stats_work); INIT_WORK(&priv->update_carrier_work, mlx5e_update_carrier_work); INIT_WORK(&priv->set_rx_mode_work, mlx5e_set_rx_mode_work); return (0); } static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn, struct mlx5_core_mr *mkey) { struct ifnet *ifp = priv->ifp; struct mlx5_core_dev *mdev = priv->mdev; int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); void *mkc; u32 *in; int err; in = mlx5_vzalloc(inlen); if (in == NULL) { if_printf(ifp, "%s: failed to allocate inbox\n", __func__); return (-ENOMEM); } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA); MLX5_SET(mkc, mkc, lw, 1); MLX5_SET(mkc, mkc, lr, 1); MLX5_SET(mkc, mkc, pd, pdn); MLX5_SET(mkc, mkc, length64, 1); MLX5_SET(mkc, mkc, qpn, 0xffffff); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); if (err) if_printf(ifp, "%s: mlx5_core_create_mkey failed, %d\n", __func__, err); kvfree(in); return (err); } static const char *mlx5e_vport_stats_desc[] = { MLX5E_VPORT_STATS(MLX5E_STATS_DESC) }; static const char *mlx5e_pport_stats_desc[] = { MLX5E_PPORT_STATS(MLX5E_STATS_DESC) }; static void mlx5e_priv_mtx_init(struct mlx5e_priv *priv) { mtx_init(&priv->async_events_mtx, "mlx5async", MTX_NETWORK_LOCK, MTX_DEF); sx_init(&priv->state_lock, "mlx5state"); callout_init_mtx(&priv->watchdog, &priv->async_events_mtx, 0); MLX5_INIT_DOORBELL_LOCK(&priv->doorbell_lock); } static void mlx5e_priv_mtx_destroy(struct mlx5e_priv *priv) { mtx_destroy(&priv->async_events_mtx); sx_destroy(&priv->state_lock); } static int sysctl_firmware(SYSCTL_HANDLER_ARGS) { /* * %d.%d%.d the string format. * fw_rev_{maj,min,sub} return u16, 2^16 = 65536. * We need at most 5 chars to store that. * It also has: two "." and NULL at the end, which means we need 18 * (5*3 + 3) chars at most. */ char fw[18]; struct mlx5e_priv *priv = arg1; int error; snprintf(fw, sizeof(fw), "%d.%d.%d", fw_rev_maj(priv->mdev), fw_rev_min(priv->mdev), fw_rev_sub(priv->mdev)); error = sysctl_handle_string(oidp, fw, sizeof(fw), req); return (error); } static void mlx5e_disable_tx_dma(struct mlx5e_channel *ch) { int i; for (i = 0; i < ch->num_tc; i++) mlx5e_drain_sq(&ch->sq[i]); } static void mlx5e_reset_sq_doorbell_record(struct mlx5e_sq *sq) { sq->doorbell.d32[0] = cpu_to_be32(MLX5_OPCODE_NOP); sq->doorbell.d32[1] = cpu_to_be32(sq->sqn << 8); mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); sq->doorbell.d64 = 0; } void mlx5e_resume_sq(struct mlx5e_sq *sq) { int err; /* check if already enabled */ if (READ_ONCE(sq->running) != 0) return; err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_ERR, MLX5_SQC_STATE_RST); if (err != 0) { if_printf(sq->ifp, "mlx5e_modify_sq() from ERR to RST failed: %d\n", err); } sq->cc = 0; sq->pc = 0; /* reset doorbell prior to moving from RST to RDY */ mlx5e_reset_sq_doorbell_record(sq); err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); if (err != 0) { if_printf(sq->ifp, "mlx5e_modify_sq() from RST to RDY failed: %d\n", err); } sq->cev_next_state = MLX5E_CEV_STATE_INITIAL; WRITE_ONCE(sq->running, 1); } static void mlx5e_enable_tx_dma(struct mlx5e_channel *ch) { int i; for (i = 0; i < ch->num_tc; i++) mlx5e_resume_sq(&ch->sq[i]); } static void mlx5e_disable_rx_dma(struct mlx5e_channel *ch) { struct mlx5e_rq *rq = &ch->rq; int err; mtx_lock(&rq->mtx); rq->enabled = 0; callout_stop(&rq->watchdog); mtx_unlock(&rq->mtx); callout_drain(&rq->watchdog); err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RDY, MLX5_RQC_STATE_ERR); if (err != 0) { if_printf(rq->ifp, "mlx5e_modify_rq() from RDY to RST failed: %d\n", err); } while (!mlx5_wq_ll_is_empty(&rq->wq)) { msleep(1); rq->cq.mcq.comp(&rq->cq.mcq); } /* * Transitioning into RST state will allow the FW to track less ERR state queues, * thus reducing the recv queue flushing time */ err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_ERR, MLX5_RQC_STATE_RST); if (err != 0) { if_printf(rq->ifp, "mlx5e_modify_rq() from ERR to RST failed: %d\n", err); } } static void mlx5e_enable_rx_dma(struct mlx5e_channel *ch) { struct mlx5e_rq *rq = &ch->rq; int err; rq->wq.wqe_ctr = 0; mlx5_wq_ll_update_db_record(&rq->wq); err = mlx5e_modify_rq(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY); if (err != 0) { if_printf(rq->ifp, "mlx5e_modify_rq() from RST to RDY failed: %d\n", err); } rq->enabled = 1; rq->cq.mcq.comp(&rq->cq.mcq); } void mlx5e_modify_tx_dma(struct mlx5e_priv *priv, uint8_t value) { int i; if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) { if (value) mlx5e_disable_tx_dma(&priv->channel[i]); else mlx5e_enable_tx_dma(&priv->channel[i]); } } void mlx5e_modify_rx_dma(struct mlx5e_priv *priv, uint8_t value) { int i; if (test_bit(MLX5E_STATE_OPENED, &priv->state) == 0) return; for (i = 0; i < priv->params.num_channels; i++) { if (value) mlx5e_disable_rx_dma(&priv->channel[i]); else mlx5e_enable_rx_dma(&priv->channel[i]); } } static void mlx5e_add_hw_stats(struct mlx5e_priv *priv) { SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw), OID_AUTO, "fw_version", CTLTYPE_STRING | CTLFLAG_RD, priv, 0, sysctl_firmware, "A", "HCA firmware version"); SYSCTL_ADD_STRING(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_hw), OID_AUTO, "board_id", CTLFLAG_RD, priv->mdev->board_id, 0, "Board ID"); } static int mlx5e_sysctl_tx_priority_flow_control(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; uint8_t temp[MLX5E_MAX_PRIORITY]; uint32_t tx_pfc; int err; int i; PRIV_LOCK(priv); tx_pfc = priv->params.tx_priority_flow_control; for (i = 0; i != MLX5E_MAX_PRIORITY; i++) temp[i] = (tx_pfc >> i) & 1; err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY); if (err) goto done; priv->params.tx_priority_flow_control = 0; /* range check input value */ for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { if (temp[i] > 1) { err = ERANGE; goto done; } priv->params.tx_priority_flow_control |= (temp[i] << i); } /* check if update is required */ if (tx_pfc != priv->params.tx_priority_flow_control) err = -mlx5e_set_port_pfc(priv); done: if (err != 0) priv->params.tx_priority_flow_control= tx_pfc; PRIV_UNLOCK(priv); return (err); } static int mlx5e_sysctl_rx_priority_flow_control(SYSCTL_HANDLER_ARGS) { struct mlx5e_priv *priv = arg1; uint8_t temp[MLX5E_MAX_PRIORITY]; uint32_t rx_pfc; int err; int i; PRIV_LOCK(priv); rx_pfc = priv->params.rx_priority_flow_control; for (i = 0; i != MLX5E_MAX_PRIORITY; i++) temp[i] = (rx_pfc >> i) & 1; err = SYSCTL_OUT(req, temp, MLX5E_MAX_PRIORITY); if (err || !req->newptr) goto done; err = SYSCTL_IN(req, temp, MLX5E_MAX_PRIORITY); if (err) goto done; priv->params.rx_priority_flow_control = 0; /* range check input value */ for (i = 0; i != MLX5E_MAX_PRIORITY; i++) { if (temp[i] > 1) { err = ERANGE; goto done; } priv->params.rx_priority_flow_control |= (temp[i] << i); } /* check if update is required */ if (rx_pfc != priv->params.rx_priority_flow_control) err = -mlx5e_set_port_pfc(priv); done: if (err != 0) priv->params.rx_priority_flow_control= rx_pfc; PRIV_UNLOCK(priv); return (err); } static void mlx5e_setup_pauseframes(struct mlx5e_priv *priv) { #if (__FreeBSD_version < 1100000) char path[96]; #endif int error; /* enable pauseframes by default */ priv->params.tx_pauseframe_control = 1; priv->params.rx_pauseframe_control = 1; /* disable ports flow control, PFC, by default */ priv->params.tx_priority_flow_control = 0; priv->params.rx_priority_flow_control = 0; #if (__FreeBSD_version < 1100000) /* compute path for sysctl */ snprintf(path, sizeof(path), "dev.mce.%d.tx_pauseframe_control", device_get_unit(priv->mdev->pdev->dev.bsddev)); /* try to fetch tunable, if any */ TUNABLE_INT_FETCH(path, &priv->params.tx_pauseframe_control); /* compute path for sysctl */ snprintf(path, sizeof(path), "dev.mce.%d.rx_pauseframe_control", device_get_unit(priv->mdev->pdev->dev.bsddev)); /* try to fetch tunable, if any */ TUNABLE_INT_FETCH(path, &priv->params.rx_pauseframe_control); #endif /* register pauseframe SYSCTLs */ SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tx_pauseframe_control", CTLFLAG_RDTUN, &priv->params.tx_pauseframe_control, 0, "Set to enable TX pause frames. Clear to disable."); SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_pauseframe_control", CTLFLAG_RDTUN, &priv->params.rx_pauseframe_control, 0, "Set to enable RX pause frames. Clear to disable."); /* register priority flow control, PFC, SYSCTLs */ SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "tx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_tx_priority_flow_control, "CU", "Set to enable TX ports flow control frames for priorities 0..7. Clear to disable."); SYSCTL_ADD_PROC(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_priority_flow_control", CTLTYPE_U8 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, priv, 0, &mlx5e_sysctl_rx_priority_flow_control, "CU", "Set to enable RX ports flow control frames for priorities 0..7. Clear to disable."); PRIV_LOCK(priv); /* range check */ priv->params.tx_pauseframe_control = priv->params.tx_pauseframe_control ? 1 : 0; priv->params.rx_pauseframe_control = priv->params.rx_pauseframe_control ? 1 : 0; /* update firmware */ error = mlx5e_set_port_pause_and_pfc(priv); if (error == -EINVAL) { if_printf(priv->ifp, "Global pauseframes must be disabled before enabling PFC.\n"); priv->params.rx_priority_flow_control = 0; priv->params.tx_priority_flow_control = 0; /* update firmware */ (void) mlx5e_set_port_pause_and_pfc(priv); } PRIV_UNLOCK(priv); } static int mlx5e_ul_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct mlx5e_priv *priv; struct mlx5e_channel *pch; priv = ifp->if_softc; if (unlikely(priv->gone || params->hdr.flowtype == M_HASHTYPE_NONE)) { return (EOPNOTSUPP); } else { /* keep this code synced with mlx5e_select_queue() */ u32 ch = priv->params.num_channels; #ifdef RSS u32 temp; if (rss_hash2bucket(params->hdr.flowid, params->hdr.flowtype, &temp) == 0) ch = temp % ch; else #endif ch = (params->hdr.flowid % 128) % ch; /* * NOTE: The channels array is only freed at detach * and it safe to return a pointer to the send tag * inside the channels structure as long as we * reference the priv. */ pch = priv->channel + ch; /* check if send queue is not running */ if (unlikely(pch->sq[0].running == 0)) return (ENXIO); mlx5e_ref_channel(priv); + MPASS(pch->tag.m_snd_tag.refcount == 0); + m_snd_tag_init(&pch->tag.m_snd_tag, ifp); *ppmt = &pch->tag.m_snd_tag; return (0); } } static int mlx5e_ul_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_channel *pch = container_of(pmt, struct mlx5e_channel, tag.m_snd_tag); params->unlimited.max_rate = -1ULL; params->unlimited.queue_level = mlx5e_sq_queue_level(&pch->sq[0]); return (0); } static void mlx5e_ul_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_channel *pch = container_of(pmt, struct mlx5e_channel, tag.m_snd_tag); mlx5e_unref_channel(pch->priv); } static int mlx5e_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { switch (params->hdr.type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (mlx5e_rl_snd_tag_alloc(ifp, params, ppmt)); #endif case IF_SND_TAG_TYPE_UNLIMITED: return (mlx5e_ul_snd_tag_alloc(ifp, params, ppmt)); default: return (EOPNOTSUPP); } } static int mlx5e_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) { struct mlx5e_snd_tag *tag = container_of(pmt, struct mlx5e_snd_tag, m_snd_tag); switch (tag->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (mlx5e_rl_snd_tag_modify(pmt, params)); #endif case IF_SND_TAG_TYPE_UNLIMITED: default: return (EOPNOTSUPP); } } static int mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_snd_tag *tag = container_of(pmt, struct mlx5e_snd_tag, m_snd_tag); switch (tag->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: return (mlx5e_rl_snd_tag_query(pmt, params)); #endif case IF_SND_TAG_TYPE_UNLIMITED: return (mlx5e_ul_snd_tag_query(pmt, params)); default: return (EOPNOTSUPP); } } static void mlx5e_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_snd_tag *tag = container_of(pmt, struct mlx5e_snd_tag, m_snd_tag); switch (tag->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: mlx5e_rl_snd_tag_free(pmt); break; #endif case IF_SND_TAG_TYPE_UNLIMITED: mlx5e_ul_snd_tag_free(pmt); break; default: break; } } static void * mlx5e_create_ifp(struct mlx5_core_dev *mdev) { struct ifnet *ifp; struct mlx5e_priv *priv; u8 dev_addr[ETHER_ADDR_LEN] __aligned(4); u8 connector_type; struct sysctl_oid_list *child; int ncv = mdev->priv.eq_table.num_comp_vectors; char unit[16]; struct pfil_head_args pa; int err; int i,j; u32 eth_proto_cap; u32 out[MLX5_ST_SZ_DW(ptys_reg)]; bool ext = 0; u32 speeds_num; struct media media_entry = {}; if (mlx5e_check_required_hca_cap(mdev)) { mlx5_core_dbg(mdev, "mlx5e_check_required_hca_cap() failed\n"); return (NULL); } /* * Try to allocate the priv and make room for worst-case * number of channel structures: */ priv = malloc(sizeof(*priv) + (sizeof(priv->channel[0]) * mdev->priv.eq_table.num_comp_vectors), M_MLX5EN, M_WAITOK | M_ZERO); mlx5e_priv_mtx_init(priv); ifp = priv->ifp = if_alloc_dev(IFT_ETHER, mdev->pdev->dev.bsddev); if (ifp == NULL) { mlx5_core_err(mdev, "if_alloc() failed\n"); goto err_free_priv; } ifp->if_softc = priv; if_initname(ifp, "mce", device_get_unit(mdev->pdev->dev.bsddev)); ifp->if_mtu = ETHERMTU; ifp->if_init = mlx5e_open; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = mlx5e_ioctl; ifp->if_transmit = mlx5e_xmit; ifp->if_qflush = if_qflush; #if (__FreeBSD_version >= 1100000) ifp->if_get_counter = mlx5e_get_counter; #endif ifp->if_snd.ifq_maxlen = ifqmaxlen; /* * Set driver features */ ifp->if_capabilities |= IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6; ifp->if_capabilities |= IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING; ifp->if_capabilities |= IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWFILTER; ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_LRO; ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO; ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP; ifp->if_capabilities |= IFCAP_TXRTLMT; ifp->if_snd_tag_alloc = mlx5e_snd_tag_alloc; ifp->if_snd_tag_free = mlx5e_snd_tag_free; ifp->if_snd_tag_modify = mlx5e_snd_tag_modify; ifp->if_snd_tag_query = mlx5e_snd_tag_query; /* set TSO limits so that we don't have to drop TX packets */ ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */; ifp->if_hw_tsomaxsegsize = MLX5E_MAX_TX_MBUF_SIZE; ifp->if_capenable = ifp->if_capabilities; ifp->if_hwassist = 0; if (ifp->if_capenable & IFCAP_TSO) ifp->if_hwassist |= CSUM_TSO; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP | CSUM_IP); if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= (CSUM_UDP_IPV6 | CSUM_TCP_IPV6); /* ifnet sysctl tree */ sysctl_ctx_init(&priv->sysctl_ctx); priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, ifp->if_dname, CTLFLAG_RD, 0, "MLX5 ethernet - interface name"); if (priv->sysctl_ifnet == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } snprintf(unit, sizeof(unit), "%d", ifp->if_dunit); priv->sysctl_ifnet = SYSCTL_ADD_NODE(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, unit, CTLFLAG_RD, 0, "MLX5 ethernet - interface unit"); if (priv->sysctl_ifnet == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } /* HW sysctl tree */ child = SYSCTL_CHILDREN(device_get_sysctl_tree(mdev->pdev->dev.bsddev)); priv->sysctl_hw = SYSCTL_ADD_NODE(&priv->sysctl_ctx, child, OID_AUTO, "hw", CTLFLAG_RD, 0, "MLX5 ethernet dev hw"); if (priv->sysctl_hw == NULL) { mlx5_core_err(mdev, "SYSCTL_ADD_NODE() failed\n"); goto err_free_sysctl; } err = mlx5e_build_ifp_priv(mdev, priv, ncv); if (err) { mlx5_core_err(mdev, "mlx5e_build_ifp_priv() failed (%d)\n", err); goto err_free_sysctl; } /* reuse mlx5core's watchdog workqueue */ priv->wq = mdev->priv.health.wq_watchdog; err = mlx5_alloc_map_uar(mdev, &priv->cq_uar); if (err) { if_printf(ifp, "%s: mlx5_alloc_map_uar failed, %d\n", __func__, err); goto err_free_wq; } err = mlx5_core_alloc_pd(mdev, &priv->pdn); if (err) { if_printf(ifp, "%s: mlx5_core_alloc_pd failed, %d\n", __func__, err); goto err_unmap_free_uar; } err = mlx5_alloc_transport_domain(mdev, &priv->tdn); if (err) { if_printf(ifp, "%s: mlx5_alloc_transport_domain failed, %d\n", __func__, err); goto err_dealloc_pd; } err = mlx5e_create_mkey(priv, priv->pdn, &priv->mr); if (err) { if_printf(ifp, "%s: mlx5e_create_mkey failed, %d\n", __func__, err); goto err_dealloc_transport_domain; } mlx5_query_nic_vport_mac_address(priv->mdev, 0, dev_addr); /* check if we should generate a random MAC address */ if (MLX5_CAP_GEN(priv->mdev, vport_group_manager) == 0 && is_zero_ether_addr(dev_addr)) { random_ether_addr(dev_addr); if_printf(ifp, "Assigned random MAC address\n"); } #ifdef RATELIMIT err = mlx5e_rl_init(priv); if (err) { if_printf(ifp, "%s: mlx5e_rl_init failed, %d\n", __func__, err); goto err_create_mkey; } #endif /* set default MTU */ mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu); /* Set default media status */ priv->media_status_last = IFM_AVALID; priv->media_active_last = IFM_ETHER | IFM_AUTO | IFM_ETH_RXPAUSE | IFM_FDX; /* setup default pauseframes configuration */ mlx5e_setup_pauseframes(priv); /* Setup supported medias */ //TODO: If we failed to query ptys is it ok to proceed?? if (!mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 1)) { ext = MLX5_CAP_PCAM_FEATURE(mdev, ptys_extended_ethernet); eth_proto_cap = MLX5_GET_ETH_PROTO(ptys_reg, out, ext, eth_proto_capability); if (MLX5_CAP_PCAM_FEATURE(mdev, ptys_connector_type)) connector_type = MLX5_GET(ptys_reg, out, connector_type); } else { eth_proto_cap = 0; if_printf(ifp, "%s: Query port media capability failed," " %d\n", __func__, err); } ifmedia_init(&priv->media, IFM_IMASK | IFM_ETH_FMASK, mlx5e_media_change, mlx5e_media_status); speeds_num = ext ? MLX5E_EXT_LINK_SPEEDS_NUMBER : MLX5E_LINK_SPEEDS_NUMBER; for (i = 0; i != speeds_num; i++) { for (j = 0; j < MLX5E_LINK_MODES_NUMBER ; ++j) { media_entry = ext ? mlx5e_ext_mode_table[i][j] : mlx5e_mode_table[i][j]; if (media_entry.baudrate == 0) continue; if (MLX5E_PROT_MASK(i) & eth_proto_cap) { ifmedia_add(&priv->media, media_entry.subtype | IFM_ETHER, 0, NULL); ifmedia_add(&priv->media, media_entry.subtype | IFM_ETHER | IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL); } } } ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE, 0, NULL); /* Set autoselect by default */ ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO | IFM_FDX | IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE); ether_ifattach(ifp, dev_addr); /* Register for VLAN events */ priv->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, mlx5e_vlan_rx_add_vid, priv, EVENTHANDLER_PRI_FIRST); priv->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, mlx5e_vlan_rx_kill_vid, priv, EVENTHANDLER_PRI_FIRST); /* Link is down by default */ if_link_state_change(ifp, LINK_STATE_DOWN); mlx5e_enable_async_events(priv); mlx5e_add_hw_stats(priv); mlx5e_create_stats(&priv->stats.vport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), "vstats", mlx5e_vport_stats_desc, MLX5E_VPORT_STATS_NUM, priv->stats.vport.arg); mlx5e_create_stats(&priv->stats.pport.ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), "pstats", mlx5e_pport_stats_desc, MLX5E_PPORT_STATS_NUM, priv->stats.pport.arg); mlx5e_create_ethtool(priv); mtx_lock(&priv->async_events_mtx); mlx5e_update_stats(priv); mtx_unlock(&priv->async_events_mtx); SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rx_clbr_done", CTLFLAG_RD, &priv->clbr_done, 0, "RX timestamps calibration state"); callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT); mlx5e_reset_calibration_callout(priv); pa.pa_version = PFIL_VERSION; pa.pa_flags = PFIL_IN; pa.pa_type = PFIL_TYPE_ETHERNET; pa.pa_headname = ifp->if_xname; priv->pfil = pfil_head_register(&pa); return (priv); #ifdef RATELIMIT err_create_mkey: mlx5_core_destroy_mkey(priv->mdev, &priv->mr); #endif err_dealloc_transport_domain: mlx5_dealloc_transport_domain(mdev, priv->tdn); err_dealloc_pd: mlx5_core_dealloc_pd(mdev, priv->pdn); err_unmap_free_uar: mlx5_unmap_free_uar(mdev, &priv->cq_uar); err_free_wq: flush_workqueue(priv->wq); err_free_sysctl: sysctl_ctx_free(&priv->sysctl_ctx); if (priv->sysctl_debug) sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); if_free(ifp); err_free_priv: mlx5e_priv_mtx_destroy(priv); free(priv, M_MLX5EN); return (NULL); } static void mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv) { struct mlx5e_priv *priv = vpriv; struct ifnet *ifp = priv->ifp; /* don't allow more IOCTLs */ priv->gone = 1; /* XXX wait a bit to allow IOCTL handlers to complete */ pause("W", hz); #ifdef RATELIMIT /* * The kernel can have reference(s) via the m_snd_tag's into * the ratelimit channels, and these must go away before * detaching: */ while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) { if_printf(priv->ifp, "Waiting for all ratelimit connections " "to terminate\n"); pause("W", hz); } #endif /* stop watchdog timer */ callout_drain(&priv->watchdog); callout_drain(&priv->tstmp_clbr); if (priv->vlan_attach != NULL) EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach); if (priv->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, priv->vlan_detach); /* make sure device gets closed */ PRIV_LOCK(priv); mlx5e_close_locked(ifp); PRIV_UNLOCK(priv); /* wait for all unlimited send tags to go away */ while (priv->channel_refs != 0) { if_printf(priv->ifp, "Waiting for all unlimited connections " "to terminate\n"); pause("W", hz); } /* deregister pfil */ if (priv->pfil != NULL) { pfil_head_unregister(priv->pfil); priv->pfil = NULL; } /* unregister device */ ifmedia_removeall(&priv->media); ether_ifdetach(ifp); if_free(ifp); #ifdef RATELIMIT mlx5e_rl_cleanup(priv); #endif /* destroy all remaining sysctl nodes */ sysctl_ctx_free(&priv->stats.vport.ctx); sysctl_ctx_free(&priv->stats.pport.ctx); if (priv->sysctl_debug) sysctl_ctx_free(&priv->stats.port_stats_debug.ctx); sysctl_ctx_free(&priv->sysctl_ctx); mlx5_core_destroy_mkey(priv->mdev, &priv->mr); mlx5_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); mlx5e_disable_async_events(priv); flush_workqueue(priv->wq); mlx5e_priv_mtx_destroy(priv); free(priv, M_MLX5EN); } static void * mlx5e_get_ifp(void *vpriv) { struct mlx5e_priv *priv = vpriv; return (priv->ifp); } static struct mlx5_interface mlx5e_interface = { .add = mlx5e_create_ifp, .remove = mlx5e_destroy_ifp, .event = mlx5e_async_event, .protocol = MLX5_INTERFACE_PROTOCOL_ETH, .get_dev = mlx5e_get_ifp, }; void mlx5e_init(void) { mlx5_register_interface(&mlx5e_interface); } void mlx5e_cleanup(void) { mlx5_unregister_interface(&mlx5e_interface); } static void mlx5e_show_version(void __unused *arg) { printf("%s", mlx5e_version); } SYSINIT(mlx5e_show_version, SI_SUB_DRIVERS, SI_ORDER_ANY, mlx5e_show_version, NULL); module_init_order(mlx5e_init, SI_ORDER_THIRD); module_exit_order(mlx5e_cleanup, SI_ORDER_THIRD); #if (__FreeBSD_version >= 1100000) MODULE_DEPEND(mlx5en, linuxkpi, 1, 1, 1); #endif MODULE_DEPEND(mlx5en, mlx5, 1, 1, 1); MODULE_VERSION(mlx5en, 1); Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c =================================================================== --- head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c (revision 348253) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c (revision 348254) @@ -1,1573 +1,1574 @@ /*- * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "en.h" #ifdef RATELIMIT static int mlx5e_rl_open_workers(struct mlx5e_priv *); static void mlx5e_rl_close_workers(struct mlx5e_priv *); static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS); static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x, struct sysctl_oid *, const char *name, const char *desc); static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, struct sysctl_oid *node, const char *name, const char *desc); static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value); static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value); static void mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl, struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size); MLX5_SET(wq, wq, log_wq_sz, log_sq_size); MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); MLX5_SET(wq, wq, pd, rl->priv->pdn); param->wq.buf_numa_node = 0; param->wq.db_numa_node = 0; param->wq.linear = 1; } static void mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl, struct mlx5e_cq_param *param) { void *cqc = param->cqc; uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size); MLX5_SET(cqc, cqc, log_cq_size, log_sq_size); MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs); MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts); switch (rl->param.tx_coalesce_mode) { case 0: MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; default: if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe)) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE); else MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); break; } } static void mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl, struct mlx5e_rl_channel_param *cparam) { memset(cparam, 0, sizeof(*cparam)); mlx5e_rl_build_sq_param(rl, &cparam->sq); mlx5e_rl_build_cq_param(rl, &cparam->cq); } static int mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, struct mlx5e_sq_param *param, int ix) { struct mlx5_core_dev *mdev = priv->mdev; void *sqc = param->sqc; void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq); int err; /* Create DMA descriptor TAG */ if ((err = -bus_dma_tag_create( bus_get_dma_tag(mdev->pdev->dev.bsddev), 1, /* any alignment */ 0, /* no boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */ MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */ MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockfuncarg */ &sq->dma_tag))) goto done; /* use shared UAR */ sq->uar = priv->rl.sq_uar; err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) goto err_free_dma_tag; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; /* * The sq->bf_buf_size variable is intentionally left zero so * that the doorbell writes will occur at the same memory * location. */ err = mlx5e_alloc_sq_db(sq); if (err) goto err_sq_wq_destroy; sq->mkey_be = cpu_to_be32(priv->mr.key); sq->ifp = priv->ifp; sq->priv = priv; mlx5e_update_sq_inline(sq); return (0); err_sq_wq_destroy: mlx5_wq_destroy(&sq->wq_ctrl); err_free_dma_tag: bus_dma_tag_destroy(sq->dma_tag); done: return (err); } static void mlx5e_rl_destroy_sq(struct mlx5e_sq *sq) { mlx5e_free_sq_db(sq); mlx5_wq_destroy(&sq->wq_ctrl); } static int mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq, struct mlx5e_sq_param *param, int ix) { int err; err = mlx5e_rl_create_sq(priv, sq, param, ix); if (err) return (err); err = mlx5e_enable_sq(sq, param, priv->rl.tisn); if (err) goto err_destroy_sq; err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY); if (err) goto err_disable_sq; WRITE_ONCE(sq->running, 1); return (0); err_disable_sq: mlx5e_disable_sq(sq); err_destroy_sq: mlx5e_rl_destroy_sq(sq); return (err); } static void mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq) { mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF); mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF); callout_init_mtx(&sq->cev_callout, &sq->lock, 0); sq->cev_factor = priv->rl.param.tx_completion_fact; /* ensure the TX completion event factor is not zero */ if (sq->cev_factor == 0) sq->cev_factor = 1; } static int mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix, struct mlx5e_rl_channel_param *cparam, struct mlx5e_sq *volatile *ppsq) { struct mlx5e_priv *priv = rlw->priv; struct mlx5e_sq *sq; int err; sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO); /* init mutexes */ mlx5e_rl_chan_mtx_init(priv, sq); /* open TX completion queue */ err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq, &mlx5e_tx_cq_comp, eq_ix); if (err) goto err_free; err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix); if (err) goto err_close_tx_cq; /* store TX channel pointer */ *ppsq = sq; /* poll TX queue initially */ sq->cq.mcq.comp(&sq->cq.mcq); return (0); err_close_tx_cq: mlx5e_close_cq(&sq->cq); err_free: /* destroy mutexes */ mtx_destroy(&sq->lock); mtx_destroy(&sq->comp_lock); free(sq, M_MLX5EN); atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL); return (err); } static void mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq) { struct mlx5e_sq *sq = *ppsq; /* check if channel is already closed */ if (sq == NULL) return; /* ensure channel pointer is no longer used */ *ppsq = NULL; /* teardown and destroy SQ */ mlx5e_drain_sq(sq); mlx5e_disable_sq(sq); mlx5e_rl_destroy_sq(sq); /* close CQ */ mlx5e_close_cq(&sq->cq); /* destroy mutexes */ mtx_destroy(&sq->lock); mtx_destroy(&sq->comp_lock); free(sq, M_MLX5EN); } static void mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl) { /* * Limit the maximum distance between completion events to * half of the currently set TX queue size. * * The maximum number of queue entries a single IP packet can * consume is given by MLX5_SEND_WQE_MAX_WQEBBS. * * The worst case max value is then given as below: */ uint64_t max = rl->param.tx_queue_size / (2 * MLX5_SEND_WQE_MAX_WQEBBS); /* * Update the maximum completion factor value in case the * tx_queue_size field changed. Ensure we don't overflow * 16-bits. */ if (max < 1) max = 1; else if (max > 65535) max = 65535; rl->param.tx_completion_fact_max = max; /* * Verify that the current TX completion factor is within the * given limits: */ if (rl->param.tx_completion_fact < 1) rl->param.tx_completion_fact = 1; else if (rl->param.tx_completion_fact > max) rl->param.tx_completion_fact = max; } static int mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index) { struct mlx5e_priv *priv = sq->priv; struct mlx5_core_dev *mdev = priv->mdev; void *in; void *sqc; int inlen; int err; inlen = MLX5_ST_SZ_BYTES(modify_sq_in); in = mlx5_vzalloc(inlen); if (in == NULL) return (-ENOMEM); sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); MLX5_SET(modify_sq_in, in, sqn, sq->sqn); MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY); MLX5_SET64(modify_sq_in, in, modify_bitmask, 1); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY); MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index); err = mlx5_core_modify_sq(mdev, in, inlen); kvfree(in); return (err); } /* * This function will search the configured rate limit table for the * best match to avoid that a single socket based application can * allocate all the available hardware rates. If the user selected * rate deviates too much from the closes rate available in the rate * limit table, unlimited rate will be selected. */ static uint64_t mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate) { uint64_t distance = -1ULL; uint64_t diff; uint64_t retval = 0; /* unlimited */ uint64_t x; /* search for closest rate */ for (x = 0; x != rl->param.tx_rates_def; x++) { uint64_t rate = rl->rate_limit_table[x]; if (rate == 0) continue; if (rate > user_rate) diff = rate - user_rate; else diff = user_rate - rate; /* check if distance is smaller than previous rate */ if (diff < distance) { distance = diff; retval = rate; } } /* range check for multiplication below */ if (user_rate > rl->param.tx_limit_max) user_rate = rl->param.tx_limit_max; /* fallback to unlimited, if rate deviates too much */ if (distance > howmany(user_rate * rl->param.tx_allowed_deviation, 1000ULL)) retval = 0; return (retval); } /* * This function sets the requested rate for a rate limit channel, in * bits per second. The requested rate will be filtered through the * find best rate function above. */ static int mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate) { struct mlx5e_rl_priv_data *rl = &rlw->priv->rl; struct mlx5e_sq *sq; uint64_t temp; uint16_t index; uint16_t burst; int error; if (rate != 0) { MLX5E_RL_WORKER_UNLOCK(rlw); MLX5E_RL_RLOCK(rl); /* get current burst size in bytes */ temp = rl->param.tx_burst_size * MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu); /* limit burst size to 64K currently */ if (temp > 65535) temp = 65535; burst = temp; /* find best rate */ rate = mlx5e_rl_find_best_rate_locked(rl, rate); MLX5E_RL_RUNLOCK(rl); if (rate == 0) { /* rate doesn't exist, fallback to unlimited */ index = 0; rate = 0; atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); } else { /* get a reference on the new rate */ error = -mlx5_rl_add_rate(rlw->priv->mdev, howmany(rate, 1000), burst, &index); if (error != 0) { /* adding rate failed, fallback to unlimited */ index = 0; rate = 0; atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL); } } MLX5E_RL_WORKER_LOCK(rlw); } else { index = 0; burst = 0; /* default */ } /* atomically swap rates */ temp = channel->last_rate; channel->last_rate = rate; rate = temp; /* atomically swap burst size */ temp = channel->last_burst; channel->last_burst = burst; burst = temp; MLX5E_RL_WORKER_UNLOCK(rlw); /* put reference on the old rate, if any */ if (rate != 0) { mlx5_rl_remove_rate(rlw->priv->mdev, howmany(rate, 1000), burst); } /* set new rate, if SQ is running */ sq = channel->sq; if (sq != NULL && READ_ONCE(sq->running) != 0) { error = mlx5e_rl_modify_sq(sq, index); if (error != 0) atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL); } else error = 0; MLX5E_RL_WORKER_LOCK(rlw); return (-error); } static void mlx5e_rl_worker(void *arg) { struct thread *td; struct mlx5e_rl_worker *rlw = arg; struct mlx5e_rl_channel *channel; struct mlx5e_priv *priv; unsigned ix; uint64_t x; int error; /* set thread priority */ td = curthread; thread_lock(td); sched_prio(td, PI_SWI(SWI_NET)); thread_unlock(td); priv = rlw->priv; /* compute completion vector */ ix = (rlw - priv->rl.workers) % priv->mdev->priv.eq_table.num_comp_vectors; /* TODO bind to CPU */ /* open all the SQs */ MLX5E_RL_WORKER_LOCK(rlw); for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { struct mlx5e_rl_channel *channel = rlw->channels + x; #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS) if (channel->state == MLX5E_RL_ST_FREE) continue; #endif MLX5E_RL_WORKER_UNLOCK(rlw); MLX5E_RL_RLOCK(&priv->rl); error = mlx5e_rl_open_channel(rlw, ix, &priv->rl.chan_param, &channel->sq); MLX5E_RL_RUNLOCK(&priv->rl); MLX5E_RL_WORKER_LOCK(rlw); if (error != 0) { if_printf(priv->ifp, "mlx5e_rl_open_channel failed: %d\n", error); break; } mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate); } while (1) { if (STAILQ_FIRST(&rlw->process_head) == NULL) { /* check if we are tearing down */ if (rlw->worker_done != 0) break; cv_wait(&rlw->cv, &rlw->mtx); } /* check if we are tearing down */ if (rlw->worker_done != 0) break; channel = STAILQ_FIRST(&rlw->process_head); if (channel != NULL) { STAILQ_REMOVE_HEAD(&rlw->process_head, entry); switch (channel->state) { case MLX5E_RL_ST_MODIFY: channel->state = MLX5E_RL_ST_USED; MLX5E_RL_WORKER_UNLOCK(rlw); /* create channel by demand */ if (channel->sq == NULL) { MLX5E_RL_RLOCK(&priv->rl); error = mlx5e_rl_open_channel(rlw, ix, &priv->rl.chan_param, &channel->sq); MLX5E_RL_RUNLOCK(&priv->rl); if (error != 0) { if_printf(priv->ifp, "mlx5e_rl_open_channel failed: %d\n", error); } else { atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL); } } else { mlx5e_resume_sq(channel->sq); } MLX5E_RL_WORKER_LOCK(rlw); /* convert from bytes/s to bits/s and set new rate */ error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->new_rate * 8ULL); if (error != 0) { if_printf(priv->ifp, "mlx5e_rlw_channel_set_rate_locked failed: %d\n", error); } break; case MLX5E_RL_ST_DESTROY: error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); if (error != 0) { if_printf(priv->ifp, "mlx5e_rlw_channel_set_rate_locked failed: %d\n", error); } if (channel->sq != NULL) { /* * Make sure all packets are * transmitted before SQ is * returned to free list: */ MLX5E_RL_WORKER_UNLOCK(rlw); mlx5e_drain_sq(channel->sq); MLX5E_RL_WORKER_LOCK(rlw); } /* put the channel back into the free list */ STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry); channel->state = MLX5E_RL_ST_FREE; atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL); break; default: /* NOP */ break; } } } /* close all the SQs */ for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) { struct mlx5e_rl_channel *channel = rlw->channels + x; /* update the initial rate */ channel->init_rate = channel->last_rate; /* make sure we free up the rate resource */ mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0); if (channel->sq != NULL) { MLX5E_RL_WORKER_UNLOCK(rlw); mlx5e_rl_close_channel(&channel->sq); atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL); MLX5E_RL_WORKER_LOCK(rlw); } } rlw->worker_done = 0; cv_broadcast(&rlw->cv); MLX5E_RL_WORKER_UNLOCK(rlw); kthread_exit(); } static int mlx5e_rl_open_tis(struct mlx5e_priv *priv) { struct mlx5_core_dev *mdev = priv->mdev; u32 in[MLX5_ST_SZ_DW(create_tis_in)]; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); memset(in, 0, sizeof(in)); MLX5_SET(tisc, tisc, prio, 0); MLX5_SET(tisc, tisc, transport_domain, priv->tdn); return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn)); } static void mlx5e_rl_close_tis(struct mlx5e_priv *priv) { mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn); } static void mlx5e_rl_set_default_params(struct mlx5e_rl_params *param, struct mlx5_core_dev *mdev) { /* ratelimit workers */ param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors; param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS; /* range check */ if (param->tx_worker_threads_def == 0 || param->tx_worker_threads_def > param->tx_worker_threads_max) param->tx_worker_threads_def = param->tx_worker_threads_max; /* ratelimit channels */ param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS / param->tx_worker_threads_def; param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS; /* range check */ if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER) param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER; /* set default burst size */ param->tx_burst_size = 4; /* MTUs */ /* * Set maximum burst size * * The burst size is multiplied by the MTU and clamped to the * range 0 ... 65535 bytes inclusivly before fed into the * firmware. * * NOTE: If the burst size or MTU is changed only ratelimit * connections made after the change will use the new burst * size. */ param->tx_burst_size_max = 255; /* get firmware rate limits in 1000bit/s and convert them to bit/s */ param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL; param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL; /* ratelimit table size */ param->tx_rates_max = mdev->priv.rl_table.max_size; /* range check */ if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES) param->tx_rates_max = MLX5E_RL_MAX_TX_RATES; /* set default number of rates */ param->tx_rates_def = param->tx_rates_max; /* set maximum allowed rate deviation */ if (param->tx_limit_max != 0) { /* * Make sure the deviation multiplication doesn't * overflow unsigned 64-bit: */ param->tx_allowed_deviation_max = -1ULL / param->tx_limit_max; } /* set default rate deviation */ param->tx_allowed_deviation = 50; /* 5.0% */ /* channel parameters */ param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT; param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT; param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT; param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT; } static const char *mlx5e_rl_params_desc[] = { MLX5E_RL_PARAMS(MLX5E_STATS_DESC) }; static const char *mlx5e_rl_table_params_desc[] = { MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC) }; static const char *mlx5e_rl_stats_desc[] = { MLX5E_RL_STATS(MLX5E_STATS_DESC) }; int mlx5e_rl_init(struct mlx5e_priv *priv) { struct mlx5e_rl_priv_data *rl = &priv->rl; struct sysctl_oid *node; struct sysctl_oid *stats; char buf[64]; uint64_t i; uint64_t j; int error; /* check if there is support for packet pacing */ if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) return (0); rl->priv = priv; sysctl_ctx_init(&rl->ctx); sx_init(&rl->rl_sxlock, "ratelimit-sxlock"); /* allocate shared UAR for SQs */ error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar); if (error) goto done; /* open own TIS domain for ratelimit SQs */ error = mlx5e_rl_open_tis(priv); if (error) goto err_uar; /* setup default value for parameters */ mlx5e_rl_set_default_params(&rl->param, priv->mdev); /* update the completion factor */ mlx5e_rl_sync_tx_completion_fact(rl); /* create root node */ node = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO, "rate_limit", CTLFLAG_RW, NULL, "Rate limiting support"); if (node != NULL) { /* create SYSCTLs */ for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) { mlx5e_rl_sysctl_add_u64_oid(rl, MLX5E_RL_PARAMS_INDEX(arg[i]), node, mlx5e_rl_params_desc[2 * i], mlx5e_rl_params_desc[2 * i + 1]); } stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, "stats", CTLFLAG_RD, NULL, "Rate limiting statistics"); if (stats != NULL) { /* create SYSCTLs */ for (i = 0; i != MLX5E_RL_STATS_NUM; i++) { mlx5e_rl_sysctl_add_stats_u64_oid(rl, i, stats, mlx5e_rl_stats_desc[2 * i], mlx5e_rl_stats_desc[2 * i + 1]); } } } /* allocate workers array */ rl->workers = malloc(sizeof(rl->workers[0]) * rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO); /* allocate rate limit array */ rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) * rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO); if (node != NULL) { /* create more SYSCTls */ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table, "A", "Show table of all configured TX rates"); /* try to fetch rate table from kernel environment */ for (i = 0; i != rl->param.tx_rates_def; i++) { /* compute path for tunable */ snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d", device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i); if (TUNABLE_QUAD_FETCH(buf, &j)) mlx5e_rl_tx_limit_add(rl, j); } /* setup rate table sysctls */ for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) { mlx5e_rl_sysctl_add_u64_oid(rl, MLX5E_RL_PARAMS_INDEX(table_arg[i]), node, mlx5e_rl_table_params_desc[2 * i], mlx5e_rl_table_params_desc[2 * i + 1]); } } for (j = 0; j < rl->param.tx_worker_threads_def; j++) { struct mlx5e_rl_worker *rlw = rl->workers + j; rlw->priv = priv; cv_init(&rlw->cv, "mlx5-worker-cv"); mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF); STAILQ_INIT(&rlw->index_list_head); STAILQ_INIT(&rlw->process_head); rlw->channels = malloc(sizeof(rlw->channels[0]) * rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO); MLX5E_RL_WORKER_LOCK(rlw); for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) { struct mlx5e_rl_channel *channel = rlw->channels + i; channel->worker = rlw; - channel->tag.m_snd_tag.ifp = priv->ifp; channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT; STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry); } MLX5E_RL_WORKER_UNLOCK(rlw); } PRIV_LOCK(priv); error = mlx5e_rl_open_workers(priv); PRIV_UNLOCK(priv); if (error != 0) { if_printf(priv->ifp, "mlx5e_rl_open_workers failed: %d\n", error); } return (0); err_uar: mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar); done: sysctl_ctx_free(&rl->ctx); sx_destroy(&rl->rl_sxlock); return (error); } static int mlx5e_rl_open_workers(struct mlx5e_priv *priv) { struct mlx5e_rl_priv_data *rl = &priv->rl; struct thread *rl_thread = NULL; struct proc *rl_proc = NULL; uint64_t j; int error; if (priv->gone || rl->opened) return (-EINVAL); MLX5E_RL_WLOCK(rl); /* compute channel parameters once */ mlx5e_rl_build_channel_param(rl, &rl->chan_param); MLX5E_RL_WUNLOCK(rl); for (j = 0; j < rl->param.tx_worker_threads_def; j++) { struct mlx5e_rl_worker *rlw = rl->workers + j; /* start worker thread */ error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread, RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j); if (error != 0) { if_printf(rl->priv->ifp, "kproc_kthread_add failed: %d\n", error); rlw->worker_done = 1; } } rl->opened = 1; return (0); } static void mlx5e_rl_close_workers(struct mlx5e_priv *priv) { struct mlx5e_rl_priv_data *rl = &priv->rl; uint64_t y; if (rl->opened == 0) return; /* tear down worker threads simultaneously */ for (y = 0; y < rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; /* tear down worker before freeing SQs */ MLX5E_RL_WORKER_LOCK(rlw); if (rlw->worker_done == 0) { rlw->worker_done = 1; cv_broadcast(&rlw->cv); } else { /* XXX thread not started */ rlw->worker_done = 0; } MLX5E_RL_WORKER_UNLOCK(rlw); } /* wait for worker threads to exit */ for (y = 0; y < rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; /* tear down worker before freeing SQs */ MLX5E_RL_WORKER_LOCK(rlw); while (rlw->worker_done != 0) cv_wait(&rlw->cv, &rlw->mtx); MLX5E_RL_WORKER_UNLOCK(rlw); } rl->opened = 0; } static void mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl) { unsigned x; MLX5E_RL_WLOCK(rl); for (x = 0; x != rl->param.tx_rates_def; x++) rl->rate_limit_table[x] = 0; MLX5E_RL_WUNLOCK(rl); } void mlx5e_rl_cleanup(struct mlx5e_priv *priv) { struct mlx5e_rl_priv_data *rl = &priv->rl; uint64_t y; /* check if there is support for packet pacing */ if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing)) return; /* TODO check if there is support for packet pacing */ sysctl_ctx_free(&rl->ctx); PRIV_LOCK(priv); mlx5e_rl_close_workers(priv); PRIV_UNLOCK(priv); mlx5e_rl_reset_rates(rl); /* free shared UAR for SQs */ mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar); /* close TIS domain */ mlx5e_rl_close_tis(priv); for (y = 0; y < rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; cv_destroy(&rlw->cv); mtx_destroy(&rlw->mtx); free(rlw->channels, M_MLX5EN); } free(rl->rate_limit_table, M_MLX5EN); free(rl->workers, M_MLX5EN); sx_destroy(&rl->rl_sxlock); } static void mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel) { STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry); cv_broadcast(&rlw->cv); } static void mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel) { if (channel == NULL) return; MLX5E_RL_WORKER_LOCK(rlw); switch (channel->state) { case MLX5E_RL_ST_MODIFY: channel->state = MLX5E_RL_ST_DESTROY; break; case MLX5E_RL_ST_USED: channel->state = MLX5E_RL_ST_DESTROY; mlx5e_rlw_queue_channel_locked(rlw, channel); break; default: break; } MLX5E_RL_WORKER_UNLOCK(rlw); } static int mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate) { MLX5E_RL_WORKER_LOCK(rlw); channel->new_rate = rate; switch (channel->state) { case MLX5E_RL_ST_USED: channel->state = MLX5E_RL_ST_MODIFY; mlx5e_rlw_queue_channel_locked(rlw, channel); break; default: break; } MLX5E_RL_WORKER_UNLOCK(rlw); return (0); } static int mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, union if_snd_tag_query_params *params) { int retval; MLX5E_RL_WORKER_LOCK(rlw); switch (channel->state) { case MLX5E_RL_ST_USED: params->rate_limit.max_rate = channel->last_rate; params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq); retval = 0; break; case MLX5E_RL_ST_MODIFY: params->rate_limit.max_rate = channel->last_rate; params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq); retval = EBUSY; break; default: retval = EINVAL; break; } MLX5E_RL_WORKER_UNLOCK(rlw); return (retval); } static int mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel **pchannel) { struct mlx5e_rl_channel *channel; int retval = ENOMEM; MLX5E_RL_WORKER_LOCK(rlw); /* Check for available channel in free list */ if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) { retval = 0; /* Remove head index from available list */ STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry); channel->state = MLX5E_RL_ST_USED; atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL); } else { atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL); } MLX5E_RL_WORKER_UNLOCK(rlw); *pchannel = channel; #ifdef RATELIMIT_DEBUG if_printf(rlw->priv->ifp, "Channel pointer for rate limit connection is %p\n", channel); #endif return (retval); } int mlx5e_rl_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { struct mlx5e_rl_channel *channel; struct mlx5e_rl_worker *rlw; struct mlx5e_priv *priv; int error; priv = ifp->if_softc; /* check if there is support for packet pacing or if device is going away */ if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone || params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT) return (EOPNOTSUPP); /* compute worker thread this TCP connection belongs to */ rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) % priv->rl.param.tx_worker_threads_def); error = mlx5e_find_available_tx_ring_index(rlw, &channel); if (error != 0) goto done; error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate); if (error != 0) { mlx5e_rl_free(rlw, channel); goto done; } /* store pointer to mbuf tag */ + MPASS(channel->tag.m_snd_tag.refcount == 0); + m_snd_tag_init(&channel->tag.m_snd_tag, ifp); *ppmt = &channel->tag.m_snd_tag; done: return (error); } int mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params) { struct mlx5e_rl_channel *channel = container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag); return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate)); } int mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params) { struct mlx5e_rl_channel *channel = container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag); return (mlx5e_rl_query(channel->worker, channel, params)); } void mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_rl_channel *channel = container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag); mlx5e_rl_free(channel->worker, channel); } static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS) { struct mlx5e_rl_priv_data *rl = arg1; struct mlx5e_priv *priv = rl->priv; struct sbuf sbuf; unsigned x; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); PRIV_LOCK(priv); sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req); sbuf_printf(&sbuf, "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n" "\t" "--------------------------------------------\n"); MLX5E_RL_RLOCK(rl); for (x = 0; x != rl->param.tx_rates_def; x++) { if (rl->rate_limit_table[x] == 0) continue; sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n", x, (unsigned)rl->param.tx_burst_size, (long long)rl->rate_limit_table[x]); } MLX5E_RL_RUNLOCK(rl); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); PRIV_UNLOCK(priv); return (error); } static int mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl) { uint64_t x; uint64_t y; MLX5E_RL_WLOCK(rl); /* compute channel parameters once */ mlx5e_rl_build_channel_param(rl, &rl->chan_param); MLX5E_RL_WUNLOCK(rl); for (y = 0; y != rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { struct mlx5e_rl_channel *channel; struct mlx5e_sq *sq; channel = rlw->channels + x; sq = channel->sq; if (sq == NULL) continue; if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) { mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq, rl->param.tx_coalesce_usecs, rl->param.tx_coalesce_pkts, rl->param.tx_coalesce_mode); } else { mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq, rl->param.tx_coalesce_usecs, rl->param.tx_coalesce_pkts); } } } return (0); } void mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl) { uint64_t x; uint64_t y; for (y = 0; y != rl->param.tx_worker_threads_def; y++) { struct mlx5e_rl_worker *rlw = rl->workers + y; for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) { struct mlx5e_rl_channel *channel; struct mlx5e_sq *sq; channel = rlw->channels + x; sq = channel->sq; if (sq == NULL) continue; mtx_lock(&sq->lock); mlx5e_update_sq_inline(sq); mtx_unlock(&sq->lock); } } } static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value) { unsigned x; int error; if (value < 1000 || mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0) return (EINVAL); MLX5E_RL_WLOCK(rl); error = ENOMEM; /* check if rate already exists */ for (x = 0; x != rl->param.tx_rates_def; x++) { if (rl->rate_limit_table[x] != value) continue; error = EEXIST; break; } /* check if there is a free rate entry */ if (x == rl->param.tx_rates_def) { for (x = 0; x != rl->param.tx_rates_def; x++) { if (rl->rate_limit_table[x] != 0) continue; rl->rate_limit_table[x] = value; error = 0; break; } } MLX5E_RL_WUNLOCK(rl); return (error); } static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value) { unsigned x; int error; if (value == 0) return (EINVAL); MLX5E_RL_WLOCK(rl); /* check if rate already exists */ for (x = 0; x != rl->param.tx_rates_def; x++) { if (rl->rate_limit_table[x] != value) continue; /* free up rate */ rl->rate_limit_table[x] = 0; break; } /* check if there is a free rate entry */ if (x == rl->param.tx_rates_def) error = ENOENT; else error = 0; MLX5E_RL_WUNLOCK(rl); return (error); } static int mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct mlx5e_rl_priv_data *rl = arg1; struct mlx5e_priv *priv = rl->priv; unsigned mode_modify; unsigned was_opened; uint64_t value; uint64_t old; int error; PRIV_LOCK(priv); MLX5E_RL_RLOCK(rl); value = rl->param.arg[arg2]; MLX5E_RL_RUNLOCK(rl); if (req != NULL) { old = value; error = sysctl_handle_64(oidp, &value, 0, req); if (error || req->newptr == NULL || value == rl->param.arg[arg2]) goto done; } else { old = 0; error = 0; } /* check if device is gone */ if (priv->gone) { error = ENXIO; goto done; } was_opened = rl->opened; mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify); switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) { case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def): if (value > rl->param.tx_worker_threads_max) value = rl->param.tx_worker_threads_max; else if (value < 1) value = 1; /* store new value */ rl->param.arg[arg2] = value; break; case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def): if (value > rl->param.tx_channels_per_worker_max) value = rl->param.tx_channels_per_worker_max; else if (value < 1) value = 1; /* store new value */ rl->param.arg[arg2] = value; break; case MLX5E_RL_PARAMS_INDEX(tx_rates_def): if (value > rl->param.tx_rates_max) value = rl->param.tx_rates_max; else if (value < 1) value = 1; /* store new value */ rl->param.arg[arg2] = value; break; case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs): /* range check */ if (value < 1) value = 0; else if (value > MLX5E_FLD_MAX(cqc, cq_period)) value = MLX5E_FLD_MAX(cqc, cq_period); /* store new value */ rl->param.arg[arg2] = value; /* check to avoid down and up the network interface */ if (was_opened) error = mlx5e_rl_refresh_channel_params(rl); break; case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts): /* import TX coal pkts */ if (value < 1) value = 0; else if (value > MLX5E_FLD_MAX(cqc, cq_max_count)) value = MLX5E_FLD_MAX(cqc, cq_max_count); /* store new value */ rl->param.arg[arg2] = value; /* check to avoid down and up the network interface */ if (was_opened) error = mlx5e_rl_refresh_channel_params(rl); break; case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode): /* network interface must be down */ if (was_opened != 0 && mode_modify == 0) mlx5e_rl_close_workers(priv); /* import TX coalesce mode */ if (value != 0) value = 1; /* store new value */ rl->param.arg[arg2] = value; /* restart network interface, if any */ if (was_opened != 0) { if (mode_modify == 0) mlx5e_rl_open_workers(priv); else error = mlx5e_rl_refresh_channel_params(rl); } break; case MLX5E_RL_PARAMS_INDEX(tx_queue_size): /* network interface must be down */ if (was_opened) mlx5e_rl_close_workers(priv); /* import TX queue size */ if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE)) value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE); else if (value > priv->params_ethtool.tx_queue_size_max) value = priv->params_ethtool.tx_queue_size_max; /* store actual TX queue size */ value = 1ULL << order_base_2(value); /* store new value */ rl->param.arg[arg2] = value; /* verify TX completion factor */ mlx5e_rl_sync_tx_completion_fact(rl); /* restart network interface, if any */ if (was_opened) mlx5e_rl_open_workers(priv); break; case MLX5E_RL_PARAMS_INDEX(tx_completion_fact): /* network interface must be down */ if (was_opened) mlx5e_rl_close_workers(priv); /* store new value */ rl->param.arg[arg2] = value; /* verify parameter */ mlx5e_rl_sync_tx_completion_fact(rl); /* restart network interface, if any */ if (was_opened) mlx5e_rl_open_workers(priv); break; case MLX5E_RL_PARAMS_INDEX(tx_limit_add): error = mlx5e_rl_tx_limit_add(rl, value); break; case MLX5E_RL_PARAMS_INDEX(tx_limit_clr): error = mlx5e_rl_tx_limit_clr(rl, value); break; case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation): /* range check */ if (value > rl->param.tx_allowed_deviation_max) value = rl->param.tx_allowed_deviation_max; else if (value < rl->param.tx_allowed_deviation_min) value = rl->param.tx_allowed_deviation_min; MLX5E_RL_WLOCK(rl); rl->param.arg[arg2] = value; MLX5E_RL_WUNLOCK(rl); break; case MLX5E_RL_PARAMS_INDEX(tx_burst_size): /* range check */ if (value > rl->param.tx_burst_size_max) value = rl->param.tx_burst_size_max; else if (value < rl->param.tx_burst_size_min) value = rl->param.tx_burst_size_min; MLX5E_RL_WLOCK(rl); rl->param.arg[arg2] = value; MLX5E_RL_WUNLOCK(rl); break; default: break; } done: PRIV_UNLOCK(priv); return (error); } static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, struct sysctl_oid *node, const char *name, const char *desc) { /* * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will * take care of loading default sysctl value from the kernel * environment, if any: */ if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) { /* read-only SYSCTLs */ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); } else { if (strstr(name, "_def") != 0) { #ifdef RATELIMIT_DEBUG /* tunable read-only advanced SYSCTLs */ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, CTLTYPE_U64 | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); #endif } else { /* read-write SYSCTLs */ SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc); } } } static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x, struct sysctl_oid *node, const char *name, const char *desc) { /* read-only SYSCTLs */ SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name, CTLFLAG_RD, &rl->stats.arg[x], 0, desc); } #endif Index: head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c =================================================================== --- head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c (revision 348253) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c (revision 348254) @@ -1,657 +1,642 @@ /*- * Copyright (c) 2015-2018 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "en.h" #include static inline bool mlx5e_do_send_cqe(struct mlx5e_sq *sq) { sq->cev_counter++; /* interleave the CQEs */ if (sq->cev_counter >= sq->cev_factor) { sq->cev_counter = 0; return (1); } return (0); } void mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt) { u16 pi = sq->pc & sq->wq.sz_m1; struct mlx5e_tx_wqe *wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); memset(&wqe->ctrl, 0, sizeof(wqe->ctrl)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); if (mlx5e_do_send_cqe(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; else wqe->ctrl.fm_ce_se = 0; /* Copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); sq->mbuf[pi].mbuf = NULL; sq->mbuf[pi].num_bytes = 0; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->pc += sq->mbuf[pi].num_wqebbs; } #if (__FreeBSD_version >= 1100000) static uint32_t mlx5e_hash_value; static void mlx5e_hash_init(void *arg) { mlx5e_hash_value = m_ether_tcpip_hash_init(); } /* Make kernel call mlx5e_hash_init after the random stack finished initializing */ SYSINIT(mlx5e_hash_init, SI_SUB_RANDOM, SI_ORDER_ANY, &mlx5e_hash_init, NULL); #endif static struct mlx5e_sq * mlx5e_select_queue_by_send_tag(struct ifnet *ifp, struct mbuf *mb) { struct mlx5e_snd_tag *ptag; struct mlx5e_sq *sq; - /* check for route change */ - if (mb->m_pkthdr.snd_tag->ifp != ifp) - return (NULL); - /* get pointer to sendqueue */ ptag = container_of(mb->m_pkthdr.snd_tag, struct mlx5e_snd_tag, m_snd_tag); switch (ptag->type) { #ifdef RATELIMIT case IF_SND_TAG_TYPE_RATE_LIMIT: sq = container_of(ptag, struct mlx5e_rl_channel, tag)->sq; break; #endif case IF_SND_TAG_TYPE_UNLIMITED: sq = &container_of(ptag, struct mlx5e_channel, tag)->sq[0]; KASSERT(({ struct mlx5e_priv *priv = ifp->if_softc; priv->channel_refs > 0; }), ("mlx5e_select_queue: Channel refs are zero for unlimited tag")); break; default: sq = NULL; break; } /* check if valid */ if (sq != NULL && READ_ONCE(sq->running) != 0) return (sq); return (NULL); } static struct mlx5e_sq * mlx5e_select_queue(struct ifnet *ifp, struct mbuf *mb) { struct mlx5e_priv *priv = ifp->if_softc; struct mlx5e_sq *sq; u32 ch; u32 tc; /* obtain VLAN information if present */ if (mb->m_flags & M_VLANTAG) { tc = (mb->m_pkthdr.ether_vtag >> 13); if (tc >= priv->num_tc) tc = priv->default_vlan_prio; } else { tc = priv->default_vlan_prio; } ch = priv->params.num_channels; /* check if flowid is set */ if (M_HASHTYPE_GET(mb) != M_HASHTYPE_NONE) { #ifdef RSS u32 temp; if (rss_hash2bucket(mb->m_pkthdr.flowid, M_HASHTYPE_GET(mb), &temp) == 0) ch = temp % ch; else #endif ch = (mb->m_pkthdr.flowid % 128) % ch; } else { #if (__FreeBSD_version >= 1100000) ch = m_ether_tcpip_hash(MBUF_HASHFLAG_L3 | MBUF_HASHFLAG_L4, mb, mlx5e_hash_value) % ch; #else /* * m_ether_tcpip_hash not present in stable, so just * throw unhashed mbufs on queue 0 */ ch = 0; #endif } /* check if send queue is running */ sq = &priv->channel[ch].sq[tc]; if (likely(READ_ONCE(sq->running) != 0)) return (sq); return (NULL); } static inline u16 mlx5e_get_l2_header_size(struct mlx5e_sq *sq, struct mbuf *mb) { struct ether_vlan_header *eh; uint16_t eth_type; int min_inline; eh = mtod(mb, struct ether_vlan_header *); if (unlikely(mb->m_len < ETHER_HDR_LEN)) { goto max_inline; } else if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { if (unlikely(mb->m_len < (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN))) goto max_inline; eth_type = ntohs(eh->evl_proto); min_inline = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = ntohs(eh->evl_encap_proto); min_inline = ETHER_HDR_LEN; } switch (eth_type) { case ETHERTYPE_IP: case ETHERTYPE_IPV6: /* * Make sure the TOS(IPv4) or traffic class(IPv6) * field gets inlined. Else the SQ may stall. */ min_inline += 4; break; default: goto max_inline; } /* * m_copydata() will be used on the remaining header which * does not need to reside within the first m_len bytes of * data: */ if (mb->m_pkthdr.len < min_inline) goto max_inline; return (min_inline); max_inline: return (MIN(mb->m_pkthdr.len, sq->max_inline)); } static int mlx5e_get_full_header_size(struct mbuf *mb) { struct ether_vlan_header *eh; struct tcphdr *th; struct ip *ip; int ip_hlen, tcp_hlen; struct ip6_hdr *ip6; uint16_t eth_type; int eth_hdr_len; eh = mtod(mb, struct ether_vlan_header *); if (mb->m_len < ETHER_HDR_LEN) return (0); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { if (mb->m_len < (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)) return (0); eth_type = ntohs(eh->evl_proto); eth_hdr_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; } else { eth_type = ntohs(eh->evl_encap_proto); eth_hdr_len = ETHER_HDR_LEN; } switch (eth_type) { case ETHERTYPE_IP: ip = (struct ip *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip)) return (0); switch (ip->ip_p) { case IPPROTO_TCP: ip_hlen = ip->ip_hl << 2; eth_hdr_len += ip_hlen; break; case IPPROTO_UDP: ip_hlen = ip->ip_hl << 2; eth_hdr_len += ip_hlen + 8; goto done; default: return (0); } break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mb->m_data + eth_hdr_len); if (mb->m_len < eth_hdr_len + sizeof(*ip6)) return (0); switch (ip6->ip6_nxt) { case IPPROTO_TCP: eth_hdr_len += sizeof(*ip6); break; case IPPROTO_UDP: eth_hdr_len += sizeof(*ip6) + 8; goto done; default: return (0); } break; default: return (0); } if (mb->m_len < eth_hdr_len + sizeof(*th)) return (0); th = (struct tcphdr *)(mb->m_data + eth_hdr_len); tcp_hlen = th->th_off << 2; eth_hdr_len += tcp_hlen; done: /* * m_copydata() will be used on the remaining header which * does not need to reside within the first m_len bytes of * data: */ if (mb->m_pkthdr.len < eth_hdr_len) return (0); return (eth_hdr_len); } static int mlx5e_sq_xmit(struct mlx5e_sq *sq, struct mbuf **mbp) { bus_dma_segment_t segs[MLX5E_MAX_TX_MBUF_FRAGS]; struct mlx5_wqe_data_seg *dseg; struct mlx5e_tx_wqe *wqe; struct ifnet *ifp; int nsegs; int err; int x; struct mbuf *mb = *mbp; u16 ds_cnt; u16 ihs; u16 pi; u8 opcode; /* Return ENOBUFS if the queue is full */ if (unlikely(!mlx5e_sq_has_room_for(sq, 2 * MLX5_SEND_WQE_MAX_WQEBBS))) return (ENOBUFS); /* Align SQ edge with NOPs to avoid WQE wrap around */ pi = ((~sq->pc) & sq->wq.sz_m1); if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) { /* Send one multi NOP message instead of many */ mlx5e_send_nop(sq, (pi + 1) * MLX5_SEND_WQEBB_NUM_DS); pi = ((~sq->pc) & sq->wq.sz_m1); if (pi < (MLX5_SEND_WQE_MAX_WQEBBS - 1)) return (ENOMEM); } /* Setup local variables */ pi = sq->pc & sq->wq.sz_m1; wqe = mlx5_wq_cyc_get_wqe(&sq->wq, pi); ifp = sq->ifp; memset(wqe, 0, sizeof(*wqe)); /* Send a copy of the frame to the BPF listener, if any */ if (ifp != NULL && ifp->if_bpf != NULL) ETHER_BPF_MTAP(ifp, mb); if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)) { wqe->eth.cs_flags |= MLX5_ETH_WQE_L3_CSUM; } if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO)) { wqe->eth.cs_flags |= MLX5_ETH_WQE_L4_CSUM; } if (wqe->eth.cs_flags == 0) { sq->stats.csum_offload_none++; } if (mb->m_pkthdr.csum_flags & CSUM_TSO) { u32 payload_len; u32 mss = mb->m_pkthdr.tso_segsz; u32 num_pkts; wqe->eth.mss = cpu_to_be16(mss); opcode = MLX5_OPCODE_LSO; ihs = mlx5e_get_full_header_size(mb); if (unlikely(ihs == 0)) { err = EINVAL; goto tx_drop; } payload_len = mb->m_pkthdr.len - ihs; if (payload_len == 0) num_pkts = 1; else num_pkts = DIV_ROUND_UP(payload_len, mss); sq->mbuf[pi].num_bytes = payload_len + (num_pkts * ihs); sq->stats.tso_packets++; sq->stats.tso_bytes += payload_len; } else { opcode = MLX5_OPCODE_SEND; switch (sq->min_inline_mode) { case MLX5_INLINE_MODE_IP: case MLX5_INLINE_MODE_TCP_UDP: ihs = mlx5e_get_full_header_size(mb); if (unlikely(ihs == 0)) ihs = mlx5e_get_l2_header_size(sq, mb); break; case MLX5_INLINE_MODE_L2: ihs = mlx5e_get_l2_header_size(sq, mb); break; case MLX5_INLINE_MODE_NONE: /* FALLTHROUGH */ default: if ((mb->m_flags & M_VLANTAG) != 0 && (sq->min_insert_caps & MLX5E_INSERT_VLAN) != 0) { /* inlining VLAN data is not required */ wqe->eth.vlan_cmd = htons(0x8000); /* bit 0 CVLAN */ wqe->eth.vlan_hdr = htons(mb->m_pkthdr.ether_vtag); ihs = 0; } else if ((mb->m_flags & M_VLANTAG) == 0 && (sq->min_insert_caps & MLX5E_INSERT_NON_VLAN) != 0) { /* inlining non-VLAN data is not required */ ihs = 0; } else { /* we are forced to inlining L2 header, if any */ ihs = mlx5e_get_l2_header_size(sq, mb); } break; } sq->mbuf[pi].num_bytes = max_t (unsigned int, mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN); } if (likely(ihs == 0)) { /* nothing to inline */ } else if (unlikely(ihs > sq->max_inline)) { /* inline header size is too big */ err = EINVAL; goto tx_drop; } else if ((mb->m_flags & M_VLANTAG) != 0) { struct ether_vlan_header *eh = (struct ether_vlan_header *) wqe->eth.inline_hdr_start; /* Range checks */ if (unlikely(ihs > (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN))) ihs = (MLX5E_MAX_TX_INLINE - ETHER_VLAN_ENCAP_LEN); else if (unlikely(ihs < ETHER_HDR_LEN)) { err = EINVAL; goto tx_drop; } m_copydata(mb, 0, ETHER_HDR_LEN, (caddr_t)eh); m_adj(mb, ETHER_HDR_LEN); /* Insert 4 bytes VLAN tag into data stream */ eh->evl_proto = eh->evl_encap_proto; eh->evl_encap_proto = htons(ETHERTYPE_VLAN); eh->evl_tag = htons(mb->m_pkthdr.ether_vtag); /* Copy rest of header data, if any */ m_copydata(mb, 0, ihs - ETHER_HDR_LEN, (caddr_t)(eh + 1)); m_adj(mb, ihs - ETHER_HDR_LEN); /* Extend header by 4 bytes */ ihs += ETHER_VLAN_ENCAP_LEN; wqe->eth.inline_hdr_sz = cpu_to_be16(ihs); } else { m_copydata(mb, 0, ihs, wqe->eth.inline_hdr_start); m_adj(mb, ihs); wqe->eth.inline_hdr_sz = cpu_to_be16(ihs); } ds_cnt = sizeof(*wqe) / MLX5_SEND_WQE_DS; if (ihs > sizeof(wqe->eth.inline_hdr_start)) { ds_cnt += DIV_ROUND_UP(ihs - sizeof(wqe->eth.inline_hdr_start), MLX5_SEND_WQE_DS); } dseg = ((struct mlx5_wqe_data_seg *)&wqe->ctrl) + ds_cnt; err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map, mb, segs, &nsegs, BUS_DMA_NOWAIT); if (err == EFBIG) { /* Update statistics */ sq->stats.defragged++; /* Too many mbuf fragments */ mb = m_defrag(*mbp, M_NOWAIT); if (mb == NULL) { mb = *mbp; goto tx_drop; } /* Try again */ err = bus_dmamap_load_mbuf_sg(sq->dma_tag, sq->mbuf[pi].dma_map, mb, segs, &nsegs, BUS_DMA_NOWAIT); } /* Catch errors */ if (err != 0) goto tx_drop; /* Make sure all mbuf data, if any, is written to RAM */ if (nsegs != 0) { bus_dmamap_sync(sq->dma_tag, sq->mbuf[pi].dma_map, BUS_DMASYNC_PREWRITE); } else { /* All data was inlined, free the mbuf. */ bus_dmamap_unload(sq->dma_tag, sq->mbuf[pi].dma_map); m_freem(mb); mb = NULL; } for (x = 0; x != nsegs; x++) { if (segs[x].ds_len == 0) continue; dseg->addr = cpu_to_be64((uint64_t)segs[x].ds_addr); dseg->lkey = sq->mkey_be; dseg->byte_count = cpu_to_be32((uint32_t)segs[x].ds_len); dseg++; } ds_cnt = (dseg - ((struct mlx5_wqe_data_seg *)&wqe->ctrl)); wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode); wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt); if (mlx5e_do_send_cqe(sq)) wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE; else wqe->ctrl.fm_ce_se = 0; /* Copy data for doorbell */ memcpy(sq->doorbell.d32, &wqe->ctrl, sizeof(sq->doorbell.d32)); /* Store pointer to mbuf */ sq->mbuf[pi].mbuf = mb; sq->mbuf[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS); sq->pc += sq->mbuf[pi].num_wqebbs; /* Count all traffic going out */ sq->stats.packets++; sq->stats.bytes += sq->mbuf[pi].num_bytes; *mbp = NULL; /* safety clear */ return (0); tx_drop: sq->stats.dropped++; *mbp = NULL; m_freem(mb); return err; } static void mlx5e_poll_tx_cq(struct mlx5e_sq *sq, int budget) { u16 sqcc; /* * sq->cc must be updated only after mlx5_cqwq_update_db_record(), * otherwise a cq overrun may occur */ sqcc = sq->cc; while (budget > 0) { struct mlx5_cqe64 *cqe; struct mbuf *mb; u16 x; u16 ci; cqe = mlx5e_get_cqe(&sq->cq); if (!cqe) break; mlx5_cqwq_pop(&sq->cq.wq); /* update budget according to the event factor */ budget -= sq->cev_factor; for (x = 0; x != sq->cev_factor; x++) { ci = sqcc & sq->wq.sz_m1; mb = sq->mbuf[ci].mbuf; sq->mbuf[ci].mbuf = NULL; if (mb == NULL) { if (sq->mbuf[ci].num_bytes == 0) { /* NOP */ sq->stats.nop++; } } else { bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map); /* Free transmitted mbuf */ m_freem(mb); } sqcc += sq->mbuf[ci].num_wqebbs; } } mlx5_cqwq_update_db_record(&sq->cq.wq); /* Ensure cq space is freed before enabling more cqes */ atomic_thread_fence_rel(); sq->cc = sqcc; } static int mlx5e_xmit_locked(struct ifnet *ifp, struct mlx5e_sq *sq, struct mbuf *mb) { int err = 0; if (unlikely((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || READ_ONCE(sq->running) == 0)) { m_freem(mb); return (ENETDOWN); } /* Do transmit */ if (mlx5e_sq_xmit(sq, &mb) != 0) { /* NOTE: m_freem() is NULL safe */ m_freem(mb); err = ENOBUFS; } /* Check if we need to write the doorbell */ if (likely(sq->doorbell.d64 != 0)) { mlx5e_tx_notify_hw(sq, sq->doorbell.d32, 0); sq->doorbell.d64 = 0; } /* * Check if we need to start the event timer which flushes the * transmit ring on timeout: */ if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL && sq->cev_factor != 1)) { /* start the timer */ mlx5e_sq_cev_timeout(sq); } else { /* don't send NOPs yet */ sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS; } return (err); } int mlx5e_xmit(struct ifnet *ifp, struct mbuf *mb) { struct mlx5e_sq *sq; int ret; - if (mb->m_pkthdr.snd_tag != NULL) { + if (mb->m_pkthdr.csum_flags & CSUM_SND_TAG) { + MPASS(mb->m_pkthdr.snd_tag->ifp == ifp); sq = mlx5e_select_queue_by_send_tag(ifp, mb); if (unlikely(sq == NULL)) { - /* Check for route change */ - if (mb->m_pkthdr.snd_tag->ifp != ifp) { - /* Free mbuf */ - m_freem(mb); - - /* - * Tell upper layers about route - * change and to re-transmit this - * packet: - */ - return (EAGAIN); - } goto select_queue; } } else { select_queue: sq = mlx5e_select_queue(ifp, mb); if (unlikely(sq == NULL)) { /* Free mbuf */ m_freem(mb); /* Invalid send queue */ return (ENXIO); } } mtx_lock(&sq->lock); ret = mlx5e_xmit_locked(ifp, sq, mb); mtx_unlock(&sq->lock); return (ret); } void mlx5e_tx_cq_comp(struct mlx5_core_cq *mcq) { struct mlx5e_sq *sq = container_of(mcq, struct mlx5e_sq, cq.mcq); mtx_lock(&sq->comp_lock); mlx5e_poll_tx_cq(sq, MLX5E_BUDGET_MAX); mlx5e_cq_arm(&sq->cq, MLX5_GET_DOORBELL_LOCK(&sq->priv->doorbell_lock)); mtx_unlock(&sq->comp_lock); } Index: head/sys/kern/kern_mbuf.c =================================================================== --- head/sys/kern/kern_mbuf.c (revision 348253) +++ head/sys/kern/kern_mbuf.c (revision 348254) @@ -1,1151 +1,1182 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004, 2005, * Bosko Milekic . All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include +#include +#include + #include #include #include #include #include #include #include /* * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA * Zones. * * Mbuf Clusters (2K, contiguous) are allocated from the Cluster * Zone. The Zone can be capped at kern.ipc.nmbclusters, if the * administrator so desires. * * Mbufs are allocated from a UMA Master Zone called the Mbuf * Zone. * * Additionally, FreeBSD provides a Packet Zone, which it * configures as a Secondary Zone to the Mbuf Master Zone, * thus sharing backend Slab kegs with the Mbuf Master Zone. * * Thus common-case allocations and locking are simplified: * * m_clget() m_getcl() * | | * | .------------>[(Packet Cache)] m_get(), m_gethdr() * | | [ Packet ] | * [(Cluster Cache)] [ Secondary ] [ (Mbuf Cache) ] * [ Cluster Zone ] [ Zone ] [ Mbuf Master Zone ] * | \________ | * [ Cluster Keg ] \ / * | [ Mbuf Keg ] * [ Cluster Slabs ] | * | [ Mbuf Slabs ] * \____________(VM)_________________/ * * * Whenever an object is allocated with uma_zalloc() out of * one of the Zones its _ctor_ function is executed. The same * for any deallocation through uma_zfree() the _dtor_ function * is executed. * * Caches are per-CPU and are filled from the Master Zone. * * Whenever an object is allocated from the underlying global * memory pool it gets pre-initialized with the _zinit_ functions. * When the Keg's are overfull objects get decommissioned with * _zfini_ functions and free'd back to the global memory pool. * */ int nmbufs; /* limits number of mbufs */ int nmbclusters; /* limits number of mbuf clusters */ int nmbjumbop; /* limits number of page size jumbo clusters */ int nmbjumbo9; /* limits number of 9k jumbo clusters */ int nmbjumbo16; /* limits number of 16k jumbo clusters */ static quad_t maxmbufmem; /* overall real memory limit for all mbufs */ SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types"); +static counter_u64_t snd_tag_count; +SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW, + &snd_tag_count, "# of active mbuf send tags"); + /* * tunable_mbinit() has to be run before any mbuf allocations are done. */ static void tunable_mbinit(void *dummy) { quad_t realmem; /* * The default limit for all mbuf related memory is 1/2 of all * available kernel memory (physical or kmem). * At most it can be 3/4 of available kernel memory. */ realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size); maxmbufmem = realmem / 2; TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem); if (maxmbufmem > realmem / 4 * 3) maxmbufmem = realmem / 4 * 3; TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters); if (nmbclusters == 0) nmbclusters = maxmbufmem / MCLBYTES / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop); if (nmbjumbop == 0) nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9); if (nmbjumbo9 == 0) nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6; TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16); if (nmbjumbo16 == 0) nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6; /* * We need at least as many mbufs as we have clusters of * the various types added together. */ TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs); if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) nmbufs = lmax(maxmbufmem / MSIZE / 5, nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16); } SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL); static int sysctl_nmbclusters(SYSCTL_HANDLER_ARGS) { int error, newnmbclusters; newnmbclusters = nmbclusters; error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); if (error == 0 && req->newptr && newnmbclusters != nmbclusters) { if (newnmbclusters > nmbclusters && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbclusters = newnmbclusters; nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); EVENTHANDLER_INVOKE(nmbclusters_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW, &nmbclusters, 0, sysctl_nmbclusters, "IU", "Maximum number of mbuf clusters allowed"); static int sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbop; newnmbjumbop = nmbjumbop; error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) { if (newnmbjumbop > nmbjumbop && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbop = newnmbjumbop; nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbop, 0, sysctl_nmbjumbop, "IU", "Maximum number of mbuf page size jumbo clusters allowed"); static int sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo9; newnmbjumbo9 = nmbjumbo9; error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) { if (newnmbjumbo9 > nmbjumbo9 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo9 = newnmbjumbo9; nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo9, 0, sysctl_nmbjumbo9, "IU", "Maximum number of mbuf 9k jumbo clusters allowed"); static int sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS) { int error, newnmbjumbo16; newnmbjumbo16 = nmbjumbo16; error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) { if (newnmbjumbo16 > nmbjumbo16 && nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) { nmbjumbo16 = newnmbjumbo16; nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW, &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU", "Maximum number of mbuf 16k jumbo clusters allowed"); static int sysctl_nmbufs(SYSCTL_HANDLER_ARGS) { int error, newnmbufs; newnmbufs = nmbufs; error = sysctl_handle_int(oidp, &newnmbufs, 0, req); if (error == 0 && req->newptr && newnmbufs != nmbufs) { if (newnmbufs > nmbufs) { nmbufs = newnmbufs; nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); EVENTHANDLER_INVOKE(nmbufs_change); } else error = EINVAL; } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs, CTLTYPE_INT|CTLFLAG_RW, &nmbufs, 0, sysctl_nmbufs, "IU", "Maximum number of mbufs allowed"); /* * Zones from which we allocate. */ uma_zone_t zone_mbuf; uma_zone_t zone_clust; uma_zone_t zone_pack; uma_zone_t zone_jumbop; uma_zone_t zone_jumbo9; uma_zone_t zone_jumbo16; /* * Local prototypes. */ static int mb_ctor_mbuf(void *, int, void *, int); static int mb_ctor_clust(void *, int, void *, int); static int mb_ctor_pack(void *, int, void *, int); static void mb_dtor_mbuf(void *, int, void *); static void mb_dtor_pack(void *, int, void *); static int mb_zinit_pack(void *, int, int); static void mb_zfini_pack(void *, int); static void mb_reclaim(uma_zone_t, int); static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int); /* Ensure that MSIZE is a power of 2. */ CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE); /* * Initialize FreeBSD Network buffer allocation. */ static void mbuf_init(void *dummy) { /* * Configure UMA zones for Mbufs, Clusters, and Packets. */ zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif MSIZE - 1, UMA_ZONE_MAXBUCKET); if (nmbufs > 0) nmbufs = uma_zone_set_max(zone_mbuf, nmbufs); uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached"); uma_zone_set_maxaction(zone_mbuf, mb_reclaim); zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); if (nmbclusters > 0) nmbclusters = uma_zone_set_max(zone_clust, nmbclusters); uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached"); uma_zone_set_maxaction(zone_clust, mb_reclaim); zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack, mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf); /* Make jumbo frame zone too. Page size, 9k and 16k. */ zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); if (nmbjumbop > 0) nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop); uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached"); uma_zone_set_maxaction(zone_jumbop, mb_reclaim); zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc); if (nmbjumbo9 > 0) nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9); uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached"); uma_zone_set_maxaction(zone_jumbo9, mb_reclaim); zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif UMA_ALIGN_PTR, 0); uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc); if (nmbjumbo16 > 0) nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16); uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached"); uma_zone_set_maxaction(zone_jumbo16, mb_reclaim); /* * Hook event handler for low-memory situation, used to * drain protocols and push data back to the caches (UMA * later pushes it back to VM). */ EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL, EVENTHANDLER_PRI_FIRST); + + snd_tag_count = counter_u64_alloc(M_WAITOK); } SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL); #ifdef NETDUMP /* * netdump makes use of a pre-allocated pool of mbufs and clusters. When * netdump is configured, we initialize a set of UMA cache zones which return * items from this pool. At panic-time, the regular UMA zone pointers are * overwritten with those of the cache zones so that drivers may allocate and * free mbufs and clusters without attempting to allocate physical memory. * * We keep mbufs and clusters in a pair of mbuf queues. In particular, for * the purpose of caching clusters, we treat them as mbufs. */ static struct mbufq nd_mbufq = { STAILQ_HEAD_INITIALIZER(nd_mbufq.mq_head), 0, INT_MAX }; static struct mbufq nd_clustq = { STAILQ_HEAD_INITIALIZER(nd_clustq.mq_head), 0, INT_MAX }; static int nd_clsize; static uma_zone_t nd_zone_mbuf; static uma_zone_t nd_zone_clust; static uma_zone_t nd_zone_pack; static int nd_buf_import(void *arg, void **store, int count, int domain __unused, int flags) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = mbufq_dequeue(q); if (m == NULL) break; trash_init(m, q == &nd_mbufq ? MSIZE : nd_clsize, flags); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void nd_buf_release(void *arg, void **store, int count) { struct mbufq *q; struct mbuf *m; int i; q = arg; for (i = 0; i < count; i++) { m = store[i]; (void)mbufq_enqueue(q, m); } } static int nd_pack_import(void *arg __unused, void **store, int count, int domain __unused, int flags __unused) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = m_get(MT_DATA, M_NOWAIT); if (m == NULL) break; clust = uma_zalloc(nd_zone_clust, M_NOWAIT); if (clust == NULL) { m_free(m); break; } mb_ctor_clust(clust, nd_clsize, m, 0); store[i] = m; } KASSERT((flags & M_WAITOK) == 0 || i == count, ("%s: ran out of pre-allocated mbufs", __func__)); return (i); } static void nd_pack_release(void *arg __unused, void **store, int count) { struct mbuf *m; void *clust; int i; for (i = 0; i < count; i++) { m = store[i]; clust = m->m_ext.ext_buf; uma_zfree(nd_zone_clust, clust); uma_zfree(nd_zone_mbuf, m); } } /* * Free the pre-allocated mbufs and clusters reserved for netdump, and destroy * the corresponding UMA cache zones. */ void netdump_mbuf_drain(void) { struct mbuf *m; void *item; if (nd_zone_mbuf != NULL) { uma_zdestroy(nd_zone_mbuf); nd_zone_mbuf = NULL; } if (nd_zone_clust != NULL) { uma_zdestroy(nd_zone_clust); nd_zone_clust = NULL; } if (nd_zone_pack != NULL) { uma_zdestroy(nd_zone_pack); nd_zone_pack = NULL; } while ((m = mbufq_dequeue(&nd_mbufq)) != NULL) m_free(m); while ((item = mbufq_dequeue(&nd_clustq)) != NULL) uma_zfree(m_getzone(nd_clsize), item); } /* * Callback invoked immediately prior to starting a netdump. */ void netdump_mbuf_dump(void) { /* * All cluster zones return buffers of the size requested by the * drivers. It's up to the driver to reinitialize the zones if the * MTU of a netdump-enabled interface changes. */ printf("netdump: overwriting mbuf zone pointers\n"); zone_mbuf = nd_zone_mbuf; zone_clust = nd_zone_clust; zone_pack = nd_zone_pack; zone_jumbop = nd_zone_clust; zone_jumbo9 = nd_zone_clust; zone_jumbo16 = nd_zone_clust; } /* * Reinitialize the netdump mbuf+cluster pool and cache zones. */ void netdump_mbuf_reinit(int nmbuf, int nclust, int clsize) { struct mbuf *m; void *item; netdump_mbuf_drain(); nd_clsize = clsize; nd_zone_mbuf = uma_zcache_create("netdump_" MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, #ifdef INVARIANTS trash_init, trash_fini, #else NULL, NULL, #endif nd_buf_import, nd_buf_release, &nd_mbufq, UMA_ZONE_NOBUCKET); nd_zone_clust = uma_zcache_create("netdump_" MBUF_CLUSTER_MEM_NAME, clsize, mb_ctor_clust, #ifdef INVARIANTS trash_dtor, trash_init, trash_fini, #else NULL, NULL, NULL, #endif nd_buf_import, nd_buf_release, &nd_clustq, UMA_ZONE_NOBUCKET); nd_zone_pack = uma_zcache_create("netdump_" MBUF_PACKET_MEM_NAME, MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL, nd_pack_import, nd_pack_release, NULL, UMA_ZONE_NOBUCKET); while (nmbuf-- > 0) { m = m_get(MT_DATA, M_WAITOK); uma_zfree(nd_zone_mbuf, m); } while (nclust-- > 0) { item = uma_zalloc(m_getzone(nd_clsize), M_WAITOK); uma_zfree(nd_zone_clust, item); } } #endif /* NETDUMP */ /* * UMA backend page allocator for the jumbo frame zones. * * Allocates kernel virtual memory that is backed by contiguous physical * pages. */ static void * mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), bytes, wait, (vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT)); } /* * Constructor for Mbuf master zone. * * The 'arg' pointer points to a mb_args structure which * contains call-specific information required to support the * mbuf allocation API. See mbuf.h. */ static int mb_ctor_mbuf(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error; int flags; short type; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif args = (struct mb_args *)arg; type = args->type; /* * The mbuf is initialized later. The caller has the * responsibility to set up any MAC labels too. */ if (type == MT_NOINIT) return (0); m = (struct mbuf *)mem; flags = args->flags; MPASS((flags & M_NOFREE) == 0); error = m_init(m, how, type, flags); return (error); } /* * The Mbuf master zone destructor. */ static void mb_dtor_mbuf(void *mem, int size, void *arg) { struct mbuf *m; unsigned long flags; m = (struct mbuf *)mem; flags = (unsigned long)arg; KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__)); if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags)) m_tag_delete_chain(m, NULL); #ifdef INVARIANTS trash_dtor(mem, size, arg); #endif } /* * The Mbuf Packet zone destructor. */ static void mb_dtor_pack(void *mem, int size, void *arg) { struct mbuf *m; m = (struct mbuf *)mem; if ((m->m_flags & M_PKTHDR) != 0) m_tag_delete_chain(m, NULL); /* Make sure we've got a clean cluster back. */ KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__)); KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__)); KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__)); KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__)); KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__)); KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__)); KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__)); #ifdef INVARIANTS trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg); #endif /* * If there are processes blocked on zone_clust, waiting for pages * to be freed up, * cause them to be woken up by draining the * packet zone. We are exposed to a race here * (in the check for * the UMA_ZFLAG_FULL) where we might miss the flag set, but that * is deliberate. We don't want to acquire the zone lock for every * mbuf free. */ if (uma_zone_exhausted_nolock(zone_clust)) zone_drain(zone_pack); } /* * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor. * * Here the 'arg' pointer points to the Mbuf which we * are configuring cluster storage for. If 'arg' is * empty we allocate just the cluster without setting * the mbuf to it. See mbuf.h. */ static int mb_ctor_clust(void *mem, int size, void *arg, int how) { struct mbuf *m; #ifdef INVARIANTS trash_ctor(mem, size, arg, how); #endif m = (struct mbuf *)arg; if (m != NULL) { m->m_ext.ext_buf = (char *)mem; m->m_data = m->m_ext.ext_buf; m->m_flags |= M_EXT; m->m_ext.ext_free = NULL; m->m_ext.ext_arg1 = NULL; m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = m_gettype(size); m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; } return (0); } /* * The Packet secondary zone's init routine, executed on the * object's transition from mbuf keg slab to zone cache. */ static int mb_zinit_pack(void *mem, int size, int how) { struct mbuf *m; m = (struct mbuf *)mem; /* m is virgin. */ if (uma_zalloc_arg(zone_clust, m, how) == NULL || m->m_ext.ext_buf == NULL) return (ENOMEM); m->m_ext.ext_type = EXT_PACKET; /* Override. */ #ifdef INVARIANTS trash_init(m->m_ext.ext_buf, MCLBYTES, how); #endif return (0); } /* * The Packet secondary zone's fini routine, executed on the * object's transition from zone cache to keg slab. */ static void mb_zfini_pack(void *mem, int size) { struct mbuf *m; m = (struct mbuf *)mem; #ifdef INVARIANTS trash_fini(m->m_ext.ext_buf, MCLBYTES); #endif uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL); #ifdef INVARIANTS trash_dtor(mem, size, NULL); #endif } /* * The "packet" keg constructor. */ static int mb_ctor_pack(void *mem, int size, void *arg, int how) { struct mbuf *m; struct mb_args *args; int error, flags; short type; m = (struct mbuf *)mem; args = (struct mb_args *)arg; flags = args->flags; type = args->type; MPASS((flags & M_NOFREE) == 0); #ifdef INVARIANTS trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how); #endif error = m_init(m, how, type, flags); /* m_ext is already initialized. */ m->m_data = m->m_ext.ext_buf; m->m_flags = (flags | M_EXT); return (error); } /* * This is the protocol drain routine. Called by UMA whenever any of the * mbuf zones is closed to its limit. * * No locks should be held when this is called. The drain routines have to * presently acquire some locks which raises the possibility of lock order * reversal. */ static void mb_reclaim(uma_zone_t zone __unused, int pending __unused) { struct domain *dp; struct protosw *pr; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__); for (dp = domains; dp != NULL; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_drain != NULL) (*pr->pr_drain)(); } /* * Clean up after mbufs with M_EXT storage attached to them if the * reference count hits 1. */ void mb_free_ext(struct mbuf *m) { volatile u_int *refcnt; struct mbuf *mref; int freembuf; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = &m->m_ext.ext_count; mref = m; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; mref = __containerof(refcnt, struct mbuf, m_ext.ext_count); } /* * Check if the header is embedded in the cluster. It is * important that we can't touch any of the mbuf fields * after we have freed the external storage, since mbuf * could have been embedded in it. For now, the mbufs * embedded into the cluster are always of type EXT_EXTREF, * and for this type we won't free the mref. */ if (m->m_flags & M_NOFREE) { freembuf = 0; KASSERT(m->m_ext.ext_type == EXT_EXTREF || m->m_ext.ext_type == EXT_RXRING, ("%s: no-free mbuf %p has wrong type", __func__, m)); } else freembuf = 1; /* Free attached storage if this mbuf is the only reference to it. */ if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) { switch (m->m_ext.ext_type) { case EXT_PACKET: /* The packet zone is special. */ if (*refcnt == 0) *refcnt = 1; uma_zfree(zone_pack, mref); break; case EXT_CLUSTER: uma_zfree(zone_clust, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBOP: uma_zfree(zone_jumbop, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO9: uma_zfree(zone_jumbo9, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_JUMBO16: uma_zfree(zone_jumbo16, m->m_ext.ext_buf); uma_zfree(zone_mbuf, mref); break; case EXT_SFBUF: case EXT_NET_DRV: case EXT_MOD_TYPE: case EXT_DISPOSABLE: KASSERT(mref->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); mref->m_ext.ext_free(mref); uma_zfree(zone_mbuf, mref); break; case EXT_EXTREF: KASSERT(m->m_ext.ext_free != NULL, ("%s: ext_free not set", __func__)); m->m_ext.ext_free(m); break; case EXT_RXRING: KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free is set", __func__)); break; default: KASSERT(m->m_ext.ext_type == 0, ("%s: unknown ext_type", __func__)); } } if (freembuf && m != mref) uma_zfree(zone_mbuf, m); } /* * Official mbuf(9) allocation KPI for stack and drivers: * * m_get() - a single mbuf without any attachments, sys/mbuf.h. * m_gethdr() - a single mbuf initialized as M_PKTHDR, sys/mbuf.h. * m_getcl() - an mbuf + 2k cluster, sys/mbuf.h. * m_clget() - attach cluster to already allocated mbuf. * m_cljget() - attach jumbo cluster to already allocated mbuf. * m_get2() - allocate minimum mbuf that would fit size argument. * m_getm2() - allocate a chain of mbufs/clusters. * m_extadd() - attach external cluster to mbuf. * * m_free() - free single mbuf with its tags and ext, sys/mbuf.h. * m_freem() - free chain of mbufs. */ int m_clget(struct mbuf *m, int how) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = (char *)NULL; uma_zalloc_arg(zone_clust, m, how); /* * On a cluster allocation failure, drain the packet zone and retry, * we might be able to loosen a few clusters up on the drain. */ if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) { zone_drain(zone_pack); uma_zalloc_arg(zone_clust, m, how); } MBUF_PROBE2(m__clget, m, how); return (m->m_flags & M_EXT); } /* * m_cljget() is different from m_clget() as it can allocate clusters without * attaching them to an mbuf. In that case the return value is the pointer * to the cluster of the requested size. If an mbuf was specified, it gets * the cluster attached to it and the return value can be safely ignored. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ void * m_cljget(struct mbuf *m, int how, int size) { uma_zone_t zone; void *retval; if (m != NULL) { KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT", __func__, m)); m->m_ext.ext_buf = NULL; } zone = m_getzone(size); retval = uma_zalloc_arg(zone, m, how); MBUF_PROBE4(m__cljget, m, how, size, retval); return (retval); } /* * m_get2() allocates minimum mbuf that would fit "size" argument. */ struct mbuf * m_get2(int size, int how, short type, int flags) { struct mb_args args; struct mbuf *m, *n; args.flags = flags; args.type = type; if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0)) return (uma_zalloc_arg(zone_mbuf, &args, how)); if (size <= MCLBYTES) return (uma_zalloc_arg(zone_pack, &args, how)); if (size > MJUMPAGESIZE) return (NULL); m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); n = uma_zalloc_arg(zone_jumbop, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * m_getjcl() returns an mbuf with a cluster of the specified size attached. * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES. */ struct mbuf * m_getjcl(int how, short type, int flags, int size) { struct mb_args args; struct mbuf *m, *n; uma_zone_t zone; if (size == MCLBYTES) return m_getcl(how, type, flags); args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); if (m == NULL) return (NULL); zone = m_getzone(size); n = uma_zalloc_arg(zone, m, how); if (n == NULL) { uma_zfree(zone_mbuf, m); return (NULL); } return (m); } /* * Allocate a given length worth of mbufs and/or clusters (whatever fits * best) and return a pointer to the top of the allocated chain. If an * existing mbuf chain is provided, then we will append the new chain * to the existing one and return a pointer to the provided mbuf. */ struct mbuf * m_getm2(struct mbuf *m, int len, int how, short type, int flags) { struct mbuf *mb, *nm = NULL, *mtail = NULL; KASSERT(len >= 0, ("%s: len is < 0", __func__)); /* Validate flags. */ flags &= (M_PKTHDR | M_EOR); /* Packet header mbuf must be first in chain. */ if ((flags & M_PKTHDR) && m != NULL) flags &= ~M_PKTHDR; /* Loop and append maximum sized mbufs to the chain tail. */ while (len > 0) { if (len > MCLBYTES) mb = m_getjcl(how, type, (flags & M_PKTHDR), MJUMPAGESIZE); else if (len >= MINCLSIZE) mb = m_getcl(how, type, (flags & M_PKTHDR)); else if (flags & M_PKTHDR) mb = m_gethdr(how, type); else mb = m_get(how, type); /* Fail the whole operation if one mbuf can't be allocated. */ if (mb == NULL) { if (nm != NULL) m_freem(nm); return (NULL); } /* Book keeping. */ len -= M_SIZE(mb); if (mtail != NULL) mtail->m_next = mb; else nm = mb; mtail = mb; flags &= ~M_PKTHDR; /* Only valid on the first mbuf. */ } if (flags & M_EOR) mtail->m_flags |= M_EOR; /* Only valid on the last mbuf. */ /* If mbuf was supplied, append new chain to the end of it. */ if (m != NULL) { for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next) ; mtail->m_next = nm; mtail->m_flags &= ~M_EOR; } else m = nm; return (m); } /*- * Configure a provided mbuf to refer to the provided external storage * buffer and setup a reference count for said buffer. * * Arguments: * mb The existing mbuf to which to attach the provided buffer. * buf The address of the provided external storage buffer. * size The size of the provided buffer. * freef A pointer to a routine that is responsible for freeing the * provided external storage buffer. * args A pointer to an argument structure (of any type) to be passed * to the provided freef routine (may be NULL). * flags Any other flags to be passed to the provided mbuf. * type The type that the external storage buffer should be * labeled with. * * Returns: * Nothing. */ void m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef, void *arg1, void *arg2, int flags, int type) { KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__)); mb->m_flags |= (M_EXT | flags); mb->m_ext.ext_buf = buf; mb->m_data = mb->m_ext.ext_buf; mb->m_ext.ext_size = size; mb->m_ext.ext_free = freef; mb->m_ext.ext_arg1 = arg1; mb->m_ext.ext_arg2 = arg2; mb->m_ext.ext_type = type; if (type != EXT_EXTREF) { mb->m_ext.ext_count = 1; mb->m_ext.ext_flags = EXT_FLAG_EMBREF; } else mb->m_ext.ext_flags = 0; } /* * Free an entire chain of mbufs and associated external buffers, if * applicable. */ void m_freem(struct mbuf *mb) { MBUF_PROBE1(m__freem, mb); while (mb != NULL) mb = m_free(mb); +} + +void +m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp) +{ + + if_ref(ifp); + mst->ifp = ifp; + refcount_init(&mst->refcount, 1); + counter_u64_add(snd_tag_count, 1); +} + +void +m_snd_tag_destroy(struct m_snd_tag *mst) +{ + struct ifnet *ifp; + + ifp = mst->ifp; + ifp->if_snd_tag_free(mst); + if_rele(ifp); + counter_u64_add(snd_tag_count, -1); } Index: head/sys/kern/uipc_mbuf.c =================================================================== --- head/sys/kern/uipc_mbuf.c (revision 348253) +++ head/sys/kern/uipc_mbuf.c (revision 348254) @@ -1,1875 +1,1891 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_mbuf_stress_test.h" #include "opt_mbuf_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t", "void*", "void*"); SDT_PROBE_DEFINE(sdt, , , m__cljset); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, "struct mbuf *", "mbufinfo_t *"); #include int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; #ifdef MBUF_STRESS_TEST int m_defragpackets; int m_defragbytes; int m_defraguseless; int m_defragfailure; int m_defragrandomfailures; #endif /* * sysctl(8) exported objects */ SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, &max_linkhdr, 0, "Size of largest link layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, &max_protohdr, 0, "Size of largest protocol layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, &max_hdr, 0, "Size of largest link plus protocol header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD, &max_datalen, 0, "Minimum space left in mbuf after max_hdr"); #ifdef MBUF_STRESS_TEST SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); #endif /* * Ensure the correct size of various mbuf parameters. It could be off due * to compiler-induced padding and alignment artifacts. */ CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); /* * mbuf data storage should be 64-bit aligned regardless of architectural * pointer size; check this is the case with and without a packet header. */ CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); /* * While the specific values here don't matter too much (i.e., +/- a few * words), we do want to ensure that changes to these values are carefully * reasoned about and properly documented. This is especially the case as * network-protocol and device-driver modules encode these layouts, and must * be recompiled if the structures change. Check these values at compile time * against the ones documented in comments in mbuf.h. * * NB: Possibly they should be documented there via #define's and not just * comments. */ #if defined(__LP64__) CTASSERT(offsetof(struct mbuf, m_dat) == 32); CTASSERT(sizeof(struct pkthdr) == 56); CTASSERT(sizeof(struct m_ext) == 48); #else CTASSERT(offsetof(struct mbuf, m_dat) == 24); CTASSERT(sizeof(struct pkthdr) == 48); CTASSERT(sizeof(struct m_ext) == 28); #endif /* * Assert that the queue(3) macros produce code of the same size as an old * plain pointer does. */ #ifdef INVARIANTS static struct mbuf __used m_assertbuf; CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); #endif /* * Attach the cluster from *m to *n, set up m_ext in *n * and bump the refcount of the cluster. */ void mb_dupcl(struct mbuf *n, struct mbuf *m) { volatile u_int *refcnt; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n)); /* * Cache access optimization. For most kinds of external * storage we don't need full copy of m_ext, since the * holder of the 'ext_count' is responsible to carry the * free routine and its arguments. Exclusion is EXT_EXTREF, * where 'ext_cnt' doesn't point into mbuf at all. */ if (m->m_ext.ext_type == EXT_EXTREF) bcopy(&m->m_ext, &n->m_ext, sizeof(struct m_ext)); else bcopy(&m->m_ext, &n->m_ext, m_ext_copylen); n->m_flags |= M_EXT; n->m_flags |= m->m_flags & M_RDONLY; /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; } if (*refcnt == 1) *refcnt += 1; else atomic_add_int(refcnt, 1); } void m_demote_pkthdr(struct mbuf *m) { M_ASSERTPKTHDR(m); m_tag_delete_chain(m, NULL); m->m_flags &= ~M_PKTHDR; bzero(&m->m_pkthdr, sizeof(struct pkthdr)); } /* * Clean up mbuf (chain) from any tags and packet headers. * If "all" is set then the first mbuf in the chain will be * cleaned too. */ void m_demote(struct mbuf *m0, int all, int flags) { struct mbuf *m; for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p", __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags); } } /* * Sanity checks on mbuf (chain) for use in KASSERT() and general * debugging. * Returns 0 or panics when bad and 1 on all tests passed. * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they * blow up later. */ int m_sanity(struct mbuf *m0, int sanitize) { struct mbuf *m; caddr_t a, b; int pktlen = 0; #ifdef INVARIANTS #define M_SANITY_ACTION(s) panic("mbuf %p: " s, m) #else #define M_SANITY_ACTION(s) printf("mbuf %p: " s, m) #endif for (m = m0; m != NULL; m = m->m_next) { /* * Basic pointer checks. If any of these fails then some * unrelated kernel memory before or after us is trashed. * No way to recover from that. */ a = M_START(m); b = a + M_SIZE(m); if ((caddr_t)m->m_data < a) M_SANITY_ACTION("m_data outside mbuf data range left"); if ((caddr_t)m->m_data > b) M_SANITY_ACTION("m_data outside mbuf data range right"); if ((caddr_t)m->m_data + m->m_len > b) M_SANITY_ACTION("m_data + m_len exeeds mbuf space"); /* m->m_nextpkt may only be set on first mbuf in chain. */ if (m != m0 && m->m_nextpkt != NULL) { if (sanitize) { m_freem(m->m_nextpkt); m->m_nextpkt = (struct mbuf *)0xDEADC0DE; } else M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf"); } /* packet length (not mbuf length!) calculation */ if (m0->m_flags & M_PKTHDR) pktlen += m->m_len; /* m_tags may only be attached to first mbuf in chain. */ if (m != m0 && m->m_flags & M_PKTHDR && !SLIST_EMPTY(&m->m_pkthdr.tags)) { if (sanitize) { m_tag_delete_chain(m, NULL); /* put in 0xDEADC0DE perhaps? */ } else M_SANITY_ACTION("m_tags on in-chain mbuf"); } /* M_PKTHDR may only be set on first mbuf in chain */ if (m != m0 && m->m_flags & M_PKTHDR) { if (sanitize) { bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); m->m_flags &= ~M_PKTHDR; /* put in 0xDEADCODE and leave hdr flag in */ } else M_SANITY_ACTION("M_PKTHDR on in-chain mbuf"); } } m = m0; if (pktlen && pktlen != m->m_pkthdr.len) { if (sanitize) m->m_pkthdr.len = 0; else M_SANITY_ACTION("m_pkthdr.len != mbuf chain length"); } return 1; #undef M_SANITY_ACTION } /* * Non-inlined part of m_init(). */ int m_pkthdr_init(struct mbuf *m, int how) { #ifdef MAC int error; #endif m->m_data = m->m_pktdat; bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); #ifdef NUMA m->m_pkthdr.numa_domain = M_NODOM; #endif #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); if (error) return (error); #endif return (0); } /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. */ void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 /* see below for why these are not enabled */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_move_pkthdr: to has tags")); #endif #ifdef MAC /* * XXXMAC: It could be this should also occur for non-MAC? */ if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; + if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) { + from->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; + from->m_pkthdr.snd_tag = NULL; + } } /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. * In particular, this does a deep copy of the packet tags. */ int m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) { #if 0 /* * The mbuf allocator only initializes the pkthdr * when the mbuf is allocated with m_gethdr(). Many users * (e.g. m_copy*, m_prepend) use m_get() and then * smash the pkthdr as needed causing these * assertions to trip. For now just disable them. */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); #endif MBUF_CHECKSLEEP(how); #ifdef MAC if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; + if (from->m_pkthdr.csum_flags & CSUM_SND_TAG) + m_snd_tag_ref(from->m_pkthdr.snd_tag); SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; if (m->m_flags & M_PKTHDR) mn = m_gethdr(how, m->m_type); else mn = m_get(how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) m_move_pkthdr(mn, m); mn->m_next = m; m = mn; if (len < M_SIZE(m)) M_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. */ struct mbuf * m_copym(struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); MBUF_CHECKSLEEP(wait); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = NULL; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } if (copyhdr) n = m_gethdr(wait, m->m_type); else n = m_get(wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, wait)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. */ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MBUF_CHECKSLEEP(how); n = m_get(how, m->m_type); top = n; if (n == NULL) goto nospace; if (!m_dup_pkthdr(n, m, how)) goto nospace; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { o = m_get(how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); return (NULL); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(const struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; MBUF_CHECKSLEEP(how); /* Sanity check */ if (m == NULL) return (NULL); M_ASSERTPKTHDR(m); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ if (remain >= MINCLSIZE) { n = m_getcl(how, m->m_type, 0); nsize = MCLBYTES; } else { n = m_get(how, m->m_type); nsize = MLEN; } if (n == NULL) goto nospace; if (top == NULL) { /* First one, must be PKTHDR */ if (!m_dup_pkthdr(n, m, how)) { m_free(n); goto nospace; } if ((n->m_flags & M_EXT) == 0) nsize = MHLEN; n->m_flags &= ~M_RDONLY; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. */ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (!M_WRITABLE(m) || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } /* * Concatenate two pkthdr mbuf chains. */ void m_catpkt(struct mbuf *m, struct mbuf *n) { M_ASSERTPKTHDR(m); M_ASSERTPKTHDR(n); m->m_pkthdr.len += n->m_pkthdr.len; m_demote(n, 1, 0); m_cat(m, n); } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (m->m_next != NULL) { m_freem(m->m_next); m->m_next = NULL; } break; } count -= m->m_len; } } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod will work * for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; int space; /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Like m_pullup(), except a new mbuf is always allocated, and we allow * the amount of empty space before the data in the new mbuf to be specified * (in the event that the caller expects to prepend later). */ struct mbuf * m_copyup(struct mbuf *n, int len, int dstoff) { struct mbuf *m; int count, space; if (len > (MHLEN - dstoff)) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); m->m_data += dstoff; space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. */ struct mbuf * m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; u_int len = len0, remain; MBUF_CHECKSLEEP(wait); for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR && remain == 0) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_next = m->m_next; m->m_next = NULL; - n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { + n->m_pkthdr.snd_tag = + m_snd_tag_ref(m0->m_pkthdr.snd_tag); + n->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } else + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; return (n); } else if (m0->m_flags & M_PKTHDR) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); - n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + if (m0->m_pkthdr.csum_flags & CSUM_SND_TAG) { + n->m_pkthdr.snd_tag = + m_snd_tag_ref(m0->m_pkthdr.snd_tag); + n->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } else + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ M_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void) m_free(n); return (NULL); } else { n->m_len = 0; return (n); } } else M_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return (n); } else { n = m_get(wait, m->m_type); if (n == NULL) return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_data = m->m_data + len; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return (n); } /* * Routine to copy from device local memory into mbufs. * Note that `off' argument is offset into first mbuf of target chain from * which to begin copying the data to. */ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; struct mbuf *top = NULL, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); while (totlen > 0) { if (top == NULL) { /* First one, must be PKTHDR */ if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); len = MCLBYTES; } else { m = m_gethdr(M_NOWAIT, MT_DATA); len = MHLEN; /* Place initial small packet/header at end of mbuf */ if (m && totlen + off + max_linkhdr <= MHLEN) { m->m_data += max_linkhdr; len -= max_linkhdr; } } if (m == NULL) return NULL; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; } else { if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, 0); len = MCLBYTES; } else { m = m_get(M_NOWAIT, MT_DATA); len = MLEN; } if (m == NULL) { m_freem(top); return NULL; } } if (off) { m->m_data += off; len -= off; off = 0; } m->m_len = len = min(totlen, len); if (copy) copy(buf, mtod(m, caddr_t), (u_int)len); else bcopy(buf, mtod(m, caddr_t), (u_int)len); buf += len; *mp = m; mp = &m->m_next; totlen -= len; } return (top); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. */ void m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) goto out; bzero(mtod(n, caddr_t), MLEN); n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { if (m->m_next == NULL && (len > m->m_len - off)) { m->m_len += min(len - (m->m_len - off), M_TRAILINGSPACE(m)); } mlen = min (m->m_len - off, len); bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * Return 1 if able to complete the job; otherwise 0. */ int m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space, remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len, remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } /* * Apply function f to the data in an mbuf chain starting "off" bytes from * the beginning, continuing for "len" bytes. */ int m_apply(struct mbuf *m, int off, int len, int (*f)(void *, void *, u_int), void *arg) { u_int count; int rval; KASSERT(off >= 0, ("m_apply, negative off %d", off)); KASSERT(len >= 0, ("m_apply, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); count = min(m->m_len - off, len); rval = (*f)(arg, mtod(m, caddr_t) + off, count); if (rval) return (rval); len -= count; off = 0; m = m->m_next; } return (0); } /* * Return a pointer to mbuf/offset of location in mbuf chain. */ struct mbuf * m_getptr(struct mbuf *m, int loc, int *off) { while (loc >= 0) { /* Normal end of search. */ if (m->m_len > loc) { *off = loc; return (m); } else { loc -= m->m_len; if (m->m_next == NULL) { if (loc == 0) { /* Point at the end of valid data. */ *off = m->m_len; return (m); } return (NULL); } m = m->m_next; } } return (NULL); } void m_print(const struct mbuf *m, int maxlen) { int len; int pdata; const struct mbuf *m2; if (m == NULL) { printf("mbuf: %p\n", m); return; } if (m->m_flags & M_PKTHDR) len = m->m_pkthdr.len; else len = -1; m2 = m; while (m2 != NULL && (len == -1 || len)) { pdata = m2->m_len; if (maxlen != -1 && pdata > maxlen) pdata = maxlen; printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); if (pdata) printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); if (len != -1) len -= m2->m_len; m2 = m2->m_next; } if (len > 0) printf("%d bytes unaccounted for.\n", len); return; } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } /* * Defragment a mbuf chain, returning the shortest possible * chain of mbufs and clusters. If allocation fails and * this cannot be completed, NULL will be returned, but * the passed in chain will be unchanged. Upon success, * the original chain will be freed, and the new chain * will be returned. * * If a non-packet header is passed in, the original * mbuf (chain?) will be returned unharmed. */ struct mbuf * m_defrag(struct mbuf *m0, int how) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0, length; MBUF_CHECKSLEEP(how); if (!(m0->m_flags & M_PKTHDR)) return (m0); m_fixhdr(m0); /* Needed sanity check */ #ifdef MBUF_STRESS_TEST if (m_defragrandomfailures) { int temp = arc4random() & 0xff; if (temp == 0xba) goto nospace; } #endif if (m0->m_pkthdr.len > MHLEN) m_final = m_getcl(how, MT_DATA, M_PKTHDR); else m_final = m_gethdr(how, MT_DATA); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; while (progress < m0->m_pkthdr.len) { length = m0->m_pkthdr.len - progress; if (length > MCLBYTES) length = MCLBYTES; if (m_new == NULL) { if (length > MLEN) m_new = m_getcl(how, MT_DATA, 0); else m_new = m_get(how, MT_DATA); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, length, mtod(m_new, caddr_t)); progress += length; m_new->m_len = length; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } #ifdef MBUF_STRESS_TEST if (m0->m_next == NULL) m_defraguseless++; #endif m_freem(m0); m0 = m_final; #ifdef MBUF_STRESS_TEST m_defragpackets++; m_defragbytes += m0->m_pkthdr.len; #endif return (m0); nospace: #ifdef MBUF_STRESS_TEST m_defragfailure++; #endif if (m_final) m_freem(m_final); return (NULL); } /* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible NULL is returned and * the original mbuf chain is left in its present (potentially * modified) state. We use two techniques: collapsing consecutive * mbufs and replacing consecutive mbufs by a cluster. * * NB: this should really be named m_defrag but that name is taken */ struct mbuf * m_collapse(struct mbuf *m0, int how, int maxfrags) { struct mbuf *m, *n, *n2, **prev; u_int curfrags; /* * Calculate the current number of frags. */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) curfrags++; /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the * pkthdr. This may be suboptimal if the first mbuf has much * less data than the following. */ m = m0; again: for (;;) { n = m->m_next; if (n == NULL) break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, n->m_len); m->m_len += n->m_len; m->m_next = n->m_next; m_free(n); if (--curfrags <= maxfrags) return m0; } else m = n; } KASSERT(maxfrags > 1, ("maxfrags %u, but normal collapse failed", maxfrags)); /* * Collapse consecutive mbufs to a cluster. */ prev = &m0->m_next; /* NB: not the first mbuf */ while ((n = *prev) != NULL) { if ((n2 = n->m_next) != NULL && n->m_len + n2->m_len < MCLBYTES) { m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; bcopy(mtod(n, void *), mtod(m, void *), n->m_len); bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, n2->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; m_free(n); m_free(n2); if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ return m0; /* * Still not there, try the normal collapse * again before we allocate another cluster. */ goto again; } prev = &n->m_next; } /* * No place where we can collapse to a cluster; punt. * This can occur if, for example, you request 2 frags * but the packet requires that both be clusters (we * never reallocate the first mbuf to avoid moving the * packet header). */ bad: return NULL; } #ifdef MBUF_STRESS_TEST /* * Fragment an mbuf chain. There's no reason you'd ever want to do * this in normal usage, but it's great for stress testing various * mbuf consumers. * * If fragmentation is not possible, the original chain will be * returned. * * Possible length values: * 0 no fragmentation will occur * > 0 each fragment will be of the specified length * -1 each fragment will be the same random value in length * -2 each fragment's length will be entirely random * (Random values range from 1 to 256) */ struct mbuf * m_fragment(struct mbuf *m0, int how, int length) { struct mbuf *m_first, *m_last; int divisor = 255, progress = 0, fraglen; if (!(m0->m_flags & M_PKTHDR)) return (m0); if (length == 0 || length < -2) return (m0); if (length > MCLBYTES) length = MCLBYTES; if (length < 0 && divisor > MCLBYTES) divisor = MCLBYTES; if (length == -1) length = 1 + (arc4random() % divisor); if (length > 0) fraglen = length; m_fixhdr(m0); /* Needed sanity check */ m_first = m_getcl(how, MT_DATA, M_PKTHDR); if (m_first == NULL) goto nospace; if (m_dup_pkthdr(m_first, m0, how) == 0) goto nospace; m_last = m_first; while (progress < m0->m_pkthdr.len) { if (length == -2) fraglen = 1 + (arc4random() % divisor); if (fraglen > m0->m_pkthdr.len - progress) fraglen = m0->m_pkthdr.len - progress; if (progress != 0) { struct mbuf *m_new = m_getcl(how, MT_DATA, 0); if (m_new == NULL) goto nospace; m_last->m_next = m_new; m_last = m_new; } m_copydata(m0, progress, fraglen, mtod(m_last, caddr_t)); progress += fraglen; m_last->m_len = fraglen; } m_freem(m0); m0 = m_first; return (m0); nospace: if (m_first) m_freem(m_first); /* Return the original chain on failure */ return (m0); } #endif /* * Copy the contents of uio into a properly sized mbuf chain. */ struct mbuf * m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) { struct mbuf *m, *mb; int error, length; ssize_t total; int progress = 0; /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. */ if (len > 0) total = (uio->uio_resid < len) ? uio->uio_resid : len; else total = uio->uio_resid; /* * The smallest unit returned by m_getm2() is a single mbuf * with pkthdr. We can't align past it. */ if (align >= MHLEN) return (NULL); /* * Give us the full allocation or nothing. * If len is zero return the smallest empty mbuf. */ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags); if (m == NULL) return (NULL); m->m_data += align; /* Fill all mbufs with uio data and update header information. */ for (mb = m; mb != NULL; mb = mb->m_next) { length = min(M_TRAILINGSPACE(mb), total - progress); error = uiomove(mtod(mb, void *), length, uio); if (error) { m_freem(m); return (NULL); } mb->m_len = length; progress += length; if (flags & M_PKTHDR) m->m_pkthdr.len += length; } KASSERT(progress == total, ("%s: progress != total", __func__)); return (m); } /* * Copy an mbuf chain into a uio limited by len if set. */ int m_mbuftouio(struct uio *uio, const struct mbuf *m, int len) { int error, length, total; int progress = 0; if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* Fill the uio with data from the mbufs. */ for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); error = uiomove(mtod(m, void *), length, uio); if (error) return (error); progress += length; } return (0); } /* * Create a writable copy of the mbuf chain. While doing this * we compact the chain with a goal of producing a chain with * at most two mbufs. The second mbuf in this chain is likely * to be a cluster. The primary purpose of this work is to create * a writable packet for encryption, compression, etc. The * secondary goal is to linearize the data so the data can be * passed to crypto hardware in the most efficient manner possible. */ struct mbuf * m_unshare(struct mbuf *m0, int how) { struct mbuf *m, *mprev; struct mbuf *n, *mfirst, *mlast; int len, off; mprev = NULL; for (m = m0; m != NULL; m = mprev->m_next) { /* * Regular mbufs are ignored unless there's a cluster * in front of it that we can use to coalesce. We do * the latter mainly so later clusters can be coalesced * also w/o having to handle them specially (i.e. convert * mbuf+cluster -> cluster). This optimization is heavily * influenced by the assumption that we're running over * Ethernet where MCLBYTES is large enough that the max * packet size will permit lots of coalescing into a * single cluster. This in turn permits efficient * crypto operations, especially when using hardware. */ if ((m->m_flags & M_EXT) == 0) { if (mprev && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ } else { mprev = m; } continue; } /* * Writable mbufs are left alone (for now). */ if (M_WRITABLE(m)) { mprev = m; continue; } /* * Not writable, replace with a copy or coalesce with * the previous mbuf if possible (since we have to copy * it anyway, we try to reduce the number of mbufs and * clusters so that future work is easier). */ KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); /* NB: we only coalesce into a cluster or larger */ if (mprev != NULL && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ continue; } /* * Allocate new space to hold the copy and copy the data. * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by * splitting them into clusters. We could just malloc a * buffer and make it external but too many device drivers * don't know how to break up the non-contiguous memory when * doing DMA. */ n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(m0); return (NULL); } if (m->m_flags & M_PKTHDR) { KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR", __func__, m0, m)); m_move_pkthdr(n, m); } len = m->m_len; off = 0; mfirst = n; mlast = NULL; for (;;) { int cc = min(len, MCLBYTES); memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); n->m_len = cc; if (mlast != NULL) mlast->m_next = n; mlast = n; #if 0 newipsecstat.ips_clcopied++; #endif len -= cc; if (len <= 0) break; off += cc; n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(mfirst); m_freem(m0); return (NULL); } } n->m_next = m->m_next; if (mprev == NULL) m0 = mfirst; /* new head of chain */ else mprev->m_next = mfirst; /* replace old mbuf */ m_free(m); /* release old mbuf */ mprev = mfirst; } return (m0); } #ifdef MBUF_PROFILING #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/ struct mbufprofile { uintmax_t wasted[MP_BUCKETS]; uintmax_t used[MP_BUCKETS]; uintmax_t segments[MP_BUCKETS]; } mbprof; #define MP_MAXDIGITS 21 /* strlen("16,000,000,000,000,000,000") == 21 */ #define MP_NUMLINES 6 #define MP_NUMSPERLINE 16 #define MP_EXTRABYTES 64 /* > strlen("used:\nwasted:\nsegments:\n") */ /* work out max space needed and add a bit of spare space too */ #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE) #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES) char mbprofbuf[MP_BUFSIZE]; void m_profile(struct mbuf *m) { int segments = 0; int used = 0; int wasted = 0; while (m) { segments++; used += m->m_len; if (m->m_flags & M_EXT) { wasted += MHLEN - sizeof(m->m_ext) + m->m_ext.ext_size - m->m_len; } else { if (m->m_flags & M_PKTHDR) wasted += MHLEN - m->m_len; else wasted += MLEN - m->m_len; } m = m->m_next; } /* be paranoid.. it helps */ if (segments > MP_BUCKETS - 1) segments = MP_BUCKETS - 1; if (used > 100000) used = 100000; if (wasted > 100000) wasted = 100000; /* store in the appropriate bucket */ /* don't bother locking. if it's slightly off, so what? */ mbprof.segments[segments]++; mbprof.used[fls(used)]++; mbprof.wasted[fls(wasted)]++; } static void mbprof_textify(void) { int offset; char *c; uint64_t *p; p = &mbprof.wasted[0]; c = mbprofbuf; offset = snprintf(c, MP_MAXLINE + 10, "wasted:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.wasted[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.used[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "used:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.used[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.segments[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "segments:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.segments[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %jju", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif } static int mbprof_handler(SYSCTL_HANDLER_ARGS) { int error; mbprof_textify(); error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1); return (error); } static int mbprof_clr_handler(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(&mbprof, sizeof(mbprof)); } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, mbprof_handler, "A", "mbuf profiling statistics"); SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics"); #endif Index: head/sys/net/bpf.c =================================================================== --- head/sys/net/bpf.c (revision 348253) +++ head/sys/net/bpf.c (revision 348254) @@ -1,3085 +1,3085 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2019 Andrey V. Elsukov * * This code is derived from the Stanford/CMU enet packet filter, * (net/enet.c) distributed as part of 4.3BSD, and code contributed * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence * Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)bpf.c 8.4 (Berkeley) 1/9/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bpf.h" #include "opt_ddb.h" #include "opt_netgraph.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #ifdef BPF_JITTER #include #endif #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_BPF, "BPF", "BPF data"); static struct bpf_if_ext dead_bpf_if = { .bif_dlist = CK_LIST_HEAD_INITIALIZER() }; struct bpf_if { #define bif_next bif_ext.bif_next #define bif_dlist bif_ext.bif_dlist struct bpf_if_ext bif_ext; /* public members */ u_int bif_dlt; /* link layer type */ u_int bif_hdrlen; /* length of link header */ struct bpfd_list bif_wlist; /* writer-only list */ struct ifnet *bif_ifp; /* corresponding interface */ struct bpf_if **bif_bpf; /* Pointer to pointer to us */ volatile u_int bif_refcnt; struct epoch_context epoch_ctx; }; CTASSERT(offsetof(struct bpf_if, bif_ext) == 0); struct bpf_program_buffer { struct epoch_context epoch_ctx; #ifdef BPF_JITTER bpf_jit_filter *func; #endif void *buffer[0]; }; #if defined(DEV_BPF) || defined(NETGRAPH_BPF) #define PRINET 26 /* interruptible */ #define SIZEOF_BPF_HDR(type) \ (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen)) #ifdef COMPAT_FREEBSD32 #include #include #define BPF_ALIGNMENT32 sizeof(int32_t) #define BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32) #ifndef BURN_BRIDGES /* * 32-bit version of structure prepended to each packet. We use this header * instead of the standard one for 32-bit streams. We mark the a stream as * 32-bit the first time we see a 32-bit compat ioctl request. */ struct bpf_hdr32 { struct timeval32 bh_tstamp; /* time stamp */ uint32_t bh_caplen; /* length of captured portion */ uint32_t bh_datalen; /* original length of packet */ uint16_t bh_hdrlen; /* length of bpf header (this struct plus alignment padding) */ }; #endif struct bpf_program32 { u_int bf_len; uint32_t bf_insns; }; struct bpf_dltlist32 { u_int bfl_len; u_int bfl_list; }; #define BIOCSETF32 _IOW('B', 103, struct bpf_program32) #define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32) #define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32) #define BIOCGDLTLIST32 _IOWR('B', 121, struct bpf_dltlist32) #define BIOCSETWF32 _IOW('B', 123, struct bpf_program32) #define BIOCSETFNR32 _IOW('B', 130, struct bpf_program32) #endif #define BPF_LOCK() sx_xlock(&bpf_sx) #define BPF_UNLOCK() sx_xunlock(&bpf_sx) #define BPF_LOCK_ASSERT() sx_assert(&bpf_sx, SA_XLOCKED) /* * bpf_iflist is a list of BPF interface structures, each corresponding to a * specific DLT. The same network interface might have several BPF interface * structures registered by different layers in the stack (i.e., 802.11 * frames, ethernet frames, etc). */ CK_LIST_HEAD(bpf_iflist, bpf_if); static struct bpf_iflist bpf_iflist; static struct sx bpf_sx; /* bpf global lock */ static int bpf_bpfd_cnt; static void bpfif_ref(struct bpf_if *); static void bpfif_rele(struct bpf_if *); static void bpfd_ref(struct bpf_d *); static void bpfd_rele(struct bpf_d *); static void bpf_attachd(struct bpf_d *, struct bpf_if *); static void bpf_detachd(struct bpf_d *); static void bpf_detachd_locked(struct bpf_d *, bool); static void bpfd_free(epoch_context_t); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *); static void reset_d(struct bpf_d *); static int bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd); static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); static int bpf_setdlt(struct bpf_d *, u_int); static void filt_bpfdetach(struct knote *); static int filt_bpfread(struct knote *, long); static void bpf_drvinit(void *); static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl"); int bpf_maxinsns = BPF_MAXINSNS; SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW, &bpf_maxinsns, 0, "Maximum bpf program instructions"); static int bpf_zerocopy_enable = 0; SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW, &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions"); static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW, bpf_stats_sysctl, "bpf statistics portal"); VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0; #define V_bpf_optimize_writers VNET(bpf_optimize_writers) SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(bpf_optimize_writers), 0, "Do not send packets until BPF program is set"); static d_open_t bpfopen; static d_read_t bpfread; static d_write_t bpfwrite; static d_ioctl_t bpfioctl; static d_poll_t bpfpoll; static d_kqfilter_t bpfkqfilter; static struct cdevsw bpf_cdevsw = { .d_version = D_VERSION, .d_open = bpfopen, .d_read = bpfread, .d_write = bpfwrite, .d_ioctl = bpfioctl, .d_poll = bpfpoll, .d_name = "bpf", .d_kqfilter = bpfkqfilter, }; static struct filterops bpfread_filtops = { .f_isfd = 1, .f_detach = filt_bpfdetach, .f_event = filt_bpfread, }; /* * LOCKING MODEL USED BY BPF * * Locks: * 1) global lock (BPF_LOCK). Sx, used to protect some global counters, * every bpf_iflist changes, serializes ioctl access to bpf descriptors. * 2) Descriptor lock. Mutex, used to protect BPF buffers and various * structure fields used by bpf_*tap* code. * * Lock order: global lock, then descriptor lock. * * There are several possible consumers: * * 1. The kernel registers interface pointer with bpfattach(). * Each call allocates new bpf_if structure, references ifnet pointer * and links bpf_if into bpf_iflist chain. This is protected with global * lock. * * 2. An userland application uses ioctl() call to bpf_d descriptor. * All such call are serialized with global lock. BPF filters can be * changed, but pointer to old filter will be freed using epoch_call(). * Thus it should be safe for bpf_tap/bpf_mtap* code to do access to * filter pointers, even if change will happen during bpf_tap execution. * Destroying of bpf_d descriptor also is doing using epoch_call(). * * 3. An userland application can write packets into bpf_d descriptor. * There we need to be sure, that ifnet won't disappear during bpfwrite(). * * 4. The kernel invokes bpf_tap/bpf_mtap* functions. The access to * bif_dlist is protected with net_epoch_preempt section. So, it should * be safe to make access to bpf_d descriptor inside the section. * * 5. The kernel invokes bpfdetach() on interface destroying. All lists * are modified with global lock held and actual free() is done using * epoch_call(). */ static void bpfif_free(epoch_context_t ctx) { struct bpf_if *bp; bp = __containerof(ctx, struct bpf_if, epoch_ctx); if_rele(bp->bif_ifp); free(bp, M_BPF); } static void bpfif_ref(struct bpf_if *bp) { refcount_acquire(&bp->bif_refcnt); } static void bpfif_rele(struct bpf_if *bp) { if (!refcount_release(&bp->bif_refcnt)) return; epoch_call(net_epoch_preempt, &bp->epoch_ctx, bpfif_free); } static void bpfd_ref(struct bpf_d *d) { refcount_acquire(&d->bd_refcnt); } static void bpfd_rele(struct bpf_d *d) { if (!refcount_release(&d->bd_refcnt)) return; epoch_call(net_epoch_preempt, &d->epoch_ctx, bpfd_free); } static struct bpf_program_buffer* bpf_program_buffer_alloc(size_t size, int flags) { return (malloc(sizeof(struct bpf_program_buffer) + size, M_BPF, flags)); } static void bpf_program_buffer_free(epoch_context_t ctx) { struct bpf_program_buffer *ptr; ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx); #ifdef BPF_JITTER if (ptr->func != NULL) bpf_destroy_jit_filter(ptr->func); #endif free(ptr, M_BPF); } /* * Wrapper functions for various buffering methods. If the set of buffer * modes expands, we will probably want to introduce a switch data structure * similar to protosw, et. */ static void bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_bytes(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_bytes(d, buf, offset, src, len)); default: panic("bpf_buf_append_bytes"); } } static void bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src, u_int len) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_append_mbuf(d, buf, offset, src, len)); case BPF_BUFMODE_ZBUF: counter_u64_add(d->bd_zcopy, 1); return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len)); default: panic("bpf_buf_append_mbuf"); } } /* * This function gets called when the free buffer is re-assigned. */ static void bpf_buf_reclaimed(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return; case BPF_BUFMODE_ZBUF: bpf_zerocopy_buf_reclaimed(d); return; default: panic("bpf_buf_reclaimed"); } } /* * If the buffer mechanism has a way to decide that a held buffer can be made * free, then it is exposed via the bpf_canfreebuf() interface. (1) is * returned if the buffer can be discarded, (0) is returned if it cannot. */ static int bpf_canfreebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canfreebuf(d)); } return (0); } /* * Allow the buffer model to indicate that the current store buffer is * immutable, regardless of the appearance of space. Return (1) if the * buffer is writable, and (0) if not. */ static int bpf_canwritebuf(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_canwritebuf(d)); } return (1); } /* * Notify buffer model that an attempt to write to the store buffer has * resulted in a dropped packet, in which case the buffer may be considered * full. */ static void bpf_buffull(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_buffull(d); break; } } /* * Notify the buffer model that a buffer has moved into the hold position. */ void bpf_bufheld(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); switch (d->bd_bufmode) { case BPF_BUFMODE_ZBUF: bpf_zerocopy_bufheld(d); break; } } static void bpf_free(struct bpf_d *d) { switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: return (bpf_buffer_free(d)); case BPF_BUFMODE_ZBUF: return (bpf_zerocopy_free(d)); default: panic("bpf_buf_free"); } } static int bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_uiomove(d, buf, len, uio)); } static int bpf_ioctl_sblen(struct bpf_d *d, u_int *i) { if (d->bd_bufmode != BPF_BUFMODE_BUFFER) return (EOPNOTSUPP); return (bpf_buffer_ioctl_sblen(d, i)); } static int bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_getzmax(td, d, i)); } static int bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz)); } static int bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz) { if (d->bd_bufmode != BPF_BUFMODE_ZBUF) return (EOPNOTSUPP); return (bpf_zerocopy_ioctl_setzbuf(td, d, bz)); } /* * General BPF functions. */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, struct sockaddr *sockp, int *hdrlen, struct bpf_d *d) { const struct ieee80211_bpf_params *p; struct ether_header *eh; struct mbuf *m; int error; int len; int hlen; int slen; /* * Build a sockaddr based on the data link layer type. * We do this at this level because the ethernet header * is copied directly into the data field of the sockaddr. * In the case of SLIP, there is no header and the packet * is forwarded as is. * Also, we are careful to leave room at the front of the mbuf * for the link level header. */ switch (linktype) { case DLT_SLIP: sockp->sa_family = AF_INET; hlen = 0; break; case DLT_EN10MB: sockp->sa_family = AF_UNSPEC; /* XXX Would MAXLINKHDR be better? */ hlen = ETHER_HDR_LEN; break; case DLT_FDDI: sockp->sa_family = AF_IMPLINK; hlen = 0; break; case DLT_RAW: sockp->sa_family = AF_UNSPEC; hlen = 0; break; case DLT_NULL: /* * null interface types require a 4 byte pseudo header which * corresponds to the address family of the packet. */ sockp->sa_family = AF_UNSPEC; hlen = 4; break; case DLT_ATM_RFC1483: /* * en atm driver requires 4-byte atm pseudo header. * though it isn't standard, vpi:vci needs to be * specified anyway. */ sockp->sa_family = AF_UNSPEC; hlen = 12; /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */ break; case DLT_PPP: sockp->sa_family = AF_UNSPEC; hlen = 4; /* This should match PPP_HDRLEN */ break; case DLT_IEEE802_11: /* IEEE 802.11 wireless */ sockp->sa_family = AF_IEEE80211; hlen = 0; break; case DLT_IEEE802_11_RADIO: /* IEEE 802.11 wireless w/ phy params */ sockp->sa_family = AF_IEEE80211; sockp->sa_len = 12; /* XXX != 0 */ hlen = sizeof(struct ieee80211_bpf_params); break; default: return (EIO); } len = uio->uio_resid; if (len < hlen || len - hlen > ifp->if_mtu) return (EMSGSIZE); m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR); if (m == NULL) return (EIO); m->m_pkthdr.len = m->m_len = len; *mp = m; error = uiomove(mtod(m, u_char *), len, uio); if (error) goto bad; slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; } /* Check for multicast destination */ switch (linktype) { case DLT_EN10MB: eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost, ETHER_ADDR_LEN) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; } if (d->bd_hdrcmplt == 0) { memcpy(eh->ether_shost, IF_LLADDR(ifp), sizeof(eh->ether_shost)); } break; } /* * Make room for link header, and copy it to sockaddr */ if (hlen != 0) { if (sockp->sa_family == AF_IEEE80211) { /* * Collect true length from the parameter header * NB: sockp is known to be zero'd so if we do a * short copy unspecified parameters will be * zero. * NB: packet may not be aligned after stripping * bpf params * XXX check ibp_vers */ p = mtod(m, const struct ieee80211_bpf_params *); hlen = p->ibp_len; if (hlen > sizeof(sockp->sa_data)) { error = EINVAL; goto bad; } } bcopy(mtod(m, const void *), sockp->sa_data, hlen); } *hdrlen = hlen; return (0); bad: m_freem(m); return (error); } /* * Attach descriptor to the bpf interface, i.e. make d listen on bp, * then reset its buffers and counters with reset_d(). */ static void bpf_attachd(struct bpf_d *d, struct bpf_if *bp) { int op_w; BPF_LOCK_ASSERT(); /* * Save sysctl value to protect from sysctl change * between reads */ op_w = V_bpf_optimize_writers || d->bd_writer; if (d->bd_bif != NULL) bpf_detachd_locked(d, false); /* * Point d at bp, and add d to the interface's list. * Since there are many applications using BPF for * sending raw packets only (dhcpd, cdpd are good examples) * we can delay adding d to the list of active listeners until * some filter is configured. */ BPFD_LOCK(d); /* * Hold reference to bpif while descriptor uses this interface. */ bpfif_ref(bp); d->bd_bif = bp; if (op_w != 0) { /* Add to writers-only list */ CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next); /* * We decrement bd_writer on every filter set operation. * First BIOCSETF is done by pcap_open_live() to set up * snap length. After that appliation usually sets its own * filter. */ d->bd_writer = 2; } else CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next); reset_d(d); BPFD_UNLOCK(d); bpf_bpfd_cnt++; CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list", __func__, d->bd_pid, d->bd_writer ? "writer" : "active"); if (op_w == 0) EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1); } /* * Check if we need to upgrade our descriptor @d from write-only mode. */ static int bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen) { int is_snap, need_upgrade; /* * Check if we've already upgraded or new filter is empty. */ if (d->bd_writer == 0 || fcode == NULL) return (0); need_upgrade = 0; /* * Check if cmd looks like snaplen setting from * pcap_bpf.c:pcap_open_live(). * Note we're not checking .k value here: * while pcap_open_live() definitely sets to non-zero value, * we'd prefer to treat k=0 (deny ALL) case the same way: e.g. * do not consider upgrading immediately */ if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K)) is_snap = 1; else is_snap = 0; if (is_snap == 0) { /* * We're setting first filter and it doesn't look like * setting snaplen. We're probably using bpf directly. * Upgrade immediately. */ need_upgrade = 1; } else { /* * Do not require upgrade by first BIOCSETF * (used to set snaplen) by pcap_open_live(). */ if (--d->bd_writer == 0) { /* * First snaplen filter has already * been set. This is probably catch-all * filter */ need_upgrade = 1; } } CTR5(KTR_NET, "%s: filter function set by pid %d, " "bd_writer counter %d, snap %d upgrade %d", __func__, d->bd_pid, d->bd_writer, is_snap, need_upgrade); return (need_upgrade); } /* * Detach a file from its interface. */ static void bpf_detachd(struct bpf_d *d) { BPF_LOCK(); bpf_detachd_locked(d, false); BPF_UNLOCK(); } static void bpf_detachd_locked(struct bpf_d *d, bool detached_ifp) { struct bpf_if *bp; struct ifnet *ifp; int error; BPF_LOCK_ASSERT(); CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid); /* Check if descriptor is attached */ if ((bp = d->bd_bif) == NULL) return; BPFD_LOCK(d); /* Remove d from the interface's descriptor list. */ CK_LIST_REMOVE(d, bd_next); /* Save bd_writer value */ error = d->bd_writer; ifp = bp->bif_ifp; d->bd_bif = NULL; if (detached_ifp) { /* * Notify descriptor as it's detached, so that any * sleepers wake up and get ENXIO. */ bpf_wakeup(d); } BPFD_UNLOCK(d); bpf_bpfd_cnt--; /* Call event handler iff d is attached */ if (error == 0) EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0); /* * Check if this descriptor had requested promiscuous mode. * If so and ifnet is not detached, turn it off. */ if (d->bd_promisc && !detached_ifp) { d->bd_promisc = 0; CURVNET_SET(ifp->if_vnet); error = ifpromisc(ifp, 0); CURVNET_RESTORE(); if (error != 0 && error != ENXIO) { /* * ENXIO can happen if a pccard is unplugged * Something is really wrong if we were able to put * the driver into promiscuous mode, but can't * take it out. */ if_printf(bp->bif_ifp, "bpf_detach: ifpromisc failed (%d)\n", error); } } bpfif_rele(bp); } /* * Close the descriptor by detaching it from its interface, * deallocating its buffers, and marking it free. */ static void bpf_dtor(void *data) { struct bpf_d *d = data; BPFD_LOCK(d); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); funsetown(&d->bd_sigio); bpf_detachd(d); #ifdef MAC mac_bpfdesc_destroy(d); #endif /* MAC */ seldrain(&d->bd_sel); knlist_destroy(&d->bd_sel.si_note); callout_drain(&d->bd_callout); bpfd_rele(d); } /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. */ /* ARGSUSED */ static int bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td) { struct bpf_d *d; int error; d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO); error = devfs_set_cdevpriv(d, bpf_dtor); if (error != 0) { free(d, M_BPF); return (error); } /* Setup counters */ d->bd_rcount = counter_u64_alloc(M_WAITOK); d->bd_dcount = counter_u64_alloc(M_WAITOK); d->bd_fcount = counter_u64_alloc(M_WAITOK); d->bd_wcount = counter_u64_alloc(M_WAITOK); d->bd_wfcount = counter_u64_alloc(M_WAITOK); d->bd_wdcount = counter_u64_alloc(M_WAITOK); d->bd_zcopy = counter_u64_alloc(M_WAITOK); /* * For historical reasons, perform a one-time initialization call to * the buffer routines, even though we're not yet committed to a * particular buffer method. */ bpf_buffer_init(d); if ((flags & FREAD) == 0) d->bd_writer = 2; d->bd_hbuf_in_use = 0; d->bd_bufmode = BPF_BUFMODE_BUFFER; d->bd_sig = SIGIO; d->bd_direction = BPF_D_INOUT; d->bd_refcnt = 1; BPF_PID_REFRESH(d, td); #ifdef MAC mac_bpfdesc_init(d); mac_bpfdesc_create(td->td_ucred, d); #endif mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF); callout_init_mtx(&d->bd_callout, &d->bd_lock, 0); knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock); return (0); } /* * bpfread - read next chunk of packets from buffers */ static int bpfread(struct cdev *dev, struct uio *uio, int ioflag) { struct bpf_d *d; int error; int non_block; int timed_out; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Restrict application to use a buffer the same size as * as kernel buffers. */ if (uio->uio_resid != d->bd_bufsize) return (EINVAL); non_block = ((ioflag & O_NONBLOCK) != 0); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); if (d->bd_bufmode != BPF_BUFMODE_BUFFER) { BPFD_UNLOCK(d); return (EOPNOTSUPP); } if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); timed_out = (d->bd_state == BPF_TIMED_OUT); d->bd_state = BPF_IDLE; while (d->bd_hbuf_in_use) { error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET|PCATCH, "bd_hbuf", 0); if (error != 0) { BPFD_UNLOCK(d); return (error); } } /* * If the hold buffer is empty, then do a timed sleep, which * ends when the timeout expires or when enough packets * have arrived to fill the store buffer. */ while (d->bd_hbuf == NULL) { if (d->bd_slen != 0) { /* * A packet(s) either arrived since the previous * read or arrived while we were asleep. */ if (d->bd_immediate || non_block || timed_out) { /* * Rotate the buffers and return what's here * if we are in immediate mode, non-blocking * flag is set, or this descriptor timed out. */ ROTATE_BUFFERS(d); break; } } /* * No data is available, check to see if the bpf device * is still pointed at a real interface. If not, return * ENXIO so that the userland process knows to rebind * it before using it again. */ if (d->bd_bif == NULL) { BPFD_UNLOCK(d); return (ENXIO); } if (non_block) { BPFD_UNLOCK(d); return (EWOULDBLOCK); } error = msleep(d, &d->bd_lock, PRINET|PCATCH, "bpf", d->bd_rtout); if (error == EINTR || error == ERESTART) { BPFD_UNLOCK(d); return (error); } if (error == EWOULDBLOCK) { /* * On a timeout, return what's in the buffer, * which may be nothing. If there is something * in the store buffer, we can rotate the buffers. */ if (d->bd_hbuf) /* * We filled up the buffer in between * getting the timeout and arriving * here, so we don't need to rotate. */ break; if (d->bd_slen == 0) { BPFD_UNLOCK(d); return (0); } ROTATE_BUFFERS(d); break; } } /* * At this point, we know we have something in the hold slot. */ d->bd_hbuf_in_use = 1; BPFD_UNLOCK(d); /* * Move data from hold buffer into user space. * We know the entire buffer is transferred since * we checked above that the read buffer is bpf_bufsize bytes. * * We do not have to worry about simultaneous reads because * we waited for sole access to the hold buffer above. */ error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio); BPFD_LOCK(d); KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf")); d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); d->bd_hbuf_in_use = 0; wakeup(&d->bd_hbuf_in_use); BPFD_UNLOCK(d); return (error); } /* * If there are processes sleeping on this descriptor, wake them up. */ static __inline void bpf_wakeup(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (d->bd_state == BPF_WAITING) { callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; } wakeup(d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(&d->bd_sigio, d->bd_sig, 0); selwakeuppri(&d->bd_sel, PRINET); KNOTE_LOCKED(&d->bd_sel.si_note, 0); } static void bpf_timed_out(void *arg) { struct bpf_d *d = (struct bpf_d *)arg; BPFD_LOCK_ASSERT(d); if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout)) return; if (d->bd_state == BPF_WAITING) { d->bd_state = BPF_TIMED_OUT; if (d->bd_slen != 0) bpf_wakeup(d); } } static int bpf_ready(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); if (!bpf_canfreebuf(d) && d->bd_hlen != 0) return (1); if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0) return (1); return (0); } static int bpfwrite(struct cdev *dev, struct uio *uio, int ioflag) { struct route ro; struct sockaddr dst; struct epoch_tracker et; struct bpf_if *bp; struct bpf_d *d; struct ifnet *ifp; struct mbuf *m, *mc; int error, hlen; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); NET_EPOCH_ENTER(et); BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); counter_u64_add(d->bd_wcount, 1); if ((bp = d->bd_bif) == NULL) { error = ENXIO; goto out_locked; } ifp = bp->bif_ifp; if ((ifp->if_flags & IFF_UP) == 0) { error = ENETDOWN; goto out_locked; } if (uio->uio_resid == 0) goto out_locked; bzero(&dst, sizeof(dst)); m = NULL; hlen = 0; /* * Take extra reference, unlock d and exit from epoch section, * since bpf_movein() can sleep. */ bpfd_ref(d); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); error = bpf_movein(uio, (int)bp->bif_dlt, ifp, &m, &dst, &hlen, d); if (error != 0) { counter_u64_add(d->bd_wdcount, 1); bpfd_rele(d); return (error); } BPFD_LOCK(d); /* * Check that descriptor is still attached to the interface. * This can happen on bpfdetach(). To avoid access to detached * ifnet, free mbuf and return ENXIO. */ if (d->bd_bif == NULL) { counter_u64_add(d->bd_wdcount, 1); BPFD_UNLOCK(d); bpfd_rele(d); m_freem(m); return (ENXIO); } counter_u64_add(d->bd_wfcount, 1); if (d->bd_hdrcmplt) dst.sa_family = pseudo_AF_HDRCMPLT; if (d->bd_feedback) { mc = m_dup(m, M_NOWAIT); if (mc != NULL) mc->m_pkthdr.rcvif = ifp; /* Set M_PROMISC for outgoing packets to be discarded. */ if (d->bd_direction == BPF_D_INOUT) m->m_flags |= M_PROMISC; } else mc = NULL; m->m_pkthdr.len -= hlen; m->m_len -= hlen; m->m_data += hlen; /* XXX */ CURVNET_SET(ifp->if_vnet); #ifdef MAC mac_bpfdesc_create_mbuf(d, m); if (mc != NULL) mac_bpfdesc_create_mbuf(d, mc); #endif bzero(&ro, sizeof(ro)); if (hlen != 0) { ro.ro_prepend = (u_char *)&dst.sa_data; ro.ro_plen = hlen; ro.ro_flags = RT_HAS_HEADER; } /* Avoid possible recursion on BPFD_LOCK(). */ NET_EPOCH_ENTER(et); BPFD_UNLOCK(d); error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) counter_u64_add(d->bd_wdcount, 1); if (mc != NULL) { if (error == 0) (*ifp->if_input)(ifp, mc); else m_freem(mc); } NET_EPOCH_EXIT(et); CURVNET_RESTORE(); bpfd_rele(d); return (error); out_locked: counter_u64_add(d->bd_wdcount, 1); NET_EPOCH_EXIT(et); BPFD_UNLOCK(d); return (error); } /* * Reset a descriptor by flushing its packet buffer and clearing the receive * and drop counts. This is doable for kernel-only buffers, but with * zero-copy buffers, we can't write to (or rotate) buffers that are * currently owned by userspace. It would be nice if we could encapsulate * this logic in the buffer code rather than here. */ static void reset_d(struct bpf_d *d) { BPFD_LOCK_ASSERT(d); while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if ((d->bd_hbuf != NULL) && (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) { /* Free the hold buffer. */ d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } if (bpf_canwritebuf(d)) d->bd_slen = 0; counter_u64_zero(d->bd_rcount); counter_u64_zero(d->bd_dcount); counter_u64_zero(d->bd_fcount); counter_u64_zero(d->bd_wcount); counter_u64_zero(d->bd_wfcount); counter_u64_zero(d->bd_wdcount); counter_u64_zero(d->bd_zcopy); } /* * FIONREAD Check for read packet available. * BIOCGBLEN Get buffer len [for read()]. * BIOCSETF Set read filter. * BIOCSETFNR Set read filter without resetting descriptor. * BIOCSETWF Set write filter. * BIOCFLUSH Flush read packet buffer. * BIOCPROMISC Put interface into promiscuous mode. * BIOCGDLT Get link layer type. * BIOCGETIF Get interface name. * BIOCSETIF Set interface. * BIOCSRTIMEOUT Set read timeout. * BIOCGRTIMEOUT Get read timeout. * BIOCGSTATS Get packet stats. * BIOCIMMEDIATE Set immediate mode. * BIOCVERSION Get filter language version. * BIOCGHDRCMPLT Get "header already complete" flag * BIOCSHDRCMPLT Set "header already complete" flag * BIOCGDIRECTION Get packet direction flag * BIOCSDIRECTION Set packet direction flag * BIOCGTSTAMP Get time stamp format and resolution. * BIOCSTSTAMP Set time stamp format and resolution. * BIOCLOCK Set "locked" flag * BIOCFEEDBACK Set packet feedback mode. * BIOCSETZBUF Set current zero-copy buffer locations. * BIOCGETZMAX Get maximum zero-copy buffer size. * BIOCROTZBUF Force rotation of zero-copy buffer * BIOCSETBUFMODE Set buffer mode. * BIOCGETBUFMODE Get current buffer mode. */ /* ARGSUSED */ static int bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td) { struct bpf_d *d; int error; error = devfs_get_cdevpriv((void **)&d); if (error != 0) return (error); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (d->bd_state == BPF_WAITING) callout_stop(&d->bd_callout); d->bd_state = BPF_IDLE; BPFD_UNLOCK(d); if (d->bd_locked == 1) { switch (cmd) { case BIOCGBLEN: case BIOCFLUSH: case BIOCGDLT: case BIOCGDLTLIST: #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: #endif case BIOCGETIF: case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif case BIOCGSTATS: case BIOCVERSION: case BIOCGRSIG: case BIOCGHDRCMPLT: case BIOCSTSTAMP: case BIOCFEEDBACK: case FIONREAD: case BIOCLOCK: case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif case BIOCIMMEDIATE: case TIOCGPGRP: case BIOCROTZBUF: break; default: return (EPERM); } } #ifdef COMPAT_FREEBSD32 /* * If we see a 32-bit compat ioctl, mark the stream as 32-bit so * that it will get 32-bit packet headers. */ switch (cmd) { case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: case BIOCGDLTLIST32: case BIOCGRTIMEOUT32: case BIOCSRTIMEOUT32: if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { BPFD_LOCK(d); d->bd_compat32 = 1; BPFD_UNLOCK(d); } } #endif CURVNET_SET(TD_TO_VNET(td)); switch (cmd) { default: error = EINVAL; break; /* * Check for read packet available. */ case FIONREAD: { int n; BPFD_LOCK(d); n = d->bd_slen; while (d->bd_hbuf_in_use) mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET, "bd_hbuf", 0); if (d->bd_hbuf) n += d->bd_hlen; BPFD_UNLOCK(d); *(int *)addr = n; break; } /* * Get buffer len [for read()]. */ case BIOCGBLEN: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufsize; BPFD_UNLOCK(d); break; /* * Set buffer length. */ case BIOCSBLEN: error = bpf_ioctl_sblen(d, (u_int *)addr); break; /* * Set link layer read filter. */ case BIOCSETF: case BIOCSETFNR: case BIOCSETWF: #ifdef COMPAT_FREEBSD32 case BIOCSETF32: case BIOCSETFNR32: case BIOCSETWF32: #endif error = bpf_setf(d, (struct bpf_program *)addr, cmd); break; /* * Flush read packet buffer. */ case BIOCFLUSH: BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); break; /* * Put interface into promiscuous mode. */ case BIOCPROMISC: if (d->bd_bif == NULL) { /* * No interface attached yet. */ error = EINVAL; break; } if (d->bd_promisc == 0) { error = ifpromisc(d->bd_bif->bif_ifp, 1); if (error == 0) d->bd_promisc = 1; } break; /* * Get current data link type. */ case BIOCGDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else *(u_int *)addr = d->bd_bif->bif_dlt; BPF_UNLOCK(); break; /* * Get a list of supported data link types. */ #ifdef COMPAT_FREEBSD32 case BIOCGDLTLIST32: { struct bpf_dltlist32 *list32; struct bpf_dltlist dltlist; list32 = (struct bpf_dltlist32 *)addr; dltlist.bfl_len = list32->bfl_len; dltlist.bfl_list = PTRIN(list32->bfl_list); BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { error = bpf_getdltlist(d, &dltlist); if (error == 0) list32->bfl_len = dltlist.bfl_len; } BPF_UNLOCK(); break; } #endif case BIOCGDLTLIST: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_getdltlist(d, (struct bpf_dltlist *)addr); BPF_UNLOCK(); break; /* * Set data link type. */ case BIOCSDLT: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else error = bpf_setdlt(d, *(u_int *)addr); BPF_UNLOCK(); break; /* * Get interface name. */ case BIOCGETIF: BPF_LOCK(); if (d->bd_bif == NULL) error = EINVAL; else { struct ifnet *const ifp = d->bd_bif->bif_ifp; struct ifreq *const ifr = (struct ifreq *)addr; strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); } BPF_UNLOCK(); break; /* * Set interface. */ case BIOCSETIF: { int alloc_buf, size; /* * Behavior here depends on the buffering model. If * we're using kernel memory buffers, then we can * allocate them here. If we're using zero-copy, * then the user process must have registered buffers * by the time we get here. */ alloc_buf = 0; BPFD_LOCK(d); if (d->bd_bufmode == BPF_BUFMODE_BUFFER && d->bd_sbuf == NULL) alloc_buf = 1; BPFD_UNLOCK(d); if (alloc_buf) { size = d->bd_bufsize; error = bpf_buffer_ioctl_sblen(d, &size); if (error != 0) break; } BPF_LOCK(); error = bpf_setif(d, (struct ifreq *)addr); BPF_UNLOCK(); break; } /* * Set read timeout. */ case BIOCSRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCSRTIMEOUT32: #endif { struct timeval *tv = (struct timeval *)addr; #if defined(COMPAT_FREEBSD32) && !defined(__mips__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCSRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv = &tv64; tv->tv_sec = tv32->tv_sec; tv->tv_usec = tv32->tv_usec; } else #endif tv = (struct timeval *)addr; /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(tv)) == 0) d->bd_rtout = tvtohz(tv) - 1; break; } /* * Get read timeout. */ case BIOCGRTIMEOUT: #if defined(COMPAT_FREEBSD32) && defined(__amd64__) case BIOCGRTIMEOUT32: #endif { struct timeval *tv; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) struct timeval32 *tv32; struct timeval tv64; if (cmd == BIOCGRTIMEOUT32) tv = &tv64; else #endif tv = (struct timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; #if defined(COMPAT_FREEBSD32) && defined(__amd64__) if (cmd == BIOCGRTIMEOUT32) { tv32 = (struct timeval32 *)addr; tv32->tv_sec = tv->tv_sec; tv32->tv_usec = tv->tv_usec; } #endif break; } /* * Get packet stats. */ case BIOCGSTATS: { struct bpf_stat *bs = (struct bpf_stat *)addr; /* XXXCSJP overflow */ bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount); bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount); break; } /* * Set immediate mode. */ case BIOCIMMEDIATE: BPFD_LOCK(d); d->bd_immediate = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCVERSION: { struct bpf_version *bv = (struct bpf_version *)addr; bv->bv_major = BPF_MAJOR_VERSION; bv->bv_minor = BPF_MINOR_VERSION; break; } /* * Get "header already complete" flag */ case BIOCGHDRCMPLT: BPFD_LOCK(d); *(u_int *)addr = d->bd_hdrcmplt; BPFD_UNLOCK(d); break; /* * Set "header already complete" flag */ case BIOCSHDRCMPLT: BPFD_LOCK(d); d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0; BPFD_UNLOCK(d); break; /* * Get packet direction flag */ case BIOCGDIRECTION: BPFD_LOCK(d); *(u_int *)addr = d->bd_direction; BPFD_UNLOCK(d); break; /* * Set packet direction flag */ case BIOCSDIRECTION: { u_int direction; direction = *(u_int *)addr; switch (direction) { case BPF_D_IN: case BPF_D_INOUT: case BPF_D_OUT: BPFD_LOCK(d); d->bd_direction = direction; BPFD_UNLOCK(d); break; default: error = EINVAL; } } break; /* * Get packet timestamp format and resolution. */ case BIOCGTSTAMP: BPFD_LOCK(d); *(u_int *)addr = d->bd_tstamp; BPFD_UNLOCK(d); break; /* * Set packet timestamp format and resolution. */ case BIOCSTSTAMP: { u_int func; func = *(u_int *)addr; if (BPF_T_VALID(func)) d->bd_tstamp = func; else error = EINVAL; } break; case BIOCFEEDBACK: BPFD_LOCK(d); d->bd_feedback = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCLOCK: BPFD_LOCK(d); d->bd_locked = 1; BPFD_UNLOCK(d); break; case FIONBIO: /* Non-blocking I/O */ break; case FIOASYNC: /* Send signal on receive packets */ BPFD_LOCK(d); d->bd_async = *(int *)addr; BPFD_UNLOCK(d); break; case FIOSETOWN: /* * XXX: Add some sort of locking here? * fsetown() can sleep. */ error = fsetown(*(int *)addr, &d->bd_sigio); break; case FIOGETOWN: BPFD_LOCK(d); *(int *)addr = fgetown(&d->bd_sigio); BPFD_UNLOCK(d); break; /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: error = fsetown(-(*(int *)addr), &d->bd_sigio); break; /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)addr = -fgetown(&d->bd_sigio); break; case BIOCSRSIG: /* Set receive signal */ { u_int sig; sig = *(u_int *)addr; if (sig >= NSIG) error = EINVAL; else { BPFD_LOCK(d); d->bd_sig = sig; BPFD_UNLOCK(d); } break; } case BIOCGRSIG: BPFD_LOCK(d); *(u_int *)addr = d->bd_sig; BPFD_UNLOCK(d); break; case BIOCGETBUFMODE: BPFD_LOCK(d); *(u_int *)addr = d->bd_bufmode; BPFD_UNLOCK(d); break; case BIOCSETBUFMODE: /* * Allow the buffering mode to be changed as long as we * haven't yet committed to a particular mode. Our * definition of commitment, for now, is whether or not a * buffer has been allocated or an interface attached, since * that's the point where things get tricky. */ switch (*(u_int *)addr) { case BPF_BUFMODE_BUFFER: break; case BPF_BUFMODE_ZBUF: if (bpf_zerocopy_enable) break; /* FALLSTHROUGH */ default: CURVNET_RESTORE(); return (EINVAL); } BPFD_LOCK(d); if (d->bd_sbuf != NULL || d->bd_hbuf != NULL || d->bd_fbuf != NULL || d->bd_bif != NULL) { BPFD_UNLOCK(d); CURVNET_RESTORE(); return (EBUSY); } d->bd_bufmode = *(u_int *)addr; BPFD_UNLOCK(d); break; case BIOCGETZMAX: error = bpf_ioctl_getzmax(td, d, (size_t *)addr); break; case BIOCSETZBUF: error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr); break; case BIOCROTZBUF: error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr); break; } CURVNET_RESTORE(); return (error); } /* * Set d's packet filter program to fp. If this file already has a filter, * free it and replace it. Returns EINVAL for bogus requests. * * Note we use global lock here to serialize bpf_setf() and bpf_setif() * calls. */ static int bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd) { #ifdef COMPAT_FREEBSD32 struct bpf_program fp_swab; struct bpf_program32 *fp32; #endif struct bpf_program_buffer *fcode; struct bpf_insn *filter; #ifdef BPF_JITTER bpf_jit_filter *jfunc; #endif size_t size; u_int flen; bool track_event; #ifdef COMPAT_FREEBSD32 switch (cmd) { case BIOCSETF32: case BIOCSETWF32: case BIOCSETFNR32: fp32 = (struct bpf_program32 *)fp; fp_swab.bf_len = fp32->bf_len; fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns; fp = &fp_swab; switch (cmd) { case BIOCSETF32: cmd = BIOCSETF; break; case BIOCSETWF32: cmd = BIOCSETWF; break; } break; } #endif filter = NULL; #ifdef BPF_JITTER jfunc = NULL; #endif /* * Check new filter validness before acquiring any locks. * Allocate memory for new filter, if needed. */ flen = fp->bf_len; if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0)) return (EINVAL); size = flen * sizeof(*fp->bf_insns); if (size > 0) { /* We're setting up new filter. Copy and check actual data. */ fcode = bpf_program_buffer_alloc(size, M_WAITOK); filter = (struct bpf_insn *)fcode->buffer; if (copyin(fp->bf_insns, filter, size) != 0 || !bpf_validate(filter, flen)) { free(fcode, M_BPF); return (EINVAL); } #ifdef BPF_JITTER if (cmd != BIOCSETWF) { /* * Filter is copied inside fcode and is * perfectly valid. */ jfunc = bpf_jitter(filter, flen); } #endif } track_event = false; fcode = NULL; BPF_LOCK(); BPFD_LOCK(d); /* Set up new filter. */ if (cmd == BIOCSETWF) { if (d->bd_wfilter != NULL) { fcode = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER fcode->func = NULL; #endif } d->bd_wfilter = filter; } else { if (d->bd_rfilter != NULL) { fcode = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER fcode->func = d->bd_bfilter; #endif } d->bd_rfilter = filter; #ifdef BPF_JITTER d->bd_bfilter = jfunc; #endif if (cmd == BIOCSETF) reset_d(d); if (bpf_check_upgrade(cmd, d, filter, flen) != 0) { /* * Filter can be set several times without * specifying interface. In this case just mark d * as reader. */ d->bd_writer = 0; if (d->bd_bif != NULL) { /* * Remove descriptor from writers-only list * and add it to active readers list. */ CK_LIST_REMOVE(d, bd_next); CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist, d, bd_next); CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid); track_event = true; } } } BPFD_UNLOCK(d); if (fcode != NULL) epoch_call(net_epoch_preempt, &fcode->epoch_ctx, bpf_program_buffer_free); if (track_event) EVENTHANDLER_INVOKE(bpf_track, d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1); BPF_UNLOCK(); return (0); } /* * Detach a file from its current interface (if attached at all) and attach * to the interface indicated by the name stored in ifr. * Return an errno or 0. */ static int bpf_setif(struct bpf_d *d, struct ifreq *ifr) { struct bpf_if *bp; struct ifnet *theywant; BPF_LOCK_ASSERT(); theywant = ifunit(ifr->ifr_name); if (theywant == NULL || theywant->if_bpf == NULL) return (ENXIO); bp = theywant->if_bpf; /* * At this point, we expect the buffer is already allocated. If not, * return an error. */ switch (d->bd_bufmode) { case BPF_BUFMODE_BUFFER: case BPF_BUFMODE_ZBUF: if (d->bd_sbuf == NULL) return (EINVAL); break; default: panic("bpf_setif: bufmode %d", d->bd_bufmode); } if (bp != d->bd_bif) bpf_attachd(d, bp); else { BPFD_LOCK(d); reset_d(d); BPFD_UNLOCK(d); } return (0); } /* * Support for select() and poll() system calls * * Return true iff the specific operation will not block indefinitely. * Otherwise, return false but make a note that a selwakeup() must be done. */ static int bpfpoll(struct cdev *dev, int events, struct thread *td) { struct bpf_d *d; int revents; if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); /* * Refresh PID associated with this descriptor. */ revents = events & (POLLOUT | POLLWRNORM); BPFD_LOCK(d); BPF_PID_REFRESH(d, td); if (events & (POLLIN | POLLRDNORM)) { if (bpf_ready(d)) revents |= events & (POLLIN | POLLRDNORM); else { selrecord(td, &d->bd_sel); /* Start the read timeout if necessary. */ if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } } } BPFD_UNLOCK(d); return (revents); } /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. */ int bpfkqfilter(struct cdev *dev, struct knote *kn) { struct bpf_d *d; if (devfs_get_cdevpriv((void **)&d) != 0 || kn->kn_filter != EVFILT_READ) return (1); /* * Refresh PID associated with this descriptor. */ BPFD_LOCK(d); BPF_PID_REFRESH_CUR(d); kn->kn_fop = &bpfread_filtops; kn->kn_hook = d; knlist_add(&d->bd_sel.si_note, kn, 1); BPFD_UNLOCK(d); return (0); } static void filt_bpfdetach(struct knote *kn) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; knlist_remove(&d->bd_sel.si_note, kn, 0); } static int filt_bpfread(struct knote *kn, long hint) { struct bpf_d *d = (struct bpf_d *)kn->kn_hook; int ready; BPFD_LOCK_ASSERT(d); ready = bpf_ready(d); if (ready) { kn->kn_data = d->bd_slen; /* * Ignore the hold buffer if it is being copied to user space. */ if (!d->bd_hbuf_in_use && d->bd_hbuf) kn->kn_data += d->bd_hlen; } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { callout_reset(&d->bd_callout, d->bd_rtout, bpf_timed_out, d); d->bd_state = BPF_WAITING; } return (ready); } #define BPF_TSTAMP_NONE 0 #define BPF_TSTAMP_FAST 1 #define BPF_TSTAMP_NORMAL 2 #define BPF_TSTAMP_EXTERN 3 static int bpf_ts_quality(int tstype) { if (tstype == BPF_T_NONE) return (BPF_TSTAMP_NONE); if ((tstype & BPF_T_FAST) != 0) return (BPF_TSTAMP_FAST); return (BPF_TSTAMP_NORMAL); } static int bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m) { struct m_tag *tag; int quality; quality = bpf_ts_quality(tstype); if (quality == BPF_TSTAMP_NONE) return (quality); if (m != NULL) { tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL); if (tag != NULL) { *bt = *(struct bintime *)(tag + 1); return (BPF_TSTAMP_EXTERN); } } if (quality == BPF_TSTAMP_NORMAL) binuptime(bt); else getbinuptime(bt); return (quality); } /* * Incoming linkage from device drivers. Process the packet pkt, of length * pktlen, which is stored in a contiguous buffer. The packet is parsed * by each process' filter, and if accepted, stashed into the corresponding * buffer. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int slen; int gottime; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { counter_u64_add(d->bd_rcount, 1); /* * NB: We dont call BPF_CHECK_DIRECTION() here since there * is no way for the caller to indiciate to us whether this * packet is inbound or outbound. In the bpf_mtap() routines, * we use the interface pointers on the mbuf to figure it out. */ #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; if (bf != NULL) slen = (*(bf->func))(pkt, pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen); if (slen != 0) { /* * Filter matches. Let's to acquire write lock. */ BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, NULL); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, pkt, pktlen, slen, bpf_append_bytes, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } #define BPF_CHECK_DIRECTION(d, r, i) \ (((d)->bd_direction == BPF_D_IN && (r) != (i)) || \ ((d)->bd_direction == BPF_D_OUT && (r) == (i))) /* * Incoming linkage from device drivers, when packet is in an mbuf chain. * Locking model is explained in bpf_tap(). */ void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct bpf_d *d; #ifdef BPF_JITTER bpf_jit_filter *bf; #endif u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ - if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { + if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { - if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) + if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); #ifdef BPF_JITTER bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL; /* XXX We cannot handle multiple mbufs. */ if (bf != NULL && m->m_next == NULL) slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen); else #endif slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)m, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } /* * Incoming linkage from device drivers, when packet is in * an mbuf chain and to be prepended by a contiguous header. */ void bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m) { struct epoch_tracker et; struct bintime bt; struct mbuf mb; struct bpf_d *d; u_int pktlen, slen; int gottime; /* Skip outgoing duplicate packets. */ if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) { m->m_flags &= ~M_PROMISC; return; } pktlen = m_length(m, NULL); /* * Craft on-stack mbuf suitable for passing to bpf_filter. * Note that we cut corners here; we only setup what's * absolutely needed--this mbuf should never go anywhere else. */ mb.m_next = m; mb.m_data = data; mb.m_len = dlen; pktlen += dlen; gottime = BPF_TSTAMP_NONE; NET_EPOCH_ENTER(et); CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) { if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp)) continue; counter_u64_add(d->bd_rcount, 1); slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0); if (slen != 0) { BPFD_LOCK(d); counter_u64_add(d->bd_fcount, 1); if (gottime < bpf_ts_quality(d->bd_tstamp)) gottime = bpf_gettime(&bt, d->bd_tstamp, m); #ifdef MAC if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0) #endif catchpacket(d, (u_char *)&mb, pktlen, slen, bpf_append_mbuf, &bt); BPFD_UNLOCK(d); } } NET_EPOCH_EXIT(et); } #undef BPF_CHECK_DIRECTION #undef BPF_TSTAMP_NONE #undef BPF_TSTAMP_FAST #undef BPF_TSTAMP_NORMAL #undef BPF_TSTAMP_EXTERN static int bpf_hdrlen(struct bpf_d *d) { int hdrlen; hdrlen = d->bd_bif->bif_hdrlen; #ifndef BURN_BRIDGES if (d->bd_tstamp == BPF_T_NONE || BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME) #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr); else #endif hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) hdrlen = BPF_WORDALIGN32(hdrlen); else #endif hdrlen = BPF_WORDALIGN(hdrlen); return (hdrlen - d->bd_bif->bif_hdrlen); } static void bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype) { struct bintime bt2, boottimebin; struct timeval tsm; struct timespec tsn; if ((tstype & BPF_T_MONOTONIC) == 0) { bt2 = *bt; getboottimebin(&boottimebin); bintime_add(&bt2, &boottimebin); bt = &bt2; } switch (BPF_T_FORMAT(tstype)) { case BPF_T_MICROTIME: bintime2timeval(bt, &tsm); ts->bt_sec = tsm.tv_sec; ts->bt_frac = tsm.tv_usec; break; case BPF_T_NANOTIME: bintime2timespec(bt, &tsn); ts->bt_sec = tsn.tv_sec; ts->bt_frac = tsn.tv_nsec; break; case BPF_T_BINTIME: ts->bt_sec = bt->sec; ts->bt_frac = bt->frac; break; } } /* * Move the packet data from interface memory (pkt) into the * store buffer. "cpfn" is the routine called to do the actual data * transfer. bcopy is passed in to copy contiguous chunks, while * bpf_append_mbuf is passed in to copy mbuf chains. In the latter case, * pkt is really an mbuf. */ static void catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int), struct bintime *bt) { struct bpf_xhdr hdr; #ifndef BURN_BRIDGES struct bpf_hdr hdr_old; #ifdef COMPAT_FREEBSD32 struct bpf_hdr32 hdr32_old; #endif #endif int caplen, curlen, hdrlen, totlen; int do_wakeup = 0; int do_timestamp; int tstype; BPFD_LOCK_ASSERT(d); /* * Detect whether user space has released a buffer back to us, and if * so, move it from being a hold buffer to a free buffer. This may * not be the best place to do it (for example, we might only want to * run this check if we need the space), but for now it's a reliable * spot to do it. */ if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) { d->bd_fbuf = d->bd_hbuf; d->bd_hbuf = NULL; d->bd_hlen = 0; bpf_buf_reclaimed(d); } /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that * much. Otherwise, transfer the whole packet (unless * we hit the buffer size limit). */ hdrlen = bpf_hdrlen(d); totlen = hdrlen + min(snaplen, pktlen); if (totlen > d->bd_bufsize) totlen = d->bd_bufsize; /* * Round up the end of the previous packet to the next longword. * * Drop the packet if there's no room and no hope of room * If the packet would overflow the storage buffer or the storage * buffer is considered immutable by the buffer model, try to rotate * the buffer and wakeup pending processes. */ #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) curlen = BPF_WORDALIGN32(d->bd_slen); else #endif curlen = BPF_WORDALIGN(d->bd_slen); if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) { if (d->bd_fbuf == NULL) { /* * There's no room in the store buffer, and no * prospect of room, so drop the packet. Notify the * buffer model. */ bpf_buffull(d); counter_u64_add(d->bd_dcount, 1); return; } KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use")); ROTATE_BUFFERS(d); do_wakeup = 1; curlen = 0; } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* * Immediate mode is set, or the read timeout has already * expired during a select call. A packet arrived, so the * reader should be woken up. */ do_wakeup = 1; caplen = totlen - hdrlen; tstype = d->bd_tstamp; do_timestamp = tstype != BPF_T_NONE; #ifndef BURN_BRIDGES if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) { struct bpf_ts ts; if (do_timestamp) bpf_bintime2ts(bt, &ts, tstype); #ifdef COMPAT_FREEBSD32 if (d->bd_compat32) { bzero(&hdr32_old, sizeof(hdr32_old)); if (do_timestamp) { hdr32_old.bh_tstamp.tv_sec = ts.bt_sec; hdr32_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr32_old.bh_datalen = pktlen; hdr32_old.bh_hdrlen = hdrlen; hdr32_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old, sizeof(hdr32_old)); goto copy; } #endif bzero(&hdr_old, sizeof(hdr_old)); if (do_timestamp) { hdr_old.bh_tstamp.tv_sec = ts.bt_sec; hdr_old.bh_tstamp.tv_usec = ts.bt_frac; } hdr_old.bh_datalen = pktlen; hdr_old.bh_hdrlen = hdrlen; hdr_old.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old, sizeof(hdr_old)); goto copy; } #endif /* * Append the bpf header. Note we append the actual header size, but * move forward the length of the header plus padding. */ bzero(&hdr, sizeof(hdr)); if (do_timestamp) bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype); hdr.bh_datalen = pktlen; hdr.bh_hdrlen = hdrlen; hdr.bh_caplen = caplen; bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr)); /* * Copy the packet data into the store buffer and update its length. */ #ifndef BURN_BRIDGES copy: #endif (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen); d->bd_slen = curlen + totlen; if (do_wakeup) bpf_wakeup(d); } /* * Free buffers currently in use by a descriptor. * Called on close. */ static void bpfd_free(epoch_context_t ctx) { struct bpf_d *d; struct bpf_program_buffer *p; /* * We don't need to lock out interrupts since this descriptor has * been detached from its interface and it yet hasn't been marked * free. */ d = __containerof(ctx, struct bpf_d, epoch_ctx); bpf_free(d); if (d->bd_rfilter != NULL) { p = __containerof((void *)d->bd_rfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = d->bd_bfilter; #endif bpf_program_buffer_free(&p->epoch_ctx); } if (d->bd_wfilter != NULL) { p = __containerof((void *)d->bd_wfilter, struct bpf_program_buffer, buffer); #ifdef BPF_JITTER p->func = NULL; #endif bpf_program_buffer_free(&p->epoch_ctx); } mtx_destroy(&d->bd_lock); counter_u64_free(d->bd_rcount); counter_u64_free(d->bd_dcount); counter_u64_free(d->bd_fcount); counter_u64_free(d->bd_wcount); counter_u64_free(d->bd_wfcount); counter_u64_free(d->bd_wdcount); counter_u64_free(d->bd_zcopy); free(d, M_BPF); } /* * Attach an interface to bpf. dlt is the link layer type; hdrlen is the * fixed size of the link header (variable length headers not yet supported). */ void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } /* * Attach an interface to bpf. ifp is a pointer to the structure * defining the interface to be attached, dlt is the link layer type, * and hdrlen is the fixed size of the link header (variable length * headers are not yet supporrted). */ void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { struct bpf_if *bp; KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized")); bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO); CK_LIST_INIT(&bp->bif_dlist); CK_LIST_INIT(&bp->bif_wlist); bp->bif_ifp = ifp; bp->bif_dlt = dlt; bp->bif_hdrlen = hdrlen; bp->bif_bpf = driverp; bp->bif_refcnt = 1; *driverp = bp; /* * Reference ifnet pointer, so it won't freed until * we release it. */ if_ref(ifp); BPF_LOCK(); CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next); BPF_UNLOCK(); if (bootverbose && IS_DEFAULT_VNET(curvnet)) if_printf(ifp, "bpf attached\n"); } #ifdef VIMAGE /* * When moving interfaces between vnet instances we need a way to * query the dlt and hdrlen before detach so we can re-attch the if_bpf * after the vmove. We unfortunately have no device driver infrastructure * to query the interface for these values after creation/attach, thus * add this as a workaround. */ int bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen) { if (bp == NULL) return (ENXIO); if (bif_dlt == NULL && bif_hdrlen == NULL) return (0); if (bif_dlt != NULL) *bif_dlt = bp->bif_dlt; if (bif_hdrlen != NULL) *bif_hdrlen = bp->bif_hdrlen; return (0); } #endif /* * Detach bpf from an interface. This involves detaching each descriptor * associated with the interface. Notify each descriptor as it's detached * so that any sleepers wake up and get ENXIO. */ void bpfdetach(struct ifnet *ifp) { struct bpf_if *bp, *bp_temp; struct bpf_d *d; BPF_LOCK(); /* Find all bpf_if struct's which reference ifp and detach them. */ CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) { if (ifp != bp->bif_ifp) continue; CK_LIST_REMOVE(bp, bif_next); *bp->bif_bpf = (struct bpf_if *)&dead_bpf_if; CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p", __func__, bp->bif_dlt, bp, ifp); /* Detach common descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) { bpf_detachd_locked(d, true); } /* Detach writer-only descriptors */ while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) { bpf_detachd_locked(d, true); } bpfif_rele(bp); } BPF_UNLOCK(); } /* * Get a list of available data link type of the interface. */ static int bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl) { struct ifnet *ifp; struct bpf_if *bp; u_int *lst; int error, n, n1; BPF_LOCK_ASSERT(); ifp = d->bd_bif->bif_ifp; n1 = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp) n1++; } if (bfl->bfl_list == NULL) { bfl->bfl_len = n1; return (0); } if (n1 > bfl->bfl_len) return (ENOMEM); lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK); n = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp != ifp) continue; lst[n++] = bp->bif_dlt; } error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n); free(lst, M_TEMP); bfl->bfl_len = n; return (error); } /* * Set the data link type of a BPF instance. */ static int bpf_setdlt(struct bpf_d *d, u_int dlt) { int error, opromisc; struct ifnet *ifp; struct bpf_if *bp; BPF_LOCK_ASSERT(); MPASS(d->bd_bif != NULL); /* * It is safe to check bd_bif without BPFD_LOCK, it can not be * changed while we hold global lock. */ if (d->bd_bif->bif_dlt == dlt) return (0); ifp = d->bd_bif->bif_ifp; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { if (bp->bif_ifp == ifp && bp->bif_dlt == dlt) break; } if (bp == NULL) return (EINVAL); opromisc = d->bd_promisc; bpf_attachd(d, bp); if (opromisc) { error = ifpromisc(bp->bif_ifp, 1); if (error) if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n", __func__, error); else d->bd_promisc = 1; } return (0); } static void bpf_drvinit(void *unused) { struct cdev *dev; sx_init(&bpf_sx, "bpf global lock"); CK_LIST_INIT(&bpf_iflist); dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf"); /* For compatibility */ make_dev_alias(dev, "bpf0"); } /* * Zero out the various packet counters associated with all of the bpf * descriptors. At some point, we will probably want to get a bit more * granular and allow the user to specify descriptors to be zeroed. */ static void bpf_zero_counters(void) { struct bpf_if *bp; struct bpf_d *bd; BPF_LOCK(); /* * We are protected by global lock here, interfaces and * descriptors can not be deleted while we hold it. */ CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { counter_u64_zero(bd->bd_rcount); counter_u64_zero(bd->bd_dcount); counter_u64_zero(bd->bd_fcount); counter_u64_zero(bd->bd_wcount); counter_u64_zero(bd->bd_wfcount); counter_u64_zero(bd->bd_zcopy); } } BPF_UNLOCK(); } /* * Fill filter statistics */ static void bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd) { BPF_LOCK_ASSERT(); bzero(d, sizeof(*d)); d->bd_structsize = sizeof(*d); d->bd_immediate = bd->bd_immediate; d->bd_promisc = bd->bd_promisc; d->bd_hdrcmplt = bd->bd_hdrcmplt; d->bd_direction = bd->bd_direction; d->bd_feedback = bd->bd_feedback; d->bd_async = bd->bd_async; d->bd_rcount = counter_u64_fetch(bd->bd_rcount); d->bd_dcount = counter_u64_fetch(bd->bd_dcount); d->bd_fcount = counter_u64_fetch(bd->bd_fcount); d->bd_sig = bd->bd_sig; d->bd_slen = bd->bd_slen; d->bd_hlen = bd->bd_hlen; d->bd_bufsize = bd->bd_bufsize; d->bd_pid = bd->bd_pid; strlcpy(d->bd_ifname, bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ); d->bd_locked = bd->bd_locked; d->bd_wcount = counter_u64_fetch(bd->bd_wcount); d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount); d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount); d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy); d->bd_bufmode = bd->bd_bufmode; } /* * Handle `netstat -B' stats request */ static int bpf_stats_sysctl(SYSCTL_HANDLER_ARGS) { static const struct xbpf_d zerostats; struct xbpf_d *xbdbuf, *xbd, tempstats; int index, error; struct bpf_if *bp; struct bpf_d *bd; /* * XXX This is not technically correct. It is possible for non * privileged users to open bpf devices. It would make sense * if the users who opened the devices were able to retrieve * the statistics for them, too. */ error = priv_check(req->td, PRIV_NET_BPF); if (error) return (error); /* * Check to see if the user is requesting that the counters be * zeroed out. Explicitly check that the supplied data is zeroed, * as we aren't allowing the user to set the counters currently. */ if (req->newptr != NULL) { if (req->newlen != sizeof(tempstats)) return (EINVAL); memset(&tempstats, 0, sizeof(tempstats)); error = SYSCTL_IN(req, &tempstats, sizeof(tempstats)); if (error) return (error); if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0) return (EINVAL); bpf_zero_counters(); return (0); } if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd))); if (bpf_bpfd_cnt == 0) return (SYSCTL_OUT(req, 0, 0)); xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK); BPF_LOCK(); if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) { BPF_UNLOCK(); free(xbdbuf, M_BPF); return (ENOMEM); } index = 0; CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) { /* Send writers-only first */ CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) { xbd = &xbdbuf[index++]; bpfstats_fill_xbpf(xbd, bd); } } BPF_UNLOCK(); error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd)); free(xbdbuf, M_BPF); return (error); } SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL); #else /* !DEV_BPF && !NETGRAPH_BPF */ /* * NOP stubs to allow bpf-using drivers to load and function. * * A 'better' implementation would allow the core bpf functionality * to be loaded at runtime. */ void bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen) { } void bpf_mtap(struct bpf_if *bp, struct mbuf *m) { } void bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m) { } void bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen) { bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf); } void bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp) { *driverp = (struct bpf_if *)&dead_bpf_if; } void bpfdetach(struct ifnet *ifp) { } u_int bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) { return -1; /* "no filter" behaviour */ } int bpf_validate(const struct bpf_insn *f, int len) { return 0; /* false */ } #endif /* !DEV_BPF && !NETGRAPH_BPF */ #ifdef DDB static void bpf_show_bpf_if(struct bpf_if *bpf_if) { if (bpf_if == NULL) return; db_printf("%p:\n", bpf_if); #define BPF_DB_PRINTF(f, e) db_printf(" %s = " f "\n", #e, bpf_if->e); /* bif_ext.bif_next */ /* bif_ext.bif_dlist */ BPF_DB_PRINTF("%#x", bif_dlt); BPF_DB_PRINTF("%u", bif_hdrlen); /* bif_wlist */ BPF_DB_PRINTF("%p", bif_ifp); BPF_DB_PRINTF("%p", bif_bpf); BPF_DB_PRINTF("%u", bif_refcnt); } DB_SHOW_COMMAND(bpf_if, db_show_bpf_if) { if (!have_addr) { db_printf("usage: show bpf_if \n"); return; } bpf_show_bpf_if((struct bpf_if *)addr); } #endif Index: head/sys/net/if.c =================================================================== --- head/sys/net/if.c (revision 348253) +++ head/sys/net/if.c (revision 348254) @@ -1,4598 +1,4600 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if.c 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #ifdef INET6 #include #include #endif /* INET6 */ #endif /* INET || INET6 */ #include /* * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name * and ifr_ifru when it is used in SIOCGIFCONF. */ _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) == offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru"); __read_mostly epoch_t net_epoch_preempt; __read_mostly epoch_t net_epoch; #ifdef COMPAT_FREEBSD32 #include #include struct ifreq_buffer32 { uint32_t length; /* (size_t) */ uint32_t buffer; /* (void *) */ }; /* * Interface request structure used for socket * ioctl's. All interface ioctl's must have parameter * definitions which begin with ifr_name. The * remainder may be interface specific. */ struct ifreq32 { char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */ union { struct sockaddr ifru_addr; struct sockaddr ifru_dstaddr; struct sockaddr ifru_broadaddr; struct ifreq_buffer32 ifru_buffer; short ifru_flags[2]; short ifru_index; int ifru_jid; int ifru_metric; int ifru_mtu; int ifru_phys; int ifru_media; uint32_t ifru_data; int ifru_cap[2]; u_int ifru_fib; u_char ifru_vlan_pcp; } ifr_ifru; }; CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32)); CTASSERT(__offsetof(struct ifreq, ifr_ifru) == __offsetof(struct ifreq32, ifr_ifru)); struct ifgroupreq32 { char ifgr_name[IFNAMSIZ]; u_int ifgr_len; union { char ifgru_group[IFNAMSIZ]; uint32_t ifgru_groups; } ifgr_ifgru; }; struct ifmediareq32 { char ifm_name[IFNAMSIZ]; int ifm_current; int ifm_mask; int ifm_status; int ifm_active; int ifm_count; uint32_t ifm_ulist; /* (int *) */ }; #define SIOCGIFMEDIA32 _IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32) #define SIOCGIFXMEDIA32 _IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32) #define _CASE_IOC_IFGROUPREQ_32(cmd) \ _IOC_NEWTYPE((cmd), struct ifgroupreq32): case #else /* !COMPAT_FREEBSD32 */ #define _CASE_IOC_IFGROUPREQ_32(cmd) #endif /* !COMPAT_FREEBSD32 */ #define CASE_IOC_IFGROUPREQ(cmd) \ _CASE_IOC_IFGROUPREQ_32(cmd) \ (cmd) union ifreq_union { struct ifreq ifr; #ifdef COMPAT_FREEBSD32 struct ifreq32 ifr32; #endif }; union ifgroupreq_union { struct ifgroupreq ifgr; #ifdef COMPAT_FREEBSD32 struct ifgroupreq32 ifgr32; #endif }; SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management"); SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN, &ifqmaxlen, 0, "max send queue size"); /* Log link state change events */ static int log_link_state_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW, &log_link_state_change, 0, "log interface link state change events"); /* Log promiscuous mode change events */ static int log_promisc_mode_change = 1; SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN, &log_promisc_mode_change, 1, "log promiscuous mode change events"); /* Interface description */ static unsigned int ifdescr_maxlen = 1024; SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW, &ifdescr_maxlen, 0, "administrative maximum length for interface description"); static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions"); /* global sx for non-critical path ifdescr */ static struct sx ifdescr_sx; SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr"); void (*ng_ether_link_state_p)(struct ifnet *ifp, int state); void (*lagg_linkstate_p)(struct ifnet *ifp, int state); /* These are external hooks for CARP. */ void (*carp_linkstate_p)(struct ifnet *ifp); void (*carp_demote_adj_p)(int, char *); int (*carp_master_p)(struct ifaddr *); #if defined(INET) || defined(INET6) int (*carp_forus_p)(struct ifnet *ifp, u_char *dhost); int (*carp_output_p)(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa); int (*carp_ioctl_p)(struct ifreq *, u_long, struct thread *); int (*carp_attach_p)(struct ifaddr *, int); void (*carp_detach_p)(struct ifaddr *, bool); #endif #ifdef INET int (*carp_iamatch_p)(struct ifaddr *, uint8_t **); #endif #ifdef INET6 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6); caddr_t (*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr); #endif struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL; /* * XXX: Style; these should be sorted alphabetically, and unprototyped * static functions should be prototyped. Currently they are sorted by * declaration order. */ static void if_attachdomain(void *); static void if_attachdomain1(struct ifnet *); static int ifconf(u_long, caddr_t); static void *if_grow(void); static void if_input_default(struct ifnet *, struct mbuf *); static int if_requestencap_default(struct ifnet *, struct if_encap_req *); static void if_route(struct ifnet *, int flag, int fam); static int if_setflag(struct ifnet *, int, int, int *, int); static int if_transmit(struct ifnet *ifp, struct mbuf *m); static void if_unroute(struct ifnet *, int flag, int fam); static int if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int); static void do_link_state_change(void *, int); static int if_getgroup(struct ifgroupreq *, struct ifnet *); static int if_getgroupmembers(struct ifgroupreq *); static void if_delgroups(struct ifnet *); static void if_attach_internal(struct ifnet *, int, struct if_clone *); static int if_detach_internal(struct ifnet *, int, struct if_clone **); #ifdef VIMAGE static void if_vmove(struct ifnet *, struct vnet *); #endif #ifdef INET6 /* * XXX: declare here to avoid to include many inet6 related files.. * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); #endif /* ipsec helper hooks */ VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]); VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]); VNET_DEFINE(int, if_index); int ifqmaxlen = IFQ_MAXLEN; VNET_DEFINE(struct ifnethead, ifnet); /* depend on static init XXX */ VNET_DEFINE(struct ifgrouphead, ifg_head); VNET_DEFINE_STATIC(int, if_indexlim) = 8; /* Table of ifnet by index. */ VNET_DEFINE(struct ifnet **, ifindex_table); #define V_if_indexlim VNET(if_indexlim) #define V_ifindex_table VNET(ifindex_table) /* * The global network interface list (V_ifnet) and related state (such as * if_index, if_indexlim, and ifindex_table) are protected by an sxlock and * an rwlock. Either may be acquired shared to stablize the list, but both * must be acquired writable to modify the list. This model allows us to * both stablize the interface list during interrupt thread processing, but * also to stablize it over long-running ioctls, without introducing priority * inversions and deadlocks. */ struct rwlock ifnet_rwlock; RW_SYSINIT_FLAGS(ifnet_rw, &ifnet_rwlock, "ifnet_rw", RW_RECURSE); struct sx ifnet_sxlock; SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE); /* * The allocation of network interfaces is a rather non-atomic affair; we * need to select an index before we are ready to expose the interface for * use, so will use this pointer value to indicate reservation. */ #define IFNET_HOLD (void *)(uintptr_t)(-1) static if_com_alloc_t *if_com_alloc[256]; static if_com_free_t *if_com_free[256]; static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals"); MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); struct ifnet * ifnet_byindex_locked(u_short idx) { if (idx > V_if_index) return (NULL); if (V_ifindex_table[idx] == IFNET_HOLD) return (NULL); return (V_ifindex_table[idx]); } struct ifnet * ifnet_byindex(u_short idx) { struct ifnet *ifp; ifp = ifnet_byindex_locked(idx); return (ifp); } struct ifnet * ifnet_byindex_ref(u_short idx) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); ifp = ifnet_byindex_locked(idx); if (ifp == NULL || (ifp->if_flags & IFF_DYING)) { NET_EPOCH_EXIT(et); return (NULL); } if_ref(ifp); NET_EPOCH_EXIT(et); return (ifp); } /* * Allocate an ifindex array entry; return 0 on success or an error on * failure. */ static u_short ifindex_alloc(void **old) { u_short idx; IFNET_WLOCK_ASSERT(); /* * Try to find an empty slot below V_if_index. If we fail, take the * next slot. */ for (idx = 1; idx <= V_if_index; idx++) { if (V_ifindex_table[idx] == NULL) break; } /* Catch if_index overflow. */ if (idx >= V_if_indexlim) { *old = if_grow(); return (USHRT_MAX); } if (idx > V_if_index) V_if_index = idx; return (idx); } static void ifindex_free_locked(u_short idx) { IFNET_WLOCK_ASSERT(); V_ifindex_table[idx] = NULL; while (V_if_index > 0 && V_ifindex_table[V_if_index] == NULL) V_if_index--; } static void ifindex_free(u_short idx) { IFNET_WLOCK(); ifindex_free_locked(idx); IFNET_WUNLOCK(); } static void ifnet_setbyindex(u_short idx, struct ifnet *ifp) { V_ifindex_table[idx] = ifp; } struct ifaddr * ifaddr_byindex(u_short idx) { struct epoch_tracker et; struct ifnet *ifp; struct ifaddr *ifa = NULL; NET_EPOCH_ENTER(et); ifp = ifnet_byindex_locked(idx); if (ifp != NULL && (ifa = ifp->if_addr) != NULL) ifa_ref(ifa); NET_EPOCH_EXIT(et); return (ifa); } /* * Network interface utility routines. * * Routines with ifa_ifwith* names take sockaddr *'s as * parameters. */ static void vnet_if_init(const void *unused __unused) { void *old; CK_STAILQ_INIT(&V_ifnet); CK_STAILQ_INIT(&V_ifg_head); IFNET_WLOCK(); old = if_grow(); /* create initial table */ IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); vnet_if_clone_init(); } VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init, NULL); #ifdef VIMAGE static void vnet_if_uninit(const void *unused __unused) { VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifnet), ("%s:%d tailq &V_ifnet=%p " "not empty", __func__, __LINE__, &V_ifnet)); VNET_ASSERT(CK_STAILQ_EMPTY(&V_ifg_head), ("%s:%d tailq &V_ifg_head=%p " "not empty", __func__, __LINE__, &V_ifg_head)); free((caddr_t)V_ifindex_table, M_IFNET); } VNET_SYSUNINIT(vnet_if_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_if_uninit, NULL); static void vnet_if_return(const void *unused __unused) { struct ifnet *ifp, *nifp; /* Return all inherited interfaces to their parent vnets. */ CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) { if (ifp->if_home_vnet != ifp->if_vnet) if_vmove(ifp, ifp->if_home_vnet); } } VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY, vnet_if_return, NULL); #endif static void * if_grow(void) { int oldlim; u_int n; struct ifnet **e; void *old; old = NULL; IFNET_WLOCK_ASSERT(); oldlim = V_if_indexlim; IFNET_WUNLOCK(); n = (oldlim << 1) * sizeof(*e); e = malloc(n, M_IFNET, M_WAITOK | M_ZERO); IFNET_WLOCK(); if (V_if_indexlim != oldlim) { free(e, M_IFNET); return (NULL); } if (V_ifindex_table != NULL) { memcpy((caddr_t)e, (caddr_t)V_ifindex_table, n/2); old = V_ifindex_table; } V_if_indexlim <<= 1; V_ifindex_table = e; return (old); } /* * Allocate a struct ifnet and an index for an interface. A layer 2 * common structure will also be allocated if an allocation routine is * registered for the passed type. */ struct ifnet * if_alloc_domain(u_char type, int numa_domain) { struct ifnet *ifp; u_short idx; void *old; KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large")); if (numa_domain == IF_NODOM) ifp = malloc(sizeof(struct ifnet), M_IFNET, M_WAITOK | M_ZERO); else ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET, DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO); restart: IFNET_WLOCK(); idx = ifindex_alloc(&old); if (__predict_false(idx == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(idx, IFNET_HOLD); IFNET_WUNLOCK(); ifp->if_index = idx; ifp->if_type = type; ifp->if_alloctype = type; ifp->if_numa_domain = numa_domain; #ifdef VIMAGE ifp->if_vnet = curvnet; #endif if (if_com_alloc[type] != NULL) { ifp->if_l2com = if_com_alloc[type](type, ifp); if (ifp->if_l2com == NULL) { free(ifp, M_IFNET); ifindex_free(idx); return (NULL); } } IF_ADDR_LOCK_INIT(ifp); TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp); ifp->if_afdata_initialized = 0; IF_AFDATA_LOCK_INIT(ifp); CK_STAILQ_INIT(&ifp->if_addrhead); CK_STAILQ_INIT(&ifp->if_multiaddrs); CK_STAILQ_INIT(&ifp->if_groups); #ifdef MAC mac_ifnet_init(ifp); #endif ifq_init(&ifp->if_snd, ifp); refcount_init(&ifp->if_refcount, 1); /* Index reference. */ for (int i = 0; i < IFCOUNTERS; i++) ifp->if_counters[i] = counter_u64_alloc(M_WAITOK); ifp->if_get_counter = if_get_counter_default; ifp->if_pcp = IFNET_PCP_NONE; ifnet_setbyindex(ifp->if_index, ifp); return (ifp); } struct ifnet * if_alloc_dev(u_char type, device_t dev) { int numa_domain; if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0) return (if_alloc_domain(type, IF_NODOM)); return (if_alloc_domain(type, numa_domain)); } struct ifnet * if_alloc(u_char type) { return (if_alloc_domain(type, IF_NODOM)); } /* * Do the actual work of freeing a struct ifnet, and layer 2 common * structure. This call is made when the last reference to an * interface is released. */ static void if_free_internal(struct ifnet *ifp) { KASSERT((ifp->if_flags & IFF_DYING), ("if_free_internal: interface not dying")); if (if_com_free[ifp->if_alloctype] != NULL) if_com_free[ifp->if_alloctype](ifp->if_l2com, ifp->if_alloctype); #ifdef MAC mac_ifnet_destroy(ifp); #endif /* MAC */ IF_AFDATA_DESTROY(ifp); IF_ADDR_LOCK_DESTROY(ifp); ifq_delete(&ifp->if_snd); for (int i = 0; i < IFCOUNTERS; i++) counter_u64_free(ifp->if_counters[i]); free(ifp->if_description, M_IFDESCR); free(ifp->if_hw_addr, M_IFADDR); if (ifp->if_numa_domain == IF_NODOM) free(ifp, M_IFNET); else free_domain(ifp, M_IFNET); } static void if_destroy(epoch_context_t ctx) { struct ifnet *ifp; ifp = __containerof(ctx, struct ifnet, if_epoch_ctx); if_free_internal(ifp); } /* * Deregister an interface and free the associated storage. */ void if_free(struct ifnet *ifp) { ifp->if_flags |= IFF_DYING; /* XXX: Locking */ CURVNET_SET_QUIET(ifp->if_vnet); IFNET_WLOCK(); KASSERT(ifp == ifnet_byindex_locked(ifp->if_index), ("%s: freeing unallocated ifnet", ifp->if_xname)); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); if (refcount_release(&ifp->if_refcount)) epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); CURVNET_RESTORE(); } /* * Interfaces to keep an ifnet type-stable despite the possibility of the * driver calling if_free(). If there are additional references, we defer * freeing the underlying data structure. */ void if_ref(struct ifnet *ifp) { /* We don't assert the ifnet list lock here, but arguably should. */ refcount_acquire(&ifp->if_refcount); } void if_rele(struct ifnet *ifp) { if (!refcount_release(&ifp->if_refcount)) return; epoch_call(net_epoch_preempt, &ifp->if_epoch_ctx, if_destroy); } void ifq_init(struct ifaltq *ifq, struct ifnet *ifp) { mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF); if (ifq->ifq_maxlen == 0) ifq->ifq_maxlen = ifqmaxlen; ifq->altq_type = 0; ifq->altq_disc = NULL; ifq->altq_flags &= ALTQF_CANTCHANGE; ifq->altq_tbr = NULL; ifq->altq_ifp = ifp; } void ifq_delete(struct ifaltq *ifq) { mtx_destroy(&ifq->ifq_mtx); } /* * Perform generic interface initialization tasks and attach the interface * to the list of "active" interfaces. If vmove flag is set on entry * to if_attach_internal(), perform only a limited subset of initialization * tasks, given that we are moving from one vnet to another an ifnet which * has already been fully initialized. * * Note that if_detach_internal() removes group membership unconditionally * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL. * Thus, when if_vmove() is applied to a cloned interface, group membership * is lost while a cloned one always joins a group whose name is * ifc->ifc_name. To recover this after if_detach_internal() and * if_attach_internal(), the cloner should be specified to * if_attach_internal() via ifc. If it is non-NULL, if_attach_internal() * attempts to join a group whose name is ifc->ifc_name. * * XXX: * - The decision to return void and thus require this function to * succeed is questionable. * - We should probably do more sanity checking. For instance we don't * do anything to insure if_xname is unique or non-empty. */ void if_attach(struct ifnet *ifp) { if_attach_internal(ifp, 0, NULL); } /* * Compute the least common TSO limit. */ void if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax) { /* * 1) If there is no limit currently, take the limit from * the network adapter. * * 2) If the network adapter has a limit below the current * limit, apply it. */ if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 && ifp->if_hw_tsomax < pmax->tsomaxbytes)) { pmax->tsomaxbytes = ifp->if_hw_tsomax; } if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 && ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) { pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; } if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 && ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) { pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } /* * Update TSO limit of a network adapter. * * Returns zero if no change. Else non-zero. */ int if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax) { int retval = 0; if (ifp->if_hw_tsomax != pmax->tsomaxbytes) { ifp->if_hw_tsomax = pmax->tsomaxbytes; retval++; } if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) { ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize; retval++; } if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) { ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount; retval++; } return (retval); } static void if_attach_internal(struct ifnet *ifp, int vmove, struct if_clone *ifc) { unsigned socksize, ifasize; int namelen, masklen; struct sockaddr_dl *sdl; struct ifaddr *ifa; if (ifp->if_index == 0 || ifp != ifnet_byindex(ifp->if_index)) panic ("%s: BUG: if_attach called without if_alloc'd input()\n", ifp->if_xname); #ifdef VIMAGE ifp->if_vnet = curvnet; if (ifp->if_home_vnet == NULL) ifp->if_home_vnet = curvnet; #endif if_addgroup(ifp, IFG_ALL); /* Restore group membership for cloned interfaces. */ if (vmove && ifc != NULL) if_clone_addgroup(ifp, ifc); getmicrotime(&ifp->if_lastchange); ifp->if_epoch = time_uptime; KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) || (ifp->if_transmit != NULL && ifp->if_qflush != NULL), ("transmit and qflush must both either be set or both be NULL")); if (ifp->if_transmit == NULL) { ifp->if_transmit = if_transmit; ifp->if_qflush = if_qflush; } if (ifp->if_input == NULL) ifp->if_input = if_input_default; if (ifp->if_requestencap == NULL) ifp->if_requestencap = if_requestencap_default; if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); #endif /* * Create a Link Level name for this device. */ namelen = strlen(ifp->if_xname); /* * Always save enough space for any possiable name so we * can do a rename in place later. */ masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ; socksize = masklen + ifp->if_addrlen; if (socksize < sizeof(*sdl)) socksize = sizeof(*sdl); socksize = roundup2(socksize, sizeof(long)); ifasize = sizeof(*ifa) + 2 * socksize; ifa = ifa_alloc(ifasize, M_WAITOK); sdl = (struct sockaddr_dl *)(ifa + 1); sdl->sdl_len = socksize; sdl->sdl_family = AF_LINK; bcopy(ifp->if_xname, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; ifp->if_addr = ifa; ifa->ifa_ifp = ifp; ifa->ifa_addr = (struct sockaddr *)sdl; sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl); ifa->ifa_netmask = (struct sockaddr *)sdl; sdl->sdl_len = masklen; while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); /* Reliably crash if used uninitialized. */ ifp->if_broadcastaddr = NULL; if (ifp->if_type == IFT_ETHER) { ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR, M_WAITOK | M_ZERO); } #if defined(INET) || defined(INET6) /* Use defaults for TSO, if nothing is set */ if (ifp->if_hw_tsomax == 0 && ifp->if_hw_tsomaxsegcount == 0 && ifp->if_hw_tsomaxsegsize == 0) { /* * The TSO defaults needs to be such that an * NFS mbuf list of 35 mbufs totalling just * below 64K works and that a chain of mbufs * can be defragged into at most 32 segments: */ ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN)); ifp->if_hw_tsomaxsegcount = 35; ifp->if_hw_tsomaxsegsize = 2048; /* 2K */ /* XXX some drivers set IFCAP_TSO after ethernet attach */ if (ifp->if_capabilities & IFCAP_TSO) { if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } } #endif } #ifdef VIMAGE else { /* * Update the interface index in the link layer address * of the interface. */ for (ifa = ifp->if_addr; ifa != NULL; ifa = CK_STAILQ_NEXT(ifa, ifa_link)) { if (ifa->ifa_addr->sa_family == AF_LINK) { sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_index = ifp->if_index; } } } #endif IFNET_WLOCK(); CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link); #ifdef VIMAGE curvnet->vnet_ifcnt++; #endif IFNET_WUNLOCK(); if (domain_init_status >= 2) if_attachdomain1(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL); /* Announce the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); } static void if_epochalloc(void *dummy __unused) { net_epoch_preempt = epoch_alloc(EPOCH_PREEMPT); net_epoch = epoch_alloc(0); } SYSINIT(ifepochalloc, SI_SUB_TASKQ + 1, SI_ORDER_ANY, if_epochalloc, NULL); static void if_attachdomain(void *dummy) { struct ifnet *ifp; CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) if_attachdomain1(ifp); } SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND, if_attachdomain, NULL); static void if_attachdomain1(struct ifnet *ifp) { struct domain *dp; /* * Since dp->dom_ifattach calls malloc() with M_WAITOK, we * cannot lock ifp->if_afdata initialization, entirely. */ IF_AFDATA_LOCK(ifp); if (ifp->if_afdata_initialized >= domain_init_status) { IF_AFDATA_UNLOCK(ifp); log(LOG_WARNING, "%s called more than once on %s\n", __func__, ifp->if_xname); return; } ifp->if_afdata_initialized = domain_init_status; IF_AFDATA_UNLOCK(ifp); /* address family dependent data region */ bzero(ifp->if_afdata, sizeof(ifp->if_afdata)); for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_ifattach) ifp->if_afdata[dp->dom_family] = (*dp->dom_ifattach)(ifp); } } /* * Remove any unicast or broadcast network addresses from an interface. */ void if_purgeaddrs(struct ifnet *ifp) { struct ifaddr *ifa; while (1) { struct epoch_tracker et; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_LINK) break; } NET_EPOCH_EXIT(et); if (ifa == NULL) break; #ifdef INET /* XXX: Ugly!! ad hoc just for INET */ if (ifa->ifa_addr->sa_family == AF_INET) { struct ifaliasreq ifr; bzero(&ifr, sizeof(ifr)); ifr.ifra_addr = *ifa->ifa_addr; if (ifa->ifa_dstaddr) ifr.ifra_broadaddr = *ifa->ifa_dstaddr; if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, NULL) == 0) continue; } #endif /* INET */ #ifdef INET6 if (ifa->ifa_addr->sa_family == AF_INET6) { in6_purgeaddr(ifa); /* ifp_addrhead is already updated */ continue; } #endif /* INET6 */ IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } } /* * Remove any multicast network addresses from an interface when an ifnet * is going away. */ static void if_purgemaddrs(struct ifnet *ifp) { struct ifmultiaddr *ifma; IF_ADDR_WLOCK(ifp); while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) { ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs); CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); if_delmulti_locked(ifp, ifma, 1); } IF_ADDR_WUNLOCK(ifp); } /* * Detach an interface, removing it from the list of "active" interfaces. * If vmove flag is set on entry to if_detach_internal(), perform only a * limited subset of cleanup tasks, given that we are moving an ifnet from * one vnet to another, where it must be fully operational. * * XXXRW: There are some significant questions about event ordering, and * how to prevent things from starting to use the interface during detach. */ void if_detach(struct ifnet *ifp) { CURVNET_SET_QUIET(ifp->if_vnet); if_detach_internal(ifp, 0, NULL); CURVNET_RESTORE(); } /* * The vmove flag, if set, indicates that we are called from a callpath * that is moving an interface to a different vnet instance. * * The shutdown flag, if set, indicates that we are called in the * process of shutting down a vnet instance. Currently only the * vnet_if_return SYSUNINIT function sets it. Note: we can be called * on a vnet instance shutdown without this flag being set, e.g., when * the cloned interfaces are destoyed as first thing of teardown. */ static int if_detach_internal(struct ifnet *ifp, int vmove, struct if_clone **ifcp) { struct ifaddr *ifa; int i; struct domain *dp; struct ifnet *iter; int found = 0; #ifdef VIMAGE int shutdown; shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; #endif IFNET_WLOCK(); CK_STAILQ_FOREACH(iter, &V_ifnet, if_link) if (iter == ifp) { CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link); if (!vmove) ifp->if_flags |= IFF_DYING; found = 1; break; } IFNET_WUNLOCK(); if (!found) { /* * While we would want to panic here, we cannot * guarantee that the interface is indeed still on * the list given we don't hold locks all the way. */ return (ENOENT); #if 0 if (vmove) panic("%s: ifp=%p not on the ifnet tailq %p", __func__, ifp, &V_ifnet); else return; /* XXX this should panic as well? */ #endif } /* * At this point we know the interface still was on the ifnet list * and we removed it so we are in a stable state. */ #ifdef VIMAGE curvnet->vnet_ifcnt--; #endif epoch_wait_preempt(net_epoch_preempt); /* * In any case (destroy or vmove) detach us from the groups * and remove/wait for pending events on the taskq. * XXX-BZ in theory an interface could still enqueue a taskq change? */ if_delgroups(ifp); taskqueue_drain(taskqueue_swi, &ifp->if_linktask); /* * Check if this is a cloned interface or not. Must do even if * shutting down as a if_vmove_reclaim() would move the ifp and * the if_clone_addgroup() will have a corrupted string overwise * from a gibberish pointer. */ if (vmove && ifcp != NULL) *ifcp = if_clone_findifc(ifp); if_down(ifp); #ifdef VIMAGE /* * On VNET shutdown abort here as the stack teardown will do all * the work top-down for us. */ if (shutdown) { /* Give interface users the chance to clean up. */ EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); /* * In case of a vmove we are done here without error. * If we would signal an error it would lead to the same * abort as if we did not find the ifnet anymore. * if_detach() calls us in void context and does not care * about an early abort notification, so life is splendid :) */ goto finish_vnet_shutdown; } #endif /* * At this point we are not tearing down a VNET and are either * going to destroy or vmove the interface and have to cleanup * accordingly. */ /* * Remove routes and flush queues. */ #ifdef ALTQ if (ALTQ_IS_ENABLED(&ifp->if_snd)) altq_disable(&ifp->if_snd); if (ALTQ_IS_ATTACHED(&ifp->if_snd)) altq_detach(&ifp->if_snd); #endif if_purgeaddrs(ifp); #ifdef INET in_ifdetach(ifp); #endif #ifdef INET6 /* * Remove all IPv6 kernel structs related to ifp. This should be done * before removing routing entries below, since IPv6 interface direct * routes are expected to be removed by the IPv6-specific kernel API. * Otherwise, the kernel will detect some inconsistency and bark it. */ in6_ifdetach(ifp); #endif if_purgemaddrs(ifp); /* Announce that the interface is gone. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL); if (!vmove) { /* * Prevent further calls into the device driver via ifnet. */ if_dead(ifp); /* * Clean up all addresses. */ IF_ADDR_WLOCK(ifp); if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) { ifa = CK_STAILQ_FIRST(&ifp->if_addrhead); CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(ifa); } else IF_ADDR_WUNLOCK(ifp); } rt_flushifroutes(ifp); #ifdef VIMAGE finish_vnet_shutdown: #endif /* * We cannot hold the lock over dom_ifdetach calls as they might * sleep, for example trying to drain a callout, thus open up the * theoretical race with re-attaching. */ IF_AFDATA_LOCK(ifp); i = ifp->if_afdata_initialized; ifp->if_afdata_initialized = 0; IF_AFDATA_UNLOCK(ifp); for (dp = domains; i > 0 && dp; dp = dp->dom_next) { if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) { (*dp->dom_ifdetach)(ifp, ifp->if_afdata[dp->dom_family]); ifp->if_afdata[dp->dom_family] = NULL; } } return (0); } #ifdef VIMAGE /* * if_vmove() performs a limited version of if_detach() in current * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg. * An attempt is made to shrink if_index in current vnet, find an * unused if_index in target vnet and calls if_grow() if necessary, * and finally find an unused if_xname for the target vnet. */ static void if_vmove(struct ifnet *ifp, struct vnet *new_vnet) { struct if_clone *ifc; u_int bif_dlt, bif_hdrlen; void *old; int rc; /* * if_detach_internal() will call the eventhandler to notify * interface departure. That will detach if_bpf. We need to * safe the dlt and hdrlen so we can re-attach it later. */ bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen); /* * Detach from current vnet, but preserve LLADDR info, do not * mark as dead etc. so that the ifnet can be reattached later. * If we cannot find it, we lost the race to someone else. */ rc = if_detach_internal(ifp, 1, &ifc); if (rc != 0) return; /* * Unlink the ifnet from ifindex_table[] in current vnet, and shrink * the if_index for that vnet if possible. * * NOTE: IFNET_WLOCK/IFNET_WUNLOCK() are assumed to be unvirtualized, * or we'd lock on one vnet and unlock on another. */ IFNET_WLOCK(); ifindex_free_locked(ifp->if_index); IFNET_WUNLOCK(); /* * Perform interface-specific reassignment tasks, if provided by * the driver. */ if (ifp->if_reassign != NULL) ifp->if_reassign(ifp, new_vnet, NULL); /* * Switch to the context of the target vnet. */ CURVNET_SET_QUIET(new_vnet); restart: IFNET_WLOCK(); ifp->if_index = ifindex_alloc(&old); if (__predict_false(ifp->if_index == USHRT_MAX)) { IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(old, M_IFNET); goto restart; } ifnet_setbyindex(ifp->if_index, ifp); IFNET_WUNLOCK(); if_attach_internal(ifp, 1, ifc); if (ifp->if_bpf == NULL) bpfattach(ifp, bif_dlt, bif_hdrlen); CURVNET_RESTORE(); } /* * Move an ifnet to or from another child prison/vnet, specified by the jail id. */ static int if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid) { struct prison *pr; struct ifnet *difp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Do not try to move the iface from and to the same prison. */ if (pr->pr_vnet == ifp->if_vnet) { prison_free(pr); return (EEXIST); } /* Make sure the named iface does not exists in the dst. prison/vnet. */ /* XXX Lock interfaces to avoid races. */ CURVNET_SET_QUIET(pr->pr_vnet); difp = ifunit(ifname); if (difp != NULL) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } CURVNET_RESTORE(); /* Move the interface into the child jail/vnet. */ if_vmove(ifp, pr->pr_vnet); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } static int if_vmove_reclaim(struct thread *td, char *ifname, int jid) { struct prison *pr; struct vnet *vnet_dst; struct ifnet *ifp; int shutdown; /* Try to find the prison within our visibility. */ sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, jid); sx_sunlock(&allprison_lock); if (pr == NULL) return (ENXIO); prison_hold_locked(pr); mtx_unlock(&pr->pr_mtx); /* Make sure the named iface exists in the source prison/vnet. */ CURVNET_SET(pr->pr_vnet); ifp = ifunit(ifname); /* XXX Lock to avoid races. */ if (ifp == NULL) { CURVNET_RESTORE(); prison_free(pr); return (ENXIO); } /* Do not try to move the iface from and to the same prison. */ vnet_dst = TD_TO_VNET(td); if (vnet_dst == ifp->if_vnet) { CURVNET_RESTORE(); prison_free(pr); return (EEXIST); } /* Make sure the VNET is stable. */ shutdown = (ifp->if_vnet->vnet_state > SI_SUB_VNET && ifp->if_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); prison_free(pr); return (EBUSY); } /* Get interface back from child jail/vnet. */ if_vmove(ifp, vnet_dst); CURVNET_RESTORE(); /* Report the new if_xname back to the userland. */ sprintf(ifname, "%s", ifp->if_xname); prison_free(pr); return (0); } #endif /* VIMAGE */ /* * Add a group to an interface */ int if_addgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_group *ifg = NULL; struct ifg_member *ifgm; int new = 0; if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' && groupname[strlen(groupname) - 1] <= '9') return (EINVAL); IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) { IFNET_WUNLOCK(); return (EEXIST); } if ((ifgl = (struct ifg_list *)malloc(sizeof(struct ifg_list), M_TEMP, M_NOWAIT)) == NULL) { IFNET_WUNLOCK(); return (ENOMEM); } if ((ifgm = (struct ifg_member *)malloc(sizeof(struct ifg_member), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, groupname)) break; if (ifg == NULL) { if ((ifg = (struct ifg_group *)malloc(sizeof(struct ifg_group), M_TEMP, M_NOWAIT)) == NULL) { free(ifgl, M_TEMP); free(ifgm, M_TEMP); IFNET_WUNLOCK(); return (ENOMEM); } strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group)); ifg->ifg_refcnt = 0; CK_STAILQ_INIT(&ifg->ifg_members); CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next); new = 1; } ifg->ifg_refcnt++; ifgl->ifgl_group = ifg; ifgm->ifgm_ifp = ifp; IF_ADDR_WLOCK(ifp); CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next); CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next); IF_ADDR_WUNLOCK(ifp); IFNET_WUNLOCK(); if (new) EVENTHANDLER_INVOKE(group_attach_event, ifg); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove a group from an interface */ int if_delgroup(struct ifnet *ifp, const char *groupname) { struct ifg_list *ifgl; struct ifg_member *ifgm; int freeifgl; IFNET_WLOCK(); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) break; if (ifgl == NULL) { IFNET_WUNLOCK(); return (ENOENT); } freeifgl = 0; IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); freeifgl = 1; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); if (freeifgl) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } free(ifgm, M_TEMP); free(ifgl, M_TEMP); EVENTHANDLER_INVOKE(group_change_event, groupname); return (0); } /* * Remove an interface from all groups */ static void if_delgroups(struct ifnet *ifp) { struct ifg_list *ifgl; struct ifg_member *ifgm; char groupname[IFNAMSIZ]; int ifglfree; IFNET_WLOCK(); while (!CK_STAILQ_EMPTY(&ifp->if_groups)) { ifgl = CK_STAILQ_FIRST(&ifp->if_groups); strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ); IF_ADDR_WLOCK(ifp); CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next); IF_ADDR_WUNLOCK(ifp); CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) if (ifgm->ifgm_ifp == ifp) break; if (ifgm != NULL) CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifg_member, ifgm_next); ifglfree = 0; if (--ifgl->ifgl_group->ifg_refcnt == 0) { CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group, ifg_next); ifglfree = 1; } IFNET_WUNLOCK(); epoch_wait_preempt(net_epoch_preempt); free(ifgm, M_TEMP); if (ifglfree) { EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group); free(ifgl->ifgl_group, M_TEMP); } EVENTHANDLER_INVOKE(group_change_event, groupname); IFNET_WLOCK(); } IFNET_WUNLOCK(); } static char * ifgr_group_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (&ifgrup->ifgr32.ifgr_ifgru.ifgru_group[0]); #endif return (&ifgrup->ifgr.ifgr_ifgru.ifgru_group[0]); } static struct ifg_req * ifgr_groups_get(void *ifgrp) { union ifgroupreq_union *ifgrup; ifgrup = ifgrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((struct ifg_req *)(uintptr_t) ifgrup->ifgr32.ifgr_ifgru.ifgru_groups); #endif return (ifgrup->ifgr.ifgr_ifgru.ifgru_groups); } /* * Stores all groups from an interface in memory pointed to by ifgr. */ static int if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp) { struct epoch_tracker et; int len, error; struct ifg_list *ifgl; struct ifg_req ifgrq, *ifgp; if (ifgr->ifgr_len == 0) { NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) ifgr->ifgr_len += sizeof(struct ifg_req); NET_EPOCH_EXIT(et); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); /* XXX: wire */ NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) { if (len < sizeof(ifgrq)) { NET_EPOCH_EXIT(et); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group, sizeof(ifgrq.ifgrq_group)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { NET_EPOCH_EXIT(et); return (error); } len -= sizeof(ifgrq); ifgp++; } NET_EPOCH_EXIT(et); return (0); } /* * Stores all members of a group in memory pointed to by igfr */ static int if_getgroupmembers(struct ifgroupreq *ifgr) { struct ifg_group *ifg; struct ifg_member *ifgm; struct ifg_req ifgrq, *ifgp; int len, error; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) if (!strcmp(ifg->ifg_group, ifgr->ifgr_name)) break; if (ifg == NULL) { IFNET_RUNLOCK(); return (ENOENT); } if (ifgr->ifgr_len == 0) { CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) ifgr->ifgr_len += sizeof(ifgrq); IFNET_RUNLOCK(); return (0); } len = ifgr->ifgr_len; ifgp = ifgr_groups_get(ifgr); CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) { if (len < sizeof(ifgrq)) { IFNET_RUNLOCK(); return (EINVAL); } bzero(&ifgrq, sizeof ifgrq); strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname, sizeof(ifgrq.ifgrq_member)); if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) { IFNET_RUNLOCK(); return (error); } len -= sizeof(ifgrq); ifgp++; } IFNET_RUNLOCK(); return (0); } /* * Return counter values from counter(9)s stored in ifnet. */ uint64_t if_get_counter_default(struct ifnet *ifp, ift_counter cnt) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); return (counter_u64_fetch(ifp->if_counters[cnt])); } /* * Increase an ifnet counter. Usually used for counters shared * between the stack and a driver, but function supports them all. */ void if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc) { KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); counter_u64_add(ifp->if_counters[cnt], inc); } /* * Copy data from ifnet to userland API structure if_data. */ void if_data_copy(struct ifnet *ifp, struct if_data *ifd) { ifd->ifi_type = ifp->if_type; ifd->ifi_physical = 0; ifd->ifi_addrlen = ifp->if_addrlen; ifd->ifi_hdrlen = ifp->if_hdrlen; ifd->ifi_link_state = ifp->if_link_state; ifd->ifi_vhid = 0; ifd->ifi_datalen = sizeof(struct if_data); ifd->ifi_mtu = ifp->if_mtu; ifd->ifi_metric = ifp->if_metric; ifd->ifi_baudrate = ifp->if_baudrate; ifd->ifi_hwassist = ifp->if_hwassist; ifd->ifi_epoch = ifp->if_epoch; ifd->ifi_lastchange = ifp->if_lastchange; ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS); ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS); ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS); ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS); ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS); ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES); ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES); ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS); ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS); ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS); ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS); ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO); } /* * Wrapper functions for struct ifnet address list locking macros. These are * used by kernel modules to avoid encoding programming interface or binary * interface assumptions that may be violated when kernel-internal locking * approaches change. */ void if_addr_rlock(struct ifnet *ifp) { epoch_enter_preempt(net_epoch_preempt, curthread->td_et); } void if_addr_runlock(struct ifnet *ifp) { epoch_exit_preempt(net_epoch_preempt, curthread->td_et); } void if_maddr_rlock(if_t ifp) { epoch_enter_preempt(net_epoch_preempt, curthread->td_et); } void if_maddr_runlock(if_t ifp) { epoch_exit_preempt(net_epoch_preempt, curthread->td_et); } /* * Initialization, destruction and refcounting functions for ifaddrs. */ struct ifaddr * ifa_alloc(size_t size, int flags) { struct ifaddr *ifa; KASSERT(size >= sizeof(struct ifaddr), ("%s: invalid size %zu", __func__, size)); ifa = malloc(size, M_IFADDR, M_ZERO | flags); if (ifa == NULL) return (NULL); if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL) goto fail; if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL) goto fail; refcount_init(&ifa->ifa_refcnt, 1); return (ifa); fail: /* free(NULL) is okay */ counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); return (NULL); } void ifa_ref(struct ifaddr *ifa) { refcount_acquire(&ifa->ifa_refcnt); } static void ifa_destroy(epoch_context_t ctx) { struct ifaddr *ifa; ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx); counter_u64_free(ifa->ifa_opackets); counter_u64_free(ifa->ifa_ipackets); counter_u64_free(ifa->ifa_obytes); counter_u64_free(ifa->ifa_ibytes); free(ifa, M_IFADDR); } void ifa_free(struct ifaddr *ifa) { if (refcount_release(&ifa->ifa_refcnt)) epoch_call(net_epoch_preempt, &ifa->ifa_epoch_ctx, ifa_destroy); } static int ifa_maintain_loopback_route(int cmd, const char *otype, struct ifaddr *ifa, struct sockaddr *ia) { struct epoch_tracker et; int error; struct rt_addrinfo info; struct sockaddr_dl null_sdl; struct ifnet *ifp; ifp = ifa->ifa_ifp; bzero(&info, sizeof(info)); if (cmd != RTM_DELETE) info.rti_ifp = V_loif; if (cmd == RTM_ADD) { /* explicitly specify (loopback) ifa */ if (info.rti_ifp != NULL) { NET_EPOCH_ENTER(et); info.rti_ifa = ifaof_ifpforaddr(ifa->ifa_addr, info.rti_ifp); if (info.rti_ifa != NULL) ifa_ref(info.rti_ifa); NET_EPOCH_EXIT(et); } } info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED; info.rti_info[RTAX_DST] = ia; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&null_sdl; link_init_sdl(ifp, (struct sockaddr *)&null_sdl, ifp->if_type); error = rtrequest1_fib(cmd, &info, NULL, ifp->if_fib); if (error != 0 && !(cmd == RTM_ADD && error == EEXIST) && !(cmd == RTM_DELETE && error == ENOENT)) if_printf(ifp, "%s failed: %d\n", otype, error); return (error); } int ifa_add_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_ADD, "insertion", ifa, ia)); } int ifa_del_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_DELETE, "deletion", ifa, ia)); } int ifa_switch_loopback_route(struct ifaddr *ifa, struct sockaddr *ia) { return (ifa_maintain_loopback_route(RTM_CHANGE, "switch", ifa, ia)); } /* * XXX: Because sockaddr_dl has deeper structure than the sockaddr * structs used to represent other address families, it is necessary * to perform a different comparison. */ #define sa_dl_equal(a1, a2) \ ((((const struct sockaddr_dl *)(a1))->sdl_len == \ ((const struct sockaddr_dl *)(a2))->sdl_len) && \ (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)), \ CLLADDR((const struct sockaddr_dl *)(a2)), \ ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0)) /* * Locate an interface based on a complete address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (sa_equal(addr, ifa->ifa_addr)) { goto done; } /* IP6 doesn't have broadcast */ if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } int ifa_ifwithaddr_check(const struct sockaddr *addr) { struct epoch_tracker et; int rc; NET_EPOCH_ENTER(et); rc = (ifa_ifwithaddr(addr) != NULL); NET_EPOCH_EXIT(et); return (rc); } /* * Locate an interface based on the broadcast address. */ /* ARGSUSED */ struct ifaddr * ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && ifa->ifa_broadaddr->sa_len != 0 && sa_equal(ifa->ifa_broadaddr, addr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Locate the point to point interface with a given destination address. */ /*ARGSUSED*/ struct ifaddr * ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((ifp->if_flags & IFF_POINTOPOINT) == 0) continue; if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != addr->sa_family) continue; if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } } ifa = NULL; done: return (ifa); } /* * Find an interface on a specific network. If many, choice * is most specific found. */ struct ifaddr * ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; MPASS(in_epoch(net_epoch_preempt)); /* * AF_LINK addresses can be looked up directly by their index number, * so do that if we can. */ if (af == AF_LINK) { const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; if (sdl->sdl_index && sdl->sdl_index <= V_if_index) return (ifaddr_byindex(sdl->sdl_index)); } /* * Scan though each interface, looking for ones that have addresses * in this address family and the requested fib. */ CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum)) continue; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { const char *cp, *cp2, *cp3; if (ifa->ifa_addr->sa_family != af) next: continue; if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) { /* * This is a bit broken as it doesn't * take into account that the remote end may * be a single node in the network we are * looking for. * The trouble is that we don't know the * netmask for the remote end. */ if (ifa->ifa_dstaddr != NULL && sa_equal(addr, ifa->ifa_dstaddr)) { goto done; } } else { /* * Scan all the bits in the ifa's address. * If a bit dissagrees with what we are * looking for, mask it with the netmask * to see if it really matters. * (A byte at a time) */ if (ifa->ifa_netmask == 0) continue; cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; while (cp3 < cplim) if ((*cp++ ^ *cp2++) & *cp3++) goto next; /* next address! */ /* * If the netmask of what we just found * is more specific than what we had before * (if we had one), or if the virtual status * of new prefix is better than of the old one, * then remember the new one before continuing * to search for an even better one. */ if (ifa_maybe == NULL || ifa_preferred(ifa_maybe, ifa) || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { ifa_maybe = ifa; } } } } ifa = ifa_maybe; ifa_maybe = NULL; done: return (ifa); } /* * Find an interface address specific to an interface best matching * a given address. */ struct ifaddr * ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa; const char *cp, *cp2, *cp3; char *cplim; struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; if (af >= AF_MAX) return (NULL); MPASS(in_epoch(net_epoch_preempt)); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != af) continue; if (ifa_maybe == NULL) ifa_maybe = ifa; if (ifa->ifa_netmask == 0) { if (sa_equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && sa_equal(addr, ifa->ifa_dstaddr))) goto done; continue; } if (ifp->if_flags & IFF_POINTOPOINT) { if (sa_equal(addr, ifa->ifa_dstaddr)) goto done; } else { cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) goto done; } } ifa = ifa_maybe; done: return (ifa); } /* * See whether new ifa is better than current one: * 1) A non-virtual one is preferred over virtual. * 2) A virtual in master state preferred over any other state. * * Used in several address selecting functions. */ int ifa_preferred(struct ifaddr *cur, struct ifaddr *next) { return (cur->ifa_carp && (!next->ifa_carp || ((*carp_master_p)(next) && !(*carp_master_p)(cur)))); } struct sockaddr_dl * link_alloc_sdl(size_t size, int flags) { return (malloc(size, M_TEMP, flags)); } void link_free_sdl(struct sockaddr *sa) { free(sa, M_TEMP); } /* * Fills in given sdl with interface basic info. * Returns pointer to filled sdl. */ struct sockaddr_dl * link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)paddr; memset(sdl, 0, sizeof(struct sockaddr_dl)); sdl->sdl_len = sizeof(struct sockaddr_dl); sdl->sdl_family = AF_LINK; sdl->sdl_index = ifp->if_index; sdl->sdl_type = iftype; return (sdl); } /* * Mark an interface down and notify protocols of * the transition. */ static void if_unroute(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP")); ifp->if_flags &= ~flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFDOWN, ifa->ifa_addr); ifp->if_qflush(ifp); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); } /* * Mark an interface up and notify protocols of * the transition. */ static void if_route(struct ifnet *ifp, int flag, int fam) { struct ifaddr *ifa; KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP")); ifp->if_flags |= flag; getmicrotime(&ifp->if_lastchange); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family)) pfctlinput(PRC_IFUP, ifa->ifa_addr); if (ifp->if_carp) (*carp_linkstate_p)(ifp); rt_ifmsg(ifp); #ifdef INET6 in6_if_up(ifp); #endif } void (*vlan_link_state_p)(struct ifnet *); /* XXX: private from if_vlan */ void (*vlan_trunk_cap_p)(struct ifnet *); /* XXX: private from if_vlan */ struct ifnet *(*vlan_trunkdev_p)(struct ifnet *); struct ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t); int (*vlan_tag_p)(struct ifnet *, uint16_t *); int (*vlan_pcp_p)(struct ifnet *, uint16_t *); int (*vlan_setcookie_p)(struct ifnet *, void *); void *(*vlan_cookie_p)(struct ifnet *); /* * Handle a change in the interface link state. To avoid LORs * between driver lock and upper layer locks, as well as possible * recursions, we post event to taskqueue, and all job * is done in static do_link_state_change(). */ void if_link_state_change(struct ifnet *ifp, int link_state) { /* Return if state hasn't changed. */ if (ifp->if_link_state == link_state) return; ifp->if_link_state = link_state; taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask); } static void do_link_state_change(void *arg, int pending) { struct ifnet *ifp = (struct ifnet *)arg; int link_state = ifp->if_link_state; CURVNET_SET(ifp->if_vnet); /* Notify that the link state has changed. */ rt_ifmsg(ifp); if (ifp->if_vlantrunk != NULL) (*vlan_link_state_p)(ifp); if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) && ifp->if_l2com != NULL) (*ng_ether_link_state_p)(ifp, link_state); if (ifp->if_carp) (*carp_linkstate_p)(ifp); if (ifp->if_bridge) ifp->if_bridge_linkstate(ifp); if (ifp->if_lagg) (*lagg_linkstate_p)(ifp, link_state); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("IFNET", ifp->if_xname, (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL); if (pending > 1) if_printf(ifp, "%d link states coalesced\n", pending); if (log_link_state_change) if_printf(ifp, "link state changed to %s\n", (link_state == LINK_STATE_UP) ? "UP" : "DOWN" ); EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state); CURVNET_RESTORE(); } /* * Mark an interface down and notify protocols of * the transition. */ void if_down(struct ifnet *ifp) { EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN); if_unroute(ifp, IFF_UP, AF_UNSPEC); } /* * Mark an interface up and notify protocols of * the transition. */ void if_up(struct ifnet *ifp) { if_route(ifp, IFF_UP, AF_UNSPEC); EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP); } /* * Flush an interface queue. */ void if_qflush(struct ifnet *ifp) { struct mbuf *m, *n; struct ifaltq *ifq; ifq = &ifp->if_snd; IFQ_LOCK(ifq); #ifdef ALTQ if (ALTQ_IS_ENABLED(ifq)) ALTQ_PURGE(ifq); #endif n = ifq->ifq_head; while ((m = n) != NULL) { n = m->m_nextpkt; m_freem(m); } ifq->ifq_head = 0; ifq->ifq_tail = 0; ifq->ifq_len = 0; IFQ_UNLOCK(ifq); } /* * Map interface name to interface structure pointer, with or without * returning a reference. */ struct ifnet * ifunit_ref(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 && !(ifp->if_flags & IFF_DYING)) break; } if (ifp != NULL) if_ref(ifp); NET_EPOCH_EXIT(et); return (ifp); } struct ifnet * ifunit(const char *name) { struct epoch_tracker et; struct ifnet *ifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0) break; } NET_EPOCH_EXIT(et); return (ifp); } static void * ifr_buffer_get_buffer(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer); } static void ifr_buffer_set_buffer_null(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL; } static size_t ifr_buffer_get_length(void *data) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return (ifrup->ifr32.ifr_ifru.ifru_buffer.length); #endif return (ifrup->ifr.ifr_ifru.ifru_buffer.length); } static void ifr_buffer_set_length(void *data, size_t len) { union ifreq_union *ifrup; ifrup = data; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) ifrup->ifr32.ifr_ifru.ifru_buffer.length = len; else #endif ifrup->ifr.ifr_ifru.ifru_buffer.length = len; } void * ifr_data_get_ptr(void *ifrp) { union ifreq_union *ifrup; ifrup = ifrp; #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) return ((void *)(uintptr_t) ifrup->ifr32.ifr_ifru.ifru_data); #endif return (ifrup->ifr.ifr_ifru.ifru_data); } /* * Hardware specific interface ioctls. */ int ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td) { struct ifreq *ifr; int error = 0, do_ifup = 0; int new_flags, temp_flags; size_t namelen, onamelen; size_t descrlen; char *descrbuf, *odescrbuf; char new_name[IFNAMSIZ]; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifr = (struct ifreq *)data; switch (cmd) { case SIOCGIFINDEX: ifr->ifr_index = ifp->if_index; break; case SIOCGIFFLAGS: temp_flags = ifp->if_flags | ifp->if_drv_flags; ifr->ifr_flags = temp_flags & 0xffff; ifr->ifr_flagshigh = temp_flags >> 16; break; case SIOCGIFCAP: ifr->ifr_reqcap = ifp->if_capabilities; ifr->ifr_curcap = ifp->if_capenable; break; #ifdef MAC case SIOCGIFMAC: error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp); break; #endif case SIOCGIFMETRIC: ifr->ifr_metric = ifp->if_metric; break; case SIOCGIFMTU: ifr->ifr_mtu = ifp->if_mtu; break; case SIOCGIFPHYS: /* XXXGL: did this ever worked? */ ifr->ifr_phys = 0; break; case SIOCGIFDESCR: error = 0; sx_slock(&ifdescr_sx); if (ifp->if_description == NULL) error = ENOMSG; else { /* space for terminating nul */ descrlen = strlen(ifp->if_description) + 1; if (ifr_buffer_get_length(ifr) < descrlen) ifr_buffer_set_buffer_null(ifr); else error = copyout(ifp->if_description, ifr_buffer_get_buffer(ifr), descrlen); ifr_buffer_set_length(ifr, descrlen); } sx_sunlock(&ifdescr_sx); break; case SIOCSIFDESCR: error = priv_check(td, PRIV_NET_SETIFDESCR); if (error) return (error); /* * Copy only (length-1) bytes to make sure that * if_description is always nul terminated. The * length parameter is supposed to count the * terminating nul in. */ if (ifr_buffer_get_length(ifr) > ifdescr_maxlen) return (ENAMETOOLONG); else if (ifr_buffer_get_length(ifr) == 0) descrbuf = NULL; else { descrbuf = malloc(ifr_buffer_get_length(ifr), M_IFDESCR, M_WAITOK | M_ZERO); error = copyin(ifr_buffer_get_buffer(ifr), descrbuf, ifr_buffer_get_length(ifr) - 1); if (error) { free(descrbuf, M_IFDESCR); break; } } sx_xlock(&ifdescr_sx); odescrbuf = ifp->if_description; ifp->if_description = descrbuf; sx_xunlock(&ifdescr_sx); getmicrotime(&ifp->if_lastchange); free(odescrbuf, M_IFDESCR); break; case SIOCGIFFIB: ifr->ifr_fib = ifp->if_fib; break; case SIOCSIFFIB: error = priv_check(td, PRIV_NET_SETIFFIB); if (error) return (error); if (ifr->ifr_fib >= rt_numfibs) return (EINVAL); ifp->if_fib = ifr->ifr_fib; break; case SIOCSIFFLAGS: error = priv_check(td, PRIV_NET_SETIFFLAGS); if (error) return (error); /* * Currently, no driver owned flags pass the IFF_CANTCHANGE * check, so we don't need special handling here yet. */ new_flags = (ifr->ifr_flags & 0xffff) | (ifr->ifr_flagshigh << 16); if (ifp->if_flags & IFF_UP && (new_flags & IFF_UP) == 0) { if_down(ifp); } else if (new_flags & IFF_UP && (ifp->if_flags & IFF_UP) == 0) { do_ifup = 1; } /* See if permanently promiscuous mode bit is about to flip */ if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) { if (new_flags & IFF_PPROMISC) ifp->if_flags |= IFF_PROMISC; else if (ifp->if_pcount == 0) ifp->if_flags &= ~IFF_PROMISC; if (log_promisc_mode_change) if_printf(ifp, "permanently promiscuous mode %s\n", ((new_flags & IFF_PPROMISC) ? "enabled" : "disabled")); } ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) | (new_flags &~ IFF_CANTCHANGE); if (ifp->if_ioctl) { (void) (*ifp->if_ioctl)(ifp, cmd, data); } if (do_ifup) if_up(ifp); getmicrotime(&ifp->if_lastchange); break; case SIOCSIFCAP: error = priv_check(td, PRIV_NET_SETIFCAP); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); if (ifr->ifr_reqcap & ~ifp->if_capabilities) return (EINVAL); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; #ifdef MAC case SIOCSIFMAC: error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp); break; #endif case SIOCSIFNAME: error = priv_check(td, PRIV_NET_SETIFNAME); if (error) return (error); error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ, NULL); if (error != 0) return (error); if (new_name[0] == '\0') return (EINVAL); if (new_name[IFNAMSIZ-1] != '\0') { new_name[IFNAMSIZ-1] = '\0'; if (strlen(new_name) == IFNAMSIZ-1) return (EINVAL); } if (ifunit(new_name) != NULL) return (EEXIST); /* * XXX: Locking. Nothing else seems to lock if_flags, * and there are numerous other races with the * ifunit() checks not being atomic with namespace * changes (renames, vmoves, if_attach, etc). */ ifp->if_flags |= IFF_RENAMING; /* Announce the departure of the interface. */ rt_ifannouncemsg(ifp, IFAN_DEPARTURE); EVENTHANDLER_INVOKE(ifnet_departure_event, ifp); if_printf(ifp, "changing name to '%s'\n", new_name); IF_ADDR_WLOCK(ifp); strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname)); ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; namelen = strlen(new_name); onamelen = sdl->sdl_nlen; /* * Move the address if needed. This is safe because we * allocate space for a name of length IFNAMSIZ when we * create this in if_attach(). */ if (namelen != onamelen) { bcopy(sdl->sdl_data + onamelen, sdl->sdl_data + namelen, sdl->sdl_alen); } bcopy(new_name, sdl->sdl_data, namelen); sdl->sdl_nlen = namelen; sdl = (struct sockaddr_dl *)ifa->ifa_netmask; bzero(sdl->sdl_data, onamelen); while (namelen != 0) sdl->sdl_data[--namelen] = 0xff; IF_ADDR_WUNLOCK(ifp); EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp); /* Announce the return of the interface. */ rt_ifannouncemsg(ifp, IFAN_ARRIVAL); ifp->if_flags &= ~IFF_RENAMING; break; #ifdef VIMAGE case SIOCSIFVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error) return (error); error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid); break; #endif case SIOCSIFMETRIC: error = priv_check(td, PRIV_NET_SETIFMETRIC); if (error) return (error); ifp->if_metric = ifr->ifr_metric; getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYS: error = priv_check(td, PRIV_NET_SETIFPHYS); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFMTU: { u_long oldmtu = ifp->if_mtu; error = priv_check(td, PRIV_NET_SETIFMTU); if (error) return (error); if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) return (EINVAL); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) { getmicrotime(&ifp->if_lastchange); rt_ifmsg(ifp); #ifdef INET NETDUMP_REINIT(ifp); #endif } /* * If the link MTU changed, do network layer specific procedure. */ if (ifp->if_mtu != oldmtu) { #ifdef INET6 nd6_setmtu(ifp); #endif rt_updatemtu(ifp); } break; } case SIOCADDMULTI: case SIOCDELMULTI: if (cmd == SIOCADDMULTI) error = priv_check(td, PRIV_NET_ADDMULTI); else error = priv_check(td, PRIV_NET_DELMULTI); if (error) return (error); /* Don't allow group membership on non-multicast interfaces. */ if ((ifp->if_flags & IFF_MULTICAST) == 0) return (EOPNOTSUPP); /* Don't let users screw up protocols' entries. */ if (ifr->ifr_addr.sa_family != AF_LINK) return (EINVAL); if (cmd == SIOCADDMULTI) { struct epoch_tracker et; struct ifmultiaddr *ifma; /* * Userland is only permitted to join groups once * via the if_addmulti() KPI, because it cannot hold * struct ifmultiaddr * between calls. It may also * lose a race while we check if the membership * already exists. */ NET_EPOCH_ENTER(et); ifma = if_findmulti(ifp, &ifr->ifr_addr); NET_EPOCH_EXIT(et); if (ifma != NULL) error = EADDRINUSE; else error = if_addmulti(ifp, &ifr->ifr_addr, &ifma); } else { error = if_delmulti(ifp, &ifr->ifr_addr); } if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: #ifdef INET6 case SIOCSIFPHYADDR_IN6: #endif case SIOCSIFMEDIA: case SIOCSIFGENERIC: error = priv_check(td, PRIV_NET_HWIOCTL); if (error) return (error); if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); if (error == 0) getmicrotime(&ifp->if_lastchange); break; case SIOCGIFSTATUS: case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGIFMEDIA: case SIOCGIFXMEDIA: case SIOCGIFGENERIC: case SIOCGIFRSSKEY: case SIOCGIFRSSHASH: if (ifp->if_ioctl == NULL) return (EOPNOTSUPP); error = (*ifp->if_ioctl)(ifp, cmd, data); break; case SIOCSIFLLADDR: error = priv_check(td, PRIV_NET_SETLLADDR); if (error) return (error); error = if_setlladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); break; case SIOCGHWADDR: error = if_gethwaddr(ifp, ifr); break; case CASE_IOC_IFGROUPREQ(SIOCAIFGROUP): error = priv_check(td, PRIV_NET_ADDIFGROUP); if (error) return (error); if ((error = if_addgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; case CASE_IOC_IFGROUPREQ(SIOCGIFGROUP): if ((error = if_getgroup((struct ifgroupreq *)data, ifp))) return (error); break; case CASE_IOC_IFGROUPREQ(SIOCDIFGROUP): error = priv_check(td, PRIV_NET_DELIFGROUP); if (error) return (error); if ((error = if_delgroup(ifp, ifgr_group_get((struct ifgroupreq *)data)))) return (error); break; default: error = ENOIOCTL; break; } return (error); } #ifdef COMPAT_FREEBSD32 struct ifconf32 { int32_t ifc_len; union { uint32_t ifcu_buf; uint32_t ifcu_req; } ifc_ifcu; }; #define SIOCGIFCONF32 _IOWR('i', 36, struct ifconf32) #endif #ifdef COMPAT_FREEBSD32 static void ifmr_init(struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; memcpy(ifmr->ifm_name, ifmr32->ifm_name, sizeof(ifmr->ifm_name)); ifmr->ifm_current = ifmr32->ifm_current; ifmr->ifm_mask = ifmr32->ifm_mask; ifmr->ifm_status = ifmr32->ifm_status; ifmr->ifm_active = ifmr32->ifm_active; ifmr->ifm_count = ifmr32->ifm_count; ifmr->ifm_ulist = (int *)(uintptr_t)ifmr32->ifm_ulist; } static void ifmr_update(const struct ifmediareq *ifmr, caddr_t data) { struct ifmediareq32 *ifmr32; ifmr32 = (struct ifmediareq32 *)data; ifmr32->ifm_current = ifmr->ifm_current; ifmr32->ifm_mask = ifmr->ifm_mask; ifmr32->ifm_status = ifmr->ifm_status; ifmr32->ifm_active = ifmr->ifm_active; ifmr32->ifm_count = ifmr->ifm_count; } #endif /* * Interface ioctls. */ int ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td) { #ifdef COMPAT_FREEBSD32 caddr_t saved_data = NULL; struct ifmediareq ifmr; struct ifmediareq *ifmrp; #endif struct ifnet *ifp; struct ifreq *ifr; int error; int oif_flags; #ifdef VIMAGE int shutdown; #endif CURVNET_SET(so->so_vnet); #ifdef VIMAGE /* Make sure the VNET is stable. */ shutdown = (so->so_vnet->vnet_state > SI_SUB_VNET && so->so_vnet->vnet_state < SI_SUB_VNET_DONE) ? 1 : 0; if (shutdown) { CURVNET_RESTORE(); return (EBUSY); } #endif switch (cmd) { case SIOCGIFCONF: error = ifconf(cmd, data); CURVNET_RESTORE(); return (error); #ifdef COMPAT_FREEBSD32 case SIOCGIFCONF32: { struct ifconf32 *ifc32; struct ifconf ifc; ifc32 = (struct ifconf32 *)data; ifc.ifc_len = ifc32->ifc_len; ifc.ifc_buf = PTRIN(ifc32->ifc_buf); error = ifconf(SIOCGIFCONF, (void *)&ifc); CURVNET_RESTORE(); if (error == 0) ifc32->ifc_len = ifc.ifc_len; return (error); } #endif } #ifdef COMPAT_FREEBSD32 ifmrp = NULL; switch (cmd) { case SIOCGIFMEDIA32: case SIOCGIFXMEDIA32: ifmrp = &ifmr; ifmr_init(ifmrp, data); cmd = _IOC_NEWTYPE(cmd, struct ifmediareq); saved_data = data; data = (caddr_t)ifmrp; } #endif ifr = (struct ifreq *)data; switch (cmd) { #ifdef VIMAGE case SIOCSIFRVNET: error = priv_check(td, PRIV_NET_SETIFVNET); if (error == 0) error = if_vmove_reclaim(td, ifr->ifr_name, ifr->ifr_jid); goto out_noref; #endif case SIOCIFCREATE: case SIOCIFCREATE2: error = priv_check(td, PRIV_NET_IFCREATE); if (error == 0) error = if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ? ifr_data_get_ptr(ifr) : NULL); goto out_noref; case SIOCIFDESTROY: error = priv_check(td, PRIV_NET_IFDESTROY); if (error == 0) error = if_clone_destroy(ifr->ifr_name); goto out_noref; case SIOCIFGCLONERS: error = if_clone_list((struct if_clonereq *)data); goto out_noref; case CASE_IOC_IFGROUPREQ(SIOCGIFGMEMB): error = if_getgroupmembers((struct ifgroupreq *)data); goto out_noref; #if defined(INET) || defined(INET6) case SIOCSVH: case SIOCGVH: if (carp_ioctl_p == NULL) error = EPROTONOSUPPORT; else error = (*carp_ioctl_p)(ifr, cmd, td); goto out_noref; #endif } ifp = ifunit_ref(ifr->ifr_name); if (ifp == NULL) { error = ENXIO; goto out_noref; } error = ifhwioctl(cmd, ifp, data, td); if (error != ENOIOCTL) goto out_ref; oif_flags = ifp->if_flags; if (so->so_proto == NULL) { error = EOPNOTSUPP; goto out_ref; } /* * Pass the request on to the socket control method, and if the * latter returns EOPNOTSUPP, directly to the interface. * * Make an exception for the legacy SIOCSIF* requests. Drivers * trust SIOCSIFADDR et al to come from an already privileged * layer, and do not perform any credentials checks or input * validation. */ error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, ifp, td)); if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL && cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR && cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK) error = (*ifp->if_ioctl)(ifp, cmd, data); if ((oif_flags ^ ifp->if_flags) & IFF_UP) { #ifdef INET6 if (ifp->if_flags & IFF_UP) in6_if_up(ifp); #endif } out_ref: if_rele(ifp); out_noref: #ifdef COMPAT_FREEBSD32 if (ifmrp != NULL) { KASSERT((cmd == SIOCGIFMEDIA || cmd == SIOCGIFXMEDIA), ("ifmrp non-NULL, but cmd is not an ifmedia req 0x%lx", cmd)); data = saved_data; ifmr_update(ifmrp, data); } #endif CURVNET_RESTORE(); return (error); } /* * The code common to handling reference counted flags, * e.g., in ifpromisc() and if_allmulti(). * The "pflag" argument can specify a permanent mode flag to check, * such as IFF_PPROMISC for promiscuous mode; should be 0 if none. * * Only to be used on stack-owned flags, not driver-owned flags. */ static int if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch) { struct ifreq ifr; int error; int oldflags, oldcount; /* Sanity checks to catch programming errors */ KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0, ("%s: setting driver-owned flag %d", __func__, flag)); if (onswitch) KASSERT(*refcount >= 0, ("%s: increment negative refcount %d for flag %d", __func__, *refcount, flag)); else KASSERT(*refcount > 0, ("%s: decrement non-positive refcount %d for flag %d", __func__, *refcount, flag)); /* In case this mode is permanent, just touch refcount */ if (ifp->if_flags & pflag) { *refcount += onswitch ? 1 : -1; return (0); } /* Save ifnet parameters for if_ioctl() may fail */ oldcount = *refcount; oldflags = ifp->if_flags; /* * See if we aren't the only and touching refcount is enough. * Actually toggle interface flag if we are the first or last. */ if (onswitch) { if ((*refcount)++) return (0); ifp->if_flags |= flag; } else { if (--(*refcount)) return (0); ifp->if_flags &= ~flag; } /* Call down the driver since we've changed interface flags */ if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto recover; } ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); if (error) goto recover; /* Notify userland that interface flags have changed */ rt_ifmsg(ifp); return (0); recover: /* Recover after driver error */ *refcount = oldcount; ifp->if_flags = oldflags; return (error); } /* * Set/clear promiscuous mode on interface ifp based on the truth value * of pswitch. The calls are reference counted so that only the first * "on" request actually has an effect, as does the final "off" request. * Results are undefined if the "off" and "on" requests are not matched. */ int ifpromisc(struct ifnet *ifp, int pswitch) { int error; int oldflags = ifp->if_flags; error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC, &ifp->if_pcount, pswitch); /* If promiscuous mode status has changed, log a message */ if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) && log_promisc_mode_change) if_printf(ifp, "promiscuous mode %s\n", (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled"); return (error); } /* * Return interface configuration * of system. List may be used * in later ioctl's (above) to get * other information. */ /*ARGSUSED*/ static int ifconf(u_long cmd, caddr_t data) { struct ifconf *ifc = (struct ifconf *)data; struct ifnet *ifp; struct ifaddr *ifa; struct ifreq ifr; struct sbuf *sb; int error, full = 0, valid_len, max_len; /* Limit initial buffer size to MAXPHYS to avoid DoS from userspace. */ max_len = MAXPHYS - 1; /* Prevent hostile input from being able to crash the system */ if (ifc->ifc_len <= 0) return (EINVAL); again: if (ifc->ifc_len <= max_len) { max_len = ifc->ifc_len; full = 1; } sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN); max_len = 0; valid_len = 0; IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { struct epoch_tracker et; int addrs; /* * Zero the ifr to make sure we don't disclose the contents * of the stack. */ memset(&ifr, 0, sizeof(ifr)); if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)) >= sizeof(ifr.ifr_name)) { sbuf_delete(sb); IFNET_RUNLOCK(); return (ENAMETOOLONG); } addrs = 0; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { struct sockaddr *sa = ifa->ifa_addr; if (prison_if(curthread->td_ucred, sa) != 0) continue; addrs++; if (sa->sa_len <= sizeof(*sa)) { if (sa->sa_len < sizeof(*sa)) { memset(&ifr.ifr_ifru.ifru_addr, 0, sizeof(ifr.ifr_ifru.ifru_addr)); memcpy(&ifr.ifr_ifru.ifru_addr, sa, sa->sa_len); } else ifr.ifr_ifru.ifru_addr = *sa; sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); } else { sbuf_bcat(sb, &ifr, offsetof(struct ifreq, ifr_addr)); max_len += offsetof(struct ifreq, ifr_addr); sbuf_bcat(sb, sa, sa->sa_len); max_len += sa->sa_len; } if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } NET_EPOCH_EXIT(et); if (addrs == 0) { sbuf_bcat(sb, &ifr, sizeof(ifr)); max_len += sizeof(ifr); if (sbuf_error(sb) == 0) valid_len = sbuf_len(sb); } } IFNET_RUNLOCK(); /* * If we didn't allocate enough space (uncommon), try again. If * we have already allocated as much space as we are allowed, * return what we've got. */ if (valid_len != max_len && !full) { sbuf_delete(sb); goto again; } ifc->ifc_len = valid_len; sbuf_finish(sb); error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len); sbuf_delete(sb); return (error); } /* * Just like ifpromisc(), but for all-multicast-reception mode. */ int if_allmulti(struct ifnet *ifp, int onswitch) { return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch)); } struct ifmultiaddr * if_findmulti(struct ifnet *ifp, const struct sockaddr *sa) { struct ifmultiaddr *ifma; IF_ADDR_LOCK_ASSERT(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (sa->sa_family == AF_LINK) { if (sa_dl_equal(ifma->ifma_addr, sa)) break; } else { if (sa_equal(ifma->ifma_addr, sa)) break; } } return ifma; } /* * Allocate a new ifmultiaddr and initialize based on passed arguments. We * make copies of passed sockaddrs. The ifmultiaddr will not be added to * the ifnet multicast address list here, so the caller must do that and * other setup work (such as notifying the device driver). The reference * count is initialized to 1. */ static struct ifmultiaddr * if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa, int mflags) { struct ifmultiaddr *ifma; struct sockaddr *dupsa; ifma = malloc(sizeof *ifma, M_IFMADDR, mflags | M_ZERO); if (ifma == NULL) return (NULL); dupsa = malloc(sa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma, M_IFMADDR); return (NULL); } bcopy(sa, dupsa, sa->sa_len); ifma->ifma_addr = dupsa; ifma->ifma_ifp = ifp; ifma->ifma_refcount = 1; ifma->ifma_protospec = NULL; if (llsa == NULL) { ifma->ifma_lladdr = NULL; return (ifma); } dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags); if (dupsa == NULL) { free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); return (NULL); } bcopy(llsa, dupsa, llsa->sa_len); ifma->ifma_lladdr = dupsa; return (ifma); } /* * if_freemulti: free ifmultiaddr structure and possibly attached related * addresses. The caller is responsible for implementing reference * counting, notifying the driver, handling routing messages, and releasing * any dependent link layer state. */ #ifdef MCAST_VERBOSE extern void kdb_backtrace(void); #endif static void if_freemulti_internal(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d", ifma->ifma_refcount)); if (ifma->ifma_lladdr != NULL) free(ifma->ifma_lladdr, M_IFMADDR); #ifdef MCAST_VERBOSE kdb_backtrace(); printf("%s freeing ifma: %p\n", __func__, ifma); #endif free(ifma->ifma_addr, M_IFMADDR); free(ifma, M_IFMADDR); } static void if_destroymulti(epoch_context_t ctx) { struct ifmultiaddr *ifma; ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx); if_freemulti_internal(ifma); } void if_freemulti(struct ifmultiaddr *ifma) { KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d", ifma->ifma_refcount)); epoch_call(net_epoch_preempt, &ifma->ifma_epoch_ctx, if_destroymulti); } /* * Register an additional multicast address with a network interface. * * - If the address is already present, bump the reference count on the * address and return. * - If the address is not link-layer, look up a link layer address. * - Allocate address structures for one or both addresses, and attach to the * multicast address list on the interface. If automatically adding a link * layer address, the protocol address will own a reference to the link * layer address, to be freed when it is freed. * - Notify the network device driver of an addition to the multicast address * list. * * 'sa' points to caller-owned memory with the desired multicast address. * * 'retifma' will be used to return a pointer to the resulting multicast * address reference, if desired. */ int if_addmulti(struct ifnet *ifp, struct sockaddr *sa, struct ifmultiaddr **retifma) { struct ifmultiaddr *ifma, *ll_ifma; struct sockaddr *llsa; struct sockaddr_dl sdl; int error; #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif #ifdef INET6 IN6_MULTI_LIST_UNLOCK_ASSERT(); #endif /* * If the address is already present, return a new reference to it; * otherwise, allocate storage and set up a new address. */ IF_ADDR_WLOCK(ifp); ifma = if_findmulti(ifp, sa); if (ifma != NULL) { ifma->ifma_refcount++; if (retifma != NULL) *retifma = ifma; IF_ADDR_WUNLOCK(ifp); return (0); } /* * The address isn't already present; resolve the protocol address * into a link layer address, and then look that up, bump its * refcount or allocate an ifma for that also. * Most link layer resolving functions returns address data which * fits inside default sockaddr_dl structure. However callback * can allocate another sockaddr structure, in that case we need to * free it later. */ llsa = NULL; ll_ifma = NULL; if (ifp->if_resolvemulti != NULL) { /* Provide called function with buffer size information */ sdl.sdl_len = sizeof(sdl); llsa = (struct sockaddr *)&sdl; error = ifp->if_resolvemulti(ifp, &llsa, sa); if (error) goto unlock_out; } /* * Allocate the new address. Don't hook it up yet, as we may also * need to allocate a link layer multicast address. */ ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT); if (ifma == NULL) { error = ENOMEM; goto free_llsa_out; } /* * If a link layer address is found, we'll need to see if it's * already present in the address list, or allocate is as well. * When this block finishes, the link layer address will be on the * list. */ if (llsa != NULL) { ll_ifma = if_findmulti(ifp, llsa); if (ll_ifma == NULL) { ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT); if (ll_ifma == NULL) { --ifma->ifma_refcount; if_freemulti(ifma); error = ENOMEM; goto free_llsa_out; } ll_ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma, ifma_link); } else ll_ifma->ifma_refcount++; ifma->ifma_llifma = ll_ifma; } /* * We now have a new multicast address, ifma, and possibly a new or * referenced link layer address. Add the primary address to the * ifnet address list. */ ifma->ifma_flags |= IFMA_F_ENQUEUED; CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); if (retifma != NULL) *retifma = ifma; /* * Must generate the message while holding the lock so that 'ifma' * pointer is still valid. */ rt_newmaddrmsg(RTM_NEWMADDR, ifma); IF_ADDR_WUNLOCK(ifp); /* * We are certain we have added something, so call down to the * interface to let them know about it. */ if (ifp->if_ioctl != NULL) { (void) (*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0); } if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); return (0); free_llsa_out: if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl)) link_free_sdl(llsa); unlock_out: IF_ADDR_WUNLOCK(ifp); return (error); } /* * Delete a multicast group membership by network-layer group address. * * Returns ENOENT if the entry could not be found. If ifp no longer * exists, results are undefined. This entry point should only be used * from subsystems which do appropriate locking to hold ifp for the * duration of the call. * Network-layer protocol domains must use if_delmulti_ifma(). */ int if_delmulti(struct ifnet *ifp, struct sockaddr *sa) { struct ifmultiaddr *ifma; int lastref; #ifdef INVARIANTS struct epoch_tracker et; struct ifnet *oifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; NET_EPOCH_EXIT(et); KASSERT(ifp != NULL, ("%s: ifnet went away", __func__)); #endif if (ifp == NULL) return (ENOENT); IF_ADDR_WLOCK(ifp); lastref = 0; ifma = if_findmulti(ifp, sa); if (ifma != NULL) lastref = if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); if (ifma == NULL) return (ENOENT); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } return (0); } /* * Delete all multicast group membership for an interface. * Should be used to quickly flush all multicast filters. */ void if_delallmulti(struct ifnet *ifp) { struct ifmultiaddr *ifma; struct ifmultiaddr *next; IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next) if_delmulti_locked(ifp, ifma, 0); IF_ADDR_WUNLOCK(ifp); } void if_delmulti_ifma(struct ifmultiaddr *ifma) { if_delmulti_ifma_flags(ifma, 0); } /* * Delete a multicast group membership by group membership pointer. * Network-layer protocol domains must use this routine. * * It is safe to call this routine if the ifp disappeared. */ void if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags) { struct ifnet *ifp; int lastref; MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma); #ifdef INET IN_MULTI_LIST_UNLOCK_ASSERT(); #endif ifp = ifma->ifma_ifp; #ifdef DIAGNOSTIC if (ifp == NULL) { printf("%s: ifma_ifp seems to be detached\n", __func__); } else { struct epoch_tracker et; struct ifnet *oifp; NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link) if (ifp == oifp) break; if (ifp != oifp) ifp = NULL; NET_EPOCH_EXIT(et); } #endif /* * If and only if the ifnet instance exists: Acquire the address lock. */ if (ifp != NULL) IF_ADDR_WLOCK(ifp); lastref = if_delmulti_locked(ifp, ifma, flags); if (ifp != NULL) { /* * If and only if the ifnet instance exists: * Release the address lock. * If the group was left: update the hardware hash filter. */ IF_ADDR_WUNLOCK(ifp); if (lastref && ifp->if_ioctl != NULL) { (void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0); } } } /* * Perform deletion of network-layer and/or link-layer multicast address. * * Return 0 if the reference count was decremented. * Return 1 if the final reference was released, indicating that the * hardware hash filter should be reprogrammed. */ static int if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching) { struct ifmultiaddr *ll_ifma; if (ifp != NULL && ifma->ifma_ifp != NULL) { KASSERT(ifma->ifma_ifp == ifp, ("%s: inconsistent ifp %p", __func__, ifp)); IF_ADDR_WLOCK_ASSERT(ifp); } ifp = ifma->ifma_ifp; MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : ""); /* * If the ifnet is detaching, null out references to ifnet, * so that upper protocol layers will notice, and not attempt * to obtain locks for an ifnet which no longer exists. The * routing socket announcement must happen before the ifnet * instance is detached from the system. */ if (detaching) { #ifdef DIAGNOSTIC printf("%s: detaching ifnet instance %p\n", __func__, ifp); #endif /* * ifp may already be nulled out if we are being reentered * to delete the ll_ifma. */ if (ifp != NULL) { rt_newmaddrmsg(RTM_DELMADDR, ifma); ifma->ifma_ifp = NULL; } } if (--ifma->ifma_refcount > 0) return 0; if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link); ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } /* * If this ifma is a network-layer ifma, a link-layer ifma may * have been associated with it. Release it first if so. */ ll_ifma = ifma->ifma_llifma; if (ll_ifma != NULL) { KASSERT(ifma->ifma_lladdr != NULL, ("%s: llifma w/o lladdr", __func__)); if (detaching) ll_ifma->ifma_ifp = NULL; /* XXX */ if (--ll_ifma->ifma_refcount == 0) { if (ifp != NULL) { if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) { CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link); ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED; } } if_freemulti(ll_ifma); } } #ifdef INVARIANTS if (ifp) { struct ifmultiaddr *ifmatmp; CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link) MPASS(ifma != ifmatmp); } #endif if_freemulti(ifma); /* * The last reference to this instance of struct ifmultiaddr * was released; the hardware should be notified of this change. */ return 1; } /* * Set the link layer address on an interface. * * At this time we only support certain types of interfaces, * and we don't allow the length of the address to change. * * Set noinline to be dtrace-friendly */ __noinline int if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) { struct sockaddr_dl *sdl; struct ifaddr *ifa; struct ifreq ifr; struct epoch_tracker et; int rc; rc = 0; NET_EPOCH_ENTER(et); ifa = ifp->if_addr; if (ifa == NULL) { rc = EINVAL; goto out; } sdl = (struct sockaddr_dl *)ifa->ifa_addr; if (sdl == NULL) { rc = EINVAL; goto out; } if (len != sdl->sdl_alen) { /* don't allow length to change */ rc = EINVAL; goto out; } switch (ifp->if_type) { case IFT_ETHER: case IFT_XETHER: case IFT_L2VLAN: case IFT_BRIDGE: case IFT_IEEE8023ADLAG: bcopy(lladdr, LLADDR(sdl), len); break; default: rc = ENODEV; goto out; } /* * If the interface is already up, we need * to re-init it in order to reprogram its * address filter. */ NET_EPOCH_EXIT(et); if ((ifp->if_flags & IFF_UP) != 0) { if (ifp->if_ioctl) { ifp->if_flags &= ~IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); ifp->if_flags |= IFF_UP; ifr.ifr_flags = ifp->if_flags & 0xffff; ifr.ifr_flagshigh = ifp->if_flags >> 16; (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr); } } EVENTHANDLER_INVOKE(iflladdr_event, ifp); return (0); out: NET_EPOCH_EXIT(et); return (rc); } /* * Compat function for handling basic encapsulation requests. * Not converted stacks (FDDI, IB, ..) supports traditional * output model: ARP (and other similar L2 protocols) are handled * inside output routine, arpresolve/nd6_resolve() returns MAC * address instead of full prepend. * * This function creates calculated header==MAC for IPv4/IPv6 and * returns EAFNOSUPPORT (which is then handled in ARP code) for other * address families. */ static int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) { if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < req->lladdr_len) return (ENOMEM); switch (req->family) { case AF_INET: case AF_INET6: break; default: return (EAFNOSUPPORT); } /* Copy lladdr to storage as is */ memmove(req->buf, req->lladdr, req->lladdr_len); req->bufsize = req->lladdr_len; req->lladdr_off = 0; return (0); } /* * Tunnel interfaces can nest, also they may cause infinite recursion * calls when misconfigured. We'll prevent this by detecting loops. * High nesting level may cause stack exhaustion. We'll prevent this * by introducing upper limit. * * Return 0, if tunnel nesting count is equal or less than limit. */ int if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie, int limit) { struct m_tag *mtag; int count; count = 1; mtag = NULL; while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) { if (*(struct ifnet **)(mtag + 1) == ifp) { log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp)); return (EIO); } count++; } if (count > limit) { log(LOG_NOTICE, "%s: if_output recursively called too many times(%d)\n", if_name(ifp), count); return (EIO); } mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT); if (mtag == NULL) return (ENOMEM); *(struct ifnet **)(mtag + 1) = ifp; m_tag_prepend(m, mtag); return (0); } /* * Get the link layer address that was read from the hardware at attach. * * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type * their component interfaces as IFT_IEEE8023ADLAG. */ int if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr) { if (ifp->if_hw_addr == NULL) return (ENODEV); switch (ifp->if_type) { case IFT_ETHER: case IFT_IEEE8023ADLAG: bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen); return (0); default: return (ENODEV); } } /* * The name argument must be a pointer to storage which will last as * long as the interface does. For physical devices, the result of * device_get_name(dev) is a good choice and for pseudo-devices a * static string works well. */ void if_initname(struct ifnet *ifp, const char *name, int unit) { ifp->if_dname = name; ifp->if_dunit = unit; if (unit != IF_DUNIT_NONE) snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit); else strlcpy(ifp->if_xname, name, IFNAMSIZ); } int if_printf(struct ifnet *ifp, const char *fmt, ...) { char if_fmt[256]; va_list ap; snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt); va_start(ap, fmt); vlog(LOG_INFO, if_fmt, ap); va_end(ap); return (0); } void if_start(struct ifnet *ifp) { (*(ifp)->if_start)(ifp); } /* * Backwards compatibility interface for drivers * that have not implemented it */ static int if_transmit(struct ifnet *ifp, struct mbuf *m) { int error; IFQ_HANDOFF(ifp, m, error); return (error); } static void if_input_default(struct ifnet *ifp __unused, struct mbuf *m) { m_freem(m); } int if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust) { int active = 0; IF_LOCK(ifq); if (_IF_QFULL(ifq)) { IF_UNLOCK(ifq); if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); m_freem(m); return (0); } if (ifp != NULL) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust); if (m->m_flags & (M_BCAST|M_MCAST)) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); active = ifp->if_drv_flags & IFF_DRV_OACTIVE; } _IF_ENQUEUE(ifq, m); IF_UNLOCK(ifq); if (ifp != NULL && !active) (*(ifp)->if_start)(ifp); return (1); } void if_register_com_alloc(u_char type, if_com_alloc_t *a, if_com_free_t *f) { KASSERT(if_com_alloc[type] == NULL, ("if_register_com_alloc: %d already registered", type)); KASSERT(if_com_free[type] == NULL, ("if_register_com_alloc: %d free already registered", type)); if_com_alloc[type] = a; if_com_free[type] = f; } void if_deregister_com_alloc(u_char type) { KASSERT(if_com_alloc[type] != NULL, ("if_deregister_com_alloc: %d not registered", type)); KASSERT(if_com_free[type] != NULL, ("if_deregister_com_alloc: %d free not registered", type)); if_com_alloc[type] = NULL; if_com_free[type] = NULL; } /* API for driver access to network stack owned ifnet.*/ uint64_t if_setbaudrate(struct ifnet *ifp, uint64_t baudrate) { uint64_t oldbrate; oldbrate = ifp->if_baudrate; ifp->if_baudrate = baudrate; return (oldbrate); } uint64_t if_getbaudrate(if_t ifp) { return (((struct ifnet *)ifp)->if_baudrate); } int if_setcapabilities(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capabilities = capabilities; return (0); } int if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit) { ((struct ifnet *)ifp)->if_capabilities |= setbit; ((struct ifnet *)ifp)->if_capabilities &= ~clearbit; return (0); } int if_getcapabilities(if_t ifp) { return ((struct ifnet *)ifp)->if_capabilities; } int if_setcapenable(if_t ifp, int capabilities) { ((struct ifnet *)ifp)->if_capenable = capabilities; return (0); } int if_setcapenablebit(if_t ifp, int setcap, int clearcap) { if(setcap) ((struct ifnet *)ifp)->if_capenable |= setcap; if(clearcap) ((struct ifnet *)ifp)->if_capenable &= ~clearcap; return (0); } const char * if_getdname(if_t ifp) { return ((struct ifnet *)ifp)->if_dname; } int if_togglecapenable(if_t ifp, int togglecap) { ((struct ifnet *)ifp)->if_capenable ^= togglecap; return (0); } int if_getcapenable(if_t ifp) { return ((struct ifnet *)ifp)->if_capenable; } /* * This is largely undesirable because it ties ifnet to a device, but does * provide flexiblity for an embedded product vendor. Should be used with * the understanding that it violates the interface boundaries, and should be * a last resort only. */ int if_setdev(if_t ifp, void *dev) { return (0); } int if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags) { ((struct ifnet *)ifp)->if_drv_flags |= set_flags; ((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags; return (0); } int if_getdrvflags(if_t ifp) { return ((struct ifnet *)ifp)->if_drv_flags; } int if_setdrvflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_drv_flags = flags; return (0); } int if_setflags(if_t ifp, int flags) { ((struct ifnet *)ifp)->if_flags = flags; return (0); } int if_setflagbits(if_t ifp, int set, int clear) { ((struct ifnet *)ifp)->if_flags |= set; ((struct ifnet *)ifp)->if_flags &= ~clear; return (0); } int if_getflags(if_t ifp) { return ((struct ifnet *)ifp)->if_flags; } int if_clearhwassist(if_t ifp) { ((struct ifnet *)ifp)->if_hwassist = 0; return (0); } int if_sethwassistbits(if_t ifp, int toset, int toclear) { ((struct ifnet *)ifp)->if_hwassist |= toset; ((struct ifnet *)ifp)->if_hwassist &= ~toclear; return (0); } int if_sethwassist(if_t ifp, int hwassist_bit) { ((struct ifnet *)ifp)->if_hwassist = hwassist_bit; return (0); } int if_gethwassist(if_t ifp) { return ((struct ifnet *)ifp)->if_hwassist; } int if_setmtu(if_t ifp, int mtu) { ((struct ifnet *)ifp)->if_mtu = mtu; return (0); } int if_getmtu(if_t ifp) { return ((struct ifnet *)ifp)->if_mtu; } int if_getmtu_family(if_t ifp, int family) { struct domain *dp; for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_family == family && dp->dom_ifmtu != NULL) return (dp->dom_ifmtu((struct ifnet *)ifp)); } return (((struct ifnet *)ifp)->if_mtu); } int if_setsoftc(if_t ifp, void *softc) { ((struct ifnet *)ifp)->if_softc = softc; return (0); } void * if_getsoftc(if_t ifp) { return ((struct ifnet *)ifp)->if_softc; } void if_setrcvif(struct mbuf *m, if_t ifp) { + + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (struct ifnet *)ifp; } void if_setvtag(struct mbuf *m, uint16_t tag) { m->m_pkthdr.ether_vtag = tag; } uint16_t if_getvtag(struct mbuf *m) { return (m->m_pkthdr.ether_vtag); } int if_sendq_empty(if_t ifp) { return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd); } struct ifaddr * if_getifaddr(if_t ifp) { return ((struct ifnet *)ifp)->if_addr; } int if_getamcount(if_t ifp) { return ((struct ifnet *)ifp)->if_amcount; } int if_setsendqready(if_t ifp) { IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd); return (0); } int if_setsendqlen(if_t ifp, int tx_desc_count) { IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count); ((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count; return (0); } int if_vlantrunkinuse(if_t ifp) { return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0; } int if_input(if_t ifp, struct mbuf* sendmp) { (*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp); return (0); } /* XXX */ #ifndef ETH_ADDR_LEN #define ETH_ADDR_LEN 6 #endif int if_setupmultiaddr(if_t ifp, void *mta, int *cnt, int max) { struct ifmultiaddr *ifma; uint8_t *lmta = (uint8_t *)mta; int mcnt = 0; CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == max) break; bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), &lmta[mcnt * ETH_ADDR_LEN], ETH_ADDR_LEN); mcnt++; } *cnt = mcnt; return (0); } int if_multiaddr_array(if_t ifp, void *mta, int *cnt, int max) { int error; if_maddr_rlock(ifp); error = if_setupmultiaddr(ifp, mta, cnt, max); if_maddr_runlock(ifp); return (error); } int if_multiaddr_count(if_t ifp, int max) { struct ifmultiaddr *ifma; int count; count = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &((struct ifnet *)ifp)->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; count++; if (count == max) break; } if_maddr_runlock(ifp); return (count); } int if_multi_apply(struct ifnet *ifp, int (*filter)(void *, struct ifmultiaddr *, int), void *arg) { struct ifmultiaddr *ifma; int cnt = 0; if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) cnt += filter(arg, ifma, cnt); if_maddr_runlock(ifp); return (cnt); } struct mbuf * if_dequeue(if_t ifp) { struct mbuf *m; IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m); return (m); } int if_sendq_prepend(if_t ifp, struct mbuf *m) { IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m); return (0); } int if_setifheaderlen(if_t ifp, int len) { ((struct ifnet *)ifp)->if_hdrlen = len; return (0); } caddr_t if_getlladdr(if_t ifp) { return (IF_LLADDR((struct ifnet *)ifp)); } void * if_gethandle(u_char type) { return (if_alloc(type)); } void if_bpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; BPF_MTAP(ifp, m); } void if_etherbpfmtap(if_t ifh, struct mbuf *m) { struct ifnet *ifp = (struct ifnet *)ifh; ETHER_BPF_MTAP(ifp, m); } void if_vlancap(if_t ifh) { struct ifnet *ifp = (struct ifnet *)ifh; VLAN_CAPABILITIES(ifp); } int if_sethwtsomax(if_t ifp, u_int if_hw_tsomax) { ((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax; return (0); } int if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount) { ((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount; return (0); } int if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize) { ((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize; return (0); } u_int if_gethwtsomax(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomax); } u_int if_gethwtsomaxsegcount(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount); } u_int if_gethwtsomaxsegsize(if_t ifp) { return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize); } void if_setinitfn(if_t ifp, void (*init_fn)(void *)) { ((struct ifnet *)ifp)->if_init = init_fn; } void if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t)) { ((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn; } void if_setstartfn(if_t ifp, void (*start_fn)(if_t)) { ((struct ifnet *)ifp)->if_start = (void *)start_fn; } void if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn) { ((struct ifnet *)ifp)->if_transmit = start_fn; } void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn) { ((struct ifnet *)ifp)->if_qflush = flush_fn; } void if_setgetcounterfn(if_t ifp, if_get_counter_t fn) { ifp->if_get_counter = fn; } /* Revisit these - These are inline functions originally. */ int drbr_inuse_drv(if_t ifh, struct buf_ring *br) { return drbr_inuse(ifh, br); } struct mbuf* drbr_dequeue_drv(if_t ifh, struct buf_ring *br) { return drbr_dequeue(ifh, br); } int drbr_needs_enqueue_drv(if_t ifh, struct buf_ring *br) { return drbr_needs_enqueue(ifh, br); } int drbr_enqueue_drv(if_t ifh, struct buf_ring *br, struct mbuf *m) { return drbr_enqueue(ifh, br, m); } Index: head/sys/net/if_ethersubr.c =================================================================== --- head/sys/net/if_ethersubr.c (revision 348253) +++ head/sys/net/if_ethersubr.c (revision 348254) @@ -1,1443 +1,1444 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_netgraph.h" #include "opt_mbuf_profiling.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #include #endif #ifdef INET6 #include #endif #include #include #ifdef CTASSERT CTASSERT(sizeof (struct ether_header) == ETHER_ADDR_LEN * 2 + 2); CTASSERT(sizeof (struct ether_addr) == ETHER_ADDR_LEN); #endif VNET_DEFINE(pfil_head_t, link_pfil_head); /* Packet filter hooks */ /* netgraph node hooks for ng_ether(4) */ void (*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m); int (*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp); void (*ng_ether_attach_p)(struct ifnet *ifp); void (*ng_ether_detach_p)(struct ifnet *ifp); void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* if_bridge(4) support */ void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ether_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #ifdef VIMAGE static void ether_reassign(struct ifnet *, struct vnet *, char *); #endif static int ether_requestencap(struct ifnet *, struct if_encap_req *); #define senderr(e) do { error = (e); goto bad;} while (0) static void update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) { int csum_flags = 0; if (src->m_pkthdr.csum_flags & CSUM_IP) csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); if (src->m_pkthdr.csum_flags & CSUM_SCTP) csum_flags |= CSUM_SCTP_VALID; dst->m_pkthdr.csum_flags |= csum_flags; if (csum_flags & CSUM_DATA_VALID) dst->m_pkthdr.csum_data = 0xffff; } /* * Handle link-layer encapsulation requests. */ static int ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) { struct ether_header *eh; struct arphdr *ah; uint16_t etype; const u_char *lladdr; if (req->rtype != IFENCAP_LL) return (EOPNOTSUPP); if (req->bufsize < ETHER_HDR_LEN) return (ENOMEM); eh = (struct ether_header *)req->buf; lladdr = req->lladdr; req->lladdr_off = 0; switch (req->family) { case AF_INET: etype = htons(ETHERTYPE_IP); break; case AF_INET6: etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_ETHER); switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: etype = htons(ETHERTYPE_ARP); break; } if (req->flags & IFENCAP_FLAG_BROADCAST) lladdr = ifp->if_broadcastaddr; break; default: return (EAFNOSUPPORT); } memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); req->bufsize = sizeof(struct ether_header); return (0); } static int ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro, u_char *phdr, uint32_t *pflags, struct llentry **plle) { struct ether_header *eh; uint32_t lleflags = 0; int error = 0; #if defined(INET) || defined(INET6) uint16_t etype; #endif if (plle) *plle = NULL; eh = (struct ether_header *)phdr; switch (dst->sa_family) { #ifdef INET case AF_INET: if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { if (m->m_flags & M_BCAST) memcpy(eh->ether_dhost, ifp->if_broadcastaddr, ETHER_ADDR_LEN); else { const struct in_addr *a; a = &(((const struct sockaddr_in *)dst)->sin_addr); ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); } etype = htons(ETHERTYPE_IP); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif #ifdef INET6 case AF_INET6: if ((m->m_flags & M_MCAST) == 0) error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle); else { const struct in6_addr *a6; a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); etype = htons(ETHERTYPE_IPV6); memcpy(&eh->ether_type, &etype, sizeof(etype)); memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); } break; #endif default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); if (m != NULL) m_freem(m); return (EAFNOSUPPORT); } if (error == EHOSTDOWN) { if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) error = EHOSTUNREACH; } if (error != 0) return (error); *pflags = RT_MAY_LOOP; if (lleflags & LLE_IFADDR) *pflags |= RT_L2_ME; return (0); } /* * Ethernet output routine. * Encapsulate a packet of type family for the local net. * Use trailer local net encapsulation if enough data in first * packet leaves a multiple of 512 bytes of data in remainder. */ int ether_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { int error = 0; char linkhdr[ETHER_HDR_LEN], *phdr; struct ether_header *eh; struct pf_mtag *t; int loop_copy = 1; int hlen; /* link layer header length */ uint32_t pflags; struct llentry *lle = NULL; int addref = 0; phdr = NULL; pflags = 0; if (ro != NULL) { /* XXX BPF uses ro_prepend */ if (ro->ro_prepend != NULL) { phdr = ro->ro_prepend; hlen = ro->ro_plen; } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { if ((ro->ro_flags & RT_LLE_CACHE) != 0) { lle = ro->ro_lle; if (lle != NULL && (lle->la_flags & LLE_VALID) == 0) { LLE_FREE(lle); lle = NULL; /* redundant */ ro->ro_lle = NULL; } if (lle == NULL) { /* if we lookup, keep cache */ addref = 1; } else /* * Notify LLE code that * the entry was used * by datapath. */ llentry_mark_used(lle); } if (lle != NULL) { phdr = lle->r_linkdata; hlen = lle->r_hdrlen; pflags = lle->r_flags; } } } #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif M_PROFILE(m); if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) senderr(ENETDOWN); if (phdr == NULL) { /* No prepend data supplied. Try to calculate ourselves. */ phdr = linkhdr; hlen = ETHER_HDR_LEN; error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags, addref ? &lle : NULL); if (addref && lle != NULL) ro->ro_lle = lle; if (error != 0) return (error == EWOULDBLOCK ? 0 : error); } if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } loop_copy = pflags & RT_MAY_LOOP; /* * Add local net header. If no space in first mbuf, * allocate another. * * Note that we do prepend regardless of RT_HAS_HEADER flag. * This is done because BPF code shifts m_data pointer * to the end of ethernet header prior to calling if_output(). */ M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); if ((pflags & RT_HAS_HEADER) == 0) { eh = mtod(m, struct ether_header *); memcpy(eh, phdr, hlen); } /* * If a simplex interface, and the packet is being sent to our * Ethernet address or a broadcast address, loopback a copy. * XXX To make a simplex device behave exactly like a duplex * device, we should copy in the case of sending to our own * ethernet address (thus letting the original actually appear * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { struct mbuf *n; /* * Because if_simloop() modifies the packet, we need a * writable copy through m_dup() instead of a readonly * one as m_copy[m] would give us. The alternative would * be to modify if_simloop() to handle the readonly mbuf, * but performancewise it is mostly equivalent (trading * extra data copying vs. extra locking). * * XXX This is a local workaround. A number of less * often used kernel parts suffer from the same bug. * See PR kern/105943 for a proposed general solution. */ if ((n = m_dup(m, M_NOWAIT)) != NULL) { update_mbuf_csumflags(m, n); (void)if_simloop(ifp, n, dst->sa_family, hlen); } else if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* * Bridges require special output handling. */ if (ifp->if_bridge) { BRIDGE_OUTPUT(ifp, m, error); return (error); } #if defined(INET) || defined(INET6) if (ifp->if_carp && (error = (*carp_output_p)(ifp, m, dst))) goto bad; #endif /* Handle ng_ether(4) processing, if any */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_output_p != NULL, ("ng_ether_output_p is NULL")); if ((error = (*ng_ether_output_p)(ifp, &m)) != 0) { bad: if (m != NULL) m_freem(m); return (error); } if (m == NULL) return (0); } /* Continue with link-layer output */ return ether_output_frame(ifp, m); } static bool ether_set_pcp(struct mbuf **mp, struct ifnet *ifp, uint8_t pcp) { struct ether_header *eh; eh = mtod(*mp, struct ether_header *); if (ntohs(eh->ether_type) == ETHERTYPE_VLAN || ether_8021q_frame(mp, ifp, ifp, 0, pcp)) return (true); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (false); } /* * Ethernet link layer output routine to send a raw frame to the device. * * This assumes that the 14 byte Ethernet header is present and contiguous * in the first mbuf (if BRIDGE'ing). */ int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { uint8_t pcp; pcp = ifp->if_pcp; if (pcp != IFNET_PCP_NONE && ifp->if_type != IFT_L2VLAN && !ether_set_pcp(&m, ifp, pcp)) return (0); if (PFIL_HOOKED_OUT(V_link_pfil_head)) switch (pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_OUT, NULL)) { case PFIL_DROPPED: return (EACCES); case PFIL_CONSUMED: return (0); } #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { struct ether_header *eh; eh = mtod(m, struct ether_header *); switch (ntohs(eh->ether_type)) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return (EAFNOSUPPORT); /* NOTREACHED */ break; }; } #endif #endif /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ return ((ifp->if_transmit)(ifp, m)); } /* * Process a received Ethernet packet; the packet is in the * mbuf chain m with the ethernet header at the front. */ static void ether_input_internal(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; u_short etype; if ((ifp->if_flags & IFF_UP) == 0) { m_freem(m); return; } #ifdef DIAGNOSTIC if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { if_printf(ifp, "discard frame at !IFF_DRV_RUNNING\n"); m_freem(m); return; } #endif if (m->m_len < ETHER_HDR_LEN) { /* XXX maybe should pullup? */ if_printf(ifp, "discard frame w/o leading ethernet " "header (len %u pkt len %u)\n", m->m_len, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); return; } eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); random_harvest_queue_ether(m, sizeof(*m)); #ifdef EXPERIMENTAL #if defined(INET6) && defined(INET) /* draft-ietf-6man-ipv6only-flag */ /* Catch ETHERTYPE_IP, and ETHERTYPE_[REV]ARP if we are v6-only. */ if ((ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY_MASK) != 0) { switch (etype) { case ETHERTYPE_IP: case ETHERTYPE_ARP: case ETHERTYPE_REVARP: m_freem(m); return; /* NOTREACHED */ break; }; } #endif #endif CURVNET_SET_QUIET(ifp->if_vnet); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (ETHER_IS_BROADCAST(eh->ether_dhost)) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* * Give bpf a chance at the packet. */ ETHER_BPF_MTAP(ifp, m); /* * If the CRC is still on the packet, trim it off. We do this once * and once only in case we are re-entered. Nothing else on the * Ethernet receive path expects to see the FCS. */ if (m->m_flags & M_HASFCS) { m_adj(m, -ETHER_CRC_LEN); m->m_flags &= ~M_HASFCS; } if (!(ifp->if_capenable & IFCAP_HWSTATS)) if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); CURVNET_RESTORE(); return; } /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { KASSERT(lagg_input_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { CURVNET_RESTORE(); return; } } /* * If the hardware did not process an 802.1Q tag, do this now, * to allow 802.1P priority frames to be passed to the main input * path correctly. * TODO: Deal with Q-in-Q frames, but not arbitrary nesting levels. */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_VLAN) { struct ether_vlan_header *evl; if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { #ifdef DIAGNOSTIC if_printf(ifp, "cannot pullup VLAN header\n"); #endif if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); CURVNET_RESTORE(); return; } evl = mtod(m, struct ether_vlan_header *); m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); m->m_flags |= M_VLANTAG; bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); eh = mtod(m, struct ether_header *); } M_SETFIB(m, ifp->if_fib); /* Allow ng_ether(4) to claim this frame. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_p != NULL, ("%s: ng_ether_input_p is NULL", __func__)); m->m_flags &= ~M_PROMISC; (*ng_ether_input_p)(ifp, &m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } /* * Allow if_bridge(4) to claim this frame. * The BRIDGE_INPUT() macro will update ifp if the bridge changed it * and the frame should be delivered locally. */ if (ifp->if_bridge != NULL) { m->m_flags &= ~M_PROMISC; BRIDGE_INPUT(ifp, m); if (m == NULL) { CURVNET_RESTORE(); return; } eh = mtod(m, struct ether_header *); } #if defined(INET) || defined(INET6) /* * Clear M_PROMISC on frame so that carp(4) will see it when the * mbuf flows up to Layer 3. * FreeBSD's implementation of carp(4) uses the inprotosw * to dispatch IPPROTO_CARP. carp(4) also allocates its own * Ethernet addresses of the form 00:00:5e:00:01:xx, which * is outside the scope of the M_PROMISC test below. * TODO: Maintain a hash table of ethernet addresses other than * ether_dhost which may be active on this ifp. */ if (ifp->if_carp && (*carp_forus_p)(ifp, eh->ether_dhost)) { m->m_flags &= ~M_PROMISC; } else #endif { /* * If the frame received was not for our MAC address, set the * M_PROMISC flag on the mbuf chain. The frame may need to * be seen by the rest of the Ethernet input path in case of * re-entry (e.g. bridge, vlan, netgraph) but should not be * seen by upper protocol layers. */ if (!ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(IF_LLADDR(ifp), eh->ether_dhost, ETHER_ADDR_LEN) != 0) m->m_flags |= M_PROMISC; } ether_demux(ifp, m); CURVNET_RESTORE(); } /* * Ethernet input dispatch; by default, direct dispatch here regardless of * global configuration. However, if RSS is enabled, hook up RSS affinity * so that when deferred or hybrid dispatch is enabled, we can redistribute * load based on RSS. * * XXXRW: Would be nice if the ifnet passed up a flag indicating whether or * not it had already done work distribution via multi-queue. Then we could * direct dispatch in the event load balancing was already complete and * handle the case of interfaces with different capabilities better. * * XXXRW: Sort of want an M_DISTRIBUTED flag to avoid multiple distributions * at multiple layers? * * XXXRW: For now, enable all this only if RSS is compiled in, although it * works fine without RSS. Need to characterise the performance overhead * of the detour through the netisr code in the event the result is always * direct dispatch. */ static void ether_nh_input(struct mbuf *m) { M_ASSERTPKTHDR(m); KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: NULL interface pointer", __func__)); ether_input_internal(m->m_pkthdr.rcvif, m); } static struct netisr_handler ether_nh = { .nh_name = "ether", .nh_handler = ether_nh_input, .nh_proto = NETISR_ETHER, #ifdef RSS .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_DIRECT, .nh_m2cpuid = rss_m2cpuid, #else .nh_policy = NETISR_POLICY_SOURCE, .nh_dispatch = NETISR_DISPATCH_DIRECT, #endif }; static void ether_init(__unused void *arg) { netisr_register(ðer_nh); } SYSINIT(ether, SI_SUB_INIT_IF, SI_ORDER_ANY, ether_init, NULL); static void vnet_ether_init(__unused void *arg) { struct pfil_head_args args; args.pa_version = PFIL_VERSION; args.pa_flags = PFIL_IN | PFIL_OUT; args.pa_type = PFIL_TYPE_ETHERNET; args.pa_headname = PFIL_ETHER_NAME; V_link_pfil_head = pfil_head_register(&args); #ifdef VIMAGE netisr_register_vnet(ðer_nh); #endif } VNET_SYSINIT(vnet_ether_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_init, NULL); #ifdef VIMAGE static void vnet_ether_pfil_destroy(__unused void *arg) { pfil_head_unregister(V_link_pfil_head); } VNET_SYSUNINIT(vnet_ether_pfil_uninit, SI_SUB_PROTO_PFIL, SI_ORDER_ANY, vnet_ether_pfil_destroy, NULL); static void vnet_ether_destroy(__unused void *arg) { netisr_unregister_vnet(ðer_nh); } VNET_SYSUNINIT(vnet_ether_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_ether_destroy, NULL); #endif static void ether_input(struct ifnet *ifp, struct mbuf *m) { struct mbuf *mn; /* * The drivers are allowed to pass in a chain of packets linked with * m_nextpkt. We split them up into separate packets here and pass * them up. This allows the drivers to amortize the receive lock. */ while (m) { mn = m->m_nextpkt; m->m_nextpkt = NULL; /* * We will rely on rcvif being set properly in the deferred context, * so assert it is correct here. */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); KASSERT(m->m_pkthdr.rcvif == ifp, ("%s: ifnet mismatch m %p " "rcvif %p ifp %p", __func__, m, m->m_pkthdr.rcvif, ifp)); CURVNET_SET_QUIET(ifp->if_vnet); netisr_dispatch(NETISR_ETHER, m); CURVNET_RESTORE(); m = mn; } } /* * Upper layer processing for a received Ethernet packet. */ void ether_demux(struct ifnet *ifp, struct mbuf *m) { struct ether_header *eh; int i, isr; u_short ether_type; KASSERT(ifp != NULL, ("%s: NULL interface pointer", __func__)); /* Do not grab PROMISC frames in case we are re-entered. */ if (PFIL_HOOKED_IN(V_link_pfil_head) && !(m->m_flags & M_PROMISC)) { i = pfil_run_hooks(V_link_pfil_head, &m, ifp, PFIL_IN, NULL); if (i != 0 || m == NULL) return; } eh = mtod(m, struct ether_header *); ether_type = ntohs(eh->ether_type); /* * If this frame has a VLAN tag other than 0, call vlan_input() * if its module is loaded. Otherwise, drop. */ if ((m->m_flags & M_VLANTAG) && EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != 0) { if (ifp->if_vlantrunk == NULL) { if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } KASSERT(vlan_input_p != NULL,("%s: VLAN not loaded!", __func__)); /* Clear before possibly re-entering ether_input(). */ m->m_flags &= ~M_PROMISC; (*vlan_input_p)(ifp, m); return; } /* * Pass promiscuously received frames to the upper layer if the user * requested this by setting IFF_PPROMISC. Otherwise, drop them. */ if ((ifp->if_flags & IFF_PPROMISC) == 0 && (m->m_flags & M_PROMISC)) { m_freem(m); return; } /* * Reset layer specific mbuf flags to avoid confusing upper layers. * Strip off Ethernet header. */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); m_adj(m, ETHER_HDR_LEN); /* * Dispatch frame to upper layer. */ switch (ether_type) { #ifdef INET case ETHERTYPE_IP: isr = NETISR_IP; break; case ETHERTYPE_ARP: if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); return; } isr = NETISR_ARP; break; #endif #ifdef INET6 case ETHERTYPE_IPV6: isr = NETISR_IPV6; break; #endif default: goto discard; } netisr_dispatch(isr, m); return; discard: /* * Packet is to be discarded. If netgraph is present, * hand the packet to it for last chance processing; * otherwise dispose of it. */ if (ifp->if_l2com != NULL) { KASSERT(ng_ether_input_orphan_p != NULL, ("ng_ether_input_orphan_p is NULL")); /* * Put back the ethernet header so netgraph has a * consistent view of inbound packets. */ M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); (*ng_ether_input_orphan_p)(ifp, m); return; } m_freem(m); } /* * Convert Ethernet address to printable (loggable) representation. * This routine is for compatibility; it's better to just use * * printf("%6D", , ":"); * * since there's no static buffer involved. */ char * ether_sprintf(const u_char *ap) { static char etherbuf[18]; snprintf(etherbuf, sizeof (etherbuf), "%6D", ap, ":"); return (etherbuf); } /* * Perform common duties while attaching to interface list */ void ether_ifattach(struct ifnet *ifp, const u_int8_t *lla) { int i; struct ifaddr *ifa; struct sockaddr_dl *sdl; ifp->if_addrlen = ETHER_ADDR_LEN; ifp->if_hdrlen = ETHER_HDR_LEN; if_attach(ifp); ifp->if_mtu = ETHERMTU; ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Mbps(10); /* just a default */ ifp->if_broadcastaddr = etherbroadcastaddr; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_ETHER; sdl->sdl_alen = ifp->if_addrlen; bcopy(lla, LLADDR(sdl), ifp->if_addrlen); if (ifp->if_hw_addr != NULL) bcopy(lla, ifp->if_hw_addr, ifp->if_addrlen); bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); if (ng_ether_attach_p != NULL) (*ng_ether_attach_p)(ifp); /* Announce Ethernet MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Ethernet address: %6D\n", lla, ":"); uuid_ether_add(LLADDR(sdl)); /* Add necessary bits are setup; announce it now. */ EVENTHANDLER_INVOKE(ether_ifattach_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("ETHERNET", ifp->if_xname, "IFATTACH", NULL); } /* * Perform common duties while detaching an Ethernet interface */ void ether_ifdetach(struct ifnet *ifp) { struct sockaddr_dl *sdl; sdl = (struct sockaddr_dl *)(ifp->if_addr->ifa_addr); uuid_ether_del(LLADDR(sdl)); if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } bpfdetach(ifp); if_detach(ifp); } #ifdef VIMAGE void ether_reassign(struct ifnet *ifp, struct vnet *new_vnet, char *unused __unused) { if (ifp->if_l2com != NULL) { KASSERT(ng_ether_detach_p != NULL, ("ng_ether_detach_p is NULL")); (*ng_ether_detach_p)(ifp); } if (ng_ether_attach_p != NULL) { CURVNET_SET_QUIET(new_vnet); (*ng_ether_attach_p)(ifp); CURVNET_RESTORE(); } } #endif SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, IFT_ETHER, ether, CTLFLAG_RW, 0, "Ethernet"); #if 0 /* * This is for reference. We have a table-driven version * of the little-endian crc32 generator, which is faster * than the double-loop. */ uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { size_t i; uint32_t crc; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = (crc ^ data) & 1; crc >>= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_LE); } } return (crc); } #else uint32_t ether_crc32_le(const uint8_t *buf, size_t len) { static const uint32_t crctab[] = { 0x00000000, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c }; size_t i; uint32_t crc; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { crc ^= buf[i]; crc = (crc >> 4) ^ crctab[crc & 0xf]; crc = (crc >> 4) ^ crctab[crc & 0xf]; } return (crc); } #endif uint32_t ether_crc32_be(const uint8_t *buf, size_t len) { size_t i; uint32_t crc, carry; int bit; uint8_t data; crc = 0xffffffff; /* initial value */ for (i = 0; i < len; i++) { for (data = *buf++, bit = 0; bit < 8; bit++, data >>= 1) { carry = ((crc & 0x80000000) ? 1 : 0) ^ (data & 0x01); crc <<= 1; if (carry) crc = (crc ^ ETHER_CRC_POLY_BE) | carry; } } return (crc); } int ether_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ETHER_ADDR_LEN); break; case SIOCSIFMTU: /* * Set the interface MTU. */ if (ifr->ifr_mtu > ETHERMTU) { error = EINVAL; } else { ifp->if_mtu = ifr->ifr_mtu; } break; case SIOCSLANPCP: error = priv_check(curthread, PRIV_NET_SETLANPCP); if (error != 0) break; if (ifr->ifr_lan_pcp > 7 && ifr->ifr_lan_pcp != IFNET_PCP_NONE) { error = EINVAL; } else { ifp->if_pcp = ifr->ifr_lan_pcp; /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); } break; case SIOCGLANPCP: ifr->ifr_lan_pcp = ifp->if_pcp; break; default: error = EINVAL; /* XXX netbsd has ENOTTY??? */ break; } return (error); } static int ether_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif u_char *e_addr; switch(sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!ETHER_IS_MULTICAST(e_addr)) return EADDRNOTAVAIL; *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* * An IP6 address of 0 means listen to all * of the Ethernet multicast address used for IP6. * (This is used for multicast routers.) */ ifp->if_flags |= IFF_ALLMULTI; *llsa = NULL; return 0; } if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return EADDRNOTAVAIL; sdl = link_init_sdl(ifp, *llsa, IFT_ETHER); sdl->sdl_alen = ETHER_ADDR_LEN; e_addr = LLADDR(sdl); ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); *llsa = (struct sockaddr *)sdl; return 0; #endif default: /* * Well, the text isn't quite right, but it's the name * that counts... */ return EAFNOSUPPORT; } } static moduledata_t ether_mod = { .name = "ether", }; void ether_vlan_mtap(struct bpf_if *bp, struct mbuf *m, void *data, u_int dlen) { struct ether_vlan_header vlan; struct mbuf mv, mb; KASSERT((m->m_flags & M_VLANTAG) != 0, ("%s: vlan information not present", __func__)); KASSERT(m->m_len >= sizeof(struct ether_header), ("%s: mbuf not large enough for header", __func__)); bcopy(mtod(m, char *), &vlan, sizeof(struct ether_header)); vlan.evl_proto = vlan.evl_encap_proto; vlan.evl_encap_proto = htons(ETHERTYPE_VLAN); vlan.evl_tag = htons(m->m_pkthdr.ether_vtag); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * If a data link has been supplied by the caller, then we will need to * re-create a stack allocated mbuf chain with the following structure: * * (1) mbuf #1 will contain the supplied data link * (2) mbuf #2 will contain the vlan header * (3) mbuf #3 will contain the original mbuf's packet data * * Otherwise, submit the packet and vlan header via bpf_mtap2(). */ if (data != NULL) { mv.m_next = m; mv.m_data = (caddr_t)&vlan; mv.m_len = sizeof(vlan); mb.m_next = &mv; mb.m_data = data; mb.m_len = dlen; bpf_mtap(bp, &mb); } else bpf_mtap2(bp, &vlan, sizeof(vlan), m); m->m_len += sizeof(struct ether_header); m->m_data -= sizeof(struct ether_header); } struct mbuf * ether_vlanencap(struct mbuf *m, uint16_t tag) { struct ether_vlan_header *evl; M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT); if (m == NULL) return (NULL); /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ if (m->m_len < sizeof(*evl)) { m = m_pullup(m, sizeof(*evl)); if (m == NULL) return (NULL); } /* * Transform the Ethernet header into an Ethernet header * with 802.1Q encapsulation. */ evl = mtod(m, struct ether_vlan_header *); bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN, (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN); evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); return (m); } static SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW, 0, "IEEE 802.1Q VLAN"); static SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency"); VNET_DEFINE_STATIC(int, soft_pad); #define V_soft_pad VNET(soft_pad) SYSCTL_INT(_net_link_vlan, OID_AUTO, soft_pad, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(soft_pad), 0, "pad short frames before tagging"); /* * For now, make preserving PCP via an mbuf tag optional, as it increases * per-packet memory allocations and frees. In the future, it would be * preferable to reuse ether_vtag for this, or similar. */ int vlan_mtag_pcp = 0; SYSCTL_INT(_net_link_vlan, OID_AUTO, mtag_pcp, CTLFLAG_RW, &vlan_mtag_pcp, 0, "Retain VLAN PCP information as packets are passed up the stack"); bool ether_8021q_frame(struct mbuf **mp, struct ifnet *ife, struct ifnet *p, uint16_t vid, uint8_t pcp) { struct m_tag *mtag; int n; uint16_t tag; static const char pad[8]; /* just zeros */ /* * Pad the frame to the minimum size allowed if told to. * This option is in accord with IEEE Std 802.1Q, 2003 Ed., * paragraph C.4.4.3.b. It can help to work around buggy * bridges that violate paragraph C.4.4.3.a from the same * document, i.e., fail to pad short frames after untagging. * E.g., a tagged frame 66 bytes long (incl. FCS) is OK, but * untagging it will produce a 62-byte frame, which is a runt * and requires padding. There are VLAN-enabled network * devices that just discard such runts instead or mishandle * them somehow. */ if (V_soft_pad && p->if_type == IFT_ETHER) { for (n = ETHERMIN + ETHER_HDR_LEN - (*mp)->m_pkthdr.len; n > 0; n -= sizeof(pad)) { if (!m_append(*mp, min(n, sizeof(pad)), pad)) break; } if (n > 0) { m_freem(*mp); *mp = NULL; if_printf(ife, "cannot pad short frame"); return (false); } } /* * If underlying interface can do VLAN tag insertion itself, * just pass the packet along. However, we need some way to * tell the interface where the packet came from so that it * knows how to find the VLAN tag to use, so we attach a * packet tag that holds it. */ if (vlan_mtag_pcp && (mtag = m_tag_locate(*mp, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL)) != NULL) tag = EVL_MAKETAG(vid, *(uint8_t *)(mtag + 1), 0); else tag = EVL_MAKETAG(vid, pcp, 0); if (p->if_capenable & IFCAP_VLAN_HWTAGGING) { (*mp)->m_pkthdr.ether_vtag = tag; (*mp)->m_flags |= M_VLANTAG; } else { *mp = ether_vlanencap(*mp, tag); if (*mp == NULL) { if_printf(ife, "unable to prepend 802.1Q header"); return (false); } } return (true); } /* * Allocate an address from the FreeBSD Foundation OUI. This uses a * cryptographic hash function on the containing jail's UUID and the interface * name to attempt to provide a unique but stable address. Pseudo-interfaces * which require a MAC address should use this function to allocate * non-locally-administered addresses. */ void ether_gen_addr(struct ifnet *ifp, struct ether_addr *hwaddr) { #define ETHER_GEN_ADDR_BUFSIZ HOSTUUIDLEN + IFNAMSIZ + 2 SHA1_CTX ctx; char buf[ETHER_GEN_ADDR_BUFSIZ]; char uuid[HOSTUUIDLEN + 1]; uint64_t addr; int i, sz; char digest[SHA1_RESULTLEN]; getcredhostuuid(curthread->td_ucred, uuid, sizeof(uuid)); sz = snprintf(buf, ETHER_GEN_ADDR_BUFSIZ, "%s-%s", uuid, ifp->if_xname); SHA1Init(&ctx); SHA1Update(&ctx, buf, sz); SHA1Final(digest, &ctx); addr = ((digest[0] << 16) | (digest[1] << 8) | digest[2]) & OUI_FREEBSD_GENERATED_MASK; addr = OUI_FREEBSD(addr); for (i = 0; i < ETHER_ADDR_LEN; ++i) { hwaddr->octet[i] = addr >> ((ETHER_ADDR_LEN - i - 1) * 8) & 0xFF; } } DECLARE_MODULE(ether, ether_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(ether, 1); Index: head/sys/net/if_lagg.c =================================================================== --- head/sys/net/if_lagg.c (revision 348253) +++ head/sys/net/if_lagg.c (revision 348254) @@ -1,2278 +1,2371 @@ /* $OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $ */ /* * Copyright (c) 2005, 2006 Reyk Floeter * Copyright (c) 2007 Andrew Thompson * Copyright (c) 2014, 2016 Marcelo Araujo * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #endif #ifdef INET #include #include #endif #ifdef INET6 #include #include #include #endif #include #include #include #define LAGG_RLOCK() struct epoch_tracker lagg_et; epoch_enter_preempt(net_epoch_preempt, &lagg_et) #define LAGG_RUNLOCK() epoch_exit_preempt(net_epoch_preempt, &lagg_et) #define LAGG_RLOCK_ASSERT() MPASS(in_epoch(net_epoch_preempt)) #define LAGG_UNLOCK_ASSERT() MPASS(!in_epoch(net_epoch_preempt)) #define LAGG_SX_INIT(_sc) sx_init(&(_sc)->sc_sx, "if_lagg sx") #define LAGG_SX_DESTROY(_sc) sx_destroy(&(_sc)->sc_sx) #define LAGG_XLOCK(_sc) sx_xlock(&(_sc)->sc_sx) #define LAGG_XUNLOCK(_sc) sx_xunlock(&(_sc)->sc_sx) #define LAGG_SXLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_LOCKED) #define LAGG_XLOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_sx, SA_XLOCKED) /* Special flags we should propagate to the lagg ports. */ static struct { int flag; int (*func)(struct ifnet *, int); } lagg_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; +struct lagg_snd_tag { + struct m_snd_tag com; + struct m_snd_tag *tag; +}; + VNET_DEFINE(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */ #define V_lagg_list VNET(lagg_list) VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx); #define V_lagg_list_mtx VNET(lagg_list_mtx) #define LAGG_LIST_LOCK_INIT(x) mtx_init(&V_lagg_list_mtx, \ "if_lagg list", NULL, MTX_DEF) #define LAGG_LIST_LOCK_DESTROY(x) mtx_destroy(&V_lagg_list_mtx) #define LAGG_LIST_LOCK(x) mtx_lock(&V_lagg_list_mtx) #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; static int lagg_clone_create(struct if_clone *, int, caddr_t); static void lagg_clone_destroy(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner); #define V_lagg_cloner VNET(lagg_cloner) static const char laggname[] = "lagg"; static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface"); static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); static int lagg_port_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); static void lagg_port_ifdetach(void *arg __unused, struct ifnet *); #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *); #endif static void lagg_port2req(struct lagg_port *, struct lagg_reqport *); static void lagg_init(void *); static void lagg_stop(struct lagg_softc *); static int lagg_ioctl(struct ifnet *, u_long, caddr_t); #ifdef RATELIMIT static int lagg_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); +static int lagg_snd_tag_modify(struct m_snd_tag *, + union if_snd_tag_modify_params *); +static int lagg_snd_tag_query(struct m_snd_tag *, + union if_snd_tag_query_params *); static void lagg_snd_tag_free(struct m_snd_tag *); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); static int lagg_setcaps(struct lagg_port *, int cap); static int lagg_setflag(struct lagg_port *, int, int, int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); static int lagg_transmit(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); static struct lagg_port *lagg_link_active(struct lagg_softc *, struct lagg_port *); /* Simple round robin */ static void lagg_rr_attach(struct lagg_softc *); static int lagg_rr_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Active failover */ static int lagg_fail_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *); static void lagg_lb_detach(struct lagg_softc *); static int lagg_lb_port_create(struct lagg_port *); static void lagg_lb_port_destroy(struct lagg_port *); static int lagg_lb_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static int lagg_lb_porttable(struct lagg_softc *, struct lagg_port *); /* Broadcast */ static int lagg_bcast_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_bcast_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); /* 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *); static void lagg_lacp_detach(struct lagg_softc *); static int lagg_lacp_start(struct lagg_softc *, struct mbuf *); static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *, struct mbuf *); static void lagg_lacp_lladdr(struct lagg_softc *); /* lagg protocol table */ static const struct lagg_proto { lagg_proto pr_num; void (*pr_attach)(struct lagg_softc *); void (*pr_detach)(struct lagg_softc *); int (*pr_start)(struct lagg_softc *, struct mbuf *); struct mbuf * (*pr_input)(struct lagg_softc *, struct lagg_port *, struct mbuf *); int (*pr_addport)(struct lagg_port *); void (*pr_delport)(struct lagg_port *); void (*pr_linkstate)(struct lagg_port *); void (*pr_init)(struct lagg_softc *); void (*pr_stop)(struct lagg_softc *); void (*pr_lladdr)(struct lagg_softc *); void (*pr_request)(struct lagg_softc *, void *); void (*pr_portreq)(struct lagg_port *, void *); } lagg_protos[] = { { .pr_num = LAGG_PROTO_NONE }, { .pr_num = LAGG_PROTO_ROUNDROBIN, .pr_attach = lagg_rr_attach, .pr_start = lagg_rr_start, .pr_input = lagg_rr_input, }, { .pr_num = LAGG_PROTO_FAILOVER, .pr_start = lagg_fail_start, .pr_input = lagg_fail_input, }, { .pr_num = LAGG_PROTO_LOADBALANCE, .pr_attach = lagg_lb_attach, .pr_detach = lagg_lb_detach, .pr_start = lagg_lb_start, .pr_input = lagg_lb_input, .pr_addport = lagg_lb_port_create, .pr_delport = lagg_lb_port_destroy, }, { .pr_num = LAGG_PROTO_LACP, .pr_attach = lagg_lacp_attach, .pr_detach = lagg_lacp_detach, .pr_start = lagg_lacp_start, .pr_input = lagg_lacp_input, .pr_addport = lacp_port_create, .pr_delport = lacp_port_destroy, .pr_linkstate = lacp_linkstate, .pr_init = lacp_init, .pr_stop = lacp_stop, .pr_lladdr = lagg_lacp_lladdr, .pr_request = lacp_req, .pr_portreq = lacp_portreq, }, { .pr_num = LAGG_PROTO_BROADCAST, .pr_start = lagg_bcast_start, .pr_input = lagg_bcast_input, }, }; SYSCTL_DECL(_net_link); SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0, "Link Aggregation"); /* Allow input on any failover links */ VNET_DEFINE_STATIC(int, lagg_failover_rx_all); #define V_lagg_failover_rx_all VNET(lagg_failover_rx_all) SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(lagg_failover_rx_all), 0, "Accept input from any interface in a failover lagg"); /* Default value for using flowid */ VNET_DEFINE_STATIC(int, def_use_flowid) = 0; #define V_def_use_flowid VNET(def_use_flowid) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RWTUN, &VNET_NAME(def_use_flowid), 0, "Default setting for using flow id for load sharing"); /* Default value for using numa */ VNET_DEFINE_STATIC(int, def_use_numa) = 1; #define V_def_use_numa VNET(def_use_numa) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa, CTLFLAG_RWTUN, &VNET_NAME(def_use_numa), 0, "Use numa to steer flows"); /* Default value for flowid shift */ VNET_DEFINE_STATIC(int, def_flowid_shift) = 16; #define V_def_flowid_shift VNET(def_flowid_shift) SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift, CTLFLAG_RWTUN, &VNET_NAME(def_flowid_shift), 0, "Default setting for flowid shift for load sharing"); static void vnet_lagg_init(const void *unused __unused) { LAGG_LIST_LOCK_INIT(); SLIST_INIT(&V_lagg_list); V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, lagg_clone_destroy, 0); } VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_init, NULL); static void vnet_lagg_uninit(const void *unused __unused) { if_clone_detach(V_lagg_cloner); LAGG_LIST_LOCK_DESTROY(); } VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY, vnet_lagg_uninit, NULL); static int lagg_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: lagg_input_p = lagg_input; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, EVENTHANDLER_PRI_ANY); break; case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); lagg_input_p = NULL; lagg_linkstate_p = NULL; break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t lagg_mod = { "if_lagg", lagg_modevent, 0 }; DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); static void lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) { LAGG_XLOCK_ASSERT(sc); KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto", __func__, sc)); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "using proto %u\n", pr); if (lagg_protos[pr].pr_attach != NULL) lagg_protos[pr].pr_attach(sc); sc->sc_proto = pr; } static void lagg_proto_detach(struct lagg_softc *sc) { lagg_proto pr; LAGG_XLOCK_ASSERT(sc); pr = sc->sc_proto; sc->sc_proto = LAGG_PROTO_NONE; if (lagg_protos[pr].pr_detach != NULL) lagg_protos[pr].pr_detach(sc); } static int lagg_proto_start(struct lagg_softc *sc, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_start(sc, m)); } static struct mbuf * lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m)); } static int lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_addport == NULL) return (0); else return (lagg_protos[sc->sc_proto].pr_addport(lp)); } static void lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_delport != NULL) lagg_protos[sc->sc_proto].pr_delport(lp); } static void lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp) { if (lagg_protos[sc->sc_proto].pr_linkstate != NULL) lagg_protos[sc->sc_proto].pr_linkstate(lp); } static void lagg_proto_init(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_init != NULL) lagg_protos[sc->sc_proto].pr_init(sc); } static void lagg_proto_stop(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_stop != NULL) lagg_protos[sc->sc_proto].pr_stop(sc); } static void lagg_proto_lladdr(struct lagg_softc *sc) { if (lagg_protos[sc->sc_proto].pr_lladdr != NULL) lagg_protos[sc->sc_proto].pr_lladdr(sc); } static void lagg_proto_request(struct lagg_softc *sc, void *v) { if (lagg_protos[sc->sc_proto].pr_request != NULL) lagg_protos[sc->sc_proto].pr_request(sc, v); } static void lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v) { if (lagg_protos[sc->sc_proto].pr_portreq != NULL) lagg_protos[sc->sc_proto].pr_portreq(lp, v); } /* * This routine is run via an vlan * config EVENT */ static void lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag); LAGG_RUNLOCK(); } /* * This routine is run via an vlan * unconfig EVENT */ static void lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag) { struct lagg_softc *sc = ifp->if_softc; struct lagg_port *lp; if (ifp->if_softc != arg) /* Not our event */ return; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag); LAGG_RUNLOCK(); } static int lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) { struct lagg_softc *sc; struct ifnet *ifp; static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO); ifp = sc->sc_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { free(sc, M_LAGG); return (ENOSPC); } LAGG_SX_INIT(sc); LAGG_XLOCK(sc); if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; if (V_def_use_numa) sc->sc_opts |= LAGG_OPT_USE_NUMA; sc->flowid_shift = V_def_flowid_shift; /* Hash all layers by default */ sc->sc_flags = MBUF_HASHFLAG_L2|MBUF_HASHFLAG_L3|MBUF_HASHFLAG_L4; lagg_proto_attach(sc, LAGG_PROTO_DEFAULT); CK_SLIST_INIT(&sc->sc_ports); /* Initialise pseudo media types */ ifmedia_init(&sc->sc_media, 0, lagg_media_change, lagg_media_status); ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); if_initname(ifp, laggname, unit); ifp->if_softc = sc; ifp->if_transmit = lagg_transmit; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; ifp->if_get_counter = lagg_get_counter; ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST; #ifdef RATELIMIT ifp->if_snd_tag_alloc = lagg_snd_tag_alloc; + ifp->if_snd_tag_modify = lagg_snd_tag_modify; + ifp->if_snd_tag_query = lagg_snd_tag_query; ifp->if_snd_tag_free = lagg_snd_tag_free; #endif ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; /* * Attach as an ordinary ethernet device, children will be attached * as special device IFT_IEEE8023ADLAG. */ ether_ifattach(ifp, eaddr); sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); /* Insert into the global list of laggs */ LAGG_LIST_LOCK(); SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries); LAGG_LIST_UNLOCK(); LAGG_XUNLOCK(sc); return (0); } static void lagg_clone_destroy(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; LAGG_XLOCK(sc); sc->sc_destroying = 1; lagg_stop(sc); ifp->if_flags &= ~IFF_UP; EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach); EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach); /* Shutdown and remove lagg ports */ while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL) lagg_port_destroy(lp, 1); /* Unhook the aggregation protocol */ lagg_proto_detach(sc); LAGG_XUNLOCK(sc); ifmedia_removeall(&sc->sc_media); ether_ifdetach(ifp); if_free(ifp); LAGG_LIST_LOCK(); SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); LAGG_LIST_UNLOCK(); LAGG_SX_DESTROY(sc); free(sc, M_LAGG); } static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; int cap, ena, pena; uint64_t hwa; struct ifnet_hw_tsomax hw_tsomax; LAGG_XLOCK_ASSERT(sc); /* Get common enabled capabilities for the lagg ports */ ena = ~0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ena &= lp->lp_ifp->if_capenable; ena = (ena == ~0 ? 0 : ena); /* * Apply common enabled capabilities back to the lagg ports. * May require several iterations if they are dependent. */ do { pena = ena; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setcaps(lp, ena); ena &= lp->lp_ifp->if_capenable; } } while (pena != ena); /* Get other capabilities from the lagg ports */ cap = ~0; hwa = ~(uint64_t)0; memset(&hw_tsomax, 0, sizeof(hw_tsomax)); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { cap &= lp->lp_ifp->if_capabilities; hwa &= lp->lp_ifp->if_hwassist; if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax); } cap = (cap == ~0 ? 0 : cap); hwa = (hwa == ~(uint64_t)0 ? 0 : hwa); if (sc->sc_ifp->if_capabilities != cap || sc->sc_ifp->if_capenable != ena || sc->sc_ifp->if_hwassist != hwa || if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) { sc->sc_ifp->if_capabilities = cap; sc->sc_ifp->if_capenable = ena; sc->sc_ifp->if_hwassist = hwa; getmicrotime(&sc->sc_ifp->if_lastchange); if (sc->sc_ifflags & IFF_DEBUG) if_printf(sc->sc_ifp, "capabilities 0x%08x enabled 0x%08x\n", cap, ena); } } static int lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp) { struct lagg_softc *sc_ptr; struct lagg_port *lp, *tlp; struct ifreq ifr; int error, i, oldmtu; uint64_t *pval; LAGG_XLOCK_ASSERT(sc); if (sc->sc_ifp == ifp) { if_printf(sc->sc_ifp, "cannot add a lagg to itself as a port\n"); return (EINVAL); } /* Limit the maximal number of lagg ports */ if (sc->sc_count >= LAGG_MAX_PORTS) return (ENOSPC); /* Check if port has already been associated to a lagg */ if (ifp->if_lagg != NULL) { /* Port is already in the current lagg? */ lp = (struct lagg_port *)ifp->if_lagg; if (lp->lp_softc == sc) return (EEXIST); return (EBUSY); } /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) return (EPROTONOSUPPORT); /* Allow the first Ethernet member to define the MTU */ oldmtu = -1; if (CK_SLIST_EMPTY(&sc->sc_ports)) { sc->sc_ifp->if_mtu = ifp->if_mtu; } else if (sc->sc_ifp->if_mtu != ifp->if_mtu) { if (ifp->if_ioctl == NULL) { if_printf(sc->sc_ifp, "cannot change MTU for %s\n", ifp->if_xname); return (EINVAL); } oldmtu = ifp->if_mtu; strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_mtu = sc->sc_ifp->if_mtu; error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); if (error != 0) { if_printf(sc->sc_ifp, "invalid MTU for %s\n", ifp->if_xname); return (error); } ifr.ifr_mtu = oldmtu; } lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK|M_ZERO); lp->lp_softc = sc; /* Check if port is a stacked lagg */ LAGG_LIST_LOCK(); SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) { if (ifp == sc_ptr->sc_ifp) { LAGG_LIST_UNLOCK(); free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (EINVAL); /* XXX disable stacking for the moment, its untested */ #ifdef LAGG_PORT_STACKING lp->lp_flags |= LAGG_PORT_STACK; if (lagg_port_checkstacking(sc_ptr) >= LAGG_MAX_STACKING) { LAGG_LIST_UNLOCK(); free(lp, M_LAGG); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (E2BIG); } #endif } } LAGG_LIST_UNLOCK(); if_ref(ifp); lp->lp_ifp = ifp; bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); lp->lp_ifcapenable = ifp->if_capenable; if (CK_SLIST_EMPTY(&sc->sc_ports)) { bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } else { if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); } lagg_setflags(lp, 1); if (CK_SLIST_EMPTY(&sc->sc_ports)) sc->sc_primary = lp; /* Change the interface type */ lp->lp_iftype = ifp->if_type; ifp->if_type = IFT_IEEE8023ADLAG; ifp->if_lagg = lp; lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; lp->lp_output = ifp->if_output; ifp->if_output = lagg_port_output; /* Read port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) *pval = ifp->if_get_counter(ifp, i); /* * Insert into the list of ports. * Keep ports sorted by if_index. It is handy, when configuration * is predictable and `ifconfig laggN create ...` command * will lead to the same result each time. */ CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) { if (tlp->lp_ifp->if_index < ifp->if_index && ( CK_SLIST_NEXT(tlp, lp_entries) == NULL || ((struct lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index > ifp->if_index)) break; } if (tlp != NULL) CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries); else CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries); sc->sc_count++; lagg_setmulti(lp); if ((error = lagg_proto_addport(sc, lp)) != 0) { /* Remove the port, without calling pr_delport. */ lagg_port_destroy(lp, 0); if (oldmtu != -1) (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr); return (error); } /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } #ifdef LAGG_PORT_STACKING static int lagg_port_checkstacking(struct lagg_softc *sc) { struct lagg_softc *sc_ptr; struct lagg_port *lp; int m = 0; LAGG_SXLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_flags & LAGG_PORT_STACK) { sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc; m = MAX(m, lagg_port_checkstacking(sc_ptr)); } } return (m + 1); } #endif static void lagg_port_destroy_cb(epoch_context_t ec) { struct lagg_port *lp; struct ifnet *ifp; lp = __containerof(ec, struct lagg_port, lp_epoch_ctx); ifp = lp->lp_ifp; if_rele(ifp); free(lp, M_LAGG); } static int lagg_port_destroy(struct lagg_port *lp, int rundelport) { struct lagg_softc *sc = lp->lp_softc; struct lagg_port *lp_ptr, *lp0; struct ifnet *ifp = lp->lp_ifp; uint64_t *pval, vdiff; int i; LAGG_XLOCK_ASSERT(sc); if (rundelport) lagg_proto_delport(sc, lp); if (lp->lp_detaching == 0) lagg_clrmulti(lp); /* Restore interface */ ifp->if_type = lp->lp_iftype; ifp->if_ioctl = lp->lp_ioctl; ifp->if_output = lp->lp_output; ifp->if_lagg = NULL; /* Update detached port counters */ pval = lp->port_counters.val; for (i = 0; i < IFCOUNTERS; i++, pval++) { vdiff = ifp->if_get_counter(ifp, i) - *pval; sc->detached_counters.val[i] += vdiff; } /* Finally, remove the port from the lagg */ CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries); sc->sc_count--; /* Update the primary interface */ if (lp == sc->sc_primary) { uint8_t lladdr[ETHER_ADDR_LEN]; if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL) bzero(&lladdr, ETHER_ADDR_LEN); else bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN); sc->sc_primary = lp0; if (sc->sc_destroying == 0) { bcopy(lladdr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } /* * Update lladdr for each port (new primary needs update * as well, to switch from old lladdr to its 'real' one) */ CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) if_setlladdr(lp_ptr->lp_ifp, lladdr, ETHER_ADDR_LEN); } if (lp->lp_ifflags) if_printf(ifp, "%s: lp_ifflags unclean\n", __func__); if (lp->lp_detaching == 0) { lagg_setflags(lp, 0); lagg_setcaps(lp, lp->lp_ifcapenable); if_setlladdr(ifp, lp->lp_lladdr, ETHER_ADDR_LEN); } /* * free port and release it's ifnet reference after a grace period has * elapsed. */ epoch_call(net_epoch_preempt, &lp->lp_epoch_ctx, lagg_port_destroy_cb); /* Update lagg capabilities */ lagg_capabilities(sc); lagg_linkstate(sc); return (0); } static int lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_reqport *rp = (struct lagg_reqport *)data; struct lagg_softc *sc; struct lagg_port *lp = NULL; int error = 0; /* Should be checked by the caller */ if (ifp->if_type != IFT_IEEE8023ADLAG || (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) goto fallback; switch (cmd) { case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || ifunit(rp->rp_portname) != ifp) { error = EINVAL; break; } LAGG_RLOCK(); if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(); break; case SIOCSIFCAP: if (lp->lp_ioctl == NULL) { error = EINVAL; break; } error = (*lp->lp_ioctl)(ifp, cmd, data); if (error) break; /* Update lagg interface capabilities */ LAGG_XLOCK(sc); lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); break; case SIOCSIFMTU: /* Do not allow the MTU to be changed once joined */ error = EINVAL; break; default: goto fallback; } return (error); fallback: if (lp != NULL && lp->lp_ioctl != NULL) return ((*lp->lp_ioctl)(ifp, cmd, data)); return (EINVAL); } /* * Requests counter @cnt data. * * Counter value is calculated the following way: * 1) for each port, sum difference between current and "initial" measurements. * 2) add lagg logical interface counters. * 3) add data from detached_counters array. * * We also do the following things on ports attach/detach: * 1) On port attach we store all counters it has into port_counter array. * 2) On port detach we add the different between "initial" and * current counters data to detached_counters array. */ static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt) { struct lagg_softc *sc; struct lagg_port *lp; struct ifnet *lpifp; uint64_t newval, oldval, vsum; /* Revise this when we've got non-generic counters. */ KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt)); sc = (struct lagg_softc *)ifp->if_softc; vsum = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { /* Saved attached value */ oldval = lp->port_counters.val[cnt]; /* current value */ lpifp = lp->lp_ifp; newval = lpifp->if_get_counter(lpifp, cnt); /* Calculate diff and save new */ vsum += newval - oldval; } LAGG_RUNLOCK(); /* * Add counter data which might be added by upper * layer protocols operating on logical interface. */ vsum += if_get_counter_default(ifp, cnt); /* * Add counter data from detached ports counters */ vsum += sc->detached_counters.val[cnt]; return (vsum); } /* * For direct output to child ports. */ static int lagg_port_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { struct lagg_port *lp = ifp->if_lagg; switch (dst->sa_family) { case pseudo_AF_HDRCMPLT: case AF_UNSPEC: return ((*lp->lp_output)(ifp, m, dst, ro)); } /* drop any other frames */ m_freem(m); return (ENETDOWN); } static void lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp) { struct lagg_port *lp; struct lagg_softc *sc; if ((lp = ifp->if_lagg) == NULL) return; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; sc = lp->lp_softc; LAGG_XLOCK(sc); lp->lp_detaching = 1; lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(sc->sc_ifp); } static void lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp) { struct lagg_softc *sc = lp->lp_softc; strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname)); strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname)); rp->rp_prio = lp->lp_prio; rp->rp_flags = lp->lp_flags; lagg_proto_portreq(sc, lp, &rp->rp_psc); /* Add protocol specific flags */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: if (lp == sc->sc_primary) rp->rp_flags |= LAGG_PORT_MASTER; if (lp == lagg_link_active(sc, sc->sc_primary)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: if (LAGG_PORTACTIVE(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; break; case LAGG_PROTO_LACP: /* LACP has a different definition of active */ if (lacp_isactive(lp)) rp->rp_flags |= LAGG_PORT_ACTIVE; if (lacp_iscollecting(lp)) rp->rp_flags |= LAGG_PORT_COLLECTING; if (lacp_isdistributing(lp)) rp->rp_flags |= LAGG_PORT_DISTRIBUTING; break; } } static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; struct ifnet *ifp = sc->sc_ifp; struct lagg_port *lp; LAGG_XLOCK(sc); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { LAGG_XUNLOCK(sc); return; } ifp->if_drv_flags |= IFF_DRV_RUNNING; /* * Update the port lladdrs if needed. * This might be if_setlladdr() notification * that lladdr has been changed. */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp), ETHER_ADDR_LEN) != 0) if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN); } lagg_proto_init(sc); LAGG_XUNLOCK(sc); } static void lagg_stop(struct lagg_softc *sc) { struct ifnet *ifp = sc->sc_ifp; LAGG_XLOCK_ASSERT(sc); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) return; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; lagg_proto_stop(sc); } static int lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_reqall *ra = (struct lagg_reqall *)data; struct lagg_reqopts *ro = (struct lagg_reqopts *)data; struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf; struct lagg_reqflags *rf = (struct lagg_reqflags *)data; struct ifreq *ifr = (struct ifreq *)data; struct lagg_port *lp; struct ifnet *tpif; struct thread *td = curthread; char *buf, *outbuf; int count, buflen, len, error = 0; bzero(&rpbuf, sizeof(rpbuf)); switch (cmd) { case SIOCGLAGG: LAGG_XLOCK(sc); buflen = sc->sc_count * sizeof(struct lagg_reqport); outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); ra->ra_proto = sc->sc_proto; lagg_proto_request(sc, &ra->ra_psc); count = 0; buf = outbuf; len = min(ra->ra_size, buflen); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (len < sizeof(rpbuf)) break; lagg_port2req(lp, &rpbuf); memcpy(buf, &rpbuf, sizeof(rpbuf)); count++; buf += sizeof(rpbuf); len -= sizeof(rpbuf); } LAGG_XUNLOCK(sc); ra->ra_ports = count; ra->ra_size = count * sizeof(rpbuf); error = copyout(outbuf, ra->ra_port, ra->ra_size); free(outbuf, M_TEMP); break; case SIOCSLAGG: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ra->ra_proto >= LAGG_PROTO_MAX) { error = EPROTONOSUPPORT; break; } LAGG_XLOCK(sc); lagg_proto_detach(sc); LAGG_UNLOCK_ASSERT(); lagg_proto_attach(sc, ra->ra_proto); LAGG_XUNLOCK(sc); break; case SIOCGLAGGOPTS: LAGG_XLOCK(sc); ro->ro_opts = sc->sc_opts; if (sc->sc_proto == LAGG_PROTO_LACP) { struct lacp_softc *lsc; lsc = (struct lacp_softc *)sc->sc_psc; if (lsc->lsc_debug.lsc_tx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_TXTEST; if (lsc->lsc_debug.lsc_rx_test != 0) ro->ro_opts |= LAGG_OPT_LACP_RXTEST; if (lsc->lsc_strict_mode != 0) ro->ro_opts |= LAGG_OPT_LACP_STRICT; if (lsc->lsc_fast_timeout != 0) ro->ro_opts |= LAGG_OPT_LACP_TIMEOUT; ro->ro_active = sc->sc_active; } else { ro->ro_active = 0; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) ro->ro_active += LAGG_PORTACTIVE(lp); } ro->ro_bkt = sc->sc_bkt; ro->ro_flapping = sc->sc_flapping; ro->ro_flowid_shift = sc->flowid_shift; LAGG_XUNLOCK(sc); break; case SIOCSLAGGOPTS: if (sc->sc_proto == LAGG_PROTO_ROUNDROBIN) { if (ro->ro_bkt == 0) sc->sc_bkt = 1; // Minimum 1 packet per iface. else sc->sc_bkt = ro->ro_bkt; } error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (ro->ro_opts == 0) break; /* * Set options. LACP options are stored in sc->sc_psc, * not in sc_opts. */ int valid, lacp; switch (ro->ro_opts) { case LAGG_OPT_USE_FLOWID: case -LAGG_OPT_USE_FLOWID: case LAGG_OPT_USE_NUMA: case -LAGG_OPT_USE_NUMA: case LAGG_OPT_FLOWIDSHIFT: valid = 1; lacp = 0; break; case LAGG_OPT_LACP_TXTEST: case -LAGG_OPT_LACP_TXTEST: case LAGG_OPT_LACP_RXTEST: case -LAGG_OPT_LACP_RXTEST: case LAGG_OPT_LACP_STRICT: case -LAGG_OPT_LACP_STRICT: case LAGG_OPT_LACP_TIMEOUT: case -LAGG_OPT_LACP_TIMEOUT: valid = lacp = 1; break; default: valid = lacp = 0; break; } LAGG_XLOCK(sc); if (valid == 0 || (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) { /* Invalid combination of options specified. */ error = EINVAL; LAGG_XUNLOCK(sc); break; /* Return from SIOCSLAGGOPTS. */ } /* * Store new options into sc->sc_opts except for * FLOWIDSHIFT and LACP options. */ if (lacp == 0) { if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT) sc->flowid_shift = ro->ro_flowid_shift; else if (ro->ro_opts > 0) sc->sc_opts |= ro->ro_opts; else sc->sc_opts &= ~ro->ro_opts; } else { struct lacp_softc *lsc; struct lacp_port *lp; lsc = (struct lacp_softc *)sc->sc_psc; switch (ro->ro_opts) { case LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 1; break; case -LAGG_OPT_LACP_TXTEST: lsc->lsc_debug.lsc_tx_test = 0; break; case LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 1; break; case -LAGG_OPT_LACP_RXTEST: lsc->lsc_debug.lsc_rx_test = 0; break; case LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 1; break; case -LAGG_OPT_LACP_STRICT: lsc->lsc_strict_mode = 0; break; case LAGG_OPT_LACP_TIMEOUT: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state |= LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 1; break; case -LAGG_OPT_LACP_TIMEOUT: LACP_LOCK(lsc); LIST_FOREACH(lp, &lsc->lsc_ports, lp_next) lp->lp_state &= ~LACP_STATE_TIMEOUT; LACP_UNLOCK(lsc); lsc->lsc_fast_timeout = 0; break; } } LAGG_XUNLOCK(sc); break; case SIOCGLAGGFLAGS: rf->rf_flags = 0; LAGG_XLOCK(sc); if (sc->sc_flags & MBUF_HASHFLAG_L2) rf->rf_flags |= LAGG_F_HASHL2; if (sc->sc_flags & MBUF_HASHFLAG_L3) rf->rf_flags |= LAGG_F_HASHL3; if (sc->sc_flags & MBUF_HASHFLAG_L4) rf->rf_flags |= LAGG_F_HASHL4; LAGG_XUNLOCK(sc); break; case SIOCSLAGGHASH: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) { error = EINVAL; break; } LAGG_XLOCK(sc); sc->sc_flags = 0; if (rf->rf_flags & LAGG_F_HASHL2) sc->sc_flags |= MBUF_HASHFLAG_L2; if (rf->rf_flags & LAGG_F_HASHL3) sc->sc_flags |= MBUF_HASHFLAG_L3; if (rf->rf_flags & LAGG_F_HASHL4) sc->sc_flags |= MBUF_HASHFLAG_L4; LAGG_XUNLOCK(sc); break; case SIOCGLAGGPORT: if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_RLOCK(); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_RUNLOCK(); if_rele(tpif); break; } lagg_port2req(lp, rp); LAGG_RUNLOCK(); if_rele(tpif); break; case SIOCSLAGGPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } #ifdef INET6 /* * A laggport interface should not have inet6 address * because two interfaces with a valid link-local * scope zone must not be merged in any form. This * restriction is needed to prevent violation of * link-local scope zone. Attempts to add a laggport * interface which has inet6 addresses triggers * removal of all inet6 addresses on the member * interface. */ if (in6ifa_llaonifp(tpif)) { in6_ifdetach(tpif); if_printf(sc->sc_ifp, "IPv6 addresses on %s have been removed " "before adding it as a member to prevent " "IPv6 address scope violation.\n", tpif->if_xname); } #endif LAGG_XLOCK(sc); error = lagg_port_create(sc, tpif); LAGG_XUNLOCK(sc); if_rele(tpif); VLAN_CAPABILITIES(ifp); break; case SIOCSLAGGDELPORT: error = priv_check(td, PRIV_NET_LAGG); if (error) break; if (rp->rp_portname[0] == '\0' || (tpif = ifunit_ref(rp->rp_portname)) == NULL) { error = EINVAL; break; } LAGG_XLOCK(sc); if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL || lp->lp_softc != sc) { error = ENOENT; LAGG_XUNLOCK(sc); if_rele(tpif); break; } error = lagg_port_destroy(lp, 1); LAGG_XUNLOCK(sc); if_rele(tpif); VLAN_CAPABILITIES(ifp); break; case SIOCSIFFLAGS: /* Set flags on ports too */ LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_setflags(lp, 1); } if (!(ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked down and it is running, * then stop and disable it. */ lagg_stop(sc); LAGG_XUNLOCK(sc); } else if ((ifp->if_flags & IFF_UP) && !(ifp->if_drv_flags & IFF_DRV_RUNNING)) { /* * If interface is marked up and it is stopped, then * start it. */ LAGG_XUNLOCK(sc); (*ifp->if_init)(sc); } else LAGG_XUNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { lagg_clrmulti(lp); lagg_setmulti(lp); } LAGG_XUNLOCK(sc); error = 0; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } lagg_capabilities(sc); LAGG_XUNLOCK(sc); VLAN_CAPABILITIES(ifp); error = 0; break; case SIOCSIFMTU: LAGG_XLOCK(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); else error = EINVAL; if (error != 0) { if_printf(ifp, "failed to change MTU to %d on port %s, " "reverting all ports to original MTU (%d)\n", ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu); break; } } if (error == 0) { ifp->if_mtu = ifr->ifr_mtu; } else { /* set every port back to the original MTU */ ifr->ifr_mtu = ifp->if_mtu; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ioctl != NULL) (*lp->lp_ioctl)(lp->lp_ifp, cmd, data); } } LAGG_XUNLOCK(sc); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } #ifdef RATELIMIT -static int -lagg_snd_tag_alloc(struct ifnet *ifp, - union if_snd_tag_alloc_params *params, - struct m_snd_tag **ppmt) +static inline struct lagg_snd_tag * +mst_to_lst(struct m_snd_tag *mst) { - struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + + return (__containerof(mst, struct lagg_snd_tag, com)); +} + +/* + * Look up the port used by a specific flow. This only works for lagg + * protocols with deterministic port mappings (e.g. not roundrobin). + * In addition protocols which use a hash to map flows to ports must + * be configured to use the mbuf flowid rather than hashing packet + * contents. + */ +static struct lagg_port * +lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype) +{ + struct lagg_softc *sc; struct lagg_port *lp; struct lagg_lb *lb; uint32_t p; - LAGG_RLOCK(); + sc = ifp->if_softc; + switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: - lp = lagg_link_active(sc, sc->sc_primary); - break; + return (lagg_link_active(sc, sc->sc_primary)); case LAGG_PROTO_LOADBALANCE: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || - params->hdr.flowtype == M_HASHTYPE_NONE) { - LAGG_RUNLOCK(); - return (EOPNOTSUPP); - } - p = params->hdr.flowid >> sc->flowid_shift; + flowtype == M_HASHTYPE_NONE) + return (NULL); + p = flowid >> sc->flowid_shift; p %= sc->sc_count; lb = (struct lagg_lb *)sc->sc_psc; lp = lb->lb_ports[p]; - lp = lagg_link_active(sc, lp); - break; + return (lagg_link_active(sc, lp)); case LAGG_PROTO_LACP: if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 || - params->hdr.flowtype == M_HASHTYPE_NONE) { - LAGG_RUNLOCK(); - return (EOPNOTSUPP); - } - lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid); - break; + flowtype == M_HASHTYPE_NONE) + return (NULL); + return (lacp_select_tx_port_by_hash(sc, flowid)); default: + return (NULL); + } +} + +static int +lagg_snd_tag_alloc(struct ifnet *ifp, + union if_snd_tag_alloc_params *params, + struct m_snd_tag **ppmt) +{ + struct lagg_snd_tag *lst; + struct lagg_softc *sc; + struct lagg_port *lp; + struct ifnet *lp_ifp; + int error; + + sc = ifp->if_softc; + + LAGG_RLOCK(); + lp = lookup_snd_tag_port(ifp, params->hdr.flowid, params->hdr.flowtype); + if (lp == NULL) { LAGG_RUNLOCK(); return (EOPNOTSUPP); } - if (lp == NULL) { + if (lp->lp_ifp == NULL || lp->lp_ifp->if_snd_tag_alloc == NULL) { LAGG_RUNLOCK(); return (EOPNOTSUPP); } - ifp = lp->lp_ifp; + lp_ifp = lp->lp_ifp; + if_ref(lp_ifp); LAGG_RUNLOCK(); - if (ifp == NULL || ifp->if_snd_tag_alloc == NULL || - (ifp->if_capenable & IFCAP_TXRTLMT) == 0) - return (EOPNOTSUPP); - /* forward allocation request */ - return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); + lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT); + if (lst == NULL) { + if_rele(lp_ifp); + return (ENOMEM); + } + + error = lp_ifp->if_snd_tag_alloc(lp_ifp, params, &lst->tag); + if_rele(lp_ifp); + if (error) { + free(lst, M_LAGG); + return (error); + } + + m_snd_tag_init(&lst->com, ifp); + + *ppmt = &lst->com; + return (0); } +static int +lagg_snd_tag_modify(struct m_snd_tag *mst, + union if_snd_tag_modify_params *params) +{ + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + return (lst->tag->ifp->if_snd_tag_modify(lst->tag, params)); +} + +static int +lagg_snd_tag_query(struct m_snd_tag *mst, + union if_snd_tag_query_params *params) +{ + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + return (lst->tag->ifp->if_snd_tag_query(lst->tag, params)); +} + static void -lagg_snd_tag_free(struct m_snd_tag *tag) +lagg_snd_tag_free(struct m_snd_tag *mst) { - tag->ifp->if_snd_tag_free(tag); + struct lagg_snd_tag *lst; + + lst = mst_to_lst(mst); + m_snd_tag_rele(lst->tag); + free(lst, M_LAGG); } #endif static int lagg_setmulti(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *ifp = lp->lp_ifp; struct ifnet *scifp = sc->sc_ifp; struct lagg_mc *mc; struct ifmultiaddr *ifma; int error; IF_ADDR_WLOCK(scifp); CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(scifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp->if_index; mc->mc_ifma = NULL; SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries); } IF_ADDR_WUNLOCK(scifp); SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) { error = if_addmulti(ifp, (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma); if (error) return (error); } return (0); } static int lagg_clrmulti(struct lagg_port *lp) { struct lagg_mc *mc; LAGG_XLOCK_ASSERT(lp->lp_softc); while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) { SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries); if (mc->mc_ifma && lp->lp_detaching == 0) if_delmulti_ifma(mc->mc_ifma); free(mc, M_LAGG); } return (0); } static int lagg_setcaps(struct lagg_port *lp, int cap) { struct ifreq ifr; if (lp->lp_ifp->if_capenable == cap) return (0); if (lp->lp_ioctl == NULL) return (ENXIO); ifr.ifr_reqcap = cap; return ((*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr)); } /* Handle a ref counted flag that should be set on the lagg port as well */ static int lagg_setflag(struct lagg_port *lp, int flag, int status, int (*func)(struct ifnet *, int)) { struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; struct ifnet *ifp = lp->lp_ifp; int error; LAGG_XLOCK_ASSERT(sc); status = status ? (scifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded ports status is different from what * we want it to be. If it is, flip it. We record ports * status in lp_ifflags so that we won't clear ports flag * we haven't set. In fact, we don't clear or set ports * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual ports flags. */ if (status != (lp->lp_ifflags & flag)) { error = (*func)(ifp, status); if (error) return (error); lp->lp_ifflags &= ~flag; lp->lp_ifflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the lagg port * if "status" is true, update ports flags respective to the lagg * if "status" is false, forcedly clear the flags set on port. */ static int lagg_setflags(struct lagg_port *lp, int status) { int error, i; for (i = 0; lagg_pflags[i].flag; i++) { error = lagg_setflag(lp, lagg_pflags[i].flag, status, lagg_pflags[i].func); if (error) return (error); } return (0); } static int lagg_transmit(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; +#ifdef RATELIMIT + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) + MPASS(m->m_pkthdr.snd_tag->ifp == ifp); +#endif LAGG_RLOCK(); /* We need a Tx algorithm and at least one port */ if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { LAGG_RUNLOCK(); m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENXIO); } ETHER_BPF_MTAP(ifp, m); error = lagg_proto_start(sc, m); LAGG_RUNLOCK(); if (error != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (error); } /* * The ifp->if_qflush entry point for lagg(4) is no-op. */ static void lagg_qflush(struct ifnet *ifp __unused) { } static struct mbuf * lagg_input(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; struct ifnet *scifp = sc->sc_ifp; LAGG_RLOCK(); if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || lp->lp_detaching != 0 || sc->sc_proto == LAGG_PROTO_NONE) { LAGG_RUNLOCK(); m_freem(m); return (NULL); } ETHER_BPF_MTAP(scifp, m); m = lagg_proto_input(sc, lp, m); if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) { m_freem(m); m = NULL; } LAGG_RUNLOCK(); return (m); } static int lagg_media_change(struct ifnet *ifp) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; if (sc->sc_ifflags & IFF_DEBUG) printf("%s\n", __func__); /* Ignore */ return (0); } static void lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; imr->ifm_status = IFM_AVALID; imr->ifm_active = IFM_ETHER | IFM_AUTO; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp)) imr->ifm_status |= IFM_ACTIVE; } LAGG_RUNLOCK(); } static void lagg_linkstate(struct lagg_softc *sc) { struct lagg_port *lp; int new_link = LINK_STATE_DOWN; uint64_t speed; LAGG_XLOCK_ASSERT(sc); /* LACP handles link state itself */ if (sc->sc_proto == LAGG_PROTO_LACP) return; /* Our link is considered up if at least one of our ports is active */ LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (lp->lp_ifp->if_link_state == LINK_STATE_UP) { new_link = LINK_STATE_UP; break; } } LAGG_RUNLOCK(); if_link_state_change(sc->sc_ifp, new_link); /* Update if_baudrate to reflect the max possible speed */ switch (sc->sc_proto) { case LAGG_PROTO_FAILOVER: sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ? sc->sc_primary->lp_ifp->if_baudrate : 0; break; case LAGG_PROTO_ROUNDROBIN: case LAGG_PROTO_LOADBALANCE: case LAGG_PROTO_BROADCAST: speed = 0; LAGG_RLOCK(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) speed += lp->lp_ifp->if_baudrate; LAGG_RUNLOCK(); sc->sc_ifp->if_baudrate = speed; break; case LAGG_PROTO_LACP: /* LACP updates if_baudrate itself */ break; } } static void lagg_port_state(struct ifnet *ifp, int state) { struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg; struct lagg_softc *sc = NULL; if (lp != NULL) sc = lp->lp_softc; if (sc == NULL) return; LAGG_XLOCK(sc); lagg_linkstate(sc); lagg_proto_linkstate(sc, lp); LAGG_XUNLOCK(sc); } struct lagg_port * lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_port *lp_next, *rval = NULL; /* * Search a port which reports an active link state. */ /* * This is called with either LAGG_RLOCK() held or * LAGG_XLOCK(sc) held. */ if (!in_epoch(net_epoch_preempt)) LAGG_XLOCK_ASSERT(sc); if (lp == NULL) goto search; if (LAGG_PORTACTIVE(lp)) { rval = lp; goto found; } if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL && LAGG_PORTACTIVE(lp_next)) { rval = lp_next; goto found; } search: CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (LAGG_PORTACTIVE(lp_next)) { return (lp_next); } } found: return (rval); } int lagg_enqueue(struct ifnet *ifp, struct mbuf *m) { +#ifdef RATELIMIT + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { + struct lagg_snd_tag *lst; + struct m_snd_tag *mst; + + mst = m->m_pkthdr.snd_tag; + lst = mst_to_lst(mst); + if (lst->tag->ifp != ifp) { + m_freem(m); + return (EAGAIN); + } + m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag); + m_snd_tag_rele(mst); + } +#endif return (ifp->if_transmit)(ifp, m); } /* * Simple round robin aggregation */ static void lagg_rr_attach(struct lagg_softc *sc) { sc->sc_seq = 0; sc->sc_bkt_count = sc->sc_bkt; } static int lagg_rr_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; uint32_t p; if (sc->sc_bkt_count == 0 && sc->sc_bkt > 0) sc->sc_bkt_count = sc->sc_bkt; if (sc->sc_bkt > 0) { atomic_subtract_int(&sc->sc_bkt_count, 1); if (atomic_cmpset_int(&sc->sc_bkt_count, 0, sc->sc_bkt)) p = atomic_fetchadd_32(&sc->sc_seq, 1); else p = sc->sc_seq; } else p = atomic_fetchadd_32(&sc->sc_seq, 1); p %= sc->sc_count; lp = CK_SLIST_FIRST(&sc->sc_ports); while (p--) lp = CK_SLIST_NEXT(lp, lp_entries); /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Broadcast mode */ static int lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m) { int active_ports = 0; int errors = 0; int ret; struct lagg_port *lp, *last = NULL; struct mbuf *m0; LAGG_RLOCK_ASSERT(); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (!LAGG_PORTACTIVE(lp)) continue; active_ports++; if (last != NULL) { m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (m0 == NULL) { ret = ENOBUFS; errors++; break; } ret = lagg_enqueue(last->lp_ifp, m0); if (ret != 0) errors++; } last = lp; } if (last == NULL) { m_freem(m); return (ENOENT); } if ((last = lagg_link_active(sc, last)) == NULL) { m_freem(m); return (ENETDOWN); } ret = lagg_enqueue(last->lp_ifp, m); if (ret != 0) errors++; if (errors == 0) return (ret); return (0); } static struct mbuf* lagg_bcast_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * Active failover */ static int lagg_fail_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; /* Use the master port if active or the next available port */ if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct lagg_port *tmp_tp; if (lp == sc->sc_primary || V_lagg_failover_rx_all) { m->m_pkthdr.rcvif = ifp; return (m); } if (!LAGG_PORTACTIVE(sc->sc_primary)) { tmp_tp = lagg_link_active(sc, sc->sc_primary); /* * If tmp_tp is null, we've received a packet when all * our links are down. Weird, but process it anyways. */ if ((tmp_tp == NULL || tmp_tp == lp)) { m->m_pkthdr.rcvif = ifp; return (m); } } m_freem(m); return (NULL); } /* * Loadbalancing */ static void lagg_lb_attach(struct lagg_softc *sc) { struct lagg_port *lp; struct lagg_lb *lb; LAGG_XLOCK_ASSERT(sc); lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO); lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lagg_lb_port_create(lp); } static void lagg_lb_detach(struct lagg_softc *sc) { struct lagg_lb *lb; lb = (struct lagg_lb *)sc->sc_psc; if (lb != NULL) free(lb, M_LAGG); } static int lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp_next; int i = 0, rv; rv = 0; bzero(&lb->lb_ports, sizeof(lb->lb_ports)); LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) { if (lp_next == lp) continue; if (i >= LAGG_MAX_PORTS) { rv = EINVAL; break; } if (sc->sc_ifflags & IFF_DEBUG) printf("%s: port %s at index %d\n", sc->sc_ifname, lp_next->lp_ifp->if_xname, i); lb->lb_ports[i++] = lp_next; } return (rv); } static int lagg_lb_port_create(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; return (lagg_lb_porttable(sc, NULL)); } static void lagg_lb_port_destroy(struct lagg_port *lp) { struct lagg_softc *sc = lp->lp_softc; lagg_lb_porttable(sc, lp); } static int lagg_lb_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc; struct lagg_port *lp = NULL; uint32_t p = 0; if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; else p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key); p %= sc->sc_count; lp = lb->lb_ports[p]; /* * Check the port's link state. This will return the next active * port if the link is down or the port is NULL. */ if ((lp = lagg_link_active(sc, lp)) == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; /* Just pass in the packet to our lagg device */ m->m_pkthdr.rcvif = ifp; return (m); } /* * 802.3ad LACP */ static void lagg_lacp_attach(struct lagg_softc *sc) { struct lagg_port *lp; lacp_attach(sc); LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static void lagg_lacp_detach(struct lagg_softc *sc) { struct lagg_port *lp; void *psc; LAGG_XLOCK_ASSERT(sc); CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); psc = sc->sc_psc; sc->sc_psc = NULL; lacp_detach(psc); } static void lagg_lacp_lladdr(struct lagg_softc *sc) { struct lagg_port *lp; LAGG_SXLOCK_ASSERT(sc); /* purge all the lacp ports */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_destroy(lp); /* add them back in */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) lacp_port_create(lp); } static int lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m) { struct lagg_port *lp; lp = lacp_select_tx_port(sc, m); if (lp == NULL) { m_freem(m); return (ENETDOWN); } /* Send mbuf */ return (lagg_enqueue(lp->lp_ifp, m)); } static struct mbuf * lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m) { struct ifnet *ifp = sc->sc_ifp; struct ether_header *eh; u_short etype; eh = mtod(m, struct ether_header *); etype = ntohs(eh->ether_type); /* Tap off LACP control messages */ if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) { m = lacp_input(lp, m); if (m == NULL) return (NULL); } /* * If the port is not collecting or not in the active aggregator then * free and return. */ if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) { m_freem(m); return (NULL); } m->m_pkthdr.rcvif = ifp; return (m); } Index: head/sys/net/if_vlan.c =================================================================== --- head/sys/net/if_vlan.c (revision 348253) +++ head/sys/net/if_vlan.c (revision 348254) @@ -1,1945 +1,2038 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * Copyright 2012 ADARA Networks, Inc. * Copyright 2017 Dell EMC Isilon * * Portions of this software were developed by Robert N. M. Watson under * contract to ADARA Networks, Inc. * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * if_vlan.c - pseudo-device driver for IEEE 802.1Q virtual LANs. * This is sort of sneaky in the implementation, since * we need to pretend to be enough of an Ethernet implementation * to make arp work. The way we do this is by telling everyone * that we are an Ethernet, and then catch the packets that * ether_output() sends to us via if_transmit(), rewrite them for * use by the real outgoing interface, and ask it to send them. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_vlan.h" #include "opt_ratelimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif #define VLAN_DEF_HWIDTH 4 #define VLAN_IFFLAGS (IFF_BROADCAST | IFF_MULTICAST) #define UP_AND_RUNNING(ifp) \ ((ifp)->if_flags & IFF_UP && (ifp)->if_drv_flags & IFF_DRV_RUNNING) CK_SLIST_HEAD(ifvlanhead, ifvlan); struct ifvlantrunk { struct ifnet *parent; /* parent interface of this trunk */ struct mtx lock; #ifdef VLAN_ARRAY #define VLAN_ARRAY_SIZE (EVL_VLID_MASK + 1) struct ifvlan *vlans[VLAN_ARRAY_SIZE]; /* static table */ #else struct ifvlanhead *hash; /* dynamic hash-list table */ uint16_t hmask; uint16_t hwidth; #endif int refcnt; }; +#ifdef RATELIMIT +struct vlan_snd_tag { + struct m_snd_tag com; + struct m_snd_tag *tag; +}; + +static inline struct vlan_snd_tag * +mst_to_vst(struct m_snd_tag *mst) +{ + + return (__containerof(mst, struct vlan_snd_tag, com)); +} +#endif + /* * This macro provides a facility to iterate over every vlan on a trunk with * the assumption that none will be added/removed during iteration. */ #ifdef VLAN_ARRAY #define VLAN_FOREACH(_ifv, _trunk) \ size_t _i; \ for (_i = 0; _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i]) != NULL) #else /* VLAN_ARRAY */ #define VLAN_FOREACH(_ifv, _trunk) \ struct ifvlan *_next; \ size_t _i; \ for (_i = 0; _i < (1 << (_trunk)->hwidth); _i++) \ CK_SLIST_FOREACH_SAFE((_ifv), &(_trunk)->hash[_i], ifv_list, _next) #endif /* VLAN_ARRAY */ /* * This macro provides a facility to iterate over every vlan on a trunk while * also modifying the number of vlans on the trunk. The iteration continues * until some condition is met or there are no more vlans on the trunk. */ #ifdef VLAN_ARRAY /* The VLAN_ARRAY case is simple -- just a for loop using the condition. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ for (_i = 0; !(_cond) && _i < VLAN_ARRAY_SIZE; _i++) \ if (((_ifv) = (_trunk)->vlans[_i])) #else /* VLAN_ARRAY */ /* * The hash table case is more complicated. We allow for the hash table to be * modified (i.e. vlans removed) while we are iterating over it. To allow for * this we must restart the iteration every time we "touch" something during * the iteration, since removal will resize the hash table and invalidate our * current position. If acting on the touched element causes the trunk to be * emptied, then iteration also stops. */ #define VLAN_FOREACH_UNTIL_SAFE(_ifv, _trunk, _cond) \ size_t _i; \ bool _touch = false; \ for (_i = 0; \ !(_cond) && _i < (1 << (_trunk)->hwidth); \ _i = (_touch && ((_trunk) != NULL) ? 0 : _i + 1), _touch = false) \ if (((_ifv) = CK_SLIST_FIRST(&(_trunk)->hash[_i])) != NULL && \ (_touch = true)) #endif /* VLAN_ARRAY */ struct vlan_mc_entry { struct sockaddr_dl mc_addr; CK_SLIST_ENTRY(vlan_mc_entry) mc_entries; struct epoch_context mc_epoch_ctx; }; struct ifvlan { struct ifvlantrunk *ifv_trunk; struct ifnet *ifv_ifp; #define TRUNK(ifv) ((ifv)->ifv_trunk) #define PARENT(ifv) ((ifv)->ifv_trunk->parent) void *ifv_cookie; int ifv_pflags; /* special flags we have set on parent */ int ifv_capenable; int ifv_encaplen; /* encapsulation length */ int ifv_mtufudge; /* MTU fudged by this much */ int ifv_mintu; /* min transmission unit */ uint16_t ifv_proto; /* encapsulation ethertype */ uint16_t ifv_tag; /* tag to apply on packets leaving if */ uint16_t ifv_vid; /* VLAN ID */ uint8_t ifv_pcp; /* Priority Code Point (PCP). */ struct task lladdr_task; CK_SLIST_HEAD(, vlan_mc_entry) vlan_mc_listhead; #ifndef VLAN_ARRAY CK_SLIST_ENTRY(ifvlan) ifv_list; #endif }; /* Special flags we should propagate to parent. */ static struct { int flag; int (*func)(struct ifnet *, int); } vlan_pflags[] = { {IFF_PROMISC, ifpromisc}, {IFF_ALLMULTI, if_allmulti}, {0, NULL} }; extern int vlan_mtag_pcp; static const char vlanname[] = "vlan"; static MALLOC_DEFINE(M_VLAN, vlanname, "802.1Q Virtual LAN Interface"); static eventhandler_tag ifdetach_tag; static eventhandler_tag iflladdr_tag; /* * if_vlan uses two module-level synchronizations primitives to allow concurrent * modification of vlan interfaces and (mostly) allow for vlans to be destroyed * while they are being used for tx/rx. To accomplish this in a way that has * acceptable performance and cooperation with other parts of the network stack * there is a non-sleepable epoch(9) and an sx(9). * * The performance-sensitive paths that warrant using the epoch(9) are * vlan_transmit and vlan_input. Both have to check for the vlan interface's * existence using if_vlantrunk, and being in the network tx/rx paths the use * of an epoch(9) gives a measureable improvement in performance. * * The reason for having an sx(9) is mostly because there are still areas that * must be sleepable and also have safe concurrent access to a vlan interface. * Since the sx(9) exists, it is used by default in most paths unless sleeping * is not permitted, or if it is not clear whether sleeping is permitted. * */ #define _VLAN_SX_ID ifv_sx static struct sx _VLAN_SX_ID; #define VLAN_LOCKING_INIT() \ sx_init(&_VLAN_SX_ID, "vlan_sx") #define VLAN_LOCKING_DESTROY() \ sx_destroy(&_VLAN_SX_ID) #define VLAN_SLOCK() sx_slock(&_VLAN_SX_ID) #define VLAN_SUNLOCK() sx_sunlock(&_VLAN_SX_ID) #define VLAN_XLOCK() sx_xlock(&_VLAN_SX_ID) #define VLAN_XUNLOCK() sx_xunlock(&_VLAN_SX_ID) #define VLAN_SLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_SLOCKED) #define VLAN_XLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_XLOCKED) #define VLAN_SXLOCK_ASSERT() sx_assert(&_VLAN_SX_ID, SA_LOCKED) /* * We also have a per-trunk mutex that should be acquired when changing * its state. */ #define TRUNK_LOCK_INIT(trunk) mtx_init(&(trunk)->lock, vlanname, NULL, MTX_DEF) #define TRUNK_LOCK_DESTROY(trunk) mtx_destroy(&(trunk)->lock) #define TRUNK_WLOCK(trunk) mtx_lock(&(trunk)->lock) #define TRUNK_WUNLOCK(trunk) mtx_unlock(&(trunk)->lock) #define TRUNK_LOCK_ASSERT(trunk) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(trunk)->lock)) #define TRUNK_WLOCK_ASSERT(trunk) mtx_assert(&(trunk)->lock, MA_OWNED); /* * The VLAN_ARRAY substitutes the dynamic hash with a static array * with 4096 entries. In theory this can give a boost in processing, * however in practice it does not. Probably this is because the array * is too big to fit into CPU cache. */ #ifndef VLAN_ARRAY static void vlan_inithash(struct ifvlantrunk *trunk); static void vlan_freehash(struct ifvlantrunk *trunk); static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv); static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch); static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid); #endif static void trunk_destroy(struct ifvlantrunk *trunk); static void vlan_init(void *foo); static void vlan_input(struct ifnet *ifp, struct mbuf *m); static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr); #ifdef RATELIMIT static int vlan_snd_tag_alloc(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); -static void vlan_snd_tag_free(struct m_snd_tag *); +static int vlan_snd_tag_modify(struct m_snd_tag *, + union if_snd_tag_modify_params *); +static int vlan_snd_tag_query(struct m_snd_tag *, + union if_snd_tag_query_params *); +static void vlan_snd_tag_free(struct m_snd_tag *); #endif static void vlan_qflush(struct ifnet *ifp); static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)); static int vlan_setflags(struct ifnet *ifp, int status); static int vlan_setmulti(struct ifnet *ifp); static int vlan_transmit(struct ifnet *ifp, struct mbuf *m); static void vlan_unconfig(struct ifnet *ifp); static void vlan_unconfig_locked(struct ifnet *ifp, int departing); static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t tag); static void vlan_link_state(struct ifnet *ifp); static void vlan_capabilities(struct ifvlan *ifv); static void vlan_trunk_capabilities(struct ifnet *ifp); static struct ifnet *vlan_clone_match_ethervid(const char *, int *); static int vlan_clone_match(struct if_clone *, const char *); static int vlan_clone_create(struct if_clone *, char *, size_t, caddr_t); static int vlan_clone_destroy(struct if_clone *, struct ifnet *); static void vlan_ifdetach(void *arg, struct ifnet *ifp); static void vlan_iflladdr(void *arg, struct ifnet *ifp); static void vlan_lladdr_fn(void *arg, int pending); static struct if_clone *vlan_cloner; #ifdef VIMAGE VNET_DEFINE_STATIC(struct if_clone *, vlan_cloner); #define V_vlan_cloner VNET(vlan_cloner) #endif static void vlan_mc_free(struct epoch_context *ctx) { struct vlan_mc_entry *mc = __containerof(ctx, struct vlan_mc_entry, mc_epoch_ctx); free(mc, M_VLAN); } #ifndef VLAN_ARRAY #define HASH(n, m) ((((n) >> 8) ^ ((n) >> 4) ^ (n)) & (m)) static void vlan_inithash(struct ifvlantrunk *trunk) { int i, n; /* * The trunk must not be locked here since we call malloc(M_WAITOK). * It is OK in case this function is called before the trunk struct * gets hooked up and becomes visible from other threads. */ KASSERT(trunk->hwidth == 0 && trunk->hash == NULL, ("%s: hash already initialized", __func__)); trunk->hwidth = VLAN_DEF_HWIDTH; n = 1 << trunk->hwidth; trunk->hmask = n - 1; trunk->hash = malloc(sizeof(struct ifvlanhead) * n, M_VLAN, M_WAITOK); for (i = 0; i < n; i++) CK_SLIST_INIT(&trunk->hash[i]); } static void vlan_freehash(struct ifvlantrunk *trunk) { #ifdef INVARIANTS int i; KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); for (i = 0; i < (1 << trunk->hwidth); i++) KASSERT(CK_SLIST_EMPTY(&trunk->hash[i]), ("%s: hash table not empty", __func__)); #endif free(trunk->hash, M_VLAN); trunk->hash = NULL; trunk->hwidth = trunk->hmask = 0; } static int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv->ifv_vid == ifv2->ifv_vid) return (EEXIST); /* * Grow the hash when the number of vlans exceeds half of the number of * hash buckets squared. This will make the average linked-list length * buckets/2. */ if (trunk->refcnt > (b * b) / 2) { vlan_growhash(trunk, 1); i = HASH(ifv->ifv_vid, trunk->hmask); } CK_SLIST_INSERT_HEAD(&trunk->hash[i], ifv, ifv_list); trunk->refcnt++; return (0); } static int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { int i, b; struct ifvlan *ifv2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); b = 1 << trunk->hwidth; i = HASH(ifv->ifv_vid, trunk->hmask); CK_SLIST_FOREACH(ifv2, &trunk->hash[i], ifv_list) if (ifv2 == ifv) { trunk->refcnt--; CK_SLIST_REMOVE(&trunk->hash[i], ifv2, ifvlan, ifv_list); if (trunk->refcnt < (b * b) / 2) vlan_growhash(trunk, -1); return (0); } panic("%s: vlan not found\n", __func__); return (ENOENT); /*NOTREACHED*/ } /* * Grow the hash larger or smaller if memory permits. */ static void vlan_growhash(struct ifvlantrunk *trunk, int howmuch) { struct ifvlan *ifv; struct ifvlanhead *hash2; int hwidth2, i, j, n, n2; VLAN_XLOCK_ASSERT(); KASSERT(trunk->hwidth > 0, ("%s: hwidth not positive", __func__)); if (howmuch == 0) { /* Harmless yet obvious coding error */ printf("%s: howmuch is 0\n", __func__); return; } hwidth2 = trunk->hwidth + howmuch; n = 1 << trunk->hwidth; n2 = 1 << hwidth2; /* Do not shrink the table below the default */ if (hwidth2 < VLAN_DEF_HWIDTH) return; hash2 = malloc(sizeof(struct ifvlanhead) * n2, M_VLAN, M_WAITOK); if (hash2 == NULL) { printf("%s: out of memory -- hash size not changed\n", __func__); return; /* We can live with the old hash table */ } for (j = 0; j < n2; j++) CK_SLIST_INIT(&hash2[j]); for (i = 0; i < n; i++) while ((ifv = CK_SLIST_FIRST(&trunk->hash[i])) != NULL) { CK_SLIST_REMOVE(&trunk->hash[i], ifv, ifvlan, ifv_list); j = HASH(ifv->ifv_vid, n2 - 1); CK_SLIST_INSERT_HEAD(&hash2[j], ifv, ifv_list); } NET_EPOCH_WAIT(); free(trunk->hash, M_VLAN); trunk->hash = hash2; trunk->hwidth = hwidth2; trunk->hmask = n2 - 1; if (bootverbose) if_printf(trunk->parent, "VLAN hash table resized from %d to %d buckets\n", n, n2); } static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { struct ifvlan *ifv; NET_EPOCH_ASSERT(); CK_SLIST_FOREACH(ifv, &trunk->hash[HASH(vid, trunk->hmask)], ifv_list) if (ifv->ifv_vid == vid) return (ifv); return (NULL); } #if 0 /* Debugging code to view the hashtables. */ static void vlan_dumphash(struct ifvlantrunk *trunk) { int i; struct ifvlan *ifv; for (i = 0; i < (1 << trunk->hwidth); i++) { printf("%d: ", i); CK_SLIST_FOREACH(ifv, &trunk->hash[i], ifv_list) printf("%s ", ifv->ifv_ifp->if_xname); printf("\n"); } } #endif /* 0 */ #else static __inline struct ifvlan * vlan_gethash(struct ifvlantrunk *trunk, uint16_t vid) { return trunk->vlans[vid]; } static __inline int vlan_inshash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { if (trunk->vlans[ifv->ifv_vid] != NULL) return EEXIST; trunk->vlans[ifv->ifv_vid] = ifv; trunk->refcnt++; return (0); } static __inline int vlan_remhash(struct ifvlantrunk *trunk, struct ifvlan *ifv) { trunk->vlans[ifv->ifv_vid] = NULL; trunk->refcnt--; return (0); } static __inline void vlan_freehash(struct ifvlantrunk *trunk) { } static __inline void vlan_inithash(struct ifvlantrunk *trunk) { } #endif /* !VLAN_ARRAY */ static void trunk_destroy(struct ifvlantrunk *trunk) { VLAN_XLOCK_ASSERT(); vlan_freehash(trunk); trunk->parent->if_vlantrunk = NULL; TRUNK_LOCK_DESTROY(trunk); if_rele(trunk->parent); free(trunk, M_VLAN); } /* * Program our multicast filter. What we're actually doing is * programming the multicast filter of the parent. This has the * side effect of causing the parent interface to receive multicast * traffic that it doesn't really want, which ends up being discarded * later by the upper protocol layers. Unfortunately, there's no way * to avoid this: there really is only one physical interface. */ static int vlan_setmulti(struct ifnet *ifp) { struct ifnet *ifp_p; struct ifmultiaddr *ifma; struct ifvlan *sc; struct vlan_mc_entry *mc; int error; VLAN_XLOCK_ASSERT(); /* Find the parent. */ sc = ifp->if_softc; ifp_p = PARENT(sc); CURVNET_SET_QUIET(ifp_p->if_vnet); /* First, remove any existing filter entries. */ while ((mc = CK_SLIST_FIRST(&sc->vlan_mc_listhead)) != NULL) { CK_SLIST_REMOVE_HEAD(&sc->vlan_mc_listhead, mc_entries); (void)if_delmulti(ifp_p, (struct sockaddr *)&mc->mc_addr); epoch_call(net_epoch_preempt, &mc->mc_epoch_ctx, vlan_mc_free); } /* Now program new ones. */ IF_ADDR_WLOCK(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; mc = malloc(sizeof(struct vlan_mc_entry), M_VLAN, M_NOWAIT); if (mc == NULL) { IF_ADDR_WUNLOCK(ifp); return (ENOMEM); } bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len); mc->mc_addr.sdl_index = ifp_p->if_index; CK_SLIST_INSERT_HEAD(&sc->vlan_mc_listhead, mc, mc_entries); } IF_ADDR_WUNLOCK(ifp); CK_SLIST_FOREACH (mc, &sc->vlan_mc_listhead, mc_entries) { error = if_addmulti(ifp_p, (struct sockaddr *)&mc->mc_addr, NULL); if (error) return (error); } CURVNET_RESTORE(); return (0); } /* * A handler for parent interface link layer address changes. * If the parent interface link layer address is changed we * should also change it on all children vlans. */ static void vlan_iflladdr(void *arg __unused, struct ifnet *ifp) { struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *ifv_ifp; struct ifvlantrunk *trunk; struct sockaddr_dl *sdl; /* Need the epoch since this is run on taskqueue_swi. */ NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } /* * OK, it's a trunk. Loop over and change all vlan's lladdrs on it. * We need an exclusive lock here to prevent concurrent SIOCSIFLLADDR * ioctl calls on the parent garbling the lladdr of the child vlan. */ TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { /* * Copy new new lladdr into the ifv_ifp, enqueue a task * to actually call if_setlladdr. if_setlladdr needs to * be deferred to a taskqueue because it will call into * the if_vlan ioctl path and try to acquire the global * lock. */ ifv_ifp = ifv->ifv_ifp; bcopy(IF_LLADDR(ifp), IF_LLADDR(ifv_ifp), ifp->if_addrlen); sdl = (struct sockaddr_dl *)ifv_ifp->if_addr->ifa_addr; sdl->sdl_alen = ifp->if_addrlen; taskqueue_enqueue(taskqueue_thread, &ifv->lladdr_task); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } /* * A handler for network interface departure events. * Track departure of trunks here so that we don't access invalid * pointers or whatever if a trunk is ripped from under us, e.g., * by ejecting its hot-plug card. However, if an ifnet is simply * being renamed, then there's no need to tear down the state. */ static void vlan_ifdetach(void *arg __unused, struct ifnet *ifp) { struct ifvlan *ifv; struct ifvlantrunk *trunk; /* If the ifnet is just being renamed, don't do anything. */ if (ifp->if_flags & IFF_RENAMING) return; VLAN_XLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_XUNLOCK(); return; } /* * OK, it's a trunk. Loop over and detach all vlan's on it. * Check trunk pointer after each vlan_unconfig() as it will * free it and set to NULL after the last vlan was detached. */ VLAN_FOREACH_UNTIL_SAFE(ifv, ifp->if_vlantrunk, ifp->if_vlantrunk == NULL) vlan_unconfig_locked(ifv->ifv_ifp, 1); /* Trunk should have been destroyed in vlan_unconfig(). */ KASSERT(ifp->if_vlantrunk == NULL, ("%s: purge failed", __func__)); VLAN_XUNLOCK(); } /* * Return the trunk device for a virtual interface. */ static struct ifnet * vlan_trunkdev(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); NET_EPOCH_ENTER(et); ifv = ifp->if_softc; ifp = NULL; if (ifv->ifv_trunk) ifp = PARENT(ifv); NET_EPOCH_EXIT(et); return (ifp); } /* * Return the 12-bit VLAN VID for this interface, for use by external * components such as Infiniband. * * XXXRW: Note that the function name here is historical; it should be named * vlan_vid(). */ static int vlan_tag(struct ifnet *ifp, uint16_t *vidp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *vidp = ifv->ifv_vid; return (0); } static int vlan_pcp(struct ifnet *ifp, uint16_t *pcpp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; *pcpp = ifv->ifv_pcp; return (0); } /* * Return a driver specific cookie for this interface. Synchronization * with setcookie must be provided by the driver. */ static void * vlan_cookie(struct ifnet *ifp) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (NULL); ifv = ifp->if_softc; return (ifv->ifv_cookie); } /* * Store a cookie in our softc that drivers can use to store driver * private per-instance data in. */ static int vlan_setcookie(struct ifnet *ifp, void *cookie) { struct ifvlan *ifv; if (ifp->if_type != IFT_L2VLAN) return (EINVAL); ifv = ifp->if_softc; ifv->ifv_cookie = cookie; return (0); } /* * Return the vlan device present at the specific VID. */ static struct ifnet * vlan_devat(struct ifnet *ifp, uint16_t vid) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return (NULL); } ifp = NULL; ifv = vlan_gethash(trunk, vid); if (ifv) ifp = ifv->ifv_ifp; NET_EPOCH_EXIT(et); return (ifp); } /* * Recalculate the cached VLAN tag exposed via the MIB. */ static void vlan_tag_recalculate(struct ifvlan *ifv) { ifv->ifv_tag = EVL_MAKETAG(ifv->ifv_vid, ifv->ifv_pcp, 0); } /* * VLAN support can be loaded as a module. The only place in the * system that's intimately aware of this is ether_input. We hook * into this code through vlan_input_p which is defined there and * set here. No one else in the system should be aware of this so * we use an explicit reference here. */ extern void (*vlan_input_p)(struct ifnet *, struct mbuf *); /* For if_link_state_change() eyes only... */ extern void (*vlan_link_state_p)(struct ifnet *); static int vlan_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: ifdetach_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, vlan_ifdetach, NULL, EVENTHANDLER_PRI_ANY); if (ifdetach_tag == NULL) return (ENOMEM); iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, vlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY); if (iflladdr_tag == NULL) return (ENOMEM); VLAN_LOCKING_INIT(); vlan_input_p = vlan_input; vlan_link_state_p = vlan_link_state; vlan_trunk_cap_p = vlan_trunk_capabilities; vlan_trunkdev_p = vlan_trunkdev; vlan_cookie_p = vlan_cookie; vlan_setcookie_p = vlan_setcookie; vlan_tag_p = vlan_tag; vlan_pcp_p = vlan_pcp; vlan_devat_p = vlan_devat; #ifndef VIMAGE vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); #endif if (bootverbose) printf("vlan: initialized, using " #ifdef VLAN_ARRAY "full-size arrays" #else "hash tables with chaining" #endif "\n"); break; case MOD_UNLOAD: #ifndef VIMAGE if_clone_detach(vlan_cloner); #endif EVENTHANDLER_DEREGISTER(ifnet_departure_event, ifdetach_tag); EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_tag); vlan_input_p = NULL; vlan_link_state_p = NULL; vlan_trunk_cap_p = NULL; vlan_trunkdev_p = NULL; vlan_tag_p = NULL; vlan_cookie_p = NULL; vlan_setcookie_p = NULL; vlan_devat_p = NULL; VLAN_LOCKING_DESTROY(); if (bootverbose) printf("vlan: unloaded\n"); break; default: return (EOPNOTSUPP); } return (0); } static moduledata_t vlan_mod = { "if_vlan", vlan_modevent, 0 }; DECLARE_MODULE(if_vlan, vlan_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_vlan, 3); #ifdef VIMAGE static void vnet_vlan_init(const void *unused __unused) { vlan_cloner = if_clone_advanced(vlanname, 0, vlan_clone_match, vlan_clone_create, vlan_clone_destroy); V_vlan_cloner = vlan_cloner; } VNET_SYSINIT(vnet_vlan_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_vlan_init, NULL); static void vnet_vlan_uninit(const void *unused __unused) { if_clone_detach(V_vlan_cloner); } VNET_SYSUNINIT(vnet_vlan_uninit, SI_SUB_INIT_IF, SI_ORDER_FIRST, vnet_vlan_uninit, NULL); #endif /* * Check for . style interface names. */ static struct ifnet * vlan_clone_match_ethervid(const char *name, int *vidp) { char ifname[IFNAMSIZ]; char *cp; struct ifnet *ifp; int vid; strlcpy(ifname, name, IFNAMSIZ); if ((cp = strchr(ifname, '.')) == NULL) return (NULL); *cp = '\0'; if ((ifp = ifunit_ref(ifname)) == NULL) return (NULL); /* Parse VID. */ if (*++cp == '\0') { if_rele(ifp); return (NULL); } vid = 0; for(; *cp >= '0' && *cp <= '9'; cp++) vid = (vid * 10) + (*cp - '0'); if (*cp != '\0') { if_rele(ifp); return (NULL); } if (vidp != NULL) *vidp = vid; return (ifp); } static int vlan_clone_match(struct if_clone *ifc, const char *name) { const char *cp; if (vlan_clone_match_ethervid(name, NULL) != NULL) return (1); if (strncmp(vlanname, name, strlen(vlanname)) != 0) return (0); for (cp = name + 4; *cp != '\0'; cp++) { if (*cp < '0' || *cp > '9') return (0); } return (1); } static int vlan_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { char *dp; int wildcard; int unit; int error; int vid; struct ifvlan *ifv; struct ifnet *ifp; struct ifnet *p; struct ifaddr *ifa; struct sockaddr_dl *sdl; struct vlanreq vlr; static const u_char eaddr[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ /* * There are 3 (ugh) ways to specify the cloned device: * o pass a parameter block with the clone request. * o specify parameters in the text of the clone device name * o specify no parameters and get an unattached device that * must be configured separately. * The first technique is preferred; the latter two are * supported for backwards compatibility. * * XXXRW: Note historic use of the word "tag" here. New ioctls may be * called for. */ if (params) { error = copyin(params, &vlr, sizeof(vlr)); if (error) return error; p = ifunit_ref(vlr.vlr_parent); if (p == NULL) return (ENXIO); error = ifc_name2unit(name, &unit); if (error != 0) { if_rele(p); return (error); } vid = vlr.vlr_tag; wildcard = (unit < 0); } else if ((p = vlan_clone_match_ethervid(name, &vid)) != NULL) { unit = -1; wildcard = 0; } else { p = NULL; error = ifc_name2unit(name, &unit); if (error != 0) return (error); wildcard = (unit < 0); } error = ifc_alloc_unit(ifc, &unit); if (error != 0) { if (p != NULL) if_rele(p); return (error); } /* In the wildcard case, we need to update the name. */ if (wildcard) { for (dp = name; *dp != '\0'; dp++); if (snprintf(dp, len - (dp-name), "%d", unit) > len - (dp-name) - 1) { panic("%s: interface name too long", __func__); } } ifv = malloc(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO); ifp = ifv->ifv_ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { ifc_free_unit(ifc, unit); free(ifv, M_VLAN); if (p != NULL) if_rele(p); return (ENOSPC); } CK_SLIST_INIT(&ifv->vlan_mc_listhead); ifp->if_softc = ifv; /* * Set the name manually rather than using if_initname because * we don't conform to the default naming convention for interfaces. */ strlcpy(ifp->if_xname, name, IFNAMSIZ); ifp->if_dname = vlanname; ifp->if_dunit = unit; ifp->if_init = vlan_init; ifp->if_transmit = vlan_transmit; ifp->if_qflush = vlan_qflush; ifp->if_ioctl = vlan_ioctl; #ifdef RATELIMIT ifp->if_snd_tag_alloc = vlan_snd_tag_alloc; + ifp->if_snd_tag_modify = vlan_snd_tag_modify; + ifp->if_snd_tag_query = vlan_snd_tag_query; ifp->if_snd_tag_free = vlan_snd_tag_free; #endif ifp->if_flags = VLAN_IFFLAGS; ether_ifattach(ifp, eaddr); /* Now undo some of the damage... */ ifp->if_baudrate = 0; ifp->if_type = IFT_L2VLAN; ifp->if_hdrlen = ETHER_VLAN_ENCAP_LEN; ifa = ifp->if_addr; sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_L2VLAN; if (p != NULL) { error = vlan_config(ifv, p, vid); if_rele(p); if (error != 0) { /* * Since we've partially failed, we need to back * out all the way, otherwise userland could get * confused. Thus, we destroy the interface. */ ether_ifdetach(ifp); vlan_unconfig(ifp); if_free(ifp); ifc_free_unit(ifc, unit); free(ifv, M_VLAN); return (error); } } return (0); } static int vlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { struct ifvlan *ifv = ifp->if_softc; int unit = ifp->if_dunit; ether_ifdetach(ifp); /* first, remove it from system-wide lists */ vlan_unconfig(ifp); /* now it can be unconfigured and freed */ /* * We should have the only reference to the ifv now, so we can now * drain any remaining lladdr task before freeing the ifnet and the * ifvlan. */ taskqueue_drain(taskqueue_thread, &ifv->lladdr_task); NET_EPOCH_WAIT(); if_free(ifp); free(ifv, M_VLAN); ifc_free_unit(ifc, unit); return (0); } /* * The ifp->if_init entry point for vlan(4) is a no-op. */ static void vlan_init(void *foo __unused) { } /* * The if_transmit method for vlan(4) interface. */ static int vlan_transmit(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; struct ifvlan *ifv; struct ifnet *p; int error, len, mcast; NET_EPOCH_ENTER(et); ifv = ifp->if_softc; if (TRUNK(ifv) == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } p = PARENT(ifv); len = m->m_pkthdr.len; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; BPF_MTAP(ifp, m); +#ifdef RATELIMIT + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) { + struct vlan_snd_tag *vst; + struct m_snd_tag *mst; + + MPASS(m->m_pkthdr.snd_tag->ifp == ifp); + mst = m->m_pkthdr.snd_tag; + vst = mst_to_vst(mst); + if (vst->tag->ifp != p) { + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + NET_EPOCH_EXIT(et); + m_freem(m); + return (EAGAIN); + } + + m->m_pkthdr.snd_tag = m_snd_tag_ref(vst->tag); + m_snd_tag_rele(mst); + } +#endif + /* * Do not run parent's if_transmit() if the parent is not up, * or parent's driver will cause a system crash. */ if (!UP_AND_RUNNING(p)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); NET_EPOCH_EXIT(et); m_freem(m); return (ENETDOWN); } if (!ether_8021q_frame(&m, ifp, p, ifv->ifv_vid, ifv->ifv_pcp)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); NET_EPOCH_EXIT(et); return (0); } /* * Send it, precisely as ether_output() would have. */ error = (p->if_transmit)(p, m); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); NET_EPOCH_EXIT(et); return (error); } /* * The ifp->if_qflush entry point for vlan(4) is a no-op. */ static void vlan_qflush(struct ifnet *ifp __unused) { } static void vlan_input(struct ifnet *ifp, struct mbuf *m) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; struct m_tag *mtag; uint16_t vid, tag; NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); m_freem(m); return; } if (m->m_flags & M_VLANTAG) { /* * Packet is tagged, but m contains a normal * Ethernet frame; the tag is stored out-of-band. */ tag = m->m_pkthdr.ether_vtag; m->m_flags &= ~M_VLANTAG; } else { struct ether_vlan_header *evl; /* * Packet is tagged in-band as specified by 802.1q. */ switch (ifp->if_type) { case IFT_ETHER: if (m->m_len < sizeof(*evl) && (m = m_pullup(m, sizeof(*evl))) == NULL) { if_printf(ifp, "cannot pullup VLAN header\n"); NET_EPOCH_EXIT(et); return; } evl = mtod(m, struct ether_vlan_header *); tag = ntohs(evl->evl_tag); /* * Remove the 802.1q header by copying the Ethernet * addresses over it and adjusting the beginning of * the data in the mbuf. The encapsulated Ethernet * type field is already in place. */ bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN, ETHER_HDR_LEN - ETHER_TYPE_LEN); m_adj(m, ETHER_VLAN_ENCAP_LEN); break; default: #ifdef INVARIANTS panic("%s: %s has unsupported if_type %u", __func__, ifp->if_xname, ifp->if_type); #endif if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); NET_EPOCH_EXIT(et); m_freem(m); return; } } vid = EVL_VLANOFTAG(tag); ifv = vlan_gethash(trunk, vid); if (ifv == NULL || !UP_AND_RUNNING(ifv->ifv_ifp)) { NET_EPOCH_EXIT(et); if_inc_counter(ifp, IFCOUNTER_NOPROTO, 1); m_freem(m); return; } if (vlan_mtag_pcp) { /* * While uncommon, it is possible that we will find a 802.1q * packet encapsulated inside another packet that also had an * 802.1q header. For example, ethernet tunneled over IPSEC * arriving over ethernet. In that case, we replace the * existing 802.1q PCP m_tag value. */ mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL); if (mtag == NULL) { mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_IN, sizeof(uint8_t), M_NOWAIT); if (mtag == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); NET_EPOCH_EXIT(et); m_freem(m); return; } m_tag_prepend(m, mtag); } *(uint8_t *)(mtag + 1) = EVL_PRIOFTAG(tag); } m->m_pkthdr.rcvif = ifv->ifv_ifp; if_inc_counter(ifv->ifv_ifp, IFCOUNTER_IPACKETS, 1); NET_EPOCH_EXIT(et); /* Pass it back through the parent's input routine. */ (*ifv->ifv_ifp->if_input)(ifv->ifv_ifp, m); } static void vlan_lladdr_fn(void *arg, int pending __unused) { struct ifvlan *ifv; struct ifnet *ifp; ifv = (struct ifvlan *)arg; ifp = ifv->ifv_ifp; CURVNET_SET(ifp->if_vnet); /* The ifv_ifp already has the lladdr copied in. */ if_setlladdr(ifp, IF_LLADDR(ifp), ifp->if_addrlen); CURVNET_RESTORE(); } static int vlan_config(struct ifvlan *ifv, struct ifnet *p, uint16_t vid) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifnet *ifp; int error = 0; /* * We can handle non-ethernet hardware types as long as * they handle the tagging and headers themselves. */ if (p->if_type != IFT_ETHER && (p->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) return (EPROTONOSUPPORT); if ((p->if_flags & VLAN_IFFLAGS) != VLAN_IFFLAGS) return (EPROTONOSUPPORT); /* * Don't let the caller set up a VLAN VID with * anything except VLID bits. * VID numbers 0x0 and 0xFFF are reserved. */ if (vid == 0 || vid == 0xFFF || (vid & ~EVL_VLID_MASK)) return (EINVAL); if (ifv->ifv_trunk) return (EBUSY); VLAN_XLOCK(); if (p->if_vlantrunk == NULL) { trunk = malloc(sizeof(struct ifvlantrunk), M_VLAN, M_WAITOK | M_ZERO); vlan_inithash(trunk); TRUNK_LOCK_INIT(trunk); TRUNK_WLOCK(trunk); p->if_vlantrunk = trunk; trunk->parent = p; if_ref(trunk->parent); TRUNK_WUNLOCK(trunk); } else { trunk = p->if_vlantrunk; } ifv->ifv_vid = vid; /* must set this before vlan_inshash() */ ifv->ifv_pcp = 0; /* Default: best effort delivery. */ vlan_tag_recalculate(ifv); error = vlan_inshash(trunk, ifv); if (error) goto done; ifv->ifv_proto = ETHERTYPE_VLAN; ifv->ifv_encaplen = ETHER_VLAN_ENCAP_LEN; ifv->ifv_mintu = ETHERMIN; ifv->ifv_pflags = 0; ifv->ifv_capenable = -1; /* * If the parent supports the VLAN_MTU capability, * i.e. can Tx/Rx larger than ETHER_MAX_LEN frames, * use it. */ if (p->if_capenable & IFCAP_VLAN_MTU) { /* * No need to fudge the MTU since the parent can * handle extended frames. */ ifv->ifv_mtufudge = 0; } else { /* * Fudge the MTU by the encapsulation size. This * makes us incompatible with strictly compliant * 802.1Q implementations, but allows us to use * the feature with other NetBSD implementations, * which might still be useful. */ ifv->ifv_mtufudge = ifv->ifv_encaplen; } ifv->ifv_trunk = trunk; ifp = ifv->ifv_ifp; /* * Initialize fields from our parent. This duplicates some * work with ether_ifattach() but allows for non-ethernet * interfaces to also work. */ ifp->if_mtu = p->if_mtu - ifv->ifv_mtufudge; ifp->if_baudrate = p->if_baudrate; ifp->if_output = p->if_output; ifp->if_input = p->if_input; ifp->if_resolvemulti = p->if_resolvemulti; ifp->if_addrlen = p->if_addrlen; ifp->if_broadcastaddr = p->if_broadcastaddr; ifp->if_pcp = ifv->ifv_pcp; /* * Copy only a selected subset of flags from the parent. * Other flags are none of our business. */ #define VLAN_COPY_FLAGS (IFF_SIMPLEX) ifp->if_flags &= ~VLAN_COPY_FLAGS; ifp->if_flags |= p->if_flags & VLAN_COPY_FLAGS; #undef VLAN_COPY_FLAGS ifp->if_link_state = p->if_link_state; NET_EPOCH_ENTER(et); vlan_capabilities(ifv); NET_EPOCH_EXIT(et); /* * Set up our interface address to reflect the underlying * physical interface's. */ bcopy(IF_LLADDR(p), IF_LLADDR(ifp), p->if_addrlen); ((struct sockaddr_dl *)ifp->if_addr->ifa_addr)->sdl_alen = p->if_addrlen; TASK_INIT(&ifv->lladdr_task, 0, vlan_lladdr_fn, ifv); /* We are ready for operation now. */ ifp->if_drv_flags |= IFF_DRV_RUNNING; /* Update flags on the parent, if necessary. */ vlan_setflags(ifp, 1); /* * Configure multicast addresses that may already be * joined on the vlan device. */ (void)vlan_setmulti(ifp); done: if (error == 0) EVENTHANDLER_INVOKE(vlan_config, p, ifv->ifv_vid); VLAN_XUNLOCK(); return (error); } static void vlan_unconfig(struct ifnet *ifp) { VLAN_XLOCK(); vlan_unconfig_locked(ifp, 0); VLAN_XUNLOCK(); } static void vlan_unconfig_locked(struct ifnet *ifp, int departing) { struct ifvlantrunk *trunk; struct vlan_mc_entry *mc; struct ifvlan *ifv; struct ifnet *parent; int error; VLAN_XLOCK_ASSERT(); ifv = ifp->if_softc; trunk = ifv->ifv_trunk; parent = NULL; if (trunk != NULL) { parent = trunk->parent; /* * Since the interface is being unconfigured, we need to * empty the list of multicast groups that we may have joined * while we were alive from the parent's list. */ while ((mc = CK_SLIST_FIRST(&ifv->vlan_mc_listhead)) != NULL) { /* * If the parent interface is being detached, * all its multicast addresses have already * been removed. Warn about errors if * if_delmulti() does fail, but don't abort as * all callers expect vlan destruction to * succeed. */ if (!departing) { error = if_delmulti(parent, (struct sockaddr *)&mc->mc_addr); if (error) if_printf(ifp, "Failed to delete multicast address from parent: %d\n", error); } CK_SLIST_REMOVE_HEAD(&ifv->vlan_mc_listhead, mc_entries); epoch_call(net_epoch_preempt, &mc->mc_epoch_ctx, vlan_mc_free); } vlan_setflags(ifp, 0); /* clear special flags on parent */ vlan_remhash(trunk, ifv); ifv->ifv_trunk = NULL; /* * Check if we were the last. */ if (trunk->refcnt == 0) { parent->if_vlantrunk = NULL; NET_EPOCH_WAIT(); trunk_destroy(trunk); } } /* Disconnect from parent. */ if (ifv->ifv_pflags) if_printf(ifp, "%s: ifv_pflags unclean\n", __func__); ifp->if_mtu = ETHERMTU; ifp->if_link_state = LINK_STATE_UNKNOWN; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; /* * Only dispatch an event if vlan was * attached, otherwise there is nothing * to cleanup anyway. */ if (parent != NULL) EVENTHANDLER_INVOKE(vlan_unconfig, parent, ifv->ifv_vid); } /* Handle a reference counted flag that should be set on the parent as well */ static int vlan_setflag(struct ifnet *ifp, int flag, int status, int (*func)(struct ifnet *, int)) { struct ifvlan *ifv; int error; VLAN_SXLOCK_ASSERT(); ifv = ifp->if_softc; status = status ? (ifp->if_flags & flag) : 0; /* Now "status" contains the flag value or 0 */ /* * See if recorded parent's status is different from what * we want it to be. If it is, flip it. We record parent's * status in ifv_pflags so that we won't clear parent's flag * we haven't set. In fact, we don't clear or set parent's * flags directly, but get or release references to them. * That's why we can be sure that recorded flags still are * in accord with actual parent's flags. */ if (status != (ifv->ifv_pflags & flag)) { error = (*func)(PARENT(ifv), status); if (error) return (error); ifv->ifv_pflags &= ~flag; ifv->ifv_pflags |= status; } return (0); } /* * Handle IFF_* flags that require certain changes on the parent: * if "status" is true, update parent's flags respective to our if_flags; * if "status" is false, forcedly clear the flags set on parent. */ static int vlan_setflags(struct ifnet *ifp, int status) { int error, i; for (i = 0; vlan_pflags[i].flag; i++) { error = vlan_setflag(ifp, vlan_pflags[i].flag, status, vlan_pflags[i].func); if (error) return (error); } return (0); } /* Inform all vlans that their parent has changed link state */ static void vlan_link_state(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; /* Called from a taskqueue_swi task, so we cannot sleep. */ NET_EPOCH_ENTER(et); trunk = ifp->if_vlantrunk; if (trunk == NULL) { NET_EPOCH_EXIT(et); return; } TRUNK_WLOCK(trunk); VLAN_FOREACH(ifv, trunk) { ifv->ifv_ifp->if_baudrate = trunk->parent->if_baudrate; if_link_state_change(ifv->ifv_ifp, trunk->parent->if_link_state); } TRUNK_WUNLOCK(trunk); NET_EPOCH_EXIT(et); } static void vlan_capabilities(struct ifvlan *ifv) { struct ifnet *p; struct ifnet *ifp; struct ifnet_hw_tsomax hw_tsomax; int cap = 0, ena = 0, mena; u_long hwa = 0; VLAN_SXLOCK_ASSERT(); NET_EPOCH_ASSERT(); p = PARENT(ifv); ifp = ifv->ifv_ifp; /* Mask parent interface enabled capabilities disabled by user. */ mena = p->if_capenable & ifv->ifv_capenable; /* * If the parent interface can do checksum offloading * on VLANs, then propagate its hardware-assisted * checksumming flags. Also assert that checksum * offloading requires hardware VLAN tagging. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (p->if_capenable & IFCAP_VLAN_HWCSUM && p->if_capenable & IFCAP_VLAN_HWTAGGING) { ena |= mena & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); if (ena & IFCAP_TXCSUM) hwa |= p->if_hwassist & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP); if (ena & IFCAP_TXCSUM_IPV6) hwa |= p->if_hwassist & (CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_SCTP_IPV6); } /* * If the parent interface can do TSO on VLANs then * propagate the hardware-assisted flag. TSO on VLANs * does not necessarily require hardware VLAN tagging. */ memset(&hw_tsomax, 0, sizeof(hw_tsomax)); if_hw_tsomax_common(p, &hw_tsomax); if_hw_tsomax_update(ifp, &hw_tsomax); if (p->if_capabilities & IFCAP_VLAN_HWTSO) cap |= p->if_capabilities & IFCAP_TSO; if (p->if_capenable & IFCAP_VLAN_HWTSO) { ena |= mena & IFCAP_TSO; if (ena & IFCAP_TSO) hwa |= p->if_hwassist & CSUM_TSO; } /* * If the parent interface can do LRO and checksum offloading on * VLANs, then guess it may do LRO on VLANs. False positive here * cost nothing, while false negative may lead to some confusions. */ if (p->if_capabilities & IFCAP_VLAN_HWCSUM) cap |= p->if_capabilities & IFCAP_LRO; if (p->if_capenable & IFCAP_VLAN_HWCSUM) ena |= p->if_capenable & IFCAP_LRO; /* * If the parent interface can offload TCP connections over VLANs then * propagate its TOE capability to the VLAN interface. * * All TOE drivers in the tree today can deal with VLANs. If this * changes then IFCAP_VLAN_TOE should be promoted to a full capability * with its own bit. */ #define IFCAP_VLAN_TOE IFCAP_TOE if (p->if_capabilities & IFCAP_VLAN_TOE) cap |= p->if_capabilities & IFCAP_TOE; if (p->if_capenable & IFCAP_VLAN_TOE) { TOEDEV(ifp) = TOEDEV(p); ena |= mena & IFCAP_TOE; } /* * If the parent interface supports dynamic link state, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_LINKSTATE); ena |= (mena & IFCAP_LINKSTATE); #ifdef RATELIMIT /* * If the parent interface supports ratelimiting, so does the * VLAN interface. */ cap |= (p->if_capabilities & IFCAP_TXRTLMT); ena |= (mena & IFCAP_TXRTLMT); #endif ifp->if_capabilities = cap; ifp->if_capenable = ena; ifp->if_hwassist = hwa; } static void vlan_trunk_capabilities(struct ifnet *ifp) { struct epoch_tracker et; struct ifvlantrunk *trunk; struct ifvlan *ifv; VLAN_SLOCK(); trunk = ifp->if_vlantrunk; if (trunk == NULL) { VLAN_SUNLOCK(); return; } NET_EPOCH_ENTER(et); VLAN_FOREACH(ifv, trunk) { vlan_capabilities(ifv); } NET_EPOCH_EXIT(et); VLAN_SUNLOCK(); } static int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifnet *p; struct ifreq *ifr; struct ifaddr *ifa; struct ifvlan *ifv; struct ifvlantrunk *trunk; struct vlanreq vlr; int error = 0; ifr = (struct ifreq *)data; ifa = (struct ifaddr *) data; ifv = ifp->if_softc; switch (cmd) { case SIOCSIFADDR: ifp->if_flags |= IFF_UP; #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) arp_ifinit(ifp, ifa); #endif break; case SIOCGIFADDR: bcopy(IF_LLADDR(ifp), &ifr->ifr_addr.sa_data[0], ifp->if_addrlen); break; case SIOCGIFMEDIA: VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { p = PARENT(ifv); if_ref(p); error = (*p->if_ioctl)(p, SIOCGIFMEDIA, data); if_rele(p); /* Limit the result to the parent's current config. */ if (error == 0) { struct ifmediareq *ifmr; ifmr = (struct ifmediareq *)data; if (ifmr->ifm_count >= 1 && ifmr->ifm_ulist) { ifmr->ifm_count = 1; error = copyout(&ifmr->ifm_current, ifmr->ifm_ulist, sizeof(int)); } } } else { error = EINVAL; } VLAN_SUNLOCK(); break; case SIOCSIFMEDIA: error = EINVAL; break; case SIOCSIFMTU: /* * Set the interface MTU. */ VLAN_SLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) { TRUNK_WLOCK(trunk); if (ifr->ifr_mtu > (PARENT(ifv)->if_mtu - ifv->ifv_mtufudge) || ifr->ifr_mtu < (ifv->ifv_mintu - ifv->ifv_mtufudge)) error = EINVAL; else ifp->if_mtu = ifr->ifr_mtu; TRUNK_WUNLOCK(trunk); } else error = EINVAL; VLAN_SUNLOCK(); break; case SIOCSETVLAN: #ifdef VIMAGE /* * XXXRW/XXXBZ: The goal in these checks is to allow a VLAN * interface to be delegated to a jail without allowing the * jail to change what underlying interface/VID it is * associated with. We are not entirely convinced that this * is the right way to accomplish that policy goal. */ if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = copyin(ifr_data_get_ptr(ifr), &vlr, sizeof(vlr)); if (error) break; if (vlr.vlr_parent[0] == '\0') { vlan_unconfig(ifp); break; } p = ifunit_ref(vlr.vlr_parent); if (p == NULL) { error = ENOENT; break; } error = vlan_config(ifv, p, vlr.vlr_tag); if_rele(p); break; case SIOCGETVLAN: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif bzero(&vlr, sizeof(vlr)); VLAN_SLOCK(); if (TRUNK(ifv) != NULL) { strlcpy(vlr.vlr_parent, PARENT(ifv)->if_xname, sizeof(vlr.vlr_parent)); vlr.vlr_tag = ifv->ifv_vid; } VLAN_SUNLOCK(); error = copyout(&vlr, ifr_data_get_ptr(ifr), sizeof(vlr)); break; case SIOCSIFFLAGS: /* * We should propagate selected flags to the parent, * e.g., promiscuous mode. */ VLAN_XLOCK(); if (TRUNK(ifv) != NULL) error = vlan_setflags(ifp, 1); VLAN_XUNLOCK(); break; case SIOCADDMULTI: case SIOCDELMULTI: /* * If we don't have a parent, just remember the membership for * when we do. * * XXX We need the rmlock here to avoid sleeping while * holding in6_multi_mtx. */ VLAN_XLOCK(); trunk = TRUNK(ifv); if (trunk != NULL) error = vlan_setmulti(ifp); VLAN_XUNLOCK(); break; case SIOCGVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif ifr->ifr_vlan_pcp = ifv->ifv_pcp; break; case SIOCSVLANPCP: #ifdef VIMAGE if (ifp->if_vnet != ifp->if_home_vnet) { error = EPERM; break; } #endif error = priv_check(curthread, PRIV_NET_SETVLANPCP); if (error) break; if (ifr->ifr_vlan_pcp > 7) { error = EINVAL; break; } ifv->ifv_pcp = ifr->ifr_vlan_pcp; ifp->if_pcp = ifv->ifv_pcp; vlan_tag_recalculate(ifv); /* broadcast event about PCP change */ EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_PCP); break; case SIOCSIFCAP: VLAN_SLOCK(); ifv->ifv_capenable = ifr->ifr_reqcap; trunk = TRUNK(ifv); if (trunk != NULL) { struct epoch_tracker et; NET_EPOCH_ENTER(et); vlan_capabilities(ifv); NET_EPOCH_EXIT(et); } VLAN_SUNLOCK(); break; default: error = EINVAL; break; } return (error); } #ifdef RATELIMIT static int vlan_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt) { + struct epoch_tracker et; + struct vlan_snd_tag *vst; + struct ifvlan *ifv; + struct ifnet *parent; + int error; - /* get trunk device */ - ifp = vlan_trunkdev(ifp); - if (ifp == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0) + NET_EPOCH_ENTER(et); + ifv = ifp->if_softc; + if (ifv->ifv_trunk != NULL) + parent = PARENT(ifv); + else + parent = NULL; + if (parent == NULL || parent->if_snd_tag_alloc == NULL) { + NET_EPOCH_EXIT(et); return (EOPNOTSUPP); - /* forward allocation request */ - return (ifp->if_snd_tag_alloc(ifp, params, ppmt)); + } + if_ref(parent); + NET_EPOCH_EXIT(et); + + vst = malloc(sizeof(*vst), M_VLAN, M_NOWAIT); + if (vst == NULL) { + if_rele(parent); + return (ENOMEM); + } + + error = parent->if_snd_tag_alloc(parent, params, &vst->tag); + if_rele(parent); + if (error) { + free(vst, M_VLAN); + return (error); + } + + m_snd_tag_init(&vst->com, ifp); + + *ppmt = &vst->com; + return (0); } +static int +vlan_snd_tag_modify(struct m_snd_tag *mst, + union if_snd_tag_modify_params *params) +{ + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + return (vst->tag->ifp->if_snd_tag_modify(vst->tag, params)); +} + +static int +vlan_snd_tag_query(struct m_snd_tag *mst, + union if_snd_tag_query_params *params) +{ + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + return (vst->tag->ifp->if_snd_tag_query(vst->tag, params)); +} + static void -vlan_snd_tag_free(struct m_snd_tag *tag) +vlan_snd_tag_free(struct m_snd_tag *mst) { - tag->ifp->if_snd_tag_free(tag); + struct vlan_snd_tag *vst; + + vst = mst_to_vst(mst); + m_snd_tag_rele(vst->tag); + free(vst, M_VLAN); } #endif Index: head/sys/net/netisr.c =================================================================== --- head/sys/net/netisr.c (revision 348253) +++ head/sys/net/netisr.c (revision 348254) @@ -1,1533 +1,1534 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract * to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * netisr is a packet dispatch service, allowing synchronous (directly * dispatched) and asynchronous (deferred dispatch) processing of packets by * registered protocol handlers. Callers pass a protocol identifier and * packet to netisr, along with a direct dispatch hint, and work will either * be immediately processed by the registered handler, or passed to a * software interrupt (SWI) thread for deferred dispatch. Callers will * generally select one or the other based on: * * - Whether directly dispatching a netisr handler lead to code reentrance or * lock recursion, such as entering the socket code from the socket code. * - Whether directly dispatching a netisr handler lead to recursive * processing, such as when decapsulating several wrapped layers of tunnel * information (IPSEC within IPSEC within ...). * * Maintaining ordering for protocol streams is a critical design concern. * Enforcing ordering limits the opportunity for concurrency, but maintains * the strong ordering requirements found in some protocols, such as TCP. Of * related concern is CPU affinity--it is desirable to process all data * associated with a particular stream on the same CPU over time in order to * avoid acquiring locks associated with the connection on different CPUs, * keep connection data in one cache, and to generally encourage associated * user threads to live on the same CPU as the stream. It's also desirable * to avoid lock migration and contention where locks are associated with * more than one flow. * * netisr supports several policy variations, represented by the * NETISR_POLICY_* constants, allowing protocols to play various roles in * identifying flows, assigning work to CPUs, etc. These are described in * netisr.h. */ #include "opt_ddb.h" #include "opt_device_polling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #define _WANT_NETISR_INTERNAL /* Enable definitions from netisr_internal.h */ #include #include #include #include #include /*- * Synchronize use and modification of the registered netisr data structures; * acquire a read lock while modifying the set of registered protocols to * prevent partially registered or unregistered protocols from being run. * * The following data structures and fields are protected by this lock: * * - The netisr_proto array, including all fields of struct netisr_proto. * - The nws array, including all fields of struct netisr_worker. * - The nws_array array. * * Note: the NETISR_LOCKING define controls whether read locks are acquired * in packet processing paths requiring netisr registration stability. This * is disabled by default as it can lead to measurable performance * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and * because netisr registration and unregistration is extremely rare at * runtime. If it becomes more common, this decision should be revisited. * * XXXRW: rmlocks don't support assertions. */ static struct rmlock netisr_rmlock; #define NETISR_LOCK_INIT() rm_init_flags(&netisr_rmlock, "netisr", \ RM_NOWITNESS) #define NETISR_LOCK_ASSERT() #define NETISR_RLOCK(tracker) rm_rlock(&netisr_rmlock, (tracker)) #define NETISR_RUNLOCK(tracker) rm_runlock(&netisr_rmlock, (tracker)) #define NETISR_WLOCK() rm_wlock(&netisr_rmlock) #define NETISR_WUNLOCK() rm_wunlock(&netisr_rmlock) /* #define NETISR_LOCKING */ static SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); /*- * Three global direct dispatch policies are supported: * * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless of * context (may be overriden by protocols). * * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch, * and we're running on the CPU the work would be performed on, then direct * dispatch it if it wouldn't violate ordering constraints on the workstream. * * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch, * always direct dispatch. (The default.) * * Notice that changing the global policy could lead to short periods of * misordered processing, but this is considered acceptable as compared to * the complexity of enforcing ordering during policy changes. Protocols can * override the global policy (when they're not doing that, they select * NETISR_DISPATCH_DEFAULT). */ #define NETISR_DISPATCH_POLICY_DEFAULT NETISR_DISPATCH_DIRECT #define NETISR_DISPATCH_POLICY_MAXSTR 20 /* Used for temporary buffers. */ static u_int netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT; static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RWTUN, 0, 0, sysctl_netisr_dispatch_policy, "A", "netisr dispatch policy"); /* * Allow the administrator to limit the number of threads (CPUs) to use for * netisr. We don't check netisr_maxthreads before creating the thread for * CPU 0. This must be set at boot. We will create at most one thread per CPU. * By default we initialize this to 1 which would assign just 1 cpu (cpu0) and * therefore only 1 workstream. If set to -1, netisr would use all cpus * (mp_ncpus) and therefore would have those many workstreams. One workstream * per thread (CPU). */ static int netisr_maxthreads = 1; /* Max number of threads. */ SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN, &netisr_maxthreads, 0, "Use at most this many CPUs for netisr processing"); static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN, &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); /* * Limit per-workstream mbuf queue limits s to at most net.isr.maxqlimit, * both for initial configuration and later modification using * netisr_setqlimit(). */ #define NETISR_DEFAULT_MAXQLIMIT 10240 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; SYSCTL_UINT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN, &netisr_maxqlimit, 0, "Maximum netisr per-protocol, per-CPU queue depth."); /* * The default per-workstream mbuf queue limit for protocols that don't * initialize the nh_qlimit field of their struct netisr_handler. If this is * set above netisr_maxqlimit, we truncate it to the maximum during boot. */ #define NETISR_DEFAULT_DEFAULTQLIMIT 256 static u_int netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT; SYSCTL_UINT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN, &netisr_defaultqlimit, 0, "Default netisr per-protocol, per-CPU queue limit if not set by protocol"); /* * Store and export the compile-time constant NETISR_MAXPROT limit on the * number of protocols that can register with netisr at a time. This is * required for crashdump analysis, as it sizes netisr_proto[]. */ static u_int netisr_maxprot = NETISR_MAXPROT; SYSCTL_UINT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD, &netisr_maxprot, 0, "Compile-time limit on the number of protocols supported by netisr."); /* * The netisr_proto array describes all registered protocols, indexed by * protocol number. See netisr_internal.h for more details. */ static struct netisr_proto netisr_proto[NETISR_MAXPROT]; #ifdef VIMAGE /* * The netisr_enable array describes a per-VNET flag for registered * protocols on whether this netisr is active in this VNET or not. * netisr_register() will automatically enable the netisr for the * default VNET and all currently active instances. * netisr_unregister() will disable all active VNETs, including vnet0. * Individual network stack instances can be enabled/disabled by the * netisr_(un)register _vnet() functions. * With this we keep the one netisr_proto per protocol but add a * mechanism to stop netisr processing for vnet teardown. * Apart from that we expect a VNET to always be enabled. */ VNET_DEFINE_STATIC(u_int, netisr_enable[NETISR_MAXPROT]); #define V_netisr_enable VNET(netisr_enable) #endif /* * Per-CPU workstream data. See netisr_internal.h for more details. */ DPCPU_DEFINE(struct netisr_workstream, nws); /* * Map contiguous values between 0 and nws_count into CPU IDs appropriate for * accessing workstreams. This allows constructions of the form * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws). */ static u_int nws_array[MAXCPU]; /* * Number of registered workstreams. Will be at most the number of running * CPUs once fully started. */ static u_int nws_count; SYSCTL_UINT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, &nws_count, 0, "Number of extant netisr threads."); /* * Synchronization for each workstream: a mutex protects all mutable fields * in each stream, including per-protocol state (mbuf queues). The SWI is * woken up if asynchronous dispatch is required. */ #define NWS_LOCK(s) mtx_lock(&(s)->nws_mtx) #define NWS_LOCK_ASSERT(s) mtx_assert(&(s)->nws_mtx, MA_OWNED) #define NWS_UNLOCK(s) mtx_unlock(&(s)->nws_mtx) #define NWS_SIGNAL(s) swi_sched((s)->nws_swi_cookie, 0) /* * Utility routines for protocols that implement their own mapping of flows * to CPUs. */ u_int netisr_get_cpucount(void) { return (nws_count); } u_int netisr_get_cpuid(u_int cpunumber) { return (nws_array[cpunumber % nws_count]); } /* * The default implementation of flow -> CPU ID mapping. * * Non-static so that protocols can use it to map their own work to specific * CPUs in a manner consistent to netisr for affinity purposes. */ u_int netisr_default_flow2cpu(u_int flowid) { return (nws_array[flowid % nws_count]); } /* * Dispatch tunable and sysctl configuration. */ struct netisr_dispatch_table_entry { u_int ndte_policy; const char *ndte_policy_str; }; static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = { { NETISR_DISPATCH_DEFAULT, "default" }, { NETISR_DISPATCH_DEFERRED, "deferred" }, { NETISR_DISPATCH_HYBRID, "hybrid" }, { NETISR_DISPATCH_DIRECT, "direct" }, }; static void netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer, u_int buflen) { const struct netisr_dispatch_table_entry *ndtep; const char *str; u_int i; str = "unknown"; for (i = 0; i < nitems(netisr_dispatch_table); i++) { ndtep = &netisr_dispatch_table[i]; if (ndtep->ndte_policy == dispatch_policy) { str = ndtep->ndte_policy_str; break; } } snprintf(buffer, buflen, "%s", str); } static int netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp) { const struct netisr_dispatch_table_entry *ndtep; u_int i; for (i = 0; i < nitems(netisr_dispatch_table); i++) { ndtep = &netisr_dispatch_table[i]; if (strcmp(ndtep->ndte_policy_str, str) == 0) { *dispatch_policyp = ndtep->ndte_policy; return (0); } } return (EINVAL); } static int sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS) { char tmp[NETISR_DISPATCH_POLICY_MAXSTR]; u_int dispatch_policy; int error; netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp, sizeof(tmp)); error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req); if (error == 0 && req->newptr != NULL) { error = netisr_dispatch_policy_from_str(tmp, &dispatch_policy); if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT) error = EINVAL; if (error == 0) netisr_dispatch_policy = dispatch_policy; } return (error); } /* * Register a new netisr handler, which requires initializing per-protocol * fields for each workstream. All netisr work is briefly suspended while * the protocol is installed. */ void netisr_register(const struct netisr_handler *nhp) { VNET_ITERATOR_DECL(vnet_iter); struct netisr_work *npwp; const char *name; u_int i, proto; proto = nhp->nh_proto; name = nhp->nh_name; /* * Test that the requested registration is valid. */ KASSERT(nhp->nh_name != NULL, ("%s: nh_name NULL for %u", __func__, proto)); KASSERT(nhp->nh_handler != NULL, ("%s: nh_handler NULL for %s", __func__, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE || nhp->nh_policy == NETISR_POLICY_FLOW || nhp->nh_policy == NETISR_POLICY_CPU, ("%s: unsupported nh_policy %u for %s", __func__, nhp->nh_policy, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW || nhp->nh_m2flow == NULL, ("%s: nh_policy != FLOW but m2flow defined for %s", __func__, name)); KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL, ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__, name)); KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL, ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__, name)); KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT || nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED || nhp->nh_dispatch == NETISR_DISPATCH_HYBRID || nhp->nh_dispatch == NETISR_DISPATCH_DIRECT, ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch)); KASSERT(proto < NETISR_MAXPROT, ("%s(%u, %s): protocol too big", __func__, proto, name)); /* * Test that no existing registration exists for this protocol. */ NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_name == NULL, ("%s(%u, %s): name present", __func__, proto, name)); KASSERT(netisr_proto[proto].np_handler == NULL, ("%s(%u, %s): handler present", __func__, proto, name)); netisr_proto[proto].np_name = name; netisr_proto[proto].np_handler = nhp->nh_handler; netisr_proto[proto].np_m2flow = nhp->nh_m2flow; netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid; netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu; if (nhp->nh_qlimit == 0) netisr_proto[proto].np_qlimit = netisr_defaultqlimit; else if (nhp->nh_qlimit > netisr_maxqlimit) { printf("%s: %s requested queue limit %u capped to " "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit, netisr_maxqlimit); netisr_proto[proto].np_qlimit = netisr_maxqlimit; } else netisr_proto[proto].np_qlimit = nhp->nh_qlimit; netisr_proto[proto].np_policy = nhp->nh_policy; netisr_proto[proto].np_dispatch = nhp->nh_dispatch; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; bzero(npwp, sizeof(*npwp)); npwp->nw_qlimit = netisr_proto[proto].np_qlimit; } #ifdef VIMAGE /* * Test that we are in vnet0 and have a curvnet set. */ KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); KASSERT(IS_DEFAULT_VNET(curvnet), ("%s: curvnet %p is not vnet0 %p", __func__, curvnet, vnet0)); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_netisr_enable[proto] = 1; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); #endif NETISR_WUNLOCK(); } /* * Clear drop counters across all workstreams for a protocol. */ void netisr_clearqdrops(const struct netisr_handler *nhp) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; npwp->nw_qdrops = 0; } NETISR_WUNLOCK(); } /* * Query current drop counters across all workstreams for a protocol. */ void netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) { struct netisr_work *npwp; struct rm_priotracker tracker; #ifdef INVARIANTS const char *name; #endif u_int i, proto; *qdropp = 0; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; *qdropp += npwp->nw_qdrops; } NETISR_RUNLOCK(&tracker); } /* * Query current per-workstream queue limit for a protocol. */ void netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) { struct rm_priotracker tracker; #ifdef INVARIANTS const char *name; #endif u_int proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_RLOCK(&tracker); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); *qlimitp = netisr_proto[proto].np_qlimit; NETISR_RUNLOCK(&tracker); } /* * Update the queue limit across per-workstream queues for a protocol. We * simply change the limits, and don't drain overflowed packets as they will * (hopefully) take care of themselves shortly. */ int netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit) { struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; if (qlimit > netisr_maxqlimit) return (EINVAL); proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); netisr_proto[proto].np_qlimit = qlimit; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; npwp->nw_qlimit = qlimit; } NETISR_WUNLOCK(); return (0); } /* * Drain all packets currently held in a particular protocol work queue. */ static void netisr_drain_proto(struct netisr_work *npwp) { struct mbuf *m; /* * We would assert the lock on the workstream but it's not passed in. */ while ((m = npwp->nw_head) != NULL) { npwp->nw_head = m->m_nextpkt; m->m_nextpkt = NULL; if (npwp->nw_head == NULL) npwp->nw_tail = NULL; npwp->nw_len--; m_freem(m); } KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__)); KASSERT(npwp->nw_len == 0, ("%s: len", __func__)); } /* * Remove the registration of a network protocol, which requires clearing * per-protocol fields across all workstreams, including freeing all mbufs in * the queues at time of unregister. All work in netisr is briefly suspended * while this takes place. */ void netisr_unregister(const struct netisr_handler *nhp) { VNET_ITERATOR_DECL(vnet_iter); struct netisr_work *npwp; #ifdef INVARIANTS const char *name; #endif u_int i, proto; proto = nhp->nh_proto; #ifdef INVARIANTS name = nhp->nh_name; #endif KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, name)); #ifdef VIMAGE VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); V_netisr_enable[proto] = 0; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); #endif netisr_proto[proto].np_name = NULL; netisr_proto[proto].np_handler = NULL; netisr_proto[proto].np_m2flow = NULL; netisr_proto[proto].np_m2cpuid = NULL; netisr_proto[proto].np_qlimit = 0; netisr_proto[proto].np_policy = 0; CPU_FOREACH(i) { npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; netisr_drain_proto(npwp); bzero(npwp, sizeof(*npwp)); } NETISR_WUNLOCK(); } #ifdef VIMAGE void netisr_register_vnet(const struct netisr_handler *nhp) { u_int proto; proto = nhp->nh_proto; KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, nhp->nh_name)); V_netisr_enable[proto] = 1; NETISR_WUNLOCK(); } static void netisr_drain_proto_vnet(struct vnet *vnet, u_int proto) { struct netisr_workstream *nwsp; struct netisr_work *npwp; struct mbuf *m, *mp, *n, *ne; u_int i; KASSERT(vnet != NULL, ("%s: vnet is NULL", __func__)); NETISR_LOCK_ASSERT(); CPU_FOREACH(i) { nwsp = DPCPU_ID_PTR(i, nws); if (nwsp->nws_intr_event == NULL) continue; npwp = &nwsp->nws_work[proto]; NWS_LOCK(nwsp); /* * Rather than dissecting and removing mbufs from the middle * of the chain, we build a new chain if the packet stays and * update the head and tail pointers at the end. All packets * matching the given vnet are freed. */ m = npwp->nw_head; n = ne = NULL; while (m != NULL) { mp = m; m = m->m_nextpkt; mp->m_nextpkt = NULL; if (mp->m_pkthdr.rcvif->if_vnet != vnet) { if (n == NULL) { n = ne = mp; } else { ne->m_nextpkt = mp; ne = mp; } continue; } /* This is a packet in the selected vnet. Free it. */ npwp->nw_len--; m_freem(mp); } npwp->nw_head = n; npwp->nw_tail = ne; NWS_UNLOCK(nwsp); } } void netisr_unregister_vnet(const struct netisr_handler *nhp) { u_int proto; proto = nhp->nh_proto; KASSERT(curvnet != NULL, ("%s: curvnet is NULL", __func__)); KASSERT(proto < NETISR_MAXPROT, ("%s(%u): protocol too big for %s", __func__, proto, nhp->nh_name)); NETISR_WLOCK(); KASSERT(netisr_proto[proto].np_handler != NULL, ("%s(%u): protocol not registered for %s", __func__, proto, nhp->nh_name)); V_netisr_enable[proto] = 0; netisr_drain_proto_vnet(curvnet, proto); NETISR_WUNLOCK(); } #endif /* * Compose the global and per-protocol policies on dispatch, and return the * dispatch policy to use. */ static u_int netisr_get_dispatch(struct netisr_proto *npp) { /* * Protocol-specific configuration overrides the global default. */ if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT) return (npp->np_dispatch); return (netisr_dispatch_policy); } /* * Look up the workstream given a packet and source identifier. Do this by * checking the protocol's policy, and optionally call out to the protocol * for assistance if required. */ static struct mbuf * netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy, uintptr_t source, struct mbuf *m, u_int *cpuidp) { struct ifnet *ifp; u_int policy; NETISR_LOCK_ASSERT(); /* * In the event we have only one worker, shortcut and deliver to it * without further ado. */ if (nws_count == 1) { *cpuidp = nws_array[0]; return (m); } /* * What happens next depends on the policy selected by the protocol. * If we want to support per-interface policies, we should do that * here first. */ policy = npp->np_policy; if (policy == NETISR_POLICY_CPU) { m = npp->np_m2cpuid(m, source, cpuidp); if (m == NULL) return (NULL); /* * It's possible for a protocol not to have a good idea about * where to process a packet, in which case we fall back on * the netisr code to decide. In the hybrid case, return the * current CPU ID, which will force an immediate direct * dispatch. In the queued case, fall back on the SOURCE * policy. */ if (*cpuidp != NETISR_CPUID_NONE) { *cpuidp = netisr_get_cpuid(*cpuidp); return (m); } if (dispatch_policy == NETISR_DISPATCH_HYBRID) { *cpuidp = netisr_get_cpuid(curcpu); return (m); } policy = NETISR_POLICY_SOURCE; } if (policy == NETISR_POLICY_FLOW) { if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE && npp->np_m2flow != NULL) { m = npp->np_m2flow(m, source); if (m == NULL) return (NULL); } if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { *cpuidp = netisr_default_flow2cpu(m->m_pkthdr.flowid); return (m); } policy = NETISR_POLICY_SOURCE; } KASSERT(policy == NETISR_POLICY_SOURCE, ("%s: invalid policy %u for %s", __func__, npp->np_policy, npp->np_name)); + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); ifp = m->m_pkthdr.rcvif; if (ifp != NULL) *cpuidp = nws_array[(ifp->if_index + source) % nws_count]; else *cpuidp = nws_array[source % nws_count]; return (m); } /* * Process packets associated with a workstream and protocol. For reasons of * fairness, we process up to one complete netisr queue at a time, moving the * queue to a stack-local queue for processing, but do not loop refreshing * from the global queue. The caller is responsible for deciding whether to * loop, and for setting the NWS_RUNNING flag. The passed workstream will be * locked on entry and relocked before return, but will be released while * processing. The number of packets processed is returned. */ static u_int netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto) { struct netisr_work local_npw, *npwp; u_int handled; struct mbuf *m; NETISR_LOCK_ASSERT(); NWS_LOCK_ASSERT(nwsp); KASSERT(nwsp->nws_flags & NWS_RUNNING, ("%s(%u): not running", __func__, proto)); KASSERT(proto >= 0 && proto < NETISR_MAXPROT, ("%s(%u): invalid proto\n", __func__, proto)); npwp = &nwsp->nws_work[proto]; if (npwp->nw_len == 0) return (0); /* * Move the global work queue to a thread-local work queue. * * Notice that this means the effective maximum length of the queue * is actually twice that of the maximum queue length specified in * the protocol registration call. */ handled = npwp->nw_len; local_npw = *npwp; npwp->nw_head = NULL; npwp->nw_tail = NULL; npwp->nw_len = 0; nwsp->nws_pendingbits &= ~(1 << proto); NWS_UNLOCK(nwsp); while ((m = local_npw.nw_head) != NULL) { local_npw.nw_head = m->m_nextpkt; m->m_nextpkt = NULL; if (local_npw.nw_head == NULL) local_npw.nw_tail = NULL; local_npw.nw_len--; VNET_ASSERT(m->m_pkthdr.rcvif != NULL, ("%s:%d rcvif == NULL: m=%p", __func__, __LINE__, m)); CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); netisr_proto[proto].np_handler(m); CURVNET_RESTORE(); } KASSERT(local_npw.nw_len == 0, ("%s(%u): len %u", __func__, proto, local_npw.nw_len)); if (netisr_proto[proto].np_drainedcpu) netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu); NWS_LOCK(nwsp); npwp->nw_handled += handled; return (handled); } /* * SWI handler for netisr -- processes packets in a set of workstreams that * it owns, woken up by calls to NWS_SIGNAL(). If this workstream is already * being direct dispatched, go back to sleep and wait for the dispatching * thread to wake us up again. */ static void swi_net(void *arg) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; u_int bits, prot; nwsp = arg; #ifdef DEVICE_POLLING KASSERT(nws_count == 1, ("%s: device_polling but nws_count != 1", __func__)); netisr_poll(); #endif #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif NWS_LOCK(nwsp); KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running")); if (nwsp->nws_flags & NWS_DISPATCHING) goto out; nwsp->nws_flags |= NWS_RUNNING; nwsp->nws_flags &= ~NWS_SCHEDULED; while ((bits = nwsp->nws_pendingbits) != 0) { while ((prot = ffs(bits)) != 0) { prot--; bits &= ~(1 << prot); (void)netisr_process_workstream_proto(nwsp, prot); } } nwsp->nws_flags &= ~NWS_RUNNING; out: NWS_UNLOCK(nwsp); #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif #ifdef DEVICE_POLLING netisr_pollmore(); #endif } static int netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto, struct netisr_work *npwp, struct mbuf *m, int *dosignalp) { NWS_LOCK_ASSERT(nwsp); *dosignalp = 0; if (npwp->nw_len < npwp->nw_qlimit) { m->m_nextpkt = NULL; if (npwp->nw_head == NULL) { npwp->nw_head = m; npwp->nw_tail = m; } else { npwp->nw_tail->m_nextpkt = m; npwp->nw_tail = m; } npwp->nw_len++; if (npwp->nw_len > npwp->nw_watermark) npwp->nw_watermark = npwp->nw_len; /* * We must set the bit regardless of NWS_RUNNING, so that * swi_net() keeps calling netisr_process_workstream_proto(). */ nwsp->nws_pendingbits |= (1 << proto); if (!(nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) { nwsp->nws_flags |= NWS_SCHEDULED; *dosignalp = 1; /* Defer until unlocked. */ } npwp->nw_queued++; return (0); } else { m_freem(m); npwp->nw_qdrops++; return (ENOBUFS); } } static int netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid) { struct netisr_workstream *nwsp; struct netisr_work *npwp; int dosignal, error; #ifdef NETISR_LOCKING NETISR_LOCK_ASSERT(); #endif KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__, cpuid, mp_maxid)); KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); dosignal = 0; error = 0; nwsp = DPCPU_ID_PTR(cpuid, nws); npwp = &nwsp->nws_work[proto]; NWS_LOCK(nwsp); error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); return (error); } int netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif u_int cpuid; int error; KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif KASSERT(netisr_proto[proto].np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); #ifdef VIMAGE if (V_netisr_enable[proto] == 0) { m_freem(m); return (ENOPROTOOPT); } #endif m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED, source, m, &cpuid); if (m != NULL) { KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); error = netisr_queue_internal(proto, m, cpuid); } else error = ENOBUFS; #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif return (error); } int netisr_queue(u_int proto, struct mbuf *m) { return (netisr_queue_src(proto, 0, m)); } /* * Dispatch a packet for netisr processing; direct dispatch is permitted by * calling context. */ int netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) { #ifdef NETISR_LOCKING struct rm_priotracker tracker; #endif struct netisr_workstream *nwsp; struct netisr_proto *npp; struct netisr_work *npwp; int dosignal, error; u_int cpuid, dispatch_policy; KASSERT(proto < NETISR_MAXPROT, ("%s: invalid proto %u", __func__, proto)); #ifdef NETISR_LOCKING NETISR_RLOCK(&tracker); #endif npp = &netisr_proto[proto]; KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__, proto)); #ifdef VIMAGE if (V_netisr_enable[proto] == 0) { m_freem(m); return (ENOPROTOOPT); } #endif dispatch_policy = netisr_get_dispatch(npp); if (dispatch_policy == NETISR_DISPATCH_DEFERRED) return (netisr_queue_src(proto, source, m)); /* * If direct dispatch is forced, then unconditionally dispatch * without a formal CPU selection. Borrow the current CPU's stats, * even if there's no worker on it. In this case we don't update * nws_flags because all netisr processing will be source ordered due * to always being forced to directly dispatch. */ if (dispatch_policy == NETISR_DISPATCH_DIRECT) { nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; npwp->nw_dispatched++; npwp->nw_handled++; netisr_proto[proto].np_handler(m); error = 0; goto out_unlock; } KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID, ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy)); /* * Otherwise, we execute in a hybrid mode where we will try to direct * dispatch if we're on the right CPU and the netisr worker isn't * already running. */ sched_pin(); m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID, source, m, &cpuid); if (m == NULL) { error = ENOBUFS; goto out_unpin; } KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); if (cpuid != curcpu) goto queue_fallback; nwsp = DPCPU_PTR(nws); npwp = &nwsp->nws_work[proto]; /*- * We are willing to direct dispatch only if three conditions hold: * * (1) The netisr worker isn't already running, * (2) Another thread isn't already directly dispatching, and * (3) The netisr hasn't already been woken up. */ NWS_LOCK(nwsp); if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) { error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); goto out_unpin; } /* * The current thread is now effectively the netisr worker, so set * the dispatching flag to prevent concurrent processing of the * stream from another thread (even the netisr worker), which could * otherwise lead to effective misordering of the stream. */ nwsp->nws_flags |= NWS_DISPATCHING; NWS_UNLOCK(nwsp); netisr_proto[proto].np_handler(m); NWS_LOCK(nwsp); nwsp->nws_flags &= ~NWS_DISPATCHING; npwp->nw_handled++; npwp->nw_hybrid_dispatched++; /* * If other work was enqueued by another thread while we were direct * dispatching, we need to signal the netisr worker to do that work. * In the future, we might want to do some of that work in the * current thread, rather than trigger further context switches. If * so, we'll want to establish a reasonable bound on the work done in * the "borrowed" context. */ if (nwsp->nws_pendingbits != 0) { nwsp->nws_flags |= NWS_SCHEDULED; dosignal = 1; } else dosignal = 0; NWS_UNLOCK(nwsp); if (dosignal) NWS_SIGNAL(nwsp); error = 0; goto out_unpin; queue_fallback: error = netisr_queue_internal(proto, m, cpuid); out_unpin: sched_unpin(); out_unlock: #ifdef NETISR_LOCKING NETISR_RUNLOCK(&tracker); #endif return (error); } int netisr_dispatch(u_int proto, struct mbuf *m) { return (netisr_dispatch_src(proto, 0, m)); } #ifdef DEVICE_POLLING /* * Kernel polling borrows a netisr thread to run interface polling in; this * function allows kernel polling to request that the netisr thread be * scheduled even if no packets are pending for protocols. */ void netisr_sched_poll(void) { struct netisr_workstream *nwsp; nwsp = DPCPU_ID_PTR(nws_array[0], nws); NWS_SIGNAL(nwsp); } #endif static void netisr_start_swi(u_int cpuid, struct pcpu *pc) { char swiname[12]; struct netisr_workstream *nwsp; int error; KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); nwsp = DPCPU_ID_PTR(cpuid, nws); mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); nwsp->nws_cpu = cpuid; snprintf(swiname, sizeof(swiname), "netisr %u", cpuid); error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp, SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); if (error) panic("%s: swi_add %d", __func__, error); pc->pc_netisr = nwsp->nws_intr_event; if (netisr_bindthreads) { error = intr_event_bind(nwsp->nws_intr_event, cpuid); if (error != 0) printf("%s: cpu %u: intr_event_bind: %d", __func__, cpuid, error); } NETISR_WLOCK(); nws_array[nws_count] = nwsp->nws_cpu; nws_count++; NETISR_WUNLOCK(); } /* * Initialize the netisr subsystem. We rely on BSS and static initialization * of most fields in global data structures. * * Start a worker thread for the boot CPU so that we can support network * traffic immediately in case the network stack is used before additional * CPUs are started (for example, diskless boot). */ static void netisr_init(void *arg) { struct pcpu *pc; NETISR_LOCK_INIT(); if (netisr_maxthreads == 0 || netisr_maxthreads < -1 ) netisr_maxthreads = 1; /* default behavior */ else if (netisr_maxthreads == -1) netisr_maxthreads = mp_ncpus; /* use max cpus */ if (netisr_maxthreads > mp_ncpus) { printf("netisr_init: forcing maxthreads from %d to %d\n", netisr_maxthreads, mp_ncpus); netisr_maxthreads = mp_ncpus; } if (netisr_defaultqlimit > netisr_maxqlimit) { printf("netisr_init: forcing defaultqlimit from %d to %d\n", netisr_defaultqlimit, netisr_maxqlimit); netisr_defaultqlimit = netisr_maxqlimit; } #ifdef DEVICE_POLLING /* * The device polling code is not yet aware of how to deal with * multiple netisr threads, so for the time being compiling in device * polling disables parallel netisr workers. */ if (netisr_maxthreads != 1 || netisr_bindthreads != 0) { printf("netisr_init: forcing maxthreads to 1 and " "bindthreads to 0 for device polling\n"); netisr_maxthreads = 1; netisr_bindthreads = 0; } #endif #ifdef EARLY_AP_STARTUP STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (nws_count >= netisr_maxthreads) break; netisr_start_swi(pc->pc_cpuid, pc); } #else pc = get_pcpu(); netisr_start_swi(pc->pc_cpuid, pc); #endif } SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); #ifndef EARLY_AP_STARTUP /* * Start worker threads for additional CPUs. No attempt to gracefully handle * work reassignment, we don't yet support dynamic reconfiguration. */ static void netisr_start(void *arg) { struct pcpu *pc; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (nws_count >= netisr_maxthreads) break; /* Worker will already be present for boot CPU. */ if (pc->pc_netisr != NULL) continue; netisr_start_swi(pc->pc_cpuid, pc); } } SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); #endif /* * Sysctl monitoring for netisr: query a list of registered protocols. */ static int sysctl_netisr_proto(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_proto *snpp, *snp_array; struct netisr_proto *npp; u_int counter, proto; int error; if (req->newptr != NULL) return (EINVAL); snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); for (proto = 0; proto < NETISR_MAXPROT; proto++) { npp = &netisr_proto[proto]; if (npp->np_name == NULL) continue; snpp = &snp_array[counter]; snpp->snp_version = sizeof(*snpp); strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN); snpp->snp_proto = proto; snpp->snp_qlimit = npp->np_qlimit; snpp->snp_policy = npp->np_policy; snpp->snp_dispatch = npp->np_dispatch; if (npp->np_m2flow != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW; if (npp->np_m2cpuid != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID; if (npp->np_drainedcpu != NULL) snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU; counter++; } NETISR_RUNLOCK(&tracker); KASSERT(counter <= NETISR_MAXPROT, ("sysctl_netisr_proto: counter too big (%d)", counter)); error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter); free(snp_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, proto, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto, "S,sysctl_netisr_proto", "Return list of protocols registered with netisr"); /* * Sysctl monitoring for netisr: query a list of workstreams. */ static int sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_workstream *snwsp, *snws_array; struct netisr_workstream *nwsp; u_int counter, cpuid; int error; if (req->newptr != NULL) return (EINVAL); snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; NWS_LOCK(nwsp); snwsp = &snws_array[counter]; snwsp->snws_version = sizeof(*snwsp); /* * For now, we equate workstream IDs and CPU IDs in the * kernel, but expose them independently to userspace in case * that assumption changes in the future. */ snwsp->snws_wsid = cpuid; snwsp->snws_cpu = cpuid; if (nwsp->nws_intr_event != NULL) snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR; NWS_UNLOCK(nwsp); counter++; } NETISR_RUNLOCK(&tracker); KASSERT(counter <= MAXCPU, ("sysctl_netisr_workstream: counter too big (%d)", counter)); error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter); free(snws_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, workstream, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream, "S,sysctl_netisr_workstream", "Return list of workstreams implemented by netisr"); /* * Sysctl monitoring for netisr: query per-protocol data across all * workstreams. */ static int sysctl_netisr_work(SYSCTL_HANDLER_ARGS) { struct rm_priotracker tracker; struct sysctl_netisr_work *snwp, *snw_array; struct netisr_workstream *nwsp; struct netisr_proto *npp; struct netisr_work *nwp; u_int counter, cpuid, proto; int error; if (req->newptr != NULL) return (EINVAL); snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT, M_TEMP, M_ZERO | M_WAITOK); counter = 0; NETISR_RLOCK(&tracker); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; NWS_LOCK(nwsp); for (proto = 0; proto < NETISR_MAXPROT; proto++) { npp = &netisr_proto[proto]; if (npp->np_name == NULL) continue; nwp = &nwsp->nws_work[proto]; snwp = &snw_array[counter]; snwp->snw_version = sizeof(*snwp); snwp->snw_wsid = cpuid; /* See comment above. */ snwp->snw_proto = proto; snwp->snw_len = nwp->nw_len; snwp->snw_watermark = nwp->nw_watermark; snwp->snw_dispatched = nwp->nw_dispatched; snwp->snw_hybrid_dispatched = nwp->nw_hybrid_dispatched; snwp->snw_qdrops = nwp->nw_qdrops; snwp->snw_queued = nwp->nw_queued; snwp->snw_handled = nwp->nw_handled; counter++; } NWS_UNLOCK(nwsp); } KASSERT(counter <= MAXCPU * NETISR_MAXPROT, ("sysctl_netisr_work: counter too big (%d)", counter)); NETISR_RUNLOCK(&tracker); error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter); free(snw_array, M_TEMP); return (error); } SYSCTL_PROC(_net_isr, OID_AUTO, work, CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work, "S,sysctl_netisr_work", "Return list of per-workstream, per-protocol work in netisr"); #ifdef DDB DB_SHOW_COMMAND(netisr, db_show_netisr) { struct netisr_workstream *nwsp; struct netisr_work *nwp; int first, proto; u_int cpuid; db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto", "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue"); CPU_FOREACH(cpuid) { nwsp = DPCPU_ID_PTR(cpuid, nws); if (nwsp->nws_intr_event == NULL) continue; first = 1; for (proto = 0; proto < NETISR_MAXPROT; proto++) { if (netisr_proto[proto].np_handler == NULL) continue; nwp = &nwsp->nws_work[proto]; if (first) { db_printf("%3d ", cpuid); first = 0; } else db_printf("%3s ", ""); db_printf( "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n", netisr_proto[proto].np_name, nwp->nw_len, nwp->nw_watermark, nwp->nw_qlimit, nwp->nw_dispatched, nwp->nw_hybrid_dispatched, nwp->nw_qdrops, nwp->nw_queued); } } } #endif Index: head/sys/net80211/ieee80211_hwmp.c =================================================================== --- head/sys/net80211/ieee80211_hwmp.c (revision 348253) +++ head/sys/net80211/ieee80211_hwmp.c (revision 348254) @@ -1,2084 +1,2085 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 The FreeBSD Foundation * All rights reserved. * * This software was developed by Rui Paulo under sponsorship from the * FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #ifdef __FreeBSD__ __FBSDID("$FreeBSD$"); #endif /* * IEEE 802.11s Hybrid Wireless Mesh Protocol, HWMP. * * Based on March 2009, D3.0 802.11s draft spec. */ #include "opt_inet.h" #include "opt_wlan.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void hwmp_vattach(struct ieee80211vap *); static void hwmp_vdetach(struct ieee80211vap *); static int hwmp_newstate(struct ieee80211vap *, enum ieee80211_state, int); static int hwmp_send_action(struct ieee80211vap *, const uint8_t [IEEE80211_ADDR_LEN], uint8_t *, size_t); static uint8_t * hwmp_add_meshpreq(uint8_t *, const struct ieee80211_meshpreq_ie *); static uint8_t * hwmp_add_meshprep(uint8_t *, const struct ieee80211_meshprep_ie *); static uint8_t * hwmp_add_meshperr(uint8_t *, const struct ieee80211_meshperr_ie *); static uint8_t * hwmp_add_meshrann(uint8_t *, const struct ieee80211_meshrann_ie *); static void hwmp_rootmode_setup(struct ieee80211vap *); static void hwmp_rootmode_cb(void *); static void hwmp_rootmode_rann_cb(void *); static void hwmp_recv_preq(struct ieee80211vap *, struct ieee80211_node *, const struct ieee80211_frame *, const struct ieee80211_meshpreq_ie *); static int hwmp_send_preq(struct ieee80211vap *, const uint8_t [IEEE80211_ADDR_LEN], struct ieee80211_meshpreq_ie *, struct timeval *, struct timeval *); static void hwmp_recv_prep(struct ieee80211vap *, struct ieee80211_node *, const struct ieee80211_frame *, const struct ieee80211_meshprep_ie *); static int hwmp_send_prep(struct ieee80211vap *, const uint8_t [IEEE80211_ADDR_LEN], struct ieee80211_meshprep_ie *); static void hwmp_recv_perr(struct ieee80211vap *, struct ieee80211_node *, const struct ieee80211_frame *, const struct ieee80211_meshperr_ie *); static int hwmp_send_perr(struct ieee80211vap *, const uint8_t [IEEE80211_ADDR_LEN], struct ieee80211_meshperr_ie *); static void hwmp_senderror(struct ieee80211vap *, const uint8_t [IEEE80211_ADDR_LEN], struct ieee80211_mesh_route *, int); static void hwmp_recv_rann(struct ieee80211vap *, struct ieee80211_node *, const struct ieee80211_frame *, const struct ieee80211_meshrann_ie *); static int hwmp_send_rann(struct ieee80211vap *, const uint8_t [IEEE80211_ADDR_LEN], struct ieee80211_meshrann_ie *); static struct ieee80211_node * hwmp_discover(struct ieee80211vap *, const uint8_t [IEEE80211_ADDR_LEN], struct mbuf *); static void hwmp_peerdown(struct ieee80211_node *); static struct timeval ieee80211_hwmp_preqminint = { 0, 100000 }; static struct timeval ieee80211_hwmp_perrminint = { 0, 100000 }; /* NB: the Target Address set in a Proactive PREQ is the broadcast address. */ static const uint8_t broadcastaddr[IEEE80211_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; typedef uint32_t ieee80211_hwmp_seq; #define HWMP_SEQ_LT(a, b) ((int32_t)((a)-(b)) < 0) #define HWMP_SEQ_LEQ(a, b) ((int32_t)((a)-(b)) <= 0) #define HWMP_SEQ_EQ(a, b) ((int32_t)((a)-(b)) == 0) #define HWMP_SEQ_GT(a, b) ((int32_t)((a)-(b)) > 0) #define HWMP_SEQ_MAX(a, b) (a > b ? a : b) /* * Private extension of ieee80211_mesh_route. */ struct ieee80211_hwmp_route { ieee80211_hwmp_seq hr_seq; /* last HWMP seq seen from dst*/ ieee80211_hwmp_seq hr_preqid; /* last PREQ ID seen from dst */ ieee80211_hwmp_seq hr_origseq; /* seq. no. on our latest PREQ*/ struct timeval hr_lastpreq; /* last time we sent a PREQ */ struct timeval hr_lastrootconf; /* last sent PREQ root conf */ int hr_preqretries; /* number of discoveries */ int hr_lastdiscovery; /* last discovery in ticks */ }; struct ieee80211_hwmp_state { ieee80211_hwmp_seq hs_seq; /* next seq to be used */ ieee80211_hwmp_seq hs_preqid; /* next PREQ ID to be used */ int hs_rootmode; /* proactive HWMP */ struct timeval hs_lastperr; /* last time we sent a PERR */ struct callout hs_roottimer; uint8_t hs_maxhops; /* max hop count */ }; static SYSCTL_NODE(_net_wlan, OID_AUTO, hwmp, CTLFLAG_RD, 0, "IEEE 802.11s HWMP parameters"); static int ieee80211_hwmp_targetonly = 0; SYSCTL_INT(_net_wlan_hwmp, OID_AUTO, targetonly, CTLFLAG_RW, &ieee80211_hwmp_targetonly, 0, "Set TO bit on generated PREQs"); static int ieee80211_hwmp_pathtimeout = -1; SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, pathlifetime, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_hwmp_pathtimeout, 0, ieee80211_sysctl_msecs_ticks, "I", "path entry lifetime (ms)"); static int ieee80211_hwmp_maxpreq_retries = -1; SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, maxpreq_retries, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_hwmp_maxpreq_retries, 0, ieee80211_sysctl_msecs_ticks, "I", "maximum number of preq retries"); static int ieee80211_hwmp_net_diameter_traversaltime = -1; SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, net_diameter_traversal_time, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_hwmp_net_diameter_traversaltime, 0, ieee80211_sysctl_msecs_ticks, "I", "estimate travelse time across the MBSS (ms)"); static int ieee80211_hwmp_roottimeout = -1; SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, roottimeout, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_hwmp_roottimeout, 0, ieee80211_sysctl_msecs_ticks, "I", "root PREQ timeout (ms)"); static int ieee80211_hwmp_rootint = -1; SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, rootint, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_hwmp_rootint, 0, ieee80211_sysctl_msecs_ticks, "I", "root interval (ms)"); static int ieee80211_hwmp_rannint = -1; SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, rannint, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_hwmp_rannint, 0, ieee80211_sysctl_msecs_ticks, "I", "root announcement interval (ms)"); static struct timeval ieee80211_hwmp_rootconfint = { 0, 0 }; static int ieee80211_hwmp_rootconfint_internal = -1; SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, rootconfint, CTLTYPE_INT | CTLFLAG_RD, &ieee80211_hwmp_rootconfint_internal, 0, ieee80211_sysctl_msecs_ticks, "I", "root confirmation interval (ms) (read-only)"); #define IEEE80211_HWMP_DEFAULT_MAXHOPS 31 static ieee80211_recv_action_func hwmp_recv_action_meshpath; static struct ieee80211_mesh_proto_path mesh_proto_hwmp = { .mpp_descr = "HWMP", .mpp_ie = IEEE80211_MESHCONF_PATH_HWMP, .mpp_discover = hwmp_discover, .mpp_peerdown = hwmp_peerdown, .mpp_senderror = hwmp_senderror, .mpp_vattach = hwmp_vattach, .mpp_vdetach = hwmp_vdetach, .mpp_newstate = hwmp_newstate, .mpp_privlen = sizeof(struct ieee80211_hwmp_route), }; SYSCTL_PROC(_net_wlan_hwmp, OID_AUTO, inact, CTLTYPE_INT | CTLFLAG_RW, &mesh_proto_hwmp.mpp_inact, 0, ieee80211_sysctl_msecs_ticks, "I", "mesh route inactivity timeout (ms)"); static void ieee80211_hwmp_init(void) { /* Default values as per amendment */ ieee80211_hwmp_pathtimeout = msecs_to_ticks(5*1000); ieee80211_hwmp_roottimeout = msecs_to_ticks(5*1000); ieee80211_hwmp_rootint = msecs_to_ticks(2*1000); ieee80211_hwmp_rannint = msecs_to_ticks(1*1000); ieee80211_hwmp_rootconfint_internal = msecs_to_ticks(2*1000); ieee80211_hwmp_maxpreq_retries = 3; /* * (TU): A measurement of time equal to 1024 μs, * 500 TU is 512 ms. */ ieee80211_hwmp_net_diameter_traversaltime = msecs_to_ticks(512); /* * NB: I dont know how to make SYSCTL_PROC that calls ms to ticks * and return a struct timeval... */ ieee80211_hwmp_rootconfint.tv_usec = ieee80211_hwmp_rootconfint_internal * 1000; /* * Register action frame handler. */ ieee80211_recv_action_register(IEEE80211_ACTION_CAT_MESH, IEEE80211_ACTION_MESH_HWMP, hwmp_recv_action_meshpath); /* NB: default is 5 secs per spec */ mesh_proto_hwmp.mpp_inact = msecs_to_ticks(5*1000); /* * Register HWMP. */ ieee80211_mesh_register_proto_path(&mesh_proto_hwmp); } SYSINIT(wlan_hwmp, SI_SUB_DRIVERS, SI_ORDER_SECOND, ieee80211_hwmp_init, NULL); static void hwmp_vattach(struct ieee80211vap *vap) { struct ieee80211_hwmp_state *hs; KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a mesh vap, opmode %d", vap->iv_opmode)); hs = IEEE80211_MALLOC(sizeof(struct ieee80211_hwmp_state), M_80211_VAP, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); if (hs == NULL) { printf("%s: couldn't alloc HWMP state\n", __func__); return; } hs->hs_maxhops = IEEE80211_HWMP_DEFAULT_MAXHOPS; callout_init(&hs->hs_roottimer, 1); vap->iv_hwmp = hs; } static void hwmp_vdetach(struct ieee80211vap *vap) { struct ieee80211_hwmp_state *hs = vap->iv_hwmp; callout_drain(&hs->hs_roottimer); IEEE80211_FREE(vap->iv_hwmp, M_80211_VAP); vap->iv_hwmp = NULL; } static int hwmp_newstate(struct ieee80211vap *vap, enum ieee80211_state ostate, int arg) { enum ieee80211_state nstate = vap->iv_state; struct ieee80211_hwmp_state *hs = vap->iv_hwmp; IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s (%d)\n", __func__, ieee80211_state_name[ostate], ieee80211_state_name[nstate], arg); if (nstate != IEEE80211_S_RUN && ostate == IEEE80211_S_RUN) callout_drain(&hs->hs_roottimer); if (nstate == IEEE80211_S_RUN) hwmp_rootmode_setup(vap); return 0; } /* * Verify the length of an HWMP PREQ and return the number * of destinations >= 1, if verification fails -1 is returned. */ static int verify_mesh_preq_len(struct ieee80211vap *vap, const struct ieee80211_frame *wh, const uint8_t *iefrm) { int alloc_sz = -1; int ndest = -1; if (iefrm[2] & IEEE80211_MESHPREQ_FLAGS_AE) { /* Originator External Address present */ alloc_sz = IEEE80211_MESHPREQ_BASE_SZ_AE; ndest = iefrm[IEEE80211_MESHPREQ_TCNT_OFFSET_AE]; } else { /* w/o Originator External Address */ alloc_sz = IEEE80211_MESHPREQ_BASE_SZ; ndest = iefrm[IEEE80211_MESHPREQ_TCNT_OFFSET]; } alloc_sz += ndest * IEEE80211_MESHPREQ_TRGT_SZ; if(iefrm[1] != (alloc_sz)) { IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP, wh, NULL, "PREQ (AE=%s) with wrong len", iefrm[2] & IEEE80211_MESHPREQ_FLAGS_AE ? "1" : "0"); return (-1); } return ndest; } /* * Verify the length of an HWMP PREP and returns 1 on success, * otherwise -1. */ static int verify_mesh_prep_len(struct ieee80211vap *vap, const struct ieee80211_frame *wh, const uint8_t *iefrm) { int alloc_sz = -1; if (iefrm[2] & IEEE80211_MESHPREP_FLAGS_AE) { if (iefrm[1] == IEEE80211_MESHPREP_BASE_SZ_AE) alloc_sz = IEEE80211_MESHPREP_BASE_SZ_AE; } else if (iefrm[1] == IEEE80211_MESHPREP_BASE_SZ) alloc_sz = IEEE80211_MESHPREP_BASE_SZ; if(alloc_sz < 0) { IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP, wh, NULL, "PREP (AE=%s) with wrong len", iefrm[2] & IEEE80211_MESHPREP_FLAGS_AE ? "1" : "0"); return (-1); } return (1); } /* * Verify the length of an HWMP PERR and return the number * of destinations >= 1, if verification fails -1 is returned. */ static int verify_mesh_perr_len(struct ieee80211vap *vap, const struct ieee80211_frame *wh, const uint8_t *iefrm) { int alloc_sz = -1; const uint8_t *iefrm_t = iefrm; uint8_t ndest = iefrm_t[IEEE80211_MESHPERR_NDEST_OFFSET]; int i; if(ndest > IEEE80211_MESHPERR_MAXDEST) { IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP, wh, NULL, "PERR with wrong number of destionat (>19), %u", ndest); return (-1); } iefrm_t += IEEE80211_MESHPERR_NDEST_OFFSET + 1; /* flag is next field */ /* We need to check each destionation flag to know size */ for(i = 0; ini_vap; struct ieee80211_meshpreq_ie *preq; struct ieee80211_meshprep_ie *prep; struct ieee80211_meshperr_ie *perr; struct ieee80211_meshrann_ie rann; const uint8_t *iefrm = frm + 2; /* action + code */ const uint8_t *iefrm_t = iefrm; /* temporary pointer */ int ndest = -1; int found = 0; while (efrm - iefrm > 1) { IEEE80211_VERIFY_LENGTH(efrm - iefrm, iefrm[1] + 2, return 0); switch (*iefrm) { case IEEE80211_ELEMID_MESHPREQ: { int i = 0; iefrm_t = iefrm; ndest = verify_mesh_preq_len(vap, wh, iefrm_t); if (ndest < 0) { vap->iv_stats.is_rx_mgtdiscard++; break; } preq = IEEE80211_MALLOC(sizeof(*preq) + (ndest - 1) * sizeof(*preq->preq_targets), M_80211_MESH_PREQ, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); KASSERT(preq != NULL, ("preq == NULL")); preq->preq_ie = *iefrm_t++; preq->preq_len = *iefrm_t++; preq->preq_flags = *iefrm_t++; preq->preq_hopcount = *iefrm_t++; preq->preq_ttl = *iefrm_t++; preq->preq_id = le32dec(iefrm_t); iefrm_t += 4; IEEE80211_ADDR_COPY(preq->preq_origaddr, iefrm_t); iefrm_t += 6; preq->preq_origseq = le32dec(iefrm_t); iefrm_t += 4; /* NB: may have Originator Proxied Address */ if (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AE) { IEEE80211_ADDR_COPY( preq->preq_orig_ext_addr, iefrm_t); iefrm_t += 6; } preq->preq_lifetime = le32dec(iefrm_t); iefrm_t += 4; preq->preq_metric = le32dec(iefrm_t); iefrm_t += 4; preq->preq_tcount = *iefrm_t++; for (i = 0; i < preq->preq_tcount; i++) { preq->preq_targets[i].target_flags = *iefrm_t++; IEEE80211_ADDR_COPY( preq->preq_targets[i].target_addr, iefrm_t); iefrm_t += 6; preq->preq_targets[i].target_seq = le32dec(iefrm_t); iefrm_t += 4; } hwmp_recv_preq(vap, ni, wh, preq); IEEE80211_FREE(preq, M_80211_MESH_PREQ); found++; break; } case IEEE80211_ELEMID_MESHPREP: { iefrm_t = iefrm; ndest = verify_mesh_prep_len(vap, wh, iefrm_t); if (ndest < 0) { vap->iv_stats.is_rx_mgtdiscard++; break; } prep = IEEE80211_MALLOC(sizeof(*prep), M_80211_MESH_PREP, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); KASSERT(prep != NULL, ("prep == NULL")); prep->prep_ie = *iefrm_t++; prep->prep_len = *iefrm_t++; prep->prep_flags = *iefrm_t++; prep->prep_hopcount = *iefrm_t++; prep->prep_ttl = *iefrm_t++; IEEE80211_ADDR_COPY(prep->prep_targetaddr, iefrm_t); iefrm_t += 6; prep->prep_targetseq = le32dec(iefrm_t); iefrm_t += 4; /* NB: May have Target Proxied Address */ if (prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE) { IEEE80211_ADDR_COPY( prep->prep_target_ext_addr, iefrm_t); iefrm_t += 6; } prep->prep_lifetime = le32dec(iefrm_t); iefrm_t += 4; prep->prep_metric = le32dec(iefrm_t); iefrm_t += 4; IEEE80211_ADDR_COPY(prep->prep_origaddr, iefrm_t); iefrm_t += 6; prep->prep_origseq = le32dec(iefrm_t); iefrm_t += 4; hwmp_recv_prep(vap, ni, wh, prep); IEEE80211_FREE(prep, M_80211_MESH_PREP); found++; break; } case IEEE80211_ELEMID_MESHPERR: { int i = 0; iefrm_t = iefrm; ndest = verify_mesh_perr_len(vap, wh, iefrm_t); if (ndest < 0) { vap->iv_stats.is_rx_mgtdiscard++; break; } perr = IEEE80211_MALLOC(sizeof(*perr) + (ndest - 1) * sizeof(*perr->perr_dests), M_80211_MESH_PERR, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); KASSERT(perr != NULL, ("perr == NULL")); perr->perr_ie = *iefrm_t++; perr->perr_len = *iefrm_t++; perr->perr_ttl = *iefrm_t++; perr->perr_ndests = *iefrm_t++; for (i = 0; iperr_ndests; i++) { perr->perr_dests[i].dest_flags = *iefrm_t++; IEEE80211_ADDR_COPY( perr->perr_dests[i].dest_addr, iefrm_t); iefrm_t += 6; perr->perr_dests[i].dest_seq = le32dec(iefrm_t); iefrm_t += 4; /* NB: May have Target Proxied Address */ if (perr->perr_dests[i].dest_flags & IEEE80211_MESHPERR_FLAGS_AE) { IEEE80211_ADDR_COPY( perr->perr_dests[i].dest_ext_addr, iefrm_t); iefrm_t += 6; } perr->perr_dests[i].dest_rcode = le16dec(iefrm_t); iefrm_t += 2; } hwmp_recv_perr(vap, ni, wh, perr); IEEE80211_FREE(perr, M_80211_MESH_PERR); found++; break; } case IEEE80211_ELEMID_MESHRANN: { const struct ieee80211_meshrann_ie *mrann = (const struct ieee80211_meshrann_ie *) iefrm; if (mrann->rann_len != sizeof(struct ieee80211_meshrann_ie) - 2) { IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP, wh, NULL, "%s", "RAN with wrong len"); vap->iv_stats.is_rx_mgtdiscard++; return 1; } memcpy(&rann, mrann, sizeof(rann)); rann.rann_seq = le32dec(&mrann->rann_seq); rann.rann_interval = le32dec(&mrann->rann_interval); rann.rann_metric = le32dec(&mrann->rann_metric); hwmp_recv_rann(vap, ni, wh, &rann); found++; break; } } iefrm += iefrm[1] + 2; } if (!found) { IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_HWMP, wh, NULL, "%s", "PATH SEL action without IE"); vap->iv_stats.is_rx_mgtdiscard++; } return 0; } static int hwmp_send_action(struct ieee80211vap *vap, const uint8_t da[IEEE80211_ADDR_LEN], uint8_t *ie, size_t len) { struct ieee80211_node *ni; struct ieee80211com *ic; struct ieee80211_bpf_params params; struct mbuf *m; uint8_t *frm; int ret; if (IEEE80211_IS_MULTICAST(da)) { ni = ieee80211_ref_node(vap->iv_bss); #ifdef IEEE80211_DEBUG_REFCNT IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1); #endif ieee80211_ref_node(ni); } else ni = ieee80211_mesh_find_txnode(vap, da); if (vap->iv_state == IEEE80211_S_CAC) { IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, ni, "block %s frame in CAC state", "HWMP action"); vap->iv_stats.is_tx_badstate++; return EIO; /* XXX */ } KASSERT(ni != NULL, ("null node")); ic = ni->ni_ic; m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(struct ieee80211_action) + len ); if (m == NULL) { ieee80211_free_node(ni); vap->iv_stats.is_tx_nobuf++; return ENOMEM; } *frm++ = IEEE80211_ACTION_CAT_MESH; *frm++ = IEEE80211_ACTION_MESH_HWMP; switch (*ie) { case IEEE80211_ELEMID_MESHPREQ: frm = hwmp_add_meshpreq(frm, (struct ieee80211_meshpreq_ie *)ie); break; case IEEE80211_ELEMID_MESHPREP: frm = hwmp_add_meshprep(frm, (struct ieee80211_meshprep_ie *)ie); break; case IEEE80211_ELEMID_MESHPERR: frm = hwmp_add_meshperr(frm, (struct ieee80211_meshperr_ie *)ie); break; case IEEE80211_ELEMID_MESHRANN: frm = hwmp_add_meshrann(frm, (struct ieee80211_meshrann_ie *)ie); break; } m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT); if (m == NULL) { ieee80211_free_node(ni); vap->iv_stats.is_tx_nobuf++; return ENOMEM; } IEEE80211_TX_LOCK(ic); ieee80211_send_setup(ni, m, IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_ACTION, IEEE80211_NONQOS_TID, vap->iv_myaddr, da, vap->iv_myaddr); m->m_flags |= M_ENCAP; /* mark encapsulated */ IEEE80211_NODE_STAT(ni, tx_mgmt); memset(¶ms, 0, sizeof(params)); params.ibp_pri = WME_AC_VO; params.ibp_rate0 = ni->ni_txparms->mgmtrate; if (IEEE80211_IS_MULTICAST(da)) params.ibp_try0 = 1; else params.ibp_try0 = ni->ni_txparms->maxretry; params.ibp_power = ni->ni_txpower; ret = ieee80211_raw_output(vap, ni, m, ¶ms); IEEE80211_TX_UNLOCK(ic); return (ret); } #define ADDSHORT(frm, v) do { \ le16enc(frm, v); \ frm += 2; \ } while (0) #define ADDWORD(frm, v) do { \ le32enc(frm, v); \ frm += 4; \ } while (0) /* * Add a Mesh Path Request IE to a frame. */ #define PREQ_TFLAGS(n) preq->preq_targets[n].target_flags #define PREQ_TADDR(n) preq->preq_targets[n].target_addr #define PREQ_TSEQ(n) preq->preq_targets[n].target_seq static uint8_t * hwmp_add_meshpreq(uint8_t *frm, const struct ieee80211_meshpreq_ie *preq) { int i; *frm++ = IEEE80211_ELEMID_MESHPREQ; *frm++ = preq->preq_len; /* len already calculated */ *frm++ = preq->preq_flags; *frm++ = preq->preq_hopcount; *frm++ = preq->preq_ttl; ADDWORD(frm, preq->preq_id); IEEE80211_ADDR_COPY(frm, preq->preq_origaddr); frm += 6; ADDWORD(frm, preq->preq_origseq); if (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AE) { IEEE80211_ADDR_COPY(frm, preq->preq_orig_ext_addr); frm += 6; } ADDWORD(frm, preq->preq_lifetime); ADDWORD(frm, preq->preq_metric); *frm++ = preq->preq_tcount; for (i = 0; i < preq->preq_tcount; i++) { *frm++ = PREQ_TFLAGS(i); IEEE80211_ADDR_COPY(frm, PREQ_TADDR(i)); frm += 6; ADDWORD(frm, PREQ_TSEQ(i)); } return frm; } #undef PREQ_TFLAGS #undef PREQ_TADDR #undef PREQ_TSEQ /* * Add a Mesh Path Reply IE to a frame. */ static uint8_t * hwmp_add_meshprep(uint8_t *frm, const struct ieee80211_meshprep_ie *prep) { *frm++ = IEEE80211_ELEMID_MESHPREP; *frm++ = prep->prep_len; /* len already calculated */ *frm++ = prep->prep_flags; *frm++ = prep->prep_hopcount; *frm++ = prep->prep_ttl; IEEE80211_ADDR_COPY(frm, prep->prep_targetaddr); frm += 6; ADDWORD(frm, prep->prep_targetseq); if (prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE) { IEEE80211_ADDR_COPY(frm, prep->prep_target_ext_addr); frm += 6; } ADDWORD(frm, prep->prep_lifetime); ADDWORD(frm, prep->prep_metric); IEEE80211_ADDR_COPY(frm, prep->prep_origaddr); frm += 6; ADDWORD(frm, prep->prep_origseq); return frm; } /* * Add a Mesh Path Error IE to a frame. */ #define PERR_DFLAGS(n) perr->perr_dests[n].dest_flags #define PERR_DADDR(n) perr->perr_dests[n].dest_addr #define PERR_DSEQ(n) perr->perr_dests[n].dest_seq #define PERR_EXTADDR(n) perr->perr_dests[n].dest_ext_addr #define PERR_DRCODE(n) perr->perr_dests[n].dest_rcode static uint8_t * hwmp_add_meshperr(uint8_t *frm, const struct ieee80211_meshperr_ie *perr) { int i; *frm++ = IEEE80211_ELEMID_MESHPERR; *frm++ = perr->perr_len; /* len already calculated */ *frm++ = perr->perr_ttl; *frm++ = perr->perr_ndests; for (i = 0; i < perr->perr_ndests; i++) { *frm++ = PERR_DFLAGS(i); IEEE80211_ADDR_COPY(frm, PERR_DADDR(i)); frm += 6; ADDWORD(frm, PERR_DSEQ(i)); if (PERR_DFLAGS(i) & IEEE80211_MESHPERR_FLAGS_AE) { IEEE80211_ADDR_COPY(frm, PERR_EXTADDR(i)); frm += 6; } ADDSHORT(frm, PERR_DRCODE(i)); } return frm; } #undef PERR_DFLAGS #undef PERR_DADDR #undef PERR_DSEQ #undef PERR_EXTADDR #undef PERR_DRCODE /* * Add a Root Annoucement IE to a frame. */ static uint8_t * hwmp_add_meshrann(uint8_t *frm, const struct ieee80211_meshrann_ie *rann) { *frm++ = IEEE80211_ELEMID_MESHRANN; *frm++ = rann->rann_len; *frm++ = rann->rann_flags; *frm++ = rann->rann_hopcount; *frm++ = rann->rann_ttl; IEEE80211_ADDR_COPY(frm, rann->rann_addr); frm += 6; ADDWORD(frm, rann->rann_seq); ADDWORD(frm, rann->rann_interval); ADDWORD(frm, rann->rann_metric); return frm; } static void hwmp_rootmode_setup(struct ieee80211vap *vap) { struct ieee80211_hwmp_state *hs = vap->iv_hwmp; struct ieee80211_mesh_state *ms = vap->iv_mesh; switch (hs->hs_rootmode) { case IEEE80211_HWMP_ROOTMODE_DISABLED: callout_drain(&hs->hs_roottimer); ms->ms_flags &= ~IEEE80211_MESHFLAGS_ROOT; break; case IEEE80211_HWMP_ROOTMODE_NORMAL: case IEEE80211_HWMP_ROOTMODE_PROACTIVE: callout_reset(&hs->hs_roottimer, ieee80211_hwmp_rootint, hwmp_rootmode_cb, vap); ms->ms_flags |= IEEE80211_MESHFLAGS_ROOT; break; case IEEE80211_HWMP_ROOTMODE_RANN: callout_reset(&hs->hs_roottimer, ieee80211_hwmp_rannint, hwmp_rootmode_rann_cb, vap); ms->ms_flags |= IEEE80211_MESHFLAGS_ROOT; break; } } /* * Send a broadcast Path Request to find all nodes on the mesh. We are * called when the vap is configured as a HWMP root node. */ #define PREQ_TFLAGS(n) preq.preq_targets[n].target_flags #define PREQ_TADDR(n) preq.preq_targets[n].target_addr #define PREQ_TSEQ(n) preq.preq_targets[n].target_seq static void hwmp_rootmode_cb(void *arg) { struct ieee80211vap *vap = (struct ieee80211vap *)arg; struct ieee80211_hwmp_state *hs = vap->iv_hwmp; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_meshpreq_ie preq; IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, vap->iv_bss, "%s", "send broadcast PREQ"); preq.preq_flags = 0; if (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) preq.preq_flags |= IEEE80211_MESHPREQ_FLAGS_GATE; if (hs->hs_rootmode == IEEE80211_HWMP_ROOTMODE_PROACTIVE) preq.preq_flags |= IEEE80211_MESHPREQ_FLAGS_PP; preq.preq_hopcount = 0; preq.preq_ttl = ms->ms_ttl; preq.preq_id = ++hs->hs_preqid; IEEE80211_ADDR_COPY(preq.preq_origaddr, vap->iv_myaddr); preq.preq_origseq = ++hs->hs_seq; preq.preq_lifetime = ticks_to_msecs(ieee80211_hwmp_roottimeout); preq.preq_metric = IEEE80211_MESHLMETRIC_INITIALVAL; preq.preq_tcount = 1; IEEE80211_ADDR_COPY(PREQ_TADDR(0), broadcastaddr); PREQ_TFLAGS(0) = IEEE80211_MESHPREQ_TFLAGS_TO | IEEE80211_MESHPREQ_TFLAGS_USN; PREQ_TSEQ(0) = 0; vap->iv_stats.is_hwmp_rootreqs++; /* NB: we enforce rate check ourself */ hwmp_send_preq(vap, broadcastaddr, &preq, NULL, NULL); hwmp_rootmode_setup(vap); } #undef PREQ_TFLAGS #undef PREQ_TADDR #undef PREQ_TSEQ /* * Send a Root Annoucement (RANN) to find all the nodes on the mesh. We are * called when the vap is configured as a HWMP RANN root node. */ static void hwmp_rootmode_rann_cb(void *arg) { struct ieee80211vap *vap = (struct ieee80211vap *)arg; struct ieee80211_hwmp_state *hs = vap->iv_hwmp; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_meshrann_ie rann; IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, vap->iv_bss, "%s", "send broadcast RANN"); rann.rann_flags = 0; if (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) rann.rann_flags |= IEEE80211_MESHFLAGS_GATE; rann.rann_hopcount = 0; rann.rann_ttl = ms->ms_ttl; IEEE80211_ADDR_COPY(rann.rann_addr, vap->iv_myaddr); rann.rann_seq = ++hs->hs_seq; rann.rann_interval = ieee80211_hwmp_rannint; rann.rann_metric = IEEE80211_MESHLMETRIC_INITIALVAL; vap->iv_stats.is_hwmp_rootrann++; hwmp_send_rann(vap, broadcastaddr, &rann); hwmp_rootmode_setup(vap); } /* * Update forwarding information to TA if metric improves. */ static void hwmp_update_transmitter(struct ieee80211vap *vap, struct ieee80211_node *ni, const char *hwmp_frame) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rttran = NULL; /* Transmitter */ int metric = 0; rttran = ieee80211_mesh_rt_find(vap, ni->ni_macaddr); if (rttran == NULL) { rttran = ieee80211_mesh_rt_add(vap, ni->ni_macaddr); if (rttran == NULL) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "unable to add path to transmitter %6D of %s", ni->ni_macaddr, ":", hwmp_frame); vap->iv_stats.is_mesh_rtaddfailed++; return; } } metric = ms->ms_pmetric->mpm_metric(ni); if (!(rttran->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) || rttran->rt_metric > metric) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "%s path to transmitter %6D of %s, metric %d:%d", rttran->rt_flags & IEEE80211_MESHRT_FLAGS_VALID ? "prefer" : "update", ni->ni_macaddr, ":", hwmp_frame, rttran->rt_metric, metric); IEEE80211_ADDR_COPY(rttran->rt_nexthop, ni->ni_macaddr); rttran->rt_metric = metric; rttran->rt_nhops = 1; ieee80211_mesh_rt_update(rttran, ms->ms_ppath->mpp_inact); rttran->rt_flags = IEEE80211_MESHRT_FLAGS_VALID; } } #define PREQ_TFLAGS(n) preq->preq_targets[n].target_flags #define PREQ_TADDR(n) preq->preq_targets[n].target_addr #define PREQ_TSEQ(n) preq->preq_targets[n].target_seq static void hwmp_recv_preq(struct ieee80211vap *vap, struct ieee80211_node *ni, const struct ieee80211_frame *wh, const struct ieee80211_meshpreq_ie *preq) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rtorig = NULL; struct ieee80211_mesh_route *rtorig_ext = NULL; struct ieee80211_mesh_route *rttarg = NULL; struct ieee80211_hwmp_route *hrorig = NULL; struct ieee80211_hwmp_route *hrtarg = NULL; struct ieee80211_hwmp_state *hs = vap->iv_hwmp; ieee80211_hwmp_seq preqid; /* last seen preqid for orig */ uint32_t metric = 0; /* * Ignore PREQs from us. Could happen because someone forward it * back to us. */ if (IEEE80211_ADDR_EQ(vap->iv_myaddr, preq->preq_origaddr)) return; IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "received PREQ, orig %6D, targ(0) %6D", preq->preq_origaddr, ":", PREQ_TADDR(0), ":"); /* * Acceptance criteria: (if the PREQ is not for us or not broadcast, * or an external mac address not proxied by us), * AND forwarding is disabled, discard this PREQ. */ rttarg = ieee80211_mesh_rt_find(vap, PREQ_TADDR(0)); if (!(ms->ms_flags & IEEE80211_MESHFLAGS_FWD) && (!IEEE80211_ADDR_EQ(vap->iv_myaddr, PREQ_TADDR(0)) || !IEEE80211_IS_MULTICAST(PREQ_TADDR(0)) || (rttarg != NULL && rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY && IEEE80211_ADDR_EQ(vap->iv_myaddr, rttarg->rt_mesh_gate)))) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_HWMP, preq->preq_origaddr, NULL, "%s", "not accepting PREQ"); return; } /* * Acceptance criteria: if unicast addressed * AND no valid forwarding for Target of PREQ, discard this PREQ. */ if(rttarg != NULL) hrtarg = IEEE80211_MESH_ROUTE_PRIV(rttarg, struct ieee80211_hwmp_route); /* Address mode: ucast */ if(preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AM && rttarg == NULL && !IEEE80211_ADDR_EQ(vap->iv_myaddr, PREQ_TADDR(0))) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_HWMP, preq->preq_origaddr, NULL, "unicast addressed PREQ of unknown target %6D", PREQ_TADDR(0), ":"); return; } /* PREQ ACCEPTED */ rtorig = ieee80211_mesh_rt_find(vap, preq->preq_origaddr); if (rtorig == NULL) { rtorig = ieee80211_mesh_rt_add(vap, preq->preq_origaddr); if (rtorig == NULL) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "unable to add orig path to %6D", preq->preq_origaddr, ":"); vap->iv_stats.is_mesh_rtaddfailed++; return; } IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "adding originator %6D", preq->preq_origaddr, ":"); } hrorig = IEEE80211_MESH_ROUTE_PRIV(rtorig, struct ieee80211_hwmp_route); /* record last seen preqid */ preqid = hrorig->hr_preqid; hrorig->hr_preqid = HWMP_SEQ_MAX(hrorig->hr_preqid, preq->preq_id); /* Data creation and update of forwarding information * according to Table 11C-8 for originator mesh STA. */ metric = preq->preq_metric + ms->ms_pmetric->mpm_metric(ni); if (HWMP_SEQ_GT(preq->preq_origseq, hrorig->hr_seq) || (HWMP_SEQ_EQ(preq->preq_origseq, hrorig->hr_seq) && metric < rtorig->rt_metric)) { hrorig->hr_seq = preq->preq_origseq; IEEE80211_ADDR_COPY(rtorig->rt_nexthop, wh->i_addr2); rtorig->rt_metric = metric; rtorig->rt_nhops = preq->preq_hopcount + 1; ieee80211_mesh_rt_update(rtorig, preq->preq_lifetime); /* Path to orig is valid now. * NB: we know it can't be Proxy, and if it is GATE * it will be marked below. */ rtorig->rt_flags = IEEE80211_MESHRT_FLAGS_VALID; } else if ((hrtarg != NULL && !HWMP_SEQ_EQ(hrtarg->hr_seq, PREQ_TSEQ(0))) || (rtorig->rt_flags & IEEE80211_MESHRT_FLAGS_VALID && preqid >= preq->preq_id)) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "discard PREQ from %6D, old seqno %u <= %u," " or old preqid %u < %u", preq->preq_origaddr, ":", preq->preq_origseq, hrorig->hr_seq, preq->preq_id, preqid); return; } /* Update forwarding information to TA if metric improves. */ hwmp_update_transmitter(vap, ni, "PREQ"); /* * Check if the PREQ is addressed to us. * or a Proxy currently gated by us. */ if (IEEE80211_ADDR_EQ(vap->iv_myaddr, PREQ_TADDR(0)) || (ms->ms_flags & IEEE80211_MESHFLAGS_GATE && rttarg != NULL && IEEE80211_ADDR_EQ(vap->iv_myaddr, rttarg->rt_mesh_gate) && rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY && rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_VALID)) { struct ieee80211_meshprep_ie prep; /* * When we are the target we shall update our own HWMP seq * number with max of (current and preq->seq) + 1 */ hs->hs_seq = HWMP_SEQ_MAX(hs->hs_seq, PREQ_TSEQ(0)) + 1; prep.prep_flags = 0; prep.prep_hopcount = 0; prep.prep_metric = IEEE80211_MESHLMETRIC_INITIALVAL; IEEE80211_ADDR_COPY(prep.prep_targetaddr, vap->iv_myaddr); if (rttarg != NULL && /* if NULL it means we are the target */ rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "reply for proxy %6D", rttarg->rt_dest, ":"); prep.prep_flags |= IEEE80211_MESHPREP_FLAGS_AE; IEEE80211_ADDR_COPY(prep.prep_target_ext_addr, rttarg->rt_dest); /* update proxy seqno to HWMP seqno */ rttarg->rt_ext_seq = hs->hs_seq; prep.prep_hopcount = rttarg->rt_nhops; prep.prep_metric = rttarg->rt_metric; IEEE80211_ADDR_COPY(prep.prep_targetaddr, rttarg->rt_mesh_gate); } /* * Build and send a PREP frame. */ prep.prep_ttl = ms->ms_ttl; prep.prep_targetseq = hs->hs_seq; prep.prep_lifetime = preq->preq_lifetime; IEEE80211_ADDR_COPY(prep.prep_origaddr, preq->preq_origaddr); prep.prep_origseq = preq->preq_origseq; IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "reply to %6D", preq->preq_origaddr, ":"); hwmp_send_prep(vap, wh->i_addr2, &prep); return; } /* we may update our proxy information for the orig external */ else if (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AE) { rtorig_ext = ieee80211_mesh_rt_find(vap, preq->preq_orig_ext_addr); if (rtorig_ext == NULL) { rtorig_ext = ieee80211_mesh_rt_add(vap, preq->preq_orig_ext_addr); if (rtorig_ext == NULL) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "unable to add orig ext proxy to %6D", preq->preq_orig_ext_addr, ":"); vap->iv_stats.is_mesh_rtaddfailed++; return; } IEEE80211_ADDR_COPY(rtorig_ext->rt_mesh_gate, preq->preq_origaddr); } rtorig_ext->rt_ext_seq = preq->preq_origseq; ieee80211_mesh_rt_update(rtorig_ext, preq->preq_lifetime); } /* * Proactive PREQ: reply with a proactive PREP to the * root STA if requested. */ if (IEEE80211_ADDR_EQ(PREQ_TADDR(0), broadcastaddr) && (PREQ_TFLAGS(0) & IEEE80211_MESHPREQ_TFLAGS_TO)) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "root mesh station @ %6D", preq->preq_origaddr, ":"); /* Check if root is a mesh gate, mark it */ if (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_GATE) { struct ieee80211_mesh_gate_route *gr; rtorig->rt_flags |= IEEE80211_MESHRT_FLAGS_GATE; gr = ieee80211_mesh_mark_gate(vap, preq->preq_origaddr, rtorig); gr->gr_lastseq = 0; /* NOT GANN */ } /* * Reply with a PREP if we don't have a path to the root * or if the root sent us a proactive PREQ. */ if ((rtorig->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0 || (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_PP)) { struct ieee80211_meshprep_ie prep; prep.prep_flags = 0; prep.prep_hopcount = 0; prep.prep_ttl = ms->ms_ttl; IEEE80211_ADDR_COPY(prep.prep_origaddr, preq->preq_origaddr); prep.prep_origseq = preq->preq_origseq; prep.prep_lifetime = preq->preq_lifetime; prep.prep_metric = IEEE80211_MESHLMETRIC_INITIALVAL; IEEE80211_ADDR_COPY(prep.prep_targetaddr, vap->iv_myaddr); prep.prep_targetseq = ++hs->hs_seq; hwmp_send_prep(vap, rtorig->rt_nexthop, &prep); } } /* * Forwarding and Intermediate reply for PREQs with 1 target. */ if ((preq->preq_tcount == 1) && (preq->preq_ttl > 1) && (ms->ms_flags & IEEE80211_MESHFLAGS_FWD)) { struct ieee80211_meshpreq_ie ppreq; /* propagated PREQ */ memcpy(&ppreq, preq, sizeof(ppreq)); /* * We have a valid route to this node. * NB: if target is proxy dont reply. */ if (rttarg != NULL && rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_VALID && !(rttarg->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY)) { /* * Check if we can send an intermediate Path Reply, * i.e., Target Only bit is not set and target is not * the MAC broadcast address. */ if (!(PREQ_TFLAGS(0) & IEEE80211_MESHPREQ_TFLAGS_TO) && !IEEE80211_ADDR_EQ(PREQ_TADDR(0), broadcastaddr)) { struct ieee80211_meshprep_ie prep; IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "intermediate reply for PREQ from %6D", preq->preq_origaddr, ":"); prep.prep_flags = 0; prep.prep_hopcount = rttarg->rt_nhops; prep.prep_ttl = ms->ms_ttl; IEEE80211_ADDR_COPY(&prep.prep_targetaddr, PREQ_TADDR(0)); prep.prep_targetseq = hrtarg->hr_seq; prep.prep_lifetime = preq->preq_lifetime; prep.prep_metric =rttarg->rt_metric; IEEE80211_ADDR_COPY(&prep.prep_origaddr, preq->preq_origaddr); prep.prep_origseq = hrorig->hr_seq; hwmp_send_prep(vap, rtorig->rt_nexthop, &prep); /* * Set TO and unset RF bits because we have * sent a PREP. */ ppreq.preq_targets[0].target_flags |= IEEE80211_MESHPREQ_TFLAGS_TO; } } IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "forward PREQ from %6D", preq->preq_origaddr, ":"); ppreq.preq_hopcount += 1; ppreq.preq_ttl -= 1; ppreq.preq_metric += ms->ms_pmetric->mpm_metric(ni); /* don't do PREQ ratecheck when we propagate */ hwmp_send_preq(vap, broadcastaddr, &ppreq, NULL, NULL); } } #undef PREQ_TFLAGS #undef PREQ_TADDR #undef PREQ_TSEQ static int hwmp_send_preq(struct ieee80211vap *vap, const uint8_t da[IEEE80211_ADDR_LEN], struct ieee80211_meshpreq_ie *preq, struct timeval *last, struct timeval *minint) { /* * Enforce PREQ interval. * NB: Proactive ROOT PREQs rate is handled by cb task. */ if (last != NULL && minint != NULL) { if (ratecheck(last, minint) == 0) return EALREADY; /* XXX: we should postpone */ getmicrouptime(last); } /* * mesh preq action frame format * [6] da * [6] sa * [6] addr3 = sa * [1] action * [1] category * [tlv] mesh path request */ preq->preq_ie = IEEE80211_ELEMID_MESHPREQ; preq->preq_len = (preq->preq_flags & IEEE80211_MESHPREQ_FLAGS_AE ? IEEE80211_MESHPREQ_BASE_SZ_AE : IEEE80211_MESHPREQ_BASE_SZ) + preq->preq_tcount * IEEE80211_MESHPREQ_TRGT_SZ; return hwmp_send_action(vap, da, (uint8_t *)preq, preq->preq_len+2); } static void hwmp_recv_prep(struct ieee80211vap *vap, struct ieee80211_node *ni, const struct ieee80211_frame *wh, const struct ieee80211_meshprep_ie *prep) { #define IS_PROXY(rt) (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) #define PROXIED_BY_US(rt) \ (IEEE80211_ADDR_EQ(vap->iv_myaddr, rt->rt_mesh_gate)) struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_hwmp_state *hs = vap->iv_hwmp; struct ieee80211_mesh_route *rt = NULL; struct ieee80211_mesh_route *rtorig = NULL; struct ieee80211_mesh_route *rtext = NULL; struct ieee80211_hwmp_route *hr; struct ieee80211com *ic = vap->iv_ic; struct mbuf *m, *next; uint32_t metric = 0; const uint8_t *addr; IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "received PREP, orig %6D, targ %6D", prep->prep_origaddr, ":", prep->prep_targetaddr, ":"); /* * Acceptance criteria: (If the corresponding PREP was not generated * by us OR not generated by an external mac that is not proxied by us) * AND forwarding is disabled, discard this PREP. */ rtorig = ieee80211_mesh_rt_find(vap, prep->prep_origaddr); if ((!IEEE80211_ADDR_EQ(vap->iv_myaddr, prep->prep_origaddr) || (rtorig != NULL && IS_PROXY(rtorig) && !PROXIED_BY_US(rtorig))) && !(ms->ms_flags & IEEE80211_MESHFLAGS_FWD)){ IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "discard PREP, orig(%6D) not proxied or generated by us", prep->prep_origaddr, ":"); return; } /* PREP ACCEPTED */ /* * If accepted shall create or update the active forwarding information * it maintains for the target mesh STA of the PREP (according to the * rules defined in 13.10.8.4). If the conditions for creating or * updating the forwarding information have not been met in those * rules, no further steps are applied to the PREP. */ rt = ieee80211_mesh_rt_find(vap, prep->prep_targetaddr); if (rt == NULL) { rt = ieee80211_mesh_rt_add(vap, prep->prep_targetaddr); if (rt == NULL) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "unable to add PREP path to %6D", prep->prep_targetaddr, ":"); vap->iv_stats.is_mesh_rtaddfailed++; return; } IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "adding target %6D", prep->prep_targetaddr, ":"); } hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route); /* update path metric */ metric = prep->prep_metric + ms->ms_pmetric->mpm_metric(ni); if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID)) { if (HWMP_SEQ_LT(prep->prep_targetseq, hr->hr_seq)) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "discard PREP from %6D, old seq no %u < %u", prep->prep_targetaddr, ":", prep->prep_targetseq, hr->hr_seq); return; } else if (HWMP_SEQ_LEQ(prep->prep_targetseq, hr->hr_seq) && metric > rt->rt_metric) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "discard PREP from %6D, new metric %u > %u", prep->prep_targetaddr, ":", metric, rt->rt_metric); return; } } IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "%s path to %6D, hopcount %d:%d metric %d:%d", rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID ? "prefer" : "update", prep->prep_targetaddr, ":", rt->rt_nhops, prep->prep_hopcount + 1, rt->rt_metric, metric); hr->hr_seq = prep->prep_targetseq; hr->hr_preqretries = 0; IEEE80211_ADDR_COPY(rt->rt_nexthop, ni->ni_macaddr); rt->rt_metric = metric; rt->rt_nhops = prep->prep_hopcount + 1; ieee80211_mesh_rt_update(rt, prep->prep_lifetime); if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_DISCOVER) { /* discovery complete */ rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_DISCOVER; } rt->rt_flags |= IEEE80211_MESHRT_FLAGS_VALID; /* mark valid */ /* Update forwarding information to TA if metric improves */ hwmp_update_transmitter(vap, ni, "PREP"); /* * If it's NOT for us, propagate the PREP */ if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, prep->prep_origaddr) && prep->prep_ttl > 1 && prep->prep_hopcount < hs->hs_maxhops) { struct ieee80211_meshprep_ie pprep; /* propagated PREP */ /* * NB: We should already have setup the path to orig * mesh STA when we propagated PREQ to target mesh STA, * no PREP is generated without a corresponding PREQ. * XXX: for now just ignore. */ if (rtorig == NULL) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "received PREP for an unknown orig(%6D)", prep->prep_origaddr, ":"); return; } IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "propagate PREP from %6D", prep->prep_targetaddr, ":"); memcpy(&pprep, prep, sizeof(pprep)); pprep.prep_hopcount += 1; pprep.prep_ttl -= 1; pprep.prep_metric += ms->ms_pmetric->mpm_metric(ni); hwmp_send_prep(vap, rtorig->rt_nexthop, &pprep); /* precursor list for the Target Mesh STA Address is updated */ } /* * Check if we received a PREP w/ AE and store target external address. * We may store target external address if recevied PREP w/ AE * and we are not final destination */ if (prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE) { rtext = ieee80211_mesh_rt_find(vap, prep->prep_target_ext_addr); if (rtext == NULL) { rtext = ieee80211_mesh_rt_add(vap, prep->prep_target_ext_addr); if (rtext == NULL) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "unable to add PREP path to proxy %6D", prep->prep_targetaddr, ":"); vap->iv_stats.is_mesh_rtaddfailed++; return; } } IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "%s path to %6D, hopcount %d:%d metric %d:%d", rtext->rt_flags & IEEE80211_MESHRT_FLAGS_VALID ? "prefer" : "update", prep->prep_target_ext_addr, ":", rtext->rt_nhops, prep->prep_hopcount + 1, rtext->rt_metric, metric); rtext->rt_flags = IEEE80211_MESHRT_FLAGS_PROXY | IEEE80211_MESHRT_FLAGS_VALID; IEEE80211_ADDR_COPY(rtext->rt_dest, prep->prep_target_ext_addr); IEEE80211_ADDR_COPY(rtext->rt_mesh_gate, prep->prep_targetaddr); IEEE80211_ADDR_COPY(rtext->rt_nexthop, wh->i_addr2); rtext->rt_metric = metric; rtext->rt_lifetime = prep->prep_lifetime; rtext->rt_nhops = prep->prep_hopcount + 1; rtext->rt_ext_seq = prep->prep_origseq; /* new proxy seq */ /* * XXX: proxy entries have no HWMP priv data, * nullify them to be sure? */ } /* * Check for frames queued awaiting path discovery. * XXX probably can tell exactly and avoid remove call * NB: hash may have false matches, if so they will get * stuck back on the stageq because there won't be * a path. */ addr = prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE ? prep->prep_target_ext_addr : prep->prep_targetaddr; m = ieee80211_ageq_remove(&ic->ic_stageq, (struct ieee80211_node *)(uintptr_t) ieee80211_mac_hash(ic, addr)); /* either dest or ext_dest */ /* * All frames in the stageq here should be non-M_ENCAP; or things * will get very unhappy. */ for (; m != NULL; m = next) { next = m->m_nextpkt; m->m_nextpkt = NULL; IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "flush queued frame %p len %d", m, m->m_pkthdr.len); /* * If the mbuf has M_ENCAP set, ensure we free it. * Note that after if_transmit() is called, m is invalid. */ (void) ieee80211_vap_xmitpkt(vap, m); } #undef IS_PROXY #undef PROXIED_BY_US } static int hwmp_send_prep(struct ieee80211vap *vap, const uint8_t da[IEEE80211_ADDR_LEN], struct ieee80211_meshprep_ie *prep) { /* NB: there's no PREP minimum interval. */ /* * mesh prep action frame format * [6] da * [6] sa * [6] addr3 = sa * [1] action * [1] category * [tlv] mesh path reply */ prep->prep_ie = IEEE80211_ELEMID_MESHPREP; prep->prep_len = prep->prep_flags & IEEE80211_MESHPREP_FLAGS_AE ? IEEE80211_MESHPREP_BASE_SZ_AE : IEEE80211_MESHPREP_BASE_SZ; return hwmp_send_action(vap, da, (uint8_t *)prep, prep->prep_len + 2); } #define PERR_DFLAGS(n) perr.perr_dests[n].dest_flags #define PERR_DADDR(n) perr.perr_dests[n].dest_addr #define PERR_DSEQ(n) perr.perr_dests[n].dest_seq #define PERR_DRCODE(n) perr.perr_dests[n].dest_rcode static void hwmp_peerdown(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_meshperr_ie perr; struct ieee80211_mesh_route *rt; struct ieee80211_hwmp_route *hr; rt = ieee80211_mesh_rt_find(vap, ni->ni_macaddr); if (rt == NULL) return; hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route); IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "%s", "delete route entry"); perr.perr_ttl = ms->ms_ttl; perr.perr_ndests = 1; PERR_DFLAGS(0) = 0; if (hr->hr_seq == 0) PERR_DFLAGS(0) |= IEEE80211_MESHPERR_DFLAGS_USN; PERR_DFLAGS(0) |= IEEE80211_MESHPERR_DFLAGS_RC; IEEE80211_ADDR_COPY(PERR_DADDR(0), rt->rt_dest); PERR_DSEQ(0) = ++hr->hr_seq; PERR_DRCODE(0) = IEEE80211_REASON_MESH_PERR_DEST_UNREACH; /* NB: flush everything passing through peer */ ieee80211_mesh_rt_flush_peer(vap, ni->ni_macaddr); hwmp_send_perr(vap, broadcastaddr, &perr); } #undef PERR_DFLAGS #undef PERR_DADDR #undef PERR_DSEQ #undef PERR_DRCODE #define PERR_DFLAGS(n) perr->perr_dests[n].dest_flags #define PERR_DADDR(n) perr->perr_dests[n].dest_addr #define PERR_DSEQ(n) perr->perr_dests[n].dest_seq #define PERR_DEXTADDR(n) perr->perr_dests[n].dest_ext_addr static void hwmp_recv_perr(struct ieee80211vap *vap, struct ieee80211_node *ni, const struct ieee80211_frame *wh, const struct ieee80211_meshperr_ie *perr) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt = NULL; struct ieee80211_mesh_route *rt_ext = NULL; struct ieee80211_hwmp_route *hr; struct ieee80211_meshperr_ie *pperr = NULL; int i, j = 0, forward = 0; IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "received PERR from %6D", wh->i_addr2, ":"); /* * if forwarding is true, prepare pperr */ if (ms->ms_flags & IEEE80211_MESHFLAGS_FWD) { forward = 1; pperr = IEEE80211_MALLOC(sizeof(*perr) + 31*sizeof(*perr->perr_dests), M_80211_MESH_PERR, IEEE80211_M_NOWAIT); /* XXX: magic number, 32 err dests */ } /* * Acceptance criteria: check if we have forwarding information * stored about destination, and that nexthop == TA of this PERR. * NB: we also build a new PERR to propagate in case we should forward. */ for (i = 0; i < perr->perr_ndests; i++) { rt = ieee80211_mesh_rt_find(vap, PERR_DADDR(i)); if (rt == NULL) continue; if (!IEEE80211_ADDR_EQ(rt->rt_nexthop, wh->i_addr2)) continue; /* found and accepted a PERR ndest element, process it... */ if (forward) memcpy(&pperr->perr_dests[j], &perr->perr_dests[i], sizeof(*perr->perr_dests)); hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route); switch(PERR_DFLAGS(i)) { case (IEEE80211_REASON_MESH_PERR_NO_FI): if (PERR_DSEQ(i) == 0) { hr->hr_seq++; if (forward) { pperr->perr_dests[j].dest_seq = hr->hr_seq; } } else { hr->hr_seq = PERR_DSEQ(i); } rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_VALID; j++; break; case (IEEE80211_REASON_MESH_PERR_DEST_UNREACH): if(HWMP_SEQ_GT(PERR_DSEQ(i), hr->hr_seq)) { hr->hr_seq = PERR_DSEQ(i); rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_VALID; j++; } break; case (IEEE80211_REASON_MESH_PERR_NO_PROXY): rt_ext = ieee80211_mesh_rt_find(vap, PERR_DEXTADDR(i)); if (rt_ext != NULL) { rt_ext->rt_flags &= ~IEEE80211_MESHRT_FLAGS_VALID; j++; } break; default: IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL, "PERR, unknown reason code %u\n", PERR_DFLAGS(i)); goto done; /* XXX: stats?? */ } ieee80211_mesh_rt_flush_peer(vap, PERR_DADDR(i)); KASSERT(j < 32, ("PERR, error ndest >= 32 (%u)", j)); } if (j == 0) { IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL, "%s", "PERR not accepted"); goto done; /* XXX: stats?? */ } /* * Propagate the PERR if we previously found it on our routing table. */ if (forward && perr->perr_ttl > 1) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "propagate PERR from %6D", wh->i_addr2, ":"); pperr->perr_ndests = j; pperr->perr_ttl--; hwmp_send_perr(vap, broadcastaddr, pperr); } done: if (pperr != NULL) IEEE80211_FREE(pperr, M_80211_MESH_PERR); } #undef PERR_DFLAGS #undef PERR_DADDR #undef PERR_DSEQ #undef PERR_DEXTADDR static int hwmp_send_perr(struct ieee80211vap *vap, const uint8_t da[IEEE80211_ADDR_LEN], struct ieee80211_meshperr_ie *perr) { struct ieee80211_hwmp_state *hs = vap->iv_hwmp; int i; uint8_t length = 0; /* * Enforce PERR interval. */ if (ratecheck(&hs->hs_lastperr, &ieee80211_hwmp_perrminint) == 0) return EALREADY; getmicrouptime(&hs->hs_lastperr); /* * mesh perr action frame format * [6] da * [6] sa * [6] addr3 = sa * [1] action * [1] category * [tlv] mesh path error */ perr->perr_ie = IEEE80211_ELEMID_MESHPERR; length = IEEE80211_MESHPERR_BASE_SZ; for (i = 0; iperr_ndests; i++) { if (perr->perr_dests[i].dest_flags & IEEE80211_MESHPERR_FLAGS_AE) { length += IEEE80211_MESHPERR_DEST_SZ_AE; continue ; } length += IEEE80211_MESHPERR_DEST_SZ; } perr->perr_len =length; return hwmp_send_action(vap, da, (uint8_t *)perr, perr->perr_len+2); } /* * Called from the rest of the net80211 code (mesh code for example). * NB: IEEE80211_REASON_MESH_PERR_DEST_UNREACH can be trigger by the fact that * a mesh STA is unable to forward an MSDU/MMPDU to a next-hop mesh STA. */ #define PERR_DFLAGS(n) perr.perr_dests[n].dest_flags #define PERR_DADDR(n) perr.perr_dests[n].dest_addr #define PERR_DSEQ(n) perr.perr_dests[n].dest_seq #define PERR_DEXTADDR(n) perr.perr_dests[n].dest_ext_addr #define PERR_DRCODE(n) perr.perr_dests[n].dest_rcode static void hwmp_senderror(struct ieee80211vap *vap, const uint8_t addr[IEEE80211_ADDR_LEN], struct ieee80211_mesh_route *rt, int rcode) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_hwmp_route *hr = NULL; struct ieee80211_meshperr_ie perr; if (rt != NULL) hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route); perr.perr_ndests = 1; perr.perr_ttl = ms->ms_ttl; PERR_DFLAGS(0) = 0; PERR_DRCODE(0) = rcode; switch (rcode) { case IEEE80211_REASON_MESH_PERR_NO_FI: IEEE80211_ADDR_COPY(PERR_DADDR(0), addr); PERR_DSEQ(0) = 0; /* reserved */ break; case IEEE80211_REASON_MESH_PERR_NO_PROXY: KASSERT(rt != NULL, ("no proxy info for sending PERR")); KASSERT(rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY, ("route is not marked proxy")); PERR_DFLAGS(0) |= IEEE80211_MESHPERR_FLAGS_AE; IEEE80211_ADDR_COPY(PERR_DADDR(0), vap->iv_myaddr); PERR_DSEQ(0) = rt->rt_ext_seq; IEEE80211_ADDR_COPY(PERR_DEXTADDR(0), addr); break; case IEEE80211_REASON_MESH_PERR_DEST_UNREACH: KASSERT(rt != NULL, ("no route info for sending PERR")); IEEE80211_ADDR_COPY(PERR_DADDR(0), addr); PERR_DSEQ(0) = hr->hr_seq; break; default: KASSERT(0, ("unknown reason code for HWMP PERR (%u)", rcode)); } hwmp_send_perr(vap, broadcastaddr, &perr); } #undef PERR_DFLAGS #undef PEER_DADDR #undef PERR_DSEQ #undef PERR_DEXTADDR #undef PERR_DRCODE static void hwmp_recv_rann(struct ieee80211vap *vap, struct ieee80211_node *ni, const struct ieee80211_frame *wh, const struct ieee80211_meshrann_ie *rann) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_hwmp_state *hs = vap->iv_hwmp; struct ieee80211_mesh_route *rt = NULL; struct ieee80211_hwmp_route *hr; struct ieee80211_meshpreq_ie preq; struct ieee80211_meshrann_ie prann; if (IEEE80211_ADDR_EQ(rann->rann_addr, vap->iv_myaddr)) return; rt = ieee80211_mesh_rt_find(vap, rann->rann_addr); if (rt != NULL && rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) { hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route); /* Acceptance criteria: if RANN.seq < stored seq, discard RANN */ if (HWMP_SEQ_LT(rann->rann_seq, hr->hr_seq)) { IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL, "RANN seq %u < %u", rann->rann_seq, hr->hr_seq); return; } /* Acceptance criteria: if RANN.seq == stored seq AND * RANN.metric > stored metric, discard RANN */ if (HWMP_SEQ_EQ(rann->rann_seq, hr->hr_seq) && rann->rann_metric > rt->rt_metric) { IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL, "RANN metric %u > %u", rann->rann_metric, rt->rt_metric); return; } } /* RANN ACCEPTED */ ieee80211_hwmp_rannint = rann->rann_interval; /* XXX: mtx lock? */ if (rt == NULL) { rt = ieee80211_mesh_rt_add(vap, rann->rann_addr); if (rt == NULL) { IEEE80211_DISCARD(vap, IEEE80211_MSG_HWMP, wh, NULL, "unable to add mac for RANN root %6D", rann->rann_addr, ":"); vap->iv_stats.is_mesh_rtaddfailed++; return; } } hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route); /* Check if root is a mesh gate, mark it */ if (rann->rann_flags & IEEE80211_MESHRANN_FLAGS_GATE) { struct ieee80211_mesh_gate_route *gr; rt->rt_flags |= IEEE80211_MESHRT_FLAGS_GATE; gr = ieee80211_mesh_mark_gate(vap, rann->rann_addr, rt); gr->gr_lastseq = 0; /* NOT GANN */ } /* discovery timeout */ ieee80211_mesh_rt_update(rt, ticks_to_msecs(ieee80211_hwmp_roottimeout)); preq.preq_flags = IEEE80211_MESHPREQ_FLAGS_AM; preq.preq_hopcount = 0; preq.preq_ttl = ms->ms_ttl; preq.preq_id = 0; /* reserved */ IEEE80211_ADDR_COPY(preq.preq_origaddr, vap->iv_myaddr); preq.preq_origseq = ++hs->hs_seq; preq.preq_lifetime = ieee80211_hwmp_roottimeout; preq.preq_metric = IEEE80211_MESHLMETRIC_INITIALVAL; preq.preq_tcount = 1; preq.preq_targets[0].target_flags = IEEE80211_MESHPREQ_TFLAGS_TO; /* NB: IEEE80211_MESHPREQ_TFLAGS_USN = 0 implicitly implied */ IEEE80211_ADDR_COPY(preq.preq_targets[0].target_addr, rann->rann_addr); preq.preq_targets[0].target_seq = rann->rann_seq; /* XXX: if rootconfint have not passed, we built this preq in vain */ hwmp_send_preq(vap, wh->i_addr2, &preq, &hr->hr_lastrootconf, &ieee80211_hwmp_rootconfint); /* propagate a RANN */ if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID && rann->rann_ttl > 1 && ms->ms_flags & IEEE80211_MESHFLAGS_FWD) { hr->hr_seq = rann->rann_seq; memcpy(&prann, rann, sizeof(prann)); prann.rann_hopcount += 1; prann.rann_ttl -= 1; prann.rann_metric += ms->ms_pmetric->mpm_metric(ni); hwmp_send_rann(vap, broadcastaddr, &prann); } } static int hwmp_send_rann(struct ieee80211vap *vap, const uint8_t da[IEEE80211_ADDR_LEN], struct ieee80211_meshrann_ie *rann) { /* * mesh rann action frame format * [6] da * [6] sa * [6] addr3 = sa * [1] action * [1] category * [tlv] root annoucement */ rann->rann_ie = IEEE80211_ELEMID_MESHRANN; rann->rann_len = IEEE80211_MESHRANN_BASE_SZ; return hwmp_send_action(vap, da, (uint8_t *)rann, rann->rann_len + 2); } #define PREQ_TFLAGS(n) preq.preq_targets[n].target_flags #define PREQ_TADDR(n) preq.preq_targets[n].target_addr #define PREQ_TSEQ(n) preq.preq_targets[n].target_seq static void hwmp_rediscover_cb(void *arg) { struct ieee80211_mesh_route *rt = arg; struct ieee80211vap *vap = rt->rt_vap; struct ieee80211_hwmp_state *hs = vap->iv_hwmp; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_hwmp_route *hr; struct ieee80211_meshpreq_ie preq; /* Optimize: storing first preq? */ if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID)) return ; /* nothing to do */ hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route); if (hr->hr_preqretries >= ieee80211_hwmp_maxpreq_retries) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ANY, rt->rt_dest, "%s", "max number of discovery, send queued frames to GATE"); ieee80211_mesh_forward_to_gates(vap, rt); vap->iv_stats.is_mesh_fwd_nopath++; return ; /* XXX: flush queue? */ } hr->hr_preqretries++; IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, rt->rt_dest, "start path rediscovery , target seq %u", hr->hr_seq); /* * Try to discover the path for this node. * Group addressed PREQ Case A */ preq.preq_flags = 0; preq.preq_hopcount = 0; preq.preq_ttl = ms->ms_ttl; preq.preq_id = ++hs->hs_preqid; IEEE80211_ADDR_COPY(preq.preq_origaddr, vap->iv_myaddr); preq.preq_origseq = hr->hr_origseq; preq.preq_lifetime = ticks_to_msecs(ieee80211_hwmp_pathtimeout); preq.preq_metric = IEEE80211_MESHLMETRIC_INITIALVAL; preq.preq_tcount = 1; IEEE80211_ADDR_COPY(PREQ_TADDR(0), rt->rt_dest); PREQ_TFLAGS(0) = 0; if (ieee80211_hwmp_targetonly) PREQ_TFLAGS(0) |= IEEE80211_MESHPREQ_TFLAGS_TO; PREQ_TFLAGS(0) |= IEEE80211_MESHPREQ_TFLAGS_USN; PREQ_TSEQ(0) = 0; /* RESERVED when USN flag is set */ /* XXX check return value */ hwmp_send_preq(vap, broadcastaddr, &preq, &hr->hr_lastpreq, &ieee80211_hwmp_preqminint); callout_reset(&rt->rt_discovery, ieee80211_hwmp_net_diameter_traversaltime * 2, hwmp_rediscover_cb, rt); } static struct ieee80211_node * hwmp_discover(struct ieee80211vap *vap, const uint8_t dest[IEEE80211_ADDR_LEN], struct mbuf *m) { struct ieee80211_hwmp_state *hs = vap->iv_hwmp; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt = NULL; struct ieee80211_hwmp_route *hr; struct ieee80211_meshpreq_ie preq; struct ieee80211_node *ni; int sendpreq = 0; KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a mesh vap, opmode %d", vap->iv_opmode)); KASSERT(!IEEE80211_ADDR_EQ(vap->iv_myaddr, dest), ("%s: discovering self!", __func__)); ni = NULL; if (!IEEE80211_IS_MULTICAST(dest)) { rt = ieee80211_mesh_rt_find(vap, dest); if (rt == NULL) { rt = ieee80211_mesh_rt_add(vap, dest); if (rt == NULL) { IEEE80211_NOTE(vap, IEEE80211_MSG_HWMP, ni, "unable to add discovery path to %6D", dest, ":"); vap->iv_stats.is_mesh_rtaddfailed++; goto done; } } hr = IEEE80211_MESH_ROUTE_PRIV(rt, struct ieee80211_hwmp_route); if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_DISCOVER) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, dest, "%s", "already discovering queue frame until path found"); sendpreq = 1; goto done; } if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) { if (hr->hr_lastdiscovery != 0 && (ticks - hr->hr_lastdiscovery < (ieee80211_hwmp_net_diameter_traversaltime * 2))) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, dest, NULL, "%s", "too frequent discovery requeust"); sendpreq = 1; goto done; } hr->hr_lastdiscovery = ticks; if (hr->hr_preqretries >= ieee80211_hwmp_maxpreq_retries) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, dest, NULL, "%s", "no valid path , max number of discovery"); vap->iv_stats.is_mesh_fwd_nopath++; goto done; } rt->rt_flags = IEEE80211_MESHRT_FLAGS_DISCOVER; hr->hr_preqretries++; if (hr->hr_origseq == 0) hr->hr_origseq = ++hs->hs_seq; rt->rt_metric = IEEE80211_MESHLMETRIC_INITIALVAL; sendpreq = 1; IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, dest, "start path discovery (src %s), target seq %u", m == NULL ? "" : ether_sprintf( mtod(m, struct ether_header *)->ether_shost), hr->hr_seq); /* * Try to discover the path for this node. * Group addressed PREQ Case A */ preq.preq_flags = 0; preq.preq_hopcount = 0; preq.preq_ttl = ms->ms_ttl; preq.preq_id = ++hs->hs_preqid; IEEE80211_ADDR_COPY(preq.preq_origaddr, vap->iv_myaddr); preq.preq_origseq = hr->hr_origseq; preq.preq_lifetime = ticks_to_msecs(ieee80211_hwmp_pathtimeout); preq.preq_metric = IEEE80211_MESHLMETRIC_INITIALVAL; preq.preq_tcount = 1; IEEE80211_ADDR_COPY(PREQ_TADDR(0), dest); PREQ_TFLAGS(0) = 0; if (ieee80211_hwmp_targetonly) PREQ_TFLAGS(0) |= IEEE80211_MESHPREQ_TFLAGS_TO; PREQ_TFLAGS(0) |= IEEE80211_MESHPREQ_TFLAGS_USN; PREQ_TSEQ(0) = 0; /* RESERVED when USN flag is set */ /* XXX check return value */ hwmp_send_preq(vap, broadcastaddr, &preq, &hr->hr_lastpreq, &ieee80211_hwmp_preqminint); callout_reset(&rt->rt_discovery, ieee80211_hwmp_net_diameter_traversaltime * 2, hwmp_rediscover_cb, rt); } if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) ni = ieee80211_find_txnode(vap, rt->rt_nexthop); } else { ni = ieee80211_find_txnode(vap, dest); /* NB: if null then we leak mbuf */ KASSERT(ni != NULL, ("leak mcast frame")); return ni; } done: if (ni == NULL && m != NULL) { if (sendpreq) { struct ieee80211com *ic = vap->iv_ic; /* * Queue packet for transmit when path discovery * completes. If discovery never completes the * frame will be flushed by way of the aging timer. */ IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, dest, "%s", "queue frame until path found"); + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (void *)(uintptr_t) ieee80211_mac_hash(ic, dest); /* XXX age chosen randomly */ ieee80211_ageq_append(&ic->ic_stageq, m, IEEE80211_INACT_WAIT); } else { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_HWMP, dest, NULL, "%s", "no valid path to this node"); m_freem(m); } } return ni; } #undef PREQ_TFLAGS #undef PREQ_TADDR #undef PREQ_TSEQ static int hwmp_ioctl_get80211(struct ieee80211vap *vap, struct ieee80211req *ireq) { struct ieee80211_hwmp_state *hs = vap->iv_hwmp; int error; if (vap->iv_opmode != IEEE80211_M_MBSS) return ENOSYS; error = 0; switch (ireq->i_type) { case IEEE80211_IOC_HWMP_ROOTMODE: ireq->i_val = hs->hs_rootmode; break; case IEEE80211_IOC_HWMP_MAXHOPS: ireq->i_val = hs->hs_maxhops; break; default: return ENOSYS; } return error; } IEEE80211_IOCTL_GET(hwmp, hwmp_ioctl_get80211); static int hwmp_ioctl_set80211(struct ieee80211vap *vap, struct ieee80211req *ireq) { struct ieee80211_hwmp_state *hs = vap->iv_hwmp; int error; if (vap->iv_opmode != IEEE80211_M_MBSS) return ENOSYS; error = 0; switch (ireq->i_type) { case IEEE80211_IOC_HWMP_ROOTMODE: if (ireq->i_val < 0 || ireq->i_val > 3) return EINVAL; hs->hs_rootmode = ireq->i_val; hwmp_rootmode_setup(vap); break; case IEEE80211_IOC_HWMP_MAXHOPS: if (ireq->i_val <= 0 || ireq->i_val > 255) return EINVAL; hs->hs_maxhops = ireq->i_val; break; default: return ENOSYS; } return error; } IEEE80211_IOCTL_SET(hwmp, hwmp_ioctl_set80211); Index: head/sys/net80211/ieee80211_mesh.c =================================================================== --- head/sys/net80211/ieee80211_mesh.c (revision 348253) +++ head/sys/net80211/ieee80211_mesh.c (revision 348254) @@ -1,3608 +1,3609 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 The FreeBSD Foundation * All rights reserved. * * This software was developed by Rui Paulo under sponsorship from the * FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #ifdef __FreeBSD__ __FBSDID("$FreeBSD$"); #endif /* * IEEE 802.11s Mesh Point (MBSS) support. * * Based on March 2009, D3.0 802.11s draft spec. */ #include "opt_inet.h" #include "opt_wlan.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IEEE80211_SUPPORT_SUPERG #include #endif #include #include static void mesh_rt_flush_invalid(struct ieee80211vap *); static int mesh_select_proto_path(struct ieee80211vap *, const char *); static int mesh_select_proto_metric(struct ieee80211vap *, const char *); static void mesh_vattach(struct ieee80211vap *); static int mesh_newstate(struct ieee80211vap *, enum ieee80211_state, int); static void mesh_rt_cleanup_cb(void *); static void mesh_gatemode_setup(struct ieee80211vap *); static void mesh_gatemode_cb(void *); static void mesh_linkchange(struct ieee80211_node *, enum ieee80211_mesh_mlstate); static void mesh_checkid(void *, struct ieee80211_node *); static uint32_t mesh_generateid(struct ieee80211vap *); static int mesh_checkpseq(struct ieee80211vap *, const uint8_t [IEEE80211_ADDR_LEN], uint32_t); static void mesh_transmit_to_gate(struct ieee80211vap *, struct mbuf *, struct ieee80211_mesh_route *); static void mesh_forward(struct ieee80211vap *, struct mbuf *, const struct ieee80211_meshcntl *); static int mesh_input(struct ieee80211_node *, struct mbuf *, const struct ieee80211_rx_stats *rxs, int, int); static void mesh_recv_mgmt(struct ieee80211_node *, struct mbuf *, int, const struct ieee80211_rx_stats *rxs, int, int); static void mesh_recv_ctl(struct ieee80211_node *, struct mbuf *, int); static void mesh_peer_timeout_setup(struct ieee80211_node *); static void mesh_peer_timeout_backoff(struct ieee80211_node *); static void mesh_peer_timeout_cb(void *); static __inline void mesh_peer_timeout_stop(struct ieee80211_node *); static int mesh_verify_meshid(struct ieee80211vap *, const uint8_t *); static int mesh_verify_meshconf(struct ieee80211vap *, const uint8_t *); static int mesh_verify_meshpeer(struct ieee80211vap *, uint8_t, const uint8_t *); uint32_t mesh_airtime_calc(struct ieee80211_node *); /* * Timeout values come from the specification and are in milliseconds. */ static SYSCTL_NODE(_net_wlan, OID_AUTO, mesh, CTLFLAG_RD, 0, "IEEE 802.11s parameters"); static int ieee80211_mesh_gateint = -1; SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, gateint, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_mesh_gateint, 0, ieee80211_sysctl_msecs_ticks, "I", "mesh gate interval (ms)"); static int ieee80211_mesh_retrytimeout = -1; SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, retrytimeout, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_mesh_retrytimeout, 0, ieee80211_sysctl_msecs_ticks, "I", "Retry timeout (msec)"); static int ieee80211_mesh_holdingtimeout = -1; SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, holdingtimeout, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_mesh_holdingtimeout, 0, ieee80211_sysctl_msecs_ticks, "I", "Holding state timeout (msec)"); static int ieee80211_mesh_confirmtimeout = -1; SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, confirmtimeout, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_mesh_confirmtimeout, 0, ieee80211_sysctl_msecs_ticks, "I", "Confirm state timeout (msec)"); static int ieee80211_mesh_backofftimeout = -1; SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, backofftimeout, CTLTYPE_INT | CTLFLAG_RW, &ieee80211_mesh_backofftimeout, 0, ieee80211_sysctl_msecs_ticks, "I", "Backoff timeout (msec). This is to throutles peering forever when " "not receiving answer or is rejected by a neighbor"); static int ieee80211_mesh_maxretries = 2; SYSCTL_INT(_net_wlan_mesh, OID_AUTO, maxretries, CTLFLAG_RW, &ieee80211_mesh_maxretries, 0, "Maximum retries during peer link establishment"); static int ieee80211_mesh_maxholding = 2; SYSCTL_INT(_net_wlan_mesh, OID_AUTO, maxholding, CTLFLAG_RW, &ieee80211_mesh_maxholding, 0, "Maximum times we are allowed to transition to HOLDING state before " "backinoff during peer link establishment"); static const uint8_t broadcastaddr[IEEE80211_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static ieee80211_recv_action_func mesh_recv_action_meshpeering_open; static ieee80211_recv_action_func mesh_recv_action_meshpeering_confirm; static ieee80211_recv_action_func mesh_recv_action_meshpeering_close; static ieee80211_recv_action_func mesh_recv_action_meshlmetric; static ieee80211_recv_action_func mesh_recv_action_meshgate; static ieee80211_send_action_func mesh_send_action_meshpeering_open; static ieee80211_send_action_func mesh_send_action_meshpeering_confirm; static ieee80211_send_action_func mesh_send_action_meshpeering_close; static ieee80211_send_action_func mesh_send_action_meshlmetric; static ieee80211_send_action_func mesh_send_action_meshgate; static const struct ieee80211_mesh_proto_metric mesh_metric_airtime = { .mpm_descr = "AIRTIME", .mpm_ie = IEEE80211_MESHCONF_METRIC_AIRTIME, .mpm_metric = mesh_airtime_calc, }; static struct ieee80211_mesh_proto_path mesh_proto_paths[4]; static struct ieee80211_mesh_proto_metric mesh_proto_metrics[4]; MALLOC_DEFINE(M_80211_MESH_PREQ, "80211preq", "802.11 MESH Path Request frame"); MALLOC_DEFINE(M_80211_MESH_PREP, "80211prep", "802.11 MESH Path Reply frame"); MALLOC_DEFINE(M_80211_MESH_PERR, "80211perr", "802.11 MESH Path Error frame"); /* The longer one of the lifetime should be stored as new lifetime */ #define MESH_ROUTE_LIFETIME_MAX(a, b) (a > b ? a : b) MALLOC_DEFINE(M_80211_MESH_RT, "80211mesh_rt", "802.11s routing table"); MALLOC_DEFINE(M_80211_MESH_GT_RT, "80211mesh_gt", "802.11s known gates table"); /* * Helper functions to manipulate the Mesh routing table. */ static struct ieee80211_mesh_route * mesh_rt_find_locked(struct ieee80211_mesh_state *ms, const uint8_t dest[IEEE80211_ADDR_LEN]) { struct ieee80211_mesh_route *rt; MESH_RT_LOCK_ASSERT(ms); TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) { if (IEEE80211_ADDR_EQ(dest, rt->rt_dest)) return rt; } return NULL; } static struct ieee80211_mesh_route * mesh_rt_add_locked(struct ieee80211vap *vap, const uint8_t dest[IEEE80211_ADDR_LEN]) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt; KASSERT(!IEEE80211_ADDR_EQ(broadcastaddr, dest), ("%s: adding broadcast to the routing table", __func__)); MESH_RT_LOCK_ASSERT(ms); rt = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_route)) + ms->ms_ppath->mpp_privlen, M_80211_MESH_RT, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); if (rt != NULL) { rt->rt_vap = vap; IEEE80211_ADDR_COPY(rt->rt_dest, dest); rt->rt_priv = (void *)ALIGN(&rt[1]); MESH_RT_ENTRY_LOCK_INIT(rt, "MBSS_RT"); callout_init(&rt->rt_discovery, 1); rt->rt_updtime = ticks; /* create time */ TAILQ_INSERT_TAIL(&ms->ms_routes, rt, rt_next); } return rt; } struct ieee80211_mesh_route * ieee80211_mesh_rt_find(struct ieee80211vap *vap, const uint8_t dest[IEEE80211_ADDR_LEN]) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt; MESH_RT_LOCK(ms); rt = mesh_rt_find_locked(ms, dest); MESH_RT_UNLOCK(ms); return rt; } struct ieee80211_mesh_route * ieee80211_mesh_rt_add(struct ieee80211vap *vap, const uint8_t dest[IEEE80211_ADDR_LEN]) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt; KASSERT(ieee80211_mesh_rt_find(vap, dest) == NULL, ("%s: duplicate entry in the routing table", __func__)); KASSERT(!IEEE80211_ADDR_EQ(vap->iv_myaddr, dest), ("%s: adding self to the routing table", __func__)); MESH_RT_LOCK(ms); rt = mesh_rt_add_locked(vap, dest); MESH_RT_UNLOCK(ms); return rt; } /* * Update the route lifetime and returns the updated lifetime. * If new_lifetime is zero and route is timedout it will be invalidated. * new_lifetime is in msec */ int ieee80211_mesh_rt_update(struct ieee80211_mesh_route *rt, int new_lifetime) { int timesince, now; uint32_t lifetime = 0; KASSERT(rt != NULL, ("route is NULL")); now = ticks; MESH_RT_ENTRY_LOCK(rt); /* dont clobber a proxy entry gated by us */ if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY && rt->rt_nhops == 0) { MESH_RT_ENTRY_UNLOCK(rt); return rt->rt_lifetime; } timesince = ticks_to_msecs(now - rt->rt_updtime); rt->rt_updtime = now; if (timesince >= rt->rt_lifetime) { if (new_lifetime != 0) { rt->rt_lifetime = new_lifetime; } else { rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_VALID; rt->rt_lifetime = 0; } } else { /* update what is left of lifetime */ rt->rt_lifetime = rt->rt_lifetime - timesince; rt->rt_lifetime = MESH_ROUTE_LIFETIME_MAX( new_lifetime, rt->rt_lifetime); } lifetime = rt->rt_lifetime; MESH_RT_ENTRY_UNLOCK(rt); return lifetime; } /* * Add a proxy route (as needed) for the specified destination. */ void ieee80211_mesh_proxy_check(struct ieee80211vap *vap, const uint8_t dest[IEEE80211_ADDR_LEN]) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt; MESH_RT_LOCK(ms); rt = mesh_rt_find_locked(ms, dest); if (rt == NULL) { rt = mesh_rt_add_locked(vap, dest); if (rt == NULL) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest, "%s", "unable to add proxy entry"); vap->iv_stats.is_mesh_rtaddfailed++; } else { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest, "%s", "add proxy entry"); IEEE80211_ADDR_COPY(rt->rt_mesh_gate, vap->iv_myaddr); IEEE80211_ADDR_COPY(rt->rt_nexthop, vap->iv_myaddr); rt->rt_flags |= IEEE80211_MESHRT_FLAGS_VALID | IEEE80211_MESHRT_FLAGS_PROXY; } } else if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) { KASSERT(rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY, ("no proxy flag for poxy entry")); struct ieee80211com *ic = vap->iv_ic; /* * Fix existing entry created by received frames from * stations that have some memory of dest. We also * flush any frames held on the staging queue; delivering * them is too much trouble right now. */ IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest, "%s", "fix proxy entry"); IEEE80211_ADDR_COPY(rt->rt_nexthop, vap->iv_myaddr); rt->rt_flags |= IEEE80211_MESHRT_FLAGS_VALID | IEEE80211_MESHRT_FLAGS_PROXY; /* XXX belongs in hwmp */ ieee80211_ageq_drain_node(&ic->ic_stageq, (void *)(uintptr_t) ieee80211_mac_hash(ic, dest)); /* XXX stat? */ } MESH_RT_UNLOCK(ms); } static __inline void mesh_rt_del(struct ieee80211_mesh_state *ms, struct ieee80211_mesh_route *rt) { TAILQ_REMOVE(&ms->ms_routes, rt, rt_next); /* * Grab the lock before destroying it, to be sure no one else * is holding the route. */ MESH_RT_ENTRY_LOCK(rt); callout_drain(&rt->rt_discovery); MESH_RT_ENTRY_LOCK_DESTROY(rt); IEEE80211_FREE(rt, M_80211_MESH_RT); } void ieee80211_mesh_rt_del(struct ieee80211vap *vap, const uint8_t dest[IEEE80211_ADDR_LEN]) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt, *next; MESH_RT_LOCK(ms); TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) { if (IEEE80211_ADDR_EQ(rt->rt_dest, dest)) { if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) { ms->ms_ppath->mpp_senderror(vap, dest, rt, IEEE80211_REASON_MESH_PERR_NO_PROXY); } else { ms->ms_ppath->mpp_senderror(vap, dest, rt, IEEE80211_REASON_MESH_PERR_DEST_UNREACH); } mesh_rt_del(ms, rt); MESH_RT_UNLOCK(ms); return; } } MESH_RT_UNLOCK(ms); } void ieee80211_mesh_rt_flush(struct ieee80211vap *vap) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt, *next; if (ms == NULL) return; MESH_RT_LOCK(ms); TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) mesh_rt_del(ms, rt); MESH_RT_UNLOCK(ms); } void ieee80211_mesh_rt_flush_peer(struct ieee80211vap *vap, const uint8_t peer[IEEE80211_ADDR_LEN]) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt, *next; MESH_RT_LOCK(ms); TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) { if (IEEE80211_ADDR_EQ(rt->rt_nexthop, peer)) mesh_rt_del(ms, rt); } MESH_RT_UNLOCK(ms); } /* * Flush expired routing entries, i.e. those in invalid state for * some time. */ static void mesh_rt_flush_invalid(struct ieee80211vap *vap) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt, *next; if (ms == NULL) return; MESH_RT_LOCK(ms); TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) { /* Discover paths will be deleted by their own callout */ if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_DISCOVER) continue; ieee80211_mesh_rt_update(rt, 0); if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) mesh_rt_del(ms, rt); } MESH_RT_UNLOCK(ms); } int ieee80211_mesh_register_proto_path(const struct ieee80211_mesh_proto_path *mpp) { int i, firstempty = -1; for (i = 0; i < nitems(mesh_proto_paths); i++) { if (strncmp(mpp->mpp_descr, mesh_proto_paths[i].mpp_descr, IEEE80211_MESH_PROTO_DSZ) == 0) return EEXIST; if (!mesh_proto_paths[i].mpp_active && firstempty == -1) firstempty = i; } if (firstempty < 0) return ENOSPC; memcpy(&mesh_proto_paths[firstempty], mpp, sizeof(*mpp)); mesh_proto_paths[firstempty].mpp_active = 1; return 0; } int ieee80211_mesh_register_proto_metric(const struct ieee80211_mesh_proto_metric *mpm) { int i, firstempty = -1; for (i = 0; i < nitems(mesh_proto_metrics); i++) { if (strncmp(mpm->mpm_descr, mesh_proto_metrics[i].mpm_descr, IEEE80211_MESH_PROTO_DSZ) == 0) return EEXIST; if (!mesh_proto_metrics[i].mpm_active && firstempty == -1) firstempty = i; } if (firstempty < 0) return ENOSPC; memcpy(&mesh_proto_metrics[firstempty], mpm, sizeof(*mpm)); mesh_proto_metrics[firstempty].mpm_active = 1; return 0; } static int mesh_select_proto_path(struct ieee80211vap *vap, const char *name) { struct ieee80211_mesh_state *ms = vap->iv_mesh; int i; for (i = 0; i < nitems(mesh_proto_paths); i++) { if (strcasecmp(mesh_proto_paths[i].mpp_descr, name) == 0) { ms->ms_ppath = &mesh_proto_paths[i]; return 0; } } return ENOENT; } static int mesh_select_proto_metric(struct ieee80211vap *vap, const char *name) { struct ieee80211_mesh_state *ms = vap->iv_mesh; int i; for (i = 0; i < nitems(mesh_proto_metrics); i++) { if (strcasecmp(mesh_proto_metrics[i].mpm_descr, name) == 0) { ms->ms_pmetric = &mesh_proto_metrics[i]; return 0; } } return ENOENT; } static void mesh_gatemode_setup(struct ieee80211vap *vap) { struct ieee80211_mesh_state *ms = vap->iv_mesh; /* * NB: When a mesh gate is running as a ROOT it shall * not send out periodic GANNs but instead mark the * mesh gate flag for the corresponding proactive PREQ * and RANN frames. */ if (ms->ms_flags & IEEE80211_MESHFLAGS_ROOT || (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) == 0) { callout_drain(&ms->ms_gatetimer); return ; } callout_reset(&ms->ms_gatetimer, ieee80211_mesh_gateint, mesh_gatemode_cb, vap); } static void mesh_gatemode_cb(void *arg) { struct ieee80211vap *vap = (struct ieee80211vap *)arg; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_meshgann_ie gann; gann.gann_flags = 0; /* Reserved */ gann.gann_hopcount = 0; gann.gann_ttl = ms->ms_ttl; IEEE80211_ADDR_COPY(gann.gann_addr, vap->iv_myaddr); gann.gann_seq = ms->ms_gateseq++; gann.gann_interval = ieee80211_mesh_gateint; IEEE80211_NOTE(vap, IEEE80211_MSG_MESH, vap->iv_bss, "send broadcast GANN (seq %u)", gann.gann_seq); ieee80211_send_action(vap->iv_bss, IEEE80211_ACTION_CAT_MESH, IEEE80211_ACTION_MESH_GANN, &gann); mesh_gatemode_setup(vap); } static void ieee80211_mesh_init(void) { memset(mesh_proto_paths, 0, sizeof(mesh_proto_paths)); memset(mesh_proto_metrics, 0, sizeof(mesh_proto_metrics)); /* * Setup mesh parameters that depends on the clock frequency. */ ieee80211_mesh_gateint = msecs_to_ticks(10000); ieee80211_mesh_retrytimeout = msecs_to_ticks(40); ieee80211_mesh_holdingtimeout = msecs_to_ticks(40); ieee80211_mesh_confirmtimeout = msecs_to_ticks(40); ieee80211_mesh_backofftimeout = msecs_to_ticks(5000); /* * Register action frame handlers. */ ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_OPEN, mesh_recv_action_meshpeering_open); ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CONFIRM, mesh_recv_action_meshpeering_confirm); ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, mesh_recv_action_meshpeering_close); ieee80211_recv_action_register(IEEE80211_ACTION_CAT_MESH, IEEE80211_ACTION_MESH_LMETRIC, mesh_recv_action_meshlmetric); ieee80211_recv_action_register(IEEE80211_ACTION_CAT_MESH, IEEE80211_ACTION_MESH_GANN, mesh_recv_action_meshgate); ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_OPEN, mesh_send_action_meshpeering_open); ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CONFIRM, mesh_send_action_meshpeering_confirm); ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, mesh_send_action_meshpeering_close); ieee80211_send_action_register(IEEE80211_ACTION_CAT_MESH, IEEE80211_ACTION_MESH_LMETRIC, mesh_send_action_meshlmetric); ieee80211_send_action_register(IEEE80211_ACTION_CAT_MESH, IEEE80211_ACTION_MESH_GANN, mesh_send_action_meshgate); /* * Register Airtime Link Metric. */ ieee80211_mesh_register_proto_metric(&mesh_metric_airtime); } SYSINIT(wlan_mesh, SI_SUB_DRIVERS, SI_ORDER_FIRST, ieee80211_mesh_init, NULL); void ieee80211_mesh_attach(struct ieee80211com *ic) { ic->ic_vattach[IEEE80211_M_MBSS] = mesh_vattach; } void ieee80211_mesh_detach(struct ieee80211com *ic) { } static void mesh_vdetach_peers(void *arg, struct ieee80211_node *ni) { struct ieee80211com *ic = ni->ni_ic; uint16_t args[3]; if (ni->ni_mlstate == IEEE80211_NODE_MESH_ESTABLISHED) { args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; args[2] = IEEE80211_REASON_PEER_LINK_CANCELED; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); } callout_drain(&ni->ni_mltimer); /* XXX belongs in hwmp */ ieee80211_ageq_drain_node(&ic->ic_stageq, (void *)(uintptr_t) ieee80211_mac_hash(ic, ni->ni_macaddr)); } static void mesh_vdetach(struct ieee80211vap *vap) { struct ieee80211_mesh_state *ms = vap->iv_mesh; callout_drain(&ms->ms_cleantimer); ieee80211_iterate_nodes(&vap->iv_ic->ic_sta, mesh_vdetach_peers, NULL); ieee80211_mesh_rt_flush(vap); MESH_RT_LOCK_DESTROY(ms); ms->ms_ppath->mpp_vdetach(vap); IEEE80211_FREE(vap->iv_mesh, M_80211_VAP); vap->iv_mesh = NULL; } static void mesh_vattach(struct ieee80211vap *vap) { struct ieee80211_mesh_state *ms; vap->iv_newstate = mesh_newstate; vap->iv_input = mesh_input; vap->iv_opdetach = mesh_vdetach; vap->iv_recv_mgmt = mesh_recv_mgmt; vap->iv_recv_ctl = mesh_recv_ctl; ms = IEEE80211_MALLOC(sizeof(struct ieee80211_mesh_state), M_80211_VAP, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); if (ms == NULL) { printf("%s: couldn't alloc MBSS state\n", __func__); return; } vap->iv_mesh = ms; ms->ms_seq = 0; ms->ms_flags = (IEEE80211_MESHFLAGS_AP | IEEE80211_MESHFLAGS_FWD); ms->ms_ttl = IEEE80211_MESH_DEFAULT_TTL; TAILQ_INIT(&ms->ms_known_gates); TAILQ_INIT(&ms->ms_routes); MESH_RT_LOCK_INIT(ms, "MBSS"); callout_init(&ms->ms_cleantimer, 1); callout_init(&ms->ms_gatetimer, 1); ms->ms_gateseq = 0; mesh_select_proto_metric(vap, "AIRTIME"); KASSERT(ms->ms_pmetric, ("ms_pmetric == NULL")); mesh_select_proto_path(vap, "HWMP"); KASSERT(ms->ms_ppath, ("ms_ppath == NULL")); ms->ms_ppath->mpp_vattach(vap); } /* * IEEE80211_M_MBSS vap state machine handler. */ static int mesh_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211com *ic = vap->iv_ic; struct ieee80211_node *ni; enum ieee80211_state ostate; IEEE80211_LOCK_ASSERT(ic); ostate = vap->iv_state; IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s (%d)\n", __func__, ieee80211_state_name[ostate], ieee80211_state_name[nstate], arg); vap->iv_state = nstate; /* state transition */ if (ostate != IEEE80211_S_SCAN) ieee80211_cancel_scan(vap); /* background scan */ ni = vap->iv_bss; /* NB: no reference held */ if (nstate != IEEE80211_S_RUN && ostate == IEEE80211_S_RUN) { callout_drain(&ms->ms_cleantimer); callout_drain(&ms->ms_gatetimer); } switch (nstate) { case IEEE80211_S_INIT: switch (ostate) { case IEEE80211_S_SCAN: ieee80211_cancel_scan(vap); break; case IEEE80211_S_CAC: ieee80211_dfs_cac_stop(vap); break; case IEEE80211_S_RUN: ieee80211_iterate_nodes(&ic->ic_sta, mesh_vdetach_peers, NULL); break; default: break; } if (ostate != IEEE80211_S_INIT) { /* NB: optimize INIT -> INIT case */ ieee80211_reset_bss(vap); ieee80211_mesh_rt_flush(vap); } break; case IEEE80211_S_SCAN: switch (ostate) { case IEEE80211_S_INIT: if (vap->iv_des_chan != IEEE80211_CHAN_ANYC && !IEEE80211_IS_CHAN_RADAR(vap->iv_des_chan) && ms->ms_idlen != 0) { /* * Already have a channel and a mesh ID; bypass * the scan and startup immediately. */ ieee80211_create_ibss(vap, vap->iv_des_chan); break; } /* * Initiate a scan. We can come here as a result * of an IEEE80211_IOC_SCAN_REQ too in which case * the vap will be marked with IEEE80211_FEXT_SCANREQ * and the scan request parameters will be present * in iv_scanreq. Otherwise we do the default. */ if (vap->iv_flags_ext & IEEE80211_FEXT_SCANREQ) { ieee80211_check_scan(vap, vap->iv_scanreq_flags, vap->iv_scanreq_duration, vap->iv_scanreq_mindwell, vap->iv_scanreq_maxdwell, vap->iv_scanreq_nssid, vap->iv_scanreq_ssid); vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANREQ; } else ieee80211_check_scan_current(vap); break; default: break; } break; case IEEE80211_S_CAC: /* * Start CAC on a DFS channel. We come here when starting * a bss on a DFS channel (see ieee80211_create_ibss). */ ieee80211_dfs_cac_start(vap); break; case IEEE80211_S_RUN: switch (ostate) { case IEEE80211_S_INIT: /* * Already have a channel; bypass the * scan and startup immediately. * Note that ieee80211_create_ibss will call * back to do a RUN->RUN state change. */ ieee80211_create_ibss(vap, ieee80211_ht_adjust_channel(ic, ic->ic_curchan, vap->iv_flags_ht)); /* NB: iv_bss is changed on return */ break; case IEEE80211_S_CAC: /* * NB: This is the normal state change when CAC * expires and no radar was detected; no need to * clear the CAC timer as it's already expired. */ /* fall thru... */ case IEEE80211_S_CSA: #if 0 /* * Shorten inactivity timer of associated stations * to weed out sta's that don't follow a CSA. */ ieee80211_iterate_nodes(&ic->ic_sta, sta_csa, vap); #endif /* * Update bss node channel to reflect where * we landed after CSA. */ ieee80211_node_set_chan(ni, ieee80211_ht_adjust_channel(ic, ic->ic_curchan, ieee80211_htchanflags(ni->ni_chan))); /* XXX bypass debug msgs */ break; case IEEE80211_S_SCAN: case IEEE80211_S_RUN: #ifdef IEEE80211_DEBUG if (ieee80211_msg_debug(vap)) { ieee80211_note(vap, "synchronized with %s meshid ", ether_sprintf(ni->ni_meshid)); ieee80211_print_essid(ni->ni_meshid, ni->ni_meshidlen); /* XXX MCS/HT */ printf(" channel %d\n", ieee80211_chan2ieee(ic, ic->ic_curchan)); } #endif break; default: break; } ieee80211_node_authorize(ni); callout_reset(&ms->ms_cleantimer, ms->ms_ppath->mpp_inact, mesh_rt_cleanup_cb, vap); mesh_gatemode_setup(vap); break; default: break; } /* NB: ostate not nstate */ ms->ms_ppath->mpp_newstate(vap, ostate, arg); return 0; } static void mesh_rt_cleanup_cb(void *arg) { struct ieee80211vap *vap = arg; struct ieee80211_mesh_state *ms = vap->iv_mesh; mesh_rt_flush_invalid(vap); callout_reset(&ms->ms_cleantimer, ms->ms_ppath->mpp_inact, mesh_rt_cleanup_cb, vap); } /* * Mark a mesh STA as gate and return a pointer to it. * If this is first time, we create a new gate route. * Always update the path route to this mesh gate. */ struct ieee80211_mesh_gate_route * ieee80211_mesh_mark_gate(struct ieee80211vap *vap, const uint8_t *addr, struct ieee80211_mesh_route *rt) { struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_gate_route *gr = NULL, *next; int found = 0; MESH_RT_LOCK(ms); TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, next) { if (IEEE80211_ADDR_EQ(gr->gr_addr, addr)) { found = 1; break; } } if (!found) { /* New mesh gate add it to known table. */ IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, addr, "%s", "stored new gate information from pro-PREQ."); gr = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_gate_route)), M_80211_MESH_GT_RT, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); IEEE80211_ADDR_COPY(gr->gr_addr, addr); TAILQ_INSERT_TAIL(&ms->ms_known_gates, gr, gr_next); } gr->gr_route = rt; /* TODO: link from path route to gate route */ MESH_RT_UNLOCK(ms); return gr; } /* * Helper function to note the Mesh Peer Link FSM change. */ static void mesh_linkchange(struct ieee80211_node *ni, enum ieee80211_mesh_mlstate state) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_mesh_state *ms = vap->iv_mesh; #ifdef IEEE80211_DEBUG static const char *meshlinkstates[] = { [IEEE80211_NODE_MESH_IDLE] = "IDLE", [IEEE80211_NODE_MESH_OPENSNT] = "OPEN SENT", [IEEE80211_NODE_MESH_OPENRCV] = "OPEN RECEIVED", [IEEE80211_NODE_MESH_CONFIRMRCV] = "CONFIRM RECEIVED", [IEEE80211_NODE_MESH_ESTABLISHED] = "ESTABLISHED", [IEEE80211_NODE_MESH_HOLDING] = "HOLDING" }; #endif IEEE80211_NOTE(vap, IEEE80211_MSG_MESH, ni, "peer link: %s -> %s", meshlinkstates[ni->ni_mlstate], meshlinkstates[state]); /* track neighbor count */ if (state == IEEE80211_NODE_MESH_ESTABLISHED && ni->ni_mlstate != IEEE80211_NODE_MESH_ESTABLISHED) { KASSERT(ms->ms_neighbors < 65535, ("neighbor count overflow")); ms->ms_neighbors++; ieee80211_beacon_notify(vap, IEEE80211_BEACON_MESHCONF); } else if (ni->ni_mlstate == IEEE80211_NODE_MESH_ESTABLISHED && state != IEEE80211_NODE_MESH_ESTABLISHED) { KASSERT(ms->ms_neighbors > 0, ("neighbor count 0")); ms->ms_neighbors--; ieee80211_beacon_notify(vap, IEEE80211_BEACON_MESHCONF); } ni->ni_mlstate = state; switch (state) { case IEEE80211_NODE_MESH_HOLDING: ms->ms_ppath->mpp_peerdown(ni); break; case IEEE80211_NODE_MESH_ESTABLISHED: ieee80211_mesh_discover(vap, ni->ni_macaddr, NULL); break; default: break; } } /* * Helper function to generate a unique local ID required for mesh * peer establishment. */ static void mesh_checkid(void *arg, struct ieee80211_node *ni) { uint16_t *r = arg; if (*r == ni->ni_mllid) *(uint16_t *)arg = 0; } static uint32_t mesh_generateid(struct ieee80211vap *vap) { int maxiter = 4; uint16_t r; do { get_random_bytes(&r, 2); ieee80211_iterate_nodes(&vap->iv_ic->ic_sta, mesh_checkid, &r); maxiter--; } while (r == 0 && maxiter > 0); return r; } /* * Verifies if we already received this packet by checking its * sequence number. * Returns 0 if the frame is to be accepted, 1 otherwise. */ static int mesh_checkpseq(struct ieee80211vap *vap, const uint8_t source[IEEE80211_ADDR_LEN], uint32_t seq) { struct ieee80211_mesh_route *rt; rt = ieee80211_mesh_rt_find(vap, source); if (rt == NULL) { rt = ieee80211_mesh_rt_add(vap, source); if (rt == NULL) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, source, "%s", "add mcast route failed"); vap->iv_stats.is_mesh_rtaddfailed++; return 1; } IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, source, "add mcast route, mesh seqno %d", seq); rt->rt_lastmseq = seq; return 0; } if (IEEE80211_MESH_SEQ_GEQ(rt->rt_lastmseq, seq)) { return 1; } else { rt->rt_lastmseq = seq; return 0; } } /* * Iterate the routing table and locate the next hop. */ struct ieee80211_node * ieee80211_mesh_find_txnode(struct ieee80211vap *vap, const uint8_t dest[IEEE80211_ADDR_LEN]) { struct ieee80211_mesh_route *rt; rt = ieee80211_mesh_rt_find(vap, dest); if (rt == NULL) return NULL; if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest, "%s: !valid, flags 0x%x", __func__, rt->rt_flags); /* XXX stat */ return NULL; } if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) { rt = ieee80211_mesh_rt_find(vap, rt->rt_mesh_gate); if (rt == NULL) return NULL; if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest, "%s: meshgate !valid, flags 0x%x", __func__, rt->rt_flags); /* XXX stat */ return NULL; } } return ieee80211_find_txnode(vap, rt->rt_nexthop); } static void mesh_transmit_to_gate(struct ieee80211vap *vap, struct mbuf *m, struct ieee80211_mesh_route *rt_gate) { struct ifnet *ifp = vap->iv_ifp; struct ieee80211_node *ni; IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic); ni = ieee80211_mesh_find_txnode(vap, rt_gate->rt_dest); if (ni == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); return; } /* * Send through the VAP packet transmit path. * This consumes the node ref grabbed above and * the mbuf, regardless of whether there's a problem * or not. */ (void) ieee80211_vap_pkt_send_dest(vap, m, ni); } /* * Forward the queued frames to known valid mesh gates. * Assume destination to be outside the MBSS (i.e. proxy entry), * If no valid mesh gates are known silently discard queued frames. * After transmitting frames to all known valid mesh gates, this route * will be marked invalid, and a new path discovery will happen in the hopes * that (at least) one of the mesh gates have a new proxy entry for us to use. */ void ieee80211_mesh_forward_to_gates(struct ieee80211vap *vap, struct ieee80211_mesh_route *rt_dest) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt_gate; struct ieee80211_mesh_gate_route *gr = NULL, *gr_next; struct mbuf *m, *mcopy, *next; IEEE80211_TX_UNLOCK_ASSERT(ic); KASSERT( rt_dest->rt_flags == IEEE80211_MESHRT_FLAGS_DISCOVER, ("Route is not marked with IEEE80211_MESHRT_FLAGS_DISCOVER")); /* XXX: send to more than one valid mash gate */ MESH_RT_LOCK(ms); m = ieee80211_ageq_remove(&ic->ic_stageq, (struct ieee80211_node *)(uintptr_t) ieee80211_mac_hash(ic, rt_dest->rt_dest)); TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, gr_next) { rt_gate = gr->gr_route; if (rt_gate == NULL) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, rt_dest->rt_dest, "mesh gate with no path %6D", gr->gr_addr, ":"); continue; } if ((rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) continue; KASSERT(rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_GATE, ("route not marked as a mesh gate")); KASSERT((rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) == 0, ("found mesh gate that is also marked porxy")); /* * convert route to a proxy route gated by the current * mesh gate, this is needed so encap can built data * frame with correct address. */ rt_dest->rt_flags = IEEE80211_MESHRT_FLAGS_PROXY | IEEE80211_MESHRT_FLAGS_VALID; rt_dest->rt_ext_seq = 1; /* random value */ IEEE80211_ADDR_COPY(rt_dest->rt_mesh_gate, rt_gate->rt_dest); IEEE80211_ADDR_COPY(rt_dest->rt_nexthop, rt_gate->rt_nexthop); rt_dest->rt_metric = rt_gate->rt_metric; rt_dest->rt_nhops = rt_gate->rt_nhops; ieee80211_mesh_rt_update(rt_dest, ms->ms_ppath->mpp_inact); MESH_RT_UNLOCK(ms); /* XXX: lock?? */ mcopy = m_dup(m, M_NOWAIT); for (; mcopy != NULL; mcopy = next) { next = mcopy->m_nextpkt; mcopy->m_nextpkt = NULL; IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP, rt_dest->rt_dest, "flush queued frame %p len %d", mcopy, mcopy->m_pkthdr.len); mesh_transmit_to_gate(vap, mcopy, rt_gate); } MESH_RT_LOCK(ms); } rt_dest->rt_flags = 0; /* Mark invalid */ m_freem(m); MESH_RT_UNLOCK(ms); } /* * Forward the specified frame. * Decrement the TTL and set TA to our MAC address. */ static void mesh_forward(struct ieee80211vap *vap, struct mbuf *m, const struct ieee80211_meshcntl *mc) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ifnet *ifp = vap->iv_ifp; const struct ieee80211_frame *wh = mtod(m, const struct ieee80211_frame *); struct mbuf *mcopy; struct ieee80211_meshcntl *mccopy; struct ieee80211_frame *whcopy; struct ieee80211_node *ni; int err; /* This is called from the RX path - don't hold this lock */ IEEE80211_TX_UNLOCK_ASSERT(ic); /* * mesh ttl of 1 means we are the last one receiving it, * according to amendment we decrement and then check if * 0, if so we dont forward. */ if (mc->mc_ttl < 1) { IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh, "%s", "frame not fwd'd, ttl 1"); vap->iv_stats.is_mesh_fwd_ttl++; return; } if (!(ms->ms_flags & IEEE80211_MESHFLAGS_FWD)) { IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh, "%s", "frame not fwd'd, fwding disabled"); vap->iv_stats.is_mesh_fwd_disabled++; return; } mcopy = m_dup(m, M_NOWAIT); if (mcopy == NULL) { IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh, "%s", "frame not fwd'd, cannot dup"); vap->iv_stats.is_mesh_fwd_nobuf++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return; } mcopy = m_pullup(mcopy, ieee80211_hdrspace(ic, wh) + sizeof(struct ieee80211_meshcntl)); if (mcopy == NULL) { IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh, "%s", "frame not fwd'd, too short"); vap->iv_stats.is_mesh_fwd_tooshort++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(mcopy); return; } whcopy = mtod(mcopy, struct ieee80211_frame *); mccopy = (struct ieee80211_meshcntl *) (mtod(mcopy, uint8_t *) + ieee80211_hdrspace(ic, wh)); /* XXX clear other bits? */ whcopy->i_fc[1] &= ~IEEE80211_FC1_RETRY; IEEE80211_ADDR_COPY(whcopy->i_addr2, vap->iv_myaddr); if (IEEE80211_IS_MULTICAST(wh->i_addr1)) { ni = ieee80211_ref_node(vap->iv_bss); mcopy->m_flags |= M_MCAST; } else { ni = ieee80211_mesh_find_txnode(vap, whcopy->i_addr3); if (ni == NULL) { /* * [Optional] any of the following three actions: * o silently discard * o trigger a path discovery * o inform TA that meshDA is unknown. */ IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh, "%s", "frame not fwd'd, no path"); ms->ms_ppath->mpp_senderror(vap, whcopy->i_addr3, NULL, IEEE80211_REASON_MESH_PERR_NO_FI); vap->iv_stats.is_mesh_fwd_nopath++; m_freem(mcopy); return; } IEEE80211_ADDR_COPY(whcopy->i_addr1, ni->ni_macaddr); } KASSERT(mccopy->mc_ttl > 0, ("%s called with wrong ttl", __func__)); mccopy->mc_ttl--; /* XXX calculate priority so drivers can find the tx queue */ M_WME_SETAC(mcopy, WME_AC_BE); /* XXX do we know m_nextpkt is NULL? */ + MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mcopy->m_pkthdr.rcvif = (void *) ni; /* * XXX this bypasses all of the VAP TX handling; it passes frames * directly to the parent interface. * * Because of this, there's no TX lock being held as there's no * encaps state being used. * * Doing a direct parent transmit may not be the correct thing * to do here; we'll have to re-think this soon. */ IEEE80211_TX_LOCK(ic); err = ieee80211_parent_xmitpkt(ic, mcopy); IEEE80211_TX_UNLOCK(ic); if (!err) if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); } static struct mbuf * mesh_decap(struct ieee80211vap *vap, struct mbuf *m, int hdrlen, int meshdrlen) { #define WHDIR(wh) ((wh)->i_fc[1] & IEEE80211_FC1_DIR_MASK) #define MC01(mc) ((const struct ieee80211_meshcntl_ae01 *)mc) uint8_t b[sizeof(struct ieee80211_qosframe_addr4) + sizeof(struct ieee80211_meshcntl_ae10)]; const struct ieee80211_qosframe_addr4 *wh; const struct ieee80211_meshcntl_ae10 *mc; struct ether_header *eh; struct llc *llc; int ae; if (m->m_len < hdrlen + sizeof(*llc) && (m = m_pullup(m, hdrlen + sizeof(*llc))) == NULL) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_ANY, "discard data frame: %s", "m_pullup failed"); vap->iv_stats.is_rx_tooshort++; return NULL; } memcpy(b, mtod(m, caddr_t), hdrlen); wh = (const struct ieee80211_qosframe_addr4 *)&b[0]; mc = (const struct ieee80211_meshcntl_ae10 *)&b[hdrlen - meshdrlen]; KASSERT(WHDIR(wh) == IEEE80211_FC1_DIR_FROMDS || WHDIR(wh) == IEEE80211_FC1_DIR_DSTODS, ("bogus dir, fc 0x%x:0x%x", wh->i_fc[0], wh->i_fc[1])); llc = (struct llc *)(mtod(m, caddr_t) + hdrlen); if (llc->llc_dsap == LLC_SNAP_LSAP && llc->llc_ssap == LLC_SNAP_LSAP && llc->llc_control == LLC_UI && llc->llc_snap.org_code[0] == 0 && llc->llc_snap.org_code[1] == 0 && llc->llc_snap.org_code[2] == 0 && /* NB: preserve AppleTalk frames that have a native SNAP hdr */ !(llc->llc_snap.ether_type == htons(ETHERTYPE_AARP) || llc->llc_snap.ether_type == htons(ETHERTYPE_IPX))) { m_adj(m, hdrlen + sizeof(struct llc) - sizeof(*eh)); llc = NULL; } else { m_adj(m, hdrlen - sizeof(*eh)); } eh = mtod(m, struct ether_header *); ae = mc->mc_flags & IEEE80211_MESH_AE_MASK; if (WHDIR(wh) == IEEE80211_FC1_DIR_FROMDS) { IEEE80211_ADDR_COPY(eh->ether_dhost, wh->i_addr1); if (ae == IEEE80211_MESH_AE_00) { IEEE80211_ADDR_COPY(eh->ether_shost, wh->i_addr3); } else if (ae == IEEE80211_MESH_AE_01) { IEEE80211_ADDR_COPY(eh->ether_shost, MC01(mc)->mc_addr4); } else { IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY, (const struct ieee80211_frame *)wh, NULL, "bad AE %d", ae); vap->iv_stats.is_mesh_badae++; m_freem(m); return NULL; } } else { if (ae == IEEE80211_MESH_AE_00) { IEEE80211_ADDR_COPY(eh->ether_dhost, wh->i_addr3); IEEE80211_ADDR_COPY(eh->ether_shost, wh->i_addr4); } else if (ae == IEEE80211_MESH_AE_10) { IEEE80211_ADDR_COPY(eh->ether_dhost, mc->mc_addr5); IEEE80211_ADDR_COPY(eh->ether_shost, mc->mc_addr6); } else { IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY, (const struct ieee80211_frame *)wh, NULL, "bad AE %d", ae); vap->iv_stats.is_mesh_badae++; m_freem(m); return NULL; } } #ifndef __NO_STRICT_ALIGNMENT if (!ALIGNED_POINTER(mtod(m, caddr_t) + sizeof(*eh), uint32_t)) { m = ieee80211_realign(vap, m, sizeof(*eh)); if (m == NULL) return NULL; } #endif /* !__NO_STRICT_ALIGNMENT */ if (llc != NULL) { eh = mtod(m, struct ether_header *); eh->ether_type = htons(m->m_pkthdr.len - sizeof(*eh)); } return m; #undef WDIR #undef MC01 } /* * Return non-zero if the unicast mesh data frame should be processed * locally. Frames that are not proxy'd have our address, otherwise * we need to consult the routing table to look for a proxy entry. */ static __inline int mesh_isucastforme(struct ieee80211vap *vap, const struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc) { int ae = mc->mc_flags & 3; KASSERT((wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) == IEEE80211_FC1_DIR_DSTODS, ("bad dir 0x%x:0x%x", wh->i_fc[0], wh->i_fc[1])); KASSERT(ae == IEEE80211_MESH_AE_00 || ae == IEEE80211_MESH_AE_10, ("bad AE %d", ae)); if (ae == IEEE80211_MESH_AE_10) { /* ucast w/ proxy */ const struct ieee80211_meshcntl_ae10 *mc10 = (const struct ieee80211_meshcntl_ae10 *) mc; struct ieee80211_mesh_route *rt = ieee80211_mesh_rt_find(vap, mc10->mc_addr5); /* check for proxy route to ourself */ return (rt != NULL && (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY)); } else /* ucast w/o proxy */ return IEEE80211_ADDR_EQ(wh->i_addr3, vap->iv_myaddr); } /* * Verifies transmitter, updates lifetime, precursor list and forwards data. * > 0 means we have forwarded data and no need to process locally * == 0 means we want to process locally (and we may have forwarded data * < 0 means there was an error and data should be discarded */ static int mesh_recv_indiv_data_to_fwrd(struct ieee80211vap *vap, struct mbuf *m, struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc) { struct ieee80211_qosframe_addr4 *qwh; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt_meshda, *rt_meshsa; /* This is called from the RX path - don't hold this lock */ IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic); qwh = (struct ieee80211_qosframe_addr4 *)wh; /* * TODO: * o verify addr2 is a legitimate transmitter * o lifetime of precursor of addr3 (addr2) is max(init, curr) * o lifetime of precursor of addr4 (nexthop) is max(init, curr) */ /* set lifetime of addr3 (meshDA) to initial value */ rt_meshda = ieee80211_mesh_rt_find(vap, qwh->i_addr3); if (rt_meshda == NULL) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, qwh->i_addr2, "no route to meshDA(%6D)", qwh->i_addr3, ":"); /* * [Optional] any of the following three actions: * o silently discard [X] * o trigger a path discovery [ ] * o inform TA that meshDA is unknown. [ ] */ /* XXX: stats */ return (-1); } ieee80211_mesh_rt_update(rt_meshda, ticks_to_msecs( ms->ms_ppath->mpp_inact)); /* set lifetime of addr4 (meshSA) to initial value */ rt_meshsa = ieee80211_mesh_rt_find(vap, qwh->i_addr4); KASSERT(rt_meshsa != NULL, ("no route")); ieee80211_mesh_rt_update(rt_meshsa, ticks_to_msecs( ms->ms_ppath->mpp_inact)); mesh_forward(vap, m, mc); return (1); /* dont process locally */ } /* * Verifies transmitter, updates lifetime, precursor list and process data * locally, if data is proxy with AE = 10 it could mean data should go * on another mesh path or data should be forwarded to the DS. * * > 0 means we have forwarded data and no need to process locally * == 0 means we want to process locally (and we may have forwarded data * < 0 means there was an error and data should be discarded */ static int mesh_recv_indiv_data_to_me(struct ieee80211vap *vap, struct mbuf *m, struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc) { struct ieee80211_qosframe_addr4 *qwh; const struct ieee80211_meshcntl_ae10 *mc10; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_route *rt; int ae; /* This is called from the RX path - don't hold this lock */ IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic); qwh = (struct ieee80211_qosframe_addr4 *)wh; mc10 = (const struct ieee80211_meshcntl_ae10 *)mc; /* * TODO: * o verify addr2 is a legitimate transmitter * o lifetime of precursor entry is max(init, curr) */ /* set lifetime of addr4 (meshSA) to initial value */ rt = ieee80211_mesh_rt_find(vap, qwh->i_addr4); KASSERT(rt != NULL, ("no route")); ieee80211_mesh_rt_update(rt, ticks_to_msecs(ms->ms_ppath->mpp_inact)); rt = NULL; ae = mc10->mc_flags & IEEE80211_MESH_AE_MASK; KASSERT(ae == IEEE80211_MESH_AE_00 || ae == IEEE80211_MESH_AE_10, ("bad AE %d", ae)); if (ae == IEEE80211_MESH_AE_10) { if (IEEE80211_ADDR_EQ(mc10->mc_addr5, qwh->i_addr3)) { return (0); /* process locally */ } rt = ieee80211_mesh_rt_find(vap, mc10->mc_addr5); if (rt != NULL && (rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) && (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) == 0) { /* * Forward on another mesh-path, according to * amendment as specified in 9.32.4.1 */ IEEE80211_ADDR_COPY(qwh->i_addr3, mc10->mc_addr5); mesh_forward(vap, m, (const struct ieee80211_meshcntl *)mc10); return (1); /* dont process locally */ } /* * All other cases: forward of MSDUs from the MBSS to DS indiv. * addressed according to 13.11.3.2. */ IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_OUTPUT, qwh->i_addr2, "forward frame to DS, SA(%6D) DA(%6D)", mc10->mc_addr6, ":", mc10->mc_addr5, ":"); } return (0); /* process locally */ } /* * Try to forward the group addressed data on to other mesh STAs, and * also to the DS. * * > 0 means we have forwarded data and no need to process locally * == 0 means we want to process locally (and we may have forwarded data * < 0 means there was an error and data should be discarded */ static int mesh_recv_group_data(struct ieee80211vap *vap, struct mbuf *m, struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc) { #define MC01(mc) ((const struct ieee80211_meshcntl_ae01 *)mc) struct ieee80211_mesh_state *ms = vap->iv_mesh; /* This is called from the RX path - don't hold this lock */ IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic); mesh_forward(vap, m, mc); if(mc->mc_ttl > 0) { if (mc->mc_flags & IEEE80211_MESH_AE_01) { /* * Forward of MSDUs from the MBSS to DS group addressed * (according to 13.11.3.2) * This happens by delivering the packet, and a bridge * will sent it on another port member. */ if (ms->ms_flags & IEEE80211_MESHFLAGS_GATE && ms->ms_flags & IEEE80211_MESHFLAGS_FWD) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, MC01(mc)->mc_addr4, "%s", "forward from MBSS to the DS"); } } } return (0); /* process locally */ #undef MC01 } static int mesh_input(struct ieee80211_node *ni, struct mbuf *m, const struct ieee80211_rx_stats *rxs, int rssi, int nf) { #define HAS_SEQ(type) ((type & 0x4) == 0) #define MC01(mc) ((const struct ieee80211_meshcntl_ae01 *)mc) struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ifnet *ifp = vap->iv_ifp; struct ieee80211_frame *wh; const struct ieee80211_meshcntl *mc; int hdrspace, meshdrlen, need_tap, error; uint8_t dir, type, subtype, ae; uint32_t seq; const uint8_t *addr; uint8_t qos[2]; KASSERT(ni != NULL, ("null node")); ni->ni_inact = ni->ni_inact_reload; need_tap = 1; /* mbuf need to be tapped. */ type = -1; /* undefined */ /* This is called from the RX path - don't hold this lock */ IEEE80211_TX_UNLOCK_ASSERT(ic); if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_min)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, NULL, "too short (1): len %u", m->m_pkthdr.len); vap->iv_stats.is_rx_tooshort++; goto out; } /* * Bit of a cheat here, we use a pointer for a 3-address * frame format but don't reference fields past outside * ieee80211_frame_min w/o first validating the data is * present. */ wh = mtod(m, struct ieee80211_frame *); if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) != IEEE80211_FC0_VERSION_0) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, NULL, "wrong version %x", wh->i_fc[0]); vap->iv_stats.is_rx_badversion++; goto err; } dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK; type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK; subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK; if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) { IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi); ni->ni_noise = nf; if (HAS_SEQ(type)) { uint8_t tid = ieee80211_gettid(wh); if (IEEE80211_QOS_HAS_SEQ(wh) && TID_TO_WME_AC(tid) >= WME_AC_VI) ic->ic_wme.wme_hipri_traffic++; if (! ieee80211_check_rxseq(ni, wh, wh->i_addr1, rxs)) goto out; } } #ifdef IEEE80211_DEBUG /* * It's easier, but too expensive, to simulate different mesh * topologies by consulting the ACL policy very early, so do this * only under DEBUG. * * NB: this check is also done upon peering link initiation. */ if (vap->iv_acl != NULL && !vap->iv_acl->iac_check(vap, wh)) { IEEE80211_DISCARD(vap, IEEE80211_MSG_ACL, wh, NULL, "%s", "disallowed by ACL"); vap->iv_stats.is_rx_acl++; goto out; } #endif switch (type) { case IEEE80211_FC0_TYPE_DATA: if (ni == vap->iv_bss) goto out; if (ni->ni_mlstate != IEEE80211_NODE_MESH_ESTABLISHED) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH, ni->ni_macaddr, NULL, "peer link not yet established (%d)", ni->ni_mlstate); vap->iv_stats.is_mesh_nolink++; goto out; } if (dir != IEEE80211_FC1_DIR_FROMDS && dir != IEEE80211_FC1_DIR_DSTODS) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "data", "incorrect dir 0x%x", dir); vap->iv_stats.is_rx_wrongdir++; goto err; } /* All Mesh data frames are QoS subtype */ if (!HAS_SEQ(type)) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "data", "incorrect subtype 0x%x", subtype); vap->iv_stats.is_rx_badsubtype++; goto err; } /* * Next up, any fragmentation. * XXX: we defrag before we even try to forward, * Mesh Control field is not present in sub-sequent * fragmented frames. This is in contrast to Draft 4.0. */ hdrspace = ieee80211_hdrspace(ic, wh); if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) { m = ieee80211_defrag(ni, m, hdrspace); if (m == NULL) { /* Fragment dropped or frame not complete yet */ goto out; } } wh = mtod(m, struct ieee80211_frame *); /* NB: after defrag */ /* * Now we have a complete Mesh Data frame. */ /* * Only fromDStoDS data frames use 4 address qos frames * as specified in amendment. Otherwise addr4 is located * in the Mesh Control field and a 3 address qos frame * is used. */ *(uint16_t *)qos = *(uint16_t *)ieee80211_getqos(wh); /* * NB: The mesh STA sets the Mesh Control Present * subfield to 1 in the Mesh Data frame containing * an unfragmented MSDU, an A-MSDU, or the first * fragment of an MSDU. * After defrag it should always be present. */ if (!(qos[1] & IEEE80211_QOS_MC)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH, ni->ni_macaddr, NULL, "%s", "Mesh control field not present"); vap->iv_stats.is_rx_elem_missing++; /* XXX: kinda */ goto err; } /* pull up enough to get to the mesh control */ if (m->m_len < hdrspace + sizeof(struct ieee80211_meshcntl) && (m = m_pullup(m, hdrspace + sizeof(struct ieee80211_meshcntl))) == NULL) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, NULL, "data too short: expecting %u", hdrspace); vap->iv_stats.is_rx_tooshort++; goto out; /* XXX */ } /* * Now calculate the full extent of the headers. Note * mesh_decap will pull up anything we didn't get * above when it strips the 802.11 headers. */ mc = (const struct ieee80211_meshcntl *) (mtod(m, const uint8_t *) + hdrspace); ae = mc->mc_flags & IEEE80211_MESH_AE_MASK; meshdrlen = sizeof(struct ieee80211_meshcntl) + ae * IEEE80211_ADDR_LEN; hdrspace += meshdrlen; /* pull complete hdrspace = ieee80211_hdrspace + meshcontrol */ if ((meshdrlen > sizeof(struct ieee80211_meshcntl)) && (m->m_len < hdrspace) && ((m = m_pullup(m, hdrspace)) == NULL)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, NULL, "data too short: expecting %u", hdrspace); vap->iv_stats.is_rx_tooshort++; goto out; /* XXX */ } /* XXX: are we sure there is no reallocating after m_pullup? */ seq = le32dec(mc->mc_seq); if (IEEE80211_IS_MULTICAST(wh->i_addr1)) addr = wh->i_addr3; else if (ae == IEEE80211_MESH_AE_01) addr = MC01(mc)->mc_addr4; else addr = ((struct ieee80211_qosframe_addr4 *)wh)->i_addr4; if (IEEE80211_ADDR_EQ(vap->iv_myaddr, addr)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT, addr, "data", "%s", "not to me"); vap->iv_stats.is_rx_wrongbss++; /* XXX kinda */ goto out; } if (mesh_checkpseq(vap, addr, seq) != 0) { vap->iv_stats.is_rx_dup++; goto out; } /* This code "routes" the frame to the right control path */ if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) { if (IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr3)) error = mesh_recv_indiv_data_to_me(vap, m, wh, mc); else if (IEEE80211_IS_MULTICAST(wh->i_addr3)) error = mesh_recv_group_data(vap, m, wh, mc); else error = mesh_recv_indiv_data_to_fwrd(vap, m, wh, mc); } else error = mesh_recv_group_data(vap, m, wh, mc); if (error < 0) goto err; else if (error > 0) goto out; if (ieee80211_radiotap_active_vap(vap)) ieee80211_radiotap_rx(vap, m); need_tap = 0; /* * Finally, strip the 802.11 header. */ m = mesh_decap(vap, m, hdrspace, meshdrlen); if (m == NULL) { /* XXX mask bit to check for both */ /* don't count Null data frames as errors */ if (subtype == IEEE80211_FC0_SUBTYPE_NODATA || subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL) goto out; IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT, ni->ni_macaddr, "data", "%s", "decap error"); vap->iv_stats.is_rx_decap++; IEEE80211_NODE_STAT(ni, rx_decap); goto err; } if (qos[0] & IEEE80211_QOS_AMSDU) { m = ieee80211_decap_amsdu(ni, m); if (m == NULL) return IEEE80211_FC0_TYPE_DATA; } ieee80211_deliver_data(vap, ni, m); return type; case IEEE80211_FC0_TYPE_MGT: vap->iv_stats.is_rx_mgmt++; IEEE80211_NODE_STAT(ni, rx_mgmt); if (dir != IEEE80211_FC1_DIR_NODS) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "mgt", "incorrect dir 0x%x", dir); vap->iv_stats.is_rx_wrongdir++; goto err; } if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, "mgt", "too short: len %u", m->m_pkthdr.len); vap->iv_stats.is_rx_tooshort++; goto out; } #ifdef IEEE80211_DEBUG if ((ieee80211_msg_debug(vap) && (vap->iv_ic->ic_flags & IEEE80211_F_SCAN)) || ieee80211_msg_dumppkts(vap)) { if_printf(ifp, "received %s from %s rssi %d\n", ieee80211_mgt_subtype_name(subtype), ether_sprintf(wh->i_addr2), rssi); } #endif if (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "WEP set but not permitted"); vap->iv_stats.is_rx_mgtdiscard++; /* XXX */ goto out; } vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf); goto out; case IEEE80211_FC0_TYPE_CTL: vap->iv_stats.is_rx_ctl++; IEEE80211_NODE_STAT(ni, rx_ctrl); goto out; default: IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY, wh, "bad", "frame type 0x%x", type); /* should not come here */ break; } err: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); out: if (m != NULL) { if (need_tap && ieee80211_radiotap_active_vap(vap)) ieee80211_radiotap_rx(vap, m); m_freem(m); } return type; #undef HAS_SEQ #undef MC01 } static void mesh_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_channel *rxchan = ic->ic_curchan; struct ieee80211_frame *wh; struct ieee80211_mesh_route *rt; uint8_t *frm, *efrm; wh = mtod(m0, struct ieee80211_frame *); frm = (uint8_t *)&wh[1]; efrm = mtod(m0, uint8_t *) + m0->m_len; switch (subtype) { case IEEE80211_FC0_SUBTYPE_PROBE_RESP: case IEEE80211_FC0_SUBTYPE_BEACON: { struct ieee80211_scanparams scan; struct ieee80211_channel *c; /* * We process beacon/probe response * frames to discover neighbors. */ if (rxs != NULL) { c = ieee80211_lookup_channel_rxstatus(vap, rxs); if (c != NULL) rxchan = c; } if (ieee80211_parse_beacon(ni, m0, rxchan, &scan) != 0) return; /* * Count frame now that we know it's to be processed. */ if (subtype == IEEE80211_FC0_SUBTYPE_BEACON) { vap->iv_stats.is_rx_beacon++; /* XXX remove */ IEEE80211_NODE_STAT(ni, rx_beacons); } else IEEE80211_NODE_STAT(ni, rx_proberesp); /* * If scanning, just pass information to the scan module. */ if (ic->ic_flags & IEEE80211_F_SCAN) { if (ic->ic_flags_ext & IEEE80211_FEXT_PROBECHAN) { /* * Actively scanning a channel marked passive; * send a probe request now that we know there * is 802.11 traffic present. * * XXX check if the beacon we recv'd gives * us what we need and suppress the probe req */ ieee80211_probe_curchan(vap, 1); ic->ic_flags_ext &= ~IEEE80211_FEXT_PROBECHAN; } ieee80211_add_scan(vap, rxchan, &scan, wh, subtype, rssi, nf); return; } /* The rest of this code assumes we are running */ if (vap->iv_state != IEEE80211_S_RUN) return; /* * Ignore non-mesh STAs. */ if ((scan.capinfo & (IEEE80211_CAPINFO_ESS|IEEE80211_CAPINFO_IBSS)) || scan.meshid == NULL || scan.meshconf == NULL) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "beacon", "%s", "not a mesh sta"); vap->iv_stats.is_mesh_wrongmesh++; return; } /* * Ignore STAs for other mesh networks. */ if (memcmp(scan.meshid+2, ms->ms_id, ms->ms_idlen) != 0 || mesh_verify_meshconf(vap, scan.meshconf)) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "beacon", "%s", "not for our mesh"); vap->iv_stats.is_mesh_wrongmesh++; return; } /* * Peer only based on the current ACL policy. */ if (vap->iv_acl != NULL && !vap->iv_acl->iac_check(vap, wh)) { IEEE80211_DISCARD(vap, IEEE80211_MSG_ACL, wh, NULL, "%s", "disallowed by ACL"); vap->iv_stats.is_rx_acl++; return; } /* * Do neighbor discovery. */ if (!IEEE80211_ADDR_EQ(wh->i_addr2, ni->ni_macaddr)) { /* * Create a new entry in the neighbor table. */ ni = ieee80211_add_neighbor(vap, wh, &scan); } /* * Automatically peer with discovered nodes if possible. */ if (ni != vap->iv_bss && (ms->ms_flags & IEEE80211_MESHFLAGS_AP)) { switch (ni->ni_mlstate) { case IEEE80211_NODE_MESH_IDLE: { uint16_t args[1]; /* Wait for backoff callout to reset counter */ if (ni->ni_mlhcnt >= ieee80211_mesh_maxholding) return; ni->ni_mlpid = mesh_generateid(vap); if (ni->ni_mlpid == 0) return; mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENSNT); args[0] = ni->ni_mlpid; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_OPEN, args); ni->ni_mlrcnt = 0; mesh_peer_timeout_setup(ni); break; } case IEEE80211_NODE_MESH_ESTABLISHED: { /* * Valid beacon from a peer mesh STA * bump TA lifetime */ rt = ieee80211_mesh_rt_find(vap, wh->i_addr2); if(rt != NULL) { ieee80211_mesh_rt_update(rt, ticks_to_msecs( ms->ms_ppath->mpp_inact)); } break; } default: break; /* ignore */ } } break; } case IEEE80211_FC0_SUBTYPE_PROBE_REQ: { uint8_t *ssid, *meshid, *rates, *xrates; if (vap->iv_state != IEEE80211_S_RUN) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "wrong state %s", ieee80211_state_name[vap->iv_state]); vap->iv_stats.is_rx_mgtdiscard++; return; } if (IEEE80211_IS_MULTICAST(wh->i_addr2)) { /* frame must be directed */ IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "not unicast"); vap->iv_stats.is_rx_mgtdiscard++; /* XXX stat */ return; } /* * prreq frame format * [tlv] ssid * [tlv] supported rates * [tlv] extended supported rates * [tlv] mesh id */ ssid = meshid = rates = xrates = NULL; while (efrm - frm > 1) { IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return); switch (*frm) { case IEEE80211_ELEMID_SSID: ssid = frm; break; case IEEE80211_ELEMID_RATES: rates = frm; break; case IEEE80211_ELEMID_XRATES: xrates = frm; break; case IEEE80211_ELEMID_MESHID: meshid = frm; break; } frm += frm[1] + 2; } IEEE80211_VERIFY_ELEMENT(ssid, IEEE80211_NWID_LEN, return); IEEE80211_VERIFY_ELEMENT(rates, IEEE80211_RATE_MAXSIZE, return); if (xrates != NULL) IEEE80211_VERIFY_ELEMENT(xrates, IEEE80211_RATE_MAXSIZE - rates[1], return); if (meshid != NULL) { IEEE80211_VERIFY_ELEMENT(meshid, IEEE80211_MESHID_LEN, return); /* NB: meshid, not ssid */ IEEE80211_VERIFY_SSID(vap->iv_bss, meshid, return); } /* XXX find a better class or define it's own */ IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_INPUT, wh->i_addr2, "%s", "recv probe req"); /* * Some legacy 11b clients cannot hack a complete * probe response frame. When the request includes * only a bare-bones rate set, communicate this to * the transmit side. */ ieee80211_send_proberesp(vap, wh->i_addr2, 0); break; } case IEEE80211_FC0_SUBTYPE_ACTION: case IEEE80211_FC0_SUBTYPE_ACTION_NOACK: if (ni == vap->iv_bss) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "unknown node"); vap->iv_stats.is_rx_mgtdiscard++; } else if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1) && !IEEE80211_IS_MULTICAST(wh->i_addr1)) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "not for us"); vap->iv_stats.is_rx_mgtdiscard++; } else if (vap->iv_state != IEEE80211_S_RUN) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "wrong state %s", ieee80211_state_name[vap->iv_state]); vap->iv_stats.is_rx_mgtdiscard++; } else { if (ieee80211_parse_action(ni, m0) == 0) (void)ic->ic_recv_action(ni, wh, frm, efrm); } break; case IEEE80211_FC0_SUBTYPE_ASSOC_REQ: case IEEE80211_FC0_SUBTYPE_ASSOC_RESP: case IEEE80211_FC0_SUBTYPE_REASSOC_REQ: case IEEE80211_FC0_SUBTYPE_REASSOC_RESP: case IEEE80211_FC0_SUBTYPE_TIMING_ADV: case IEEE80211_FC0_SUBTYPE_ATIM: case IEEE80211_FC0_SUBTYPE_DISASSOC: case IEEE80211_FC0_SUBTYPE_AUTH: case IEEE80211_FC0_SUBTYPE_DEAUTH: IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "not handled"); vap->iv_stats.is_rx_mgtdiscard++; break; default: IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY, wh, "mgt", "subtype 0x%x not handled", subtype); vap->iv_stats.is_rx_badsubtype++; break; } } static void mesh_recv_ctl(struct ieee80211_node *ni, struct mbuf *m, int subtype) { switch (subtype) { case IEEE80211_FC0_SUBTYPE_BAR: ieee80211_recv_bar(ni, m); break; } } /* * Parse meshpeering action ie's for MPM frames */ static const struct ieee80211_meshpeer_ie * mesh_parse_meshpeering_action(struct ieee80211_node *ni, const struct ieee80211_frame *wh, /* XXX for VERIFY_LENGTH */ const uint8_t *frm, const uint8_t *efrm, struct ieee80211_meshpeer_ie *mp, uint8_t subtype) { struct ieee80211vap *vap = ni->ni_vap; const struct ieee80211_meshpeer_ie *mpie; uint16_t args[3]; const uint8_t *meshid, *meshconf; uint8_t sendclose = 0; /* 1 = MPM frame rejected, close will be sent */ meshid = meshconf = NULL; while (efrm - frm > 1) { IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return NULL); switch (*frm) { case IEEE80211_ELEMID_MESHID: meshid = frm; break; case IEEE80211_ELEMID_MESHCONF: meshconf = frm; break; case IEEE80211_ELEMID_MESHPEER: mpie = (const struct ieee80211_meshpeer_ie *) frm; memset(mp, 0, sizeof(*mp)); mp->peer_len = mpie->peer_len; mp->peer_proto = le16dec(&mpie->peer_proto); mp->peer_llinkid = le16dec(&mpie->peer_llinkid); switch (subtype) { case IEEE80211_ACTION_MESHPEERING_CONFIRM: mp->peer_linkid = le16dec(&mpie->peer_linkid); break; case IEEE80211_ACTION_MESHPEERING_CLOSE: /* NB: peer link ID is optional */ if (mpie->peer_len == (IEEE80211_MPM_BASE_SZ + 2)) { mp->peer_linkid = 0; mp->peer_rcode = le16dec(&mpie->peer_linkid); } else { mp->peer_linkid = le16dec(&mpie->peer_linkid); mp->peer_rcode = le16dec(&mpie->peer_rcode); } break; } break; } frm += frm[1] + 2; } /* * Verify the contents of the frame. * If it fails validation, close the peer link. */ if (mesh_verify_meshpeer(vap, subtype, (const uint8_t *)mp)) { sendclose = 1; IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, wh, NULL, "%s", "MPM validation failed"); } /* If meshid is not the same reject any frames type. */ if (sendclose == 0 && mesh_verify_meshid(vap, meshid)) { sendclose = 1; IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, wh, NULL, "%s", "not for our mesh"); if (subtype == IEEE80211_ACTION_MESHPEERING_CLOSE) { /* * Standard not clear about this, if we dont ignore * there will be an endless loop between nodes sending * CLOSE frames between each other with wrong meshid. * Discard and timers will bring FSM to IDLE state. */ return NULL; } } /* * Close frames are accepted if meshid is the same. * Verify the other two types. */ if (sendclose == 0 && subtype != IEEE80211_ACTION_MESHPEERING_CLOSE && mesh_verify_meshconf(vap, meshconf)) { sendclose = 1; IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, wh, NULL, "%s", "configuration missmatch"); } if (sendclose) { vap->iv_stats.is_rx_mgtdiscard++; switch (ni->ni_mlstate) { case IEEE80211_NODE_MESH_IDLE: case IEEE80211_NODE_MESH_ESTABLISHED: case IEEE80211_NODE_MESH_HOLDING: /* ignore */ break; case IEEE80211_NODE_MESH_OPENSNT: case IEEE80211_NODE_MESH_OPENRCV: case IEEE80211_NODE_MESH_CONFIRMRCV: args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; /* Reason codes for rejection */ switch (subtype) { case IEEE80211_ACTION_MESHPEERING_OPEN: args[2] = IEEE80211_REASON_MESH_CPVIOLATION; break; case IEEE80211_ACTION_MESHPEERING_CONFIRM: args[2] = IEEE80211_REASON_MESH_INCONS_PARAMS; break; } ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING); mesh_peer_timeout_setup(ni); break; } return NULL; } return (const struct ieee80211_meshpeer_ie *) mp; } static int mesh_recv_action_meshpeering_open(struct ieee80211_node *ni, const struct ieee80211_frame *wh, const uint8_t *frm, const uint8_t *efrm) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_meshpeer_ie ie; const struct ieee80211_meshpeer_ie *meshpeer; uint16_t args[3]; /* +2+2 for action + code + capabilites */ meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2+2, efrm, &ie, IEEE80211_ACTION_MESHPEERING_OPEN); if (meshpeer == NULL) { return 0; } /* XXX move up */ IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni, "recv PEER OPEN, lid 0x%x", meshpeer->peer_llinkid); switch (ni->ni_mlstate) { case IEEE80211_NODE_MESH_IDLE: /* Reject open request if reached our maximum neighbor count */ if (ms->ms_neighbors >= IEEE80211_MESH_MAX_NEIGHBORS) { args[0] = meshpeer->peer_llinkid; args[1] = 0; args[2] = IEEE80211_REASON_MESH_MAX_PEERS; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); /* stay in IDLE state */ return (0); } /* Open frame accepted */ mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENRCV); ni->ni_mllid = meshpeer->peer_llinkid; ni->ni_mlpid = mesh_generateid(vap); if (ni->ni_mlpid == 0) return 0; /* XXX */ args[0] = ni->ni_mlpid; /* Announce we're open too... */ ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_OPEN, args); /* ...and confirm the link. */ args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CONFIRM, args); mesh_peer_timeout_setup(ni); break; case IEEE80211_NODE_MESH_OPENRCV: /* Wrong Link ID */ if (ni->ni_mllid != meshpeer->peer_llinkid) { args[0] = ni->ni_mllid; args[1] = ni->ni_mlpid; args[2] = IEEE80211_REASON_PEER_LINK_CANCELED; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING); mesh_peer_timeout_setup(ni); break; } /* Duplicate open, confirm again. */ args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CONFIRM, args); break; case IEEE80211_NODE_MESH_OPENSNT: ni->ni_mllid = meshpeer->peer_llinkid; mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENRCV); args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CONFIRM, args); /* NB: don't setup/clear any timeout */ break; case IEEE80211_NODE_MESH_CONFIRMRCV: if (ni->ni_mlpid != meshpeer->peer_linkid || ni->ni_mllid != meshpeer->peer_llinkid) { args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; args[2] = IEEE80211_REASON_PEER_LINK_CANCELED; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING); mesh_peer_timeout_setup(ni); break; } mesh_linkchange(ni, IEEE80211_NODE_MESH_ESTABLISHED); ni->ni_mllid = meshpeer->peer_llinkid; args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CONFIRM, args); mesh_peer_timeout_stop(ni); break; case IEEE80211_NODE_MESH_ESTABLISHED: if (ni->ni_mllid != meshpeer->peer_llinkid) { args[0] = ni->ni_mllid; args[1] = ni->ni_mlpid; args[2] = IEEE80211_REASON_PEER_LINK_CANCELED; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING); mesh_peer_timeout_setup(ni); break; } args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CONFIRM, args); break; case IEEE80211_NODE_MESH_HOLDING: args[0] = ni->ni_mlpid; args[1] = meshpeer->peer_llinkid; /* Standard not clear about what the reaason code should be */ args[2] = IEEE80211_REASON_PEER_LINK_CANCELED; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); break; } return 0; } static int mesh_recv_action_meshpeering_confirm(struct ieee80211_node *ni, const struct ieee80211_frame *wh, const uint8_t *frm, const uint8_t *efrm) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_meshpeer_ie ie; const struct ieee80211_meshpeer_ie *meshpeer; uint16_t args[3]; /* +2+2+2+2 for action + code + capabilites + status code + AID */ meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2+2+2+2, efrm, &ie, IEEE80211_ACTION_MESHPEERING_CONFIRM); if (meshpeer == NULL) { return 0; } IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni, "recv PEER CONFIRM, local id 0x%x, peer id 0x%x", meshpeer->peer_llinkid, meshpeer->peer_linkid); switch (ni->ni_mlstate) { case IEEE80211_NODE_MESH_OPENRCV: mesh_linkchange(ni, IEEE80211_NODE_MESH_ESTABLISHED); mesh_peer_timeout_stop(ni); break; case IEEE80211_NODE_MESH_OPENSNT: mesh_linkchange(ni, IEEE80211_NODE_MESH_CONFIRMRCV); mesh_peer_timeout_setup(ni); break; case IEEE80211_NODE_MESH_HOLDING: args[0] = ni->ni_mlpid; args[1] = meshpeer->peer_llinkid; /* Standard not clear about what the reaason code should be */ args[2] = IEEE80211_REASON_PEER_LINK_CANCELED; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); break; case IEEE80211_NODE_MESH_CONFIRMRCV: if (ni->ni_mllid != meshpeer->peer_llinkid) { args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; args[2] = IEEE80211_REASON_PEER_LINK_CANCELED; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING); mesh_peer_timeout_setup(ni); } break; default: IEEE80211_DISCARD(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, wh, NULL, "received confirm in invalid state %d", ni->ni_mlstate); vap->iv_stats.is_rx_mgtdiscard++; break; } return 0; } static int mesh_recv_action_meshpeering_close(struct ieee80211_node *ni, const struct ieee80211_frame *wh, const uint8_t *frm, const uint8_t *efrm) { struct ieee80211_meshpeer_ie ie; const struct ieee80211_meshpeer_ie *meshpeer; uint16_t args[3]; /* +2 for action + code */ meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2, efrm, &ie, IEEE80211_ACTION_MESHPEERING_CLOSE); if (meshpeer == NULL) { return 0; } /* * XXX: check reason code, for example we could receive * IEEE80211_REASON_MESH_MAX_PEERS then we should not attempt * to peer again. */ IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni, "%s", "recv PEER CLOSE"); switch (ni->ni_mlstate) { case IEEE80211_NODE_MESH_IDLE: /* ignore */ break; case IEEE80211_NODE_MESH_OPENRCV: case IEEE80211_NODE_MESH_OPENSNT: case IEEE80211_NODE_MESH_CONFIRMRCV: case IEEE80211_NODE_MESH_ESTABLISHED: args[0] = ni->ni_mlpid; args[1] = ni->ni_mllid; args[2] = IEEE80211_REASON_MESH_CLOSE_RCVD; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING); mesh_peer_timeout_setup(ni); break; case IEEE80211_NODE_MESH_HOLDING: mesh_linkchange(ni, IEEE80211_NODE_MESH_IDLE); mesh_peer_timeout_stop(ni); break; } return 0; } /* * Link Metric handling. */ static int mesh_recv_action_meshlmetric(struct ieee80211_node *ni, const struct ieee80211_frame *wh, const uint8_t *frm, const uint8_t *efrm) { const struct ieee80211_meshlmetric_ie *ie = (const struct ieee80211_meshlmetric_ie *) (frm+2); /* action + code */ struct ieee80211_meshlmetric_ie lm_rep; if (ie->lm_flags & IEEE80211_MESH_LMETRIC_FLAGS_REQ) { lm_rep.lm_flags = 0; lm_rep.lm_metric = mesh_airtime_calc(ni); ieee80211_send_action(ni, IEEE80211_ACTION_CAT_MESH, IEEE80211_ACTION_MESH_LMETRIC, &lm_rep); } /* XXX: else do nothing for now */ return 0; } /* * Parse meshgate action ie's for GANN frames. * Returns -1 if parsing fails, otherwise 0. */ static int mesh_parse_meshgate_action(struct ieee80211_node *ni, const struct ieee80211_frame *wh, /* XXX for VERIFY_LENGTH */ struct ieee80211_meshgann_ie *ie, const uint8_t *frm, const uint8_t *efrm) { struct ieee80211vap *vap = ni->ni_vap; const struct ieee80211_meshgann_ie *gannie; while (efrm - frm > 1) { IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return -1); switch (*frm) { case IEEE80211_ELEMID_MESHGANN: gannie = (const struct ieee80211_meshgann_ie *) frm; memset(ie, 0, sizeof(*ie)); ie->gann_ie = gannie->gann_ie; ie->gann_len = gannie->gann_len; ie->gann_flags = gannie->gann_flags; ie->gann_hopcount = gannie->gann_hopcount; ie->gann_ttl = gannie->gann_ttl; IEEE80211_ADDR_COPY(ie->gann_addr, gannie->gann_addr); ie->gann_seq = le32dec(&gannie->gann_seq); ie->gann_interval = le16dec(&gannie->gann_interval); break; } frm += frm[1] + 2; } return 0; } /* * Mesh Gate Announcement handling. */ static int mesh_recv_action_meshgate(struct ieee80211_node *ni, const struct ieee80211_frame *wh, const uint8_t *frm, const uint8_t *efrm) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_mesh_gate_route *gr, *next; struct ieee80211_mesh_route *rt_gate; struct ieee80211_meshgann_ie pgann; struct ieee80211_meshgann_ie ie; int found = 0; /* +2 for action + code */ if (mesh_parse_meshgate_action(ni, wh, &ie, frm+2, efrm) != 0) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH, ni->ni_macaddr, NULL, "%s", "GANN parsing failed"); vap->iv_stats.is_rx_mgtdiscard++; return (0); } if (IEEE80211_ADDR_EQ(vap->iv_myaddr, ie.gann_addr)) return 0; IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ni->ni_macaddr, "received GANN, meshgate: %6D (seq %u)", ie.gann_addr, ":", ie.gann_seq); if (ms == NULL) return (0); MESH_RT_LOCK(ms); TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, next) { if (!IEEE80211_ADDR_EQ(gr->gr_addr, ie.gann_addr)) continue; if (ie.gann_seq <= gr->gr_lastseq) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH, ni->ni_macaddr, NULL, "GANN old seqno %u <= %u", ie.gann_seq, gr->gr_lastseq); MESH_RT_UNLOCK(ms); return (0); } /* corresponding mesh gate found & GANN accepted */ found = 1; break; } if (found == 0) { /* this GANN is from a new mesh Gate add it to known table. */ IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ie.gann_addr, "stored new GANN information, seq %u.", ie.gann_seq); gr = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_gate_route)), M_80211_MESH_GT_RT, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); IEEE80211_ADDR_COPY(gr->gr_addr, ie.gann_addr); TAILQ_INSERT_TAIL(&ms->ms_known_gates, gr, gr_next); } gr->gr_lastseq = ie.gann_seq; /* check if we have a path to this gate */ rt_gate = mesh_rt_find_locked(ms, gr->gr_addr); if (rt_gate != NULL && rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) { gr->gr_route = rt_gate; rt_gate->rt_flags |= IEEE80211_MESHRT_FLAGS_GATE; } MESH_RT_UNLOCK(ms); /* popagate only if decremented ttl >= 1 && forwarding is enabled */ if ((ie.gann_ttl - 1) < 1 && !(ms->ms_flags & IEEE80211_MESHFLAGS_FWD)) return 0; pgann.gann_flags = ie.gann_flags; /* Reserved */ pgann.gann_hopcount = ie.gann_hopcount + 1; pgann.gann_ttl = ie.gann_ttl - 1; IEEE80211_ADDR_COPY(pgann.gann_addr, ie.gann_addr); pgann.gann_seq = ie.gann_seq; pgann.gann_interval = ie.gann_interval; IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ie.gann_addr, "%s", "propagate GANN"); ieee80211_send_action(vap->iv_bss, IEEE80211_ACTION_CAT_MESH, IEEE80211_ACTION_MESH_GANN, &pgann); return 0; } static int mesh_send_action(struct ieee80211_node *ni, const uint8_t sa[IEEE80211_ADDR_LEN], const uint8_t da[IEEE80211_ADDR_LEN], struct mbuf *m) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_bpf_params params; int ret; KASSERT(ni != NULL, ("null node")); if (vap->iv_state == IEEE80211_S_CAC) { IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, ni, "block %s frame in CAC state", "Mesh action"); vap->iv_stats.is_tx_badstate++; ieee80211_free_node(ni); m_freem(m); return EIO; /* XXX */ } M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT); if (m == NULL) { ieee80211_free_node(ni); return ENOMEM; } IEEE80211_TX_LOCK(ic); ieee80211_send_setup(ni, m, IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_ACTION, IEEE80211_NONQOS_TID, sa, da, sa); m->m_flags |= M_ENCAP; /* mark encapsulated */ memset(¶ms, 0, sizeof(params)); params.ibp_pri = WME_AC_VO; params.ibp_rate0 = ni->ni_txparms->mgmtrate; if (IEEE80211_IS_MULTICAST(da)) params.ibp_try0 = 1; else params.ibp_try0 = ni->ni_txparms->maxretry; params.ibp_power = ni->ni_txpower; IEEE80211_NODE_STAT(ni, tx_mgmt); ret = ieee80211_raw_output(vap, ni, m, ¶ms); IEEE80211_TX_UNLOCK(ic); return (ret); } #define ADDSHORT(frm, v) do { \ frm[0] = (v) & 0xff; \ frm[1] = (v) >> 8; \ frm += 2; \ } while (0) #define ADDWORD(frm, v) do { \ frm[0] = (v) & 0xff; \ frm[1] = ((v) >> 8) & 0xff; \ frm[2] = ((v) >> 16) & 0xff; \ frm[3] = ((v) >> 24) & 0xff; \ frm += 4; \ } while (0) static int mesh_send_action_meshpeering_open(struct ieee80211_node *ni, int category, int action, void *args0) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; uint16_t *args = args0; const struct ieee80211_rateset *rs; struct mbuf *m; uint8_t *frm; IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni, "send PEER OPEN action: localid 0x%x", args[0]); IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1); ieee80211_ref_node(ni); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t) /* action+category */ + sizeof(uint16_t) /* capabilites */ + 2 + IEEE80211_RATE_SIZE + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE) + 2 + IEEE80211_MESHID_LEN + sizeof(struct ieee80211_meshconf_ie) + sizeof(struct ieee80211_meshpeer_ie) ); if (m != NULL) { /* * mesh peer open action frame format: * [1] category * [1] action * [2] capabilities * [tlv] rates * [tlv] xrates * [tlv] mesh id * [tlv] mesh conf * [tlv] mesh peer link mgmt */ *frm++ = category; *frm++ = action; ADDSHORT(frm, ieee80211_getcapinfo(vap, ni->ni_chan)); rs = ieee80211_get_suprates(ic, ic->ic_curchan); frm = ieee80211_add_rates(frm, rs); frm = ieee80211_add_xrates(frm, rs); frm = ieee80211_add_meshid(frm, vap); frm = ieee80211_add_meshconf(frm, vap); frm = ieee80211_add_meshpeer(frm, IEEE80211_ACTION_MESHPEERING_OPEN, args[0], 0, 0); m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m); } else { vap->iv_stats.is_tx_nobuf++; ieee80211_free_node(ni); return ENOMEM; } } static int mesh_send_action_meshpeering_confirm(struct ieee80211_node *ni, int category, int action, void *args0) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; uint16_t *args = args0; const struct ieee80211_rateset *rs; struct mbuf *m; uint8_t *frm; IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni, "send PEER CONFIRM action: localid 0x%x, peerid 0x%x", args[0], args[1]); IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1); ieee80211_ref_node(ni); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t) /* action+category */ + sizeof(uint16_t) /* capabilites */ + sizeof(uint16_t) /* status code */ + sizeof(uint16_t) /* AID */ + 2 + IEEE80211_RATE_SIZE + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE) + 2 + IEEE80211_MESHID_LEN + sizeof(struct ieee80211_meshconf_ie) + sizeof(struct ieee80211_meshpeer_ie) ); if (m != NULL) { /* * mesh peer confirm action frame format: * [1] category * [1] action * [2] capabilities * [2] status code * [2] association id (peer ID) * [tlv] rates * [tlv] xrates * [tlv] mesh id * [tlv] mesh conf * [tlv] mesh peer link mgmt */ *frm++ = category; *frm++ = action; ADDSHORT(frm, ieee80211_getcapinfo(vap, ni->ni_chan)); ADDSHORT(frm, 0); /* status code */ ADDSHORT(frm, args[1]); /* AID */ rs = ieee80211_get_suprates(ic, ic->ic_curchan); frm = ieee80211_add_rates(frm, rs); frm = ieee80211_add_xrates(frm, rs); frm = ieee80211_add_meshid(frm, vap); frm = ieee80211_add_meshconf(frm, vap); frm = ieee80211_add_meshpeer(frm, IEEE80211_ACTION_MESHPEERING_CONFIRM, args[0], args[1], 0); m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m); } else { vap->iv_stats.is_tx_nobuf++; ieee80211_free_node(ni); return ENOMEM; } } static int mesh_send_action_meshpeering_close(struct ieee80211_node *ni, int category, int action, void *args0) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; uint16_t *args = args0; struct mbuf *m; uint8_t *frm; IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni, "send PEER CLOSE action: localid 0x%x, peerid 0x%x reason %d (%s)", args[0], args[1], args[2], ieee80211_reason_to_string(args[2])); IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1); ieee80211_ref_node(ni); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t) /* action+category */ + sizeof(uint16_t) /* reason code */ + 2 + IEEE80211_MESHID_LEN + sizeof(struct ieee80211_meshpeer_ie) ); if (m != NULL) { /* * mesh peer close action frame format: * [1] category * [1] action * [tlv] mesh id * [tlv] mesh peer link mgmt */ *frm++ = category; *frm++ = action; frm = ieee80211_add_meshid(frm, vap); frm = ieee80211_add_meshpeer(frm, IEEE80211_ACTION_MESHPEERING_CLOSE, args[0], args[1], args[2]); m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m); } else { vap->iv_stats.is_tx_nobuf++; ieee80211_free_node(ni); return ENOMEM; } } static int mesh_send_action_meshlmetric(struct ieee80211_node *ni, int category, int action, void *arg0) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_meshlmetric_ie *ie = arg0; struct mbuf *m; uint8_t *frm; if (ie->lm_flags & IEEE80211_MESH_LMETRIC_FLAGS_REQ) { IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni, "%s", "send LINK METRIC REQUEST action"); } else { IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni, "send LINK METRIC REPLY action: metric 0x%x", ie->lm_metric); } IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1); ieee80211_ref_node(ni); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t) + /* action+category */ sizeof(struct ieee80211_meshlmetric_ie) ); if (m != NULL) { /* * mesh link metric * [1] category * [1] action * [tlv] mesh link metric */ *frm++ = category; *frm++ = action; frm = ieee80211_add_meshlmetric(frm, ie->lm_flags, ie->lm_metric); m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m); } else { vap->iv_stats.is_tx_nobuf++; ieee80211_free_node(ni); return ENOMEM; } } static int mesh_send_action_meshgate(struct ieee80211_node *ni, int category, int action, void *arg0) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_meshgann_ie *ie = arg0; struct mbuf *m; uint8_t *frm; IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1); ieee80211_ref_node(ni); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t) + /* action+category */ IEEE80211_MESHGANN_BASE_SZ ); if (m != NULL) { /* * mesh link metric * [1] category * [1] action * [tlv] mesh gate annoucement */ *frm++ = category; *frm++ = action; frm = ieee80211_add_meshgate(frm, ie); m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); return mesh_send_action(ni, vap->iv_myaddr, broadcastaddr, m); } else { vap->iv_stats.is_tx_nobuf++; ieee80211_free_node(ni); return ENOMEM; } } static void mesh_peer_timeout_setup(struct ieee80211_node *ni) { switch (ni->ni_mlstate) { case IEEE80211_NODE_MESH_HOLDING: ni->ni_mltval = ieee80211_mesh_holdingtimeout; break; case IEEE80211_NODE_MESH_CONFIRMRCV: ni->ni_mltval = ieee80211_mesh_confirmtimeout; break; case IEEE80211_NODE_MESH_IDLE: ni->ni_mltval = 0; break; default: ni->ni_mltval = ieee80211_mesh_retrytimeout; break; } if (ni->ni_mltval) callout_reset(&ni->ni_mltimer, ni->ni_mltval, mesh_peer_timeout_cb, ni); } /* * Same as above but backoffs timer statisically 50%. */ static void mesh_peer_timeout_backoff(struct ieee80211_node *ni) { uint32_t r; r = arc4random(); ni->ni_mltval += r % ni->ni_mltval; callout_reset(&ni->ni_mltimer, ni->ni_mltval, mesh_peer_timeout_cb, ni); } static __inline void mesh_peer_timeout_stop(struct ieee80211_node *ni) { callout_drain(&ni->ni_mltimer); } static void mesh_peer_backoff_cb(void *arg) { struct ieee80211_node *ni = (struct ieee80211_node *)arg; /* After backoff timeout, try to peer automatically again. */ ni->ni_mlhcnt = 0; } /* * Mesh Peer Link Management FSM timeout handling. */ static void mesh_peer_timeout_cb(void *arg) { struct ieee80211_node *ni = (struct ieee80211_node *)arg; uint16_t args[3]; IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_MESH, ni, "mesh link timeout, state %d, retry counter %d", ni->ni_mlstate, ni->ni_mlrcnt); switch (ni->ni_mlstate) { case IEEE80211_NODE_MESH_IDLE: case IEEE80211_NODE_MESH_ESTABLISHED: break; case IEEE80211_NODE_MESH_OPENSNT: case IEEE80211_NODE_MESH_OPENRCV: if (ni->ni_mlrcnt == ieee80211_mesh_maxretries) { args[0] = ni->ni_mlpid; args[2] = IEEE80211_REASON_MESH_MAX_RETRIES; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); ni->ni_mlrcnt = 0; mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING); mesh_peer_timeout_setup(ni); } else { args[0] = ni->ni_mlpid; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_OPEN, args); ni->ni_mlrcnt++; mesh_peer_timeout_backoff(ni); } break; case IEEE80211_NODE_MESH_CONFIRMRCV: args[0] = ni->ni_mlpid; args[2] = IEEE80211_REASON_MESH_CONFIRM_TIMEOUT; ieee80211_send_action(ni, IEEE80211_ACTION_CAT_SELF_PROT, IEEE80211_ACTION_MESHPEERING_CLOSE, args); mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING); mesh_peer_timeout_setup(ni); break; case IEEE80211_NODE_MESH_HOLDING: ni->ni_mlhcnt++; if (ni->ni_mlhcnt >= ieee80211_mesh_maxholding) callout_reset(&ni->ni_mlhtimer, ieee80211_mesh_backofftimeout, mesh_peer_backoff_cb, ni); mesh_linkchange(ni, IEEE80211_NODE_MESH_IDLE); break; } } static int mesh_verify_meshid(struct ieee80211vap *vap, const uint8_t *ie) { struct ieee80211_mesh_state *ms = vap->iv_mesh; if (ie == NULL || ie[1] != ms->ms_idlen) return 1; return memcmp(ms->ms_id, ie + 2, ms->ms_idlen); } /* * Check if we are using the same algorithms for this mesh. */ static int mesh_verify_meshconf(struct ieee80211vap *vap, const uint8_t *ie) { const struct ieee80211_meshconf_ie *meshconf = (const struct ieee80211_meshconf_ie *) ie; const struct ieee80211_mesh_state *ms = vap->iv_mesh; if (meshconf == NULL) return 1; if (meshconf->conf_pselid != ms->ms_ppath->mpp_ie) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH, "unknown path selection algorithm: 0x%x\n", meshconf->conf_pselid); return 1; } if (meshconf->conf_pmetid != ms->ms_pmetric->mpm_ie) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH, "unknown path metric algorithm: 0x%x\n", meshconf->conf_pmetid); return 1; } if (meshconf->conf_ccid != 0) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH, "unknown congestion control algorithm: 0x%x\n", meshconf->conf_ccid); return 1; } if (meshconf->conf_syncid != IEEE80211_MESHCONF_SYNC_NEIGHOFF) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH, "unknown sync algorithm: 0x%x\n", meshconf->conf_syncid); return 1; } if (meshconf->conf_authid != 0) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH, "unknown auth auth algorithm: 0x%x\n", meshconf->conf_pselid); return 1; } /* Not accepting peers */ if (!(meshconf->conf_cap & IEEE80211_MESHCONF_CAP_AP)) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH, "not accepting peers: 0x%x\n", meshconf->conf_cap); return 1; } return 0; } static int mesh_verify_meshpeer(struct ieee80211vap *vap, uint8_t subtype, const uint8_t *ie) { const struct ieee80211_meshpeer_ie *meshpeer = (const struct ieee80211_meshpeer_ie *) ie; if (meshpeer == NULL || meshpeer->peer_len < IEEE80211_MPM_BASE_SZ || meshpeer->peer_len > IEEE80211_MPM_MAX_SZ) return 1; if (meshpeer->peer_proto != IEEE80211_MPPID_MPM) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, "Only MPM protocol is supported (proto: 0x%02X)", meshpeer->peer_proto); return 1; } switch (subtype) { case IEEE80211_ACTION_MESHPEERING_OPEN: if (meshpeer->peer_len != IEEE80211_MPM_BASE_SZ) return 1; break; case IEEE80211_ACTION_MESHPEERING_CONFIRM: if (meshpeer->peer_len != IEEE80211_MPM_BASE_SZ + 2) return 1; break; case IEEE80211_ACTION_MESHPEERING_CLOSE: if (meshpeer->peer_len < IEEE80211_MPM_BASE_SZ + 2) return 1; if (meshpeer->peer_len == (IEEE80211_MPM_BASE_SZ + 2) && meshpeer->peer_linkid != 0) return 1; if (meshpeer->peer_rcode == 0) return 1; break; } return 0; } /* * Add a Mesh ID IE to a frame. */ uint8_t * ieee80211_add_meshid(uint8_t *frm, struct ieee80211vap *vap) { struct ieee80211_mesh_state *ms = vap->iv_mesh; KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a mbss vap")); *frm++ = IEEE80211_ELEMID_MESHID; *frm++ = ms->ms_idlen; memcpy(frm, ms->ms_id, ms->ms_idlen); return frm + ms->ms_idlen; } /* * Add a Mesh Configuration IE to a frame. * For now just use HWMP routing, Airtime link metric, Null Congestion * Signaling, Null Sync Protocol and Null Authentication. */ uint8_t * ieee80211_add_meshconf(uint8_t *frm, struct ieee80211vap *vap) { const struct ieee80211_mesh_state *ms = vap->iv_mesh; uint16_t caps; KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a MBSS vap")); *frm++ = IEEE80211_ELEMID_MESHCONF; *frm++ = IEEE80211_MESH_CONF_SZ; *frm++ = ms->ms_ppath->mpp_ie; /* path selection */ *frm++ = ms->ms_pmetric->mpm_ie; /* link metric */ *frm++ = IEEE80211_MESHCONF_CC_DISABLED; *frm++ = IEEE80211_MESHCONF_SYNC_NEIGHOFF; *frm++ = IEEE80211_MESHCONF_AUTH_DISABLED; /* NB: set the number of neighbors before the rest */ *frm = (ms->ms_neighbors > IEEE80211_MESH_MAX_NEIGHBORS ? IEEE80211_MESH_MAX_NEIGHBORS : ms->ms_neighbors) << 1; if (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) *frm |= IEEE80211_MESHCONF_FORM_GATE; frm += 1; caps = 0; if (ms->ms_flags & IEEE80211_MESHFLAGS_AP) caps |= IEEE80211_MESHCONF_CAP_AP; if (ms->ms_flags & IEEE80211_MESHFLAGS_FWD) caps |= IEEE80211_MESHCONF_CAP_FWRD; *frm++ = caps; return frm; } /* * Add a Mesh Peer Management IE to a frame. */ uint8_t * ieee80211_add_meshpeer(uint8_t *frm, uint8_t subtype, uint16_t localid, uint16_t peerid, uint16_t reason) { KASSERT(localid != 0, ("localid == 0")); *frm++ = IEEE80211_ELEMID_MESHPEER; switch (subtype) { case IEEE80211_ACTION_MESHPEERING_OPEN: *frm++ = IEEE80211_MPM_BASE_SZ; /* length */ ADDSHORT(frm, IEEE80211_MPPID_MPM); /* proto */ ADDSHORT(frm, localid); /* local ID */ break; case IEEE80211_ACTION_MESHPEERING_CONFIRM: KASSERT(peerid != 0, ("sending peer confirm without peer id")); *frm++ = IEEE80211_MPM_BASE_SZ + 2; /* length */ ADDSHORT(frm, IEEE80211_MPPID_MPM); /* proto */ ADDSHORT(frm, localid); /* local ID */ ADDSHORT(frm, peerid); /* peer ID */ break; case IEEE80211_ACTION_MESHPEERING_CLOSE: if (peerid) *frm++ = IEEE80211_MPM_MAX_SZ; /* length */ else *frm++ = IEEE80211_MPM_BASE_SZ + 2; /* length */ ADDSHORT(frm, IEEE80211_MPPID_MPM); /* proto */ ADDSHORT(frm, localid); /* local ID */ if (peerid) ADDSHORT(frm, peerid); /* peer ID */ ADDSHORT(frm, reason); break; } return frm; } /* * Compute an Airtime Link Metric for the link with this node. * * Based on Draft 3.0 spec (11B.10, p.149). */ /* * Max 802.11s overhead. */ #define IEEE80211_MESH_MAXOVERHEAD \ (sizeof(struct ieee80211_qosframe_addr4) \ + sizeof(struct ieee80211_meshcntl_ae10) \ + sizeof(struct llc) \ + IEEE80211_ADDR_LEN \ + IEEE80211_WEP_IVLEN \ + IEEE80211_WEP_KIDLEN \ + IEEE80211_WEP_CRCLEN \ + IEEE80211_WEP_MICLEN \ + IEEE80211_CRC_LEN) uint32_t mesh_airtime_calc(struct ieee80211_node *ni) { #define M_BITS 8 #define S_FACTOR (2 * M_BITS) struct ieee80211com *ic = ni->ni_ic; struct ifnet *ifp = ni->ni_vap->iv_ifp; const static int nbits = 8192 << M_BITS; uint32_t overhead, rate, errrate; uint64_t res; /* Time to transmit a frame */ rate = ni->ni_txrate; overhead = ieee80211_compute_duration(ic->ic_rt, ifp->if_mtu + IEEE80211_MESH_MAXOVERHEAD, rate, 0) << M_BITS; /* Error rate in percentage */ /* XXX assuming small failures are ok */ errrate = (((ifp->if_get_counter(ifp, IFCOUNTER_OERRORS) + ifp->if_get_counter(ifp, IFCOUNTER_IERRORS)) / 100) << M_BITS) / 100; res = (overhead + (nbits / rate)) * ((1 << S_FACTOR) / ((1 << M_BITS) - errrate)); return (uint32_t)(res >> S_FACTOR); #undef M_BITS #undef S_FACTOR } /* * Add a Mesh Link Metric report IE to a frame. */ uint8_t * ieee80211_add_meshlmetric(uint8_t *frm, uint8_t flags, uint32_t metric) { *frm++ = IEEE80211_ELEMID_MESHLINK; *frm++ = 5; *frm++ = flags; ADDWORD(frm, metric); return frm; } /* * Add a Mesh Gate Announcement IE to a frame. */ uint8_t * ieee80211_add_meshgate(uint8_t *frm, struct ieee80211_meshgann_ie *ie) { *frm++ = IEEE80211_ELEMID_MESHGANN; /* ie */ *frm++ = IEEE80211_MESHGANN_BASE_SZ; /* len */ *frm++ = ie->gann_flags; *frm++ = ie->gann_hopcount; *frm++ = ie->gann_ttl; IEEE80211_ADDR_COPY(frm, ie->gann_addr); frm += 6; ADDWORD(frm, ie->gann_seq); ADDSHORT(frm, ie->gann_interval); return frm; } #undef ADDSHORT #undef ADDWORD /* * Initialize any mesh-specific node state. */ void ieee80211_mesh_node_init(struct ieee80211vap *vap, struct ieee80211_node *ni) { ni->ni_flags |= IEEE80211_NODE_QOS; callout_init(&ni->ni_mltimer, 1); callout_init(&ni->ni_mlhtimer, 1); } /* * Cleanup any mesh-specific node state. */ void ieee80211_mesh_node_cleanup(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_mesh_state *ms = vap->iv_mesh; callout_drain(&ni->ni_mltimer); callout_drain(&ni->ni_mlhtimer); /* NB: short-circuit callbacks after mesh_vdetach */ if (vap->iv_mesh != NULL) ms->ms_ppath->mpp_peerdown(ni); } void ieee80211_parse_meshid(struct ieee80211_node *ni, const uint8_t *ie) { ni->ni_meshidlen = ie[1]; memcpy(ni->ni_meshid, ie + 2, ie[1]); } /* * Setup mesh-specific node state on neighbor discovery. */ void ieee80211_mesh_init_neighbor(struct ieee80211_node *ni, const struct ieee80211_frame *wh, const struct ieee80211_scanparams *sp) { ieee80211_parse_meshid(ni, sp->meshid); } void ieee80211_mesh_update_beacon(struct ieee80211vap *vap, struct ieee80211_beacon_offsets *bo) { KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a MBSS vap")); if (isset(bo->bo_flags, IEEE80211_BEACON_MESHCONF)) { (void)ieee80211_add_meshconf(bo->bo_meshconf, vap); clrbit(bo->bo_flags, IEEE80211_BEACON_MESHCONF); } } static int mesh_ioctl_get80211(struct ieee80211vap *vap, struct ieee80211req *ireq) { struct ieee80211_mesh_state *ms = vap->iv_mesh; uint8_t tmpmeshid[IEEE80211_NWID_LEN]; struct ieee80211_mesh_route *rt; struct ieee80211req_mesh_route *imr; size_t len, off; uint8_t *p; int error; if (vap->iv_opmode != IEEE80211_M_MBSS) return ENOSYS; error = 0; switch (ireq->i_type) { case IEEE80211_IOC_MESH_ID: ireq->i_len = ms->ms_idlen; memcpy(tmpmeshid, ms->ms_id, ireq->i_len); error = copyout(tmpmeshid, ireq->i_data, ireq->i_len); break; case IEEE80211_IOC_MESH_AP: ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_AP) != 0; break; case IEEE80211_IOC_MESH_FWRD: ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_FWD) != 0; break; case IEEE80211_IOC_MESH_GATE: ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) != 0; break; case IEEE80211_IOC_MESH_TTL: ireq->i_val = ms->ms_ttl; break; case IEEE80211_IOC_MESH_RTCMD: switch (ireq->i_val) { case IEEE80211_MESH_RTCMD_LIST: len = 0; MESH_RT_LOCK(ms); TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) { len += sizeof(*imr); } MESH_RT_UNLOCK(ms); if (len > ireq->i_len || ireq->i_len < sizeof(*imr)) { ireq->i_len = len; return ENOMEM; } ireq->i_len = len; /* XXX M_WAIT? */ p = IEEE80211_MALLOC(len, M_TEMP, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO); if (p == NULL) return ENOMEM; off = 0; MESH_RT_LOCK(ms); TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) { if (off >= len) break; imr = (struct ieee80211req_mesh_route *) (p + off); IEEE80211_ADDR_COPY(imr->imr_dest, rt->rt_dest); IEEE80211_ADDR_COPY(imr->imr_nexthop, rt->rt_nexthop); imr->imr_metric = rt->rt_metric; imr->imr_nhops = rt->rt_nhops; imr->imr_lifetime = ieee80211_mesh_rt_update(rt, 0); imr->imr_lastmseq = rt->rt_lastmseq; imr->imr_flags = rt->rt_flags; /* last */ off += sizeof(*imr); } MESH_RT_UNLOCK(ms); error = copyout(p, (uint8_t *)ireq->i_data, ireq->i_len); IEEE80211_FREE(p, M_TEMP); break; case IEEE80211_MESH_RTCMD_FLUSH: case IEEE80211_MESH_RTCMD_ADD: case IEEE80211_MESH_RTCMD_DELETE: return EINVAL; default: return ENOSYS; } break; case IEEE80211_IOC_MESH_PR_METRIC: len = strlen(ms->ms_pmetric->mpm_descr); if (ireq->i_len < len) return EINVAL; ireq->i_len = len; error = copyout(ms->ms_pmetric->mpm_descr, (uint8_t *)ireq->i_data, len); break; case IEEE80211_IOC_MESH_PR_PATH: len = strlen(ms->ms_ppath->mpp_descr); if (ireq->i_len < len) return EINVAL; ireq->i_len = len; error = copyout(ms->ms_ppath->mpp_descr, (uint8_t *)ireq->i_data, len); break; default: return ENOSYS; } return error; } IEEE80211_IOCTL_GET(mesh, mesh_ioctl_get80211); static int mesh_ioctl_set80211(struct ieee80211vap *vap, struct ieee80211req *ireq) { struct ieee80211_mesh_state *ms = vap->iv_mesh; uint8_t tmpmeshid[IEEE80211_NWID_LEN]; uint8_t tmpaddr[IEEE80211_ADDR_LEN]; char tmpproto[IEEE80211_MESH_PROTO_DSZ]; int error; if (vap->iv_opmode != IEEE80211_M_MBSS) return ENOSYS; error = 0; switch (ireq->i_type) { case IEEE80211_IOC_MESH_ID: if (ireq->i_val != 0 || ireq->i_len > IEEE80211_MESHID_LEN) return EINVAL; error = copyin(ireq->i_data, tmpmeshid, ireq->i_len); if (error != 0) break; memset(ms->ms_id, 0, IEEE80211_NWID_LEN); ms->ms_idlen = ireq->i_len; memcpy(ms->ms_id, tmpmeshid, ireq->i_len); error = ENETRESET; break; case IEEE80211_IOC_MESH_AP: if (ireq->i_val) ms->ms_flags |= IEEE80211_MESHFLAGS_AP; else ms->ms_flags &= ~IEEE80211_MESHFLAGS_AP; error = ENETRESET; break; case IEEE80211_IOC_MESH_FWRD: if (ireq->i_val) ms->ms_flags |= IEEE80211_MESHFLAGS_FWD; else ms->ms_flags &= ~IEEE80211_MESHFLAGS_FWD; mesh_gatemode_setup(vap); break; case IEEE80211_IOC_MESH_GATE: if (ireq->i_val) ms->ms_flags |= IEEE80211_MESHFLAGS_GATE; else ms->ms_flags &= ~IEEE80211_MESHFLAGS_GATE; break; case IEEE80211_IOC_MESH_TTL: ms->ms_ttl = (uint8_t) ireq->i_val; break; case IEEE80211_IOC_MESH_RTCMD: switch (ireq->i_val) { case IEEE80211_MESH_RTCMD_LIST: return EINVAL; case IEEE80211_MESH_RTCMD_FLUSH: ieee80211_mesh_rt_flush(vap); break; case IEEE80211_MESH_RTCMD_ADD: if (IEEE80211_ADDR_EQ(vap->iv_myaddr, ireq->i_data) || IEEE80211_ADDR_EQ(broadcastaddr, ireq->i_data)) return EINVAL; error = copyin(ireq->i_data, &tmpaddr, IEEE80211_ADDR_LEN); if (error == 0) ieee80211_mesh_discover(vap, tmpaddr, NULL); break; case IEEE80211_MESH_RTCMD_DELETE: ieee80211_mesh_rt_del(vap, ireq->i_data); break; default: return ENOSYS; } break; case IEEE80211_IOC_MESH_PR_METRIC: error = copyin(ireq->i_data, tmpproto, sizeof(tmpproto)); if (error == 0) { error = mesh_select_proto_metric(vap, tmpproto); if (error == 0) error = ENETRESET; } break; case IEEE80211_IOC_MESH_PR_PATH: error = copyin(ireq->i_data, tmpproto, sizeof(tmpproto)); if (error == 0) { error = mesh_select_proto_path(vap, tmpproto); if (error == 0) error = ENETRESET; } break; default: return ENOSYS; } return error; } IEEE80211_IOCTL_SET(mesh, mesh_ioctl_set80211); Index: head/sys/net80211/ieee80211_output.c =================================================================== --- head/sys/net80211/ieee80211_output.c (revision 348253) +++ head/sys/net80211/ieee80211_output.c (revision 348254) @@ -1,4012 +1,4014 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2001 Atsushi Onoe * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_wlan.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IEEE80211_SUPPORT_SUPERG #include #endif #ifdef IEEE80211_SUPPORT_TDMA #include #endif #include #include #include #if defined(INET) || defined(INET6) #include #endif #ifdef INET #include #include #include #endif #ifdef INET6 #include #endif #include #define ETHER_HEADER_COPY(dst, src) \ memcpy(dst, src, sizeof(struct ether_header)) static int ieee80211_fragment(struct ieee80211vap *, struct mbuf *, u_int hdrsize, u_int ciphdrsize, u_int mtu); static void ieee80211_tx_mgt_cb(struct ieee80211_node *, void *, int); #ifdef IEEE80211_DEBUG /* * Decide if an outbound management frame should be * printed when debugging is enabled. This filters some * of the less interesting frames that come frequently * (e.g. beacons). */ static __inline int doprint(struct ieee80211vap *vap, int subtype) { switch (subtype) { case IEEE80211_FC0_SUBTYPE_PROBE_RESP: return (vap->iv_opmode == IEEE80211_M_IBSS); } return 1; } #endif /* * Transmit a frame to the given destination on the given VAP. * * It's up to the caller to figure out the details of who this * is going to and resolving the node. * * This routine takes care of queuing it for power save, * A-MPDU state stuff, fast-frames state stuff, encapsulation * if required, then passing it up to the driver layer. * * This routine (for now) consumes the mbuf and frees the node * reference; it ideally will return a TX status which reflects * whether the mbuf was consumed or not, so the caller can * free the mbuf (if appropriate) and the node reference (again, * if appropriate.) */ int ieee80211_vap_pkt_send_dest(struct ieee80211vap *vap, struct mbuf *m, struct ieee80211_node *ni) { struct ieee80211com *ic = vap->iv_ic; struct ifnet *ifp = vap->iv_ifp; int mcast; if ((ni->ni_flags & IEEE80211_NODE_PWR_MGT) && (m->m_flags & M_PWR_SAV) == 0) { /* * Station in power save mode; pass the frame * to the 802.11 layer and continue. We'll get * the frame back when the time is right. * XXX lose WDS vap linkage? */ if (ieee80211_pwrsave(ni, m) != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); ieee80211_free_node(ni); /* * We queued it fine, so tell the upper layer * that we consumed it. */ return (0); } /* calculate priority so drivers can find the tx queue */ if (ieee80211_classify(ni, m)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_OUTPUT, ni->ni_macaddr, NULL, "%s", "classification failure"); vap->iv_stats.is_tx_classify++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); ieee80211_free_node(ni); /* XXX better status? */ return (0); } /* * Stash the node pointer. Note that we do this after * any call to ieee80211_dwds_mcast because that code * uses any existing value for rcvif to identify the * interface it (might have been) received on. */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (void *)ni; mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1: 0; BPF_MTAP(ifp, m); /* 802.3 tx */ /* * Check if A-MPDU tx aggregation is setup or if we * should try to enable it. The sta must be associated * with HT and A-MPDU enabled for use. When the policy * routine decides we should enable A-MPDU we issue an * ADDBA request and wait for a reply. The frame being * encapsulated will go out w/o using A-MPDU, or possibly * it might be collected by the driver and held/retransmit. * The default ic_ampdu_enable routine handles staggering * ADDBA requests in case the receiver NAK's us or we are * otherwise unable to establish a BA stream. * * Don't treat group-addressed frames as candidates for aggregation; * net80211 doesn't support 802.11aa-2012 and so group addressed * frames will always have sequence numbers allocated from the NON_QOS * TID. */ if ((ni->ni_flags & IEEE80211_NODE_AMPDU_TX) && (vap->iv_flags_ht & IEEE80211_FHT_AMPDU_TX)) { if ((m->m_flags & M_EAPOL) == 0 && (! mcast)) { int tid = WME_AC_TO_TID(M_WME_GETAC(m)); struct ieee80211_tx_ampdu *tap = &ni->ni_tx_ampdu[tid]; ieee80211_txampdu_count_packet(tap); if (IEEE80211_AMPDU_RUNNING(tap)) { /* * Operational, mark frame for aggregation. * * XXX do tx aggregation here */ m->m_flags |= M_AMPDU_MPDU; } else if (!IEEE80211_AMPDU_REQUESTED(tap) && ic->ic_ampdu_enable(ni, tap)) { /* * Not negotiated yet, request service. */ ieee80211_ampdu_request(ni, tap); /* XXX hold frame for reply? */ } } } #ifdef IEEE80211_SUPPORT_SUPERG /* * Check for AMSDU/FF; queue for aggregation * * Note: we don't bother trying to do fast frames or * A-MSDU encapsulation for 802.3 drivers. Now, we * likely could do it for FF (because it's a magic * atheros tunnel LLC type) but I don't think we're going * to really need to. For A-MSDU we'd have to set the * A-MSDU QoS bit in the wifi header, so we just plain * can't do it. * * Strictly speaking, we could actually /do/ A-MSDU / FF * with A-MPDU together which for certain circumstances * is beneficial (eg A-MSDU of TCK ACKs.) However, * I'll ignore that for now so existing behaviour is maintained. * Later on it would be good to make "amsdu + ampdu" configurable. */ else if (__predict_true((vap->iv_caps & IEEE80211_C_8023ENCAP) == 0)) { if ((! mcast) && ieee80211_amsdu_tx_ok(ni)) { m = ieee80211_amsdu_check(ni, m); if (m == NULL) { /* NB: any ni ref held on stageq */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG, "%s: amsdu_check queued frame\n", __func__); return (0); } } else if ((! mcast) && IEEE80211_ATH_CAP(vap, ni, IEEE80211_NODE_FF)) { m = ieee80211_ff_check(ni, m); if (m == NULL) { /* NB: any ni ref held on stageq */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG, "%s: ff_check queued frame\n", __func__); return (0); } } } #endif /* IEEE80211_SUPPORT_SUPERG */ /* * Grab the TX lock - serialise the TX process from this * point (where TX state is being checked/modified) * through to driver queue. */ IEEE80211_TX_LOCK(ic); /* * XXX make the encap and transmit code a separate function * so things like the FF (and later A-MSDU) path can just call * it for flushed frames. */ if (__predict_true((vap->iv_caps & IEEE80211_C_8023ENCAP) == 0)) { /* * Encapsulate the packet in prep for transmission. */ m = ieee80211_encap(vap, ni, m); if (m == NULL) { /* NB: stat+msg handled in ieee80211_encap */ IEEE80211_TX_UNLOCK(ic); ieee80211_free_node(ni); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } } (void) ieee80211_parent_xmitpkt(ic, m); /* * Unlock at this point - no need to hold it across * ieee80211_free_node() (ie, the comlock) */ IEEE80211_TX_UNLOCK(ic); ic->ic_lastdata = ticks; return (0); } /* * Send the given mbuf through the given vap. * * This consumes the mbuf regardless of whether the transmit * was successful or not. * * This does none of the initial checks that ieee80211_start() * does (eg CAC timeout, interface wakeup) - the caller must * do this first. */ static int ieee80211_start_pkt(struct ieee80211vap *vap, struct mbuf *m) { #define IS_DWDS(vap) \ (vap->iv_opmode == IEEE80211_M_WDS && \ (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0) struct ieee80211com *ic = vap->iv_ic; struct ifnet *ifp = vap->iv_ifp; struct ieee80211_node *ni; struct ether_header *eh; /* * Cancel any background scan. */ if (ic->ic_flags & IEEE80211_F_SCAN) ieee80211_cancel_anyscan(vap); /* * Find the node for the destination so we can do * things like power save and fast frames aggregation. * * NB: past this point various code assumes the first * mbuf has the 802.3 header present (and contiguous). */ ni = NULL; if (m->m_len < sizeof(struct ether_header) && (m = m_pullup(m, sizeof(struct ether_header))) == NULL) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT, "discard frame, %s\n", "m_pullup failed"); vap->iv_stats.is_tx_nobuf++; /* XXX */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } eh = mtod(m, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) { if (IS_DWDS(vap)) { /* * Only unicast frames from the above go out * DWDS vaps; multicast frames are handled by * dispatching the frame as it comes through * the AP vap (see below). */ IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_WDS, eh->ether_dhost, "mcast", "%s", "on DWDS"); vap->iv_stats.is_dwds_mcast++; m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* XXX better status? */ return (ENOBUFS); } if (vap->iv_opmode == IEEE80211_M_HOSTAP) { /* * Spam DWDS vap's w/ multicast traffic. */ /* XXX only if dwds in use? */ ieee80211_dwds_mcast(vap, m); } } #ifdef IEEE80211_SUPPORT_MESH if (vap->iv_opmode != IEEE80211_M_MBSS) { #endif ni = ieee80211_find_txnode(vap, eh->ether_dhost); if (ni == NULL) { /* NB: ieee80211_find_txnode does stat+msg */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); /* XXX better status? */ return (ENOBUFS); } if (ni->ni_associd == 0 && (ni->ni_flags & IEEE80211_NODE_ASSOCID)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_OUTPUT, eh->ether_dhost, NULL, "sta not associated (type 0x%04x)", htons(eh->ether_type)); vap->iv_stats.is_tx_notassoc++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); ieee80211_free_node(ni); /* XXX better status? */ return (ENOBUFS); } #ifdef IEEE80211_SUPPORT_MESH } else { if (!IEEE80211_ADDR_EQ(eh->ether_shost, vap->iv_myaddr)) { /* * Proxy station only if configured. */ if (!ieee80211_mesh_isproxyena(vap)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_MESH, eh->ether_dhost, NULL, "%s", "proxy not enabled"); vap->iv_stats.is_mesh_notproxy++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(m); /* XXX better status? */ return (ENOBUFS); } IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT, "forward frame from DS SA(%6D), DA(%6D)\n", eh->ether_shost, ":", eh->ether_dhost, ":"); ieee80211_mesh_proxy_check(vap, eh->ether_shost); } ni = ieee80211_mesh_discover(vap, eh->ether_dhost, m); if (ni == NULL) { /* * NB: ieee80211_mesh_discover holds/disposes * frame (e.g. queueing on path discovery). */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* XXX better status? */ return (ENOBUFS); } } #endif /* * We've resolved the sender, so attempt to transmit it. */ if (vap->iv_state == IEEE80211_S_SLEEP) { /* * In power save; queue frame and then wakeup device * for transmit. */ ic->ic_lastdata = ticks; if (ieee80211_pwrsave(ni, m) != 0) if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); ieee80211_free_node(ni); ieee80211_new_state(vap, IEEE80211_S_RUN, 0); return (0); } if (ieee80211_vap_pkt_send_dest(vap, m, ni) != 0) return (ENOBUFS); return (0); #undef IS_DWDS } /* * Start method for vap's. All packets from the stack come * through here. We handle common processing of the packets * before dispatching them to the underlying device. * * if_transmit() requires that the mbuf be consumed by this call * regardless of the return condition. */ int ieee80211_vap_transmit(struct ifnet *ifp, struct mbuf *m) { struct ieee80211vap *vap = ifp->if_softc; struct ieee80211com *ic = vap->iv_ic; /* * No data frames go out unless we're running. * Note in particular this covers CAC and CSA * states (though maybe we should check muting * for CSA). */ if (vap->iv_state != IEEE80211_S_RUN && vap->iv_state != IEEE80211_S_SLEEP) { IEEE80211_LOCK(ic); /* re-check under the com lock to avoid races */ if (vap->iv_state != IEEE80211_S_RUN && vap->iv_state != IEEE80211_S_SLEEP) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT, "%s: ignore queue, in %s state\n", __func__, ieee80211_state_name[vap->iv_state]); vap->iv_stats.is_tx_badstate++; IEEE80211_UNLOCK(ic); ifp->if_drv_flags |= IFF_DRV_OACTIVE; m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENETDOWN); } IEEE80211_UNLOCK(ic); } /* * Sanitize mbuf flags for net80211 use. We cannot * clear M_PWR_SAV or M_MORE_DATA because these may * be set for frames that are re-submitted from the * power save queue. * * NB: This must be done before ieee80211_classify as * it marks EAPOL in frames with M_EAPOL. */ m->m_flags &= ~(M_80211_TX - M_PWR_SAV - M_MORE_DATA); /* * Bump to the packet transmission path. * The mbuf will be consumed here. */ return (ieee80211_start_pkt(vap, m)); } void ieee80211_vap_qflush(struct ifnet *ifp) { /* Empty for now */ } /* * 802.11 raw output routine. * * XXX TODO: this (and other send routines) should correctly * XXX keep the pwr mgmt bit set if it decides to call into the * XXX driver to send a frame whilst the state is SLEEP. * * Otherwise the peer may decide that we're awake and flood us * with traffic we are still too asleep to receive! */ int ieee80211_raw_output(struct ieee80211vap *vap, struct ieee80211_node *ni, struct mbuf *m, const struct ieee80211_bpf_params *params) { struct ieee80211com *ic = vap->iv_ic; int error; /* * Set node - the caller has taken a reference, so ensure * that the mbuf has the same node value that * it would if it were going via the normal path. */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (void *)ni; /* * Attempt to add bpf transmit parameters. * * For now it's ok to fail; the raw_xmit api still takes * them as an option. * * Later on when ic_raw_xmit() has params removed, * they'll have to be added - so fail the transmit if * they can't be. */ if (params) (void) ieee80211_add_xmit_params(m, params); error = ic->ic_raw_xmit(ni, m, params); if (error) { if_inc_counter(vap->iv_ifp, IFCOUNTER_OERRORS, 1); ieee80211_free_node(ni); } return (error); } static int ieee80211_validate_frame(struct mbuf *m, const struct ieee80211_bpf_params *params) { struct ieee80211_frame *wh; int type; if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_ack)) return (EINVAL); wh = mtod(m, struct ieee80211_frame *); if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) != IEEE80211_FC0_VERSION_0) return (EINVAL); type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK; if (type != IEEE80211_FC0_TYPE_DATA) { if ((wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) != IEEE80211_FC1_DIR_NODS) return (EINVAL); if (type != IEEE80211_FC0_TYPE_MGT && (wh->i_fc[1] & IEEE80211_FC1_MORE_FRAG) != 0) return (EINVAL); /* XXX skip other field checks? */ } if ((params && (params->ibp_flags & IEEE80211_BPF_CRYPTO) != 0) || (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) != 0) { int subtype; subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK; /* * See IEEE Std 802.11-2012, * 8.2.4.1.9 'Protected Frame field' */ /* XXX no support for robust management frames yet. */ if (!(type == IEEE80211_FC0_TYPE_DATA || (type == IEEE80211_FC0_TYPE_MGT && subtype == IEEE80211_FC0_SUBTYPE_AUTH))) return (EINVAL); wh->i_fc[1] |= IEEE80211_FC1_PROTECTED; } if (m->m_pkthdr.len < ieee80211_anyhdrsize(wh)) return (EINVAL); return (0); } static int ieee80211_validate_rate(struct ieee80211_node *ni, uint8_t rate) { struct ieee80211com *ic = ni->ni_ic; if (IEEE80211_IS_HT_RATE(rate)) { if ((ic->ic_htcaps & IEEE80211_HTC_HT) == 0) return (EINVAL); rate = IEEE80211_RV(rate); if (rate <= 31) { if (rate > ic->ic_txstream * 8 - 1) return (EINVAL); return (0); } if (rate == 32) { if ((ic->ic_htcaps & IEEE80211_HTC_TXMCS32) == 0) return (EINVAL); return (0); } if ((ic->ic_htcaps & IEEE80211_HTC_TXUNEQUAL) == 0) return (EINVAL); switch (ic->ic_txstream) { case 0: case 1: return (EINVAL); case 2: if (rate > 38) return (EINVAL); return (0); case 3: if (rate > 52) return (EINVAL); return (0); case 4: default: if (rate > 76) return (EINVAL); return (0); } } if (!ieee80211_isratevalid(ic->ic_rt, rate)) return (EINVAL); return (0); } static int ieee80211_sanitize_rates(struct ieee80211_node *ni, struct mbuf *m, const struct ieee80211_bpf_params *params) { int error; if (!params) return (0); /* nothing to do */ /* NB: most drivers assume that ibp_rate0 is set (!= 0). */ if (params->ibp_rate0 != 0) { error = ieee80211_validate_rate(ni, params->ibp_rate0); if (error != 0) return (error); } else { /* XXX pre-setup some default (e.g., mgmt / mcast) rate */ /* XXX __DECONST? */ (void) m; } if (params->ibp_rate1 != 0 && (error = ieee80211_validate_rate(ni, params->ibp_rate1)) != 0) return (error); if (params->ibp_rate2 != 0 && (error = ieee80211_validate_rate(ni, params->ibp_rate2)) != 0) return (error); if (params->ibp_rate3 != 0 && (error = ieee80211_validate_rate(ni, params->ibp_rate3)) != 0) return (error); return (0); } /* * 802.11 output routine. This is (currently) used only to * connect bpf write calls to the 802.11 layer for injecting * raw 802.11 frames. */ int ieee80211_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro) { #define senderr(e) do { error = (e); goto bad;} while (0) const struct ieee80211_bpf_params *params = NULL; struct ieee80211_node *ni = NULL; struct ieee80211vap *vap; struct ieee80211_frame *wh; struct ieee80211com *ic = NULL; int error; int ret; if (ifp->if_drv_flags & IFF_DRV_OACTIVE) { /* * Short-circuit requests if the vap is marked OACTIVE * as this can happen because a packet came down through * ieee80211_start before the vap entered RUN state in * which case it's ok to just drop the frame. This * should not be necessary but callers of if_output don't * check OACTIVE. */ senderr(ENETDOWN); } vap = ifp->if_softc; ic = vap->iv_ic; /* * Hand to the 802.3 code if not tagged as * a raw 802.11 frame. */ if (dst->sa_family != AF_IEEE80211) return vap->iv_output(ifp, m, dst, ro); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m); if (error) senderr(error); #endif if (ifp->if_flags & IFF_MONITOR) senderr(ENETDOWN); if (!IFNET_IS_UP_RUNNING(ifp)) senderr(ENETDOWN); if (vap->iv_state == IEEE80211_S_CAC) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH, "block %s frame in CAC state\n", "raw data"); vap->iv_stats.is_tx_badstate++; senderr(EIO); /* XXX */ } else if (vap->iv_state == IEEE80211_S_SCAN) senderr(EIO); /* XXX bypass bridge, pfil, carp, etc. */ /* * NB: DLT_IEEE802_11_RADIO identifies the parameters are * present by setting the sa_len field of the sockaddr (yes, * this is a hack). * NB: we assume sa_data is suitably aligned to cast. */ if (dst->sa_len != 0) params = (const struct ieee80211_bpf_params *)dst->sa_data; error = ieee80211_validate_frame(m, params); if (error != 0) senderr(error); wh = mtod(m, struct ieee80211_frame *); /* locate destination node */ switch (wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) { case IEEE80211_FC1_DIR_NODS: case IEEE80211_FC1_DIR_FROMDS: ni = ieee80211_find_txnode(vap, wh->i_addr1); break; case IEEE80211_FC1_DIR_TODS: case IEEE80211_FC1_DIR_DSTODS: ni = ieee80211_find_txnode(vap, wh->i_addr3); break; default: senderr(EDOOFUS); } if (ni == NULL) { /* * Permit packets w/ bpf params through regardless * (see below about sa_len). */ if (dst->sa_len == 0) senderr(EHOSTUNREACH); ni = ieee80211_ref_node(vap->iv_bss); } /* * Sanitize mbuf for net80211 flags leaked from above. * * NB: This must be done before ieee80211_classify as * it marks EAPOL in frames with M_EAPOL. */ m->m_flags &= ~M_80211_TX; m->m_flags |= M_ENCAP; /* mark encapsulated */ if (IEEE80211_IS_DATA(wh)) { /* calculate priority so drivers can find the tx queue */ if (ieee80211_classify(ni, m)) senderr(EIO); /* XXX */ /* NB: ieee80211_encap does not include 802.11 header */ IEEE80211_NODE_STAT_ADD(ni, tx_bytes, m->m_pkthdr.len - ieee80211_hdrsize(wh)); } else M_WME_SETAC(m, WME_AC_BE); error = ieee80211_sanitize_rates(ni, m, params); if (error != 0) senderr(error); IEEE80211_NODE_STAT(ni, tx_data); if (IEEE80211_IS_MULTICAST(wh->i_addr1)) { IEEE80211_NODE_STAT(ni, tx_mcast); m->m_flags |= M_MCAST; } else IEEE80211_NODE_STAT(ni, tx_ucast); IEEE80211_TX_LOCK(ic); ret = ieee80211_raw_output(vap, ni, m, params); IEEE80211_TX_UNLOCK(ic); return (ret); bad: if (m != NULL) m_freem(m); if (ni != NULL) ieee80211_free_node(ni); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return error; #undef senderr } /* * Set the direction field and address fields of an outgoing * frame. Note this should be called early on in constructing * a frame as it sets i_fc[1]; other bits can then be or'd in. */ void ieee80211_send_setup( struct ieee80211_node *ni, struct mbuf *m, int type, int tid, const uint8_t sa[IEEE80211_ADDR_LEN], const uint8_t da[IEEE80211_ADDR_LEN], const uint8_t bssid[IEEE80211_ADDR_LEN]) { #define WH4(wh) ((struct ieee80211_frame_addr4 *)wh) struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_tx_ampdu *tap; struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *); ieee80211_seq seqno; IEEE80211_TX_LOCK_ASSERT(ni->ni_ic); wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | type; if ((type & IEEE80211_FC0_TYPE_MASK) == IEEE80211_FC0_TYPE_DATA) { switch (vap->iv_opmode) { case IEEE80211_M_STA: wh->i_fc[1] = IEEE80211_FC1_DIR_TODS; IEEE80211_ADDR_COPY(wh->i_addr1, bssid); IEEE80211_ADDR_COPY(wh->i_addr2, sa); IEEE80211_ADDR_COPY(wh->i_addr3, da); break; case IEEE80211_M_IBSS: case IEEE80211_M_AHDEMO: wh->i_fc[1] = IEEE80211_FC1_DIR_NODS; IEEE80211_ADDR_COPY(wh->i_addr1, da); IEEE80211_ADDR_COPY(wh->i_addr2, sa); IEEE80211_ADDR_COPY(wh->i_addr3, bssid); break; case IEEE80211_M_HOSTAP: wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS; IEEE80211_ADDR_COPY(wh->i_addr1, da); IEEE80211_ADDR_COPY(wh->i_addr2, bssid); IEEE80211_ADDR_COPY(wh->i_addr3, sa); break; case IEEE80211_M_WDS: wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS; IEEE80211_ADDR_COPY(wh->i_addr1, da); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); IEEE80211_ADDR_COPY(wh->i_addr3, da); IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, sa); break; case IEEE80211_M_MBSS: #ifdef IEEE80211_SUPPORT_MESH if (IEEE80211_IS_MULTICAST(da)) { wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS; /* XXX next hop */ IEEE80211_ADDR_COPY(wh->i_addr1, da); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); } else { wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS; IEEE80211_ADDR_COPY(wh->i_addr1, da); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); IEEE80211_ADDR_COPY(wh->i_addr3, da); IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, sa); } #endif break; case IEEE80211_M_MONITOR: /* NB: to quiet compiler */ break; } } else { wh->i_fc[1] = IEEE80211_FC1_DIR_NODS; IEEE80211_ADDR_COPY(wh->i_addr1, da); IEEE80211_ADDR_COPY(wh->i_addr2, sa); #ifdef IEEE80211_SUPPORT_MESH if (vap->iv_opmode == IEEE80211_M_MBSS) IEEE80211_ADDR_COPY(wh->i_addr3, sa); else #endif IEEE80211_ADDR_COPY(wh->i_addr3, bssid); } *(uint16_t *)&wh->i_dur[0] = 0; /* * XXX TODO: this is what the TX lock is for. * Here we're incrementing sequence numbers, and they * need to be in lock-step with what the driver is doing * both in TX ordering and crypto encap (IV increment.) * * If the driver does seqno itself, then we can skip * assigning sequence numbers here, and we can avoid * requiring the TX lock. */ tap = &ni->ni_tx_ampdu[tid]; if (tid != IEEE80211_NONQOS_TID && IEEE80211_AMPDU_RUNNING(tap)) { m->m_flags |= M_AMPDU_MPDU; /* NB: zero out i_seq field (for s/w encryption etc) */ *(uint16_t *)&wh->i_seq[0] = 0; } else { if (IEEE80211_HAS_SEQ(type & IEEE80211_FC0_TYPE_MASK, type & IEEE80211_FC0_SUBTYPE_MASK)) /* * 802.11-2012 9.3.2.10 - QoS multicast frames * come out of a different seqno space. */ if (IEEE80211_IS_MULTICAST(wh->i_addr1)) { seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++; } else { seqno = ni->ni_txseqs[tid]++; } else seqno = 0; *(uint16_t *)&wh->i_seq[0] = htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT); M_SEQNO_SET(m, seqno); } if (IEEE80211_IS_MULTICAST(wh->i_addr1)) m->m_flags |= M_MCAST; #undef WH4 } /* * Send a management frame to the specified node. The node pointer * must have a reference as the pointer will be passed to the driver * and potentially held for a long time. If the frame is successfully * dispatched to the driver, then it is responsible for freeing the * reference (and potentially free'ing up any associated storage); * otherwise deal with reclaiming any reference (on error). */ int ieee80211_mgmt_output(struct ieee80211_node *ni, struct mbuf *m, int type, struct ieee80211_bpf_params *params) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_frame *wh; int ret; KASSERT(ni != NULL, ("null node")); if (vap->iv_state == IEEE80211_S_CAC) { IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH, ni, "block %s frame in CAC state", ieee80211_mgt_subtype_name(type)); vap->iv_stats.is_tx_badstate++; ieee80211_free_node(ni); m_freem(m); return EIO; /* XXX */ } M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT); if (m == NULL) { ieee80211_free_node(ni); return ENOMEM; } IEEE80211_TX_LOCK(ic); wh = mtod(m, struct ieee80211_frame *); ieee80211_send_setup(ni, m, IEEE80211_FC0_TYPE_MGT | type, IEEE80211_NONQOS_TID, vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid); if (params->ibp_flags & IEEE80211_BPF_CRYPTO) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_AUTH, wh->i_addr1, "encrypting frame (%s)", __func__); wh->i_fc[1] |= IEEE80211_FC1_PROTECTED; } m->m_flags |= M_ENCAP; /* mark encapsulated */ KASSERT(type != IEEE80211_FC0_SUBTYPE_PROBE_RESP, ("probe response?")); M_WME_SETAC(m, params->ibp_pri); #ifdef IEEE80211_DEBUG /* avoid printing too many frames */ if ((ieee80211_msg_debug(vap) && doprint(vap, type)) || ieee80211_msg_dumppkts(vap)) { printf("[%s] send %s on channel %u\n", ether_sprintf(wh->i_addr1), ieee80211_mgt_subtype_name(type), ieee80211_chan2ieee(ic, ic->ic_curchan)); } #endif IEEE80211_NODE_STAT(ni, tx_mgmt); ret = ieee80211_raw_output(vap, ni, m, params); IEEE80211_TX_UNLOCK(ic); return (ret); } static void ieee80211_nulldata_transmitted(struct ieee80211_node *ni, void *arg, int status) { struct ieee80211vap *vap = ni->ni_vap; wakeup(vap); } /* * Send a null data frame to the specified node. If the station * is setup for QoS then a QoS Null Data frame is constructed. * If this is a WDS station then a 4-address frame is constructed. * * NB: the caller is assumed to have setup a node reference * for use; this is necessary to deal with a race condition * when probing for inactive stations. Like ieee80211_mgmt_output * we must cleanup any node reference on error; however we * can safely just unref it as we know it will never be the * last reference to the node. */ int ieee80211_send_nulldata(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct mbuf *m; struct ieee80211_frame *wh; int hdrlen; uint8_t *frm; int ret; if (vap->iv_state == IEEE80211_S_CAC) { IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH, ni, "block %s frame in CAC state", "null data"); ieee80211_unref_node(&ni); vap->iv_stats.is_tx_badstate++; return EIO; /* XXX */ } if (ni->ni_flags & (IEEE80211_NODE_QOS|IEEE80211_NODE_HT)) hdrlen = sizeof(struct ieee80211_qosframe); else hdrlen = sizeof(struct ieee80211_frame); /* NB: only WDS vap's get 4-address frames */ if (vap->iv_opmode == IEEE80211_M_WDS) hdrlen += IEEE80211_ADDR_LEN; if (ic->ic_flags & IEEE80211_F_DATAPAD) hdrlen = roundup(hdrlen, sizeof(uint32_t)); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + hdrlen, 0); if (m == NULL) { /* XXX debug msg */ ieee80211_unref_node(&ni); vap->iv_stats.is_tx_nobuf++; return ENOMEM; } KASSERT(M_LEADINGSPACE(m) >= hdrlen, ("leading space %zd", M_LEADINGSPACE(m))); M_PREPEND(m, hdrlen, M_NOWAIT); if (m == NULL) { /* NB: cannot happen */ ieee80211_free_node(ni); return ENOMEM; } IEEE80211_TX_LOCK(ic); wh = mtod(m, struct ieee80211_frame *); /* NB: a little lie */ if (ni->ni_flags & IEEE80211_NODE_QOS) { const int tid = WME_AC_TO_TID(WME_AC_BE); uint8_t *qos; ieee80211_send_setup(ni, m, IEEE80211_FC0_TYPE_DATA | IEEE80211_FC0_SUBTYPE_QOS_NULL, tid, vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid); if (vap->iv_opmode == IEEE80211_M_WDS) qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos; else qos = ((struct ieee80211_qosframe *) wh)->i_qos; qos[0] = tid & IEEE80211_QOS_TID; if (ic->ic_wme.wme_wmeChanParams.cap_wmeParams[WME_AC_BE].wmep_noackPolicy) qos[0] |= IEEE80211_QOS_ACKPOLICY_NOACK; qos[1] = 0; } else { ieee80211_send_setup(ni, m, IEEE80211_FC0_TYPE_DATA | IEEE80211_FC0_SUBTYPE_NODATA, IEEE80211_NONQOS_TID, vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid); } if (vap->iv_opmode != IEEE80211_M_WDS) { /* NB: power management bit is never sent by an AP */ if ((ni->ni_flags & IEEE80211_NODE_PWR_MGT) && vap->iv_opmode != IEEE80211_M_HOSTAP) wh->i_fc[1] |= IEEE80211_FC1_PWR_MGT; } if ((ic->ic_flags & IEEE80211_F_SCAN) && (ni->ni_flags & IEEE80211_NODE_PWR_MGT)) { ieee80211_add_callback(m, ieee80211_nulldata_transmitted, NULL); } m->m_len = m->m_pkthdr.len = hdrlen; m->m_flags |= M_ENCAP; /* mark encapsulated */ M_WME_SETAC(m, WME_AC_BE); IEEE80211_NODE_STAT(ni, tx_data); IEEE80211_NOTE(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS, ni, "send %snull data frame on channel %u, pwr mgt %s", ni->ni_flags & IEEE80211_NODE_QOS ? "QoS " : "", ieee80211_chan2ieee(ic, ic->ic_curchan), wh->i_fc[1] & IEEE80211_FC1_PWR_MGT ? "ena" : "dis"); ret = ieee80211_raw_output(vap, ni, m, NULL); IEEE80211_TX_UNLOCK(ic); return (ret); } /* * Assign priority to a frame based on any vlan tag assigned * to the station and/or any Diffserv setting in an IP header. * Finally, if an ACM policy is setup (in station mode) it's * applied. */ int ieee80211_classify(struct ieee80211_node *ni, struct mbuf *m) { const struct ether_header *eh = NULL; uint16_t ether_type; int v_wme_ac, d_wme_ac, ac; if (__predict_false(m->m_flags & M_ENCAP)) { struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *); struct llc *llc; int hdrlen, subtype; subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK; if (subtype & IEEE80211_FC0_SUBTYPE_NODATA) { ac = WME_AC_BE; goto done; } hdrlen = ieee80211_hdrsize(wh); if (m->m_pkthdr.len < hdrlen + sizeof(*llc)) return 1; llc = (struct llc *)mtodo(m, hdrlen); if (llc->llc_dsap != LLC_SNAP_LSAP || llc->llc_ssap != LLC_SNAP_LSAP || llc->llc_control != LLC_UI || llc->llc_snap.org_code[0] != 0 || llc->llc_snap.org_code[1] != 0 || llc->llc_snap.org_code[2] != 0) return 1; ether_type = llc->llc_snap.ether_type; } else { eh = mtod(m, struct ether_header *); ether_type = eh->ether_type; } /* * Always promote PAE/EAPOL frames to high priority. */ if (ether_type == htons(ETHERTYPE_PAE)) { /* NB: mark so others don't need to check header */ m->m_flags |= M_EAPOL; ac = WME_AC_VO; goto done; } /* * Non-qos traffic goes to BE. */ if ((ni->ni_flags & IEEE80211_NODE_QOS) == 0) { ac = WME_AC_BE; goto done; } /* * If node has a vlan tag then all traffic * to it must have a matching tag. */ v_wme_ac = 0; if (ni->ni_vlan != 0) { if ((m->m_flags & M_VLANTAG) == 0) { IEEE80211_NODE_STAT(ni, tx_novlantag); return 1; } if (EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) != EVL_VLANOFTAG(ni->ni_vlan)) { IEEE80211_NODE_STAT(ni, tx_vlanmismatch); return 1; } /* map vlan priority to AC */ v_wme_ac = TID_TO_WME_AC(EVL_PRIOFTAG(ni->ni_vlan)); } /* XXX m_copydata may be too slow for fast path */ #ifdef INET if (eh && eh->ether_type == htons(ETHERTYPE_IP)) { uint8_t tos; /* * IP frame, map the DSCP bits from the TOS field. */ /* NB: ip header may not be in first mbuf */ m_copydata(m, sizeof(struct ether_header) + offsetof(struct ip, ip_tos), sizeof(tos), &tos); tos >>= 5; /* NB: ECN + low 3 bits of DSCP */ d_wme_ac = TID_TO_WME_AC(tos); } else { #endif /* INET */ #ifdef INET6 if (eh && eh->ether_type == htons(ETHERTYPE_IPV6)) { uint32_t flow; uint8_t tos; /* * IPv6 frame, map the DSCP bits from the traffic class field. */ m_copydata(m, sizeof(struct ether_header) + offsetof(struct ip6_hdr, ip6_flow), sizeof(flow), (caddr_t) &flow); tos = (uint8_t)(ntohl(flow) >> 20); tos >>= 5; /* NB: ECN + low 3 bits of DSCP */ d_wme_ac = TID_TO_WME_AC(tos); } else { #endif /* INET6 */ d_wme_ac = WME_AC_BE; #ifdef INET6 } #endif #ifdef INET } #endif /* * Use highest priority AC. */ if (v_wme_ac > d_wme_ac) ac = v_wme_ac; else ac = d_wme_ac; /* * Apply ACM policy. */ if (ni->ni_vap->iv_opmode == IEEE80211_M_STA) { static const int acmap[4] = { WME_AC_BK, /* WME_AC_BE */ WME_AC_BK, /* WME_AC_BK */ WME_AC_BE, /* WME_AC_VI */ WME_AC_VI, /* WME_AC_VO */ }; struct ieee80211com *ic = ni->ni_ic; while (ac != WME_AC_BK && ic->ic_wme.wme_wmeBssChanParams.cap_wmeParams[ac].wmep_acm) ac = acmap[ac]; } done: M_WME_SETAC(m, ac); return 0; } /* * Insure there is sufficient contiguous space to encapsulate the * 802.11 data frame. If room isn't already there, arrange for it. * Drivers and cipher modules assume we have done the necessary work * and fail rudely if they don't find the space they need. */ struct mbuf * ieee80211_mbuf_adjust(struct ieee80211vap *vap, int hdrsize, struct ieee80211_key *key, struct mbuf *m) { #define TO_BE_RECLAIMED (sizeof(struct ether_header) - sizeof(struct llc)) int needed_space = vap->iv_ic->ic_headroom + hdrsize; if (key != NULL) { /* XXX belongs in crypto code? */ needed_space += key->wk_cipher->ic_header; /* XXX frags */ /* * When crypto is being done in the host we must insure * the data are writable for the cipher routines; clone * a writable mbuf chain. * XXX handle SWMIC specially */ if (key->wk_flags & (IEEE80211_KEY_SWENCRYPT|IEEE80211_KEY_SWENMIC)) { m = m_unshare(m, M_NOWAIT); if (m == NULL) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT, "%s: cannot get writable mbuf\n", __func__); vap->iv_stats.is_tx_nobuf++; /* XXX new stat */ return NULL; } } } /* * We know we are called just before stripping an Ethernet * header and prepending an LLC header. This means we know * there will be * sizeof(struct ether_header) - sizeof(struct llc) * bytes recovered to which we need additional space for the * 802.11 header and any crypto header. */ /* XXX check trailing space and copy instead? */ if (M_LEADINGSPACE(m) < needed_space - TO_BE_RECLAIMED) { struct mbuf *n = m_gethdr(M_NOWAIT, m->m_type); if (n == NULL) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT, "%s: cannot expand storage\n", __func__); vap->iv_stats.is_tx_nobuf++; m_freem(m); return NULL; } KASSERT(needed_space <= MHLEN, ("not enough room, need %u got %d\n", needed_space, MHLEN)); /* * Setup new mbuf to have leading space to prepend the * 802.11 header and any crypto header bits that are * required (the latter are added when the driver calls * back to ieee80211_crypto_encap to do crypto encapsulation). */ /* NB: must be first 'cuz it clobbers m_data */ m_move_pkthdr(n, m); n->m_len = 0; /* NB: m_gethdr does not set */ n->m_data += needed_space; /* * Pull up Ethernet header to create the expected layout. * We could use m_pullup but that's overkill (i.e. we don't * need the actual data) and it cannot fail so do it inline * for speed. */ /* NB: struct ether_header is known to be contiguous */ n->m_len += sizeof(struct ether_header); m->m_len -= sizeof(struct ether_header); m->m_data += sizeof(struct ether_header); /* * Replace the head of the chain. */ n->m_next = m; m = n; } return m; #undef TO_BE_RECLAIMED } /* * Return the transmit key to use in sending a unicast frame. * If a unicast key is set we use that. When no unicast key is set * we fall back to the default transmit key. */ static __inline struct ieee80211_key * ieee80211_crypto_getucastkey(struct ieee80211vap *vap, struct ieee80211_node *ni) { if (IEEE80211_KEY_UNDEFINED(&ni->ni_ucastkey)) { if (vap->iv_def_txkey == IEEE80211_KEYIX_NONE || IEEE80211_KEY_UNDEFINED(&vap->iv_nw_keys[vap->iv_def_txkey])) return NULL; return &vap->iv_nw_keys[vap->iv_def_txkey]; } else { return &ni->ni_ucastkey; } } /* * Return the transmit key to use in sending a multicast frame. * Multicast traffic always uses the group key which is installed as * the default tx key. */ static __inline struct ieee80211_key * ieee80211_crypto_getmcastkey(struct ieee80211vap *vap, struct ieee80211_node *ni) { if (vap->iv_def_txkey == IEEE80211_KEYIX_NONE || IEEE80211_KEY_UNDEFINED(&vap->iv_nw_keys[vap->iv_def_txkey])) return NULL; return &vap->iv_nw_keys[vap->iv_def_txkey]; } /* * Encapsulate an outbound data frame. The mbuf chain is updated. * If an error is encountered NULL is returned. The caller is required * to provide a node reference and pullup the ethernet header in the * first mbuf. * * NB: Packet is assumed to be processed by ieee80211_classify which * marked EAPOL frames w/ M_EAPOL. */ struct mbuf * ieee80211_encap(struct ieee80211vap *vap, struct ieee80211_node *ni, struct mbuf *m) { #define WH4(wh) ((struct ieee80211_frame_addr4 *)(wh)) #define MC01(mc) ((struct ieee80211_meshcntl_ae01 *)mc) struct ieee80211com *ic = ni->ni_ic; #ifdef IEEE80211_SUPPORT_MESH struct ieee80211_mesh_state *ms = vap->iv_mesh; struct ieee80211_meshcntl_ae10 *mc; struct ieee80211_mesh_route *rt = NULL; int dir = -1; #endif struct ether_header eh; struct ieee80211_frame *wh; struct ieee80211_key *key; struct llc *llc; int hdrsize, hdrspace, datalen, addqos, txfrag, is4addr, is_mcast; ieee80211_seq seqno; int meshhdrsize, meshae; uint8_t *qos; int is_amsdu = 0; IEEE80211_TX_LOCK_ASSERT(ic); is_mcast = !! (m->m_flags & (M_MCAST | M_BCAST)); /* * Copy existing Ethernet header to a safe place. The * rest of the code assumes it's ok to strip it when * reorganizing state for the final encapsulation. */ KASSERT(m->m_len >= sizeof(eh), ("no ethernet header!")); ETHER_HEADER_COPY(&eh, mtod(m, caddr_t)); /* * Insure space for additional headers. First identify * transmit key to use in calculating any buffer adjustments * required. This is also used below to do privacy * encapsulation work. Then calculate the 802.11 header * size and any padding required by the driver. * * Note key may be NULL if we fall back to the default * transmit key and that is not set. In that case the * buffer may not be expanded as needed by the cipher * routines, but they will/should discard it. */ if (vap->iv_flags & IEEE80211_F_PRIVACY) { if (vap->iv_opmode == IEEE80211_M_STA || !IEEE80211_IS_MULTICAST(eh.ether_dhost) || (vap->iv_opmode == IEEE80211_M_WDS && (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY))) key = ieee80211_crypto_getucastkey(vap, ni); else key = ieee80211_crypto_getmcastkey(vap, ni); if (key == NULL && (m->m_flags & M_EAPOL) == 0) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO, eh.ether_dhost, "no default transmit key (%s) deftxkey %u", __func__, vap->iv_def_txkey); vap->iv_stats.is_tx_nodefkey++; goto bad; } } else key = NULL; /* * XXX Some ap's don't handle QoS-encapsulated EAPOL * frames so suppress use. This may be an issue if other * ap's require all data frames to be QoS-encapsulated * once negotiated in which case we'll need to make this * configurable. * * Don't send multicast QoS frames. * Technically multicast frames can be QoS if all stations in the * BSS are also QoS. * * NB: mesh data frames are QoS, including multicast frames. */ addqos = (((is_mcast == 0) && (ni->ni_flags & (IEEE80211_NODE_QOS|IEEE80211_NODE_HT))) || (vap->iv_opmode == IEEE80211_M_MBSS)) && (m->m_flags & M_EAPOL) == 0; if (addqos) hdrsize = sizeof(struct ieee80211_qosframe); else hdrsize = sizeof(struct ieee80211_frame); #ifdef IEEE80211_SUPPORT_MESH if (vap->iv_opmode == IEEE80211_M_MBSS) { /* * Mesh data frames are encapsulated according to the * rules of Section 11B.8.5 (p.139 of D3.0 spec). * o Group Addressed data (aka multicast) originating * at the local sta are sent w/ 3-address format and * address extension mode 00 * o Individually Addressed data (aka unicast) originating * at the local sta are sent w/ 4-address format and * address extension mode 00 * o Group Addressed data forwarded from a non-mesh sta are * sent w/ 3-address format and address extension mode 01 * o Individually Address data from another sta are sent * w/ 4-address format and address extension mode 10 */ is4addr = 0; /* NB: don't use, disable */ if (!IEEE80211_IS_MULTICAST(eh.ether_dhost)) { rt = ieee80211_mesh_rt_find(vap, eh.ether_dhost); KASSERT(rt != NULL, ("route is NULL")); dir = IEEE80211_FC1_DIR_DSTODS; hdrsize += IEEE80211_ADDR_LEN; if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) { if (IEEE80211_ADDR_EQ(rt->rt_mesh_gate, vap->iv_myaddr)) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, eh.ether_dhost, "%s", "trying to send to ourself"); goto bad; } meshae = IEEE80211_MESH_AE_10; meshhdrsize = sizeof(struct ieee80211_meshcntl_ae10); } else { meshae = IEEE80211_MESH_AE_00; meshhdrsize = sizeof(struct ieee80211_meshcntl); } } else { dir = IEEE80211_FC1_DIR_FROMDS; if (!IEEE80211_ADDR_EQ(eh.ether_shost, vap->iv_myaddr)) { /* proxy group */ meshae = IEEE80211_MESH_AE_01; meshhdrsize = sizeof(struct ieee80211_meshcntl_ae01); } else { /* group */ meshae = IEEE80211_MESH_AE_00; meshhdrsize = sizeof(struct ieee80211_meshcntl); } } } else { #endif /* * 4-address frames need to be generated for: * o packets sent through a WDS vap (IEEE80211_M_WDS) * o packets sent through a vap marked for relaying * (e.g. a station operating with dynamic WDS) */ is4addr = vap->iv_opmode == IEEE80211_M_WDS || ((vap->iv_flags_ext & IEEE80211_FEXT_4ADDR) && !IEEE80211_ADDR_EQ(eh.ether_shost, vap->iv_myaddr)); if (is4addr) hdrsize += IEEE80211_ADDR_LEN; meshhdrsize = meshae = 0; #ifdef IEEE80211_SUPPORT_MESH } #endif /* * Honor driver DATAPAD requirement. */ if (ic->ic_flags & IEEE80211_F_DATAPAD) hdrspace = roundup(hdrsize, sizeof(uint32_t)); else hdrspace = hdrsize; if (__predict_true((m->m_flags & M_FF) == 0)) { /* * Normal frame. */ m = ieee80211_mbuf_adjust(vap, hdrspace + meshhdrsize, key, m); if (m == NULL) { /* NB: ieee80211_mbuf_adjust handles msgs+statistics */ goto bad; } /* NB: this could be optimized 'cuz of ieee80211_mbuf_adjust */ m_adj(m, sizeof(struct ether_header) - sizeof(struct llc)); llc = mtod(m, struct llc *); llc->llc_dsap = llc->llc_ssap = LLC_SNAP_LSAP; llc->llc_control = LLC_UI; llc->llc_snap.org_code[0] = 0; llc->llc_snap.org_code[1] = 0; llc->llc_snap.org_code[2] = 0; llc->llc_snap.ether_type = eh.ether_type; } else { #ifdef IEEE80211_SUPPORT_SUPERG /* * Aggregated frame. Check if it's for AMSDU or FF. * * XXX TODO: IEEE80211_NODE_AMSDU* isn't implemented * anywhere for some reason. But, since 11n requires * AMSDU RX, we can just assume "11n" == "AMSDU". */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG, "%s: called; M_FF\n", __func__); if (ieee80211_amsdu_tx_ok(ni)) { m = ieee80211_amsdu_encap(vap, m, hdrspace + meshhdrsize, key); is_amsdu = 1; } else { m = ieee80211_ff_encap(vap, m, hdrspace + meshhdrsize, key); } if (m == NULL) #endif goto bad; } datalen = m->m_pkthdr.len; /* NB: w/o 802.11 header */ M_PREPEND(m, hdrspace + meshhdrsize, M_NOWAIT); if (m == NULL) { vap->iv_stats.is_tx_nobuf++; goto bad; } wh = mtod(m, struct ieee80211_frame *); wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_DATA; *(uint16_t *)wh->i_dur = 0; qos = NULL; /* NB: quiet compiler */ if (is4addr) { wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS; IEEE80211_ADDR_COPY(wh->i_addr1, ni->ni_macaddr); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_dhost); IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, eh.ether_shost); } else switch (vap->iv_opmode) { case IEEE80211_M_STA: wh->i_fc[1] = IEEE80211_FC1_DIR_TODS; IEEE80211_ADDR_COPY(wh->i_addr1, ni->ni_bssid); IEEE80211_ADDR_COPY(wh->i_addr2, eh.ether_shost); IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_dhost); break; case IEEE80211_M_IBSS: case IEEE80211_M_AHDEMO: wh->i_fc[1] = IEEE80211_FC1_DIR_NODS; IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost); IEEE80211_ADDR_COPY(wh->i_addr2, eh.ether_shost); /* * NB: always use the bssid from iv_bss as the * neighbor's may be stale after an ibss merge */ IEEE80211_ADDR_COPY(wh->i_addr3, vap->iv_bss->ni_bssid); break; case IEEE80211_M_HOSTAP: wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS; IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost); IEEE80211_ADDR_COPY(wh->i_addr2, ni->ni_bssid); IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_shost); break; #ifdef IEEE80211_SUPPORT_MESH case IEEE80211_M_MBSS: /* NB: offset by hdrspace to deal with DATAPAD */ mc = (struct ieee80211_meshcntl_ae10 *) (mtod(m, uint8_t *) + hdrspace); wh->i_fc[1] = dir; switch (meshae) { case IEEE80211_MESH_AE_00: /* no proxy */ mc->mc_flags = 0; if (dir == IEEE80211_FC1_DIR_DSTODS) { /* ucast */ IEEE80211_ADDR_COPY(wh->i_addr1, ni->ni_macaddr); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_dhost); IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, eh.ether_shost); qos =((struct ieee80211_qosframe_addr4 *) wh)->i_qos; } else if (dir == IEEE80211_FC1_DIR_FROMDS) { /* mcast */ IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_shost); qos = ((struct ieee80211_qosframe *) wh)->i_qos; } break; case IEEE80211_MESH_AE_01: /* mcast, proxy */ wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS; IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); IEEE80211_ADDR_COPY(wh->i_addr3, vap->iv_myaddr); mc->mc_flags = 1; IEEE80211_ADDR_COPY(MC01(mc)->mc_addr4, eh.ether_shost); qos = ((struct ieee80211_qosframe *) wh)->i_qos; break; case IEEE80211_MESH_AE_10: /* ucast, proxy */ KASSERT(rt != NULL, ("route is NULL")); IEEE80211_ADDR_COPY(wh->i_addr1, rt->rt_nexthop); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); IEEE80211_ADDR_COPY(wh->i_addr3, rt->rt_mesh_gate); IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, vap->iv_myaddr); mc->mc_flags = IEEE80211_MESH_AE_10; IEEE80211_ADDR_COPY(mc->mc_addr5, eh.ether_dhost); IEEE80211_ADDR_COPY(mc->mc_addr6, eh.ether_shost); qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos; break; default: KASSERT(0, ("meshae %d", meshae)); break; } mc->mc_ttl = ms->ms_ttl; ms->ms_seq++; le32enc(mc->mc_seq, ms->ms_seq); break; #endif case IEEE80211_M_WDS: /* NB: is4addr should always be true */ default: goto bad; } if (m->m_flags & M_MORE_DATA) wh->i_fc[1] |= IEEE80211_FC1_MORE_DATA; if (addqos) { int ac, tid; if (is4addr) { qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos; /* NB: mesh case handled earlier */ } else if (vap->iv_opmode != IEEE80211_M_MBSS) qos = ((struct ieee80211_qosframe *) wh)->i_qos; ac = M_WME_GETAC(m); /* map from access class/queue to 11e header priorty value */ tid = WME_AC_TO_TID(ac); qos[0] = tid & IEEE80211_QOS_TID; if (ic->ic_wme.wme_wmeChanParams.cap_wmeParams[ac].wmep_noackPolicy) qos[0] |= IEEE80211_QOS_ACKPOLICY_NOACK; #ifdef IEEE80211_SUPPORT_MESH if (vap->iv_opmode == IEEE80211_M_MBSS) qos[1] = IEEE80211_QOS_MC; else #endif qos[1] = 0; wh->i_fc[0] |= IEEE80211_FC0_SUBTYPE_QOS; /* * If this is an A-MSDU then ensure we set the * relevant field. */ if (is_amsdu) qos[0] |= IEEE80211_QOS_AMSDU; /* * XXX TODO TX lock is needed for atomic updates of sequence * numbers. If the driver does it, then don't do it here; * and we don't need the TX lock held. */ if ((m->m_flags & M_AMPDU_MPDU) == 0) { /* * 802.11-2012 9.3.2.10 - * * If this is a multicast frame then we need * to ensure that the sequence number comes from * a separate seqno space and not the TID space. * * Otherwise multicast frames may actually cause * holes in the TX blockack window space and * upset various things. */ if (IEEE80211_IS_MULTICAST(wh->i_addr1)) seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++; else seqno = ni->ni_txseqs[tid]++; /* * NB: don't assign a sequence # to potential * aggregates; we expect this happens at the * point the frame comes off any aggregation q * as otherwise we may introduce holes in the * BA sequence space and/or make window accouting * more difficult. * * XXX may want to control this with a driver * capability; this may also change when we pull * aggregation up into net80211 */ *(uint16_t *)wh->i_seq = htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT); M_SEQNO_SET(m, seqno); } else { /* NB: zero out i_seq field (for s/w encryption etc) */ *(uint16_t *)wh->i_seq = 0; } } else { /* * XXX TODO TX lock is needed for atomic updates of sequence * numbers. If the driver does it, then don't do it here; * and we don't need the TX lock held. */ seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++; *(uint16_t *)wh->i_seq = htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT); M_SEQNO_SET(m, seqno); /* * XXX TODO: we shouldn't allow EAPOL, etc that would * be forced to be non-QoS traffic to be A-MSDU encapsulated. */ if (is_amsdu) printf("%s: XXX ERROR: is_amsdu set; not QoS!\n", __func__); } /* * Check if xmit fragmentation is required. * * If the hardware does fragmentation offload, then don't bother * doing it here. */ if (IEEE80211_CONF_FRAG_OFFLOAD(ic)) txfrag = 0; else txfrag = (m->m_pkthdr.len > vap->iv_fragthreshold && !IEEE80211_IS_MULTICAST(wh->i_addr1) && (vap->iv_caps & IEEE80211_C_TXFRAG) && (m->m_flags & (M_FF | M_AMPDU_MPDU)) == 0); if (key != NULL) { /* * IEEE 802.1X: send EAPOL frames always in the clear. * WPA/WPA2: encrypt EAPOL keys when pairwise keys are set. */ if ((m->m_flags & M_EAPOL) == 0 || ((vap->iv_flags & IEEE80211_F_WPA) && (vap->iv_opmode == IEEE80211_M_STA ? !IEEE80211_KEY_UNDEFINED(key) : !IEEE80211_KEY_UNDEFINED(&ni->ni_ucastkey)))) { wh->i_fc[1] |= IEEE80211_FC1_PROTECTED; if (!ieee80211_crypto_enmic(vap, key, m, txfrag)) { IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_OUTPUT, eh.ether_dhost, "%s", "enmic failed, discard frame"); vap->iv_stats.is_crypto_enmicfail++; goto bad; } } } if (txfrag && !ieee80211_fragment(vap, m, hdrsize, key != NULL ? key->wk_cipher->ic_header : 0, vap->iv_fragthreshold)) goto bad; m->m_flags |= M_ENCAP; /* mark encapsulated */ IEEE80211_NODE_STAT(ni, tx_data); if (IEEE80211_IS_MULTICAST(wh->i_addr1)) { IEEE80211_NODE_STAT(ni, tx_mcast); m->m_flags |= M_MCAST; } else IEEE80211_NODE_STAT(ni, tx_ucast); IEEE80211_NODE_STAT_ADD(ni, tx_bytes, datalen); return m; bad: if (m != NULL) m_freem(m); return NULL; #undef WH4 #undef MC01 } void ieee80211_free_mbuf(struct mbuf *m) { struct mbuf *next; if (m == NULL) return; do { next = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); } while ((m = next) != NULL); } /* * Fragment the frame according to the specified mtu. * The size of the 802.11 header (w/o padding) is provided * so we don't need to recalculate it. We create a new * mbuf for each fragment and chain it through m_nextpkt; * we might be able to optimize this by reusing the original * packet's mbufs but that is significantly more complicated. */ static int ieee80211_fragment(struct ieee80211vap *vap, struct mbuf *m0, u_int hdrsize, u_int ciphdrsize, u_int mtu) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_frame *wh, *whf; struct mbuf *m, *prev; u_int totalhdrsize, fragno, fragsize, off, remainder, payload; u_int hdrspace; KASSERT(m0->m_nextpkt == NULL, ("mbuf already chained?")); KASSERT(m0->m_pkthdr.len > mtu, ("pktlen %u mtu %u", m0->m_pkthdr.len, mtu)); /* * Honor driver DATAPAD requirement. */ if (ic->ic_flags & IEEE80211_F_DATAPAD) hdrspace = roundup(hdrsize, sizeof(uint32_t)); else hdrspace = hdrsize; wh = mtod(m0, struct ieee80211_frame *); /* NB: mark the first frag; it will be propagated below */ wh->i_fc[1] |= IEEE80211_FC1_MORE_FRAG; totalhdrsize = hdrspace + ciphdrsize; fragno = 1; off = mtu - ciphdrsize; remainder = m0->m_pkthdr.len - off; prev = m0; do { fragsize = MIN(totalhdrsize + remainder, mtu); m = m_get2(fragsize, M_NOWAIT, MT_DATA, M_PKTHDR); if (m == NULL) goto bad; /* leave room to prepend any cipher header */ m_align(m, fragsize - ciphdrsize); /* * Form the header in the fragment. Note that since * we mark the first fragment with the MORE_FRAG bit * it automatically is propagated to each fragment; we * need only clear it on the last fragment (done below). * NB: frag 1+ dont have Mesh Control field present. */ whf = mtod(m, struct ieee80211_frame *); memcpy(whf, wh, hdrsize); #ifdef IEEE80211_SUPPORT_MESH if (vap->iv_opmode == IEEE80211_M_MBSS) ieee80211_getqos(wh)[1] &= ~IEEE80211_QOS_MC; #endif *(uint16_t *)&whf->i_seq[0] |= htole16( (fragno & IEEE80211_SEQ_FRAG_MASK) << IEEE80211_SEQ_FRAG_SHIFT); fragno++; payload = fragsize - totalhdrsize; /* NB: destination is known to be contiguous */ m_copydata(m0, off, payload, mtod(m, uint8_t *) + hdrspace); m->m_len = hdrspace + payload; m->m_pkthdr.len = hdrspace + payload; m->m_flags |= M_FRAG; /* chain up the fragment */ prev->m_nextpkt = m; prev = m; /* deduct fragment just formed */ remainder -= payload; off += payload; } while (remainder != 0); /* set the last fragment */ m->m_flags |= M_LASTFRAG; whf->i_fc[1] &= ~IEEE80211_FC1_MORE_FRAG; /* strip first mbuf now that everything has been copied */ m_adj(m0, -(m0->m_pkthdr.len - (mtu - ciphdrsize))); m0->m_flags |= M_FIRSTFRAG | M_FRAG; vap->iv_stats.is_tx_fragframes++; vap->iv_stats.is_tx_frags += fragno-1; return 1; bad: /* reclaim fragments but leave original frame for caller to free */ ieee80211_free_mbuf(m0->m_nextpkt); m0->m_nextpkt = NULL; return 0; } /* * Add a supported rates element id to a frame. */ uint8_t * ieee80211_add_rates(uint8_t *frm, const struct ieee80211_rateset *rs) { int nrates; *frm++ = IEEE80211_ELEMID_RATES; nrates = rs->rs_nrates; if (nrates > IEEE80211_RATE_SIZE) nrates = IEEE80211_RATE_SIZE; *frm++ = nrates; memcpy(frm, rs->rs_rates, nrates); return frm + nrates; } /* * Add an extended supported rates element id to a frame. */ uint8_t * ieee80211_add_xrates(uint8_t *frm, const struct ieee80211_rateset *rs) { /* * Add an extended supported rates element if operating in 11g mode. */ if (rs->rs_nrates > IEEE80211_RATE_SIZE) { int nrates = rs->rs_nrates - IEEE80211_RATE_SIZE; *frm++ = IEEE80211_ELEMID_XRATES; *frm++ = nrates; memcpy(frm, rs->rs_rates + IEEE80211_RATE_SIZE, nrates); frm += nrates; } return frm; } /* * Add an ssid element to a frame. */ uint8_t * ieee80211_add_ssid(uint8_t *frm, const uint8_t *ssid, u_int len) { *frm++ = IEEE80211_ELEMID_SSID; *frm++ = len; memcpy(frm, ssid, len); return frm + len; } /* * Add an erp element to a frame. */ static uint8_t * ieee80211_add_erp(uint8_t *frm, struct ieee80211com *ic) { uint8_t erp; *frm++ = IEEE80211_ELEMID_ERP; *frm++ = 1; erp = 0; if (ic->ic_nonerpsta != 0) erp |= IEEE80211_ERP_NON_ERP_PRESENT; if (ic->ic_flags & IEEE80211_F_USEPROT) erp |= IEEE80211_ERP_USE_PROTECTION; if (ic->ic_flags & IEEE80211_F_USEBARKER) erp |= IEEE80211_ERP_LONG_PREAMBLE; *frm++ = erp; return frm; } /* * Add a CFParams element to a frame. */ static uint8_t * ieee80211_add_cfparms(uint8_t *frm, struct ieee80211com *ic) { #define ADDSHORT(frm, v) do { \ le16enc(frm, v); \ frm += 2; \ } while (0) *frm++ = IEEE80211_ELEMID_CFPARMS; *frm++ = 6; *frm++ = 0; /* CFP count */ *frm++ = 2; /* CFP period */ ADDSHORT(frm, 0); /* CFP MaxDuration (TU) */ ADDSHORT(frm, 0); /* CFP CurRemaining (TU) */ return frm; #undef ADDSHORT } static __inline uint8_t * add_appie(uint8_t *frm, const struct ieee80211_appie *ie) { memcpy(frm, ie->ie_data, ie->ie_len); return frm + ie->ie_len; } static __inline uint8_t * add_ie(uint8_t *frm, const uint8_t *ie) { memcpy(frm, ie, 2 + ie[1]); return frm + 2 + ie[1]; } #define WME_OUI_BYTES 0x00, 0x50, 0xf2 /* * Add a WME information element to a frame. */ uint8_t * ieee80211_add_wme_info(uint8_t *frm, struct ieee80211_wme_state *wme) { static const struct ieee80211_wme_info info = { .wme_id = IEEE80211_ELEMID_VENDOR, .wme_len = sizeof(struct ieee80211_wme_info) - 2, .wme_oui = { WME_OUI_BYTES }, .wme_type = WME_OUI_TYPE, .wme_subtype = WME_INFO_OUI_SUBTYPE, .wme_version = WME_VERSION, .wme_info = 0, }; memcpy(frm, &info, sizeof(info)); return frm + sizeof(info); } /* * Add a WME parameters element to a frame. */ static uint8_t * ieee80211_add_wme_param(uint8_t *frm, struct ieee80211_wme_state *wme) { #define SM(_v, _f) (((_v) << _f##_S) & _f) #define ADDSHORT(frm, v) do { \ le16enc(frm, v); \ frm += 2; \ } while (0) /* NB: this works 'cuz a param has an info at the front */ static const struct ieee80211_wme_info param = { .wme_id = IEEE80211_ELEMID_VENDOR, .wme_len = sizeof(struct ieee80211_wme_param) - 2, .wme_oui = { WME_OUI_BYTES }, .wme_type = WME_OUI_TYPE, .wme_subtype = WME_PARAM_OUI_SUBTYPE, .wme_version = WME_VERSION, }; int i; memcpy(frm, ¶m, sizeof(param)); frm += __offsetof(struct ieee80211_wme_info, wme_info); *frm++ = wme->wme_bssChanParams.cap_info; /* AC info */ *frm++ = 0; /* reserved field */ for (i = 0; i < WME_NUM_AC; i++) { const struct wmeParams *ac = &wme->wme_bssChanParams.cap_wmeParams[i]; *frm++ = SM(i, WME_PARAM_ACI) | SM(ac->wmep_acm, WME_PARAM_ACM) | SM(ac->wmep_aifsn, WME_PARAM_AIFSN) ; *frm++ = SM(ac->wmep_logcwmax, WME_PARAM_LOGCWMAX) | SM(ac->wmep_logcwmin, WME_PARAM_LOGCWMIN) ; ADDSHORT(frm, ac->wmep_txopLimit); } return frm; #undef SM #undef ADDSHORT } #undef WME_OUI_BYTES /* * Add an 11h Power Constraint element to a frame. */ static uint8_t * ieee80211_add_powerconstraint(uint8_t *frm, struct ieee80211vap *vap) { const struct ieee80211_channel *c = vap->iv_bss->ni_chan; /* XXX per-vap tx power limit? */ int8_t limit = vap->iv_ic->ic_txpowlimit / 2; frm[0] = IEEE80211_ELEMID_PWRCNSTR; frm[1] = 1; frm[2] = c->ic_maxregpower > limit ? c->ic_maxregpower - limit : 0; return frm + 3; } /* * Add an 11h Power Capability element to a frame. */ static uint8_t * ieee80211_add_powercapability(uint8_t *frm, const struct ieee80211_channel *c) { frm[0] = IEEE80211_ELEMID_PWRCAP; frm[1] = 2; frm[2] = c->ic_minpower; frm[3] = c->ic_maxpower; return frm + 4; } /* * Add an 11h Supported Channels element to a frame. */ static uint8_t * ieee80211_add_supportedchannels(uint8_t *frm, struct ieee80211com *ic) { static const int ielen = 26; frm[0] = IEEE80211_ELEMID_SUPPCHAN; frm[1] = ielen; /* XXX not correct */ memcpy(frm+2, ic->ic_chan_avail, ielen); return frm + 2 + ielen; } /* * Add an 11h Quiet time element to a frame. */ static uint8_t * ieee80211_add_quiet(uint8_t *frm, struct ieee80211vap *vap, int update) { struct ieee80211_quiet_ie *quiet = (struct ieee80211_quiet_ie *) frm; quiet->quiet_ie = IEEE80211_ELEMID_QUIET; quiet->len = 6; /* * Only update every beacon interval - otherwise probe responses * would update the quiet count value. */ if (update) { if (vap->iv_quiet_count_value == 1) vap->iv_quiet_count_value = vap->iv_quiet_count; else if (vap->iv_quiet_count_value > 1) vap->iv_quiet_count_value--; } if (vap->iv_quiet_count_value == 0) { /* value 0 is reserved as per 802.11h standerd */ vap->iv_quiet_count_value = 1; } quiet->tbttcount = vap->iv_quiet_count_value; quiet->period = vap->iv_quiet_period; quiet->duration = htole16(vap->iv_quiet_duration); quiet->offset = htole16(vap->iv_quiet_offset); return frm + sizeof(*quiet); } /* * Add an 11h Channel Switch Announcement element to a frame. * Note that we use the per-vap CSA count to adjust the global * counter so we can use this routine to form probe response * frames and get the current count. */ static uint8_t * ieee80211_add_csa(uint8_t *frm, struct ieee80211vap *vap) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_csa_ie *csa = (struct ieee80211_csa_ie *) frm; csa->csa_ie = IEEE80211_ELEMID_CSA; csa->csa_len = 3; csa->csa_mode = 1; /* XXX force quiet on channel */ csa->csa_newchan = ieee80211_chan2ieee(ic, ic->ic_csa_newchan); csa->csa_count = ic->ic_csa_count - vap->iv_csa_count; return frm + sizeof(*csa); } /* * Add an 11h country information element to a frame. */ static uint8_t * ieee80211_add_countryie(uint8_t *frm, struct ieee80211com *ic) { if (ic->ic_countryie == NULL || ic->ic_countryie_chan != ic->ic_bsschan) { /* * Handle lazy construction of ie. This is done on * first use and after a channel change that requires * re-calculation. */ if (ic->ic_countryie != NULL) IEEE80211_FREE(ic->ic_countryie, M_80211_NODE_IE); ic->ic_countryie = ieee80211_alloc_countryie(ic); if (ic->ic_countryie == NULL) return frm; ic->ic_countryie_chan = ic->ic_bsschan; } return add_appie(frm, ic->ic_countryie); } uint8_t * ieee80211_add_wpa(uint8_t *frm, const struct ieee80211vap *vap) { if (vap->iv_flags & IEEE80211_F_WPA1 && vap->iv_wpa_ie != NULL) return (add_ie(frm, vap->iv_wpa_ie)); else { /* XXX else complain? */ return (frm); } } uint8_t * ieee80211_add_rsn(uint8_t *frm, const struct ieee80211vap *vap) { if (vap->iv_flags & IEEE80211_F_WPA2 && vap->iv_rsn_ie != NULL) return (add_ie(frm, vap->iv_rsn_ie)); else { /* XXX else complain? */ return (frm); } } uint8_t * ieee80211_add_qos(uint8_t *frm, const struct ieee80211_node *ni) { if (ni->ni_flags & IEEE80211_NODE_QOS) { *frm++ = IEEE80211_ELEMID_QOS; *frm++ = 1; *frm++ = 0; } return (frm); } /* * Send a probe request frame with the specified ssid * and any optional information element data. */ int ieee80211_send_probereq(struct ieee80211_node *ni, const uint8_t sa[IEEE80211_ADDR_LEN], const uint8_t da[IEEE80211_ADDR_LEN], const uint8_t bssid[IEEE80211_ADDR_LEN], const uint8_t *ssid, size_t ssidlen) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_node *bss; const struct ieee80211_txparam *tp; struct ieee80211_bpf_params params; const struct ieee80211_rateset *rs; struct mbuf *m; uint8_t *frm; int ret; bss = ieee80211_ref_node(vap->iv_bss); if (vap->iv_state == IEEE80211_S_CAC) { IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, ni, "block %s frame in CAC state", "probe request"); vap->iv_stats.is_tx_badstate++; ieee80211_free_node(bss); return EIO; /* XXX */ } /* * Hold a reference on the node so it doesn't go away until after * the xmit is complete all the way in the driver. On error we * will remove our reference. */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1); ieee80211_ref_node(ni); /* * prreq frame format * [tlv] ssid * [tlv] supported rates * [tlv] RSN (optional) * [tlv] extended supported rates * [tlv] HT cap (optional) * [tlv] VHT cap (optional) * [tlv] WPA (optional) * [tlv] user-specified ie's */ m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), 2 + IEEE80211_NWID_LEN + 2 + IEEE80211_RATE_SIZE + sizeof(struct ieee80211_ie_htcap) + sizeof(struct ieee80211_ie_vhtcap) + sizeof(struct ieee80211_ie_htinfo) /* XXX not needed? */ + sizeof(struct ieee80211_ie_wpa) + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE) + sizeof(struct ieee80211_ie_wpa) + (vap->iv_appie_probereq != NULL ? vap->iv_appie_probereq->ie_len : 0) ); if (m == NULL) { vap->iv_stats.is_tx_nobuf++; ieee80211_free_node(ni); ieee80211_free_node(bss); return ENOMEM; } frm = ieee80211_add_ssid(frm, ssid, ssidlen); rs = ieee80211_get_suprates(ic, ic->ic_curchan); frm = ieee80211_add_rates(frm, rs); frm = ieee80211_add_rsn(frm, vap); frm = ieee80211_add_xrates(frm, rs); /* * Note: we can't use bss; we don't have one yet. * * So, we should announce our capabilities * in this channel mode (2g/5g), not the * channel details itself. */ if ((vap->iv_opmode == IEEE80211_M_IBSS) && (vap->iv_flags_ht & IEEE80211_FHT_HT)) { struct ieee80211_channel *c; /* * Get the HT channel that we should try upgrading to. * If we can do 40MHz then this'll upgrade it appropriately. */ c = ieee80211_ht_adjust_channel(ic, ic->ic_curchan, vap->iv_flags_ht); frm = ieee80211_add_htcap_ch(frm, vap, c); } /* * XXX TODO: need to figure out what/how to update the * VHT channel. */ #if 0 (vap->iv_flags_vht & IEEE80211_FVHT_VHT) { struct ieee80211_channel *c; c = ieee80211_ht_adjust_channel(ic, ic->ic_curchan, vap->iv_flags_ht); c = ieee80211_vht_adjust_channel(ic, c, vap->iv_flags_vht); frm = ieee80211_add_vhtcap_ch(frm, vap, c); } #endif frm = ieee80211_add_wpa(frm, vap); if (vap->iv_appie_probereq != NULL) frm = add_appie(frm, vap->iv_appie_probereq); m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); KASSERT(M_LEADINGSPACE(m) >= sizeof(struct ieee80211_frame), ("leading space %zd", M_LEADINGSPACE(m))); M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT); if (m == NULL) { /* NB: cannot happen */ ieee80211_free_node(ni); ieee80211_free_node(bss); return ENOMEM; } IEEE80211_TX_LOCK(ic); ieee80211_send_setup(ni, m, IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_PROBE_REQ, IEEE80211_NONQOS_TID, sa, da, bssid); /* XXX power management? */ m->m_flags |= M_ENCAP; /* mark encapsulated */ M_WME_SETAC(m, WME_AC_BE); IEEE80211_NODE_STAT(ni, tx_probereq); IEEE80211_NODE_STAT(ni, tx_mgmt); IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS, "send probe req on channel %u bssid %s sa %6D da %6D ssid \"%.*s\"\n", ieee80211_chan2ieee(ic, ic->ic_curchan), ether_sprintf(bssid), sa, ":", da, ":", ssidlen, ssid); memset(¶ms, 0, sizeof(params)); params.ibp_pri = M_WME_GETAC(m); tp = &vap->iv_txparms[ieee80211_chan2mode(ic->ic_curchan)]; params.ibp_rate0 = tp->mgmtrate; if (IEEE80211_IS_MULTICAST(da)) { params.ibp_flags |= IEEE80211_BPF_NOACK; params.ibp_try0 = 1; } else params.ibp_try0 = tp->maxretry; params.ibp_power = ni->ni_txpower; ret = ieee80211_raw_output(vap, ni, m, ¶ms); IEEE80211_TX_UNLOCK(ic); ieee80211_free_node(bss); return (ret); } /* * Calculate capability information for mgt frames. */ uint16_t ieee80211_getcapinfo(struct ieee80211vap *vap, struct ieee80211_channel *chan) { struct ieee80211com *ic = vap->iv_ic; uint16_t capinfo; KASSERT(vap->iv_opmode != IEEE80211_M_STA, ("station mode")); if (vap->iv_opmode == IEEE80211_M_HOSTAP) capinfo = IEEE80211_CAPINFO_ESS; else if (vap->iv_opmode == IEEE80211_M_IBSS) capinfo = IEEE80211_CAPINFO_IBSS; else capinfo = 0; if (vap->iv_flags & IEEE80211_F_PRIVACY) capinfo |= IEEE80211_CAPINFO_PRIVACY; if ((ic->ic_flags & IEEE80211_F_SHPREAMBLE) && IEEE80211_IS_CHAN_2GHZ(chan)) capinfo |= IEEE80211_CAPINFO_SHORT_PREAMBLE; if (ic->ic_flags & IEEE80211_F_SHSLOT) capinfo |= IEEE80211_CAPINFO_SHORT_SLOTTIME; if (IEEE80211_IS_CHAN_5GHZ(chan) && (vap->iv_flags & IEEE80211_F_DOTH)) capinfo |= IEEE80211_CAPINFO_SPECTRUM_MGMT; return capinfo; } /* * Send a management frame. The node is for the destination (or ic_bss * when in station mode). Nodes other than ic_bss have their reference * count bumped to reflect our use for an indeterminant time. */ int ieee80211_send_mgmt(struct ieee80211_node *ni, int type, int arg) { #define HTFLAGS (IEEE80211_NODE_HT | IEEE80211_NODE_HTCOMPAT) #define senderr(_x, _v) do { vap->iv_stats._v++; ret = _x; goto bad; } while (0) struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_node *bss = vap->iv_bss; struct ieee80211_bpf_params params; struct mbuf *m; uint8_t *frm; uint16_t capinfo; int has_challenge, is_shared_key, ret, status; KASSERT(ni != NULL, ("null node")); /* * Hold a reference on the node so it doesn't go away until after * the xmit is complete all the way in the driver. On error we * will remove our reference. */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1); ieee80211_ref_node(ni); memset(¶ms, 0, sizeof(params)); switch (type) { case IEEE80211_FC0_SUBTYPE_AUTH: status = arg >> 16; arg &= 0xffff; has_challenge = ((arg == IEEE80211_AUTH_SHARED_CHALLENGE || arg == IEEE80211_AUTH_SHARED_RESPONSE) && ni->ni_challenge != NULL); /* * Deduce whether we're doing open authentication or * shared key authentication. We do the latter if * we're in the middle of a shared key authentication * handshake or if we're initiating an authentication * request and configured to use shared key. */ is_shared_key = has_challenge || arg >= IEEE80211_AUTH_SHARED_RESPONSE || (arg == IEEE80211_AUTH_SHARED_REQUEST && bss->ni_authmode == IEEE80211_AUTH_SHARED); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), 3 * sizeof(uint16_t) + (has_challenge && status == IEEE80211_STATUS_SUCCESS ? sizeof(uint16_t)+IEEE80211_CHALLENGE_LEN : 0) ); if (m == NULL) senderr(ENOMEM, is_tx_nobuf); ((uint16_t *)frm)[0] = (is_shared_key) ? htole16(IEEE80211_AUTH_ALG_SHARED) : htole16(IEEE80211_AUTH_ALG_OPEN); ((uint16_t *)frm)[1] = htole16(arg); /* sequence number */ ((uint16_t *)frm)[2] = htole16(status);/* status */ if (has_challenge && status == IEEE80211_STATUS_SUCCESS) { ((uint16_t *)frm)[3] = htole16((IEEE80211_CHALLENGE_LEN << 8) | IEEE80211_ELEMID_CHALLENGE); memcpy(&((uint16_t *)frm)[4], ni->ni_challenge, IEEE80211_CHALLENGE_LEN); m->m_pkthdr.len = m->m_len = 4 * sizeof(uint16_t) + IEEE80211_CHALLENGE_LEN; if (arg == IEEE80211_AUTH_SHARED_RESPONSE) { IEEE80211_NOTE(vap, IEEE80211_MSG_AUTH, ni, "request encrypt frame (%s)", __func__); /* mark frame for encryption */ params.ibp_flags |= IEEE80211_BPF_CRYPTO; } } else m->m_pkthdr.len = m->m_len = 3 * sizeof(uint16_t); /* XXX not right for shared key */ if (status == IEEE80211_STATUS_SUCCESS) IEEE80211_NODE_STAT(ni, tx_auth); else IEEE80211_NODE_STAT(ni, tx_auth_fail); if (vap->iv_opmode == IEEE80211_M_STA) ieee80211_add_callback(m, ieee80211_tx_mgt_cb, (void *) vap->iv_state); break; case IEEE80211_FC0_SUBTYPE_DEAUTH: IEEE80211_NOTE(vap, IEEE80211_MSG_AUTH, ni, "send station deauthenticate (reason: %d (%s))", arg, ieee80211_reason_to_string(arg)); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t)); if (m == NULL) senderr(ENOMEM, is_tx_nobuf); *(uint16_t *)frm = htole16(arg); /* reason */ m->m_pkthdr.len = m->m_len = sizeof(uint16_t); IEEE80211_NODE_STAT(ni, tx_deauth); IEEE80211_NODE_STAT_SET(ni, tx_deauth_code, arg); ieee80211_node_unauthorize(ni); /* port closed */ break; case IEEE80211_FC0_SUBTYPE_ASSOC_REQ: case IEEE80211_FC0_SUBTYPE_REASSOC_REQ: /* * asreq frame format * [2] capability information * [2] listen interval * [6*] current AP address (reassoc only) * [tlv] ssid * [tlv] supported rates * [tlv] extended supported rates * [4] power capability (optional) * [28] supported channels (optional) * [tlv] HT capabilities * [tlv] VHT capabilities * [tlv] WME (optional) * [tlv] Vendor OUI HT capabilities (optional) * [tlv] Atheros capabilities (if negotiated) * [tlv] AppIE's (optional) */ m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t) + sizeof(uint16_t) + IEEE80211_ADDR_LEN + 2 + IEEE80211_NWID_LEN + 2 + IEEE80211_RATE_SIZE + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE) + 4 + 2 + 26 + sizeof(struct ieee80211_wme_info) + sizeof(struct ieee80211_ie_htcap) + sizeof(struct ieee80211_ie_vhtcap) + 4 + sizeof(struct ieee80211_ie_htcap) #ifdef IEEE80211_SUPPORT_SUPERG + sizeof(struct ieee80211_ath_ie) #endif + (vap->iv_appie_wpa != NULL ? vap->iv_appie_wpa->ie_len : 0) + (vap->iv_appie_assocreq != NULL ? vap->iv_appie_assocreq->ie_len : 0) ); if (m == NULL) senderr(ENOMEM, is_tx_nobuf); KASSERT(vap->iv_opmode == IEEE80211_M_STA, ("wrong mode %u", vap->iv_opmode)); capinfo = IEEE80211_CAPINFO_ESS; if (vap->iv_flags & IEEE80211_F_PRIVACY) capinfo |= IEEE80211_CAPINFO_PRIVACY; /* * NB: Some 11a AP's reject the request when * short preamble is set. */ if ((ic->ic_flags & IEEE80211_F_SHPREAMBLE) && IEEE80211_IS_CHAN_2GHZ(ic->ic_curchan)) capinfo |= IEEE80211_CAPINFO_SHORT_PREAMBLE; if (IEEE80211_IS_CHAN_ANYG(ic->ic_curchan) && (ic->ic_caps & IEEE80211_C_SHSLOT)) capinfo |= IEEE80211_CAPINFO_SHORT_SLOTTIME; if ((ni->ni_capinfo & IEEE80211_CAPINFO_SPECTRUM_MGMT) && (vap->iv_flags & IEEE80211_F_DOTH)) capinfo |= IEEE80211_CAPINFO_SPECTRUM_MGMT; *(uint16_t *)frm = htole16(capinfo); frm += 2; KASSERT(bss->ni_intval != 0, ("beacon interval is zero!")); *(uint16_t *)frm = htole16(howmany(ic->ic_lintval, bss->ni_intval)); frm += 2; if (type == IEEE80211_FC0_SUBTYPE_REASSOC_REQ) { IEEE80211_ADDR_COPY(frm, bss->ni_bssid); frm += IEEE80211_ADDR_LEN; } frm = ieee80211_add_ssid(frm, ni->ni_essid, ni->ni_esslen); frm = ieee80211_add_rates(frm, &ni->ni_rates); frm = ieee80211_add_rsn(frm, vap); frm = ieee80211_add_xrates(frm, &ni->ni_rates); if (capinfo & IEEE80211_CAPINFO_SPECTRUM_MGMT) { frm = ieee80211_add_powercapability(frm, ic->ic_curchan); frm = ieee80211_add_supportedchannels(frm, ic); } /* * Check the channel - we may be using an 11n NIC with an * 11n capable station, but we're configured to be an 11b * channel. */ if ((vap->iv_flags_ht & IEEE80211_FHT_HT) && IEEE80211_IS_CHAN_HT(ni->ni_chan) && ni->ni_ies.htcap_ie != NULL && ni->ni_ies.htcap_ie[0] == IEEE80211_ELEMID_HTCAP) { frm = ieee80211_add_htcap(frm, ni); } if ((vap->iv_flags_vht & IEEE80211_FVHT_VHT) && IEEE80211_IS_CHAN_VHT(ni->ni_chan) && ni->ni_ies.vhtcap_ie != NULL && ni->ni_ies.vhtcap_ie[0] == IEEE80211_ELEMID_VHT_CAP) { frm = ieee80211_add_vhtcap(frm, ni); } frm = ieee80211_add_wpa(frm, vap); if ((ic->ic_flags & IEEE80211_F_WME) && ni->ni_ies.wme_ie != NULL) frm = ieee80211_add_wme_info(frm, &ic->ic_wme); /* * Same deal - only send HT info if we're on an 11n * capable channel. */ if ((vap->iv_flags_ht & IEEE80211_FHT_HT) && IEEE80211_IS_CHAN_HT(ni->ni_chan) && ni->ni_ies.htcap_ie != NULL && ni->ni_ies.htcap_ie[0] == IEEE80211_ELEMID_VENDOR) { frm = ieee80211_add_htcap_vendor(frm, ni); } #ifdef IEEE80211_SUPPORT_SUPERG if (IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS)) { frm = ieee80211_add_ath(frm, IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS), ((vap->iv_flags & IEEE80211_F_WPA) == 0 && ni->ni_authmode != IEEE80211_AUTH_8021X) ? vap->iv_def_txkey : IEEE80211_KEYIX_NONE); } #endif /* IEEE80211_SUPPORT_SUPERG */ if (vap->iv_appie_assocreq != NULL) frm = add_appie(frm, vap->iv_appie_assocreq); m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); ieee80211_add_callback(m, ieee80211_tx_mgt_cb, (void *) vap->iv_state); break; case IEEE80211_FC0_SUBTYPE_ASSOC_RESP: case IEEE80211_FC0_SUBTYPE_REASSOC_RESP: /* * asresp frame format * [2] capability information * [2] status * [2] association ID * [tlv] supported rates * [tlv] extended supported rates * [tlv] HT capabilities (standard, if STA enabled) * [tlv] HT information (standard, if STA enabled) * [tlv] VHT capabilities (standard, if STA enabled) * [tlv] VHT information (standard, if STA enabled) * [tlv] WME (if configured and STA enabled) * [tlv] HT capabilities (vendor OUI, if STA enabled) * [tlv] HT information (vendor OUI, if STA enabled) * [tlv] Atheros capabilities (if STA enabled) * [tlv] AppIE's (optional) */ m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint16_t) + 2 + IEEE80211_RATE_SIZE + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE) + sizeof(struct ieee80211_ie_htcap) + 4 + sizeof(struct ieee80211_ie_htinfo) + 4 + sizeof(struct ieee80211_ie_vhtcap) + sizeof(struct ieee80211_ie_vht_operation) + sizeof(struct ieee80211_wme_param) #ifdef IEEE80211_SUPPORT_SUPERG + sizeof(struct ieee80211_ath_ie) #endif + (vap->iv_appie_assocresp != NULL ? vap->iv_appie_assocresp->ie_len : 0) ); if (m == NULL) senderr(ENOMEM, is_tx_nobuf); capinfo = ieee80211_getcapinfo(vap, bss->ni_chan); *(uint16_t *)frm = htole16(capinfo); frm += 2; *(uint16_t *)frm = htole16(arg); /* status */ frm += 2; if (arg == IEEE80211_STATUS_SUCCESS) { *(uint16_t *)frm = htole16(ni->ni_associd); IEEE80211_NODE_STAT(ni, tx_assoc); } else IEEE80211_NODE_STAT(ni, tx_assoc_fail); frm += 2; frm = ieee80211_add_rates(frm, &ni->ni_rates); frm = ieee80211_add_xrates(frm, &ni->ni_rates); /* NB: respond according to what we received */ if ((ni->ni_flags & HTFLAGS) == IEEE80211_NODE_HT) { frm = ieee80211_add_htcap(frm, ni); frm = ieee80211_add_htinfo(frm, ni); } if ((vap->iv_flags & IEEE80211_F_WME) && ni->ni_ies.wme_ie != NULL) frm = ieee80211_add_wme_param(frm, &ic->ic_wme); if ((ni->ni_flags & HTFLAGS) == HTFLAGS) { frm = ieee80211_add_htcap_vendor(frm, ni); frm = ieee80211_add_htinfo_vendor(frm, ni); } if (ni->ni_flags & IEEE80211_NODE_VHT) { frm = ieee80211_add_vhtcap(frm, ni); frm = ieee80211_add_vhtinfo(frm, ni); } #ifdef IEEE80211_SUPPORT_SUPERG if (IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS)) frm = ieee80211_add_ath(frm, IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS), ((vap->iv_flags & IEEE80211_F_WPA) == 0 && ni->ni_authmode != IEEE80211_AUTH_8021X) ? vap->iv_def_txkey : IEEE80211_KEYIX_NONE); #endif /* IEEE80211_SUPPORT_SUPERG */ if (vap->iv_appie_assocresp != NULL) frm = add_appie(frm, vap->iv_appie_assocresp); m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); break; case IEEE80211_FC0_SUBTYPE_DISASSOC: IEEE80211_NOTE(vap, IEEE80211_MSG_ASSOC, ni, "send station disassociate (reason: %d (%s))", arg, ieee80211_reason_to_string(arg)); m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), sizeof(uint16_t)); if (m == NULL) senderr(ENOMEM, is_tx_nobuf); *(uint16_t *)frm = htole16(arg); /* reason */ m->m_pkthdr.len = m->m_len = sizeof(uint16_t); IEEE80211_NODE_STAT(ni, tx_disassoc); IEEE80211_NODE_STAT_SET(ni, tx_disassoc_code, arg); break; default: IEEE80211_NOTE(vap, IEEE80211_MSG_ANY, ni, "invalid mgmt frame type %u", type); senderr(EINVAL, is_tx_unknownmgt); /* NOTREACHED */ } /* NB: force non-ProbeResp frames to the highest queue */ params.ibp_pri = WME_AC_VO; params.ibp_rate0 = bss->ni_txparms->mgmtrate; /* NB: we know all frames are unicast */ params.ibp_try0 = bss->ni_txparms->maxretry; params.ibp_power = bss->ni_txpower; return ieee80211_mgmt_output(ni, m, type, ¶ms); bad: ieee80211_free_node(ni); return ret; #undef senderr #undef HTFLAGS } /* * Return an mbuf with a probe response frame in it. * Space is left to prepend and 802.11 header at the * front but it's left to the caller to fill in. */ struct mbuf * ieee80211_alloc_proberesp(struct ieee80211_node *bss, int legacy) { struct ieee80211vap *vap = bss->ni_vap; struct ieee80211com *ic = bss->ni_ic; const struct ieee80211_rateset *rs; struct mbuf *m; uint16_t capinfo; uint8_t *frm; /* * probe response frame format * [8] time stamp * [2] beacon interval * [2] cabability information * [tlv] ssid * [tlv] supported rates * [tlv] parameter set (FH/DS) * [tlv] parameter set (IBSS) * [tlv] country (optional) * [3] power control (optional) * [5] channel switch announcement (CSA) (optional) * [tlv] extended rate phy (ERP) * [tlv] extended supported rates * [tlv] RSN (optional) * [tlv] HT capabilities * [tlv] HT information * [tlv] VHT capabilities * [tlv] VHT information * [tlv] WPA (optional) * [tlv] WME (optional) * [tlv] Vendor OUI HT capabilities (optional) * [tlv] Vendor OUI HT information (optional) * [tlv] Atheros capabilities * [tlv] AppIE's (optional) * [tlv] Mesh ID (MBSS) * [tlv] Mesh Conf (MBSS) */ m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), 8 + sizeof(uint16_t) + sizeof(uint16_t) + 2 + IEEE80211_NWID_LEN + 2 + IEEE80211_RATE_SIZE + 7 /* max(7,3) */ + IEEE80211_COUNTRY_MAX_SIZE + 3 + sizeof(struct ieee80211_csa_ie) + sizeof(struct ieee80211_quiet_ie) + 3 + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE) + sizeof(struct ieee80211_ie_wpa) + sizeof(struct ieee80211_ie_htcap) + sizeof(struct ieee80211_ie_htinfo) + sizeof(struct ieee80211_ie_wpa) + sizeof(struct ieee80211_wme_param) + 4 + sizeof(struct ieee80211_ie_htcap) + 4 + sizeof(struct ieee80211_ie_htinfo) + sizeof(struct ieee80211_ie_vhtcap) + sizeof(struct ieee80211_ie_vht_operation) #ifdef IEEE80211_SUPPORT_SUPERG + sizeof(struct ieee80211_ath_ie) #endif #ifdef IEEE80211_SUPPORT_MESH + 2 + IEEE80211_MESHID_LEN + sizeof(struct ieee80211_meshconf_ie) #endif + (vap->iv_appie_proberesp != NULL ? vap->iv_appie_proberesp->ie_len : 0) ); if (m == NULL) { vap->iv_stats.is_tx_nobuf++; return NULL; } memset(frm, 0, 8); /* timestamp should be filled later */ frm += 8; *(uint16_t *)frm = htole16(bss->ni_intval); frm += 2; capinfo = ieee80211_getcapinfo(vap, bss->ni_chan); *(uint16_t *)frm = htole16(capinfo); frm += 2; frm = ieee80211_add_ssid(frm, bss->ni_essid, bss->ni_esslen); rs = ieee80211_get_suprates(ic, bss->ni_chan); frm = ieee80211_add_rates(frm, rs); if (IEEE80211_IS_CHAN_FHSS(bss->ni_chan)) { *frm++ = IEEE80211_ELEMID_FHPARMS; *frm++ = 5; *frm++ = bss->ni_fhdwell & 0x00ff; *frm++ = (bss->ni_fhdwell >> 8) & 0x00ff; *frm++ = IEEE80211_FH_CHANSET( ieee80211_chan2ieee(ic, bss->ni_chan)); *frm++ = IEEE80211_FH_CHANPAT( ieee80211_chan2ieee(ic, bss->ni_chan)); *frm++ = bss->ni_fhindex; } else { *frm++ = IEEE80211_ELEMID_DSPARMS; *frm++ = 1; *frm++ = ieee80211_chan2ieee(ic, bss->ni_chan); } if (vap->iv_opmode == IEEE80211_M_IBSS) { *frm++ = IEEE80211_ELEMID_IBSSPARMS; *frm++ = 2; *frm++ = 0; *frm++ = 0; /* TODO: ATIM window */ } if ((vap->iv_flags & IEEE80211_F_DOTH) || (vap->iv_flags_ext & IEEE80211_FEXT_DOTD)) frm = ieee80211_add_countryie(frm, ic); if (vap->iv_flags & IEEE80211_F_DOTH) { if (IEEE80211_IS_CHAN_5GHZ(bss->ni_chan)) frm = ieee80211_add_powerconstraint(frm, vap); if (ic->ic_flags & IEEE80211_F_CSAPENDING) frm = ieee80211_add_csa(frm, vap); } if (vap->iv_flags & IEEE80211_F_DOTH) { if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) && (vap->iv_flags_ext & IEEE80211_FEXT_DFS)) { if (vap->iv_quiet) frm = ieee80211_add_quiet(frm, vap, 0); } } if (IEEE80211_IS_CHAN_ANYG(bss->ni_chan)) frm = ieee80211_add_erp(frm, ic); frm = ieee80211_add_xrates(frm, rs); frm = ieee80211_add_rsn(frm, vap); /* * NB: legacy 11b clients do not get certain ie's. * The caller identifies such clients by passing * a token in legacy to us. Could expand this to be * any legacy client for stuff like HT ie's. */ if (IEEE80211_IS_CHAN_HT(bss->ni_chan) && legacy != IEEE80211_SEND_LEGACY_11B) { frm = ieee80211_add_htcap(frm, bss); frm = ieee80211_add_htinfo(frm, bss); } if (IEEE80211_IS_CHAN_VHT(bss->ni_chan) && legacy != IEEE80211_SEND_LEGACY_11B) { frm = ieee80211_add_vhtcap(frm, bss); frm = ieee80211_add_vhtinfo(frm, bss); } frm = ieee80211_add_wpa(frm, vap); if (vap->iv_flags & IEEE80211_F_WME) frm = ieee80211_add_wme_param(frm, &ic->ic_wme); if (IEEE80211_IS_CHAN_HT(bss->ni_chan) && (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT) && legacy != IEEE80211_SEND_LEGACY_11B) { frm = ieee80211_add_htcap_vendor(frm, bss); frm = ieee80211_add_htinfo_vendor(frm, bss); } #ifdef IEEE80211_SUPPORT_SUPERG if ((vap->iv_flags & IEEE80211_F_ATHEROS) && legacy != IEEE80211_SEND_LEGACY_11B) frm = ieee80211_add_athcaps(frm, bss); #endif if (vap->iv_appie_proberesp != NULL) frm = add_appie(frm, vap->iv_appie_proberesp); #ifdef IEEE80211_SUPPORT_MESH if (vap->iv_opmode == IEEE80211_M_MBSS) { frm = ieee80211_add_meshid(frm, vap); frm = ieee80211_add_meshconf(frm, vap); } #endif m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); return m; } /* * Send a probe response frame to the specified mac address. * This does not go through the normal mgt frame api so we * can specify the destination address and re-use the bss node * for the sta reference. */ int ieee80211_send_proberesp(struct ieee80211vap *vap, const uint8_t da[IEEE80211_ADDR_LEN], int legacy) { struct ieee80211_node *bss = vap->iv_bss; struct ieee80211com *ic = vap->iv_ic; struct mbuf *m; int ret; if (vap->iv_state == IEEE80211_S_CAC) { IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, bss, "block %s frame in CAC state", "probe response"); vap->iv_stats.is_tx_badstate++; return EIO; /* XXX */ } /* * Hold a reference on the node so it doesn't go away until after * the xmit is complete all the way in the driver. On error we * will remove our reference. */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE, "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__, bss, ether_sprintf(bss->ni_macaddr), ieee80211_node_refcnt(bss)+1); ieee80211_ref_node(bss); m = ieee80211_alloc_proberesp(bss, legacy); if (m == NULL) { ieee80211_free_node(bss); return ENOMEM; } M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT); KASSERT(m != NULL, ("no room for header")); IEEE80211_TX_LOCK(ic); ieee80211_send_setup(bss, m, IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_PROBE_RESP, IEEE80211_NONQOS_TID, vap->iv_myaddr, da, bss->ni_bssid); /* XXX power management? */ m->m_flags |= M_ENCAP; /* mark encapsulated */ M_WME_SETAC(m, WME_AC_BE); IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS, "send probe resp on channel %u to %s%s\n", ieee80211_chan2ieee(ic, ic->ic_curchan), ether_sprintf(da), legacy ? " " : ""); IEEE80211_NODE_STAT(bss, tx_mgmt); ret = ieee80211_raw_output(vap, bss, m, NULL); IEEE80211_TX_UNLOCK(ic); return (ret); } /* * Allocate and build a RTS (Request To Send) control frame. */ struct mbuf * ieee80211_alloc_rts(struct ieee80211com *ic, const uint8_t ra[IEEE80211_ADDR_LEN], const uint8_t ta[IEEE80211_ADDR_LEN], uint16_t dur) { struct ieee80211_frame_rts *rts; struct mbuf *m; /* XXX honor ic_headroom */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m != NULL) { rts = mtod(m, struct ieee80211_frame_rts *); rts->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_CTL | IEEE80211_FC0_SUBTYPE_RTS; rts->i_fc[1] = IEEE80211_FC1_DIR_NODS; *(u_int16_t *)rts->i_dur = htole16(dur); IEEE80211_ADDR_COPY(rts->i_ra, ra); IEEE80211_ADDR_COPY(rts->i_ta, ta); m->m_pkthdr.len = m->m_len = sizeof(struct ieee80211_frame_rts); } return m; } /* * Allocate and build a CTS (Clear To Send) control frame. */ struct mbuf * ieee80211_alloc_cts(struct ieee80211com *ic, const uint8_t ra[IEEE80211_ADDR_LEN], uint16_t dur) { struct ieee80211_frame_cts *cts; struct mbuf *m; /* XXX honor ic_headroom */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m != NULL) { cts = mtod(m, struct ieee80211_frame_cts *); cts->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_CTL | IEEE80211_FC0_SUBTYPE_CTS; cts->i_fc[1] = IEEE80211_FC1_DIR_NODS; *(u_int16_t *)cts->i_dur = htole16(dur); IEEE80211_ADDR_COPY(cts->i_ra, ra); m->m_pkthdr.len = m->m_len = sizeof(struct ieee80211_frame_cts); } return m; } /* * Wrapper for CTS/RTS frame allocation. */ struct mbuf * ieee80211_alloc_prot(struct ieee80211_node *ni, const struct mbuf *m, uint8_t rate, int prot) { struct ieee80211com *ic = ni->ni_ic; const struct ieee80211_frame *wh; struct mbuf *mprot; uint16_t dur; int pktlen, isshort; KASSERT(prot == IEEE80211_PROT_RTSCTS || prot == IEEE80211_PROT_CTSONLY, ("wrong protection type %d", prot)); wh = mtod(m, const struct ieee80211_frame *); pktlen = m->m_pkthdr.len + IEEE80211_CRC_LEN; isshort = (ic->ic_flags & IEEE80211_F_SHPREAMBLE) != 0; dur = ieee80211_compute_duration(ic->ic_rt, pktlen, rate, isshort) + ieee80211_ack_duration(ic->ic_rt, rate, isshort); if (prot == IEEE80211_PROT_RTSCTS) { /* NB: CTS is the same size as an ACK */ dur += ieee80211_ack_duration(ic->ic_rt, rate, isshort); mprot = ieee80211_alloc_rts(ic, wh->i_addr1, wh->i_addr2, dur); } else mprot = ieee80211_alloc_cts(ic, ni->ni_vap->iv_myaddr, dur); return (mprot); } static void ieee80211_tx_mgt_timeout(void *arg) { struct ieee80211vap *vap = arg; IEEE80211_LOCK(vap->iv_ic); if (vap->iv_state != IEEE80211_S_INIT && (vap->iv_ic->ic_flags & IEEE80211_F_SCAN) == 0) { /* * NB: it's safe to specify a timeout as the reason here; * it'll only be used in the right state. */ ieee80211_new_state_locked(vap, IEEE80211_S_SCAN, IEEE80211_SCAN_FAIL_TIMEOUT); } IEEE80211_UNLOCK(vap->iv_ic); } /* * This is the callback set on net80211-sourced transmitted * authentication request frames. * * This does a couple of things: * * + If the frame transmitted was a success, it schedules a future * event which will transition the interface to scan. * If a state transition _then_ occurs before that event occurs, * said state transition will cancel this callout. * * + If the frame transmit was a failure, it immediately schedules * the transition back to scan. */ static void ieee80211_tx_mgt_cb(struct ieee80211_node *ni, void *arg, int status) { struct ieee80211vap *vap = ni->ni_vap; enum ieee80211_state ostate = (enum ieee80211_state) arg; /* * Frame transmit completed; arrange timer callback. If * transmit was successfully we wait for response. Otherwise * we arrange an immediate callback instead of doing the * callback directly since we don't know what state the driver * is in (e.g. what locks it is holding). This work should * not be too time-critical and not happen too often so the * added overhead is acceptable. * * XXX what happens if !acked but response shows up before callback? */ if (vap->iv_state == ostate) { callout_reset(&vap->iv_mgtsend, status == 0 ? IEEE80211_TRANS_WAIT*hz : 0, ieee80211_tx_mgt_timeout, vap); } } static void ieee80211_beacon_construct(struct mbuf *m, uint8_t *frm, struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_beacon_offsets *bo = &vap->iv_bcn_off; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_rateset *rs = &ni->ni_rates; uint16_t capinfo; /* * beacon frame format * * TODO: update to 802.11-2012; a lot of stuff has changed; * vendor extensions should be at the end, etc. * * [8] time stamp * [2] beacon interval * [2] cabability information * [tlv] ssid * [tlv] supported rates * [3] parameter set (DS) * [8] CF parameter set (optional) * [tlv] parameter set (IBSS/TIM) * [tlv] country (optional) * [3] power control (optional) * [5] channel switch announcement (CSA) (optional) * XXX TODO: Quiet * XXX TODO: IBSS DFS * XXX TODO: TPC report * [tlv] extended rate phy (ERP) * [tlv] extended supported rates * [tlv] RSN parameters * XXX TODO: BSSLOAD * (XXX EDCA parameter set, QoS capability?) * XXX TODO: AP channel report * * [tlv] HT capabilities * [tlv] HT information * XXX TODO: 20/40 BSS coexistence * Mesh: * XXX TODO: Meshid * XXX TODO: mesh config * XXX TODO: mesh awake window * XXX TODO: beacon timing (mesh, etc) * XXX TODO: MCCAOP Advertisement Overview * XXX TODO: MCCAOP Advertisement * XXX TODO: Mesh channel switch parameters * VHT: * XXX TODO: VHT capabilities * XXX TODO: VHT operation * XXX TODO: VHT transmit power envelope * XXX TODO: channel switch wrapper element * XXX TODO: extended BSS load element * * XXX Vendor-specific OIDs (e.g. Atheros) * [tlv] WPA parameters * [tlv] WME parameters * [tlv] Vendor OUI HT capabilities (optional) * [tlv] Vendor OUI HT information (optional) * [tlv] Atheros capabilities (optional) * [tlv] TDMA parameters (optional) * [tlv] Mesh ID (MBSS) * [tlv] Mesh Conf (MBSS) * [tlv] application data (optional) */ memset(bo, 0, sizeof(*bo)); memset(frm, 0, 8); /* XXX timestamp is set by hardware/driver */ frm += 8; *(uint16_t *)frm = htole16(ni->ni_intval); frm += 2; capinfo = ieee80211_getcapinfo(vap, ni->ni_chan); bo->bo_caps = (uint16_t *)frm; *(uint16_t *)frm = htole16(capinfo); frm += 2; *frm++ = IEEE80211_ELEMID_SSID; if ((vap->iv_flags & IEEE80211_F_HIDESSID) == 0) { *frm++ = ni->ni_esslen; memcpy(frm, ni->ni_essid, ni->ni_esslen); frm += ni->ni_esslen; } else *frm++ = 0; frm = ieee80211_add_rates(frm, rs); if (!IEEE80211_IS_CHAN_FHSS(ni->ni_chan)) { *frm++ = IEEE80211_ELEMID_DSPARMS; *frm++ = 1; *frm++ = ieee80211_chan2ieee(ic, ni->ni_chan); } if (ic->ic_flags & IEEE80211_F_PCF) { bo->bo_cfp = frm; frm = ieee80211_add_cfparms(frm, ic); } bo->bo_tim = frm; if (vap->iv_opmode == IEEE80211_M_IBSS) { *frm++ = IEEE80211_ELEMID_IBSSPARMS; *frm++ = 2; *frm++ = 0; *frm++ = 0; /* TODO: ATIM window */ bo->bo_tim_len = 0; } else if (vap->iv_opmode == IEEE80211_M_HOSTAP || vap->iv_opmode == IEEE80211_M_MBSS) { /* TIM IE is the same for Mesh and Hostap */ struct ieee80211_tim_ie *tie = (struct ieee80211_tim_ie *) frm; tie->tim_ie = IEEE80211_ELEMID_TIM; tie->tim_len = 4; /* length */ tie->tim_count = 0; /* DTIM count */ tie->tim_period = vap->iv_dtim_period; /* DTIM period */ tie->tim_bitctl = 0; /* bitmap control */ tie->tim_bitmap[0] = 0; /* Partial Virtual Bitmap */ frm += sizeof(struct ieee80211_tim_ie); bo->bo_tim_len = 1; } bo->bo_tim_trailer = frm; if ((vap->iv_flags & IEEE80211_F_DOTH) || (vap->iv_flags_ext & IEEE80211_FEXT_DOTD)) frm = ieee80211_add_countryie(frm, ic); if (vap->iv_flags & IEEE80211_F_DOTH) { if (IEEE80211_IS_CHAN_5GHZ(ni->ni_chan)) frm = ieee80211_add_powerconstraint(frm, vap); bo->bo_csa = frm; if (ic->ic_flags & IEEE80211_F_CSAPENDING) frm = ieee80211_add_csa(frm, vap); } else bo->bo_csa = frm; bo->bo_quiet = NULL; if (vap->iv_flags & IEEE80211_F_DOTH) { if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) && (vap->iv_flags_ext & IEEE80211_FEXT_DFS) && (vap->iv_quiet == 1)) { /* * We only insert the quiet IE offset if * the quiet IE is enabled. Otherwise don't * put it here or we'll just overwrite * some other beacon contents. */ if (vap->iv_quiet) { bo->bo_quiet = frm; frm = ieee80211_add_quiet(frm,vap, 0); } } } if (IEEE80211_IS_CHAN_ANYG(ni->ni_chan)) { bo->bo_erp = frm; frm = ieee80211_add_erp(frm, ic); } frm = ieee80211_add_xrates(frm, rs); frm = ieee80211_add_rsn(frm, vap); if (IEEE80211_IS_CHAN_HT(ni->ni_chan)) { frm = ieee80211_add_htcap(frm, ni); bo->bo_htinfo = frm; frm = ieee80211_add_htinfo(frm, ni); } if (IEEE80211_IS_CHAN_VHT(ni->ni_chan)) { frm = ieee80211_add_vhtcap(frm, ni); bo->bo_vhtinfo = frm; frm = ieee80211_add_vhtinfo(frm, ni); /* Transmit power envelope */ /* Channel switch wrapper element */ /* Extended bss load element */ } frm = ieee80211_add_wpa(frm, vap); if (vap->iv_flags & IEEE80211_F_WME) { bo->bo_wme = frm; frm = ieee80211_add_wme_param(frm, &ic->ic_wme); } if (IEEE80211_IS_CHAN_HT(ni->ni_chan) && (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT)) { frm = ieee80211_add_htcap_vendor(frm, ni); frm = ieee80211_add_htinfo_vendor(frm, ni); } #ifdef IEEE80211_SUPPORT_SUPERG if (vap->iv_flags & IEEE80211_F_ATHEROS) { bo->bo_ath = frm; frm = ieee80211_add_athcaps(frm, ni); } #endif #ifdef IEEE80211_SUPPORT_TDMA if (vap->iv_caps & IEEE80211_C_TDMA) { bo->bo_tdma = frm; frm = ieee80211_add_tdma(frm, vap); } #endif if (vap->iv_appie_beacon != NULL) { bo->bo_appie = frm; bo->bo_appie_len = vap->iv_appie_beacon->ie_len; frm = add_appie(frm, vap->iv_appie_beacon); } /* XXX TODO: move meshid/meshconf up to before vendor extensions? */ #ifdef IEEE80211_SUPPORT_MESH if (vap->iv_opmode == IEEE80211_M_MBSS) { frm = ieee80211_add_meshid(frm, vap); bo->bo_meshconf = frm; frm = ieee80211_add_meshconf(frm, vap); } #endif bo->bo_tim_trailer_len = frm - bo->bo_tim_trailer; bo->bo_csa_trailer_len = frm - bo->bo_csa; m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *); } /* * Allocate a beacon frame and fillin the appropriate bits. */ struct mbuf * ieee80211_beacon_alloc(struct ieee80211_node *ni) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ifnet *ifp = vap->iv_ifp; struct ieee80211_frame *wh; struct mbuf *m; int pktlen; uint8_t *frm; /* * Update the "We're putting the quiet IE in the beacon" state. */ if (vap->iv_quiet == 1) vap->iv_flags_ext |= IEEE80211_FEXT_QUIET_IE; else if (vap->iv_quiet == 0) vap->iv_flags_ext &= ~IEEE80211_FEXT_QUIET_IE; /* * beacon frame format * * Note: This needs updating for 802.11-2012. * * [8] time stamp * [2] beacon interval * [2] cabability information * [tlv] ssid * [tlv] supported rates * [3] parameter set (DS) * [8] CF parameter set (optional) * [tlv] parameter set (IBSS/TIM) * [tlv] country (optional) * [3] power control (optional) * [5] channel switch announcement (CSA) (optional) * [tlv] extended rate phy (ERP) * [tlv] extended supported rates * [tlv] RSN parameters * [tlv] HT capabilities * [tlv] HT information * [tlv] VHT capabilities * [tlv] VHT operation * [tlv] Vendor OUI HT capabilities (optional) * [tlv] Vendor OUI HT information (optional) * XXX Vendor-specific OIDs (e.g. Atheros) * [tlv] WPA parameters * [tlv] WME parameters * [tlv] TDMA parameters (optional) * [tlv] Mesh ID (MBSS) * [tlv] Mesh Conf (MBSS) * [tlv] application data (optional) * NB: we allocate the max space required for the TIM bitmap. * XXX how big is this? */ pktlen = 8 /* time stamp */ + sizeof(uint16_t) /* beacon interval */ + sizeof(uint16_t) /* capabilities */ + 2 + ni->ni_esslen /* ssid */ + 2 + IEEE80211_RATE_SIZE /* supported rates */ + 2 + 1 /* DS parameters */ + 2 + 6 /* CF parameters */ + 2 + 4 + vap->iv_tim_len /* DTIM/IBSSPARMS */ + IEEE80211_COUNTRY_MAX_SIZE /* country */ + 2 + 1 /* power control */ + sizeof(struct ieee80211_csa_ie) /* CSA */ + sizeof(struct ieee80211_quiet_ie) /* Quiet */ + 2 + 1 /* ERP */ + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE) + (vap->iv_caps & IEEE80211_C_WPA ? /* WPA 1+2 */ 2*sizeof(struct ieee80211_ie_wpa) : 0) /* XXX conditional? */ + 4+2*sizeof(struct ieee80211_ie_htcap)/* HT caps */ + 4+2*sizeof(struct ieee80211_ie_htinfo)/* HT info */ + sizeof(struct ieee80211_ie_vhtcap)/* VHT caps */ + sizeof(struct ieee80211_ie_vht_operation)/* VHT info */ + (vap->iv_caps & IEEE80211_C_WME ? /* WME */ sizeof(struct ieee80211_wme_param) : 0) #ifdef IEEE80211_SUPPORT_SUPERG + sizeof(struct ieee80211_ath_ie) /* ATH */ #endif #ifdef IEEE80211_SUPPORT_TDMA + (vap->iv_caps & IEEE80211_C_TDMA ? /* TDMA */ sizeof(struct ieee80211_tdma_param) : 0) #endif #ifdef IEEE80211_SUPPORT_MESH + 2 + ni->ni_meshidlen + sizeof(struct ieee80211_meshconf_ie) #endif + IEEE80211_MAX_APPIE ; m = ieee80211_getmgtframe(&frm, ic->ic_headroom + sizeof(struct ieee80211_frame), pktlen); if (m == NULL) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_ANY, "%s: cannot get buf; size %u\n", __func__, pktlen); vap->iv_stats.is_tx_nobuf++; return NULL; } ieee80211_beacon_construct(m, frm, ni); M_PREPEND(m, sizeof(struct ieee80211_frame), M_NOWAIT); KASSERT(m != NULL, ("no space for 802.11 header?")); wh = mtod(m, struct ieee80211_frame *); wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_BEACON; wh->i_fc[1] = IEEE80211_FC1_DIR_NODS; *(uint16_t *)wh->i_dur = 0; IEEE80211_ADDR_COPY(wh->i_addr1, ifp->if_broadcastaddr); IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr); IEEE80211_ADDR_COPY(wh->i_addr3, ni->ni_bssid); *(uint16_t *)wh->i_seq = 0; return m; } /* * Update the dynamic parts of a beacon frame based on the current state. */ int ieee80211_beacon_update(struct ieee80211_node *ni, struct mbuf *m, int mcast) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211_beacon_offsets *bo = &vap->iv_bcn_off; struct ieee80211com *ic = ni->ni_ic; int len_changed = 0; uint16_t capinfo; struct ieee80211_frame *wh; ieee80211_seq seqno; IEEE80211_LOCK(ic); /* * Handle 11h channel change when we've reached the count. * We must recalculate the beacon frame contents to account * for the new channel. Note we do this only for the first * vap that reaches this point; subsequent vaps just update * their beacon state to reflect the recalculated channel. */ if (isset(bo->bo_flags, IEEE80211_BEACON_CSA) && vap->iv_csa_count == ic->ic_csa_count) { vap->iv_csa_count = 0; /* * Effect channel change before reconstructing the beacon * frame contents as many places reference ni_chan. */ if (ic->ic_csa_newchan != NULL) ieee80211_csa_completeswitch(ic); /* * NB: ieee80211_beacon_construct clears all pending * updates in bo_flags so we don't need to explicitly * clear IEEE80211_BEACON_CSA. */ ieee80211_beacon_construct(m, mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni); /* XXX do WME aggressive mode processing? */ IEEE80211_UNLOCK(ic); return 1; /* just assume length changed */ } /* * Handle the quiet time element being added and removed. * Again, for now we just cheat and reconstruct the whole * beacon - that way the gap is provided as appropriate. * * So, track whether we have already added the IE versus * whether we want to be adding the IE. */ if ((vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE) && (vap->iv_quiet == 0)) { /* * Quiet time beacon IE enabled, but it's disabled; * recalc */ vap->iv_flags_ext &= ~IEEE80211_FEXT_QUIET_IE; ieee80211_beacon_construct(m, mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni); /* XXX do WME aggressive mode processing? */ IEEE80211_UNLOCK(ic); return 1; /* just assume length changed */ } if (((vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE) == 0) && (vap->iv_quiet == 1)) { /* * Quiet time beacon IE disabled, but it's now enabled; * recalc */ vap->iv_flags_ext |= IEEE80211_FEXT_QUIET_IE; ieee80211_beacon_construct(m, mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni); /* XXX do WME aggressive mode processing? */ IEEE80211_UNLOCK(ic); return 1; /* just assume length changed */ } wh = mtod(m, struct ieee80211_frame *); /* * XXX TODO Strictly speaking this should be incremented with the TX * lock held so as to serialise access to the non-qos TID sequence * number space. * * If the driver identifies it does its own TX seqno management then * we can skip this (and still not do the TX seqno.) */ seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++; *(uint16_t *)&wh->i_seq[0] = htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT); M_SEQNO_SET(m, seqno); /* XXX faster to recalculate entirely or just changes? */ capinfo = ieee80211_getcapinfo(vap, ni->ni_chan); *bo->bo_caps = htole16(capinfo); if (vap->iv_flags & IEEE80211_F_WME) { struct ieee80211_wme_state *wme = &ic->ic_wme; /* * Check for aggressive mode change. When there is * significant high priority traffic in the BSS * throttle back BE traffic by using conservative * parameters. Otherwise BE uses aggressive params * to optimize performance of legacy/non-QoS traffic. */ if (wme->wme_flags & WME_F_AGGRMODE) { if (wme->wme_hipri_traffic > wme->wme_hipri_switch_thresh) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME, "%s: traffic %u, disable aggressive mode\n", __func__, wme->wme_hipri_traffic); wme->wme_flags &= ~WME_F_AGGRMODE; ieee80211_wme_updateparams_locked(vap); wme->wme_hipri_traffic = wme->wme_hipri_switch_hysteresis; } else wme->wme_hipri_traffic = 0; } else { if (wme->wme_hipri_traffic <= wme->wme_hipri_switch_thresh) { IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME, "%s: traffic %u, enable aggressive mode\n", __func__, wme->wme_hipri_traffic); wme->wme_flags |= WME_F_AGGRMODE; ieee80211_wme_updateparams_locked(vap); wme->wme_hipri_traffic = 0; } else wme->wme_hipri_traffic = wme->wme_hipri_switch_hysteresis; } if (isset(bo->bo_flags, IEEE80211_BEACON_WME)) { (void) ieee80211_add_wme_param(bo->bo_wme, wme); clrbit(bo->bo_flags, IEEE80211_BEACON_WME); } } if (isset(bo->bo_flags, IEEE80211_BEACON_HTINFO)) { ieee80211_ht_update_beacon(vap, bo); clrbit(bo->bo_flags, IEEE80211_BEACON_HTINFO); } #ifdef IEEE80211_SUPPORT_TDMA if (vap->iv_caps & IEEE80211_C_TDMA) { /* * NB: the beacon is potentially updated every TBTT. */ ieee80211_tdma_update_beacon(vap, bo); } #endif #ifdef IEEE80211_SUPPORT_MESH if (vap->iv_opmode == IEEE80211_M_MBSS) ieee80211_mesh_update_beacon(vap, bo); #endif if (vap->iv_opmode == IEEE80211_M_HOSTAP || vap->iv_opmode == IEEE80211_M_MBSS) { /* NB: no IBSS support*/ struct ieee80211_tim_ie *tie = (struct ieee80211_tim_ie *) bo->bo_tim; if (isset(bo->bo_flags, IEEE80211_BEACON_TIM)) { u_int timlen, timoff, i; /* * ATIM/DTIM needs updating. If it fits in the * current space allocated then just copy in the * new bits. Otherwise we need to move any trailing * data to make room. Note that we know there is * contiguous space because ieee80211_beacon_allocate * insures there is space in the mbuf to write a * maximal-size virtual bitmap (based on iv_max_aid). */ /* * Calculate the bitmap size and offset, copy any * trailer out of the way, and then copy in the * new bitmap and update the information element. * Note that the tim bitmap must contain at least * one byte and any offset must be even. */ if (vap->iv_ps_pending != 0) { timoff = 128; /* impossibly large */ for (i = 0; i < vap->iv_tim_len; i++) if (vap->iv_tim_bitmap[i]) { timoff = i &~ 1; break; } KASSERT(timoff != 128, ("tim bitmap empty!")); for (i = vap->iv_tim_len-1; i >= timoff; i--) if (vap->iv_tim_bitmap[i]) break; timlen = 1 + (i - timoff); } else { timoff = 0; timlen = 1; } /* * TODO: validate this! */ if (timlen != bo->bo_tim_len) { /* copy up/down trailer */ int adjust = tie->tim_bitmap+timlen - bo->bo_tim_trailer; ovbcopy(bo->bo_tim_trailer, bo->bo_tim_trailer+adjust, bo->bo_tim_trailer_len); bo->bo_tim_trailer += adjust; bo->bo_erp += adjust; bo->bo_htinfo += adjust; bo->bo_vhtinfo += adjust; #ifdef IEEE80211_SUPPORT_SUPERG bo->bo_ath += adjust; #endif #ifdef IEEE80211_SUPPORT_TDMA bo->bo_tdma += adjust; #endif #ifdef IEEE80211_SUPPORT_MESH bo->bo_meshconf += adjust; #endif bo->bo_appie += adjust; bo->bo_wme += adjust; bo->bo_csa += adjust; bo->bo_quiet += adjust; bo->bo_tim_len = timlen; /* update information element */ tie->tim_len = 3 + timlen; tie->tim_bitctl = timoff; len_changed = 1; } memcpy(tie->tim_bitmap, vap->iv_tim_bitmap + timoff, bo->bo_tim_len); clrbit(bo->bo_flags, IEEE80211_BEACON_TIM); IEEE80211_DPRINTF(vap, IEEE80211_MSG_POWER, "%s: TIM updated, pending %u, off %u, len %u\n", __func__, vap->iv_ps_pending, timoff, timlen); } /* count down DTIM period */ if (tie->tim_count == 0) tie->tim_count = tie->tim_period - 1; else tie->tim_count--; /* update state for buffered multicast frames on DTIM */ if (mcast && tie->tim_count == 0) tie->tim_bitctl |= 1; else tie->tim_bitctl &= ~1; if (isset(bo->bo_flags, IEEE80211_BEACON_CSA)) { struct ieee80211_csa_ie *csa = (struct ieee80211_csa_ie *) bo->bo_csa; /* * Insert or update CSA ie. If we're just starting * to count down to the channel switch then we need * to insert the CSA ie. Otherwise we just need to * drop the count. The actual change happens above * when the vap's count reaches the target count. */ if (vap->iv_csa_count == 0) { memmove(&csa[1], csa, bo->bo_csa_trailer_len); bo->bo_erp += sizeof(*csa); bo->bo_htinfo += sizeof(*csa); bo->bo_vhtinfo += sizeof(*csa); bo->bo_wme += sizeof(*csa); #ifdef IEEE80211_SUPPORT_SUPERG bo->bo_ath += sizeof(*csa); #endif #ifdef IEEE80211_SUPPORT_TDMA bo->bo_tdma += sizeof(*csa); #endif #ifdef IEEE80211_SUPPORT_MESH bo->bo_meshconf += sizeof(*csa); #endif bo->bo_appie += sizeof(*csa); bo->bo_csa_trailer_len += sizeof(*csa); bo->bo_quiet += sizeof(*csa); bo->bo_tim_trailer_len += sizeof(*csa); m->m_len += sizeof(*csa); m->m_pkthdr.len += sizeof(*csa); ieee80211_add_csa(bo->bo_csa, vap); } else csa->csa_count--; vap->iv_csa_count++; /* NB: don't clear IEEE80211_BEACON_CSA */ } /* * Only add the quiet time IE if we've enabled it * as appropriate. */ if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) && (vap->iv_flags_ext & IEEE80211_FEXT_DFS)) { if (vap->iv_quiet && (vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE)) { ieee80211_add_quiet(bo->bo_quiet, vap, 1); } } if (isset(bo->bo_flags, IEEE80211_BEACON_ERP)) { /* * ERP element needs updating. */ (void) ieee80211_add_erp(bo->bo_erp, ic); clrbit(bo->bo_flags, IEEE80211_BEACON_ERP); } #ifdef IEEE80211_SUPPORT_SUPERG if (isset(bo->bo_flags, IEEE80211_BEACON_ATH)) { ieee80211_add_athcaps(bo->bo_ath, ni); clrbit(bo->bo_flags, IEEE80211_BEACON_ATH); } #endif } if (isset(bo->bo_flags, IEEE80211_BEACON_APPIE)) { const struct ieee80211_appie *aie = vap->iv_appie_beacon; int aielen; uint8_t *frm; aielen = 0; if (aie != NULL) aielen += aie->ie_len; if (aielen != bo->bo_appie_len) { /* copy up/down trailer */ int adjust = aielen - bo->bo_appie_len; ovbcopy(bo->bo_tim_trailer, bo->bo_tim_trailer+adjust, bo->bo_tim_trailer_len); bo->bo_tim_trailer += adjust; bo->bo_appie += adjust; bo->bo_appie_len = aielen; len_changed = 1; } frm = bo->bo_appie; if (aie != NULL) frm = add_appie(frm, aie); clrbit(bo->bo_flags, IEEE80211_BEACON_APPIE); } IEEE80211_UNLOCK(ic); return len_changed; } /* * Do Ethernet-LLC encapsulation for each payload in a fast frame * tunnel encapsulation. The frame is assumed to have an Ethernet * header at the front that must be stripped before prepending the * LLC followed by the Ethernet header passed in (with an Ethernet * type that specifies the payload size). */ struct mbuf * ieee80211_ff_encap1(struct ieee80211vap *vap, struct mbuf *m, const struct ether_header *eh) { struct llc *llc; uint16_t payload; /* XXX optimize by combining m_adj+M_PREPEND */ m_adj(m, sizeof(struct ether_header) - sizeof(struct llc)); llc = mtod(m, struct llc *); llc->llc_dsap = llc->llc_ssap = LLC_SNAP_LSAP; llc->llc_control = LLC_UI; llc->llc_snap.org_code[0] = 0; llc->llc_snap.org_code[1] = 0; llc->llc_snap.org_code[2] = 0; llc->llc_snap.ether_type = eh->ether_type; payload = m->m_pkthdr.len; /* NB: w/o Ethernet header */ M_PREPEND(m, sizeof(struct ether_header), M_NOWAIT); if (m == NULL) { /* XXX cannot happen */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG, "%s: no space for ether_header\n", __func__); vap->iv_stats.is_tx_nobuf++; return NULL; } ETHER_HEADER_COPY(mtod(m, void *), eh); mtod(m, struct ether_header *)->ether_type = htons(payload); return m; } /* * Complete an mbuf transmission. * * For now, this simply processes a completed frame after the * driver has completed it's transmission and/or retransmission. * It assumes the frame is an 802.11 encapsulated frame. * * Later on it will grow to become the exit path for a given frame * from the driver and, depending upon how it's been encapsulated * and already transmitted, it may end up doing A-MPDU retransmission, * power save requeuing, etc. * * In order for the above to work, the driver entry point to this * must not hold any driver locks. Thus, the driver needs to delay * any actual mbuf completion until it can release said locks. * * This frees the mbuf and if the mbuf has a node reference, * the node reference will be freed. */ void ieee80211_tx_complete(struct ieee80211_node *ni, struct mbuf *m, int status) { if (ni != NULL) { struct ifnet *ifp = ni->ni_vap->iv_ifp; if (status == 0) { if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if (m->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); if (m->m_flags & M_TXCB) ieee80211_process_callback(ni, m, status); ieee80211_free_node(ni); } m_freem(m); } Index: head/sys/net80211/ieee80211_wds.c =================================================================== --- head/sys/net80211/ieee80211_wds.c (revision 348253) +++ head/sys/net80211/ieee80211_wds.c (revision 348254) @@ -1,798 +1,800 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007-2008 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #ifdef __FreeBSD__ __FBSDID("$FreeBSD$"); #endif /* * IEEE 802.11 WDS mode support. */ #include "opt_inet.h" #include "opt_wlan.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IEEE80211_SUPPORT_SUPERG #include #endif static void wds_vattach(struct ieee80211vap *); static int wds_newstate(struct ieee80211vap *, enum ieee80211_state, int); static int wds_input(struct ieee80211_node *ni, struct mbuf *m, const struct ieee80211_rx_stats *rxs, int, int); static void wds_recv_mgmt(struct ieee80211_node *, struct mbuf *, int subtype, const struct ieee80211_rx_stats *, int, int); void ieee80211_wds_attach(struct ieee80211com *ic) { ic->ic_vattach[IEEE80211_M_WDS] = wds_vattach; } void ieee80211_wds_detach(struct ieee80211com *ic) { } static void wds_vdetach(struct ieee80211vap *vap) { if (vap->iv_bss != NULL) { /* XXX locking? */ if (vap->iv_bss->ni_wdsvap == vap) vap->iv_bss->ni_wdsvap = NULL; } } static void wds_vattach(struct ieee80211vap *vap) { vap->iv_newstate = wds_newstate; vap->iv_input = wds_input; vap->iv_recv_mgmt = wds_recv_mgmt; vap->iv_opdetach = wds_vdetach; } static void wds_flush(struct ieee80211_node *ni) { struct ieee80211com *ic = ni->ni_ic; struct mbuf *m, *next; int8_t rssi, nf; m = ieee80211_ageq_remove(&ic->ic_stageq, (void *)(uintptr_t) ieee80211_mac_hash(ic, ni->ni_macaddr)); if (m == NULL) return; IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_WDS, ni, "%s", "flush wds queue"); ic->ic_node_getsignal(ni, &rssi, &nf); for (; m != NULL; m = next) { next = m->m_nextpkt; m->m_nextpkt = NULL; ieee80211_input(ni, m, rssi, nf); } } static int ieee80211_create_wds(struct ieee80211vap *vap, struct ieee80211_channel *chan) { struct ieee80211com *ic = vap->iv_ic; struct ieee80211_node_table *nt = &ic->ic_sta; struct ieee80211_node *ni, *obss; IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS, "%s: creating link to %s on channel %u\n", __func__, ether_sprintf(vap->iv_des_bssid), ieee80211_chan2ieee(ic, chan)); /* NB: vap create must specify the bssid for the link */ KASSERT(vap->iv_flags & IEEE80211_F_DESBSSID, ("no bssid")); /* NB: we should only be called on RUN transition */ KASSERT(vap->iv_state == IEEE80211_S_RUN, ("!RUN state")); if ((vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0) { /* * Dynamic/non-legacy WDS. Reference the associated * station specified by the desired bssid setup at vap * create. Point ni_wdsvap at the WDS vap so 4-address * frames received through the associated AP vap will * be dispatched upward (e.g. to a bridge) as though * they arrived on the WDS vap. */ IEEE80211_NODE_LOCK(nt); obss = NULL; ni = ieee80211_find_node_locked(&ic->ic_sta, vap->iv_des_bssid); if (ni == NULL) { /* * Node went away before we could hookup. This * should be ok; no traffic will flow and a leave * event will be dispatched that should cause * the vap to be destroyed. */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS, "%s: station %s went away\n", __func__, ether_sprintf(vap->iv_des_bssid)); /* XXX stat? */ } else if (ni->ni_wdsvap != NULL) { /* * Node already setup with a WDS vap; we cannot * allow multiple references so disallow. If * ni_wdsvap points at us that's ok; we should * do nothing anyway. */ /* XXX printf instead? */ IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS, "%s: station %s in use with %s\n", __func__, ether_sprintf(vap->iv_des_bssid), ni->ni_wdsvap->iv_ifp->if_xname); /* XXX stat? */ } else { /* * Committed to new node, setup state. */ obss = vap->iv_bss; vap->iv_bss = ni; ni->ni_wdsvap = vap; } IEEE80211_NODE_UNLOCK(nt); if (obss != NULL) { /* NB: deferred to avoid recursive lock */ ieee80211_free_node(obss); } } else { /* * Legacy WDS vap setup. */ /* * The far end does not associate so we just create * create a new node and install it as the vap's * bss node. We must simulate an association and * authorize the port for traffic to flow. * XXX check if node already in sta table? */ ni = ieee80211_node_create_wds(vap, vap->iv_des_bssid, chan); if (ni != NULL) { obss = vap->iv_bss; vap->iv_bss = ieee80211_ref_node(ni); ni->ni_flags |= IEEE80211_NODE_AREF; if (obss != NULL) ieee80211_free_node(obss); /* give driver a chance to setup state like ni_txrate */ if (ic->ic_newassoc != NULL) ic->ic_newassoc(ni, 1); /* tell the authenticator about new station */ if (vap->iv_auth->ia_node_join != NULL) vap->iv_auth->ia_node_join(ni); if (ni->ni_authmode != IEEE80211_AUTH_8021X) ieee80211_node_authorize(ni); ieee80211_notify_node_join(ni, 1 /*newassoc*/); /* XXX inject l2uf frame */ } } /* * Flush any pending frames now that were setup. */ if (ni != NULL) wds_flush(ni); return (ni == NULL ? ENOENT : 0); } /* * Propagate multicast frames of an ap vap to all DWDS links. * The caller is assumed to have verified this frame is multicast. */ void ieee80211_dwds_mcast(struct ieee80211vap *vap0, struct mbuf *m) { struct ieee80211com *ic = vap0->iv_ic; const struct ether_header *eh = mtod(m, const struct ether_header *); struct ieee80211_node *ni; struct ieee80211vap *vap; struct ifnet *ifp; struct mbuf *mcopy; int err; KASSERT(ETHER_IS_MULTICAST(eh->ether_dhost), ("%s not mcast", ether_sprintf(eh->ether_dhost))); /* XXX locking */ TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) { /* only DWDS vaps are interesting */ if (vap->iv_opmode != IEEE80211_M_WDS || (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY)) continue; /* if it came in this interface, don't send it back out */ ifp = vap->iv_ifp; if (ifp == m->m_pkthdr.rcvif) continue; /* * Duplicate the frame and send it. */ mcopy = m_copypacket(m, M_NOWAIT); if (mcopy == NULL) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* XXX stat + msg */ continue; } ni = ieee80211_find_txnode(vap, eh->ether_dhost); if (ni == NULL) { /* NB: ieee80211_find_txnode does stat+msg */ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(mcopy); continue; } /* calculate priority so drivers can find the tx queue */ if (ieee80211_classify(ni, mcopy)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_WDS, eh->ether_dhost, NULL, "%s", "classification failure"); vap->iv_stats.is_tx_classify++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); m_freem(mcopy); ieee80211_free_node(ni); continue; } BPF_MTAP(ifp, m); /* 802.3 tx */ /* * Encapsulate the packet in prep for transmission. */ IEEE80211_TX_LOCK(ic); mcopy = ieee80211_encap(vap, ni, mcopy); if (mcopy == NULL) { /* NB: stat+msg handled in ieee80211_encap */ IEEE80211_TX_UNLOCK(ic); ieee80211_free_node(ni); continue; } mcopy->m_flags |= M_MCAST; + MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); mcopy->m_pkthdr.rcvif = (void *) ni; err = ieee80211_parent_xmitpkt(ic, mcopy); IEEE80211_TX_UNLOCK(ic); if (!err) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len); } } } /* * Handle DWDS discovery on receipt of a 4-address frame in * ap mode. Queue the frame and post an event for someone * to plumb the necessary WDS vap for this station. Frames * received prior to the vap set running will then be reprocessed * as if they were just received. */ void ieee80211_dwds_discover(struct ieee80211_node *ni, struct mbuf *m) { struct ieee80211com *ic = ni->ni_ic; /* * Save the frame with an aging interval 4 times * the listen interval specified by the station. * Frames that sit around too long are reclaimed * using this information. * XXX handle overflow? * XXX per/vap beacon interval? */ + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); m->m_pkthdr.rcvif = (void *)(uintptr_t) ieee80211_mac_hash(ic, ni->ni_macaddr); (void) ieee80211_ageq_append(&ic->ic_stageq, m, ((ni->ni_intval * ic->ic_lintval) << 2) / 1024); ieee80211_notify_wds_discover(ni); } /* * IEEE80211_M_WDS vap state machine handler. */ static int wds_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg) { struct ieee80211com *ic = vap->iv_ic; enum ieee80211_state ostate; int error; IEEE80211_LOCK_ASSERT(ic); ostate = vap->iv_state; IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s\n", __func__, ieee80211_state_name[ostate], ieee80211_state_name[nstate]); vap->iv_state = nstate; /* state transition */ callout_stop(&vap->iv_mgtsend); /* XXX callout_drain */ if (ostate != IEEE80211_S_SCAN) ieee80211_cancel_scan(vap); /* background scan */ error = 0; switch (nstate) { case IEEE80211_S_INIT: switch (ostate) { case IEEE80211_S_SCAN: ieee80211_cancel_scan(vap); break; default: break; } if (ostate != IEEE80211_S_INIT) { /* NB: optimize INIT -> INIT case */ ieee80211_reset_bss(vap); } break; case IEEE80211_S_SCAN: switch (ostate) { case IEEE80211_S_INIT: ieee80211_check_scan_current(vap); break; default: break; } break; case IEEE80211_S_RUN: if (ostate == IEEE80211_S_INIT) { /* * Already have a channel; bypass the scan * and startup immediately. */ error = ieee80211_create_wds(vap, ic->ic_curchan); } break; default: break; } return error; } /* * Process a received frame. The node associated with the sender * should be supplied. If nothing was found in the node table then * the caller is assumed to supply a reference to iv_bss instead. * The RSSI and a timestamp are also supplied. The RSSI data is used * during AP scanning to select a AP to associate with; it can have * any units so long as values have consistent units and higher values * mean ``better signal''. The receive timestamp is currently not used * by the 802.11 layer. */ static int wds_input(struct ieee80211_node *ni, struct mbuf *m, const struct ieee80211_rx_stats *rxs, int rssi, int nf) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ifnet *ifp = vap->iv_ifp; struct ieee80211_frame *wh; struct ieee80211_key *key; struct ether_header *eh; int hdrspace, need_tap = 1; /* mbuf need to be tapped. */ uint8_t dir, type, subtype, qos; int is_hw_decrypted = 0; int has_decrypted = 0; /* * Some devices do hardware decryption all the way through * to pretending the frame wasn't encrypted in the first place. * So, tag it appropriately so it isn't discarded inappropriately. */ if ((rxs != NULL) && (rxs->c_pktflags & IEEE80211_RX_F_DECRYPTED)) is_hw_decrypted = 1; if (m->m_flags & M_AMPDU_MPDU) { /* * Fastpath for A-MPDU reorder q resubmission. Frames * w/ M_AMPDU_MPDU marked have already passed through * here but were received out of order and been held on * the reorder queue. When resubmitted they are marked * with the M_AMPDU_MPDU flag and we can bypass most of * the normal processing. */ wh = mtod(m, struct ieee80211_frame *); type = IEEE80211_FC0_TYPE_DATA; dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK; subtype = IEEE80211_FC0_SUBTYPE_QOS; hdrspace = ieee80211_hdrspace(ic, wh); /* XXX optimize? */ goto resubmit_ampdu; } KASSERT(ni != NULL, ("null node")); type = -1; /* undefined */ if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_min)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, NULL, "too short (1): len %u", m->m_pkthdr.len); vap->iv_stats.is_rx_tooshort++; goto out; } /* * Bit of a cheat here, we use a pointer for a 3-address * frame format but don't reference fields past outside * ieee80211_frame_min w/o first validating the data is * present. */ wh = mtod(m, struct ieee80211_frame *); if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) ni->ni_inact = ni->ni_inact_reload; if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) != IEEE80211_FC0_VERSION_0) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, NULL, "wrong version, fc %02x:%02x", wh->i_fc[0], wh->i_fc[1]); vap->iv_stats.is_rx_badversion++; goto err; } dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK; type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK; subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK; /* NB: WDS vap's do not scan */ if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_addr4)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, NULL, "too short (3): len %u", m->m_pkthdr.len); vap->iv_stats.is_rx_tooshort++; goto out; } /* NB: the TA is implicitly verified by finding the wds peer node */ if (!IEEE80211_ADDR_EQ(wh->i_addr1, vap->iv_myaddr) && !IEEE80211_ADDR_EQ(wh->i_addr1, ifp->if_broadcastaddr)) { /* not interested in */ IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT, wh->i_addr1, NULL, "%s", "not to bss"); vap->iv_stats.is_rx_wrongbss++; goto out; } IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi); ni->ni_noise = nf; if (IEEE80211_HAS_SEQ(type, subtype)) { uint8_t tid = ieee80211_gettid(wh); if (IEEE80211_QOS_HAS_SEQ(wh) && TID_TO_WME_AC(tid) >= WME_AC_VI) ic->ic_wme.wme_hipri_traffic++; if (! ieee80211_check_rxseq(ni, wh, wh->i_addr1, rxs)) goto out; } switch (type) { case IEEE80211_FC0_TYPE_DATA: hdrspace = ieee80211_hdrspace(ic, wh); if (m->m_len < hdrspace && (m = m_pullup(m, hdrspace)) == NULL) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, NULL, "data too short: expecting %u", hdrspace); vap->iv_stats.is_rx_tooshort++; goto out; /* XXX */ } if (dir != IEEE80211_FC1_DIR_DSTODS) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "data", "incorrect dir 0x%x", dir); vap->iv_stats.is_rx_wrongdir++; goto out; } /* * Only legacy WDS traffic should take this path. */ if ((vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "data", "%s", "not legacy wds"); vap->iv_stats.is_rx_wrongdir++;/*XXX*/ goto out; } /* * Handle A-MPDU re-ordering. If the frame is to be * processed directly then ieee80211_ampdu_reorder * will return 0; otherwise it has consumed the mbuf * and we should do nothing more with it. */ if ((m->m_flags & M_AMPDU) && ieee80211_ampdu_reorder(ni, m, rxs) != 0) { m = NULL; goto out; } resubmit_ampdu: /* * Handle privacy requirements. Note that we * must not be preempted from here until after * we (potentially) call ieee80211_crypto_demic; * otherwise we may violate assumptions in the * crypto cipher modules used to do delayed update * of replay sequence numbers. */ if (is_hw_decrypted || wh->i_fc[1] & IEEE80211_FC1_PROTECTED) { if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) { /* * Discard encrypted frames when privacy is off. */ IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "WEP", "%s", "PRIVACY off"); vap->iv_stats.is_rx_noprivacy++; IEEE80211_NODE_STAT(ni, rx_noprivacy); goto out; } if (ieee80211_crypto_decap(ni, m, hdrspace, &key) == 0) { /* NB: stats+msgs handled in crypto_decap */ IEEE80211_NODE_STAT(ni, rx_wepfail); goto out; } wh = mtod(m, struct ieee80211_frame *); wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED; has_decrypted = 1; } else { /* XXX M_WEP and IEEE80211_F_PRIVACY */ key = NULL; } /* * Save QoS bits for use below--before we strip the header. */ if (subtype == IEEE80211_FC0_SUBTYPE_QOS) qos = ieee80211_getqos(wh)[0]; else qos = 0; /* * Next up, any fragmentation. */ if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) { m = ieee80211_defrag(ni, m, hdrspace); if (m == NULL) { /* Fragment dropped or frame not complete yet */ goto out; } } wh = NULL; /* no longer valid, catch any uses */ /* * Next strip any MSDU crypto bits. */ if (!ieee80211_crypto_demic(vap, key, m, 0)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT, ni->ni_macaddr, "data", "%s", "demic error"); vap->iv_stats.is_rx_demicfail++; IEEE80211_NODE_STAT(ni, rx_demicfail); goto out; } /* copy to listener after decrypt */ if (ieee80211_radiotap_active_vap(vap)) ieee80211_radiotap_rx(vap, m); need_tap = 0; /* * Finally, strip the 802.11 header. */ m = ieee80211_decap(vap, m, hdrspace); if (m == NULL) { /* XXX mask bit to check for both */ /* don't count Null data frames as errors */ if (subtype == IEEE80211_FC0_SUBTYPE_NODATA || subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL) goto out; IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT, ni->ni_macaddr, "data", "%s", "decap error"); vap->iv_stats.is_rx_decap++; IEEE80211_NODE_STAT(ni, rx_decap); goto err; } eh = mtod(m, struct ether_header *); if (!ieee80211_node_is_authorized(ni)) { /* * Deny any non-PAE frames received prior to * authorization. For open/shared-key * authentication the port is mark authorized * after authentication completes. For 802.1x * the port is not marked authorized by the * authenticator until the handshake has completed. */ if (eh->ether_type != htons(ETHERTYPE_PAE)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT, eh->ether_shost, "data", "unauthorized port: ether type 0x%x len %u", eh->ether_type, m->m_pkthdr.len); vap->iv_stats.is_rx_unauth++; IEEE80211_NODE_STAT(ni, rx_unauth); goto err; } } else { /* * When denying unencrypted frames, discard * any non-PAE frames received without encryption. */ if ((vap->iv_flags & IEEE80211_F_DROPUNENC) && ((has_decrypted == 0) && (m->m_flags & M_WEP) == 0) && (is_hw_decrypted == 0) && eh->ether_type != htons(ETHERTYPE_PAE)) { /* * Drop unencrypted frames. */ vap->iv_stats.is_rx_unencrypted++; IEEE80211_NODE_STAT(ni, rx_unencrypted); goto out; } } /* XXX require HT? */ if (qos & IEEE80211_QOS_AMSDU) { m = ieee80211_decap_amsdu(ni, m); if (m == NULL) return IEEE80211_FC0_TYPE_DATA; } else { #ifdef IEEE80211_SUPPORT_SUPERG m = ieee80211_decap_fastframe(vap, ni, m); if (m == NULL) return IEEE80211_FC0_TYPE_DATA; #endif } ieee80211_deliver_data(vap, ni, m); return IEEE80211_FC0_TYPE_DATA; case IEEE80211_FC0_TYPE_MGT: vap->iv_stats.is_rx_mgmt++; IEEE80211_NODE_STAT(ni, rx_mgmt); if (dir != IEEE80211_FC1_DIR_NODS) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, "data", "incorrect dir 0x%x", dir); vap->iv_stats.is_rx_wrongdir++; goto err; } if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) { IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY, ni->ni_macaddr, "mgt", "too short: len %u", m->m_pkthdr.len); vap->iv_stats.is_rx_tooshort++; goto out; } #ifdef IEEE80211_DEBUG if (ieee80211_msg_debug(vap) || ieee80211_msg_dumppkts(vap)) { if_printf(ifp, "received %s from %s rssi %d\n", ieee80211_mgt_subtype_name(subtype), ether_sprintf(wh->i_addr2), rssi); } #endif if (wh->i_fc[1] & IEEE80211_FC1_PROTECTED) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "WEP set but not permitted"); vap->iv_stats.is_rx_mgtdiscard++; /* XXX */ goto out; } vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf); goto out; case IEEE80211_FC0_TYPE_CTL: vap->iv_stats.is_rx_ctl++; IEEE80211_NODE_STAT(ni, rx_ctrl); goto out; default: IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY, wh, "bad", "frame type 0x%x", type); /* should not come here */ break; } err: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); out: if (m != NULL) { if (need_tap && ieee80211_radiotap_active_vap(vap)) ieee80211_radiotap_rx(vap, m); m_freem(m); } return type; } static void wds_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf) { struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ieee80211_frame *wh; u_int8_t *frm, *efrm; wh = mtod(m0, struct ieee80211_frame *); frm = (u_int8_t *)&wh[1]; efrm = mtod(m0, u_int8_t *) + m0->m_len; switch (subtype) { case IEEE80211_FC0_SUBTYPE_ACTION: case IEEE80211_FC0_SUBTYPE_ACTION_NOACK: if (ni == vap->iv_bss) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "unknown node"); vap->iv_stats.is_rx_mgtdiscard++; } else if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1)) { /* NB: not interested in multicast frames. */ IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "not for us"); vap->iv_stats.is_rx_mgtdiscard++; } else if (vap->iv_state != IEEE80211_S_RUN) { IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "wrong state %s", ieee80211_state_name[vap->iv_state]); vap->iv_stats.is_rx_mgtdiscard++; } else { if (ieee80211_parse_action(ni, m0) == 0) (void)ic->ic_recv_action(ni, wh, frm, efrm); } break; case IEEE80211_FC0_SUBTYPE_ASSOC_REQ: case IEEE80211_FC0_SUBTYPE_ASSOC_RESP: case IEEE80211_FC0_SUBTYPE_REASSOC_REQ: case IEEE80211_FC0_SUBTYPE_REASSOC_RESP: case IEEE80211_FC0_SUBTYPE_PROBE_REQ: case IEEE80211_FC0_SUBTYPE_PROBE_RESP: case IEEE80211_FC0_SUBTYPE_TIMING_ADV: case IEEE80211_FC0_SUBTYPE_BEACON: case IEEE80211_FC0_SUBTYPE_ATIM: case IEEE80211_FC0_SUBTYPE_DISASSOC: case IEEE80211_FC0_SUBTYPE_AUTH: case IEEE80211_FC0_SUBTYPE_DEAUTH: IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh, NULL, "%s", "not handled"); vap->iv_stats.is_rx_mgtdiscard++; break; default: IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY, wh, "mgt", "subtype 0x%x not handled", subtype); vap->iv_stats.is_rx_badsubtype++; break; } } Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c (revision 348253) +++ head/sys/netinet/in_pcb.c (revision 348254) @@ -1,3434 +1,3425 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_pcbgroup.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #ifdef TCPHPTS #include #endif #include #include #endif #ifdef INET #include #endif #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include #include #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 static struct callout ipport_tick_callout; /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Variables dealing with random ephemeral port allocation. */ VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ VNET_DEFINE(int, ipport_tcpallocs); VNET_DEFINE_STATIC(int, ipport_tcplastcount); #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " "allocations before switching to a sequental one"); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); #endif /* INET */ /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, uint16_t port, const union in_dependaddr *addr, int size) { struct inpcblbgroup *grp; size_t bytes; bytes = __offsetof(struct inpcblbgroup, il_inp[size]); grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); if (!grp) return (NULL); grp->il_vflag = vflag; grp->il_lport = port; grp->il_dependladdr = *addr; grp->il_inpsiz = size; CK_LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } static void in_pcblbgroup_free_deferred(epoch_context_t ctx) { struct inpcblbgroup *grp; grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); free(grp, M_PCB); } static void in_pcblbgroup_free(struct inpcblbgroup *grp) { CK_LIST_REMOVE(grp, il_list); epoch_call(net_epoch_preempt, &grp->il_epoch_ctx, in_pcblbgroup_free_deferred); } static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { struct inpcblbgroup *grp; int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size); if (grp == NULL) return (NULL); KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, ("invalid new local group size %d and old local group count %d", grp->il_inpsiz, old_grp->il_inpcnt)); for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; in_pcblbgroup_free(old_grp); return (grp); } /* * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] * and shrink group if possible. */ static void in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, int i) { struct inpcblbgroup *grp, *new_grp; grp = *grpp; for (; i + 1 < grp->il_inpcnt; ++i) grp->il_inp[i] = grp->il_inp[i + 1]; grp->il_inpcnt--; if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && grp->il_inpcnt <= grp->il_inpsiz / 4) { /* Shrink this group. */ new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); if (new_grp != NULL) *grpp = new_grp; } } /* * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int in_pcbinslbgrouphash(struct inpcb *inp) { const static struct timeval interval = { 60, 0 }; static struct timeval lastprint; struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Don't allow jailed socket to join local group. */ if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred)) return (0); #ifdef INET6 /* * Don't allow IPv4 mapped INET6 wild socket. */ if ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr == INADDR_ANY && INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { return (0); } #endif idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; CK_LIST_FOREACH(grp, hdr, il_list) { if (grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) break; } if (grp == NULL) { /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN); if (grp == NULL) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) printf("lb group port %d, limit reached\n", ntohs(grp->il_lport)); return (0); } /* Expand this local group. */ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (grp == NULL) return (ENOBUFS); } KASSERT(grp->il_inpcnt < grp->il_inpsiz, ("invalid local group size %d and count %d", grp->il_inpsiz, grp->il_inpcnt)); grp->il_inp[grp->il_inpcnt] = inp; grp->il_inpcnt++; return (0); } /* * Remove PCB from load balance group. */ static void in_pcbremlbgrouphash(struct inpcb *inp) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int i; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_inpcnt == 1) { /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { /* Pull up inpcbs, shrink group if possible. */ in_pcblbgroup_reorder(hdr, &grp, i); } return; } } } /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. */ static void inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) { porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif pcbinfo->ipi_listhead = listhead; CK_LIST_INIT(pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. */ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; int error; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); #ifdef NUMA inp->inp_numa_domain = M_NODOM; #endif inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif INP_WLOCK(inp); INP_LIST_WLOCK(pcbinfo); CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; #ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: if (error != 0) { crfree(inp->inp_cred); uma_zfree(pcbinfo->ipi_zone, inp); } #endif return (error); } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif /* * Select a local port (number) to use. */ #if defined(INET) || defined(INET6) int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, dorandom, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * For UDP(-Lite), use random port allocation as long as the user * allows it. For TCP (and as of yet unknown) connections, * use random port allocation only if the user allows it AND * ipport_tick() allows it. */ if (V_ipport_randomized && (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || pcbinfo == &V_ulitecbinfo)) dorandom = 1; else dorandom = 0; /* * It makes no sense to do random port allocation if * we have the only port available. */ if (first == last) dorandom = 0; /* Make sure to not include UDP(-Lite) packets in the count. */ if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) V_ipport_tcpallocs++; /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET /* Make the compiler happy. */ laddr.s_addr = 0; if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", __func__, inp)); laddr = *laddrp; } #endif tmpinp = NULL; /* Make compiler happy. */ lport = *lportp; if (dorandom) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } while (tmpinp != NULL); #ifdef INET if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) laddrp->s_addr = laddr.s_addr; #endif *lportp = lport; return (0); } /* * Return cached socket options. */ int inp_so_options(const struct inpcb *inp) { int so_options; so_options = 0; if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) so_options |= SO_REUSEPORT_LB; if ((inp->inp_flags2 & INP_REUSEPORT) != 0) so_options |= SO_REUSEPORT; if ((inp->inp_flags2 & INP_REUSEADDR) != 0) so_options |= SO_REUSEADDR; return (so_options); } #endif /* INET || INET6 */ /* * Check if a new BINDMULTI socket is allowed to be created. * * ni points to the new inp. * oi points to the exisitng inp. * * This checks whether the existing inp also has BINDMULTI and * whether the credentials match. */ int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) { /* Check permissions match */ if ((ni->inp_flags2 & INP_BINDMULTI) && (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)) return (0); /* Check the existing inp has BINDMULTI set */ if ((ni->inp_flags2 & INP_BINDMULTI) && ((oi->inp_flags2 & INP_BINDMULTI) == 0)) return (0); /* * We're okay - either INP_BINDMULTI isn't set on ni, or * it is and it matches the checks. */ return (1); } #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. */ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct sockaddr_in *sin; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here * so that we don't have to add to the (already messy) code below. */ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) return (EINVAL); #ifdef notdef /* * We should check the family, but old programs * incorrectly fail to initialize it. */ if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); #endif error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; /* * XXX: How to deal with SO_REUSEPORT_LB here? * Treat same as SO_REUSEPORT for now. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. */ if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && ((t->inp_flags & INP_TIMEWAIT) == 0) && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_flags2 & INP_REUSEPORT) || (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); /* * If the socket is a BINDMULTI socket, then * the credentials need to match and the * original socket also has to have been bound * with BINDMULTI. */ if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t && (t->inp_flags & INP_TIMEWAIT)) { /* * XXXRW: If an incpb has had its timewait * state recycled, we treat the address as * being in use (for now). This is better * than a panic, but not desirable. */ tw = intotw(t); if (tw == NULL || ((reuseport & tw->tw_so_options) == 0 && (reuseport_lb & tw->tw_so_options) == 0)) { return (EADDRINUSE); } } else if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && (reuseport & inp_so_options(t)) == 0 && (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, struct mbuf *m) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, NULL, cred); if (error) return (error); /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } } /* Commit the remaining changes. */ inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; in_pcbrehash_mbuf(inp, m); if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } int in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) { return (in_pcbconnect_mbuf(inp, nam, cred, NULL)); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin; struct route sro; struct epoch_tracker et; int error; KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; bzero(&sro, sizeof(sro)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If route is known our src addr is taken from the i/f, * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. */ NET_EPOCH_ENTER(et); if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); } if (ia == NULL) { error = ENETUNREACH; goto done; } if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = sro.ro_rt->rt_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { struct sockaddr_in sain; struct in_ifaddr *ia; bzero(&sain, sizeof(struct sockaddr_in)); sain.sin_family = AF_INET; sain.sin_len = sizeof(struct sockaddr_in); sain.sin_addr.s_addr = faddr->s_addr; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&sain))); if (cred == NULL || !prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: NET_EPOCH_EXIT(et); if (sro.ro_rt != NULL) RTFREE(sro.ro_rt); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. * * If the operation fails because the connection already exists, * *oinpp will be set to the PCB of that connection so that the * caller can decide to override it. In all other cases, *oinpp * is set to NULL. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct inpcb **oinpp, struct ucred *cred) { struct rm_priotracker in_ifa_tracker; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct in_ifaddr *ia; struct inpcb *oinp; struct in_addr laddr, faddr; u_short lport, fport; int error; /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (oinpp != NULL) *oinpp = NULL; if (nam->sa_len != sizeof (*sin)) return (EINVAL); if (sin->sin_family != AF_INET) return (EAFNOSUPPORT); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { IN_IFADDR_RLOCK(&in_ifa_tracker); faddr = IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); if (cred != NULL && (error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { IN_IFADDR_RLOCK(&in_ifa_tracker); if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&CK_STAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; IN_IFADDR_RLOCK(&in_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if ((ia->ia_ifp == ifp) && (cred == NULL || prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0)) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } IN_IFADDR_RUNLOCK(&in_ifa_tracker); } } if (error) return (error); } oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, NULL); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; return (EADDRINUSE); } if (lport == 0) { error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, cred); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; in_pcbrehash(inp); } #endif /* INET */ /* * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. * For most protocols, this will be invoked immediately prior to calling * in_pcbfree(). However, with TCP the inpcb may significantly outlive the * socket, in which case in_pcbfree() is deferred. */ void in_pcbdetach(struct inpcb *inp) { KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * stability of an inpcb pointer despite the inpcb lock being released. This * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, * but where the inpcb lock may already held, or when acquiring a reference * via a pcbgroup. * * in_pcbref() should be used only to provide brief memory stability, and * must always be followed by a call to INP_WLOCK() and in_pcbrele() to * garbage collect the inpcb if it has been in_pcbfree()'d from another * context. Until in_pcbrele() has returned that the inpcb is still valid, * lock and rele are the *only* safe operations that may be performed on the * inpcb. * * While the inpcb will not be freed, releasing the inpcb lock means that the * connection's state may change, so the caller should be careful to * revalidate any cached state on reacquiring the lock. Drop the reference * using in_pcbrele(). */ void in_pcbref(struct inpcb *inp) { KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); refcount_acquire(&inp->inp_refcount); } /* * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we * return a flag indicating whether or not the inpcb remains valid. If it is * valid, we return with the inpcb lock held. * * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a * reference on an inpcb. Historically more work was done here (actually, in * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely * about memory stability (and continued use of the write lock). */ int in_pcbrele_rlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_RLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_RUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_RUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } int in_pcbrele_wlocked(struct inpcb *inp) { struct inpcbinfo *pcbinfo; KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); INP_WLOCK_ASSERT(inp); if (refcount_release(&inp->inp_refcount) == 0) { /* * If the inpcb has been freed, let the caller know, even if * this isn't the last reference. */ if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return (1); } return (0); } KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); #ifdef TCPHPTS if (inp->inp_in_hpts || inp->inp_in_input) { struct tcp_hpts_entry *hpts; /* * We should not be on the hpts at * this point in any form. we must * get the lock to be sure. */ hpts = tcp_hpts_lock(inp); if (inp->inp_in_hpts) panic("Hpts:%p inp:%p at free still on hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if (inp->inp_in_input) panic("Hpts:%p inp:%p at free still on input hpts", hpts, inp); mtx_unlock(&hpts->p_mtx); } #endif INP_WUNLOCK(inp); pcbinfo = inp->inp_pcbinfo; uma_zfree(pcbinfo->ipi_zone, inp); return (1); } /* * Temporary wrapper. */ int in_pcbrele(struct inpcb *inp) { return (in_pcbrele_wlocked(inp)); } void in_pcblist_rele_rlocked(epoch_context_t ctx) { struct in_pcblist *il; struct inpcb *inp; struct inpcbinfo *pcbinfo; int i, n; il = __containerof(ctx, struct in_pcblist, il_epoch_ctx); pcbinfo = il->il_pcbinfo; n = il->il_count; INP_INFO_WLOCK(pcbinfo); for (i = 0; i < n; i++) { inp = il->il_inp_list[i]; INP_RLOCK(inp); if (!in_pcbrele_rlocked(inp)) INP_RUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); free(il, M_TEMP); } static void inpcbport_free(epoch_context_t ctx) { struct inpcbport *phd; phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); free(phd, M_PCB); } static void in_pcbfree_deferred(epoch_context_t ctx) { struct inpcb *inp; int released __unused; inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); INP_WLOCK(inp); CURVNET_SET(inp->inp_vnet); #ifdef INET struct ip_moptions *imo = inp->inp_moptions; inp->inp_moptions = NULL; #endif /* XXXRW: Do as much as possible here. */ #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif #ifdef INET6 struct ip6_moptions *im6o = NULL; if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); im6o = inp->in6p_moptions; inp->in6p_moptions = NULL; } #endif if (inp->inp_options) (void)m_free(inp->inp_options); inp->inp_vflag = 0; crfree(inp->inp_cred); #ifdef MAC mac_inpcb_destroy(inp); #endif released = in_pcbrele_wlocked(inp); MPASS(released); #ifdef INET6 ip6_freemoptions(im6o); #endif #ifdef INET inp_freemoptions(imo); #endif CURVNET_RESTORE(); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele(), but the inpcb is still unlocked. Almost all * work, including removal from global lists, is done in this context, where * the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); KASSERT((inp->inp_flags2 & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); if (inp->inp_flags2 & INP_FREED) { INP_WUNLOCK(inp); return; } #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_LOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK(pcbinfo); in_pcbremlists(inp); INP_LIST_WUNLOCK(pcbinfo); RO_INVALIDATE_CACHE(&inp->inp_route); /* mark as destruction in progress */ inp->inp_flags2 |= INP_FREED; INP_WUNLOCK(inp); epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbnotifyall() and in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); #ifdef INVARIANTS if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) MPASS(inp->inp_refcount > 1); #endif /* * XXXRW: Possibly we should protect the setting of INP_DROPPED with * the hash lock...? */ inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); in_pcbremlbgrouphash(inp); CK_LIST_REMOVE(inp, inp_hash); CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs. */ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr_p) { struct sockaddr_in *sin; sin = malloc(sizeof *sin, M_SONAME, M_WAITOK | M_ZERO); sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = *addr_p; sin->sin_port = port; return (struct sockaddr *)sin; } int in_getsockaddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_lport; addr = inp->inp_laddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } int in_getpeeraddr(struct socket *so, struct sockaddr **nam) { struct inpcb *inp; struct in_addr addr; in_port_t port; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); INP_RLOCK(inp); port = inp->inp_fport; addr = inp->inp_faddr; INP_RUNLOCK(inp); *nam = in_sockaddr(port, &addr); return 0; } void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, struct inpcb *(*notify)(struct inpcb *, int)) { struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { INP_WUNLOCK(inp); continue; } #endif if (inp->inp_faddr.s_addr != faddr.s_addr || inp->inp_socket == NULL) { INP_WUNLOCK(inp); continue; } if ((*notify)(inp, errno)) INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb *inp; struct ip_moptions *imo; int i, gap; INP_INFO_WLOCK(pcbinfo); CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; if ((inp->inp_vflag & INP_IPV4) && imo != NULL) { /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * * XXX This can all be deferred to an epoch_call */ for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) { if (imo->imo_membership[i]->inm_ifp == ifp) { IN_MULTI_LOCK_ASSERT(); in_leavegroup_locked(imo->imo_membership[i], NULL); gap++; } else if (gap != 0) imo->imo_membership[i - gap] = imo->imo_membership[i]; } imo->imo_num_memberships -= gap; } INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(pcbinfo); } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (cred == NULL || prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. */ CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && !prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, uint16_t fport, int lookupflags) { struct inpcb *local_wild; const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; INP_HASH_LOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; /* * Order of socket selection: * 1. non-wild. * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). * * NOTE: * - Load balanced group does not contain jailed sockets * - Load balanced group does not contain IPv4 mapped INET6 wild sockets */ local_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) continue; #endif if (grp->il_lport != lport) continue; idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % grp->il_inpcnt; if (grp->il_laddr.s_addr == laddr->s_addr) return (grp->il_inp[idx]); if (grp->il_laddr.s_addr == INADDR_ANY && (lookupflags & INPLOOKUP_WILDCARD) != 0) local_wild = grp->il_inp[idx]; } return (local_wild); } #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. */ static struct inpcb * in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; bool locked; /* * First look for an exact match. */ tmpinp = NULL; INP_GROUP_LOCK(pcbgroup); head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbgroup->ipg_hashmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) goto found; if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) { inp = tmpinp; goto found; } #ifdef RSS /* * For incoming connections, we may wish to do a wildcard * match for an RSS-local socket. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbgroup->ipg_hashmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } #endif /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; struct inpcbhead *head; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_wildmask)]; CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) goto found; else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ inp = jail_wild; if (inp == NULL) inp = local_exact; if (inp == NULL) inp = local_wild; #ifdef INET6 if (inp == NULL) inp = local_wild_mapped; #endif if (inp != NULL) goto found; } /* if (lookupflags & INPLOOKUP_WILDCARD) */ INP_GROUP_UNLOCK(pcbgroup); return (NULL); found: if (lookupflags & INPLOOKUP_WLOCKPCB) locked = INP_TRY_WLOCK(inp); else if (lookupflags & INPLOOKUP_RLOCKPCB) locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WUNLOCK(inp); else INP_RUNLOCK(inp); return (NULL); } else if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) return (NULL); } else { INP_RLOCK(inp); if (in_pcbrele_rlocked(inp)) return (NULL); } } #ifdef INVARIANTS if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); #endif return (inp); } #endif /* PCBGROUP */ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has locked the hash list, and will not perform any further * locking or reference operations on either the hash list or the connection. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, struct ifnet *ifp) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; #ifdef INVARIANTS KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); if (!mtx_owned(&pcbinfo->ipi_hash_lock)) MPASS(in_epoch_verbose(net_epoch_preempt, 1)); #endif /* * First look for an exact match. */ tmpinp = NULL; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) { /* * XXX We should be able to directly return * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; } } if (tmpinp != NULL) return (tmpinp); /* * Then look in lb group (for wildcard match). */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport, lookupflags); if (inp != NULL) return (inp); } /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { struct inpcb *local_wild = NULL, *local_exact = NULL; #ifdef INET6 struct inpcb *local_wild_mapped = NULL; #endif struct inpcb *jail_wild = NULL; int injail; /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) continue; injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (inp->inp_laddr.s_addr == laddr.s_addr) { if (injail) return (inp); else local_exact = inp; } else if (inp->inp_laddr.s_addr == INADDR_ANY) { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } /* LIST_FOREACH */ if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation locks the * hash list lock, and will return the inpcb locked (i.e., requires * INPLOOKUP_LOCKPCB). */ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { struct inpcb *inp; INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_WUNLOCK(inp); inp = NULL; } } else if (lookupflags & INPLOOKUP_RLOCKPCB) { INP_RLOCK(inp); if (__predict_false(inp->inp_flags2 & INP_FREED)) { INP_RUNLOCK(inp); inp = NULL; } } else panic("%s: locking bug", __func__); #ifdef INVARIANTS if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) INP_WLOCK_ASSERT(inp); else INP_RLOCK_ASSERT(inp); } #endif } INP_HASH_RUNLOCK(pcbinfo); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. * * Possibly more of this logic should be in in_pcbgroup.c. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) { #if defined(PCBGROUP) && !defined(RSS) struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); /* * When not using RSS, use connection groups in preference to the * reservation table when looking up 4-tuples. When using RSS, just * use the reservation table, due to the cost of the Toeplitz hash * in software. * * XXXRW: This policy belongs in the pcbgroup code, as in principle * we could be doing RSS with a non-Toeplitz hash that is affordable * in software. */ #if defined(PCBGROUP) && !defined(RSS) if (in_pcbgroup_enabled(pcbinfo)) { pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp, struct mbuf *m) { #ifdef PCBGROUP struct inpcbgroup *pcbgroup; #endif KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); #ifdef PCBGROUP /* * If we can use a hardware-generated hash to look up the connection * group, use that connection group to find the inpcb. Otherwise * fall back on a software hash -- or the reservation table if we're * using RSS. * * XXXRW: As above, that policy belongs in the pcbgroup code. */ if (in_pcbgroup_enabled(pcbinfo) && !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), m->m_pkthdr.flowid); if (pcbgroup != NULL) return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #ifndef RSS pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, fport); return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, laddr, lport, lookupflags, ifp)); #endif } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, ifp)); } #endif /* INET */ /* * Insert PCB onto various hash lists. */ static int in_pcbinshash_internal(struct inpcb *inp, int do_pcbgroup_update) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; int so_options; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Add entry to load balance group. * Only do this if SO_REUSEPORT_LB is set. */ so_options = inp_so_options(inp); if (so_options & SO_REUSEPORT_LB) { int ret = in_pcbinslbgrouphash(inp); if (ret) { /* pcb lb group malloc fail (ret=ENOBUFS). */ return (ret); } } /* * Go through port list and look for a head for this lport. */ CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); phd->phd_port = inp->inp_lport; CK_LIST_INIT(&phd->phd_pcblist); CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); inp->inp_flags |= INP_INHASHLIST; #ifdef PCBGROUP if (do_pcbgroup_update) in_pcbgroup_update(inp); #endif return (0); } /* * For now, there are two public interfaces to insert an inpcb into the hash * lists -- one that does update pcbgroups, and one that doesn't. The latter * is used only in the TCP syncache, where in_pcbinshash is called before the * full 4-tuple is set for the inpcb, and we don't want to install in the * pcbgroup until later. * * XXXRW: This seems like a misfeature. in_pcbinshash should always update * connection groups, and partially initialised inpcbs should not be exposed * to either reservation hash tables or pcbgroups. */ int in_pcbinshash(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 1)); } int in_pcbinshash_nopcbgroup(struct inpcb *inp) { return (in_pcbinshash_internal(inp, 0)); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; u_int32_t hashkey_faddr; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("in_pcbrehash: !INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); else #endif hashkey_faddr = inp->inp_faddr.s_addr; head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; CK_LIST_REMOVE(inp, inp_hash); CK_LIST_INSERT_HEAD(head, inp, inp_hash); #ifdef PCBGROUP if (m != NULL) in_pcbgroup_update_mbuf(inp, m); else in_pcbgroup_update(inp); #endif } void in_pcbrehash(struct inpcb *inp) { in_pcbrehash_mbuf(inp, NULL); } /* * Remove PCB from various lists. */ static void in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { INP_INFO_RLOCK_ASSERT(pcbinfo); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } #endif INP_WLOCK_ASSERT(inp); INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); /* XXX: Only do if SO_REUSEPORT_LB set? */ in_pcbremlbgrouphash(inp); CK_LIST_REMOVE(inp, inp_hash); CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free); } INP_HASH_WUNLOCK(pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } CK_LIST_REMOVE(inp, inp_list); pcbinfo->ipi_count--; #ifdef PCBGROUP in_pcbgroup_remove(inp); #endif } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } /* * ipport_tick runs once per second, determining if random port allocation * should be continued. If more than ipport_randomcps ports have been * allocated in the last second, then we return to sequential port * allocation. We return to random allocation only once we drop below * ipport_randomcps for at least ipport_randomtime seconds. */ static void ipport_tick(void *xtp) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ if (V_ipport_tcpallocs <= V_ipport_tcplastcount + V_ipport_randomcps) { if (V_ipport_stoprandom > 0) V_ipport_stoprandom--; } else V_ipport_stoprandom = V_ipport_randomtime; V_ipport_tcplastcount = V_ipport_tcpallocs; CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); } static void ip_fini(void *xtp) { callout_stop(&ipport_tick_callout); } /* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ static void ipport_tick_init(const void *unused __unused) { /* Start ipport_tick. */ callout_init(&ipport_tick_callout, 1); callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { struct inpcb *inp; INP_INFO_WLOCK(&V_tcbinfo); CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } struct tcpcb * inp_inpcbtotcpcb(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return ((struct tcpcb *)inp->inp_ppcb); } int inp_ip_tos_get(const struct inpcb *inp) { return (inp->inp_ip_tos); } void inp_ip_tos_set(struct inpcb *inp, int val) { inp->inp_ip_tos = val; } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } struct inpcb * so_sotoinpcb(struct socket *so) { return (sotoinpcb(so)); } struct tcpcb * so_sototcpcb(struct socket *so) { return (sototcpcb(so)); } /* * Create an external-format (``xinpcb'') structure using the information in * the kernel-format in_pcb structure pointed to by inp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. */ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { bzero(xi, sizeof(*xi)); xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4. */ inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_TIMEWAIT) { db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("IN6P_MTU%s", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); db_print_indent(indent); db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb \n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. */ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_modify(mst, ¶ms); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; struct ifnet *ifp; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); ifp = mst->ifp; if (ifp == NULL) return (EINVAL); if (ifp->if_snd_tag_query == NULL) return (EOPNOTSUPP); error = ifp->if_snd_tag_query(mst, ¶ms); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = max_pacing_rate, }; int error; INP_WLOCK_ASSERT(inp); if (inp->inp_snd_tag != NULL) return (EINVAL); if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); - - /* - * At success increment the refcount on - * the send tag's network interface: - */ - if (error == 0) - if_ref(inp->inp_snd_tag->ifp); } return (error); } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; - struct ifnet *ifp; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; - ifp = mst->ifp; - if (ifp == NULL) - return; - - /* - * If the device was detached while we still had reference(s) - * on the ifp, we assume if_snd_tag_free() was replaced with - * stubs. - */ - ifp->if_snd_tag_free(mst); - - /* release reference count on network interface */ - if_rele(ifp); + m_snd_tag_rele(mst); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; int error; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; + + /* + * If the existing send tag is for the wrong interface due to + * a route change, first drop the existing tag. Set the + * CHANGED flag so that we will keep trying to allocate a new + * tag if we fail to allocate one this time. + */ + if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { + in_pcbdetach_txrtlmt(inp); + inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; + } /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL. */ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. */ void in_pcboutput_eagain(struct inpcb *inp) { bool did_upgrade; if (inp == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #endif /* RATELIMIT */ Index: head/sys/netinet/ip_output.c =================================================================== --- head/sys/netinet/ip_output.c (revision 348253) +++ head/sys/netinet/ip_output.c (revision 348254) @@ -1,1494 +1,1507 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_ratelimit.h" #include "opt_ipsec.h" #include "opt_mbuf_stress_test.h" #include "opt_mpath.h" #include "opt_route.h" #include "opt_sctp.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #include #include #include #ifdef MBUF_STRESS_TEST static int mbuf_frag_size = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW, &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size"); #endif static void ip_mloopback(struct ifnet *, const struct mbuf *, int); extern int in_mcast_loop; extern struct protosw inetsw[]; static inline int ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error) { struct m_tag *fwd_tag = NULL; struct mbuf *m; struct in_addr odst; struct ip *ip; m = *mp; ip = mtod(m, struct ip *); /* Run through list of hooks for output packets. */ odst.s_addr = ip->ip_dst.s_addr; switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, PFIL_OUT, inp)) { case PFIL_DROPPED: *error = EPERM; /* FALLTHROUGH */ case PFIL_CONSUMED: return 1; /* Finished */ case PFIL_PASS: *error = 0; } m = *mp; ip = mtod(m, struct ip *); /* See if destination IP address was changed by packet filter. */ if (odst.s_addr != ip->ip_dst.s_addr) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip_input(). */ if (in_localip(ip->ip_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; return -1; /* Reloop */ } /* See if fib was changed by packet filter. */ if ((*fibnum) != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; *fibnum = M_GETFIB(m); return -1; /* Reloop for FIB change */ } /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; *error = netisr_queue(NETISR_IP, m); return 1; /* Finished */ } /* Or forward to some other address? */ if ((m->m_flags & M_IP_NEXTHOP) && ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) { bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP_NEXTHOP; m_tag_delete(m, fwd_tag); return -1; /* Reloop for CHANGE of dst */ } return 0; } +static int +ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m, + const struct sockaddr_in *gw, struct route *ro) +{ + struct m_snd_tag *mst; + int error; + + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); + mst = NULL; + +#ifdef RATELIMIT + if (inp != NULL) { + if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || + (inp->inp_snd_tag != NULL && + inp->inp_snd_tag->ifp != ifp)) + in_pcboutput_txrtlmt(inp, ifp, m); + + if (inp->inp_snd_tag != NULL) + mst = inp->inp_snd_tag; + } +#endif + if (mst != NULL) { + KASSERT(m->m_pkthdr.rcvif == NULL, + ("trying to add a send tag to a forwarded packet")); + if (mst->ifp != ifp) { + error = EAGAIN; + goto done; + } + + /* stamp send tag on mbuf */ + m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); + m->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } + + error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro); + +done: + /* Check for route change invalidating send tags. */ +#ifdef RATELIMIT + if (error == EAGAIN) + in_pcboutput_eagain(inp); +#endif + return (error); +} + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * In the IP forwarding case, the packet will arrive with options already * inserted, so must have a NULL opt pointer. */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { struct rm_priotracker in_ifa_tracker; struct epoch_tracker et; struct ip *ip; struct ifnet *ifp = NULL; /* keep compiler happy */ struct mbuf *m0; int hlen = sizeof (struct ip); int mtu; int error = 0; struct sockaddr_in *dst, sin; const struct sockaddr_in *gw; struct in_ifaddr *ia; struct in_addr src; int isbroadcast; uint16_t ip_len, ip_off; uint32_t fibnum; #if defined(IPSEC) || defined(IPSEC_SUPPORT) int no_route_but_check_spd = 0; #endif M_ASSERTPKTHDR(m); if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } #ifdef NUMA m->m_pkthdr.numa_domain = inp->inp_numa_domain; #endif } if (opt) { int len = 0; m = ip_insertoptions(m, opt, &len); if (len != 0) hlen = len; /* ip->ip_hl is updated above */ } ip = mtod(m, struct ip *); ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { ip->ip_v = IPVERSION; ip->ip_hl = hlen >> 2; ip_fillid(ip); } else { /* Header already set, fetch hlen from there */ hlen = ip->ip_hl << 2; } if ((flags & IP_FORWARDING) == 0) IPSTAT_INC(ips_localout); /* * dst/gw handling: * * gw is readonly but can point either to dst OR rt_gateway, * therefore we need restore gw if we're redoing lookup. */ fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); if (ro != NULL) dst = (struct sockaddr_in *)&ro->ro_dst; else dst = &sin; if (ro == NULL || ro->ro_rt == NULL) { bzero(dst, sizeof(*dst)); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = ip->ip_dst; } gw = dst; NET_EPOCH_ENTER(et); again: /* * Validate route against routing table additions; * a better/more specific route might have been added. */ if (inp != NULL && ro != NULL && ro->ro_rt != NULL) RT_VALIDATE(ro, &inp->inp_rt_cookie, fibnum); /* * If there is a cached route, * check that it is to the same destination * and is still up. If not, free it and try again. * The address family should also be checked in case of sharing the * cache with IPv6. * Also check whether routing cache needs invalidation. */ if (ro != NULL && ro->ro_rt != NULL && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp) || dst->sin_family != AF_INET || dst->sin_addr.s_addr != ip->ip_dst.s_addr)) RO_INVALIDATE_CACHE(ro); ia = NULL; /* * If routing to interface only, short circuit routing lookup. * The use of an all-ones broadcast address implies this; an * interface is specified by the broadcast address of an interface, * or the destination address of a ptp interface. */ if (flags & IP_SENDONES) { if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } ip->ip_dst.s_addr = INADDR_BROADCAST; dst->sin_addr = ip->ip_dst; ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = 1; src = IA_SIN(ia)->sin_addr; } else if (flags & IP_ROUTETOIF) { if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst), M_GETFIB(m)))) == NULL && (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0, M_GETFIB(m)))) == NULL) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } ifp = ia->ia_ifp; mtu = ifp->if_mtu; ip->ip_ttl = 1; isbroadcast = ifp->if_flags & IFF_BROADCAST ? in_ifaddr_broadcast(dst->sin_addr, ia) : 0; src = IA_SIN(ia)->sin_addr; } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && imo != NULL && imo->imo_multicast_ifp != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ ifp = imo->imo_multicast_ifp; mtu = ifp->if_mtu; IFP_TO_IA(ifp, ia, &in_ifa_tracker); isbroadcast = 0; /* fool gcc */ /* Interface may have no addresses. */ if (ia != NULL) src = IA_SIN(ia)->sin_addr; else src.s_addr = INADDR_ANY; } else if (ro != NULL) { if (ro->ro_rt == NULL) { /* * We want to do any cloning requested by the link * layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), fibnum); #else in_rtalloc_ign(ro, 0, fibnum); #endif if (ro->ro_rt == NULL || (ro->ro_rt->rt_flags & RTF_UP) == 0 || ro->ro_rt->rt_ifp == NULL || !RT_LINK_IS_UP(ro->ro_rt->rt_ifp)) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; counter_u64_add(ro->ro_rt->rt_pksent, 1); rt_update_ro_flags(ro); if (ro->ro_rt->rt_flags & RTF_GATEWAY) gw = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST); else if (ifp->if_flags & IFF_BROADCAST) isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia); else isbroadcast = 0; if (ro->ro_rt->rt_flags & RTF_HOST) mtu = ro->ro_rt->rt_mtu; else mtu = ifp->if_mtu; src = IA_SIN(ia)->sin_addr; } else { struct nhop4_extended nh; bzero(&nh, sizeof(nh)); if (fib4_lookup_nh_ext(M_GETFIB(m), ip->ip_dst, 0, 0, &nh) != 0) { #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * There is no route for this packet, but it is * possible that a matching SPD entry exists. */ no_route_but_check_spd = 1; mtu = 0; /* Silence GCC warning. */ goto sendit; #endif IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } ifp = nh.nh_ifp; mtu = nh.nh_mtu; /* * We are rewriting here dst to be gw actually, contradicting * comment at the beginning of the function. However, in this * case we are always dealing with on stack dst. * In case if pfil(9) sends us back to beginning of the * function, the dst would be rewritten by ip_output_pfil(). */ MPASS(dst == &sin); dst->sin_addr = nh.nh_addr; ia = nh.nh_ia; src = nh.nh_src; isbroadcast = (((nh.nh_flags & (NHF_HOST | NHF_BROADCAST)) == (NHF_HOST | NHF_BROADCAST)) || ((ifp->if_flags & IFF_BROADCAST) && in_ifaddr_broadcast(dst->sin_addr, ia))); } /* Catch a possible divide by zero later. */ KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (rt_flags=0x%08x) ifp=%p", __func__, mtu, ro, (ro != NULL && ro->ro_rt != NULL) ? ro->ro_rt->rt_flags : 0, ifp)); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { m->m_flags |= M_MCAST; /* * IP destination address is multicast. Make sure "gw" * still points to the address in "ro". (It may have been * changed to point to a gateway address, above.) */ gw = dst; /* * See if the caller provided any multicast options */ if (imo != NULL) { ip->ip_ttl = imo->imo_multicast_ttl; if (imo->imo_multicast_vif != -1) ip->ip_src.s_addr = ip_mcast_src ? ip_mcast_src(imo->imo_multicast_vif) : INADDR_ANY; } else ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; /* * Confirm that the outgoing interface supports multicast. */ if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { IPSTAT_INC(ips_noroute); error = ENETUNREACH; goto bad; } } /* * If source address not specified yet, use address * of outgoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) ip->ip_src = src; if ((imo == NULL && in_mcast_loop) || (imo && imo->imo_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we are not a member * of the group; ip_input() will filter it later, * thus deferring a hash lookup and mutex acquisition * at the expense of a cheap copy using m_copym(). */ ip_mloopback(ifp, m, hlen); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IP_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip_mloopback(), * above, will be forwarded by the ip_input() routine, * if necessary. */ if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) { /* * If rsvp daemon is not running, do not * set ip_moptions. This ensures that the packet * is multicast and not just sent down one link * as prescribed by rsvpd. */ if (!V_rsvp_on) imo = NULL; if (ip_mforward && ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip_mloopback() will * loop back a copy. ip_input() will drop the copy if * this host does not belong to the destination group on * the loopback interface. */ if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) { m_freem(m); goto done; } goto sendit; } /* * If the source address is not specified yet, use the address * of the outoing interface. */ if (ip->ip_src.s_addr == INADDR_ANY) ip->ip_src = src; /* * Look for broadcast address and * verify user is allowed to send * such a packet. */ if (isbroadcast) { if ((ifp->if_flags & IFF_BROADCAST) == 0) { error = EADDRNOTAVAIL; goto bad; } if ((flags & IP_ALLOWBROADCAST) == 0) { error = EACCES; goto bad; } /* don't allow broadcast messages to be fragmented */ if (ip_len > mtu) { error = EMSGSIZE; goto bad; } m->m_flags |= M_BCAST; } else { m->m_flags &= ~M_BCAST; } sendit: #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } /* * Check if there was a route for this packet; return error if not. */ if (no_route_but_check_spd) { IPSTAT_INC(ips_noroute); error = EHOSTUNREACH; goto bad; } /* Update variables that are affected by ipsec4_output(). */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #endif /* IPSEC */ /* Jump over all PFIL processing if hooks are not active. */ if (PFIL_HOOKED_OUT(V_inet_pfil_head)) { switch (ip_output_pfil(&m, ifp, inp, dst, &fibnum, &error)) { case 1: /* Finished */ goto done; case 0: /* Continue normally */ ip = mtod(m, struct ip *); break; case -1: /* Need to try again */ /* Reset everything for a new round */ if (ro != NULL) { RO_RTFREE(ro); ro->ro_prepend = NULL; } gw = dst; ip = mtod(m, struct ip *); goto again; } } /* IN_LOOPBACK must not appear on the wire - RFC1122. */ if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) || IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); error = EADDRNOTAVAIL; goto bad; } } m->m_pkthdr.csum_flags |= CSUM_IP; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) { in_delayed_cksum(m); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) { sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); m->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. */ if (ip_len <= mtu || (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } /* * Record statistics for this interface address. * With CSUM_TSO the byte/packet count will be slightly * incorrect because we count the IP+TCP headers only * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { if (m->m_pkthdr.csum_flags & CSUM_TSO) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } #ifdef MBUF_STRESS_TEST if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size) m = m_fragment(m, M_NOWAIT, mbuf_frag_size); #endif /* * Reset layer specific mbuf flags * to avoid confusing lower layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL); -#ifdef RATELIMIT - if (inp != NULL) { - if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) - in_pcboutput_txrtlmt(inp, ifp, m); - /* stamp send tag on mbuf */ - m->m_pkthdr.snd_tag = inp->inp_snd_tag; - } else { - m->m_pkthdr.snd_tag = NULL; - } -#endif - error = (*ifp->if_output)(ifp, m, - (const struct sockaddr *)gw, ro); -#ifdef RATELIMIT - /* check for route change */ - if (error == EAGAIN) - in_pcboutput_eagain(inp); -#endif + error = ip_output_send(inp, ifp, m, gw, ro); goto done; } /* Balk when DF bit is set or the interface didn't support TSO. */ if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; } /* * Too large for interface; fragment if possible. If successful, * on return, m will point to a list of packets to be sent. */ error = ip_fragment(ip, &m, mtu, ifp->if_hwassist); if (error) goto bad; for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia != NULL) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } /* * Reset layer specific mbuf flags * to avoid confusing upper layers. */ m_clrprotoflags(m); IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp, mtod(m, struct ip *), NULL); -#ifdef RATELIMIT - if (inp != NULL) { - if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) - in_pcboutput_txrtlmt(inp, ifp, m); - /* stamp send tag on mbuf */ - m->m_pkthdr.snd_tag = inp->inp_snd_tag; - } else { - m->m_pkthdr.snd_tag = NULL; - } -#endif - error = (*ifp->if_output)(ifp, m, - (const struct sockaddr *)gw, ro); -#ifdef RATELIMIT - /* check for route change */ - if (error == EAGAIN) - in_pcboutput_eagain(inp); -#endif + error = ip_output_send(inp, ifp, m, gw, ro); } else m_freem(m); } if (error == 0) IPSTAT_INC(ips_fragmented); done: NET_EPOCH_EXIT(et); return (error); bad: m_freem(m); goto done; } /* * Create a chain of fragments which fit the given mtu. m_frag points to the * mbuf to be fragmented; on return it points to the chain with the fragments. * Return 0 if no error. If error, m_frag may contain a partially built * chain of fragments that should be freed by the caller. * * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist) */ int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags) { int error = 0; int hlen = ip->ip_hl << 2; int len = (mtu - hlen) & ~7; /* size of payload in each fragment */ int off; struct mbuf *m0 = *m_frag; /* the original packet */ int firstlen; struct mbuf **mnext; int nfrags; uint16_t ip_len, ip_off; ip_len = ntohs(ip->ip_len); ip_off = ntohs(ip->ip_off); if (ip_off & IP_DF) { /* Fragmentation not allowed */ IPSTAT_INC(ips_cantfrag); return EMSGSIZE; } /* * Must be able to put at least 8 bytes per fragment. */ if (len < 8) return EMSGSIZE; /* * If the interface will not calculate checksums on * fragmented packets, then do it here. */ if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(m0); m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } #ifdef SCTP if (m0->m_pkthdr.csum_flags & CSUM_SCTP) { sctp_delayed_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_SCTP; } #endif if (len > PAGE_SIZE) { /* * Fragment large datagrams such that each segment * contains a multiple of PAGE_SIZE amount of data, * plus headers. This enables a receiver to perform * page-flipping zero-copy optimizations. * * XXX When does this help given that sender and receiver * could have different page sizes, and also mtu could * be less than the receiver's page size ? */ int newlen; off = MIN(mtu, m0->m_pkthdr.len); /* * firstlen (off - hlen) must be aligned on an * 8-byte boundary */ if (off < hlen) goto smart_frag_failure; off = ((off - hlen) & ~7) + hlen; newlen = (~PAGE_MASK) & mtu; if ((newlen + sizeof (struct ip)) > mtu) { /* we failed, go back the default */ smart_frag_failure: newlen = len; off = hlen + len; } len = newlen; } else { off = hlen + len; } firstlen = off - hlen; mnext = &m0->m_nextpkt; /* pointer to next packet */ /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto chain. * Here, m0 is the original packet, m is the fragment being created. * The fragments are linked off the m_nextpkt of the original * packet, which after processing serves as the first fragment. */ for (nfrags = 1; off < ip_len; off += len, nfrags++) { struct ip *mhip; /* ip header on the fragment */ struct mbuf *m; int mhlen = sizeof (struct ip); m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); error = ENOBUFS; IPSTAT_INC(ips_odropped); goto done; } /* * In the first mbuf, leave room for the link header, then * copy the original IP header including options. The payload * goes into an additional mbuf chain returned by m_copym(). */ m->m_data += max_linkhdr; mhip = mtod(m, struct ip *); *mhip = *ip; if (hlen > sizeof (struct ip)) { mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip); mhip->ip_v = IPVERSION; mhip->ip_hl = mhlen >> 2; } m->m_len = mhlen; /* XXX do we need to add ip_off below ? */ mhip->ip_off = ((off - hlen) >> 3) + ip_off; if (off + len >= ip_len) len = ip_len - off; else mhip->ip_off |= IP_MF; mhip->ip_len = htons((u_short)(len + mhlen)); m->m_next = m_copym(m0, off, len, M_NOWAIT); if (m->m_next == NULL) { /* copy failed */ m_free(m); error = ENOBUFS; /* ??? */ IPSTAT_INC(ips_odropped); goto done; } m->m_pkthdr.len = mhlen + len; #ifdef MAC mac_netinet_fragment(m0, m); #endif mhip->ip_off = htons(mhip->ip_off); mhip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { mhip->ip_sum = in_cksum(m, mhlen); m->m_pkthdr.csum_flags &= ~CSUM_IP; } *mnext = m; mnext = &m->m_nextpkt; } IPSTAT_ADD(ips_ofragments, nfrags); /* * Update first fragment by trimming what's been copied out * and updating header. */ m_adj(m0, hlen + firstlen - ip_len); m0->m_pkthdr.len = hlen + firstlen; ip->ip_len = htons((u_short)m0->m_pkthdr.len); ip->ip_off = htons(ip_off | IP_MF); ip->ip_sum = 0; if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) { ip->ip_sum = in_cksum(m0, hlen); m0->m_pkthdr.csum_flags &= ~CSUM_IP; } done: *m_frag = m0; return error; } void in_delayed_cksum(struct mbuf *m) { struct ip *ip; struct udphdr *uh; uint16_t cklen, csum, offset; ip = mtod(m, struct ip *); offset = ip->ip_hl << 2 ; if (m->m_pkthdr.csum_flags & CSUM_UDP) { /* if udp header is not in the first mbuf copy udplen */ if (offset + sizeof(struct udphdr) > m->m_len) { m_copydata(m, offset + offsetof(struct udphdr, uh_ulen), sizeof(cklen), (caddr_t)&cklen); cklen = ntohs(cklen); } else { uh = (struct udphdr *)mtodo(m, offset); cklen = ntohs(uh->uh_ulen); } csum = in_cksum_skip(m, cklen + offset, offset); if (csum == 0) csum = 0xffff; } else { cklen = ntohs(ip->ip_len); csum = in_cksum_skip(m, cklen, offset); } offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(csum) > m->m_len) m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); else *(u_short *)mtodo(m, offset) = csum; } /* * IP socket option processing. */ int ip_ctloutput(struct socket *so, struct sockopt *sopt) { struct inpcb *inp = sotoinpcb(so); int error, optval; #ifdef RSS uint32_t rss_bucket; int retval; #endif error = optval = 0; if (sopt->sopt_level != IPPROTO_IP) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(inp); if ((so->so_options & SO_REUSEADDR) != 0) inp->inp_flags2 |= INP_REUSEADDR; else inp->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT) != 0) inp->inp_flags2 |= INP_REUSEPORT; else inp->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(inp); error = 0; break; case SO_REUSEPORT_LB: INP_WLOCK(inp); if ((so->so_options & SO_REUSEPORT_LB) != 0) inp->inp_flags2 |= INP_REUSEPORT_LB; else inp->inp_flags2 &= ~INP_REUSEPORT_LB; INP_WUNLOCK(inp); error = 0; break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(inp); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } return (error); } switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { case IP_OPTIONS: #ifdef notyet case IP_RETOPTS: #endif { struct mbuf *m; if (sopt->sopt_valsize > MLEN) { error = EMSGSIZE; break; } m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; break; } m->m_len = sopt->sopt_valsize; error = sooptcopyin(sopt, mtod(m, char *), m->m_len, m->m_len); if (error) { m_free(m); break; } INP_WLOCK(inp); error = ip_pcbopts(inp, sopt->sopt_name, m); INP_WUNLOCK(inp); return (error); } case IP_BINDANY: if (sopt->sopt_td != NULL) { error = priv_check(sopt->sopt_td, PRIV_NETINET_BINDANY); if (error) break; } /* FALLTHROUGH */ case IP_BINDMULTI: #ifdef RSS case IP_RSS_LISTEN_BUCKET: #endif case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: case IP_RECVFLOWID: #ifdef RSS case IP_RECVRSSBUCKETID: #endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (sopt->sopt_name) { case IP_TOS: inp->inp_ip_tos = optval; break; case IP_TTL: inp->inp_ip_ttl = optval; break; case IP_MINTTL: if (optval >= 0 && optval <= MAXTTL) inp->inp_ip_minttl = optval; else error = EINVAL; break; #define OPTSET(bit) do { \ INP_WLOCK(inp); \ if (optval) \ inp->inp_flags |= bit; \ else \ inp->inp_flags &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(inp); \ if (val) \ inp->inp_flags2 |= bit; \ else \ inp->inp_flags2 &= ~bit; \ INP_WUNLOCK(inp); \ } while (0) case IP_RECVOPTS: OPTSET(INP_RECVOPTS); break; case IP_RECVRETOPTS: OPTSET(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: OPTSET(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IP_RECVTTL: OPTSET(INP_RECVTTL); break; case IP_RECVIF: OPTSET(INP_RECVIF); break; case IP_ONESBCAST: OPTSET(INP_ONESBCAST); break; case IP_DONTFRAG: OPTSET(INP_DONTFRAG); break; case IP_BINDANY: OPTSET(INP_BINDANY); break; case IP_RECVTOS: OPTSET(INP_RECVTOS); break; case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; case IP_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { inp->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; case IP_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif } break; #undef OPTSET #undef OPTSET2 /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: case IP_ADD_SOURCE_MEMBERSHIP: case IP_DROP_SOURCE_MEMBERSHIP: case IP_BLOCK_SOURCE: case IP_UNBLOCK_SOURCE: case IP_MSFILTER: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(inp); switch (optval) { case IP_PORTRANGE_DEFAULT: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags &= ~(INP_HIGHPORT); break; case IP_PORTRANGE_HIGH: inp->inp_flags &= ~(INP_LOWPORT); inp->inp_flags |= INP_HIGHPORT; break; case IP_PORTRANGE_LOW: inp->inp_flags &= ~(INP_HIGHPORT); inp->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(inp); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (sopt->sopt_name) { case IP_OPTIONS: case IP_RETOPTS: INP_RLOCK(inp); if (inp->inp_options) { struct mbuf *options; options = m_copym(inp->inp_options, 0, M_COPYALL, M_NOWAIT); INP_RUNLOCK(inp); if (options != NULL) { error = sooptcopyout(sopt, mtod(options, char *), options->m_len); m_freem(options); } else error = ENOMEM; } else { INP_RUNLOCK(inp); sopt->sopt_valsize = 0; } break; case IP_TOS: case IP_TTL: case IP_MINTTL: case IP_RECVOPTS: case IP_RECVRETOPTS: case IP_ORIGDSTADDR: case IP_RECVDSTADDR: case IP_RECVTTL: case IP_RECVIF: case IP_PORTRANGE: case IP_ONESBCAST: case IP_DONTFRAG: case IP_BINDANY: case IP_RECVTOS: case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { case IP_TOS: optval = inp->inp_ip_tos; break; case IP_TTL: optval = inp->inp_ip_ttl; break; case IP_MINTTL: optval = inp->inp_ip_minttl; break; #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0) case IP_RECVOPTS: optval = OPTBIT(INP_RECVOPTS); break; case IP_RECVRETOPTS: optval = OPTBIT(INP_RECVRETOPTS); break; case IP_RECVDSTADDR: optval = OPTBIT(INP_RECVDSTADDR); break; case IP_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IP_RECVTTL: optval = OPTBIT(INP_RECVTTL); break; case IP_RECVIF: optval = OPTBIT(INP_RECVIF); break; case IP_PORTRANGE: if (inp->inp_flags & INP_HIGHPORT) optval = IP_PORTRANGE_HIGH; else if (inp->inp_flags & INP_LOWPORT) optval = IP_PORTRANGE_LOW; else optval = 0; break; case IP_ONESBCAST: optval = OPTBIT(INP_ONESBCAST); break; case IP_DONTFRAG: optval = OPTBIT(INP_DONTFRAG); break; case IP_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IP_RECVTOS: optval = OPTBIT(INP_RECVTOS); break; case IP_FLOWID: optval = inp->inp_flowid; break; case IP_FLOWTYPE: optval = inp->inp_flowtype; break; case IP_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, inp->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IP_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; /* * Multicast socket options are processed by the in_mcast * module. */ case IP_MULTICAST_IF: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_MSFILTER: error = inp_getmoptions(inp, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IP_IPSEC_POLICY: if (IPSEC_ENABLED(ipv4)) { error = IPSEC_PCBCTL(ipv4, inp, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } return (error); } /* * Routine called from ip_output() to loop back a copy of an IP multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be a loopback interface -- evil, but easier than * replicating that code here. */ static void ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen) { struct ip *ip; struct mbuf *copym; /* * Make a deep copy of the packet because we're going to * modify the pack in order to generate checksums. */ copym = m_dup(m, M_NOWAIT); if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen)) copym = m_pullup(copym, hlen); if (copym != NULL) { /* If needed, compute the checksum and mark it as valid. */ if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { in_delayed_cksum(copym); copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } /* * We don't bother to fragment if the IP length is greater * than the interface's MTU. Can this possibly matter? */ ip = mtod(copym, struct ip *); ip->ip_sum = 0; ip->ip_sum = in_cksum(copym, hlen); if_simloop(ifp, copym, AF_INET, 0); } } Index: head/sys/netinet6/ip6_output.c =================================================================== --- head/sys/netinet6/ip6_output.c (revision 348253) +++ head/sys/netinet6/ip6_output.c (revision 348254) @@ -1,3160 +1,3173 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_ipsec.h" #include "opt_sctp.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #endif #include #include extern int in6_mcast_loop; struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, struct ucred *, int); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *); static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, struct ucred *, int, int, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct route_in6 *, int, struct ifnet *, const struct in6_addr *, u_long *, int *, u_int, u_int); static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, u_long *, int *, u_int); static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); /* * Make an extension header from option data. hp is the source, and * mp is the destination. */ #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (/*CONSTCOND*/ 0) /* * Form a chain of extension headers. * m is the extension header mbuf * mp is the previous mbuf in the chain * p is the next header * i is the type of option. */ #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset) { u_short csum; csum = in_cksum_skip(m, offset + plen, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(csum) > m->m_len) m_copyback(m, offset, sizeof(csum), (caddr_t)&csum); else *(u_short *)mtodo(m, offset) = csum; } int ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, int fraglen , uint32_t id) { struct mbuf *m, **mnext, *m_frgpart; struct ip6_hdr *ip6, *mhip6; struct ip6_frag *ip6f; int off; int error; int tlen = m0->m_pkthdr.len; KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8")); m = m0; ip6 = mtod(m, struct ip6_hdr *); mnext = &m->m_nextpkt; for (off = hlen; off < tlen; off += fraglen) { m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } /* * Make sure the complete packet header gets copied * from the originating mbuf to the newly created * mbuf. This also ensures that existing firewall * classification(s), VLAN tags and so on get copied * to the resulting fragmented packet(s): */ if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) { m_free(m); IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6STAT_INC(ip6s_odropped); return (error); } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + fraglen >= tlen) fraglen = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(fraglen + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m_cat(m, m_frgpart); m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f); ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; IP6STAT_INC(ip6s_ofragments); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } return (0); } +static int +ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp, + struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro) +{ + struct m_snd_tag *mst; + int error; + + MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); + mst = NULL; + +#ifdef RATELIMIT + if (inp != NULL) { + if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 || + (inp->inp_snd_tag != NULL && + inp->inp_snd_tag->ifp != ifp)) + in_pcboutput_txrtlmt(inp, ifp, m); + + if (inp->inp_snd_tag != NULL) + mst = inp->inp_snd_tag; + } +#endif + if (mst != NULL) { + KASSERT(m->m_pkthdr.rcvif == NULL, + ("trying to add a send tag to a forwarded packet")); + if (mst->ifp != ifp) { + error = EAGAIN; + goto done; + } + + /* stamp send tag on mbuf */ + m->m_pkthdr.snd_tag = m_snd_tag_ref(mst); + m->m_pkthdr.csum_flags |= CSUM_SND_TAG; + } + + error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); + +done: + /* Check for route change invalidating send tags. */ +#ifdef RATELIMIT + if (error == EAGAIN) + in_pcboutput_eagain(inp); +#endif + return (error); +} + /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route_in6 ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_mtu. * * ifpp - XXX: just for statistics */ /* * XXX TODO: no flowid is assigned for outbound flows? */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev = NULL; int hlen, tlen, len; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, src_sa, dst_sa; struct in6_addr odst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr src0, dst0; u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; uint32_t id; if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { /* unconditionally set flowid */ m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } #ifdef NUMA m->m_pkthdr.numa_domain = inp->inp_numa_domain; #endif } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. * XXX: need scope argument. */ if (IPSEC_ENABLED(ipv6)) { if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) { if (error == EINPROGRESS) error = 0; goto done; } } #endif /* IPSEC */ bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ if (opt->ip6po_rthdr) { /* * Destination options header(1st part) * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If there is at least one extension header, * separate IP6 header from the payload. */ if (optlen && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ u_char *nexthdrp = &ip6->ip6_nxt; mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); /* * If there is a routing header, discard the packet. */ if (exthdrs.ip6e_rthdr) { error = EINVAL; goto bad; } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } IP6STAT_INC(ip6s_localout); /* * Route packet. */ if (ro == NULL) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } /* * Validate route against routing table additions; * a better/more specific route might have been added. * Make sure address family is set in route. */ if (inp) { ro->ro_dst.sin6_family = AF_INET6; RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); } if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) && ro->ro_dst.sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { rt = ro->ro_rt; ifp = ro->ro_rt->rt_ifp; } else { if (ro->ro_lle) LLE_FREE(ro->ro_lle); /* zeros ro_lle */ ro->ro_lle = NULL; if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp, &rt, fibnum); if (error != 0) { if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); counter_u64_add(rt->rt_pksent, 1); } /* Setup data structures for scope ID checks. */ src0 = ip6->ip6_src; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; dst0 = ip6->ip6_dst; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; /* Check for valid scope ID. */ if (in6_setscope(&src0, ifp, &zone) == 0 && sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id && in6_setscope(&dst0, ifp, &zone) == 0 && sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) { /* * The outgoing interface is in the zone of the source * and destination addresses. * * Because the loopback interface cannot receive * packets with a different scope ID than its own, * there is a trick is to pretend the outgoing packet * was received by the real network interface, by * setting "origifp" different from "ifp". This is * only allowed when "ifp" is a loopback network * interface. Refer to code in nd6_output_ifp() for * more details. */ origifp = ifp; /* * We should use ia_ifp to support the case of sending * packets to an address of our own. */ if (ia != NULL && ia->ia_ifp) ifp = ia->ia_ifp; } else if ((ifp->if_flags & IFF_LOOPBACK) == 0 || sa6_recoverscope(&src_sa) != 0 || sa6_recoverscope(&dst_sa) != 0 || dst_sa.sin6_scope_id == 0 || (src_sa.sin6_scope_id != 0 && src_sa.sin6_scope_id != dst_sa.sin6_scope_id) || (origifp = ifnet_byindex(dst_sa.sin6_scope_id)) == NULL) { /* * If the destination network interface is not a * loopback interface, or the destination network * address has no scope ID, or the source address has * a scope ID set which is different from the * destination address one, or there is no network * interface representing this scope ID, the address * pair is considered invalid. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(ifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; } /* All scope ID checks are successful. */ if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) dst = (struct sockaddr_in6 *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. */ if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details. */ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ switch (pfil_run_hooks(V_inet6_pfil_head, &m, ifp, PFIL_OUT, inp)) { case PFIL_PASS: ip6 = mtod(m, struct ip6_hdr *); break; case PFIL_DROPPED: error = EPERM; /* FALLTHROUGH */ case PFIL_CONSUMED: goto done; } needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else { RO_INVALIDATE_CACHE(ro); needfiblookup = 1; /* Redo the routing table lookup. */ } } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_INVALIDATE_CACHE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? */ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in6 *)&ro->ro_dst; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. */ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif m->m_pkthdr.csum_flags &= ifp->if_hwassist; tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. Also return an * error code (this is not described in the API spec). */ if (inp != NULL) ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } -#ifdef RATELIMIT - if (inp != NULL) { - if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) - in_pcboutput_txrtlmt(inp, ifp, m); - /* stamp send tag on mbuf */ - m->m_pkthdr.snd_tag = inp->inp_snd_tag; - } else { - m->m_pkthdr.snd_tag = NULL; - } -#endif - error = nd6_output_ifp(ifp, origifp, m, dst, - (struct route *)ro); -#ifdef RATELIMIT - /* check for route change */ - if (error == EAGAIN) - in_pcboutput_eagain(inp); -#endif + error = ip6_output_send(inp, ifp, origifp, m, dst, ro); goto done; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. * XXX-BZ handle the hw offloading case. Need flags. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto * chain. */ m0 = m; id = htonl(ip6_randomid()); if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id))) goto sendorfree; in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove leading garbages. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } -#ifdef RATELIMIT - if (inp != NULL) { - if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) - in_pcboutput_txrtlmt(inp, ifp, m); - /* stamp send tag on mbuf */ - m->m_pkthdr.snd_tag = inp->inp_snd_tag; - } else { - m->m_pkthdr.snd_tag = NULL; - } -#endif - error = nd6_output_ifp(ifp, origifp, m, dst, - (struct route *)ro); -#ifdef RATELIMIT - /* check for route change */ - if (error == EAGAIN) - in_pcboutput_eagain(inp); -#endif + error = ip6_output_send(inp, ifp, origifp, m, dst, ro); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: if (ro == &ip6route) RO_RTFREE(ro); return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == NULL) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == NULL) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } /* * Calculates IPv6 path mtu for destination @dst. * Resulting MTU is stored in @mtup. * * Returns 0 on success. */ static int ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup) { struct nhop6_extended nh6; struct in6_addr kdst; uint32_t scopeid; struct ifnet *ifp; u_long mtu; int error; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6) != 0) return (EHOSTUNREACH); ifp = nh6.nh_ifp; mtu = nh6.nh_mtu; error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0); fib6_free_nh_ext(fibnum, &nh6); return (error); } /* * Calculates IPv6 path MTU for @dst based on transmit @ifp, * and cached data in @ro_pmtu. * MTU from (successful) route lookup is saved (along with dst) * inside @ro_pmtu to avoid subsequent route lookups after packet * filter processing. * * Stores mtu and always-frag value into @mtup and @alwaysfragp. * Returns 0 on success. */ static int ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup, int *alwaysfragp, u_int fibnum, u_int proto) { struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; struct sockaddr_in6 *sa6_dst; u_long mtu; mtu = 0; if (do_lookup) { /* * Here ro_pmtu has final destination address, while * ro might represent immediate destination. * Use ro_pmtu destination since mtu might differ. */ sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst)) ro_pmtu->ro_mtu = 0; if (ro_pmtu->ro_mtu == 0) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0, &nh6) == 0) ro_pmtu->ro_mtu = nh6.nh_mtu; } mtu = ro_pmtu->ro_mtu; } if (ro_pmtu->ro_rt) mtu = ro_pmtu->ro_rt->rt_mtu; return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto)); } /* * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and * hostcache data for @dst. * Stores mtu and always-frag value into @mtup and @alwaysfragp. * * Returns 0 on success. */ static int ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, u_long *mtup, int *alwaysfragp, u_int proto) { u_long mtu = 0; int alwaysfrag = 0; int error = 0; if (rt_mtu > 0) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; ifmtu = IN6_LINKMTU(ifp); /* TCP is known to react to pmtu changes so skip hc */ if (proto != IPPROTO_TCP) mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, rt_mtu); else mtu = rt_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with framgent header attached. * (fragment header is needed regardless from the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif /* * Don't use more than a quarter of mbuf clusters. N.B.: * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow * on LP64 architectures, so cast to u_long to avoid undefined * behavior. ILP32 architectures cannot have nmbclusters * large enough to overflow for other reasons. */ #define IPV6_PKTOPTIONS_MBUF_LIMIT ((u_long)nmbclusters * MCLBYTES / 4) level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEADDR) != 0) in6p->inp_flags2 |= INP_REUSEADDR; else in6p->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(in6p); error = 0; break; case SO_REUSEPORT: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEPORT) != 0) in6p->inp_flags2 |= INP_REUSEPORT; else in6p->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(in6p); error = 0; break; case SO_REUSEPORT_LB: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEPORT_LB) != 0) in6p->inp_flags2 |= INP_REUSEPORT_LB; else in6p->inp_flags2 &= ~INP_REUSEPORT_LB; INP_WUNLOCK(in6p); error = 0; break; case SO_SETFIB: INP_WLOCK(in6p); in6p->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(in6p); error = 0; break; case SO_MAX_PACING_RATE: #ifdef RATELIMIT INP_WLOCK(in6p); in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED; INP_WUNLOCK(in6p); error = 0; #else error = EOPNOTSUPP; #endif break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) { printf("ip6_ctloutput: mbuf limit hit\n"); error = ENOBUFS; break; } error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options, might require special * privilege. That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RECVRSSBUCKETID: #endif case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_ORIGDSTADDR: case IPV6_BINDANY: case IPV6_BINDMULTI: #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: #endif if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->inp_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(in6p); \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(in6p); \ in6p->inp_flags |= IN6P_RFC2292; \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) #define OPTSET2_N(bit, val) do { \ if (val) \ in6p->inp_flags2 |= bit; \ else \ in6p->inp_flags2 &= ~bit; \ } while (0) #define OPTSET2(bit, val) do { \ INP_WLOCK(in6p); \ OPTSET2_N(bit, val); \ INP_WUNLOCK(in6p); \ } while (0) #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0) #define OPTSET2292_EXCLUSIVE(bit) \ do { \ INP_WLOCK(in6p); \ if (OPTBIT(IN6P_RFC2292)) { \ error = EINVAL; \ } else { \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ } \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) case IPV6_RECVPKTINFO: OPTSET2292_EXCLUSIVE(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } INP_WLOCK(in6p); if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(in6p); return (ECONNRESET); } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(in6p); break; } case IPV6_RECVHOPLIMIT: OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: OPTSET2292_EXCLUSIVE(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) */ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IPV6_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (in6p->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->inp_vflag &= ~INP_IPV4; else in6p->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ OPTSET2292_EXCLUSIVE(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_ORIGDSTADDR: OPTSET2(INP_ORIGDSTADDR, optval); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { INP_WLOCK(in6p); in6p->inp_rss_listen_bucket = optval; OPTSET2_N(INP_RSS_BUCKET_SET, 1); INP_WUNLOCK(in6p); } else { error = EINVAL; } break; #endif } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; INP_WLOCK(in6p); if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(in6p); return (ECONNRESET); } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(in6p); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; INP_WLOCK(in6p); if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(in6p); return (ECONNRESET); } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? td->td_ucred : NULL, uproto); INP_WUNLOCK(in6p); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(in6p, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(in6p); switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->inp_flags &= ~(INP_HIGHPORT); in6p->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(in6p); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { error = IPSEC_PCBCTL(ipv6, in6p, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. */ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RSSBUCKETID: case IPV6_RECVRSSBUCKETID: #endif case IPV6_BINDMULTI: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_ORIGDSTADDR: optval = OPTBIT2(INP_ORIGDSTADDR); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = in6p->inp_flowid; break; case IPV6_FLOWTYPE: optval = in6p->inp_flowtype; break; case IPV6_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(in6p->inp_flowid, in6p->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IPV6_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); break; #endif case IPV6_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; struct in6_addr addr; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we dot not consider the case of source * routing, or optional information to specify * the outgoing interface. * Copy faddr out of in6p to avoid holding lock * on inp during route lookup. */ INP_RLOCK(in6p); bcopy(&in6p->in6p_faddr, &addr, sizeof(addr)); INP_RUNLOCK(in6p); error = ip6_getpmtu_ctl(so->so_fibnum, &addr, &pmtu); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(in6p, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(in6p, sopt); break; #if defined(IPSEC) || defined(IPSEC_SUPPORT) case IPV6_IPSEC_POLICY: if (IPSEC_ENABLED(ipv6)) { error = IPSEC_PCBCTL(ipv6, in6p, sopt); break; } /* FALLTHROUGH */ #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *in6p = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification allowed for checksum * offset, permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542. */ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if (optval < -1 || (optval % 2) != 0) { /* * The API assumes non-negative even offset * values or -1 as a special value. */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else in6p->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = in6p->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_NOWAIT); if (*pktopt == NULL) return (ENOBUFS); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } #define GET_PKTOPT_VAR(field, lenexpr) do { \ if (pktopt && pktopt->field) { \ INP_RUNLOCK(in6p); \ optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK); \ malloc_optdata = true; \ INP_RLOCK(in6p); \ if (in6p->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ INP_RUNLOCK(in6p); \ free(optdata, M_TEMP); \ return (ECONNRESET); \ } \ pktopt = in6p->in6p_outputopts; \ if (pktopt && pktopt->field) { \ optdatalen = min(lenexpr, sopt->sopt_valsize); \ bcopy(&pktopt->field, optdata, optdatalen); \ } else { \ free(optdata, M_TEMP); \ optdata = NULL; \ malloc_optdata = false; \ } \ } \ } while(0) #define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field, \ (((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3) #define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field, \ pktopt->field->sa_len) static int ip6_getpcbopt(struct inpcb *in6p, int optname, struct sockopt *sopt) { void *optdata = NULL; bool malloc_optdata = false; int optdatalen = 0; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; struct ip6_pktopts *pktopt; INP_RLOCK(in6p); pktopt = in6p->in6p_outputopts; switch (optname) { case IPV6_PKTINFO: optdata = (void *)&null_pktinfo; if (pktopt && pktopt->ip6po_pktinfo) { bcopy(pktopt->ip6po_pktinfo, &null_pktinfo, sizeof(null_pktinfo)); in6_clearscope(&null_pktinfo.ipi6_addr); } else { /* XXX: we don't have to do this every time... */ bzero(&null_pktinfo, sizeof(null_pktinfo)); } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) deftclass = pktopt->ip6po_tclass; optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: GET_PKTOPT_EXT_HDR(ip6po_hbh); break; case IPV6_RTHDR: GET_PKTOPT_EXT_HDR(ip6po_rthdr); break; case IPV6_RTHDRDSTOPTS: GET_PKTOPT_EXT_HDR(ip6po_dest1); break; case IPV6_DSTOPTS: GET_PKTOPT_EXT_HDR(ip6po_dest2); break; case IPV6_NEXTHOP: GET_PKTOPT_SOCKADDR(ip6po_nexthop); break; case IPV6_USE_MIN_MTU: if (pktopt) defminmtu = pktopt->ip6po_minmtu; optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) defpreftemp = pktopt->ip6po_prefer_tempaddr; optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif INP_RUNLOCK(in6p); return (ENOPROTOOPT); } INP_RUNLOCK(in6p); error = sooptcopyout(sopt, optdata, optdatalen); if (malloc_optdata) free(optdata, M_TEMP); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_rt) { RTFREE(pktopt->ip6po_nextroute.ro_rt); pktopt->ip6po_nextroute.ro_rt = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = NULL; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542. Conversely, * RFC3542 types should not be specified in the context of RFC2292. */ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr)) return (EINVAL); /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > V_if_index) return (ENXIO); if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)) return (ENETDOWN); if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { struct in6_ifaddr *ia; in6_setscope(&pktinfo->ipi6_addr, ifp, NULL); ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr); if (ia == NULL) return (EADDRNOTAVAIL); ifa_free(&ia->ia_ifa); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. */ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */ error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advacned API is ambiguous on this point. * Our approach is to determine the position based * according to the existence of a routing header. * Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types. */ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m) { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copym(m, 0, M_COPYALL, M_NOWAIT); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if (!M_WRITABLE(copym) || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } if_simloop(ifp, copym, AF_INET6, 0); } /* * Chop IPv6 header off from the payload. */ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); M_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *in6p) { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen } Index: head/sys/netpfil/ipfw/ip_fw2.c =================================================================== --- head/sys/netpfil/ipfw/ip_fw2.c (revision 348253) +++ head/sys/netpfil/ipfw/ip_fw2.c (revision 348254) @@ -1,3491 +1,3491 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #endif #include /* for struct grehdr */ #include #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. */ VNET_DEFINE_STATIC(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) VNET_DEFINE_STATIC(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); VNET_DEFINE(unsigned int, fw_tables_sets) = 0; /* Don't use set-aware tables */ /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; #ifndef LINEAR_SKIPTO static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_fast(ch, f, num, targ, back) #else static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards); #define JUMP(ch, f, num, targ, back) jump_linear(ch, f, num, targ, back) #endif /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) */ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); /* ipfw_vnet_ready controls when we are open for business */ VNET_DEFINE(int, ipfw_vnet_ready) = 0; VNET_DEFINE(int, ipfw_nat_ready) = 0; ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of concurrently used tables"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_tables_sets, "IU", "Use per-set namespace for tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. */ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, match fails */ return (0); /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table(chain, cmd->p.kidx, 0, &ifp->if_index, tablearg); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { if_addr_runlock(ifp); return(1); /* match */ } } if_addr_runlock(ifp); #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. */ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else struct nhop4_basic nh4; if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0) return (0); /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ifp != nh4.nh_ifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh4.nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh4.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; #endif /* __FreeBSD__ */ } /* * Generate an SCTP packet containing an ABORT chunk. The verification tag * is given by vtag. The T-bit is set in the ABORT chunk if and only if * reflected is not 0. */ static struct mbuf * ipfw_send_abort(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t vtag, int reflected) { struct mbuf *m; struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; #endif struct sctphdr *sctp; struct sctp_chunkhdr *chunk; u_int16_t hlen, plen, tlen; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: hlen = sizeof(struct ip); break; #ifdef INET6 case 6: hlen = sizeof(struct ip6_hdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } plen = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); tlen = hlen + plen; m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = tlen; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, tlen); switch (id->addr_type) { case 4: ip = mtod(m, struct ip *); ip->ip_v = 4; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_tos = IPTOS_LOWDELAY; ip->ip_len = htons(tlen); ip->ip_id = htons(0); ip->ip_off = htons(0); ip->ip_ttl = V_ip_defttl; ip->ip_p = IPPROTO_SCTP; ip->ip_sum = 0; ip->ip_src.s_addr = htonl(id->dst_ip); ip->ip_dst.s_addr = htonl(id->src_ip); sctp = (struct sctphdr *)(ip + 1); break; #ifdef INET6 case 6: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_plen = htons(plen); ip6->ip6_nxt = IPPROTO_SCTP; ip6->ip6_hlim = IPV6_DEFHLIM; ip6->ip6_src = id->dst_ip6; ip6->ip6_dst = id->src_ip6; sctp = (struct sctphdr *)(ip6 + 1); break; #endif } sctp->src_port = htons(id->dst_port); sctp->dest_port = htons(id->src_port); sctp->v_tag = htonl(vtag); sctp->checksum = htonl(0); chunk = (struct sctp_chunkhdr *)(sctp + 1); chunk->chunk_type = SCTP_ABORT_ASSOCIATION; chunk->chunk_flags = 0; if (reflected != 0) { chunk->chunk_flags |= SCTP_HAD_NO_TCB; } chunk->chunk_length = htons(sizeof(struct sctp_chunkhdr)); sctp->checksum = sctp_calculate_cksum(m, hlen); return (m); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. */ struct mbuf * ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) { struct mbuf *m = NULL; /* stupid compiler */ struct ip *h = NULL; /* stupid compiler */ #ifdef INET6 struct ip6_hdr *h6 = NULL; #endif struct tcphdr *th = NULL; int len, dir; MGETHDR(m, M_NOWAIT, MT_DATA); if (m == NULL) return (NULL); M_SETFIB(m, id->fib); #ifdef MAC if (replyto != NULL) mac_netinet_firewall_reply(replyto, m); else mac_netinet_firewall_send(m); #else (void)replyto; /* don't warn about unused arg */ #endif switch (id->addr_type) { case 4: len = sizeof(struct ip) + sizeof(struct tcphdr); break; #ifdef INET6 case 6: len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); break; #endif default: /* XXX: log me?!? */ FREE_PKT(m); return (NULL); } dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN); m->m_data += max_linkhdr; m->m_flags |= M_SKIP_FIREWALL; m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; bzero(m->m_data, len); switch (id->addr_type) { case 4: h = mtod(m, struct ip *); /* prepare for checksum */ h->ip_p = IPPROTO_TCP; h->ip_len = htons(sizeof(struct tcphdr)); if (dir) { h->ip_src.s_addr = htonl(id->src_ip); h->ip_dst.s_addr = htonl(id->dst_ip); } else { h->ip_src.s_addr = htonl(id->dst_ip); h->ip_dst.s_addr = htonl(id->src_ip); } th = (struct tcphdr *)(h + 1); break; #ifdef INET6 case 6: h6 = mtod(m, struct ip6_hdr *); /* prepare for checksum */ h6->ip6_nxt = IPPROTO_TCP; h6->ip6_plen = htons(sizeof(struct tcphdr)); if (dir) { h6->ip6_src = id->src_ip6; h6->ip6_dst = id->dst_ip6; } else { h6->ip6_src = id->dst_ip6; h6->ip6_dst = id->src_ip6; } th = (struct tcphdr *)(h6 + 1); break; #endif } if (dir) { th->th_sport = htons(id->src_port); th->th_dport = htons(id->dst_port); } else { th->th_sport = htons(id->dst_port); th->th_dport = htons(id->src_port); } th->th_off = sizeof(struct tcphdr) >> 2; if (flags & TH_RST) { if (flags & TH_ACK) { th->th_seq = htonl(ack); th->th_flags = TH_RST; } else { if (flags & TH_SYN) seq++; th->th_ack = htonl(seq); th->th_flags = TH_RST | TH_ACK; } } else { /* * Keepalive - use caller provided sequence numbers */ th->th_seq = htonl(seq); th->th_ack = htonl(ack); th->th_flags = TH_ACK; } switch (id->addr_type) { case 4: th->th_sum = in_cksum(m, len); /* finish the ip header */ h->ip_v = 4; h->ip_hl = sizeof(*h) >> 2; h->ip_tos = IPTOS_LOWDELAY; h->ip_off = htons(0); h->ip_len = htons(len); h->ip_ttl = V_ip_defttl; h->ip_sum = 0; break; #ifdef INET6 case 6: th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6), sizeof(struct tcphdr)); /* finish the ip6 header */ h6->ip6_vfc |= IPV6_VERSION; h6->ip6_hlim = IPV6_DEFHLIM; break; #endif } return (m); } #ifdef INET6 /* * ipv6 specific rules here... */ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static const struct in6_addr lla_mask = {{{ 0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }}}; static int ipfw_localip6(struct in6_addr *in6) { struct rm_priotracker in6_ifa_tracker; struct in6_ifaddr *ia; if (IN6_IS_ADDR_MULTICAST(in6)) return (0); if (!IN6_IS_ADDR_LINKLOCAL(in6)) return (in6_localip(in6)); IN6_IFADDR_RLOCK(&in6_ifa_tracker); CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (!IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr, in6, &lla_mask)) { IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (1); } } IN6_IFADDR_RUNLOCK(&in6_ifa_tracker); return (0); } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { struct nhop6_basic nh6; if (IN6_IS_SCOPE_LINKLOCAL(src)) return (1); if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0) return (0); /* If ifp is provided, check for equality with route table. */ if (ifp != NULL && ifp != nh6.nh_ifp) return (0); /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && (nh6.nh_flags & NHF_DEFAULT) != 0) return (0); /* or if this is a blackhole/reject route */ if (ifp == NULL && (nh6.nh_flags & (NHF_REJECT|NHF_BLACKHOLE)) != 0) return (0); /* found valid route */ return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static int map_icmp_unreach(int code) { /* RFC 7915 p4.2 */ switch (code) { case ICMP_UNREACH_NET: case ICMP_UNREACH_HOST: case ICMP_UNREACH_SRCFAIL: case ICMP_UNREACH_NET_UNKNOWN: case ICMP_UNREACH_HOST_UNKNOWN: case ICMP_UNREACH_TOSNET: case ICMP_UNREACH_TOSHOST: return (ICMP6_DST_UNREACH_NOROUTE); case ICMP_UNREACH_PORT: return (ICMP6_DST_UNREACH_NOPORT); default: /* * Map the rest of codes into admit prohibited. * XXX: unreach proto should be mapped into ICMPv6 * parameter problem, but we use only unreach type. */ return (ICMP6_DST_UNREACH_ADMIN); } } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code == ICMP6_UNREACH_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m0; struct sctphdr *sctp; u_int32_t v_tag; int reflected; sctp = (struct sctphdr *)((char *)ip6 + hlen); reflected = 1; v_tag = ntohl(sctp->v_tag); /* Investigate the first chunk header if available */ if (m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { struct sctp_chunkhdr *chunk; chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (m->m_pkthdr.len > hlen + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((m->m_len >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { struct sctp_init *init; init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but don't do to avoid attacks. */ v_tag = 0; break; } } if (v_tag == 0) { m0 = NULL; } else { m0 = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST && code != ICMP6_UNREACH_ABORT) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. */ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST && code != ICMP_REJECT_ABORT) { /* Send an ICMP unreach */ icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (code == ICMP_REJECT_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else if (code == ICMP_REJECT_ABORT && args->f_id.proto == IPPROTO_SCTP) { struct mbuf *m; struct sctphdr *sctp; struct sctp_chunkhdr *chunk; struct sctp_init *init; u_int32_t v_tag; int reflected; sctp = L3HDR(struct sctphdr, mtod(args->m, struct ip *)); reflected = 1; v_tag = ntohl(sctp->v_tag); if (iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr)) { /* Look at the first chunk header if available */ chunk = (struct sctp_chunkhdr *)(sctp + 1); switch (chunk->chunk_type) { case SCTP_INITIATION: /* * Packets containing an INIT chunk MUST have * a zero v-tag. */ if (v_tag != 0) { v_tag = 0; break; } /* INIT chunk MUST NOT be bundled */ if (iplen > (ip->ip_hl << 2) + sizeof(struct sctphdr) + ntohs(chunk->chunk_length) + 3) { break; } /* Use the initiate tag if available */ if ((iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd))) { init = (struct sctp_init *)(chunk + 1); v_tag = ntohl(init->initiate_tag); reflected = 0; } break; case SCTP_ABORT_ASSOCIATION: /* * If the packet contains an ABORT chunk, don't * reply. * XXX: We should search through all chunks, * but don't do to avoid attacks. */ v_tag = 0; break; } } if (v_tag == 0) { m = NULL; } else { m = ipfw_send_abort(args->m, &(args->f_id), v_tag, reflected); } if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. */ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { #if defined(USERSPACE) return 0; // not supported in userspace #else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, dst_ip, dst_port, src_ip, src_port, (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb); #else /* FreeBSD */ struct in_addr src_ip, dst_ip; struct inpcbinfo *pi; struct ipfw_flow_id *id; struct inpcb *pcb, *inp; int lookupflags; int match; id = &args->f_id; inp = args->inp; /* * Check to see if the UDP or TCP stack supplied us with * the PCB. If so, rather then holding a lock and looking * up the PCB, we can use the one that was supplied. */ if (inp && *ugid_lookupp == 0) { INP_LOCK_ASSERT(inp); if (inp->inp_socket != NULL) { *uc = crhold(inp->inp_cred); *ugid_lookupp = 1; } else *ugid_lookupp = -1; } /* * If we have already been here and the packet has no * PCB entry associated with it, then we can safely * assume that this is a no match. */ if (*ugid_lookupp == -1) return (0); if (id->proto == IPPROTO_TCP) { lookupflags = 0; pi = &V_tcbinfo; } else if (id->proto == IPPROTO_UDP) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_udbinfo; } else if (id->proto == IPPROTO_UDPLITE) { lookupflags = INPLOOKUP_WILDCARD; pi = &V_ulitecbinfo; } else return 0; lookupflags |= INPLOOKUP_RLOCKPCB; match = 0; if (*ugid_lookupp == 0) { if (id->addr_type == 6) { #ifdef INET6 if (args->flags & IPFW_ARGS_IN) pcb = in6_pcblookup_mbuf(pi, &id->src_ip6, htons(id->src_port), &id->dst_ip6, htons(id->dst_port), lookupflags, NULL, args->m); else pcb = in6_pcblookup_mbuf(pi, &id->dst_ip6, htons(id->dst_port), &id->src_ip6, htons(id->src_port), lookupflags, args->ifp, args->m); #else *ugid_lookupp = -1; return (0); #endif } else { src_ip.s_addr = htonl(id->src_ip); dst_ip.s_addr = htonl(id->dst_ip); if (args->flags & IPFW_ARGS_IN) pcb = in_pcblookup_mbuf(pi, src_ip, htons(id->src_port), dst_ip, htons(id->dst_port), lookupflags, NULL, args->m); else pcb = in_pcblookup_mbuf(pi, dst_ip, htons(id->dst_port), src_ip, htons(id->src_port), lookupflags, args->ifp, args->m); } if (pcb != NULL) { INP_RLOCK_ASSERT(pcb); *uc = crhold(pcb->inp_cred); *ugid_lookupp = 1; INP_RUNLOCK(pcb); } if (*ugid_lookupp == 0) { /* * We tried and failed, set the variable to -1 * so we will not try again on this packet. */ *ugid_lookupp = -1; return (0); } } if (insn->o.opcode == O_UID) match = ((*uc)->cr_uid == (uid_t)insn->d[0]); else if (insn->o.opcode == O_GID) match = groupmember((gid_t)insn->d[0], *uc); else if (insn->o.opcode == O_JAIL) match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ #endif /* not supported in userspace */ } /* * Helper function to set args with info on the rule after the matching * one. slot is precise, whereas we guess rule_id as they are * assigned sequentially. */ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; args->flags |= IPFW_ARGS_REF; } #ifndef LINEAR_SKIPTO /* * Helper function to enable cached rule lookups using * cached_id and cached_pos fields in ipfw rule. */ static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; /* If possible use cached f_pos (in f->cached_pos), * whose version is written in f->cached_id * (horrible hacks to avoid changing the ABI). */ if (num != IP_FW_TARG && f->cached_id == chain->id) f_pos = f->cached_pos; else { int i = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && i <= f->rulenum) i = f->rulenum + 1; if (chain->idxmap != NULL) f_pos = chain->idxmap[i]; else f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (num != IP_FW_TARG) { f->cached_id = chain->id; f->cached_pos = f_pos; } } return (f_pos); } #else /* * Helper function to enable real fast rule lookups. */ static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num, int tablearg, int jump_backwards) { int f_pos; num = IP_FW_ARG_TABLEARG(chain, num, skipto); /* make sure we do not jump backward */ if (jump_backwards == 0 && num <= f->rulenum) num = f->rulenum + 1; f_pos = chain->idxmap[num]; return (f_pos); } #endif #define TARG(k, f) IP_FW_ARG_TABLEARG(chain, k, f) /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->ifp Incoming or outgoing interface. * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * OR * args->mem Pointer to contigous memory chunk. * ip Is the beginning of the ip(4 or 6) header. * eh Ethernet header in case if input is Layer2. */ struct mbuf *m; struct ip *ip; struct ether_header *eh; /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; int f_pos = 0; /* index of current rule in the array */ int retval = 0; struct ifnet *oif, *iif; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragment (fragment header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). */ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port, dst_port; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ int iplen = 0; int pktlen; struct ipfw_dyn_info dyn_info; struct ip_fw *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ IPFW_RLOCK_TRACKER; bool mem; if ((mem = (args->flags & IPFW_ARGS_LENMASK))) { if (args->flags & IPFW_ARGS_ETHER) { eh = (struct ether_header *)args->mem; if (eh->ether_type == htons(ETHERTYPE_VLAN)) ip = (struct ip *) ((struct ether_vlan_header *)eh + 1); else ip = (struct ip *)(eh + 1); } else { eh = NULL; ip = (struct ip *)args->mem; } pktlen = IPFW_ARGS_LENGTH(args->flags); args->f_id.fib = args->ifp->if_fib; /* best guess */ } else { m = args->m; if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ if (args->flags & IPFW_ARGS_ETHER) { /* We need some amount of data to be contiguous. */ if (m->m_len < min(m->m_pkthdr.len, max_protohdr) && (args->m = m = m_pullup(m, min(m->m_pkthdr.len, max_protohdr))) == NULL) goto pullup_failed; eh = mtod(m, struct ether_header *); ip = (struct ip *)(eh + 1); } else { eh = NULL; ip = mtod(m, struct ip *); } pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* mbuf not altered */ } dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ src_port = dst_port = 0; DYN_INFO_INIT(&dyn_info); /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define EHLEN (eh != NULL ? ((char *)ip - (char *)eh) : 0) #define PULLUP_LEN(_len, p, T) \ do { \ int x = (_len) + T + EHLEN; \ if (mem) { \ MPASS(pktlen >= x); \ p = (char *)args->mem + (_len) + EHLEN; \ } else { \ if (__predict_false((m)->m_len < x)) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = mtod(m, char *) + (_len) + EHLEN; \ } \ } while (0) /* * In case pointers got stale after pullups, update them. */ #define UPDATE_POINTERS() \ do { \ if (!mem) { \ if (eh != NULL) { \ eh = mtod(m, struct ether_header *); \ ip = (struct ip *)(eh + 1); \ } else \ ip = mtod(m, struct ip *); \ args->m = m; \ } \ } while (0) /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (eh == NULL || eh->ether_type == htons(ETHERTYPE_IPV6)) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->flags |= IPFW_ARGS_IP6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? */ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_GRE: /* RFC 1701 */ /* XXX GRE header check? */ PULLUP_TO(hlen, ulp, struct grehdr); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, offsetof( struct carp_header, carp_counter)); if (CARP_ADVERTISEMENT != ((struct carp_header *)ulp)->carp_type) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } UPDATE_POINTERS(); ip6 = (struct ip6_hdr *)ip; args->f_id.addr_type = 6; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); iplen = ntohs(ip6->ip6_plen) + sizeof(*ip6); } else if (pktlen >= sizeof(struct ip) && (eh == NULL || eh->ether_type == htons(ETHERTYPE_IP)) && ip->ip_v == 4) { is_ipv4 = 1; args->flags |= IPFW_ARGS_IP4; hlen = ip->ip_hl << 2; /* * Collect parameters into local variables for faster * matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: if (pktlen >= hlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)); else if (pktlen >= hlen + sizeof(struct sctphdr)) PULLUP_LEN(hlen, ulp, pktlen - hlen); else PULLUP_LEN(hlen, ulp, sizeof(struct sctphdr)); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } UPDATE_POINTERS(); args->f_id.addr_type = 4; args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } else { proto = 0; dst_ip.s_addr = src_ip.s_addr = 0; args->f_id.addr_type = 1; /* XXX */ } #undef PULLUP_TO pktlen = iplen < pktlen ? iplen: pktlen; /* Properly initialize the rest of f_id */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); IPFW_PF_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_PF_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->flags & IPFW_ARGS_REF) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) * Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } if (args->flags & IPFW_ARGS_IN) { iif = args->ifp; oif = NULL; } else { MPASS(args->flags & IPFW_ARGS_OUT); - iif = mem ? NULL : m->m_pkthdr.rcvif; + iif = mem ? NULL : m_rcvif(m); oif = args->ifp; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. * * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. */ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: match = iface_match(iif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: match = iface_match(args->ifp, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->flags & IPFW_ARGS_ETHER) { u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->flags & IPFW_ARGS_ETHER) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (ntohs(eh->ether_type) >= p[0] && ntohs(eh->ether_type) <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->flags & IPFW_ARGS_ETHER); break; case O_DIVERTED: if ((args->flags & IPFW_ARGS_REF) == 0) break; /* * For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ match = ((args->rule.info & IPFW_IS_MASK) == IPFW_IS_DIVERT) && ( ((args->rule.info & IPFW_INFO_IN) ? 1: 2) & cmd->arg1); break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_DST_LOOKUP: { void *pkey; uint32_t vidx, key; uint16_t keylen; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* Determine lookup key type */ vidx = ((ipfw_insn_u32 *)cmd)->d[1]; if (vidx != 4 /* uid */ && vidx != 5 /* jail */ && is_ipv6 == 0 && is_ipv4 == 0) break; /* Determine key length */ if (vidx == 0 /* dst-ip */ || vidx == 1 /* src-ip */) keylen = is_ipv6 ? sizeof(struct in6_addr): sizeof(in_addr_t); else { keylen = sizeof(key); pkey = &key; } if (vidx == 0 /* dst-ip */) pkey = is_ipv4 ? (void *)&dst_ip: (void *)&args->f_id.dst_ip6; else if (vidx == 1 /* src-ip */) pkey = is_ipv4 ? (void *)&src_ip: (void *)&args->f_id.src_ip6; else if (vidx == 6 /* dscp */) { if (is_ipv4) key = ip->ip_tos >> 2; else { key = args->f_id.flow_id6; key = (key & 0x0f) << 2 | (key & 0xf000) >> 14; } key &= 0x3f; } else if (vidx == 2 /* dst-port */ || vidx == 3 /* src-port */) { /* Skip fragments */ if (offset != 0) break; /* Skip proto without ports */ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE && proto != IPPROTO_SCTP) break; if (vidx == 2 /* dst-port */) key = dst_port; else key = src_port; } #ifndef USERSPACE else if (vidx == 4 /* uid */ || vidx == 5 /* jail */) { check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); if (vidx == 4 /* uid */) key = ucred_cache->cr_uid; else if (vidx == 5 /* jail */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache); if (vidx == 4 /* uid */) key = ucred_cache.uid; else if (vidx == 5 /* jail */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ } #endif /* !USERSPACE */ else break; match = ipfw_lookup_table(chain, cmd->arg1, keylen, pkey, &vidx); if (!match) break; tablearg = vidx; break; } /* cmdlen =< F_INSN_SIZE(ipfw_insn_u32) */ /* FALLTHROUGH */ } case O_IP_SRC_LOOKUP: { void *pkey; uint32_t vidx; uint16_t keylen; if (is_ipv4) { keylen = sizeof(in_addr_t); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &dst_ip; else pkey = &src_ip; } else if (is_ipv6) { keylen = sizeof(struct in6_addr); if (cmd->opcode == O_IP_DST_LOOKUP) pkey = &args->f_id.dst_ip6; else pkey = &args->f_id.src_ip6; } else break; match = ipfw_lookup_table(chain, cmd->arg1, keylen, pkey, &vidx); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) { match = ((ipfw_insn_u32 *)cmd)->d[0] == TARG_VAL(chain, vidx, tag); if (!match) break; } tablearg = vidx; break; } case O_IP_FLOW_LOOKUP: { uint32_t v = 0; match = ipfw_lookup_table(chain, cmd->arg1, 0, &args->f_id, &v); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == TARG_VAL(chain, v, tag); if (match) tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { match = in_localip(src_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { match = in_localip(dst_ip); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match = is_ipv6 && ipfw_localip6(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE || proto == IPPROTO_TCP || proto == IPPROTO_SCTP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPTTL: if (!is_ipv4) break; case O_IPLEN: { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_DSCP: { uint32_t *p; uint16_t x; p = ((ipfw_insn_u32 *)cmd)->d; if (is_ipv4) x = ip->ip_tos >> 2; else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; x = (*v & 0x0F) << 2; v++; x |= *v >> 6; } else break; /* DSCP bitmask is stored as low_u32 high_u32 */ if (x >= 32) match = *(p + 1) & (1 << (x - 32)); else match = *p & (1 << x); } break; case O_TCPDATALEN: if (proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; #ifdef INET6 if (is_ipv6) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip; if (ip6->ip6_plen == 0) { /* * Jumbo payload is not * supported by this * opcode. */ break; } x = iplen - hlen; } else #endif /* INET6 */ x = iplen - (ip->ip_hl << 2); tcp = TCP(ulp); x -= tcp->th_off << 2; if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: if (proto == IPPROTO_TCP && offset == 0 && ulp){ PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2)); match = tcpopts_match(TCP(ulp), cmd); } break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; struct m_tag *mtag; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; /* * ALTQ uses mbuf tags from another * packet filtering system - pf(4). * We allocate a tag in its format * and fill it in, pretending to be pf(4). */ match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; mtag = m_tag_get(PACKET_TAG_PF, sizeof(struct pf_mtag), M_NOWAIT | M_ZERO); if (mtag == NULL) { /* * Let the packet fall back to the * default ALTQ. */ break; } m_tag_prepend(m, mtag); at = (struct pf_mtag *)(mtag + 1); at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(chain, f, hlen, args, offset | ip6f_mf, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = (args->flags & IPFW_ARGS_OUT || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), iif, args->f_id.fib) : #endif verify_path(src_ip, iif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib)))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), iif, args->f_id.fib) : #endif verify_path(src_ip, iif, args->f_id.fib); else match = 1; break; case O_IPSEC: match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { #ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; if (is_ipv6) /* XXX can we remove this ? */ break; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else if (proto == IPPROTO_UDPLITE) pi = &V_ulitecbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? */ /* For incoming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; INP_RUNLOCK(inp); } } else { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } } #endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = TARG(cmd->arg1, tag); if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule (one * exception is O_SKIP_ACTION which could be * between these opcodes and 'action' one). * These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. * * O_SKIP_ACTION: this opcode is not a real 'action' * either, and is stored right before the 'action' * part of the rule, right after the O_KEEP_STATE * opcode. It causes match failure so the real * 'action' could be executed only if the rule * is checked via dynamic rule from the state * table, as in such case execution starts * from the true 'action' opcode directly. * */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_dyn_install_state(chain, f, (ipfw_insn_limit *)cmd, args, ulp, pktlen, &dyn_info, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_info. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) && (q = ipfw_dyn_lookup_state(args, ulp, pktlen, cmd, &dyn_info)) != NULL) { /* * Found dynamic entry, jump to the * 'action' part of the parent rule * by setting f, cmd, l and clearing * cmdlen. */ f = q; f_pos = dyn_info.f_pos; cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. */ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_SKIP_ACTION: match = 0; /* skip to the next rule */ l = 0; /* exit inner loop */ break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, pipe); if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->flags & IPFW_ARGS_ETHER) break; /* not on layer 2 */ /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, divert); break; case O_COUNT: IPFW_INC_RULE_COUNTER(f, pktlen); l = 0; /* exit inner loop */ break; case O_SKIPTO: IPFW_INC_RULE_COUNTER(f, pktlen); f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0); /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). * We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. */ struct m_tag *mtag; uint16_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } if (mtag == NULL && IS_CALL) { mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, IPFW_CALLSTACK_SIZE * sizeof(uint16_t), M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } /* * On error both `call' and `return' just * continue with next rule. */ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } if (IS_CALL && (mtag == NULL || mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { printf("ipfw: call stack error, " "go to next rule\n"); l = 0; /* exit inner loop */ break; } IPFW_INC_RULE_COUNTER(f, pktlen); stack = (uint16_t *)(mtag + 1); /* * The `call' action may use cached f_pos * (in f->next_rule), whose version is written * in f->next_rule. * The `return' action, however, doesn't have * fixed jump address in cmd->arg1 and can't use * cache. */ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; f_pos = JUMP(chain, f, cmd->arg1, tablearg, 1); } else { /* `return' action */ mtag->m_tag_id--; jmpto = stack[mtag->m_tag_id] + 1; f_pos = ipfw_find_rule(chain, jmpto, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the dest rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST( &args->f_id.dst_ip6)) { send_reject6(args, cmd->opcode == O_REJECT ? map_icmp_unreach(cmd->arg1): cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { #ifdef INET6 /* * We use O_FORWARD_IP opcode for * fwd rule with tablearg, but tables * now support IPv6 addresses. And * when we are inspecting IPv6 packet, * we can use nh6 field from * table_value as next_hop6 address. */ if (is_ipv6) { struct ip_fw_nh6 *nh6; args->flags |= IPFW_ARGS_NH6; nh6 = &args->hopstore6; nh6->sin6_addr = TARG_VAL( chain, tablearg, nh6); nh6->sin6_port = sa->sin_port; nh6->sin6_scope_id = TARG_VAL( chain, tablearg, zoneid); } else #endif { args->flags |= IPFW_ARGS_NH4; args->hopstore.sin_port = sa->sin_port; sa = &args->hopstore; sa->sin_family = AF_INET; sa->sin_len = sizeof(*sa); sa->sin_addr.s_addr = htonl( TARG_VAL(chain, tablearg, nh4)); } } else { args->flags |= IPFW_ARGS_NH4PTR; args->next_hop = sa; } } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->flags & IPFW_ARGS_ETHER) break; /* not valid on layer2 pkts */ if (q != f || dyn_info.direction == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->flags |= IPFW_ARGS_NH6PTR; args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = TARG(cmd->arg1, netgraph); if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; IPFW_INC_RULE_COUNTER(f, pktlen); fib = TARG(cmd->arg1, fib) & 0x7FFF; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; /* XXX */ l = 0; /* exit inner loop */ break; } case O_SETDSCP: { uint16_t code; code = TARG(cmd->arg1, dscp) & 0x3F; l = 0; /* exit inner loop */ if (is_ipv4) { uint16_t old; old = *(uint16_t *)ip; ip->ip_tos = (code << 2) | (ip->ip_tos & 0x03); ip->ip_sum = cksum_adjust(ip->ip_sum, old, *(uint16_t *)ip); } else if (is_ipv6) { uint8_t *v; v = &((struct ip6_hdr *)ip)->ip6_vfc; *v = (*v & 0xF0) | (code >> 2); v++; *v = (*v & 0x3F) | ((code & 0x03) << 6); } else break; IPFW_INC_RULE_COUNTER(f, pktlen); break; } case O_NAT: l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ /* * Ensure that we do not invoke NAT handler for * non IPv4 packets. Libalias expects only IPv4. */ if (!is_ipv4 || !IPFW_NAT_LOADED) { retval = IP_FW_DENY; break; } struct cfg_nat *t; int nat_id; args->rule.info = 0; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == IP_FW_NAT44_GLOBAL) { retval = ipfw_nat_ptr(args, NULL, m); break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = TARG(cmd->arg1, nat); t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; break; } if (cmd->arg1 != IP_FW_TARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); break; case O_REASS: { int ip_off; l = 0; /* in any case exit inner loop */ if (is_ipv6) /* IPv6 is not supported yet */ break; IPFW_INC_RULE_COUNTER(f, pktlen); ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; args->m = m = ip_reass(m); /* * do IP header checksum fixup. */ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; args->rule.info = 0; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } case O_EXTERNAL_ACTION: l = 0; /* in any case exit inner loop */ retval = ipfw_run_eaction(chain, args, cmd, &done); /* * If both @retval and @done are zero, * consider this as rule matching and * update counters. */ if (retval == 0 && done == 0) { IPFW_INC_RULE_COUNTER(f, pktlen); /* * Reset the result of the last * dynamic state lookup. * External action can change * @args content, and it may be * used for new state lookup later. */ DYN_INFO_INIT(&dyn_info); } break; default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ IPFW_INC_RULE_COUNTER(rule, pktlen); } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_PF_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. */ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } /* * Switches table namespace between global and per-set. */ static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS) { int error; unsigned int sets; sets = V_fw_tables_sets; error = sysctl_handle_int(oidp, &sets, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_switch_tables_namespace(&V_layer3_chain, sets)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_init_sopt_handler(); ipfw_init_obj_rewriter(); ipfw_iface_init(); return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_iface_destroy(); ipfw_destroy_sopt_handler(); ipfw_destroy_obj_rewriter(); printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). */ static int vnet_ipfw_init(const void *unused) { int error, first; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; first = IS_DEFAULT_VNET(curvnet) ? 1 : 0; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif /* Init shared services hash table */ ipfw_init_srv(chain); ipfw_init_counters(); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; error = ipfw_init_tables(chain, first); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); free(rule, M_IPFW); return (ENOSPC); } IPFW_LOCK_INIT(chain); /* fill and insert the default rule */ rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw)); rule->cmd_len = 1; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->default_rule = rule; ipfw_add_protected_rule(chain, rule, 0); ipfw_dyn_init(chain); ipfw_eaction_init(chain, first); #ifdef LINEAR_SKIPTO ipfw_init_skipto_cache(chain); #endif ipfw_bpf_init(first); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler and pfil hooks for ipv4 and ipv6. * Even if the latter two fail we still keep the module alive * because the sockopt and layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. * In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl3; error = ipfw_attach_hooks(); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap; struct ip_fw_chain *chain = &V_layer3_chain; int i, last; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ ipfw_detach_hooks(); V_ip_fw_ctl_ptr = NULL; last = IS_DEFAULT_VNET(curvnet) ? 1 : 0; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ IPFW_UH_WLOCK(chain); reap = NULL; IPFW_WLOCK(chain); for (i = 0; i < chain->n_rules; i++) ipfw_reap_add(chain, &reap, chain->map[i]); free(chain->map, M_IPFW); #ifdef LINEAR_SKIPTO ipfw_destroy_skipto_cache(chain); #endif IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); ipfw_destroy_tables(chain, last); ipfw_eaction_uninit(chain, last); if (reap != NULL) ipfw_reap_rules(reap); vnet_ipfw_iface_destroy(chain); ipfw_destroy_srv(chain); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ ipfw_destroy_counters(); ipfw_bpf_uninit(last); return (0); } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. */ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_FIREWALL #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); FEATURE(ipfw_ctl3, "ipfw new sockopt calls"); MODULE_VERSION(ipfw, 3); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h (revision 348253) +++ head/sys/sys/mbuf.h (revision 348254) @@ -1,1364 +1,1399 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)mbuf.h 8.5 (Berkeley) 2/19/95 * $FreeBSD$ */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ /* XXX: These includes suck. Sorry! */ #include #ifdef _KERNEL #include +#include #include #ifdef WITNESS #include #endif #endif #ifdef _KERNEL #include #define MBUF_PROBE1(probe, arg0) \ SDT_PROBE1(sdt, , , probe, arg0) #define MBUF_PROBE2(probe, arg0, arg1) \ SDT_PROBE2(sdt, , , probe, arg0, arg1) #define MBUF_PROBE3(probe, arg0, arg1, arg2) \ SDT_PROBE3(sdt, , , probe, arg0, arg1, arg2) #define MBUF_PROBE4(probe, arg0, arg1, arg2, arg3) \ SDT_PROBE4(sdt, , , probe, arg0, arg1, arg2, arg3) #define MBUF_PROBE5(probe, arg0, arg1, arg2, arg3, arg4) \ SDT_PROBE5(sdt, , , probe, arg0, arg1, arg2, arg3, arg4) SDT_PROBE_DECLARE(sdt, , , m__init); SDT_PROBE_DECLARE(sdt, , , m__gethdr); SDT_PROBE_DECLARE(sdt, , , m__get); SDT_PROBE_DECLARE(sdt, , , m__getcl); SDT_PROBE_DECLARE(sdt, , , m__clget); SDT_PROBE_DECLARE(sdt, , , m__cljget); SDT_PROBE_DECLARE(sdt, , , m__cljset); SDT_PROBE_DECLARE(sdt, , , m__free); SDT_PROBE_DECLARE(sdt, , , m__freem); #endif /* _KERNEL */ /* * Mbufs are of a single size, MSIZE (sys/param.h), which includes overhead. * An mbuf may add a single "mbuf cluster" of size MCLBYTES (also in * sys/param.h), which has no additional overhead and is used instead of the * internal data area; this is done when at least MINCLSIZE of data must be * stored. Additionally, it is possible to allocate a separate buffer * externally and attach it to the mbuf in a way similar to that of mbuf * clusters. * * NB: These calculation do not take actual compiler-induced alignment and * padding inside the complete struct mbuf into account. Appropriate * attention is required when changing members of struct mbuf. * * MLEN is data length in a normal mbuf. * MHLEN is data length in an mbuf with pktheader. * MINCLSIZE is a smallest amount of data that should be put into cluster. * * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are sensible. */ struct mbuf; #define MHSIZE offsetof(struct mbuf, m_dat) #define MPKTHSIZE offsetof(struct mbuf, m_pktdat) #define MLEN ((int)(MSIZE - MHSIZE)) #define MHLEN ((int)(MSIZE - MPKTHSIZE)) #define MINCLSIZE (MHLEN + 1) #define M_NODOM 255 #ifdef _KERNEL /*- * Macro for type conversion: convert mbuf pointer to data pointer of correct * type: * * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. * mtodo(m, o) -- Same as above but with offset 'o' into data. */ #define mtod(m, t) ((t)((m)->m_data)) #define mtodo(m, o) ((void *)(((m)->m_data) + (o))) /* * Argument structure passed to UMA routines during mbuf and packet * allocations. */ struct mb_args { int flags; /* Flags for mbuf being allocated */ short type; /* Type of mbuf being allocated */ }; #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ void (*m_tag_free)(struct m_tag *); }; /* * Static network interface owned tag. * Allocated through ifp->if_snd_tag_alloc(). */ struct m_snd_tag { struct ifnet *ifp; /* network interface tag belongs to */ + volatile u_int refcount; }; /* * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set. * Size ILP32: 48 * LP64: 56 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ struct pkthdr { union { struct m_snd_tag *snd_tag; /* send tag, if any */ struct ifnet *rcvif; /* rcv interface */ }; SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ int32_t len; /* total packet length */ /* Layer crossing persistent information. */ uint32_t flowid; /* packet's 4-tuple system */ uint32_t csum_flags; /* checksum and offload features */ uint16_t fibnum; /* this packet should use this fib */ uint8_t numa_domain; /* NUMA domain of recvd pkt */ uint8_t rsstype; /* hash type */ union { uint64_t rcv_tstmp; /* timestamp in ns */ struct { uint8_t l2hlen; /* layer 2 hdr len */ uint8_t l3hlen; /* layer 3 hdr len */ uint8_t l4hlen; /* layer 4 hdr len */ uint8_t l5hlen; /* layer 5 hdr len */ uint32_t spare; }; }; union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_per; /* Layer specific non-persistent local storage for reassembly, etc. */ union { uint8_t eight[8]; uint16_t sixteen[4]; uint32_t thirtytwo[2]; uint64_t sixtyfour[1]; uintptr_t unintptr[1]; void *ptr; } PH_loc; }; #define ether_vtag PH_per.sixteen[0] #define PH_vt PH_per #define vt_nrecs sixteen[0] #define tso_segsz PH_per.sixteen[1] #define lro_nsegs tso_segsz #define csum_phsum PH_per.sixteen[2] #define csum_data PH_per.thirtytwo[1] #define pace_thoff PH_loc.sixteen[0] #define pace_tlen PH_loc.sixteen[1] #define pace_drphdrlen PH_loc.sixteen[2] #define pace_tos PH_loc.eight[6] #define pace_lock PH_loc.eight[7] /* * Description of external storage mapped into mbuf; valid only if M_EXT is * set. * Size ILP32: 28 * LP64: 48 * Compile-time assertions in uipc_mbuf.c test these values to ensure that * they are correct. */ typedef void m_ext_free_t(struct mbuf *); struct m_ext { union { /* * If EXT_FLAG_EMBREF is set, then we use refcount in the * mbuf, the 'ext_count' member. Otherwise, we have a * shadow copy and we use pointer 'ext_cnt'. The original * mbuf is responsible to carry the pointer to free routine * and its arguments. They aren't copied into shadows in * mb_dupcl() to avoid dereferencing next cachelines. */ volatile u_int ext_count; volatile u_int *ext_cnt; }; char *ext_buf; /* start of buffer */ uint32_t ext_size; /* size of buffer, for ext_free */ uint32_t ext_type:8, /* type of external storage */ ext_flags:24; /* external storage mbuf flags */ /* * Fields below store the free context for the external storage. * They are valid only in the refcount carrying mbuf, the one with * EXT_FLAG_EMBREF flag, with exclusion for EXT_EXTREF type, where * the free context is copied into all mbufs that use same external * storage. */ #define m_ext_copylen offsetof(struct m_ext, ext_free) m_ext_free_t *ext_free; /* free routine if not the usual */ void *ext_arg1; /* optional argument pointer */ void *ext_arg2; /* optional argument pointer */ }; /* * The core of the mbuf object along with some shortcut defines for practical * purposes. */ struct mbuf { /* * Header present at the beginning of every mbuf. * Size ILP32: 24 * LP64: 32 * Compile-time assertions in uipc_mbuf.c test these values to ensure * that they are correct. */ union { /* next buffer in chain */ struct mbuf *m_next; SLIST_ENTRY(mbuf) m_slist; STAILQ_ENTRY(mbuf) m_stailq; }; union { /* next chain in queue/record */ struct mbuf *m_nextpkt; SLIST_ENTRY(mbuf) m_slistpkt; STAILQ_ENTRY(mbuf) m_stailqpkt; }; caddr_t m_data; /* location of data */ int32_t m_len; /* amount of data in this mbuf */ uint32_t m_type:8, /* type of data in this mbuf */ m_flags:24; /* flags; see below */ #if !defined(__LP64__) uint32_t m_pad; /* pad for 64bit alignment */ #endif /* * A set of optional headers (packet header, external storage header) * and internal data storage. Historically, these arrays were sized * to MHLEN (space left after a packet header) and MLEN (space left * after only a regular mbuf header); they are now variable size in * order to support future work on variable-size mbufs. */ union { struct { struct pkthdr m_pkthdr; /* M_PKTHDR set */ union { struct m_ext m_ext; /* M_EXT set */ char m_pktdat[0]; }; }; char m_dat[0]; /* !M_PKTHDR, !M_EXT */ }; }; /* * mbuf flags of global significance and layer crossing. * Those of only protocol/layer specific significance are to be mapped * to M_PROTO[1-12] and cleared at layer handoff boundaries. * NB: Limited to the lower 24 bits. */ #define M_EXT 0x00000001 /* has associated external storage */ #define M_PKTHDR 0x00000002 /* start of record */ #define M_EOR 0x00000004 /* end of record */ #define M_RDONLY 0x00000008 /* associated data is marked read-only */ #define M_BCAST 0x00000010 /* send/received as link-level broadcast */ #define M_MCAST 0x00000020 /* send/received as link-level multicast */ #define M_PROMISC 0x00000040 /* packet was not for us */ #define M_VLANTAG 0x00000080 /* ether_vtag is valid */ #define M_NOMAP 0x00000100 /* mbuf data is unmapped (soon from Drew) */ #define M_NOFREE 0x00000200 /* do not free mbuf, embedded in cluster */ #define M_TSTMP 0x00000400 /* rcv_tstmp field is valid */ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */ #define M_PROTO3 0x00004000 /* protocol-specific */ #define M_PROTO4 0x00008000 /* protocol-specific */ #define M_PROTO5 0x00010000 /* protocol-specific */ #define M_PROTO6 0x00020000 /* protocol-specific */ #define M_PROTO7 0x00040000 /* protocol-specific */ #define M_PROTO8 0x00080000 /* protocol-specific */ #define M_PROTO9 0x00100000 /* protocol-specific */ #define M_PROTO10 0x00200000 /* protocol-specific */ #define M_PROTO11 0x00400000 /* protocol-specific */ #define M_PROTO12 0x00800000 /* protocol-specific */ #define MB_DTOR_SKIP 0x1 /* don't pollute the cache by touching a freed mbuf */ /* * Flags to purge when crossing layers. */ #define M_PROTOFLAGS \ (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO4|M_PROTO5|M_PROTO6|M_PROTO7|M_PROTO8|\ M_PROTO9|M_PROTO10|M_PROTO11|M_PROTO12) /* * Flags preserved when copying m_pkthdr. */ #define M_COPYFLAGS \ (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \ M_TSTMP_HPREC|M_PROTOFLAGS) /* * Mbuf flag description for use with printf(9) %b identifier. */ #define M_FLAG_BITS \ "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \ "\7M_PROMISC\10M_VLANTAG\13M_TSTMP\14M_TSTMP_HPREC" #define M_FLAG_PROTOBITS \ "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \ "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \ "\27M_PROTO11\30M_PROTO12" #define M_FLAG_PRINTF (M_FLAG_BITS M_FLAG_PROTOBITS) /* * Network interface cards are able to hash protocol fields (such as IPv4 * addresses and TCP port numbers) classify packets into flows. These flows * can then be used to maintain ordering while delivering packets to the OS * via parallel input queues, as well as to provide a stateless affinity * model. NIC drivers can pass up the hash via m->m_pkthdr.flowid, and set * m_flag fields to indicate how the hash should be interpreted by the * network stack. * * Most NICs support RSS, which provides ordering and explicit affinity, and * use the hash m_flag bits to indicate what header fields were covered by * the hash. M_HASHTYPE_OPAQUE and M_HASHTYPE_OPAQUE_HASH can be set by non- * RSS cards or configurations that provide an opaque flow identifier, allowing * for ordering and distribution without explicit affinity. Additionally, * M_HASHTYPE_OPAQUE_HASH indicates that the flow identifier has hash * properties. * * The meaning of the IPV6_EX suffix: * "o Home address from the home address option in the IPv6 destination * options header. If the extension header is not present, use the Source * IPv6 Address. * o IPv6 address that is contained in the Routing-Header-Type-2 from the * associated extension header. If the extension header is not present, * use the Destination IPv6 Address." * Quoted from: * https://docs.microsoft.com/en-us/windows-hardware/drivers/network/rss-hashing-types#ndishashipv6ex */ #define M_HASHTYPE_HASHPROP 0x80 /* has hash properties */ #define M_HASHTYPE_HASH(t) (M_HASHTYPE_HASHPROP | (t)) /* Microsoft RSS standard hash types */ #define M_HASHTYPE_NONE 0 #define M_HASHTYPE_RSS_IPV4 M_HASHTYPE_HASH(1) /* IPv4 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV4 M_HASHTYPE_HASH(2) /* TCPv4 4-tuple */ #define M_HASHTYPE_RSS_IPV6 M_HASHTYPE_HASH(3) /* IPv6 2-tuple */ #define M_HASHTYPE_RSS_TCP_IPV6 M_HASHTYPE_HASH(4) /* TCPv6 4-tuple */ #define M_HASHTYPE_RSS_IPV6_EX M_HASHTYPE_HASH(5) /* IPv6 2-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_TCP_IPV6_EX M_HASHTYPE_HASH(6) /* TCPv6 4-tuple + * ext hdrs */ #define M_HASHTYPE_RSS_UDP_IPV4 M_HASHTYPE_HASH(7) /* IPv4 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6 M_HASHTYPE_HASH(9) /* IPv6 UDP 4-tuple*/ #define M_HASHTYPE_RSS_UDP_IPV6_EX M_HASHTYPE_HASH(10)/* IPv6 UDP 4-tuple + * ext hdrs */ #define M_HASHTYPE_OPAQUE 63 /* ordering, not affinity */ #define M_HASHTYPE_OPAQUE_HASH M_HASHTYPE_HASH(M_HASHTYPE_OPAQUE) /* ordering+hash, not affinity*/ #define M_HASHTYPE_CLEAR(m) ((m)->m_pkthdr.rsstype = 0) #define M_HASHTYPE_GET(m) ((m)->m_pkthdr.rsstype) #define M_HASHTYPE_SET(m, v) ((m)->m_pkthdr.rsstype = (v)) #define M_HASHTYPE_TEST(m, v) (M_HASHTYPE_GET(m) == (v)) #define M_HASHTYPE_ISHASH(m) (M_HASHTYPE_GET(m) & M_HASHTYPE_HASHPROP) /* * External mbuf storage buffer types. */ #define EXT_CLUSTER 1 /* mbuf cluster */ #define EXT_SFBUF 2 /* sendfile(2)'s sf_buf */ #define EXT_JUMBOP 3 /* jumbo cluster page sized */ #define EXT_JUMBO9 4 /* jumbo cluster 9216 bytes */ #define EXT_JUMBO16 5 /* jumbo cluster 16184 bytes */ #define EXT_PACKET 6 /* mbuf+cluster from packet zone */ #define EXT_MBUF 7 /* external mbuf reference */ #define EXT_RXRING 8 /* data in NIC receive ring */ #define EXT_VENDOR1 224 /* for vendor-internal use */ #define EXT_VENDOR2 225 /* for vendor-internal use */ #define EXT_VENDOR3 226 /* for vendor-internal use */ #define EXT_VENDOR4 227 /* for vendor-internal use */ #define EXT_EXP1 244 /* for experimental use */ #define EXT_EXP2 245 /* for experimental use */ #define EXT_EXP3 246 /* for experimental use */ #define EXT_EXP4 247 /* for experimental use */ #define EXT_NET_DRV 252 /* custom ext_buf provided by net driver(s) */ #define EXT_MOD_TYPE 253 /* custom module's ext_buf type */ #define EXT_DISPOSABLE 254 /* can throw this buffer away w/page flipping */ #define EXT_EXTREF 255 /* has externally maintained ext_cnt ptr */ /* * Flags for external mbuf buffer types. * NB: limited to the lower 24 bits. */ #define EXT_FLAG_EMBREF 0x000001 /* embedded ext_count */ #define EXT_FLAG_EXTREF 0x000002 /* external ext_cnt, notyet */ #define EXT_FLAG_NOFREE 0x000010 /* don't free mbuf to pool, notyet */ #define EXT_FLAG_VENDOR1 0x010000 /* These flags are vendor */ #define EXT_FLAG_VENDOR2 0x020000 /* or submodule specific, */ #define EXT_FLAG_VENDOR3 0x040000 /* not used by mbuf code. */ #define EXT_FLAG_VENDOR4 0x080000 /* Set/read by submodule. */ #define EXT_FLAG_EXP1 0x100000 /* for experimental use */ #define EXT_FLAG_EXP2 0x200000 /* for experimental use */ #define EXT_FLAG_EXP3 0x400000 /* for experimental use */ #define EXT_FLAG_EXP4 0x800000 /* for experimental use */ /* * EXT flag description for use with printf(9) %b identifier. */ #define EXT_FLAG_BITS \ "\20\1EXT_FLAG_EMBREF\2EXT_FLAG_EXTREF\5EXT_FLAG_NOFREE" \ "\21EXT_FLAG_VENDOR1\22EXT_FLAG_VENDOR2\23EXT_FLAG_VENDOR3" \ "\24EXT_FLAG_VENDOR4\25EXT_FLAG_EXP1\26EXT_FLAG_EXP2\27EXT_FLAG_EXP3" \ "\30EXT_FLAG_EXP4" /* * Flags indicating checksum, segmentation and other offload work to be * done, or already done, by hardware or lower layers. It is split into * separate inbound and outbound flags. * * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested * against ifnet if_hwassist. */ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ #define CSUM_IP_TCP 0x00000004 /* TCP checksum offload */ #define CSUM_IP_SCTP 0x00000008 /* SCTP checksum offload */ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ /* Inbound checksum support where the checksum was verified by hardware. */ #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ #define CSUM_L4_VALID 0x08000000 /* checksum is correct */ #define CSUM_L5_CALC 0x10000000 /* calculated layer 5 csum */ #define CSUM_L5_VALID 0x20000000 /* checksum is correct */ #define CSUM_COALESCED 0x40000000 /* contains merged segments */ +#define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */ + /* * CSUM flag description for use with printf(9) %b identifier. */ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ "\6CSUM_IP_ISCSI" \ "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ "\16CSUM_IP6_ISCSI" \ "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ - "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED" + "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG" /* CSUM flags compatibility mappings. */ #define CSUM_IP_CHECKED CSUM_L3_CALC #define CSUM_IP_VALID CSUM_L3_VALID #define CSUM_DATA_VALID CSUM_L4_VALID #define CSUM_PSEUDO_HDR CSUM_L4_CALC #define CSUM_SCTP_VALID CSUM_L4_VALID #define CSUM_DELAY_DATA (CSUM_TCP|CSUM_UDP) #define CSUM_DELAY_IP CSUM_IP /* Only v4, no v6 IP hdr csum */ #define CSUM_DELAY_DATA_IPV6 (CSUM_TCP_IPV6|CSUM_UDP_IPV6) #define CSUM_DATA_VALID_IPV6 CSUM_DATA_VALID #define CSUM_TCP CSUM_IP_TCP #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP /* * mbuf types describing the content of the mbuf (including external storage). */ #define MT_NOTMBUF 0 /* USED INTERNALLY ONLY! Object is not mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MT_HEADER MT_DATA /* packet header, use M_PKTHDR instead */ #define MT_VENDOR1 4 /* for vendor-internal use */ #define MT_VENDOR2 5 /* for vendor-internal use */ #define MT_VENDOR3 6 /* for vendor-internal use */ #define MT_VENDOR4 7 /* for vendor-internal use */ #define MT_SONAME 8 /* socket name */ #define MT_EXP1 9 /* for experimental use */ #define MT_EXP2 10 /* for experimental use */ #define MT_EXP3 11 /* for experimental use */ #define MT_EXP4 12 /* for experimental use */ #define MT_CONTROL 14 /* extra-data protocol message */ #define MT_EXTCONTROL 15 /* control message with externalized contents */ #define MT_OOBDATA 16 /* expedited data */ #define MT_NOINIT 255 /* Not a type but a flag to allocate a non-initialized mbuf */ /* * String names of mbuf-related UMA(9) and malloc(9) types. Exposed to * !_KERNEL so that monitoring tools can look up the zones with * libmemstat(3). */ #define MBUF_MEM_NAME "mbuf" #define MBUF_CLUSTER_MEM_NAME "mbuf_cluster" #define MBUF_PACKET_MEM_NAME "mbuf_packet" #define MBUF_JUMBOP_MEM_NAME "mbuf_jumbo_page" #define MBUF_JUMBO9_MEM_NAME "mbuf_jumbo_9k" #define MBUF_JUMBO16_MEM_NAME "mbuf_jumbo_16k" #define MBUF_TAG_MEM_NAME "mbuf_tag" #define MBUF_EXTREFCNT_MEM_NAME "mbuf_ext_refcnt" #ifdef _KERNEL #ifdef WITNESS #define MBUF_CHECKSLEEP(how) do { \ if (how == M_WAITOK) \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "Sleeping in \"%s\"", __func__); \ } while (0) #else #define MBUF_CHECKSLEEP(how) #endif /* * Network buffer allocation API * * The rest of it is defined in kern/kern_mbuf.c */ extern uma_zone_t zone_mbuf; extern uma_zone_t zone_clust; extern uma_zone_t zone_pack; extern uma_zone_t zone_jumbop; extern uma_zone_t zone_jumbo9; extern uma_zone_t zone_jumbo16; void mb_dupcl(struct mbuf *, struct mbuf *); void mb_free_ext(struct mbuf *); void m_adj(struct mbuf *, int); int m_apply(struct mbuf *, int, int, int (*)(void *, void *, u_int), void *); int m_append(struct mbuf *, int, c_caddr_t); void m_cat(struct mbuf *, struct mbuf *); void m_catpkt(struct mbuf *, struct mbuf *); int m_clget(struct mbuf *m, int how); void *m_cljget(struct mbuf *m, int how, int size); struct mbuf *m_collapse(struct mbuf *, int, int); void m_copyback(struct mbuf *, int, int, c_caddr_t); void m_copydata(const struct mbuf *, int, int, caddr_t); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_copypacket(struct mbuf *, int); void m_copy_pkthdr(struct mbuf *, struct mbuf *); struct mbuf *m_copyup(struct mbuf *, int, int); struct mbuf *m_defrag(struct mbuf *, int); void m_demote_pkthdr(struct mbuf *); void m_demote(struct mbuf *, int, int); struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(char *, caddr_t, u_int)); void m_dispose_extcontrolm(struct mbuf *m); struct mbuf *m_dup(const struct mbuf *, int); int m_dup_pkthdr(struct mbuf *, const struct mbuf *, int); void m_extadd(struct mbuf *, char *, u_int, m_ext_free_t, void *, void *, int, int); u_int m_fixhdr(struct mbuf *); struct mbuf *m_fragment(struct mbuf *, int, int); void m_freem(struct mbuf *); struct mbuf *m_get2(int, int, short, int); struct mbuf *m_getjcl(int, short, int, int); struct mbuf *m_getm2(struct mbuf *, int, int, short, int); struct mbuf *m_getptr(struct mbuf *, int, int *); u_int m_length(struct mbuf *, struct mbuf **); int m_mbuftouio(struct uio *, const struct mbuf *, int); void m_move_pkthdr(struct mbuf *, struct mbuf *); int m_pkthdr_init(struct mbuf *, int); struct mbuf *m_prepend(struct mbuf *, int, int); void m_print(const struct mbuf *, int); struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); int m_sanity(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); struct mbuf *m_uiotombuf(struct uio *, int, int, int, int); struct mbuf *m_unshare(struct mbuf *, int); +void m_snd_tag_init(struct m_snd_tag *, struct ifnet *); +void m_snd_tag_destroy(struct m_snd_tag *); static __inline int m_gettype(int size) { int type; switch (size) { case MSIZE: type = EXT_MBUF; break; case MCLBYTES: type = EXT_CLUSTER; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: type = EXT_JUMBOP; break; #endif case MJUM9BYTES: type = EXT_JUMBO9; break; case MJUM16BYTES: type = EXT_JUMBO16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (type); } /* * Associated an external reference counted buffer with an mbuf. */ static __inline void m_extaddref(struct mbuf *m, char *buf, u_int size, u_int *ref_cnt, m_ext_free_t freef, void *arg1, void *arg2) { KASSERT(ref_cnt != NULL, ("%s: ref_cnt not provided", __func__)); atomic_add_int(ref_cnt, 1); m->m_flags |= M_EXT; m->m_ext.ext_buf = buf; m->m_ext.ext_cnt = ref_cnt; m->m_data = m->m_ext.ext_buf; m->m_ext.ext_size = size; m->m_ext.ext_free = freef; m->m_ext.ext_arg1 = arg1; m->m_ext.ext_arg2 = arg2; m->m_ext.ext_type = EXT_EXTREF; m->m_ext.ext_flags = 0; } static __inline uma_zone_t m_getzone(int size) { uma_zone_t zone; switch (size) { case MCLBYTES: zone = zone_clust; break; #if MJUMPAGESIZE != MCLBYTES case MJUMPAGESIZE: zone = zone_jumbop; break; #endif case MJUM9BYTES: zone = zone_jumbo9; break; case MJUM16BYTES: zone = zone_jumbo16; break; default: panic("%s: invalid cluster size %d", __func__, size); } return (zone); } /* * Initialize an mbuf with linear storage. * * Inline because the consumer text overhead will be roughly the same to * initialize or call a function with this many parameters and M_PKTHDR * should go away with constant propagation for !MGETHDR. */ static __inline int m_init(struct mbuf *m, int how, short type, int flags) { int error; m->m_next = NULL; m->m_nextpkt = NULL; m->m_data = m->m_dat; m->m_len = 0; m->m_flags = flags; m->m_type = type; if (flags & M_PKTHDR) error = m_pkthdr_init(m, how); else error = 0; MBUF_PROBE5(m__init, m, how, type, flags, error); return (error); } static __inline struct mbuf * m_get(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = 0; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__get, how, type, m); return (m); } static __inline struct mbuf * m_gethdr(int how, short type) { struct mbuf *m; struct mb_args args; args.flags = M_PKTHDR; args.type = type; m = uma_zalloc_arg(zone_mbuf, &args, how); MBUF_PROBE3(m__gethdr, how, type, m); return (m); } static __inline struct mbuf * m_getcl(int how, short type, int flags) { struct mbuf *m; struct mb_args args; args.flags = flags; args.type = type; m = uma_zalloc_arg(zone_pack, &args, how); MBUF_PROBE4(m__getcl, how, type, flags, m); return (m); } /* * XXX: m_cljset() is a dangerous API. One must attach only a new, * unreferenced cluster to an mbuf(9). It is not possible to assert * that, so care can be taken only by users of the API. */ static __inline void m_cljset(struct mbuf *m, void *cl, int type) { int size; switch (type) { case EXT_CLUSTER: size = MCLBYTES; break; #if MJUMPAGESIZE != MCLBYTES case EXT_JUMBOP: size = MJUMPAGESIZE; break; #endif case EXT_JUMBO9: size = MJUM9BYTES; break; case EXT_JUMBO16: size = MJUM16BYTES; break; default: panic("%s: unknown cluster type %d", __func__, type); break; } m->m_data = m->m_ext.ext_buf = cl; m->m_ext.ext_free = m->m_ext.ext_arg1 = m->m_ext.ext_arg2 = NULL; m->m_ext.ext_size = size; m->m_ext.ext_type = type; m->m_ext.ext_flags = EXT_FLAG_EMBREF; m->m_ext.ext_count = 1; m->m_flags |= M_EXT; MBUF_PROBE3(m__cljset, m, cl, type); } static __inline void m_chtype(struct mbuf *m, short new_type) { m->m_type = new_type; } static __inline void m_clrprotoflags(struct mbuf *m) { while (m) { m->m_flags &= ~M_PROTOFLAGS; m = m->m_next; } } static __inline struct mbuf * m_last(struct mbuf *m) { while (m->m_next) m = m->m_next; return (m); } static inline u_int m_extrefcnt(struct mbuf *m) { KASSERT(m->m_flags & M_EXT, ("%s: M_EXT missing", __func__)); return ((m->m_ext.ext_flags & EXT_FLAG_EMBREF) ? m->m_ext.ext_count : *m->m_ext.ext_cnt); } /* * mbuf, cluster, and external object allocation macros (for compatibility * purposes). */ #define M_MOVE_PKTHDR(to, from) m_move_pkthdr((to), (from)) #define MGET(m, how, type) ((m) = m_get((how), (type))) #define MGETHDR(m, how, type) ((m) = m_gethdr((how), (type))) #define MCLGET(m, how) m_clget((m), (how)) #define MEXTADD(m, buf, size, free, arg1, arg2, flags, type) \ m_extadd((m), (char *)(buf), (size), (free), (arg1), (arg2), \ (flags), (type)) #define m_getm(m, len, how, type) \ m_getm2((m), (len), (how), (type), M_PKTHDR) /* * Evaluate TRUE if it's safe to write to the mbuf m's data region (this can * be both the local data payload, or an external buffer area, depending on * whether M_EXT is set). */ #define M_WRITABLE(m) (!((m)->m_flags & M_RDONLY) && \ (!(((m)->m_flags & M_EXT)) || \ (m_extrefcnt(m) == 1))) /* Check if the supplied mbuf has a packet header, or else panic. */ #define M_ASSERTPKTHDR(m) \ KASSERT((m) != NULL && (m)->m_flags & M_PKTHDR, \ ("%s: no mbuf packet header!", __func__)) /* * Ensure that the supplied mbuf is a valid, non-free mbuf. * * XXX: Broken at the moment. Need some UMA magic to make it work again. */ #define M_ASSERTVALID(m) \ KASSERT((((struct mbuf *)m)->m_flags & 0) == 0, \ ("%s: attempted use of a free mbuf!", __func__)) /* * Return the address of the start of the buffer associated with an mbuf, * handling external storage, packet-header mbufs, and regular data mbufs. */ #define M_START(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \ ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] : \ &(m)->m_dat[0]) /* * Return the size of the buffer associated with an mbuf, handling external * storage, packet-header mbufs, and regular data mbufs. */ #define M_SIZE(m) \ (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \ ((m)->m_flags & M_PKTHDR) ? MHLEN : \ MLEN) /* * Set the m_data pointer of a newly allocated mbuf to place an object of the * specified size at the end of the mbuf, longword aligned. * * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as * separate macros, each asserting that it was called at the proper moment. * This required callers to themselves test the storage type and call the * right one. Rather than require callers to be aware of those layout * decisions, we centralize here. */ static __inline void m_align(struct mbuf *m, int len) { #ifdef INVARIANTS const char *msg = "%s: not a virgin mbuf"; #endif int adjust; KASSERT(m->m_data == M_START(m), (msg, __func__)); adjust = M_SIZE(m) - len; m->m_data += adjust &~ (sizeof(long)-1); } #define M_ALIGN(m, len) m_align(m, len) #define MH_ALIGN(m, len) m_align(m, len) #define MEXT_ALIGN(m, len) m_align(m, len) /* * Compute the amount of space available before the current start of data in * an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_LEADINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_LEADINGSPACE(m) \ (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) /* * Compute the amount of space available after the end of data in an mbuf. * * The M_WRITABLE() is a temporary, conservative safety measure: the burden * of checking writability of the mbuf data area rests solely with the caller. * * NB: In previous versions, M_TRAILINGSPACE() would only check M_WRITABLE() * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ #define M_TRAILINGSPACE(m) \ (M_WRITABLE(m) ? \ ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0) /* * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be * allocated, how specifies whether to wait. If the allocation fails, the * original mbuf chain is freed and m is set to NULL. */ #define M_PREPEND(m, plen, how) do { \ struct mbuf **_mmp = &(m); \ struct mbuf *_mm = *_mmp; \ int _mplen = (plen); \ int __mhow = (how); \ \ MBUF_CHECKSLEEP(how); \ if (M_LEADINGSPACE(_mm) >= _mplen) { \ _mm->m_data -= _mplen; \ _mm->m_len += _mplen; \ } else \ _mm = m_prepend(_mm, _mplen, __mhow); \ if (_mm != NULL && _mm->m_flags & M_PKTHDR) \ _mm->m_pkthdr.len += _mplen; \ *_mmp = _mm; \ } while (0) /* * Change mbuf to new type. This is a relatively expensive operation and * should be avoided. */ #define MCHTYPE(m, t) m_chtype((m), (t)) +/* Return the rcvif of a packet header. */ +static __inline struct ifnet * +m_rcvif(struct mbuf *m) +{ + + M_ASSERTPKTHDR(m); + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) + return (NULL); + return (m->m_pkthdr.rcvif); +} + /* Length to m_copy to copy all. */ #define M_COPYALL 1000000000 extern int max_datalen; /* MHLEN - max_hdr */ extern int max_hdr; /* Largest link + protocol header */ extern int max_linkhdr; /* Largest link-level header */ extern int max_protohdr; /* Largest protocol header */ extern int nmbclusters; /* Maximum number of clusters */ /*- * Network packets may have annotations attached by affixing a list of * "packet tags" to the pkthdr structure. Packet tags are dynamically * allocated semi-opaque data structures that have a fixed header * (struct m_tag) that specifies the size of the memory block and a * pair that identifies it. The cookie is a 32-bit unique * unsigned value used to identify a module or ABI. By convention this value * is chosen as the date+time that the module is created, expressed as the * number of seconds since the epoch (e.g., using date -u +'%s'). The type * value is an ABI/module-specific value that identifies a particular * annotation and is private to the module. For compatibility with systems * like OpenBSD that define packet tags w/o an ABI/module cookie, the value * PACKET_ABI_COMPAT is used to implement m_tag_get and m_tag_find * compatibility shim functions and several tag types are defined below. * Users that do not require compatibility should use a private cookie value * so that packet tag-related definitions can be maintained privately. * * Note that the packet tag returned by m_tag_alloc has the default memory * alignment implemented by malloc. To reference private data one can use a * construct like: * * struct m_tag *mtag = m_tag_alloc(...); * struct foo *p = (struct foo *)(mtag+1); * * if the alignment of struct m_tag is sufficient for referencing members of * struct foo. Otherwise it is necessary to embed struct m_tag within the * private data structure to insure proper alignment; e.g., * * struct foo { * struct m_tag tag; * ... * }; * struct foo *p = (struct foo *) m_tag_alloc(...); * struct m_tag *mtag = &p->tag; */ /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise * tags are expected to ``vanish'' when they pass through a network * interface. For most interfaces this happens normally as the tags are * reclaimed when the mbuf is free'd. However in some special cases * reclaiming must be done manually. An example is packets that pass through * the loopback interface. Also, one must be careful to do this when * ``turning around'' packets (e.g., icmp_reflect). * * To mark a tag persistent bit-or this flag in when defining the tag id. * The tag will then be treated as described above. */ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF (21 | MTAG_PERSISTENT) /* PF/ALTQ information */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ #define PACKET_TAG_IPSEC_NAT_T_PORTS 29 /* two uint16_t */ #define PACKET_TAG_ND_OUTGOING 30 /* ND outgoing */ /* Specific cookies and tags. */ /* Packet tag routines. */ struct m_tag *m_tag_alloc(u_int32_t, int, int, int); void m_tag_delete(struct mbuf *, struct m_tag *); void m_tag_delete_chain(struct mbuf *, struct m_tag *); void m_tag_free_default(struct m_tag *); struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, int, struct m_tag *); struct m_tag *m_tag_copy(struct m_tag *, int); int m_tag_copy_chain(struct mbuf *, const struct mbuf *, int); void m_tag_delete_nonpersistent(struct mbuf *); /* * Initialize the list of tags associated with an mbuf. */ static __inline void m_tag_init(struct mbuf *m) { SLIST_INIT(&m->m_pkthdr.tags); } /* * Set up the contents of a tag. Note that this does not fill in the free * method; the caller is expected to do that. * * XXX probably should be called m_tag_init, but that was already taken. */ static __inline void m_tag_setup(struct m_tag *t, u_int32_t cookie, int type, int len) { t->m_tag_id = type; t->m_tag_len = len; t->m_tag_cookie = cookie; } /* * Reclaim resources associated with a tag. */ static __inline void m_tag_free(struct m_tag *t) { (*t->m_tag_free)(t); } /* * Return the first tag associated with an mbuf. */ static __inline struct m_tag * m_tag_first(struct mbuf *m) { return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* * Return the next tag in the list of tags associated with an mbuf. */ static __inline struct m_tag * m_tag_next(struct mbuf *m __unused, struct m_tag *t) { return (SLIST_NEXT(t, m_tag_link)); } /* * Prepend a tag to the list of tags associated with an mbuf. */ static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Unlink a tag from the list of tags associated with an mbuf. */ static __inline void m_tag_unlink(struct mbuf *m, struct m_tag *t) { SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } /* These are for OpenBSD compatibility. */ #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return (m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait)); } static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); } +static inline struct m_snd_tag * +m_snd_tag_ref(struct m_snd_tag *mst) +{ + + refcount_acquire(&mst->refcount); + return (mst); +} + +static inline void +m_snd_tag_rele(struct m_snd_tag *mst) +{ + + if (refcount_release(&mst->refcount)) + m_snd_tag_destroy(mst); +} + static __inline struct mbuf * m_free(struct mbuf *m) { struct mbuf *n = m->m_next; MBUF_PROBE1(m__free, m); if ((m->m_flags & (M_PKTHDR|M_NOFREE)) == (M_PKTHDR|M_NOFREE)) m_tag_delete_chain(m, NULL); + if (m->m_flags & M_PKTHDR && m->m_pkthdr.csum_flags & CSUM_SND_TAG) + m_snd_tag_rele(m->m_pkthdr.snd_tag); if (m->m_flags & M_EXT) mb_free_ext(m); else if ((m->m_flags & M_NOFREE) == 0) uma_zfree(zone_mbuf, m); return (n); } static __inline int rt_m_getfib(struct mbuf *m) { KASSERT(m->m_flags & M_PKTHDR , ("Attempt to get FIB from non header mbuf.")); return (m->m_pkthdr.fibnum); } #define M_GETFIB(_m) rt_m_getfib(_m) #define M_SETFIB(_m, _fib) do { \ KASSERT((_m)->m_flags & M_PKTHDR, ("Attempt to set FIB on non header mbuf.")); \ ((_m)->m_pkthdr.fibnum) = (_fib); \ } while (0) /* flags passed as first argument for "m_ether_tcpip_hash()" */ #define MBUF_HASHFLAG_L2 (1 << 2) #define MBUF_HASHFLAG_L3 (1 << 3) #define MBUF_HASHFLAG_L4 (1 << 4) /* mbuf hashing helper routines */ uint32_t m_ether_tcpip_hash_init(void); uint32_t m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t); #ifdef MBUF_PROFILING void m_profile(struct mbuf *m); #define M_PROFILE(m) m_profile(m) #else #define M_PROFILE(m) #endif struct mbufq { STAILQ_HEAD(, mbuf) mq_head; int mq_len; int mq_maxlen; }; static inline void mbufq_init(struct mbufq *mq, int maxlen) { STAILQ_INIT(&mq->mq_head); mq->mq_maxlen = maxlen; mq->mq_len = 0; } static inline struct mbuf * mbufq_flush(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); STAILQ_INIT(&mq->mq_head); mq->mq_len = 0; return (m); } static inline void mbufq_drain(struct mbufq *mq) { struct mbuf *m, *n; n = mbufq_flush(mq); while ((m = n) != NULL) { n = STAILQ_NEXT(m, m_stailqpkt); m_freem(m); } } static inline struct mbuf * mbufq_first(const struct mbufq *mq) { return (STAILQ_FIRST(&mq->mq_head)); } static inline struct mbuf * mbufq_last(const struct mbufq *mq) { return (STAILQ_LAST(&mq->mq_head, mbuf, m_stailqpkt)); } static inline int mbufq_full(const struct mbufq *mq) { return (mq->mq_len >= mq->mq_maxlen); } static inline int mbufq_len(const struct mbufq *mq) { return (mq->mq_len); } static inline int mbufq_enqueue(struct mbufq *mq, struct mbuf *m) { if (mbufq_full(mq)) return (ENOBUFS); STAILQ_INSERT_TAIL(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; return (0); } static inline struct mbuf * mbufq_dequeue(struct mbufq *mq) { struct mbuf *m; m = STAILQ_FIRST(&mq->mq_head); if (m) { STAILQ_REMOVE_HEAD(&mq->mq_head, m_stailqpkt); m->m_nextpkt = NULL; mq->mq_len--; } return (m); } static inline void mbufq_prepend(struct mbufq *mq, struct mbuf *m) { STAILQ_INSERT_HEAD(&mq->mq_head, m, m_stailqpkt); mq->mq_len++; } /* * Note: this doesn't enforce the maximum list size for dst. */ static inline void mbufq_concat(struct mbufq *mq_dst, struct mbufq *mq_src) { mq_dst->mq_len += mq_src->mq_len; STAILQ_CONCAT(&mq_dst->mq_head, &mq_src->mq_head); mq_src->mq_len = 0; } #ifdef _SYS_TIMESPEC_H_ static inline void mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts) { KASSERT((m->m_flags & M_PKTHDR) != 0, ("mbuf %p no M_PKTHDR", m)); KASSERT((m->m_flags & M_TSTMP) != 0, ("mbuf %p no M_TSTMP", m)); ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000; ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000; } #endif #ifdef NETDUMP /* Invoked from the netdump client code. */ void netdump_mbuf_drain(void); void netdump_mbuf_dump(void); void netdump_mbuf_reinit(int nmbuf, int nclust, int clsize); #endif #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */